use locale; use strict; use URI::URL; use LWP::Simple; use XML::Simple; use HTML::Entities; use Encode; if ( $#ARGV != 1 ) { die "Usage : ", $0, " requete nombre_documents\n"; } my $requete = $ARGV[0]; my $n_voulus = $ARGV[1]; # NOTE : insérez ci-dessous l'identifiant que vous avez déclaré # auprès de Yahoo Web Search Services lors de votre inscription. # Reportez-vous au site http://developper.yahoo.com/ my $identifiant_Yahoo = "xxx"; my $url = url( "http://api.search.yahoo.com/WebSearchService/V1/webSearch"); my $continuer = 1; my $decalage = 1; my $n_traites = 0; while ( $continuer ) { $url -> query_form ( appid => $identifiant_Yahoo, query => encode("utf8", $requete), region => "fr", type => "phrase", results => 100, start => $decalage, format => "html", adult_ok => 1, similar_ok=> 0, language => "fr", country => "fr", output => "xml" ); my $reponse_yahoo = get $url; while ( not defined ($reponse_yahoo) ) { sleep (1000); $reponse_yahoo = get $url; } my $arbre_XML = XMLin( $reponse_yahoo, ForceArray => ["Result"] ); if (defined ($arbre_XML->{"Result"})) { my @documents = @{$arbre_XML->{"Result"}}; while ( ($#documents >= 0) and ($n_traites < $n_voulus) ){ my $document = shift @documents; my $ok = traite_url ( $document->{"Url"} ); if ( $ok == 1 ){ $n_traites++; } } if ( ($n_traites >= $n_voulus) or ($decalage+100 >= $arbre_XML -> {"totalResultsAvailable"}) ) { $continuer = 0; } else { $decalage = $decalage + 100; } } else { $continuer = 0; } } sub traite_url { my $url = shift @_; my $page = get( $url ); if ( defined ($page) ){ print "---\nURL : $url\n"; my $codage_page = "latin1"; if ( $page =~ /\bcharset *= *([\w-]+)/i ) { $codage_page = $1; eval { decode ($codage_page, "test") }; if ( defined ($@) ) { $codage_page = "latin1"; } } my $page_unicode = decode( $codage_page, $page ); my $texte_unicode = supprime_html( $page_unicode ) ; my $texte = normalise_latin1 ( $texte_unicode ); $texte =~ s/\n/ /g; my $requete_texte = $requete; $requete_texte =~ s/\"//g; while ( $texte =~ /\b$requete_texte\b/ig ){ my $contexte_gauche = substr($`, ( length($`)-20) , 20); my $contexte_droit = substr($', 0, 20); print $contexte_gauche."|".$&."|".$contexte_droit."\n"; } return 1; } } sub normalise_latin1 { my $chaine = shift @_; $chaine =~ s/[\x{2018}\x{2019}]/\'/g; $chaine =~ s/[\x{201C}\x{201D}]/\"/g; $chaine =~ s/[\x{2013}\x{2014}]/-/g; $chaine =~ s/\x{2026}/.../g; $chaine =~ s/\x{0152}/OE/g; $chaine =~ s/\x{0153}/oe/g; $chaine =~ s/[^\x{0000}-\x{00FF}]//g; return $chaine; } sub supprime_html { my @balises_a_ignorer = ("applet","code","embed","head","object","script","server"); my $html = shift @_; $html =~ s/\n+/ /g; $html =~ s/\r+/ /g; decode_entities($html); foreach my $balise (@balises_a_ignorer) { $html=~s/<$balise.*?<\/$balise>//ig; } $html =~ s///g; $html =~ s/<\/?p\/?>/\n/ig; $html =~ s/<\/?br\/?>/\n/ig; $html =~ s/<\/tr>/\n/ig; $html =~ s//\n/ig; $html =~ s//\n/ig; $html =~ s/<.*?>//g; $html =~ s/\s*\n\s*/\n/g; $html =~ s/ +/ /g; return $html; }