use strict;
use warnings;
use locale;
use LWP::Simple;
use Encode;
use HTML::Entities;

# Download the web page given as the single command-line argument, strip its
# HTML markup and print the remaining text, reduced to the Latin-1 repertoire.

if (@ARGV != 1) {
    die "Usage : ", $0, " URL\n";
}
my $URL = $ARGV[0];

# NOTE(review): recent LWP::Simple versions may already return
# charset-decoded content from get(); confirm that decoding again below is
# still correct for the installed version.
my $page = get($URL);
if (not defined($page)) {
    die "Problème lors du téléchargement !\n";
}

# Guess the page encoding from the first "charset=..." declaration found in
# the document; fall back to latin1 when none is present or when Encode does
# not know the declared name.
my $codage_page = "latin1";
if ($page =~ /\bcharset\s*=\s*([\w-]+)/i) {
    my $codage_declare = $1;
    # find_encoding() returns undef for unknown names. (Testing
    # defined($@) after an eval is always true, which would discard every
    # detected charset.)
    if (defined Encode::find_encoding($codage_declare)) {
        $codage_page = $codage_declare;
    }
}

my $page_unicode  = decode($codage_page, $page);
my $texte_unicode = supprime_html($page_unicode);
my $texte         = normalise_latin1($texte_unicode);
print $texte, "\n";

# supprime_html($html)
#
# Return the text content of an HTML document given as a character string.
# Elements whose content is not readable text (scripts, head, embedded
# objects, ...) are dropped entirely, block-level tags become line breaks,
# every other tag is removed, character entities are decoded and whitespace
# is normalised.
sub supprime_html {
    my ($html) = @_;
    my @balises_a_ignorer =
        ("applet", "code", "embed", "head", "object", "script", "server");

    # Work on a single logical line so that tags spanning several source
    # lines are matched by the patterns below.
    $html =~ s/\n+/ /g;
    $html =~ s/\r+/ /g;

    # Drop ignored elements together with their content. \b prevents
    # "<code" from also matching e.g. "<codebase ...>".
    foreach my $balise (@balises_a_ignorer) {
        $html =~ s/<\Q$balise\E\b.*?<\/\Q$balise\E\s*>//gis;
    }

    $html =~ s/<!--.*?-->//gs;               # comments
    $html =~ s/<\/?p\b[^>]*>/\n/ig;          # paragraphs
    $html =~ s/<br\b[^>]*>/\n/ig;            # line breaks
    $html =~ s/<\/tr\s*>/\n/ig;              # table rows
    $html =~ s/<\/?h[1-6]\b[^>]*>/\n/ig;     # headings
    $html =~ s/<\/?div\b[^>]*>/\n/ig;        # sections
    $html =~ s/<.*?>//gs;                    # any other tag

    # Decode entities only after the markup is gone: decoding first would
    # turn escaped text such as "&lt;b&gt;" into a tag that is then deleted.
    decode_entities($html);

    $html =~ s/\s*\n\s*/\n/g;                # spaces around line breaks
    $html =~ s/ +/ /g;                       # runs of several spaces
    return $html;
}

# normalise_latin1($chaine)
#
# Return $chaine with common typographic characters outside Latin-1 replaced
# by ASCII look-alikes (curly quotes, dashes, ellipsis, OE ligatures); any
# remaining character above U+00FF is deleted.
sub normalise_latin1 {
    my ($chaine) = @_;
    $chaine =~ s/[\x{2019}\x{2018}]/'/g;     # single curly quotes
    $chaine =~ s/[\x{201C}\x{201D}]/"/g;     # double curly quotes
    $chaine =~ s/[\x{2013}\x{2014}]/-/g;     # en and em dashes
    $chaine =~ s/\x{2026}/.../g;             # horizontal ellipsis
    $chaine =~ s/\x{0152}/OE/g;              # capital OE ligature
    $chaine =~ s/\x{0153}/oe/g;              # small oe ligature
    $chaine =~ s/[^\x{0000}-\x{00FF}]//g;    # anything else outside Latin-1
    return $chaine;
}