11use crossbeam:: channel:: { Receiver , Sender , TryRecvError } ;
22use crossbeam:: thread;
3+ use encoding_rs:: Encoding ;
4+ use lazy_static:: lazy_static;
5+ use rand:: Rng ;
6+ use regex:: Regex ;
37use url:: Url ;
48
9+ use std:: borrow:: Borrow ;
510use std:: collections:: HashMap ;
611use std:: collections:: HashSet ;
712use std:: process;
813use std:: sync:: Mutex ;
914use std:: time;
1015
11- use rand:: Rng ;
12-
13- use super :: downloader;
14-
1516use super :: args;
1617use super :: disk;
1718use super :: dom;
19+ use super :: downloader;
1820use super :: response;
1921use super :: url_helper;
2022
21- use crate :: { error, info} ;
23+ use crate :: { error, info, warn } ;
2224
/// Maximum number of empty recv() from the channel before a worker gives up.
// A never-mutated numeric global should be a `const` (inlined at use sites,
// no fixed address needed) rather than a `static`.
const MAX_EMPTY_RECEIVES: usize = 10;
@@ -91,15 +93,79 @@ impl Scraper {
9193 old_url_str. push_str ( & new_url_str) ;
9294 }
9395
94- ///Proces an html file: add new url to the chanel and prepare for offline navigation
96+ /// Find the charset of the webpage. ``data`` is not a String as this might not be utf8.
97+ /// Returned String is lower cased
98+ /// This is a hack and should be check in case of a bug
99+ fn find_charset ( data : & [ u8 ] , http_charset : Option < String > ) -> Option < String > {
100+ lazy_static ! {
101+ static ref CHARSET_REGEX : Regex =
102+ Regex :: new( "<meta.*charset\\ s*=\\ s*\" ?([^\" \\ s;]+).*>" ) . unwrap( ) ;
103+ }
104+
105+ // We don't know the real charset yet. We hope that the charset is ASCII
106+ // compatible, because Rust String are in UTF-8 (also ASCII compatible).
107+ let data_utf8 = unsafe { String :: from_utf8_unchecked ( Vec :: from ( data) ) } ;
108+ let captures = CHARSET_REGEX . captures_iter ( & data_utf8) . next ( ) ;
109+
110+ // We use the first one, hopping we are in the <head> of the page... or if nothing is found
111+ // we used the http charset (if any).
112+ captures
113+ . map ( |first| String :: from ( first. get ( 1 ) . unwrap ( ) . as_str ( ) . to_lowercase ( ) ) )
114+ . or ( http_charset)
115+ }
116+
117+ /// Proceed to convert the data in utf8.
118+ fn charset_convert (
119+ data : & [ u8 ] ,
120+ charset_source : & ' static Encoding ,
121+ charset_dest : & ' static Encoding ,
122+ ) -> Vec < u8 > {
123+ let decode_result = charset_source. decode ( data) ;
124+ let decode_bytes = decode_result. 0 . borrow ( ) ;
125+
126+ let encode_result = charset_dest. encode ( decode_bytes) ;
127+ let encode_bytes = encode_result. 0 . into_owned ( ) ;
128+
129+ encode_bytes
130+ }
131+
132+ /// Check if the charset require conversion
133+ fn needs_charset_conversion ( charset : & str ) -> bool {
134+ match charset {
135+ "utf-8" => false ,
136+ _ => true ,
137+ }
138+ }
139+
140+ /// Proces an html file: add new url to the chanel and prepare for offline navigation
95141 fn handle_html (
96142 scraper : & Scraper ,
97143 transmitter : & Sender < ( Url , i32 ) > ,
98144 url : & Url ,
99145 depth : i32 ,
100- data : & str ,
146+ data : & [ u8 ] ,
147+ http_charset : Option < String > ,
101148 ) -> Vec < u8 > {
102- let dom = dom:: Dom :: new ( data) ;
149+ let charset_source_str = match Self :: find_charset ( data, http_charset) {
150+ Some ( s) => s,
151+ None => {
152+ warn ! ( "Charset not found for {}, defaulting to UTF-8" , url) ;
153+ String :: from ( "utf-8" )
154+ }
155+ } ;
156+
157+ let need_charset_conversion = Self :: needs_charset_conversion ( & charset_source_str) ;
158+
159+ let charset_source =
160+ encoding_rs:: Encoding :: for_label ( & charset_source_str. as_bytes ( ) ) . unwrap ( ) ;
161+ let charset_utf8 = encoding_rs:: UTF_8 ;
162+ let utf8_data = if need_charset_conversion {
163+ Self :: charset_convert ( data, charset_source, charset_utf8)
164+ } else {
165+ Vec :: from ( data)
166+ } ;
167+
168+ let dom = dom:: Dom :: new ( & String :: from_utf8_lossy ( & utf8_data) . into_owned ( ) ) ;
103169
104170 dom. find_urls_as_strings ( )
105171 . into_iter ( )
@@ -119,17 +185,28 @@ impl Scraper {
119185 scraper. fix_domtree ( next_url, & next_full_url) ;
120186 } ) ;
121187
122- dom. serialize ( ) . into_bytes ( )
188+ let utf8_data = dom. serialize ( ) . into_bytes ( ) ;
189+
190+ if need_charset_conversion {
191+ Self :: charset_convert ( & utf8_data, charset_utf8, charset_source)
192+ } else {
193+ utf8_data
194+ }
123195 }
124196
125197 /// Process a single URL
126198 fn handle_url ( scraper : & Scraper , transmitter : & Sender < ( Url , i32 ) > , url : Url , depth : i32 ) {
127199 match scraper. downloader . get ( & url) {
128200 Ok ( response) => {
129201 let data = match response. data {
130- response:: ResponseData :: Html ( data) => {
131- Scraper :: handle_html ( scraper, transmitter, & url, depth, & data)
132- }
202+ response:: ResponseData :: Html ( data) => Scraper :: handle_html (
203+ scraper,
204+ transmitter,
205+ & url,
206+ depth,
207+ & data,
208+ response. charset ,
209+ ) ,
133210 response:: ResponseData :: Other ( data) => data,
134211 } ;
135212
0 commit comments