Skip to content

Commit db3c933

Browse files
Support other charset (#105)
* scraper: Initial work for other charset support Not working * scraper: Get charset from html instead of http headers Not working * scraper: Charset working * scraper: Fix regex ('"' after equals) * scraper: Use lazy_static for charset regex * misc: Refactor pr * tests: Refactor fixtures * tests: Add html charset support tests * tests: Charset, split tests * downloader: Get charset from http headers * misc: Clarify iterator first * Apply suggestions from CohenArthur Co-authored-by: CohenArthur <arthur.cohen@epita.fr>
1 parent d26647c commit db3c933

14 files changed

+708
-235
lines changed

Cargo.lock

Lines changed: 308 additions & 192 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@ percent-encoding = "^2.1"
3333
url = "^2.2"
3434
rand = "^0.8"
3535
regex = "^1.4"
36+
encoding_rs = "^0.8"
37+
lazy_static = "1.4.0"
3638

3739
[dev-dependencies]
3840
tiny_http = "^0.7"
41+
serial_test = "^0.5"

src/downloader.rs

Lines changed: 36 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
use super::response::{Response, ResponseData};
22
use std::collections::HashMap;
3+
4+
use lazy_static::lazy_static;
5+
use regex::Regex;
36
use url::Url;
47

58
use crate::warn;
@@ -104,26 +107,48 @@ impl Downloader {
104107
};
105108
match req.send() {
106109
Ok(mut data) => {
107-
let data_type = match data.headers().get("content-type") {
108-
Some(data_type) => data_type.to_str().unwrap(),
109-
None => "text/html",
110-
};
111-
112-
let filename = if !Downloader::is_html(data_type) {
110+
lazy_static! {
111+
static ref DATA_TYPE_REGEX: Regex =
112+
Regex::new("^.*(\\b[a-z]+/[a-z-+\\.]+).*$").unwrap();
113+
static ref CHARSET_REGEX: Regex =
114+
Regex::new("^.*charset\\s*=\\s*\"?([^\"\\s;]+).*$").unwrap();
115+
}
116+
117+
let (data_type, charset): (String, Option<String>) =
118+
match data.headers().get("content-type") {
119+
Some(content_type_header) => {
120+
let content_type = content_type_header.to_str().unwrap();
121+
let data_type_captures =
122+
DATA_TYPE_REGEX.captures_iter(&content_type).nth(0);
123+
let data_type = data_type_captures
124+
.map_or(String::from("text/html"), |first| {
125+
String::from(first.get(1).unwrap().as_str().to_lowercase())
126+
});
127+
let charset_captures =
128+
CHARSET_REGEX.captures_iter(&content_type).nth(0);
129+
let charset = charset_captures.map(|first| {
130+
String::from(first.get(1).unwrap().as_str().to_lowercase())
131+
});
132+
(data_type, charset)
133+
}
134+
None => (String::from("text/html"), None),
135+
};
136+
137+
let filename = if !Downloader::is_html(&data_type) {
113138
Downloader::get_filename(data.headers())
114139
} else {
115140
None
116141
};
117142

118-
let data = if Downloader::is_html(data_type) {
119-
ResponseData::Html(data.text().unwrap())
143+
let mut raw_data: Vec<u8> = Vec::new();
144+
data.copy_to(&mut raw_data).unwrap();
145+
let response_data = if Downloader::is_html(&data_type) {
146+
ResponseData::Html(raw_data)
120147
} else {
121-
let mut raw_data: Vec<u8> = Vec::new();
122-
data.copy_to(&mut raw_data).unwrap();
123148
ResponseData::Other(raw_data)
124149
};
125150

126-
Ok(Response::new(data, filename))
151+
Ok(Response::new(response_data, filename, charset))
127152
}
128153

129154
Err(e) => {

src/response.rs

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,23 @@
11
/// Separates HTML responses and other content (PDFs, images...)
22
pub enum ResponseData {
3-
Html(String),
3+
Html(Vec<u8>),
44
Other(Vec<u8>),
55
}
66

77
/// Wrapper around `ResponseData`
88
pub struct Response {
99
pub data: ResponseData,
1010
pub filename: Option<String>,
11+
pub charset: Option<String>,
1112
}
1213

1314
impl Response {
1415
///Create a new Response
15-
pub fn new(data: ResponseData, filename: Option<String>) -> Response {
16-
Response { data, filename }
16+
pub fn new(data: ResponseData, filename: Option<String>, charset: Option<String>) -> Response {
17+
Response {
18+
data,
19+
filename,
20+
charset,
21+
}
1722
}
1823
}

src/scraper.rs

Lines changed: 89 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,26 @@
11
use crossbeam::channel::{Receiver, Sender, TryRecvError};
22
use crossbeam::thread;
3+
use encoding_rs::Encoding;
4+
use lazy_static::lazy_static;
5+
use rand::Rng;
6+
use regex::Regex;
37
use url::Url;
48

9+
use std::borrow::Borrow;
510
use std::collections::HashMap;
611
use std::collections::HashSet;
712
use std::process;
813
use std::sync::Mutex;
914
use std::time;
1015

11-
use rand::Rng;
12-
13-
use super::downloader;
14-
1516
use super::args;
1617
use super::disk;
1718
use super::dom;
19+
use super::downloader;
1820
use super::response;
1921
use super::url_helper;
2022

21-
use crate::{error, info};
23+
use crate::{error, info, warn};
2224

2325
/// Maximum number of empty recv() from the channel
2426
static MAX_EMPTY_RECEIVES: usize = 10;
@@ -91,15 +93,79 @@ impl Scraper {
9193
old_url_str.push_str(&new_url_str);
9294
}
9395

94-
///Proces an html file: add new url to the chanel and prepare for offline navigation
96+
/// Find the charset of the webpage. ``data`` is not a String as this might not be utf8.
97+
/// The returned String is lowercased
98+
/// This is a hack and should be checked in case of a bug
99+
fn find_charset(data: &[u8], http_charset: Option<String>) -> Option<String> {
100+
lazy_static! {
101+
static ref CHARSET_REGEX: Regex =
102+
Regex::new("<meta.*charset\\s*=\\s*\"?([^\"\\s;]+).*>").unwrap();
103+
}
104+
105+
// We don't know the real charset yet. We hope that the charset is ASCII
106+
// compatible, because Rust Strings are in UTF-8 (also ASCII compatible).
107+
let data_utf8 = unsafe { String::from_utf8_unchecked(Vec::from(data)) };
108+
let captures = CHARSET_REGEX.captures_iter(&data_utf8).next();
109+
110+
// We use the first one, hoping we are in the <head> of the page... or if nothing is found
111+
// we use the HTTP charset (if any).
112+
captures
113+
.map(|first| String::from(first.get(1).unwrap().as_str().to_lowercase()))
114+
.or(http_charset)
115+
}
116+
117+
/// Proceed to convert the data in utf8.
118+
fn charset_convert(
119+
data: &[u8],
120+
charset_source: &'static Encoding,
121+
charset_dest: &'static Encoding,
122+
) -> Vec<u8> {
123+
let decode_result = charset_source.decode(data);
124+
let decode_bytes = decode_result.0.borrow();
125+
126+
let encode_result = charset_dest.encode(decode_bytes);
127+
let encode_bytes = encode_result.0.into_owned();
128+
129+
encode_bytes
130+
}
131+
132+
/// Check if the charset requires conversion
133+
fn needs_charset_conversion(charset: &str) -> bool {
134+
match charset {
135+
"utf-8" => false,
136+
_ => true,
137+
}
138+
}
139+
140+
/// Process an html file: add new urls to the channel and prepare for offline navigation
95141
fn handle_html(
96142
scraper: &Scraper,
97143
transmitter: &Sender<(Url, i32)>,
98144
url: &Url,
99145
depth: i32,
100-
data: &str,
146+
data: &[u8],
147+
http_charset: Option<String>,
101148
) -> Vec<u8> {
102-
let dom = dom::Dom::new(data);
149+
let charset_source_str = match Self::find_charset(data, http_charset) {
150+
Some(s) => s,
151+
None => {
152+
warn!("Charset not found for {}, defaulting to UTF-8", url);
153+
String::from("utf-8")
154+
}
155+
};
156+
157+
let need_charset_conversion = Self::needs_charset_conversion(&charset_source_str);
158+
159+
let charset_source =
160+
encoding_rs::Encoding::for_label(&charset_source_str.as_bytes()).unwrap();
161+
let charset_utf8 = encoding_rs::UTF_8;
162+
let utf8_data = if need_charset_conversion {
163+
Self::charset_convert(data, charset_source, charset_utf8)
164+
} else {
165+
Vec::from(data)
166+
};
167+
168+
let dom = dom::Dom::new(&String::from_utf8_lossy(&utf8_data).into_owned());
103169

104170
dom.find_urls_as_strings()
105171
.into_iter()
@@ -119,17 +185,28 @@ impl Scraper {
119185
scraper.fix_domtree(next_url, &next_full_url);
120186
});
121187

122-
dom.serialize().into_bytes()
188+
let utf8_data = dom.serialize().into_bytes();
189+
190+
if need_charset_conversion {
191+
Self::charset_convert(&utf8_data, charset_utf8, charset_source)
192+
} else {
193+
utf8_data
194+
}
123195
}
124196

125197
/// Process a single URL
126198
fn handle_url(scraper: &Scraper, transmitter: &Sender<(Url, i32)>, url: Url, depth: i32) {
127199
match scraper.downloader.get(&url) {
128200
Ok(response) => {
129201
let data = match response.data {
130-
response::ResponseData::Html(data) => {
131-
Scraper::handle_html(scraper, transmitter, &url, depth, &data)
132-
}
202+
response::ResponseData::Html(data) => Scraper::handle_html(
203+
scraper,
204+
transmitter,
205+
&url,
206+
depth,
207+
&data,
208+
response.charset,
209+
),
133210
response::ResponseData::Other(data) => data,
134211
};
135212

tests/auth.rs

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,19 @@
22
33
mod fixtures;
44

5-
use fixtures::get_file_count_with_pattern;
65
use std::fs::read_dir;
76
use std::process::Command;
87
use std::process::Stdio;
98
use std::sync::Once;
109

11-
const ADDR: &'static str = "http://0.0.0.0:8000";
10+
const PAGE: &'static str = "tests/fixtures/index.html";
1211
static START: Once = Once::new();
1312

1413
#[test]
1514
fn test_auth() {
1615
// Spawn a single instance of a local http server usable by all tests in this module.
1716
START.call_once(|| {
18-
fixtures::spawn_local_http_server(true);
17+
fixtures::spawn_local_http_server(PAGE, true, None);
1918
});
2019

2120
// Tests below are grouped together as they depend on the local_http_server above.
@@ -28,7 +27,7 @@ fn auth_different_host() {
2827
let output_dir = "w4";
2928
let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit"))
3029
.args(&[
31-
ADDR,
30+
fixtures::HTTP_ADDR,
3231
"-o",
3332
"w4",
3433
"-a",
@@ -54,7 +53,15 @@ fn auth_different_host() {
5453
fn auth_valid() {
5554
let output_dir = "w5";
5655
let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit"))
57-
.args(&[ADDR, "-o", "w5", "-a", "username password", "-j", "16"])
56+
.args(&[
57+
fixtures::HTTP_ADDR,
58+
"-o",
59+
"w5",
60+
"-a",
61+
"username password",
62+
"-j",
63+
"16",
64+
])
5865
.stdout(Stdio::inherit())
5966
.stderr(Stdio::inherit())
6067
.spawn()

tests/charset_html_found.rs

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
//! Test for charset detection/conversion
2+
3+
mod fixtures;
4+
5+
use std::fs;
6+
use std::process::{Command, Stdio};
7+
use std::sync::Once;
8+
9+
const PAGE_META: &'static str = "tests/fixtures/charset_test_html.html";
10+
static START: Once = Once::new();
11+
12+
#[test]
13+
fn test_html_charset_found() {
14+
// Spawn a single instance of a local http server usable by all tests in this module.
15+
START.call_once(|| {
16+
fixtures::spawn_local_http_server(PAGE_META, false, None);
17+
});
18+
19+
let output_dir = "charset_html_found";
20+
let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit"))
21+
.args(&[fixtures::HTTP_ADDR, "-o", output_dir])
22+
.stdout(Stdio::inherit())
23+
.stderr(Stdio::inherit())
24+
.spawn()
25+
.unwrap();
26+
let status = cmd.wait().unwrap();
27+
assert!(status.success());
28+
let file_path = fs::read_dir(output_dir)
29+
.unwrap()
30+
.next()
31+
.unwrap()
32+
.unwrap()
33+
.path(); // There is only one file in the directory
34+
35+
let data_source = fs::read(PAGE_META).unwrap();
36+
let data_downloaded = fs::read(file_path).unwrap();
37+
38+
assert!(fixtures::do_vecs_match(&data_source, &data_downloaded));
39+
40+
fs::remove_dir_all(output_dir).unwrap();
41+
}

tests/charset_html_not_found.rs

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
//! Test for charset detection/conversion
2+
3+
mod fixtures;
4+
5+
use std::fs;
6+
use std::process::{Command, Stdio};
7+
use std::sync::Once;
8+
9+
const PAGE_NO_META: &'static str = "tests/fixtures/charset_test_html_no_meta.html";
10+
static START: Once = Once::new();
11+
12+
#[test]
13+
fn test_html_charset_not_found() {
14+
// Spawn a single instance of a local http server usable by all tests in this module.
15+
START.call_once(|| {
16+
fixtures::spawn_local_http_server(PAGE_NO_META, false, None);
17+
});
18+
19+
let output_dir = "charset_html_not_found";
20+
let mut cmd = Command::new(env!("CARGO_BIN_EXE_suckit"))
21+
.args(&[fixtures::HTTP_ADDR, "-o", output_dir])
22+
.stdout(Stdio::inherit())
23+
.stderr(Stdio::inherit())
24+
.spawn()
25+
.unwrap();
26+
let status = cmd.wait().unwrap();
27+
assert!(status.success());
28+
let file_path = fs::read_dir(output_dir)
29+
.unwrap()
30+
.next()
31+
.unwrap()
32+
.unwrap()
33+
.path(); // There is only one file in the directory
34+
35+
let data_source = fs::read(PAGE_NO_META).unwrap();
36+
let data_downloaded = fs::read(file_path).unwrap();
37+
38+
assert!(!fixtures::do_vecs_match(&data_source, &data_downloaded));
39+
40+
fs::remove_dir_all(output_dir).unwrap();
41+
}

0 commit comments

Comments
 (0)