Skip to content

Commit 0530ebd

Browse files
authored
add html5ever convenience functions (#101)
1 parent a678571 commit 0530ebd

File tree

2 files changed

+120
-16
lines changed

2 files changed

+120
-16
lines changed

examples/scraper.rs

Lines changed: 7 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,9 @@
1313
/// Requires the tree-builder feature.
1414
use std::io::{stdin, Read};
1515

16-
use html5ever::tree_builder::TreeBuilder;
17-
use html5gum::emitters::html5ever::Html5everEmitter;
18-
use html5gum::{IoReader, Tokenizer};
19-
use scraper::{Html, Selector};
20-
2116
use argh::FromArgs;
17+
use html5gum::emitters::html5ever::parse_document;
18+
use scraper::{Html, Selector};
2219

2320
/// Read some HTML from stdin and parse it according to the given selector.
2421
#[derive(FromArgs)]
@@ -37,20 +34,15 @@ struct Cli {
3734
fn main() {
3835
let cli: Cli = argh::from_env();
3936

37+
let mut input = String::new();
38+
stdin().read_to_string(&mut input).unwrap();
39+
4040
let dom = if cli.use_html5ever {
41-
let mut input = String::new();
42-
stdin().read_to_string(&mut input).unwrap();
4341
Html::parse_document(&input)
4442
} else {
45-
// parsing the document
4643
let dom = Html::new_document();
47-
let mut tree_builder = TreeBuilder::new(dom, Default::default());
48-
let token_emitter = Html5everEmitter::new(&mut tree_builder);
49-
let reader = IoReader::new(stdin().lock());
50-
let tokenizer = Tokenizer::new_with_emitter(reader, token_emitter);
51-
52-
tokenizer.finish().unwrap();
53-
tree_builder.sink
44+
let Ok(dom) = parse_document(&input, dom, Default::default());
45+
dom
5446
};
5547

5648
let selector = Selector::parse(&cli.selector).unwrap();

src/emitters/html5ever.rs

Lines changed: 113 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,15 @@ use std::convert::Infallible;
33

44
use crate::emitters::callback::{Callback, CallbackEmitter, CallbackEvent};
55
use crate::utils::trace_log;
6-
use crate::{Emitter, Error, State};
6+
use crate::{Emitter, Error, Readable, Reader, State, Tokenizer};
77

8+
use html5ever::interface::{create_element, TreeSink};
9+
use html5ever::tokenizer::states::State as Html5everState;
810
use html5ever::tokenizer::{
911
states::RawKind, Doctype, Tag, TagKind, Token as Html5everToken, TokenSink, TokenSinkResult,
1012
};
13+
use html5ever::tree_builder::TreeBuilder;
14+
use html5ever::ParseOpts;
1115
use html5ever::{Attribute, QualName};
1216

1317
const BOGUS_LINENO: u64 = 1;
@@ -278,3 +282,111 @@ impl<'a, S: TokenSink> Emitter for Html5everEmitter<'a, S> {
278282
.adjusted_current_node_present_but_not_in_html_namespace()
279283
}
280284
}
285+
286+
fn map_tokenizer_state(input: Html5everState) -> State {
287+
match input {
288+
Html5everState::Data => State::Data,
289+
Html5everState::Plaintext => State::PlainText,
290+
Html5everState::RawData(RawKind::Rcdata) => State::RcData,
291+
Html5everState::RawData(RawKind::Rawtext) => State::RawText,
292+
Html5everState::RawData(RawKind::ScriptData) => State::ScriptData,
293+
x => todo!("{:?}", x),
294+
}
295+
}
296+
297+
/// Parse an HTML fragment
298+
///
299+
/// This is a convenience function for using [Html5everEmitter] together with html5ever. It is
300+
/// equivalent to the same functions in [html5ever::driver].
301+
///
302+
/// ```
303+
/// use html5ever::{local_name, QualName, ns, namespace_url}; // extern crate html5ever;
304+
/// use scraper::Html; // extern crate scraper;
305+
///
306+
/// let input = "<h1>hello world</h1>";
307+
///
308+
/// // equivalent to `Html::parse_fragment`
309+
/// let dom = Html::new_fragment();
310+
/// let Ok(dom) = html5gum::emitters::html5ever::parse_fragment(
311+
/// input,
312+
/// dom,
313+
/// Default::default(),
314+
/// QualName::new(None, ns!(html), local_name!("body")),
315+
/// Vec::new()
316+
/// );
317+
/// ```
318+
pub fn parse_fragment<'a, R, Sink>(
319+
input: R,
320+
mut sink: Sink,
321+
opts: ParseOpts,
322+
context_name: QualName,
323+
context_attrs: Vec<Attribute>,
324+
) -> Result<Sink, <R::Reader as Reader>::Error>
325+
where
326+
R: Readable<'a>,
327+
Sink: TreeSink,
328+
{
329+
let context_elem = create_element(&mut sink, context_name, context_attrs);
330+
parse_fragment_for_element(input, sink, opts, context_elem, None)
331+
}
332+
333+
/// Like `parse_fragment`, but with an existing context element
334+
/// and optionally a form element.
335+
///
336+
/// This is a convenience function for using [Html5everEmitter] together with html5ever. It is
337+
/// equivalent to the same functions in [html5ever::driver].
338+
pub fn parse_fragment_for_element<'a, R, Sink>(
339+
input: R,
340+
sink: Sink,
341+
opts: ParseOpts,
342+
context_element: Sink::Handle,
343+
form_element: Option<Sink::Handle>,
344+
) -> Result<Sink, <R::Reader as Reader>::Error>
345+
where
346+
R: Readable<'a>,
347+
Sink: TreeSink,
348+
{
349+
let mut tree_builder =
350+
TreeBuilder::new_for_fragment(sink, context_element, form_element, opts.tree_builder);
351+
352+
let initial_state = map_tokenizer_state(tree_builder.tokenizer_state_for_context_elem());
353+
let token_emitter = Html5everEmitter::new(&mut tree_builder);
354+
let mut tokenizer = Tokenizer::new_with_emitter(input, token_emitter);
355+
tokenizer.set_state(initial_state);
356+
tokenizer.finish()?;
357+
Ok(tree_builder.sink)
358+
}
359+
360+
/// Parse an HTML document.
361+
///
362+
/// This is a convenience function for using [Html5everEmitter] together with html5ever. It is
363+
/// equivalent to the same functions in [html5ever::driver].
364+
///
365+
/// ```rust
366+
/// use scraper::Html; // extern crate scraper;
367+
///
368+
/// let input = "<h1>hello world</h1>";
369+
///
370+
/// // equivalent to `Html::parse_document`
371+
/// let dom = Html::new_document();
372+
/// let Ok(dom) = html5gum::emitters::html5ever::parse_document(
373+
/// input,
374+
/// dom,
375+
/// Default::default()
376+
/// );
377+
/// ```
378+
pub fn parse_document<'a, R, Sink>(
379+
input: R,
380+
sink: Sink,
381+
opts: ParseOpts,
382+
) -> Result<Sink, <R::Reader as Reader>::Error>
383+
where
384+
R: Readable<'a>,
385+
Sink: TreeSink,
386+
{
387+
let mut tree_builder = TreeBuilder::new(sink, opts.tree_builder);
388+
let token_emitter = Html5everEmitter::new(&mut tree_builder);
389+
let tokenizer = Tokenizer::new_with_emitter(input, token_emitter);
390+
tokenizer.finish()?;
391+
Ok(tree_builder.sink)
392+
}

0 commit comments

Comments
 (0)