Skip to content

Commit 116d325

Browse files
authored
restructure crate (#99)
1 parent 72aa4fe commit 116d325

File tree

12 files changed

+92
-61
lines changed

12 files changed

+92
-61
lines changed

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,9 @@ assert_eq!(new_html, "<title>hello world</title>");
3333
`html5gum` provides multiple kinds of APIs:
3434

3535
* Iterating over tokens as shown above.
36-
* Implementing your own `Emitter` for maximum performance, see [the `custom_emitter.rs` example](examples/custom_emitter.rs).
37-
* A callbacks-based API for a middleground between convenience and performance, see [the `callback_emitter.rs` example](examples/callback_emitter.rs).
38-
* With the `tree-builder` feature, html5gum can be integrated with `html5ever` and `scraper`. See [the `scraper.rs` example](examples/scraper.rs).
36+
* Implementing your own `Emitter` for maximum performance, see [the `custom_emitter.rs` example][examples/custom_emitter.rs].
37+
* A callbacks-based API for a middle ground between convenience and performance, see [the `callback_emitter.rs` example][examples/callback_emitter.rs].
38+
* With the `tree-builder` feature, html5gum can be integrated with `html5ever` and `scraper`. See [the `scraper.rs` example][examples/scraper.rs].
3939

4040
## What a tokenizer does and what it does not do
4141

@@ -60,7 +60,7 @@ test suite](https://github.com/html5lib/html5lib-tests/tree/master/tokenizer). S
6060
With those caveats in mind, `html5gum` can pretty much ~parse~ _tokenize_
6161
anything that browsers can. However, using the experimental `tree-builder`
6262
feature, html5gum can be integrated with `html5ever` and `scraper`. See [the
63-
`scraper.rs` example](examples/scraper.rs).
63+
`scraper.rs` example][examples/scraper.rs].
6464

6565
## Other features
6666

examples/build_tree.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33
///
44
/// You may want to refer to `examples/scraper.rs` for better ergonomics.
55
use html5ever::tree_builder::TreeBuilder;
6-
use html5gum::{Html5everEmitter, IoReader, Tokenizer};
6+
use html5gum::emitters::html5ever::Html5everEmitter;
7+
use html5gum::{IoReader, Tokenizer};
78
use markup5ever_rcdom::{Handle, NodeData, RcDom};
89

910
fn walk(indent: usize, handle: &Handle) {

examples/callback_emitter.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
//! ```text
1111
//! link: foo
1212
//! ```
13-
use html5gum::callbacks::{CallbackEmitter, CallbackEvent};
13+
use html5gum::emitters::callback::{CallbackEmitter, CallbackEvent};
1414
use html5gum::{Emitter, IoReader, Tokenizer};
1515

1616
fn get_emitter() -> impl Emitter<Token = String> {

examples/scraper.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@
1414
use std::io::{stdin, Read};
1515

1616
use html5ever::tree_builder::TreeBuilder;
17-
use html5gum::{Html5everEmitter, IoReader, Tokenizer};
17+
use html5gum::emitters::html5ever::Html5everEmitter;
18+
use html5gum::{IoReader, Tokenizer};
1819
use scraper::{Html, Selector};
1920

2021
use argh::FromArgs;

src/callbacks.rs renamed to src/emitters/callback.rs

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
//! Consume the parsed HTML as a series of events through a callback.
22
//!
3-
//! While using the [DefaultEmitter] provides an easy-to-use API with low performance, and
4-
//! implementing your own [Emitter] brings maximal performance and maximal pain, this is a middle
3+
//! While using the [crate::DefaultEmitter] provides an easy-to-use API with low performance, and
4+
//! implementing your own [crate::Emitter] brings maximal performance and maximal pain, this is a middle
55
//! ground. All strings are borrowed from some intermediate buffer instead of individually
66
//! allocated.
77
//!
88
//! ```
99
//! // Extract all text between span tags, in a naive (but fast) way. Does not handle tags inside of the span. See `examples/` as well.
1010
//! use html5gum::Tokenizer;
11-
//! use html5gum::callbacks::{CallbackEvent, CallbackEmitter};
11+
//! use html5gum::emitters::callback::{CallbackEvent, CallbackEmitter};
1212
//!
1313
//! let mut is_in_span = false;
1414
//! let emitter = CallbackEmitter::new(move |event: CallbackEvent<'_>| -> Option<Vec<u8>> {
@@ -45,7 +45,7 @@ use crate::{naive_next_state, Emitter, Error, State};
4545

4646
/// Events used by [CallbackEmitter].
4747
///
48-
/// This operates at a slightly lower level than [Token], as start tags are split up into multiple
48+
/// This operates at a slightly lower level than [crate::Token], as start tags are split up into multiple
4949
/// events.
5050
#[derive(Debug)]
5151
pub enum CallbackEvent<'a> {
@@ -70,8 +70,8 @@ pub enum CallbackEvent<'a> {
7070
///
7171
/// Things like whitespace and quote handling are taken care of.
7272
///
73-
/// After this event, the start tag may be closed using [CloseStartTag], or another
74-
/// [AttributeName] may follow.
73+
/// After this event, the start tag may be closed using `CloseStartTag`, or another
74+
/// `AttributeName` may follow.
7575
AttributeValue {
7676
/// The value of the attribute.
7777
value: &'a [u8],
@@ -87,7 +87,7 @@ pub enum CallbackEvent<'a> {
8787
self_closing: bool,
8888
},
8989

90-
/// Visit `"</mytag>".
90+
/// Visit `"</mytag>"`.
9191
///
9292
/// Note: Because of strangeness in the HTML spec, attributes may be observed outside of start
9393
/// tags, before this event. It's best to ignore them as they are not valid HTML, but can still
@@ -146,7 +146,7 @@ struct CallbackState<F, T> {
146146
/// type.
147147
pub trait Callback<T> {
148148
/// Perform some action on a parsing event, and, optionally, return a value that can be yielded
149-
/// from the [Tokenizer] iterator.
149+
/// from the [crate::Tokenizer] iterator.
150150
fn handle_event(&mut self, event: CallbackEvent<'_>) -> Option<T>;
151151
}
152152

@@ -207,7 +207,8 @@ struct EmitterState {
207207
doctype_force_quirks: bool,
208208
}
209209

210-
/// The emitter class to pass to [Tokenizer::new_with_emitter]
210+
/// The emitter class to pass to [crate::Tokenizer::new_with_emitter]. Please refer to the
211+
/// module-level documentation on [crate::emitters::callback] for usage.
211212
#[derive(Debug)]
212213
pub struct CallbackEmitter<F, T = Infallible> {
213214
// this struct is only split out so [CallbackState::emit_event] can borrow things concurrently
@@ -232,10 +233,10 @@ impl<F, T> CallbackEmitter<F, T>
232233
where
233234
F: Callback<T>,
234235
{
235-
/// Create a new emitter. See type-level docs to understand basic usage.
236+
/// Create a new emitter.
236237
///
237238
/// The given callback may return optional tokens that then become available through the
238-
/// [Tokenizer]'s iterator. If that's not used, return [Option<Infallible>].
239+
/// [crate::Tokenizer]'s iterator. If that's not used, return `Option<Infallible>`.
239240
pub fn new(callback: F) -> Self {
240241
CallbackEmitter {
241242
callback_state: CallbackState {

src/default_emitter.rs renamed to src/emitters/default.rs

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
1+
//! The default emitter is what powers the simple SAX-like API that you see in the README.
12
use std::collections::btree_map::Entry;
23
use std::collections::BTreeMap;
34
use std::mem::take;
45

56
use crate::{Emitter, Error, HtmlString, State};
67

7-
use crate::callbacks::{Callback, CallbackEmitter, CallbackEvent};
8+
use crate::emitters::callback::{Callback, CallbackEmitter, CallbackEvent};
89

910
#[derive(Debug, Default)]
1011
struct OurCallback {
@@ -71,25 +72,27 @@ impl Callback<Token> for OurCallback {
7172
}
7273
}
7374

74-
/// The default implementation of [`crate::Emitter`], used to produce ("emit") tokens.
75+
/// This is the emitter you implicitly use with [crate::Tokenizer::new]. Refer to the [crate
76+
/// docs](crate) for what usage looks like.
7577
#[derive(Default, Debug)]
7678
pub struct DefaultEmitter {
7779
inner: CallbackEmitter<OurCallback, Token>,
7880
}
7981

8082
impl DefaultEmitter {
81-
/// Whether to use [`naive_next_state`] to switch states automatically.
83+
/// Whether to use [crate::naive_next_state] to switch states automatically.
8284
///
8385
/// The default is off.
8486
pub fn naively_switch_states(&mut self, yes: bool) {
8587
self.inner.naively_switch_states(yes)
8688
}
8789
}
8890

89-
// opaque type around inner emitter
9091
impl Emitter for DefaultEmitter {
9192
type Token = Token;
9293

94+
// opaque type around inner emitter
95+
9396
fn set_last_start_tag(&mut self, last_start_tag: Option<&[u8]>) {
9497
self.inner.set_last_start_tag(last_start_tag)
9598
}
@@ -202,7 +205,7 @@ impl Emitter for DefaultEmitter {
202205
/// An HTML start/open tag, such as `<p>` or `<a>`.
203206
#[derive(Debug, Default, Eq, PartialEq, Clone)]
204207
pub struct StartTag {
205-
/// Whether this tag is self-closing. If it is self-closing, no following [`EndTag`] should be
208+
/// Whether this tag is self-closing. If it is self-closing, no following [EndTag] should be
206209
/// expected.
207210
pub self_closing: bool,
208211

@@ -212,7 +215,7 @@ pub struct StartTag {
212215
/// A mapping for any HTML attributes this start tag may have.
213216
///
214217
/// Duplicate attributes are ignored after the first one as per WHATWG spec. Implement your own
215-
/// [`Emitter`] to tweak this behavior.
218+
/// [crate::Emitter] to tweak this behavior.
216219
pub attributes: BTreeMap<HtmlString, HtmlString>,
217220
}
218221

File renamed without changes.

src/html5ever_emitter.rs renamed to src/emitters/html5ever.rs

Lines changed: 29 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1+
//! See [`examples/scraper.rs`] for usage.
12
use std::convert::Infallible;
23

3-
use crate::callbacks::{Callback, CallbackEmitter, CallbackEvent};
4+
use crate::emitters::callback::{Callback, CallbackEmitter, CallbackEvent};
45
use crate::utils::trace_log;
56
use crate::{Emitter, Error, State};
67

@@ -175,27 +176,30 @@ impl<'a, S: TokenSink> Emitter for Html5everEmitter<'a, S> {
175176
}
176177

177178
fn emit_error(&mut self, error: Error) {
178-
self.emitter_inner.emit_error(error);
179+
self.emitter_inner.emit_error(error)
179180
}
180181

181-
fn pop_token(&mut self) -> Option<Infallible> {
182-
None
182+
fn should_emit_errors(&mut self) -> bool {
183+
self.emitter_inner.should_emit_errors()
183184
}
184185

186+
fn pop_token(&mut self) -> Option<Self::Token> {
187+
self.emitter_inner.pop_token()
188+
}
185189
fn emit_string(&mut self, c: &[u8]) {
186-
self.emitter_inner.emit_string(c);
190+
self.emitter_inner.emit_string(c)
187191
}
188192

189193
fn init_start_tag(&mut self) {
190-
self.emitter_inner.init_start_tag();
194+
self.emitter_inner.init_start_tag()
191195
}
192196

193197
fn init_end_tag(&mut self) {
194-
self.emitter_inner.init_end_tag();
198+
self.emitter_inner.init_end_tag()
195199
}
196200

197201
fn init_comment(&mut self) {
198-
self.emitter_inner.init_comment();
202+
self.emitter_inner.init_comment()
199203
}
200204

201205
fn emit_current_tag(&mut self) -> Option<State> {
@@ -204,63 +208,63 @@ impl<'a, S: TokenSink> Emitter for Html5everEmitter<'a, S> {
204208
}
205209

206210
fn emit_current_comment(&mut self) {
207-
self.emitter_inner.emit_current_comment();
211+
self.emitter_inner.emit_current_comment()
208212
}
209213

210214
fn emit_current_doctype(&mut self) {
211-
self.emitter_inner.emit_current_doctype();
215+
self.emitter_inner.emit_current_doctype()
212216
}
213217

214218
fn set_self_closing(&mut self) {
215-
self.emitter_inner.set_self_closing();
219+
self.emitter_inner.set_self_closing()
216220
}
217221

218222
fn set_force_quirks(&mut self) {
219-
self.emitter_inner.set_force_quirks();
223+
self.emitter_inner.set_force_quirks()
220224
}
221225

222226
fn push_tag_name(&mut self, s: &[u8]) {
223-
self.emitter_inner.push_tag_name(s);
227+
self.emitter_inner.push_tag_name(s)
224228
}
225229

226230
fn push_comment(&mut self, s: &[u8]) {
227-
self.emitter_inner.push_comment(s);
231+
self.emitter_inner.push_comment(s)
228232
}
229233

230234
fn push_doctype_name(&mut self, s: &[u8]) {
231-
self.emitter_inner.push_doctype_name(s);
235+
self.emitter_inner.push_doctype_name(s)
232236
}
233237

234238
fn init_doctype(&mut self) {
235-
self.emitter_inner.init_doctype();
239+
self.emitter_inner.init_doctype()
236240
}
237241

238242
fn init_attribute(&mut self) {
239-
self.emitter_inner.init_attribute();
243+
self.emitter_inner.init_attribute()
240244
}
241245

242246
fn push_attribute_name(&mut self, s: &[u8]) {
243-
self.emitter_inner.push_attribute_name(s);
247+
self.emitter_inner.push_attribute_name(s)
244248
}
245249

246250
fn push_attribute_value(&mut self, s: &[u8]) {
247-
self.emitter_inner.push_attribute_value(s);
251+
self.emitter_inner.push_attribute_value(s)
248252
}
249253

250254
fn set_doctype_public_identifier(&mut self, value: &[u8]) {
251-
self.emitter_inner.set_doctype_public_identifier(value);
255+
self.emitter_inner.set_doctype_public_identifier(value)
252256
}
253257

254258
fn set_doctype_system_identifier(&mut self, value: &[u8]) {
255-
self.emitter_inner.set_doctype_system_identifier(value);
259+
self.emitter_inner.set_doctype_system_identifier(value)
256260
}
257261

258-
fn push_doctype_public_identifier(&mut self, value: &[u8]) {
259-
self.emitter_inner.push_doctype_public_identifier(value);
262+
fn push_doctype_public_identifier(&mut self, s: &[u8]) {
263+
self.emitter_inner.push_doctype_public_identifier(s)
260264
}
261265

262-
fn push_doctype_system_identifier(&mut self, value: &[u8]) {
263-
self.emitter_inner.push_doctype_system_identifier(value);
266+
fn push_doctype_system_identifier(&mut self, s: &[u8]) {
267+
self.emitter_inner.push_doctype_system_identifier(s)
264268
}
265269

266270
fn current_is_appropriate_end_tag_token(&mut self) -> bool {

src/emitters/mod.rs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
//! [Emitter] is a "visitor" on the underlying token stream.
2+
//!
3+
//! When html5gum parses HTML, it (more specifically, the [crate::Tokenizer]) calls into emitters to keep
4+
//! track of state and to produce output.
5+
//!
6+
//! Emitters can yield control to the _caller_ of the tokenizer by emitting tokens in
7+
//! [Emitter::pop_token]. This is what powers the basic API where users just iterate over
8+
//! [crate::Tokenizer] which is an iterator over [default::Token].
9+
//!
10+
//! Most performant implementations don't implement `pop_token` and instead hold internal mutable
11+
//! state, or directly produce side effects.
12+
//!
13+
//! Emitters are "a way to consume parsing results." The following ways are available:
14+
//!
15+
//! * [default::DefaultEmitter], if you don't care about speed and only want convenience.
16+
//! * [callback::CallbackEmitter], if you can deal with some lifetime problems in exchange for way fewer allocations.
17+
//! * Implementing your own [Emitter] for maximum performance and maximum pain.
18+
pub mod callback;
19+
pub mod default;
20+
#[cfg(feature = "html5ever")]
21+
pub mod html5ever;
22+
23+
mod emitter;
24+
25+
pub use emitter::{naive_next_state, Emitter};

src/lib.rs

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
#![doc = concat!("[LICENSE]: ", blob_url_prefix!(), "LICENSE")]
77
#![doc = concat!("[examples/tokenize_with_state_switches.rs]: ", blob_url_prefix!(), "examples/tokenize_with_state_switches.rs")]
88
#![doc = concat!("[examples/custom_emitter.rs]: ", blob_url_prefix!(), "examples/custom_emitter.rs")]
9+
#![doc = concat!("[examples/callback_emitter.rs]: ", blob_url_prefix!(), "examples/callback_emitter.rs")]
10+
#![doc = concat!("[examples/scraper.rs]: ", blob_url_prefix!(), "examples/scraper.rs")]
911
#![doc = include_str!("../README.md")]
1012
//
1113
#![warn(clippy::all)]
@@ -36,14 +38,10 @@ macro_rules! blob_url_prefix {
3638
use blob_url_prefix;
3739

3840
mod arrayvec;
39-
pub mod callbacks;
4041
mod char_validator;
41-
mod default_emitter;
42-
mod emitter;
42+
pub mod emitters;
4343
mod entities;
4444
mod error;
45-
#[cfg(feature = "html5ever")]
46-
mod html5ever_emitter;
4745
mod htmlstring;
4846
mod machine;
4947
mod machine_helper;
@@ -57,13 +55,10 @@ mod utils;
5755
#[doc(hidden)]
5856
pub mod testutils;
5957

60-
pub use default_emitter::{DefaultEmitter, Doctype, EndTag, StartTag, Token};
61-
pub use emitter::{naive_next_state, Emitter};
58+
pub use emitters::default::{DefaultEmitter, Doctype, EndTag, StartTag, Token};
59+
pub use emitters::{naive_next_state, Emitter};
6260
pub use error::Error;
6361
pub use htmlstring::HtmlString;
6462
pub use reader::{IoReader, Readable, Reader, StringReader};
6563
pub use state::State;
6664
pub use tokenizer::{InfallibleTokenizer, Tokenizer};
67-
68-
#[cfg(feature = "html5ever")]
69-
pub use html5ever_emitter::Html5everEmitter;

0 commit comments

Comments
 (0)