Skip to content

Commit 77a52ca

Browse files
author
bors-servo
authored
Auto merge of #328 - derekdreery:docs, r=jdm
Assorted documentation I'm just trying to learn how this lib works, and I'm looking at source to work it out. Hopefully if I doc as I go, others that come after me will benefit :)
2 parents e37aed4 + de94da4 commit 77a52ca

File tree

7 files changed

+185
-19
lines changed

7 files changed

+185
-19
lines changed

markup5ever/build.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,18 @@ fn named_entities_to_phf(from: &Path, to: &Path) {
9898
}
9999

100100
let mut file = File::create(to).unwrap();
101+
writeln!(&mut file, r#"
102+
/// A map of entity names to their codepoints. The second codepoint will
103+
/// be 0 if the entity contains a single codepoint. Entities have their preceding '&' removed.
104+
///
105+
/// # Examples
106+
///
107+
/// ```
108+
/// use markup5ever::data::NAMED_ENTITIES;
109+
///
110+
/// assert_eq!(NAMED_ENTITIES.get("gt;").unwrap(), &(62, 0));
111+
/// ```
112+
"#).unwrap();
101113
write!(&mut file, "pub static NAMED_ENTITIES: Map<&'static str, (u32, u32)> = ").unwrap();
102114
phf_map.build(&mut file).unwrap();
103115
write!(&mut file, ";\n").unwrap();

markup5ever/data/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
77
// option. This file may not be copied, modified, or distributed
88
// except according to those terms.
9+
//! Data that is known at compile-time and hard-coded into the binary.
910
use phf::Map;
1011

1112
/// The spec replaces most characters in the ISO-2022 C1 control code range

markup5ever/lib.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,20 @@ extern crate string_cache;
1313
extern crate phf;
1414
pub extern crate tendril;
1515

16+
/// Create a [`SmallCharSet`], with each space-separated number stored in the set.
17+
///
18+
/// # Examples
19+
///
20+
/// ```
21+
/// # #[macro_use] extern crate markup5ever;
22+
/// # fn main() {
23+
/// let set = small_char_set!(12 54 42);
24+
/// assert_eq!(set.bits,
25+
/// 0b00000000_01000000_00000100_00000000_00000000_00000000_00010000_00000000);
26+
/// # }
27+
/// ```
28+
///
29+
/// [`SmallCharSet`]: struct.SmallCharSet.html
1630
#[macro_export]
1731
macro_rules! small_char_set ( ($($e:expr)+) => (
1832
$ crate ::SmallCharSet {

markup5ever/rcdom.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ pub struct Node {
8484
}
8585

8686
impl Node {
87+
/// Create a new node from its contents
8788
fn new(data: NodeData) -> Rc<Self> {
8889
Rc::new(Node {
8990
data: data,
@@ -99,12 +100,15 @@ pub type Handle = Rc<Node>;
99100
/// Weak reference to a DOM node, used for parent pointers.
100101
pub type WeakHandle = Weak<Node>;
101102

103+
/// Append a parentless node to another node's children
102104
fn append(new_parent: &Handle, child: Handle) {
103105
let previous_parent = child.parent.replace(Some(Rc::downgrade(new_parent)));
106+
// Invariant: child cannot have existing parent
104107
assert!(previous_parent.is_none());
105108
new_parent.children.borrow_mut().push(child);
106109
}
107110

111+
/// If the node has a parent, get it and this node's position in its children
108112
fn get_parent_and_index(target: &Handle) -> Option<(Handle, usize)> {
109113
if let Some(weak) = target.parent.take() {
110114
let parent = weak.upgrade().expect("dangling weak pointer");

markup5ever/serialize.rs

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,35 +6,68 @@
66
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
77
// option. This file may not be copied, modified, or distributed
88
// except according to those terms.
9+
//! Traits for serializing elements. The serializer expects the data to be xml-like (with a name,
10+
//! and optional children, attrs, text, comments, doctypes, and [processing instructions]). It uses
11+
//! the visitor pattern, where the serializer and the serializable objects are decoupled and
12+
//! implement their own traits.
13+
//!
14+
//! [processing instructions]: https://en.wikipedia.org/wiki/Processing_Instruction
915
1016
use QualName;
1117
use std::io;
1218

1319
//§ serializing-html-fragments
20+
/// Used as a parameter to `serialize`, telling it if we want to skip the parent.
1421
#[derive(Clone, PartialEq)]
1522
pub enum TraversalScope {
23+
/// Include the parent node when serializing.
1624
IncludeNode,
25+
/// Only serialize the children of the node, treating any provided qualified name as the
26+
/// parent while serializing.
27+
///
28+
/// This is used in the implementation of [`html5ever::serialize::serialize`]
29+
///
30+
/// [`html5ever::serialize::serialize`]: ../../html5ever/serialize/fn.serialize.html
1731
ChildrenOnly(Option<QualName>)
1832
}
1933

34+
/// Types that can be serialized (according to the xml-like scheme in `Serializer`) implement this
35+
/// trait.
2036
pub trait Serialize {
37+
/// Take the serializer and call its methods to serialize this type. The type will dictate
38+
/// which methods are called and with what parameters.
2139
fn serialize<S>(&self, serializer: &mut S, traversal_scope: TraversalScope) -> io::Result<()>
2240
where S: Serializer;
2341
}
2442

43+
/// Types that are capable of serializing implement this trait
2544
pub trait Serializer {
45+
/// Serialize the start of an element, for example `<div class="test">`.
2646
fn start_elem<'a, AttrIter>(&mut self, name: QualName, attrs: AttrIter) -> io::Result<()>
2747
where AttrIter: Iterator<Item=AttrRef<'a>>;
2848

49+
/// Serialize the end of an element, for example `</div>`.
2950
fn end_elem(&mut self, name: QualName) -> io::Result<()>;
3051

52+
/// Serialize a plain text node.
3153
fn write_text(&mut self, text: &str) -> io::Result<()>;
3254

55+
/// Serialize a comment node, for example `<!-- comment -->`.
3356
fn write_comment(&mut self, text: &str) -> io::Result<()>;
3457

58+
/// Serialize a doctype node, for example `<!doctype html>`.
3559
fn write_doctype(&mut self, name: &str) -> io::Result<()>;
3660

61+
/// Serialize a processing instruction node, for example
62+
/// `<?xml-stylesheet type="text/xsl" href="style.xsl"?>`.
3763
fn write_processing_instruction(&mut self, target: &str, data: &str) -> io::Result<()>;
3864
}
3965

66+
/// A type alias for an attribute name and value (e.g. the `class="test"` in `<div class="test">`
67+
/// is represented as `(<QualName of type class>, "test")`).
68+
///
69+
/// This is used in [`Serializer::start_elem`] where the value being serialized must supply an
70+
/// iterator over the attributes for the current element
71+
///
72+
/// [`Serializer::start_elem`]: trait.Serializer.html#tymethod.start_elem
4073
pub type AttrRef<'a> = (&'a QualName, &'a str);

markup5ever/util/buffer_queue.rs

Lines changed: 86 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,17 @@
77
// option. This file may not be copied, modified, or distributed
88
// except according to those terms.
99

10+
//! The [`BufferQueue`] struct and helper types.
11+
//!
12+
//! This type is designed for the efficient parsing of string data, especially where many
13+
//! significant characters are from the ascii range 0-63. This includes, for example, important
14+
//! characters in xml/html parsing.
15+
//!
16+
//! Good and predictable performance is achieved by avoiding allocation where possible (a.k.a. zero
17+
//! copy).
18+
//!
19+
//! [`BufferQueue`]: struct.BufferQueue.html
20+
1021

1122
use std::collections::VecDeque;
1223

@@ -15,39 +26,53 @@ use tendril::StrTendril;
1526
pub use self::SetResult::{FromSet, NotFromSet};
1627
use util::smallcharset::SmallCharSet;
1728

18-
/// Result from `pop_except_from`.
29+
/// Result from [`pop_except_from`] containing either a character from a [`SmallCharSet`], or a
30+
/// string buffer of characters not from the set.
31+
///
32+
/// [`pop_except_from`]: struct.BufferQueue.html#method.pop_except_from
33+
/// [`SmallCharSet`]: ../struct.SmallCharSet.html
1934
#[derive(PartialEq, Eq, Debug)]
2035
pub enum SetResult {
36+
/// A character from the `SmallCharSet`.
2137
FromSet(char),
38+
/// A string buffer containing no characters from the `SmallCharSet`.
2239
NotFromSet(StrTendril),
2340
}
2441

25-
/// A queue of owned string buffers, which supports incrementally
26-
/// consuming characters.
42+
/// A queue of owned string buffers, which supports incrementally consuming characters.
43+
///
44+
/// Internally it uses [`VecDeque`] and has the same complexity properties.
45+
///
46+
/// [`VecDeque`]: https://doc.rust-lang.org/std/collections/struct.VecDeque.html
2747
pub struct BufferQueue {
2848
/// Buffers to process.
2949
buffers: VecDeque<StrTendril>,
3050
}
3151

3252
impl BufferQueue {
3353
/// Create an empty BufferQueue.
54+
#[inline]
3455
pub fn new() -> BufferQueue {
3556
BufferQueue {
3657
buffers: VecDeque::with_capacity(16),
3758
}
3859
}
3960

4061
/// Returns whether the queue is empty.
62+
#[inline]
4163
pub fn is_empty(&self) -> bool {
4264
self.buffers.is_empty()
4365
}
4466

45-
/// Get the tendril at the beginning of the queue.
67+
/// Get the buffer at the beginning of the queue.
68+
#[inline]
4669
pub fn pop_front(&mut self) -> Option<StrTendril> {
4770
self.buffers.pop_front()
4871
}
4972

5073
/// Add a buffer to the beginning of the queue.
74+
///
75+
/// If the buffer is empty, it will be skipped.
5176
pub fn push_front(&mut self, buf: StrTendril) {
5277
if buf.len32() == 0 {
5378
return;
@@ -56,20 +81,25 @@ impl BufferQueue {
5681
}
5782

5883
/// Add a buffer to the end of the queue.
84+
///
85+
/// If the buffer is empty, it will be skipped.
5986
pub fn push_back(&mut self, buf: StrTendril) {
6087
if buf.len32() == 0 {
6188
return;
6289
}
6390
self.buffers.push_back(buf);
6491
}
6592

66-
/// Look at the next available character, if any.
93+
/// Look at the next available character without removing it, if the queue is not empty.
6794
pub fn peek(&self) -> Option<char> {
68-
// Invariant: all buffers in the queue are non-empty.
95+
debug_assert!(self.buffers.iter().skip_while(|el| el.len32() != 0).next().is_none(),
96+
"invariant \"all buffers in the queue are non-empty\" failed");
6997
self.buffers.front().map(|b| b.chars().next().unwrap())
7098
}
7199

72-
/// Get the next character, if one is available.
100+
/// Get the next character if one is available, removing it from the queue.
101+
///
102+
/// This function manages the buffers, removing them as they become empty.
73103
pub fn next(&mut self) -> Option<char> {
74104
let (result, now_empty) = match self.buffers.front_mut() {
75105
None => (None, false),
@@ -87,9 +117,32 @@ impl BufferQueue {
87117
}
88118

89119
/// Pops and returns either a single character from the given set, or
90-
/// a `StrTendril` of characters none of which are in the set. The set
91-
/// is represented as a bitmask and so can only contain the first 64
92-
/// ASCII characters.
120+
/// a buffer of characters none of which are in the set.
121+
///
122+
/// # Examples
123+
///
124+
/// ```
125+
/// # #[macro_use] extern crate markup5ever;
126+
/// # #[macro_use] extern crate tendril;
127+
/// # fn main() {
128+
/// use markup5ever::buffer_queue::{BufferQueue, SetResult};
129+
///
130+
/// let mut queue = BufferQueue::new();
131+
/// queue.push_back(format_tendril!(r#"<some_tag attr="text">SomeText</some_tag>"#));
132+
/// let set = small_char_set!(b'<' b'>' b' ' b'=' b'"' b'/');
133+
/// let tag = format_tendril!("some_tag");
134+
/// let attr = format_tendril!("attr");
135+
/// let attr_val = format_tendril!("text");
136+
/// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('<')));
137+
/// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(tag)));
138+
/// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet(' ')));
139+
/// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(attr)));
140+
/// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('=')));
141+
/// assert_eq!(queue.pop_except_from(set), Some(SetResult::FromSet('"')));
142+
/// assert_eq!(queue.pop_except_from(set), Some(SetResult::NotFromSet(attr_val)));
143+
/// // ...
144+
/// # }
145+
/// ```
93146
pub fn pop_except_from(&mut self, set: SmallCharSet) -> Option<SetResult> {
94147
let (result, now_empty) = match self.buffers.front_mut() {
95148
None => (None, false),
@@ -117,12 +170,29 @@ impl BufferQueue {
117170
result
118171
}
119172

120-
// Check if the next characters are an ASCII case-insensitive match for
121-
// `pat`, which must be non-empty.
122-
//
123-
// If so, consume them and return Some(true).
124-
// If they do not match, return Some(false).
125-
// If not enough characters are available to know, return None.
173+
/// Consume bytes matching the pattern, using a custom comparison function `eq`.
174+
///
175+
/// Returns `Some(true)` if there is a match, `Some(false)` if there is no match, or `None` if
176+
/// it wasn't possible to know (more data is needed).
177+
///
178+
/// The custom comparison function is used elsewhere to compare ascii-case-insensitively.
179+
///
180+
/// # Examples
181+
///
182+
/// ```
183+
/// # extern crate markup5ever;
184+
/// # #[macro_use] extern crate tendril;
185+
/// # fn main() {
186+
/// use markup5ever::buffer_queue::{BufferQueue};
187+
///
188+
/// let mut queue = BufferQueue::new();
189+
/// queue.push_back(format_tendril!("testtext"));
190+
/// let test_str = "test";
191+
/// assert_eq!(queue.eat("test", |&a, &b| a == b), Some(true));
192+
/// assert_eq!(queue.eat("text", |&a, &b| a == b), Some(true));
193+
/// assert!(queue.is_empty());
194+
/// # }
195+
/// ```
126196
pub fn eat<F: Fn(&u8, &u8) -> bool>(&mut self, pat: &str, eq: F) -> Option<bool> {
127197
let mut buffers_exhausted = 0;
128198
let mut consumed_from_last = 0;

markup5ever/util/smallcharset.rs

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,23 +7,55 @@
77
// option. This file may not be copied, modified, or distributed
88
// except according to those terms.
99

10+
//! This module contains a single struct [`SmallCharSet`]. See its documentation for details.
11+
//!
12+
//! [`SmallCharSet`]: struct.SmallCharSet.html
1013
1114

1215
/// Represents a set of "small characters", those with Unicode scalar
1316
/// values less than 64.
17+
///
18+
/// This is stored as a bitmap, with 1 bit for each value.
19+
#[derive(Debug, Eq, PartialEq, Clone, Copy, Hash)]
1420
pub struct SmallCharSet {
1521
pub bits: u64,
1622
}
1723

1824
impl SmallCharSet {
25+
/// Checks whether a character (u8 value below 64) is stored in the SmallCharSet.
26+
///
27+
/// # Examples
28+
///
29+
/// ```ignore
30+
/// # use markup5ever::SmallCharSet;
31+
/// let set = SmallCharSet {
32+
/// bits: 0b00000000_01000000_00000100_00000000_00000000_00000000_00010000_00000000
33+
/// };
34+
/// assert!(set.contains(54));
35+
/// assert!(set.contains(b'6')); // `b'6'` is the same as 54u8
36+
/// ```
1937
#[inline]
2038
fn contains(&self, n: u8) -> bool {
2139
0 != (self.bits & (1 << (n as usize)))
2240
}
2341

24-
/// Count the number of bytes of characters at the beginning
25-
/// of `buf` which are not in the set.
26-
/// See `tokenizer::buffer_queue::pop_except_from`.
42+
/// Count the number of bytes of characters at the beginning of `buf` which are not in the set.
43+
///
44+
/// This functionality is used in [`BufferQueue::pop_except_from`].
45+
///
46+
/// # Examples
47+
///
48+
/// ```
49+
/// # #[macro_use] extern crate markup5ever;
50+
/// # fn main() {
51+
/// let set = small_char_set!(48 49 50); // '0' '1' '2'
52+
/// // `test` is 4 bytes, 😁 is 4 bytes (one char), then we meet a character in the set
53+
/// let test_str = "test😁01232afd";
54+
/// assert_eq!(set.nonmember_prefix_len(test_str), 8);
55+
/// # }
56+
/// ```
57+
///
58+
/// [`BufferQueue::pop_except_from`]: buffer_queue/struct.BufferQueue.html#method.pop_except_from
2759
pub fn nonmember_prefix_len(&self, buf: &str) -> u32 {
2860
let mut n = 0;
2961
for b in buf.bytes() {

0 commit comments

Comments
 (0)