Skip to content

Commit 8c7cbc2

Browse files
committed
Refactor parser for improved performance and clarity
1 parent 4124275 commit 8c7cbc2

File tree

5 files changed

+122
-120
lines changed

5 files changed

+122
-120
lines changed

src/parse.rs

Lines changed: 80 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
use crate::{
22
imp::{AuthMeta, Constraints, HostMeta, Meta},
3-
pct_enc::{self, table::*, Table},
3+
pct_enc::{self, encoder::*, Encoder},
44
utf8,
55
};
66
use core::{
7+
marker::PhantomData,
78
num::NonZeroUsize,
89
ops::{Deref, DerefMut},
910
str,
@@ -110,7 +111,7 @@ impl DerefMut for Parser<'_> {
110111
}
111112
}
112113

113-
#[derive(Clone, Copy)]
114+
#[derive(Clone, Copy, PartialEq, Eq)]
114115
enum PathKind {
115116
General,
116117
AbEmpty,
@@ -152,15 +153,8 @@ impl<'a> Reader<'a> {
152153
debug_assert!(self.pos <= self.len());
153154
}
154155

155-
// Returns `true` iff any byte is read.
156-
fn read(&mut self, table: Table) -> Result<bool> {
157-
let start = self.pos;
158-
self.read_with(table, |_, _| {})?;
159-
Ok(self.pos > start)
160-
}
161-
162156
#[cold]
163-
fn invalid_pct(&self) -> Result<()> {
157+
fn invalid_pct(&self) -> Result<bool> {
164158
let mut i = self.pos + 1;
165159
if let Some(&x) = self.bytes.get(i) {
166160
if pct_enc::is_hexdig(x) {
@@ -170,7 +164,18 @@ impl<'a> Reader<'a> {
170164
err!(i, UnexpectedCharOrEnd);
171165
}
172166

173-
fn read_with(&mut self, table: Table, mut f: impl FnMut(usize, u32)) -> Result<()> {
167+
#[inline(always)]
168+
fn read<E: Encoder>(&mut self) -> Result<bool> {
169+
struct Helper<E: Encoder> {
170+
_marker: PhantomData<E>,
171+
}
172+
173+
impl<E: Encoder> Helper<E> {
174+
const ALLOWS_PCT_ENCODED: bool = E::TABLE.allows_pct_encoded();
175+
const ALLOWS_NON_ASCII: bool = E::TABLE.allows_non_ascii();
176+
}
177+
178+
let start = self.pos;
174179
let mut i = self.pos;
175180

176181
macro_rules! do_loop {
@@ -187,40 +192,34 @@ impl<'a> Reader<'a> {
187192
i += 3;
188193
} else if $allow_non_ascii {
189194
let (x, len) = utf8::next_code_point(self.bytes, i);
190-
if !table.allows_code_point(x) {
195+
if !E::TABLE.allows_code_point(x) {
191196
break;
192197
}
193-
f(i, x);
194198
i += len;
195199
} else {
196-
if !table.allows_ascii(x) {
200+
if !E::TABLE.allows_ascii(x) {
197201
break;
198202
}
199-
f(i, x as u32);
200203
i += 1;
201204
}
202205
}
203206
};
204207
}
205208

206-
// This expansion alone doesn't help much, but combined with
207-
// `#[inline(always)]` on `utf8::next_code_point`,
208-
// it improves performance significantly for non-ASCII case.
209-
if table.allows_pct_encoded() {
210-
if table.allows_non_ascii() {
209+
if Helper::<E>::ALLOWS_PCT_ENCODED {
210+
if Helper::<E>::ALLOWS_NON_ASCII {
211211
do_loop!(true, true);
212212
} else {
213213
do_loop!(true, false);
214214
}
215-
} else if table.allows_non_ascii() {
216-
do_loop!(false, true);
217215
} else {
216+
assert!(!Helper::<E>::ALLOWS_NON_ASCII);
218217
do_loop!(false, false);
219218
}
220219

221220
// INVARIANT: `i` is non-decreasing.
222221
self.pos = i;
223-
Ok(())
222+
Ok(self.pos > start)
224223
}
225224

226225
fn read_str(&mut self, s: &str) -> bool {
@@ -411,7 +410,7 @@ impl<'a> Reader<'a> {
411410
if let Some(b'v' | b'V') = self.peek(0) {
412411
// INVARIANT: Skipping "v" or "V" is fine.
413412
self.skip(1);
414-
if self.read(HEXDIG)? && self.read_str(".") && self.read(IPV_FUTURE)? {
413+
if self.read::<Hexdig>()? && self.read_str(".") && self.read::<IpvFuture>()? {
415414
return Ok(());
416415
}
417416
}
@@ -436,23 +435,25 @@ pub(crate) fn parse_v6(bytes: &[u8]) -> [u16; 8] {
436435
}
437436

438437
impl Parser<'_> {
439-
fn select<T>(&self, for_uri: T, for_iri: T) -> T {
438+
#[inline(always)]
439+
fn select_read<U: Encoder, I: Encoder>(&mut self) -> Result<bool> {
440440
if self.constraints.ascii_only {
441-
for_uri
441+
self.read::<U>()
442442
} else {
443-
for_iri
443+
self.read::<I>()
444444
}
445445
}
446446

447447
fn read_v4_or_reg_name(&mut self) -> Result<HostMeta> {
448-
let reg_name_table = self.select(REG_NAME, IREG_NAME);
449-
Ok(match (self.read_v4(), self.read(reg_name_table)?) {
450-
(Some(_addr), false) => HostMeta::Ipv4(
451-
#[cfg(feature = "net")]
452-
_addr.into(),
453-
),
454-
_ => HostMeta::RegName,
455-
})
448+
Ok(
449+
match (self.read_v4(), self.select_read::<RegName, IRegName>()?) {
450+
(Some(_addr), false) => HostMeta::Ipv4(
451+
#[cfg(feature = "net")]
452+
_addr.into(),
453+
),
454+
_ => HostMeta::RegName,
455+
},
456+
)
456457
}
457458

458459
fn read_host(&mut self) -> Result<HostMeta> {
@@ -463,7 +464,7 @@ impl Parser<'_> {
463464
}
464465

465466
fn parse_from_scheme(&mut self) -> Result<()> {
466-
self.read(SCHEME)?;
467+
self.read::<Scheme>()?;
467468

468469
if self.peek(0) == Some(b':') {
469470
// Scheme starts with a letter.
@@ -493,110 +494,78 @@ impl Parser<'_> {
493494
}
494495

495496
fn parse_from_authority(&mut self) -> Result<()> {
496-
let host;
497+
// We first try to read host and port, noting that
498+
// a reg-name or IPv4address can also be part of userinfo.
499+
let host_start = self.pos;
500+
let host_meta = self.read_host()?;
501+
502+
let mut auth_meta = AuthMeta {
503+
host_bounds: (host_start, self.pos),
504+
host_meta,
505+
};
497506

498-
let mut colon_cnt = 0;
499-
let mut colon_idx = 0;
507+
self.read_port();
500508

501-
let auth_start = self.pos;
509+
if let HostMeta::Ipv4(..) | HostMeta::RegName = host_meta {
510+
let userinfo_read = self.select_read::<Userinfo, IUserinfo>()?;
502511

503-
let userinfo_table = self.select(USERINFO, IUSERINFO);
504-
// `userinfo_table` contains userinfo, registered name, ':', and port.
505-
self.read_with(userinfo_table, |i, x| {
506-
if x == ':' as u32 {
507-
colon_cnt += 1;
508-
colon_idx = i;
509-
}
510-
})?;
512+
if self.peek(0) == Some(b'@') {
513+
// Userinfo present.
514+
// INVARIANT: Skipping "@" is fine.
515+
self.skip(1);
511516

512-
if self.peek(0) == Some(b'@') {
513-
// Userinfo present.
514-
// INVARIANT: Skipping "@" is fine.
515-
self.skip(1);
517+
let host_start = self.pos;
518+
let host_meta = self.read_host()?;
516519

517-
let host_start = self.pos;
518-
let meta = self.read_host()?;
519-
host = (host_start, self.pos, meta);
520+
auth_meta = AuthMeta {
521+
host_bounds: (host_start, self.pos),
522+
host_meta,
523+
};
520524

521-
self.read_port();
522-
} else if self.pos == auth_start {
523-
// Nothing read. We're now at the start of an IP literal or the path.
524-
if let Some(meta) = self.read_ip_literal()? {
525-
host = (auth_start, self.pos, meta);
526525
self.read_port();
527-
} else {
528-
// Empty authority.
529-
host = (self.pos, self.pos, HostMeta::RegName);
526+
} else if userinfo_read {
527+
err!(self.pos, UnexpectedCharOrEnd);
530528
}
531-
} else {
532-
// The whole authority read. Try to parse the host and port.
533-
let host_end = match colon_cnt {
534-
// All host.
535-
0 => self.pos,
536-
// Host and port.
537-
1 => {
538-
for i in colon_idx + 1..self.pos {
539-
if !self.bytes[i].is_ascii_digit() {
540-
err!(i, UnexpectedCharOrEnd);
541-
}
542-
}
543-
colon_idx
544-
}
545-
// Multiple colons.
546-
_ => err!(colon_idx, UnexpectedCharOrEnd),
547-
};
548-
549-
let meta = parse_v4_or_reg_name(&self.bytes[auth_start..host_end]);
550-
host = (auth_start, host_end, meta);
551529
}
552530

553-
self.out.auth_meta = Some(AuthMeta {
554-
host_bounds: (host.0, host.1),
555-
host_meta: host.2,
556-
});
531+
self.out.auth_meta = Some(auth_meta);
557532
self.parse_from_path(PathKind::AbEmpty)
558533
}
559534

560535
fn parse_from_path(&mut self, kind: PathKind) -> Result<()> {
561-
let path_table = self.select(PATH, IPATH);
562-
self.out.path_bounds = match kind {
563-
PathKind::General => {
564-
let start = self.pos;
565-
self.read(path_table)?;
566-
(start, self.pos)
567-
}
568-
PathKind::AbEmpty => {
569-
let start = self.pos;
570-
// Either empty or starting with '/'.
571-
if self.read(path_table)? && self.bytes[start] != b'/' {
572-
err!(start, UnexpectedCharOrEnd);
573-
}
574-
(start, self.pos)
575-
}
536+
let path_start;
537+
538+
match kind {
539+
PathKind::General | PathKind::AbEmpty => path_start = self.pos,
576540
PathKind::ContinuedNoScheme => {
577-
let segment_table = self.select(SEGMENT_NZ_NC, ISEGMENT_NZ_NC);
578-
self.read(segment_table)?;
541+
path_start = 0;
542+
543+
self.select_read::<SegmentNzNc, ISegmentNzNc>()?;
579544

580545
if self.peek(0) == Some(b':') {
581546
// In a relative reference, the first path
582547
// segment cannot contain a colon character.
583548
err!(self.pos, UnexpectedCharOrEnd);
584549
}
585-
586-
self.read(path_table)?;
587-
(0, self.pos)
588550
}
589551
};
590552

553+
if self.select_read::<Path, IPath>()?
554+
&& kind == PathKind::AbEmpty
555+
&& self.bytes[path_start] != b'/'
556+
{
557+
err!(path_start, UnexpectedCharOrEnd);
558+
}
559+
560+
self.out.path_bounds = (path_start, self.pos);
561+
591562
if self.read_str("?") {
592-
let query_table = self.select(QUERY, IQUERY);
593-
self.read(query_table)?;
563+
self.select_read::<Query, IQuery>()?;
594564
self.out.query_end = NonZeroUsize::new(self.pos);
595565
}
596566

597567
if self.read_str("#") {
598-
let fragment_table = self.select(FRAGMENT, IFRAGMENT);
599-
self.read(fragment_table)?;
568+
self.select_read::<Fragment, IFragment>()?;
600569
}
601570

602571
if self.has_remaining() {

src/pct_enc/encoder.rs

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,3 +122,35 @@ pub struct IData(());
122122
impl Encoder for IData {
123123
const TABLE: Table = UNRESERVED.or_pct_encoded().or_ucschar();
124124
}
125+
126+
// The following are used only in the parser.
127+
128+
pub(crate) struct Hexdig;
129+
130+
impl Encoder for Hexdig {
131+
const TABLE: Table = HEXDIG;
132+
}
133+
134+
pub(crate) struct IpvFuture;
135+
136+
impl Encoder for IpvFuture {
137+
const TABLE: Table = IPV_FUTURE;
138+
}
139+
140+
pub(crate) struct Scheme;
141+
142+
impl Encoder for Scheme {
143+
const TABLE: Table = SCHEME;
144+
}
145+
146+
pub(crate) struct SegmentNzNc;
147+
148+
impl Encoder for SegmentNzNc {
149+
const TABLE: Table = SEGMENT_NZ_NC;
150+
}
151+
152+
pub(crate) struct ISegmentNzNc;
153+
154+
impl Encoder for ISegmentNzNc {
155+
const TABLE: Table = ISEGMENT_NZ_NC;
156+
}

src/pct_enc/table.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@
77
88
use crate::{pct_enc, utf8};
99

10-
const MASK_PCT_ENCODED: u64 = 1;
11-
const MASK_UCSCHAR: u64 = 2;
12-
const MASK_IPRIVATE: u64 = 4;
10+
const MASK_PCT_ENCODED: u64 = 1 << b'%';
11+
const MASK_UCSCHAR: u64 = 1;
12+
const MASK_IPRIVATE: u64 = 2;
1313
const MASK_UNENCODED_ASCII: u64 = !(MASK_PCT_ENCODED | MASK_UCSCHAR | MASK_IPRIVATE);
1414

1515
const fn is_ucschar(x: u32) -> bool {
@@ -31,14 +31,14 @@ impl Table {
3131
///
3232
/// # Panics
3333
///
34-
/// Panics if any of the bytes is not ASCII or equals `0`, `1`, `2`, or `b'%'`.
34+
/// Panics if any of the bytes is not ASCII or equals `0`, `1`, or `b'%'`.
3535
#[must_use]
3636
pub const fn new(mut bytes: &[u8]) -> Self {
3737
let mut table = 0;
3838
while let [cur, rem @ ..] = bytes {
3939
assert!(
40-
!matches!(cur, 0 | 1 | 2 | b'%' | 128..),
41-
"cannot allow non-ASCII byte, 0, 1, 2, or %"
40+
!matches!(cur, 0 | 1 | b'%' | 128..),
41+
"cannot allow non-ASCII byte, 0, 1, or %"
4242
);
4343
table |= 1u128.wrapping_shl(*cur as u32);
4444
bytes = rem;

src/utf8.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@ const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
1212
(ch << 6) | (byte & CONT_MASK) as u32
1313
}
1414

15-
// Make sure it's inlined into `Parser::read_with`.
15+
// Make sure it's inlined into `Reader::read`.
16+
// This improves performance significantly for the non-ASCII case.
1617
#[inline(always)]
1718
pub const fn next_code_point(bytes: &[u8], i: usize) -> (u32, usize) {
1819
let x = bytes[i];

0 commit comments

Comments
 (0)