Skip to content

Commit e77b670

Browse files
committed
Stop decoding encoded private characters when normalizing query
1 parent 43b7880 commit e77b670

File tree

4 files changed

+44
-75
lines changed

4 files changed

+44
-75
lines changed
Lines changed: 1 addition & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,8 @@
11
#![no_main]
2-
use fluent_uri::{
3-
component::Host,
4-
pct_enc::{encoder::Query, EStr},
5-
Iri,
6-
};
2+
use fluent_uri::{component::Host, Iri};
73
use iri_string::{format::ToDedicatedString, types::IriStr};
84
use libfuzzer_sys::fuzz_target;
95

10-
fn is_iprivate(x: u32) -> bool {
11-
(x >= 0xe000 && x <= 0xf8ff) || (x >= 0xf0000 && (x & 0xffff) <= 0xfffd)
12-
}
13-
14-
fn encode_iprivate(s: &str) -> String {
15-
let mut buf = String::with_capacity(s.len());
16-
for ch in s.chars() {
17-
if is_iprivate(ch as u32) {
18-
for x in ch.encode_utf8(&mut [0; 4]).bytes() {
19-
buf.push_str(EStr::<Query>::force_encode_byte(x).as_str());
20-
}
21-
} else {
22-
buf.push(ch);
23-
}
24-
}
25-
buf
26-
}
27-
286
fuzz_target!(|data: &str| {
297
let Ok(r1) = Iri::parse(data) else {
308
return;
@@ -60,12 +38,5 @@ fuzz_target!(|data: &str| {
6038
}
6139
}
6240

63-
if let Some(q1) = r1.query() {
64-
let q2 = r2.query().unwrap();
65-
if encode_iprivate(q1.as_str()) == encode_iprivate(q2.as_str()) {
66-
return;
67-
}
68-
}
69-
7041
panic!("{:?} != {:?}", r1.as_str(), r2.as_str());
7142
});

src/imp.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -610,7 +610,7 @@ macro_rules! ri_maybe_ref {
610610
/// along with IPv6 address normalization.
611611
/// This is effectively equivalent to taking the following steps in order:
612612
///
613-
/// - Decode any percent-encoded octets that correspond to an allowed character which is not reserved.
613+
/// - Decode any percent-encoded octet sequence that corresponds to an unreserved character.
614614
/// - Uppercase the hexadecimal digits within all percent-encoded octets.
615615
/// - Lowercase all ASCII characters within the scheme and the host except the percent-encoded octets.
616616
/// - Turn any IPv6 literal address into its canonical form as per

src/normalize.rs

Lines changed: 41 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use crate::{
66
pct_enc::{
77
self,
88
encoder::{Data, IData},
9-
Decode, DecodedChunk, DecodedUtf8Chunk, Encode, EncodedChunk, Encoder, Table,
9+
Decode, DecodedChunk, DecodedUtf8Chunk, Encode, EncodedChunk, Encoder,
1010
},
1111
resolve,
1212
};
@@ -149,13 +149,6 @@ pub(crate) fn normalize(
149149
// For "a://[::ffff:5:9]/" the capacity is not enough,
150150
// but it's fine since this rarely happens.
151151
let mut buf = String::with_capacity(r.as_str().len());
152-
153-
let data_table = if ascii_only {
154-
Data::TABLE
155-
} else {
156-
IData::TABLE
157-
};
158-
159152
let mut meta = Meta::default();
160153

161154
if let Some(scheme) = r.scheme_opt() {
@@ -169,7 +162,7 @@ pub(crate) fn normalize(
169162
buf.push_str("//");
170163

171164
if let Some(userinfo) = auth.userinfo() {
172-
normalize_estr(&mut buf, userinfo.as_str(), false, data_table);
165+
normalize_estr(&mut buf, userinfo.as_str(), false, ascii_only);
173166
buf.push('@');
174167
}
175168

@@ -195,7 +188,7 @@ pub(crate) fn normalize(
195188
HostMeta::RegName => {
196189
let start = buf.len();
197190
let host = auth.host();
198-
normalize_estr(&mut buf, host, true, data_table);
191+
normalize_estr(&mut buf, host, true, ascii_only);
199192

200193
if buf.len() < start + host.len() {
201194
// Only reparse when the length is less than before.
@@ -229,7 +222,7 @@ pub(crate) fn normalize(
229222

230223
if r.has_scheme() && path.starts_with('/') {
231224
let mut path_buf = String::with_capacity(path.len());
232-
normalize_estr(&mut path_buf, path, false, data_table);
225+
normalize_estr(&mut path_buf, path, false, ascii_only);
233226

234227
let underflow_occurred = resolve::remove_dot_segments(&mut buf, &[&path_buf]);
235228
if underflow_occurred && !allow_path_underflow {
@@ -242,31 +235,58 @@ pub(crate) fn normalize(
242235
}
243236
} else {
244237
// Don't remove dot segments from relative reference or rootless path.
245-
normalize_estr(&mut buf, path, false, data_table);
238+
normalize_estr(&mut buf, path, false, ascii_only);
246239
}
247240

248241
meta.path_bounds.1 = buf.len();
249242

250243
if let Some(query) = r.query() {
244+
// We might as well decode percent-encoded private characters in query,
245+
// but Section 5.3.2.3 of RFC 3987 says:
246+
//
247+
// These IRIs should be normalized by decoding any
248+
// percent-encoded octet sequence that corresponds to an unreserved
249+
// character, as described in section 2.3 of [RFC3986].
250+
//
251+
// Also, the whole spec is quite ambiguous on whether private characters
252+
// are unreserved or not. So we just leave them percent-encoded for now.
251253
buf.push('?');
252-
253-
const IQUERY_DATA: Table = IData::TABLE.or_iprivate();
254-
let query_data_table = if ascii_only { Data::TABLE } else { IQUERY_DATA };
255-
256-
normalize_estr(&mut buf, query.as_str(), false, query_data_table);
254+
normalize_estr(&mut buf, query.as_str(), false, ascii_only);
257255
meta.query_end = NonZeroUsize::new(buf.len());
258256
}
259257

260258
if let Some(fragment) = r.fragment() {
261259
buf.push('#');
262-
normalize_estr(&mut buf, fragment.as_str(), false, data_table);
260+
normalize_estr(&mut buf, fragment.as_str(), false, ascii_only);
263261
}
264262

265263
Ok((buf, meta))
266264
}
267265

268-
fn normalize_estr(buf: &mut String, s: &str, to_ascii_lowercase: bool, table: Table) {
269-
if table.allows_non_ascii() {
266+
fn normalize_estr(buf: &mut String, s: &str, to_ascii_lowercase: bool, ascii_only: bool) {
267+
if ascii_only {
268+
for chunk in Decode::new(s) {
269+
match chunk {
270+
DecodedChunk::Unencoded(s) => {
271+
let i = buf.len();
272+
buf.push_str(s);
273+
if to_ascii_lowercase {
274+
buf[i..].make_ascii_lowercase();
275+
}
276+
}
277+
DecodedChunk::PctDecoded(mut x) => {
278+
if Data::TABLE.allows_ascii(x) {
279+
if to_ascii_lowercase {
280+
x.make_ascii_lowercase();
281+
}
282+
buf.push(x as char);
283+
} else {
284+
buf.push_str(pct_enc::encode_byte(x));
285+
}
286+
}
287+
}
288+
}
289+
} else {
270290
Decode::new(s).decode_utf8(|chunk| match chunk {
271291
DecodedUtf8Chunk::Unencoded(s) => {
272292
let i = buf.len();
@@ -276,7 +296,7 @@ fn normalize_estr(buf: &mut String, s: &str, to_ascii_lowercase: bool, table: Ta
276296
}
277297
}
278298
DecodedUtf8Chunk::Decoded { valid, invalid } => {
279-
for chunk in Encode::new(table, valid) {
299+
for chunk in Encode::new(IData::TABLE, valid) {
280300
match chunk {
281301
EncodedChunk::Unencoded(s) => {
282302
let i = buf.len();
@@ -293,28 +313,6 @@ fn normalize_estr(buf: &mut String, s: &str, to_ascii_lowercase: bool, table: Ta
293313
}
294314
}
295315
});
296-
} else {
297-
for chunk in Decode::new(s) {
298-
match chunk {
299-
DecodedChunk::Unencoded(s) => {
300-
let i = buf.len();
301-
buf.push_str(s);
302-
if to_ascii_lowercase {
303-
buf[i..].make_ascii_lowercase();
304-
}
305-
}
306-
DecodedChunk::PctDecoded(mut x) => {
307-
if table.allows_ascii(x) {
308-
if to_ascii_lowercase {
309-
x.make_ascii_lowercase();
310-
}
311-
buf.push(x as char);
312-
} else {
313-
buf.push_str(pct_enc::encode_byte(x));
314-
}
315-
}
316-
}
317-
}
318316
}
319317
}
320318

tests/normalize.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,5 +143,5 @@ fn normalize_iri() {
143143

144144
// Encoded private character in query.
145145
let r = IriRef::parse("?%EE%80%80").unwrap();
146-
assert_eq!(r.normalize(), "?\u{e000}");
146+
assert_eq!(r.normalize(), "?%EE%80%80");
147147
}

0 commit comments

Comments
 (0)