Skip to content

Commit 2c8533f

Browse files
authored
Parse qualified names with non-alpha characters in xpath (servo#39409)
The existing parsing rules are too strict: they only allow ASCII alphabetic and alphanumeric characters (plus `-` and `_`) in names. Instead, we should follow the NCName production defined in https://www.w3.org/TR/REC-xml-names/#NT-NCName. Testing: New tests start to pass. Part of servo#34527. --------- Signed-off-by: Simon Wülker <[email protected]>
1 parent 4d43844 commit 2c8533f

File tree

3 files changed

+29
-24
lines changed

3 files changed

+29
-24
lines changed

components/script/xpath/parser.rs

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,14 @@
44

55
use nom::branch::alt;
66
use nom::bytes::complete::{tag, take_while1};
7-
use nom::character::complete::{alpha1, alphanumeric1, char, digit1, multispace0};
7+
use nom::character::complete::{char, digit1, multispace0};
88
use nom::combinator::{map, opt, recognize, value};
99
use nom::error::{Error as NomError, ErrorKind as NomErrorKind, ParseError as NomParseError};
1010
use nom::multi::{many0, separated_list0};
1111
use nom::sequence::{delimited, pair, preceded};
12-
use nom::{Finish, IResult, Parser};
12+
use nom::{AsChar, Finish, IResult, Input, Parser};
13+
14+
use crate::dom::bindings::xmlname::{is_valid_continuation, is_valid_start};
1315

1416
pub(crate) fn parse(input: &str) -> Result<Expr, OwnedParserError> {
1517
let (_, ast) = expr(input).finish().map_err(OwnedParserError::from)?;
@@ -955,7 +957,7 @@ fn string_literal(input: &str) -> IResult<&str, Literal> {
955957
.parse(input)
956958
}
957959

958-
// QName parser
960+
/// <https://www.w3.org/TR/REC-xml-names/#NT-QName>
959961
fn qname(input: &str) -> IResult<&str, QName> {
960962
let (input, prefix) = opt((ncname, char(':'))).parse(input)?;
961963
let (input, local) = ncname(input)?;
@@ -969,13 +971,31 @@ fn qname(input: &str) -> IResult<&str, QName> {
969971
))
970972
}
971973

972-
// NCName parser
974+
/// <https://www.w3.org/TR/REC-xml-names/#NT-NCName>
973975
fn ncname(input: &str) -> IResult<&str, &str> {
974-
recognize(pair(
975-
alpha1,
976-
many0(alt((alphanumeric1, tag("-"), tag("_")))),
977-
))
978-
.parse(input)
976+
fn name_start_character<T, E: NomParseError<T>>(input: T) -> IResult<T, T, E>
977+
where
978+
T: Input,
979+
<T as Input>::Item: AsChar,
980+
{
981+
input.split_at_position1_complete(
982+
|character| !is_valid_start(character.as_char()) || character.as_char() == ':',
983+
NomErrorKind::OneOf,
984+
)
985+
}
986+
987+
fn name_character<T, E: NomParseError<T>>(input: T) -> IResult<T, T, E>
988+
where
989+
T: Input,
990+
<T as Input>::Item: AsChar,
991+
{
992+
input.split_at_position1_complete(
993+
|character| !is_valid_continuation(character.as_char()) || character.as_char() == ':',
994+
NomErrorKind::OneOf,
995+
)
996+
}
997+
998+
recognize(pair(name_start_character, many0(name_character))).parse(input)
979999
}
9801000

9811001
// Test functions to verify the parsers:

tests/wpt/meta/domxpath/text-html-attributes.html.ini

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,8 @@
22
[Select html element based on attribute mixed case]
33
expected: FAIL
44

5-
[Select HTML element with non-ascii attribute 1]
6-
expected: FAIL
7-
8-
[Select HTML element with non-ascii attribute 2]
9-
expected: FAIL
10-
115
[Select HTML element with non-ascii attribute 3]
126
expected: FAIL
137

148
[Select both HTML and SVG elements based on mixed case attribute]
159
expected: FAIL
16-
17-
[Select SVG element with non-ascii attribute 1]
18-
expected: FAIL
19-
20-
[Select SVG element with non-ascii attribute 2]
21-
expected: FAIL

tests/wpt/meta/domxpath/text-html-elements.html.ini

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,6 @@
22
[HTML elements mixed case]
33
expected: FAIL
44

5-
[Non-ascii HTML element]
6-
expected: FAIL
7-
85
[Non-ascii HTML element3]
96
expected: FAIL
107

0 commit comments

Comments
 (0)