Skip to content

Commit 5cf5ae1

Browse files
committed
Add a nonunicode regex mode
This is a variant of the regress engine usage in which the `u` flag is not set. It is needed for usages which rely on unicode escapes *not* being interpreted. In particular, the azure pipelines schema won't pass metaschema validation (let alone apply correctly) if `pattern` interpretation uses unicode-mode regexes.
1 parent 509fc7b commit 5cf5ae1

File tree

7 files changed

+51
-16
lines changed

7 files changed

+51
-16
lines changed

.pre-commit-hooks.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
- id: check-azure-pipelines
2525
name: Validate Azure Pipelines
2626
description: 'Validate Azure Pipelines config against the schema provided by Microsoft'
27-
entry: check-jsonschema --builtin-schema vendor.azure-pipelines --data-transform azure-pipelines
27+
entry: check-jsonschema --builtin-schema vendor.azure-pipelines --data-transform azure-pipelines --regex-variant nonunicode
2828
language: python
2929
files: ^(\.)?azure-pipelines\.(yml|yaml)$
3030
types: [yaml]

docs/usage.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,8 @@ for ``"pattern"`` interpretation. The modes are as follows:
197197
- description
198198
* - default
199199
- Use ECMAScript regex syntax.
200+
* - nonunicode
201+
- Use ECMAScript regex syntax, but without unicode escapes enabled.
200202
* - python
201203
- Use Python regex syntax.
202204

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ dependencies = [
2020
'tomli>=2.0;python_version<"3.11"',
2121
"ruamel.yaml==0.18.6",
2222
"jsonschema>=4.18.0,<5.0",
23-
"regress>=0.4.0",
23+
"regress>=2024.11.1",
2424
"requests<3.0",
2525
"click>=8,<9",
2626
]

src/check_jsonschema/catalog.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,12 @@ def _githubusercontent_url(owner: str, repo: str, ref: str, path: str) -> str:
3131
"Validate Azure Pipelines config against the schema provided "
3232
"by Microsoft"
3333
),
34-
"add_args": ["--data-transform", "azure-pipelines"],
34+
"add_args": [
35+
"--data-transform",
36+
"azure-pipelines",
37+
"--regex-variant",
38+
"nonunicode",
39+
],
3540
"files": r"^(\.)?azure-pipelines\.(yml|yaml)$",
3641
"types": "yaml",
3742
},

src/check_jsonschema/cli/main_command.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,9 @@ def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str:
7171
\b
7272
For handling of regexes, there are multiple modes which can be specified with
7373
'--regex-variant':
74-
default | use ECMAScript regex syntax (via regress)
75-
python | use python regex syntax
74+
default | use ECMAScript regex syntax (via regress)
75+
nonunicode | use ECMAScript regex syntax, but in non-unicode mode (via regress)
76+
python | use python regex syntax
7677
7778
\b
7879
The '--builtin-schema' flag supports the following schema names:
@@ -238,8 +239,8 @@ def main(
238239
no_cache: bool,
239240
cache_filename: str | None,
240241
disable_formats: tuple[list[str], ...],
241-
format_regex: t.Literal["python", "default"] | None,
242-
regex_variant: t.Literal["python", "default"] | None,
242+
format_regex: t.Literal["python", "nonunicode", "default"] | None,
243+
regex_variant: t.Literal["python", "nonunicode", "default"] | None,
243244
default_filetype: t.Literal["json", "yaml", "toml", "json5"],
244245
traceback_mode: t.Literal["full", "short"],
245246
data_transform: t.Literal["azure-pipelines", "gitlab-ci"] | None,

src/check_jsonschema/cli/parse_result.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,13 @@ def __init__(self) -> None:
5151

5252
def set_regex_variant(
5353
self,
54-
variant_opt: Literal["python", "default"] | None,
54+
variant_opt: Literal["python", "nonunicode", "default"] | None,
5555
*,
56-
legacy_opt: Literal["python", "default"] | None = None,
56+
legacy_opt: Literal["python", "nonunicode", "default"] | None = None,
5757
) -> None:
58-
variant_name: Literal["python", "default"] | None = variant_opt or legacy_opt
58+
variant_name: Literal["python", "nonunicode", "default"] | None = (
59+
variant_opt or legacy_opt
60+
)
5961
if variant_name:
6062
self.regex_variant = RegexVariantName(variant_name)
6163

src/check_jsonschema/regex_variants.py

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
class RegexVariantName(enum.Enum):
1010
default = "default"
11+
nonunicode = "nonunicode"
1112
python = "python"
1213

1314

@@ -31,23 +32,47 @@ def __init__(self, variant: RegexVariantName) -> None:
3132
self.variant = variant
3233

3334
if self.variant == RegexVariantName.default:
34-
self._real_implementation = _RegressImplementation()
35+
self._real_implementation = _UnicodeRegressImplementation()
36+
elif self.variant == RegexVariantName.nonunicode:
37+
self._real_implementation = _NonunicodeRegressImplementation()
3538
else:
3639
self._real_implementation = _PythonImplementation()
3740

3841
self.check_format = self._real_implementation.check_format
3942
self.pattern_keyword = self._real_implementation.pattern_keyword
4043

4144

42-
class _RegressImplementation:
45+
class _UnicodeRegressImplementation:
    """Regex handling backed by ``regress`` with the ``u`` flag set.

    Provides the ``check_format`` and ``pattern_keyword`` callables, both of
    which compile patterns via ``regress.Regex(..., flags="u")``.
    """

    def check_format(self, instance: t.Any) -> bool:
        """Return whether ``instance`` compiles as a unicode-mode regex.

        Non-string instances are not checked and always return ``True``.
        For strings, return ``False`` if compiling raises
        ``regress.RegressError`` and ``True`` otherwise.
        """
        if not isinstance(instance, str):
            return True
        try:
            regress.Regex(instance, flags="u")
        except regress.RegressError:
            return False
        return True

    def pattern_keyword(
        self, validator: t.Any, pattern: str, instance: str, schema: t.Any
    ) -> t.Iterator[jsonschema.ValidationError]:
        """Yield validation errors for the ``pattern`` keyword.

        Nothing is yielded when ``validator.is_type(instance, "string")`` is
        false. If ``pattern`` fails to compile, a single "failed to compile"
        error is yielded and validation of this keyword stops. Otherwise an
        error is yielded when ``pattern`` is not found in ``instance``.

        ``schema`` is accepted as part of the call signature but is not used.
        """
        if not validator.is_type(instance, "string"):
            return

        try:
            regress_pattern = regress.Regex(pattern, flags="u")
        except regress.RegressError:
            yield jsonschema.ValidationError(f"pattern {pattern!r} failed to compile")
            # Stop here: ``regress_pattern`` was never bound, so falling
            # through to the match check would raise UnboundLocalError when
            # the generator is resumed.
            return

        if not regress_pattern.find(instance):
            yield jsonschema.ValidationError(f"{instance!r} does not match {pattern!r}")
67+
68+
69+
class _NonunicodeRegressImplementation:
4370
def check_format(self, instance: t.Any) -> bool:
4471
if not isinstance(instance, str):
4572
return True
4673
try:
4774
regress.Regex(instance)
48-
# something is wrong with RegressError getting into the published types
49-
# needs investigation... for now, ignore the error
50-
except regress.RegressError: # type: ignore[attr-defined]
75+
except regress.RegressError:
5176
return False
5277
return True
5378

@@ -59,7 +84,7 @@ def pattern_keyword(
5984

6085
try:
6186
regress_pattern = regress.Regex(pattern)
62-
except regress.RegressError: # type: ignore[attr-defined]
87+
except regress.RegressError:
6388
yield jsonschema.ValidationError(f"pattern {pattern!r} failed to compile")
6489
if not regress_pattern.find(instance):
6590
yield jsonschema.ValidationError(f"{instance!r} does not match {pattern!r}")

0 commit comments

Comments
 (0)