Skip to content

Commit 092cc57

Browse files
authored
Get rid of google-re2 as dependency (apache#47493)
Following the discussion in https://lists.apache.org/thread/nnttnblj3tcv2wqj79gwovtfhgq8lvmm, the merged PR apache#47360, and the opened issue apache#47364 (an idea that could prevent new additions of regexp to the UI), I think it's time to remove google-re2.
1 parent 85c3fba commit 092cc57

File tree

54 files changed

+113
-281
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

54 files changed

+113
-281
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -421,15 +421,6 @@ repos:
421421
exclude: ^providers/.*/src/airflow/providers/
422422
entry: ./scripts/ci/pre_commit/check_cncf_k8s_used_for_k8s_executor_only.py
423423
additional_dependencies: ['rich>=12.4.4']
424-
- id: check-google-re2-as-dependency
425-
name: Check google-re2 declared as dep
426-
description: Check google-re2 is declared as dependency when needed
427-
entry: ./scripts/ci/pre_commit/check_google_re2_imports.py
428-
language: python
429-
pass_filenames: true
430-
require_serial: true
431-
files: ^providers/.*/src/airflow/providers/.*\.py$
432-
additional_dependencies: ['rich>=12.4.4']
433424
- id: update-local-yml-file
434425
name: Update mounts in the local yml file
435426
entry: ./scripts/ci/pre_commit/local_yml_mounts.py
@@ -518,7 +509,7 @@ repos:
518509
^providers/fab/src/airflow/providers/fab/migrations/versions/.*$|^providers/fab/src/airflow/providers/fab/migrations/versions|
519510
^airflow/utils/db.py$|
520511
^providers/fab/src/airflow/providers/fab/auth_manager/models/db.py$
521-
additional_dependencies: ['packaging','google-re2']
512+
additional_dependencies: ['packaging']
522513
- id: update-version
523514
name: Update versions in docs
524515
entry: ./scripts/ci/pre_commit/update_versions.py
@@ -1156,35 +1147,6 @@ repos:
11561147
language: python
11571148
pass_filenames: true
11581149
files: ^tests/.*\.py$
1159-
- id: check-usage-of-re2-over-re
1160-
language: pygrep
1161-
name: Use re2 module instead of re
1162-
description: Use re2 module instead of re
1163-
entry: "^\\s*from re\\s|^\\s*import re\\s"
1164-
pass_filenames: true
1165-
files: \.py$
1166-
exclude: |
1167-
(?x)
1168-
^airflow/configuration.py$ |
1169-
^airflow/metrics/validators.py$ |
1170-
^airflow/models/dag.py$ |
1171-
^airflow/serialization/serde.py$ |
1172-
^airflow/utils/file.py$ |
1173-
^airflow/utils/helpers.py$ |
1174-
^providers/ |
1175-
^tests/ |
1176-
^providers/tests/ |
1177-
^providers/.*/tests/ |
1178-
^task-sdk/src/airflow/sdk/definitions/dag.py$ |
1179-
^task-sdk/src/airflow/sdk/execution_time/secrets_masker.py$ |
1180-
^task-sdk/src/airflow/sdk/definitions/_internal/node.py$ |
1181-
^dev/.*\.py$ |
1182-
^scripts/.*\.py$ |
1183-
^docker_tests/.*$ |
1184-
^helm_tests/.*$ |
1185-
^devel-common/.*$ |
1186-
^docs/.*\.py$ |
1187-
^hatch_build.py$
11881150
- id: check-provider-docs-valid
11891151
name: Validate provider doc files
11901152
entry: ./scripts/ci/pre_commit/check_provider_docs.py

airflow/api_fastapi/core_api/routes/public/providers.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
from __future__ import annotations
1919

20-
import re2
20+
import re
2121

2222
from airflow.api_fastapi.common.parameters import QueryLimit, QueryOffset
2323
from airflow.api_fastapi.common.router import AirflowRouter
@@ -28,7 +28,7 @@
2828

2929

3030
def _remove_rst_syntax(value: str) -> str:
31-
return re2.sub("[`_<>]", "", value.strip(" \n."))
31+
return re.sub("[`_<>]", "", value.strip(" \n."))
3232

3333

3434
def _provider_mapper(provider: ProviderInfo) -> ProviderResponse:

airflow/cli/commands/remote_commands/dag_command.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,11 @@
2424
import json
2525
import logging
2626
import operator
27+
import re
2728
import subprocess
2829
import sys
2930
from typing import TYPE_CHECKING
3031

31-
import re2
3232
from sqlalchemy import func, select
3333

3434
from airflow.api.client import get_current_api_client
@@ -529,7 +529,7 @@ def dag_test(args, dag: DAG | None = None, session: Session = NEW_SESSION) -> No
529529
use_executor = args.use_executor
530530

531531
mark_success_pattern = (
532-
re2.compile(args.mark_success_pattern) if args.mark_success_pattern is not None else None
532+
re.compile(args.mark_success_pattern) if args.mark_success_pattern is not None else None
533533
)
534534

535535
with _airflow_parsing_context_manager(dag_id=args.dag_id):

airflow/cli/commands/remote_commands/provider_command.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,9 @@
1818

1919
from __future__ import annotations
2020

21+
import re
2122
import sys
2223

23-
import re2
24-
2524
from airflow.cli.simple_table import AirflowConsole
2625
from airflow.providers_manager import ProvidersManager
2726
from airflow.utils.cli import suppress_logs_and_warning
@@ -31,7 +30,7 @@
3130

3231

3332
def _remove_rst_syntax(value: str) -> str:
34-
return re2.sub("[`_<>]", "", value.strip(" \n."))
33+
return re.sub("[`_<>]", "", value.strip(" \n."))
3534

3635

3736
@suppress_logs_and_warning

airflow/configuration.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import multiprocessing
2626
import os
2727
import pathlib
28+
import re
2829
import shlex
2930
import stat
3031
import subprocess
@@ -41,7 +42,6 @@
4142
from typing import IO, TYPE_CHECKING, Any, Union
4243
from urllib.parse import urlsplit
4344

44-
import re2
4545
from packaging.version import parse as parse_version
4646
from typing_extensions import overload
4747

@@ -63,7 +63,7 @@
6363
warnings.filterwarnings(action="default", category=DeprecationWarning, module="airflow")
6464
warnings.filterwarnings(action="default", category=PendingDeprecationWarning, module="airflow")
6565

66-
_SQLITE3_VERSION_PATTERN = re2.compile(r"(?P<version>^\d+(?:\.\d+)*)\D?.*$")
66+
_SQLITE3_VERSION_PATTERN = re.compile(r"(?P<version>^\d+(?:\.\d+)*)\D?.*$")
6767

6868
ConfigType = Union[str, int, float, bool]
6969
ConfigOptionsDictType = dict[str, ConfigType]
@@ -354,37 +354,37 @@ def inversed_deprecated_sections(self):
354354
# about. Mapping of section -> setting -> { old, replace, by_version }
355355
deprecated_values: dict[str, dict[str, tuple[Pattern, str, str]]] = {
356356
"core": {
357-
"hostname_callable": (re2.compile(r":"), r".", "2.1"),
357+
"hostname_callable": (re.compile(r":"), r".", "2.1"),
358358
},
359359
"webserver": {
360-
"navbar_color": (re2.compile(r"(?i)\A#007A87\z"), "#fff", "2.1"),
361-
"dag_default_view": (re2.compile(r"^tree$"), "grid", "3.0"),
360+
"navbar_color": (re.compile(r"(?i)^#007A87$"), "#fff", "2.1"),
361+
"dag_default_view": (re.compile(r"^tree$"), "grid", "3.0"),
362362
},
363363
"email": {
364364
"email_backend": (
365-
re2.compile(r"^airflow\.contrib\.utils\.sendgrid\.send_email$"),
365+
re.compile(r"^airflow\.contrib\.utils\.sendgrid\.send_email$"),
366366
r"airflow.providers.sendgrid.utils.emailer.send_email",
367367
"2.1",
368368
),
369369
},
370370
"logging": {
371371
"log_filename_template": (
372-
re2.compile(re2.escape("{{ ti.dag_id }}/{{ ti.task_id }}/{{ ts }}/{{ try_number }}.log")),
372+
re.compile(re.escape("{{ ti.dag_id }}/{{ ti.task_id }}/{{ ts }}/{{ try_number }}.log")),
373373
# The actual replacement value will be updated after defaults are loaded from config.yml
374374
"XX-set-after-default-config-loaded-XX",
375375
"3.0",
376376
),
377377
},
378378
"api": {
379379
"auth_backends": (
380-
re2.compile(r"^airflow\.api\.auth\.backend\.deny_all$|^$"),
380+
re.compile(r"^airflow\.api\.auth\.backend\.deny_all$|^$"),
381381
"airflow.providers.fab.auth_manager.api.auth.backend.session",
382382
"3.0",
383383
),
384384
},
385385
"elasticsearch": {
386386
"log_id_template": (
387-
re2.compile("^" + re2.escape("{dag_id}-{task_id}-{logical_date}-{try_number}") + "$"),
387+
re.compile("^" + re.escape("{dag_id}-{task_id}-{logical_date}-{try_number}") + "$"),
388388
"{dag_id}-{task_id}-{run_id}-{map_index}-{try_number}",
389389
"3.0",
390390
)
@@ -725,7 +725,7 @@ def _upgrade_postgres_metastore_conn(self):
725725
stacklevel=1,
726726
)
727727
self.upgraded_values[(section, key)] = old_value
728-
new_value = re2.sub("^" + re2.escape(f"{parsed.scheme}://"), f"{good_scheme}://", old_value)
728+
new_value = re.sub("^" + re.escape(f"{parsed.scheme}://"), f"{good_scheme}://", old_value)
729729
self._update_env_var(section=section, name=key, new_value=new_value)
730730

731731
# if the old value is set via env var, we need to wipe it

airflow/decorators/base.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,14 @@
1818

1919
import inspect
2020
import itertools
21+
import re
2122
import textwrap
2223
import warnings
2324
from collections.abc import Collection, Iterator, Mapping, Sequence
2425
from functools import cached_property, update_wrapper
2526
from typing import TYPE_CHECKING, Any, Callable, ClassVar, Generic, Protocol, TypeVar, cast, overload
2627

2728
import attr
28-
import re2
2929
import typing_extensions
3030

3131
from airflow.models.baseoperator import (
@@ -138,14 +138,14 @@ def get_unique_task_id(
138138
return task_id
139139

140140
def _find_id_suffixes(dag: DAG) -> Iterator[int]:
141-
prefix = re2.split(r"__\d+$", tg_task_id)[0]
141+
prefix = re.split(r"__\d+$", tg_task_id)[0]
142142
for task_id in dag.task_ids:
143-
match = re2.match(rf"^{prefix}__(\d+)$", task_id)
143+
match = re.match(rf"^{prefix}__(\d+)$", task_id)
144144
if match:
145145
yield int(match.group(1))
146146
yield 0 # Default if there's no matching task ID.
147147

148-
core = re2.split(r"__\d+$", task_id)[0]
148+
core = re.split(r"__\d+$", task_id)[0]
149149
return f"{core}__{max(_find_id_suffixes(dag)) + 1}"
150150

151151

airflow/metrics/validators.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,14 @@
2121

2222
import abc
2323
import logging
24+
import re
2425
import string
2526
import warnings
2627
from collections.abc import Iterable
2728
from functools import partial, wraps
2829
from re import Pattern
2930
from typing import Callable, cast
3031

31-
import re2
32-
3332
from airflow.configuration import conf
3433
from airflow.exceptions import InvalidStatsNameException
3534

@@ -82,7 +81,7 @@ class MetricNameLengthExemptionWarning(Warning):
8281
r"^dagrun\.schedule_delay\.(?P<dag_id>.*)$",
8382
r"^dagrun\.(?P<dag_id>.*)\.first_task_scheduling_delay$",
8483
}
85-
BACK_COMPAT_METRIC_NAMES: set[Pattern[str]] = {re2.compile(name) for name in BACK_COMPAT_METRIC_NAME_PATTERNS}
84+
BACK_COMPAT_METRIC_NAMES: set[Pattern[str]] = {re.compile(name) for name in BACK_COMPAT_METRIC_NAME_PATTERNS}
8685

8786
OTEL_NAME_MAX_LENGTH = 63
8887
DEFAULT_VALIDATOR_TYPE = "allow"
@@ -161,7 +160,7 @@ def stat_name_otel_handler(
161160
# If the name is in the exceptions list, do not fail it for being too long.
162161
# It may still be deemed invalid for other reasons below.
163162
for exemption in BACK_COMPAT_METRIC_NAMES:
164-
if re2.match(exemption, stat_name):
163+
if re.match(exemption, stat_name):
165164
# There is a back-compat exception for this name; proceed
166165
name_length_exemption = True
167166
matched_exemption = exemption.pattern
@@ -249,7 +248,7 @@ def test(self, name: str) -> bool:
249248

250249
def _has_pattern_match(self, name: str) -> bool:
251250
for entry in self.validate_list or ():
252-
if re2.findall(entry, name.strip().lower()):
251+
if re.findall(entry, name.strip().lower()):
253252
return True
254253
return False
255254

airflow/models/connection.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,12 @@
1919

2020
import json
2121
import logging
22+
import re
2223
from contextlib import suppress
2324
from json import JSONDecodeError
2425
from typing import Any
2526
from urllib.parse import parse_qsl, quote, unquote, urlencode, urlsplit
2627

27-
import re2
2828
from sqlalchemy import Boolean, Column, Integer, String, Text
2929
from sqlalchemy.orm import declared_attr, reconstructor, synonym
3030

@@ -43,7 +43,7 @@
4343
# the symbols #,!,-,_,.,:,\,/ and () requiring at least one match.
4444
#
4545
# You can try the regex here: https://regex101.com/r/69033B/1
46-
RE_SANITIZE_CONN_ID = re2.compile(r"^[\w\#\!\(\)\-\.\:\/\\]{1,}$")
46+
RE_SANITIZE_CONN_ID = re.compile(r"^[\w\#\!\(\)\-\.\:\/\\]{1,}$")
4747
# the conn ID max len should be 250
4848
CONN_ID_MAX_LEN: int = 250
4949

@@ -66,7 +66,7 @@ def sanitize_conn_id(conn_id: str | None, max_length=CONN_ID_MAX_LEN) -> str | N
6666
"""
6767
# check if `conn_id` or our match group is `None` and the `conn_id` is within the specified length.
6868
if (not isinstance(conn_id, str) or len(conn_id) > max_length) or (
69-
res := re2.match(RE_SANITIZE_CONN_ID, conn_id)
69+
res := re.match(RE_SANITIZE_CONN_ID, conn_id)
7070
) is None:
7171
return None
7272

airflow/models/dag.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import copy
2222
import functools
2323
import logging
24+
import re
2425
import sys
2526
import time
2627
from collections import defaultdict
@@ -42,7 +43,6 @@
4243
import attrs
4344
import methodtools
4445
import pendulum
45-
import re2
4646
import sqlalchemy_jsonfield
4747
from dateutil.relativedelta import relativedelta
4848
from packaging import version as packaging_version
@@ -1722,7 +1722,7 @@ def add_logger_if_needed(ti: TaskInstance):
17221722
ti.task = tasks[ti.task_id]
17231723

17241724
mark_success = (
1725-
re2.compile(mark_success_pattern).fullmatch(ti.task_id) is not None
1725+
re.compile(mark_success_pattern).fullmatch(ti.task_id) is not None
17261726
if mark_success_pattern is not None
17271727
else False
17281728
)
@@ -1804,9 +1804,9 @@ def create_dagrun(
18041804

18051805
# This is also done on the DagRun model class, but SQLAlchemy column
18061806
# validator does not work well for some reason.
1807-
if not re2.match(RUN_ID_REGEX, run_id):
1807+
if not re.match(RUN_ID_REGEX, run_id):
18081808
regex = airflow_conf.get("scheduler", "allowed_run_id_pattern").strip()
1809-
if not regex or not re2.match(regex, run_id):
1809+
if not regex or not re.match(regex, run_id):
18101810
raise ValueError(
18111811
f"The run_id provided '{run_id}' does not match regex pattern "
18121812
f"'{regex}' or '{RUN_ID_REGEX}'"

airflow/models/dagrun.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,11 @@
1919

2020
import itertools
2121
import os
22+
import re
2223
from collections import defaultdict
2324
from collections.abc import Iterable, Iterator, Sequence
2425
from typing import TYPE_CHECKING, Any, Callable, NamedTuple, TypeVar, overload
2526

26-
import re2
2727
from sqlalchemy import (
2828
JSON,
2929
Column,
@@ -298,10 +298,10 @@ def __repr__(self):
298298
def validate_run_id(self, key: str, run_id: str) -> str | None:
299299
if not run_id:
300300
return None
301-
if re2.match(RUN_ID_REGEX, run_id):
301+
if re.match(RUN_ID_REGEX, run_id):
302302
return run_id
303303
regex = airflow_conf.get("scheduler", "allowed_run_id_pattern").strip()
304-
if regex and re2.match(regex, run_id):
304+
if regex and re.match(regex, run_id):
305305
return run_id
306306
raise ValueError(
307307
f"The run_id provided '{run_id}' does not match regex pattern '{regex}' or '{RUN_ID_REGEX}'"

0 commit comments

Comments
 (0)