Skip to content

Commit 942c940

Browse files
committed
Use re with \w
Add a few more tests for regex
1 parent 707a8a8 commit 942c940

File tree

4 files changed

+12
-114
lines changed

4 files changed

+12
-114
lines changed

poetry.lock

Lines changed: 1 addition & 104 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,6 @@ lazy-object-proxy = ">=1.10.0"
5555
scrapy = { version = ">=2.11.0", optional = true }
5656
typing-extensions = ">=4.1.0"
5757
websockets = ">=10.0 <14.0.0"
58-
regex = "^2024.11.6"
5958

6059
[tool.poetry.group.dev.dependencies]
6160
build = "~1.2.0"

src/apify/storages/_actor_inputs.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
from __future__ import annotations
22

33
import asyncio
4+
import re
45
from asyncio import Task
56
from functools import partial
67
from typing import TYPE_CHECKING, Any
78

8-
import regex
99
from pydantic import BaseModel, ConfigDict, Field
1010

1111
if TYPE_CHECKING:
@@ -18,8 +18,8 @@
1818

1919
from ._known_actor_input_keys import ActorInputKeys
2020

21-
URL_NO_COMMAS_REGEX = regex.compile(
22-
r'https?:\/\/(www\.)?([\p{L}0-9]|[\p{L}0-9][-\p{L}0-9@:%._+~#=]{0,254}[\p{L}0-9])\.[a-z]{2,63}(:\d{1,5})?(\/[-\p{L}0-9@:%_+.~#?&/=()]*)?'
21+
URL_NO_COMMAS_REGEX = re.compile(
22+
r'https?:\/\/(www\.)?([^\W_]|[^\W_][-\w0-9@:%._+~#=]{0,254}[^\W_])\.[a-z]{2,63}(:\d{1,5})?(\/[-\w@:%+.~#?&/=()]*)?'
2323
)
2424

2525

@@ -108,7 +108,7 @@ async def _create_requests_from_url(
108108

109109
def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None:
110110
"""Callback to scrape response body with regexp and create Requests from matches."""
111-
matches = regex.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8'))
111+
matches = re.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8'))
112112
created_requests.extend([Request.from_url(
113113
match.group(0),
114114
method=request_input.method,

tests/unit/actor/test_actor_inputs.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
from __future__ import annotations
22

3+
import re
34
from typing import Any, Iterator, get_args
45
from unittest import mock
56
from unittest.mock import call
67

78
import pytest
8-
import regex
99

1010
from crawlee._request import UserData
1111
from crawlee._types import HttpHeaders, HttpMethod
@@ -172,11 +172,13 @@ async def test_actor_create_request_list_from_url_additional_inputs() -> None:
172172
'http://a.com',
173173
'http://www.something.com/somethignelse'
174174
'http://www.something.com/somethignelse.txt',
175-
'http://non-english-chars-áíéåü.com'
175+
'http://non-english-chars-áíéåü.com',
176+
'http://www.port.com:1234',
177+
'http://username:password@something.com'
176178
])
177179
def test_url_no_commas_regex_true_positives(true_positive: str) -> None:
178180
example_string= f'Some text {true_positive} some more text'
179-
matches = list(regex.finditer(URL_NO_COMMAS_REGEX, example_string))
181+
matches = list(re.finditer(URL_NO_COMMAS_REGEX, example_string))
180182
assert len(matches) == 1
181183
assert matches[0].group(0) == true_positive
182184

@@ -190,12 +192,12 @@ def test_url_no_commas_regex_true_positives(true_positive: str) -> None:
190192
])
191193
def test_url_no_commas_regex_false_positives(false_positive: str) -> None:
192194
example_string= f'Some text {false_positive} some more text'
193-
matches = list(regex.findall(URL_NO_COMMAS_REGEX, example_string))
195+
matches = list(re.findall(URL_NO_COMMAS_REGEX, example_string))
194196
assert len(matches) == 0
195197

196198
def test_url_no_commas_regex_multi_line() -> None:
197199
true_positives = ('http://www.something.com', 'http://www.else.com')
198200
example_string= 'Some text {} some more text \n Some new line text {} ...'.format(*true_positives)
199-
matches = list(regex.finditer(URL_NO_COMMAS_REGEX, example_string))
201+
matches = list(re.finditer(URL_NO_COMMAS_REGEX, example_string))
200202
assert len(matches) == 2
201203
assert {match.group(0) for match in matches} == set(true_positives)

0 commit comments

Comments
 (0)