Use re with \w

Pijukatel · Pijukatel · commit 942c9407c648 · 2024-11-18T17:49:22.000+01:00
Add a few more tests for regex
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -55,7 +55,6 @@ lazy-object-proxy = ">=1.10.0"
 scrapy = { version = ">=2.11.0", optional = true }
 typing-extensions = ">=4.1.0"
 websockets = ">=10.0 <14.0.0"
-regex = "^2024.11.6"
 
 [tool.poetry.group.dev.dependencies]
 build = "~1.2.0"
diff --git a/src/apify/storages/_actor_inputs.py b/src/apify/storages/_actor_inputs.py
@@ -1,11 +1,11 @@
 from __future__ import annotations
 
 import asyncio
+import re
 from asyncio import Task
 from functools import partial
 from typing import TYPE_CHECKING, Any
 
-import regex
 from pydantic import BaseModel, ConfigDict, Field
 
 if TYPE_CHECKING:
@@ -18,8 +18,8 @@
 
 from ._known_actor_input_keys import ActorInputKeys
 
-URL_NO_COMMAS_REGEX = regex.compile(
-    r'https?:\/\/(www\.)?([\p{L}0-9]|[\p{L}0-9][-\p{L}0-9@:%._+~#=]{0,254}[\p{L}0-9])\.[a-z]{2,63}(:\d{1,5})?(\/[-\p{L}0-9@:%_+.~#?&/=()]*)?'
+URL_NO_COMMAS_REGEX = re.compile(
+    r'https?:\/\/(www\.)?([^\W_]|[^\W_][-\w0-9@:%._+~#=]{0,254}[^\W_])\.[a-z]{2,63}(:\d{1,5})?(\/[-\w@:%+.~#?&/=()]*)?'
 )
 
 
@@ -108,7 +108,7 @@ async def _create_requests_from_url(
 
     def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None:
         """Callback to scrape response body with regexp and create Requests from matches."""
-        matches = regex.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8'))
+        matches = re.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8'))
         created_requests.extend([Request.from_url(
             match.group(0),
             method=request_input.method,
diff --git a/tests/unit/actor/test_actor_inputs.py b/tests/unit/actor/test_actor_inputs.py
@@ -1,11 +1,11 @@
 from __future__ import annotations
 
+import re
 from typing import Any, Iterator, get_args
 from unittest import mock
 from unittest.mock import call
 
 import pytest
-import regex
 
 from crawlee._request import UserData
 from crawlee._types import HttpHeaders, HttpMethod
@@ -172,11 +172,13 @@ async def test_actor_create_request_list_from_url_additional_inputs()  -> None:
     'http://a.com',
     'http://www.something.com/somethignelse'
     'http://www.something.com/somethignelse.txt',
-    'http://non-english-chars-áíéåü.com'
+    'http://non-english-chars-áíéåü.com',
+    'http://www.port.com:1234',
+    'http://username:password@something.apify.com'
 ])
 def test_url_no_commas_regex_true_positives(true_positive: str) -> None:
     example_string= f'Some text {true_positive} some more text'
-    matches = list(regex.finditer(URL_NO_COMMAS_REGEX, example_string))
+    matches = list(re.finditer(URL_NO_COMMAS_REGEX, example_string))
     assert len(matches) == 1
     assert matches[0].group(0) == true_positive
 
@@ -190,12 +192,12 @@ def test_url_no_commas_regex_true_positives(true_positive: str) -> None:
 ])
 def test_url_no_commas_regex_false_positives(false_positive: str) -> None:
     example_string= f'Some text {false_positive} some more text'
-    matches = list(regex.findall(URL_NO_COMMAS_REGEX, example_string))
+    matches = list(re.findall(URL_NO_COMMAS_REGEX, example_string))
     assert len(matches) == 0
 
 def test_url_no_commas_regex_multi_line() -> None:
     true_positives = ('http://www.something.com', 'http://www.else.com')
     example_string= 'Some text {} some more text \n Some new line text {} ...'.format(*true_positives)
-    matches = list(regex.finditer(URL_NO_COMMAS_REGEX, example_string))
+    matches = list(re.finditer(URL_NO_COMMAS_REGEX, example_string))
     assert len(matches) == 2
     assert {match.group(0) for match in matches} == set(true_positives)