|
1 | 1 | from __future__ import annotations
|
2 | 2 |
|
| 3 | +import re |
3 | 4 | import typing
|
4 | 5 | from unittest import mock
|
5 | 6 | from unittest.mock import call
|
|
11 | 12 | from crawlee.http_clients import HttpResponse, HttpxHttpClient
|
12 | 13 |
|
13 | 14 | from apify import Actor
|
| 15 | +from apify._actor_inputs import URL_NO_COMMAS_REGEX |
14 | 16 |
|
15 | 17 |
|
16 | 18 | @pytest.mark.parametrize('request_method', typing.get_args(HttpMethod))
|
@@ -47,8 +49,7 @@ async def test_actor_create_request_list_request_types(
|
47 | 49 | for key, value in optional_input['user_data'].items():
|
48 | 50 | expected_user_data[key] = value
|
49 | 51 | assert generated_request.user_data == expected_user_data
|
50 |
| - expected_headers = HttpHeaders(root=optional_input.get('headers', {})) |
51 |
| - assert generated_request.headers == expected_headers |
| 52 | + assert generated_request.headers.root == optional_input.get('headers', {}) |
52 | 53 |
|
53 | 54 |
|
54 | 55 | def _create_dummy_response(read_output: typing.Iterator[str]) -> HttpResponse:
|
@@ -130,12 +131,74 @@ async def test_actor_create_request_list_from_url() -> None:
|
130 | 131 | )
|
131 | 132 | generated_requests = []
|
132 | 133 | while request := await generated_request_list.fetch_next_request():
|
133 |
| - print(request) |
134 | 134 | generated_requests.append(request)
|
135 | 135 |
|
136 | 136 | # Check correctly created requests' urls in request list
|
137 | 137 | assert {generated_request.url for generated_request in generated_requests} == expected_urls
|
138 | 138 |
|
async def test_actor_create_request_list_from_url_additional_inputs() -> None:
    """Test that all generated request properties are correctly populated from input values.

    A single remote URL file (mocked) yields one URL; the generated request must carry the
    method, headers, payload, and user_data given in the start-URLs input.
    """
    expected_simple_url = 'https://www.someurl.com'
    example_start_urls_input = [
        {
            'requests_from_url': 'https://crawlee.dev/file.txt',
            'method': 'POST',
            'headers': {'key': 'value'},
            'payload': 'some_payload',
            'user_data': {'another_key': 'another_value'},
        },
    ]
    response_bodies = iter((expected_simple_url,))
    http_client = HttpxHttpClient()
    with mock.patch.object(http_client, 'send_request', return_value=_create_dummy_response(response_bodies)):
        generated_request_list = await Actor.create_request_list(
            actor_start_urls_input=example_start_urls_input, http_client=http_client
        )
        request = await generated_request_list.fetch_next_request()

        # Guard first: fetch_next_request returns None when the list is exhausted, and the
        # attribute accesses below would otherwise fail with an unhelpful AttributeError.
        assert request is not None

        # Check all properties correctly created for request
        assert request.url == expected_simple_url
        assert request.method == example_start_urls_input[0]['method']
        assert request.headers.root == example_start_urls_input[0]['headers']
        assert request.payload == example_start_urls_input[0]['payload'].encode('utf-8')
        expected_user_data = UserData()
        for key, value in example_start_urls_input[0]['user_data'].items():
            expected_user_data[key] = value
        assert request.user_data == expected_user_data
| 166 | + |
@pytest.mark.parametrize('true_positive', [
    'http://www.something.com',
    'https://www.something.net',
    'http://nowww.cz',
    'https://with-hypen.com',
    'http://number1.com',
    'http://www.number.123',
    'http://many.dots.com',
    'http://a.com',
    # NOTE: a comma was missing here, so the next two entries were silently concatenated
    # into one bogus URL by implicit string literal concatenation — neither case was tested.
    'http://www.something.com/somethignelse',
    'http://www.something.com/somethignelse.txt',
    # "http://non-english-chars-á.com" # re module not suitable, regex can do this with \p{L}. Do we want this?
])
def test_url_no_commas_regex_true_positives(true_positive: str) -> None:
    """URL_NO_COMMAS_REGEX must match exactly the one URL embedded in surrounding text."""
    example_string = f'Some text {true_positive} some more text'
    matches = list(re.finditer(URL_NO_COMMAS_REGEX, example_string))
    assert len(matches) == 1
    assert matches[0].group(0) == true_positive
| 185 | + |
@pytest.mark.parametrize('false_positive', [
    'http://www.a',
    'http://a',
    'http://a.a',
    'http://123.456',
    'www.something.com',
    'http:www.something.com',
])
def test_url_no_commas_regex_false_positives(false_positive: str) -> None:
    """URL_NO_COMMAS_REGEX must not match invalid or schemeless URL-like strings."""
    example_string = f'Some text {false_positive} some more text'
    # re.findall already returns a list; the extra list() wrapper was redundant.
    matches = re.findall(URL_NO_COMMAS_REGEX, example_string)
    assert len(matches) == 0
| 198 | + |
def test_url_no_commas_regex_multi_line() -> None:
    """URL_NO_COMMAS_REGEX must find every URL when the input spans multiple lines."""
    expected_urls = ('http://www.something.com', 'http://www.else.com')
    text = 'Some text {} some more text \n Some new line text {} ...'.format(*expected_urls)
    found = [match.group(0) for match in re.finditer(URL_NO_COMMAS_REGEX, text)]
    assert len(found) == 2
    assert set(found) == set(expected_urls)
0 commit comments