
Commit 3e33e91

fix: use HttpHeaders type in Scrapy integration (#289)

- Use `HttpHeaders` type in Scrapy integration (and fix tests)
- Make `Configuration.standby_url` field optional (and fix tests)
- Add `asyncio_default_fixture_loop_scope` to Pytest options
- Remove unit tests from pre-commit

1 parent: 8cd2f2c

File tree: 7 files changed (+128 −134 lines)

.pre-commit-config.yaml

Lines changed: 3 additions & 9 deletions
```diff
@@ -13,20 +13,14 @@ repos:
         language: system
         pass_filenames: false
 
-      - id: unit-tests
-        name: Run unit tests
-        entry: make unit-tests
-        language: system
-        pass_filenames: false
-
       - id: check-changelog-entry
         name: Check changelog entry
         entry: make check-changelog-entry
         language: system
         pass_filenames: false
 
-      - id: check-version-conflict
-        name: Check version conflict
-        entry: make check-version-conflict
+      - id: check-version-availability
+        name: Check version availability
+        entry: make check-version-availability
         language: system
         pass_filenames: false
```

pyproject.toml

Lines changed: 2 additions & 1 deletion
```diff
@@ -48,7 +48,7 @@ keywords = [
 python = "^3.9"
 apify-client = ">=1.8.1"
 apify-shared = ">=1.1.2"
-crawlee = ">=0.3.5"
+crawlee = ">=0.3.8"
 cryptography = ">=42.0.0"
 httpx = ">=0.27.0"
 lazy-object-proxy = ">=1.10.0"
@@ -162,6 +162,7 @@ max-branches = 18
 
 [tool.pytest.ini_options]
 addopts = "-ra"
+asyncio_default_fixture_loop_scope = "function"
 asyncio_mode = "auto"
 timeout = 1200
 
```
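The new `asyncio_default_fixture_loop_scope = "function"` option pins async fixtures to a per-test event loop (recent pytest-asyncio versions warn when this scope is left unset). A minimal sketch of what the option affects, assuming pytest-asyncio's auto mode as configured above; the fixture name `client` is illustrative, not from the commit:

```python
import asyncio

import pytest


@pytest.fixture
async def client() -> str:
    # With asyncio_default_fixture_loop_scope = "function", this fixture
    # runs on the same function-scoped event loop as the test below.
    await asyncio.sleep(0)
    return 'ready'


async def test_fixture_and_test_share_a_loop(client: str) -> None:
    # Loop-bound objects created in the fixture (locks, queues, clients)
    # are therefore safe to await here.
    assert client == 'ready'
```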

src/apify/_configuration.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -8,6 +8,7 @@
 from typing_extensions import deprecated
 
 from crawlee._utils.models import timedelta_ms
+from crawlee._utils.urls import validate_http_url
 from crawlee.configuration import Configuration as CrawleeConfiguration
 
 
@@ -263,11 +264,12 @@ class Configuration(CrawleeConfiguration):
 
     standby_url: Annotated[
         str,
+        BeforeValidator(validate_http_url),
         Field(
             alias='actor_standby_url',
            description='URL for accessing web servers of Actor runs in Standby mode',
         ),
-    ]
+    ] = 'http://localhost'
 
     token: Annotated[
         str | None,
```
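A hypothetical usage sketch (not part of the commit) of the new default and validator. It assumes the pydantic-settings model accepts the field's `actor_standby_url` alias as an init keyword and that pydantic leaves the default unvalidated, as is its standard behavior:

```python
from apify import Configuration

# Without ACTOR_STANDBY_URL set in the environment, construction
# no longer fails; the field falls back to its new default.
config = Configuration()
assert config.standby_url == 'http://localhost'

# Any provided value passes through validate_http_url first, so a
# well-formed URL is accepted and a malformed one should raise a
# pydantic ValidationError.
config = Configuration(actor_standby_url='http://example.com:8080')
assert config.standby_url == 'http://example.com:8080'
```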

src/apify/scrapy/requests.py

Lines changed: 4 additions & 9 deletions
```diff
@@ -16,6 +16,7 @@
     ) from exc
 
 from crawlee import Request as CrawleeRequest
+from crawlee._types import HttpHeaders
 from crawlee._utils.crypto import crypto_random_object_id
 from crawlee._utils.requests import compute_unique_key, unique_key_to_request_id
 
@@ -77,9 +78,9 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest
         id=request_id,
     )
 
-    # Convert Scrapy's headers to a dictionary and store them in the apify_request
+    # Convert Scrapy's headers to a HttpHeaders and store them in the apify_request
     if isinstance(scrapy_request.headers, Headers):
-        apify_request.headers = dict(scrapy_request.headers.to_unicode_dict())
+        apify_request.headers = HttpHeaders(scrapy_request.headers.to_unicode_dict())
    else:
         Actor.log.warning(
             f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}'
```
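A standalone sketch of the conversion above, assuming crawlee >= 0.3.8, where `HttpHeaders` normalizes header names to lowercase (the updated tests below read keys back in lowercase, which supports that assumption):

```python
from crawlee._types import HttpHeaders
from scrapy.http.headers import Headers

# Scrapy stores header names and values as bytes; to_unicode_dict()
# decodes both to str, which is what HttpHeaders expects.
scrapy_headers = Headers({'Content-Type': 'application/json'})
apify_headers = HttpHeaders(scrapy_headers.to_unicode_dict())

assert apify_headers.get('content-type') == 'application/json'
```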
```diff
@@ -164,13 +165,7 @@ def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request:
 
     # Add optional 'headers' field
     if apify_request.headers:
-        if isinstance(cast(Any, apify_request.headers), dict):
-            scrapy_request.headers = Headers(apify_request.headers)
-        else:
-            Actor.log.warning(
-                'apify_request[headers] is not an instance of the dict class, '
-                f'apify_request[headers] = {apify_request.headers}',
-            )
+        scrapy_request.headers |= Headers(apify_request.headers)
 
     # Add optional 'userData' field
     if apify_request.user_data:
```
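Why the simple `|=` can replace the old type check (a sketch, not from the commit): `HttpHeaders` is a `str`-to-`str` mapping, so `Headers(...)` can consume it directly, and Scrapy's `Headers` is a case-insensitive `dict` subclass, so the in-place union (Python 3.9+) merges the Apify headers into whatever defaults Scrapy already set, with the right-hand side winning on key conflicts:

```python
from scrapy.http.headers import Headers

defaults = Headers({'Accept': 'text/html', 'Authorization': 'stale'})
defaults |= Headers({'Authorization': 'Bearer access_token'})

# Lookup is case-insensitive and values come back as bytes:
assert defaults.get('authorization') == b'Bearer access_token'
assert defaults.get('Accept') == b'text/html'
```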

tests/unit/actor/test_actor_env_helpers.py

Lines changed: 108 additions & 108 deletions
```diff
@@ -23,115 +23,115 @@
 import pytest
 
 
-class TestIsAtHome:
-    async def test_is_at_home_local(self) -> None:
-        async with Actor as actor:
-            is_at_home = actor.is_at_home()
-            assert is_at_home is False
-
-    async def test_is_at_home_on_apify(self, monkeypatch: pytest.MonkeyPatch) -> None:
-        print('setenv')
-        monkeypatch.setenv(ApifyEnvVars.IS_AT_HOME, 'true')
-        async with Actor as actor:
-            is_at_home = actor.is_at_home()
-            assert is_at_home is True
-
-
-class TestGetEnv:
-    async def test_get_env_use_env_vars(self, monkeypatch: pytest.MonkeyPatch) -> None:
-        ignored_env_vars = {
-            ApifyEnvVars.INPUT_KEY,
-            ApifyEnvVars.MEMORY_MBYTES,
-            ApifyEnvVars.STARTED_AT,
-            ApifyEnvVars.TIMEOUT_AT,
-            ApifyEnvVars.DEFAULT_DATASET_ID,
-            ApifyEnvVars.DEFAULT_KEY_VALUE_STORE_ID,
-            ApifyEnvVars.DEFAULT_REQUEST_QUEUE_ID,
-            ApifyEnvVars.SDK_LATEST_VERSION,
-            ApifyEnvVars.LOG_FORMAT,
-            ApifyEnvVars.LOG_LEVEL,
-        }
-
-        legacy_env_vars = {
-            ApifyEnvVars.ACT_ID: ActorEnvVars.ID,
-            ApifyEnvVars.ACT_RUN_ID: ActorEnvVars.RUN_ID,
-            ApifyEnvVars.ACTOR_ID: ActorEnvVars.ID,
-            ApifyEnvVars.ACTOR_BUILD_ID: ActorEnvVars.BUILD_ID,
-            ApifyEnvVars.ACTOR_BUILD_NUMBER: ActorEnvVars.BUILD_NUMBER,
-            ApifyEnvVars.ACTOR_RUN_ID: ActorEnvVars.RUN_ID,
-            ApifyEnvVars.ACTOR_TASK_ID: ActorEnvVars.TASK_ID,
-            ApifyEnvVars.CONTAINER_URL: ActorEnvVars.WEB_SERVER_URL,
-            ApifyEnvVars.CONTAINER_PORT: ActorEnvVars.WEB_SERVER_PORT,
-        }
-
-        # Set up random env vars
-        expected_get_env: dict[str, Any] = {}
-        expected_get_env[ApifyEnvVars.LOG_LEVEL.name.lower()] = 'INFO'
-
-        for int_env_var in INTEGER_ENV_VARS:
-            if int_env_var in ignored_env_vars:
-                continue
-
-            int_get_env_var = int_env_var.name.lower()
-            expected_get_env[int_get_env_var] = random.randint(1, 99999)
-            monkeypatch.setenv(int_env_var, f'{expected_get_env[int_get_env_var]}')
-
-        for float_env_var in FLOAT_ENV_VARS:
-            if float_env_var in ignored_env_vars:
-                continue
-
-            float_get_env_var = float_env_var.name.lower()
-            expected_get_env[float_get_env_var] = random.random()
-            monkeypatch.setenv(float_env_var, f'{expected_get_env[float_get_env_var]}')
-
-        for bool_env_var in BOOL_ENV_VARS:
-            if bool_env_var in ignored_env_vars:
-                continue
-
-            bool_get_env_var = bool_env_var.name.lower()
-            expected_get_env[bool_get_env_var] = random.choice([True, False])
-            monkeypatch.setenv(bool_env_var, f'{"true" if expected_get_env[bool_get_env_var] else "false"}')
-
-        for datetime_env_var in DATETIME_ENV_VARS:
-            if datetime_env_var in ignored_env_vars:
-                continue
-
-            datetime_get_env_var = datetime_env_var.name.lower()
-            expected_get_env[datetime_get_env_var] = datetime.now(TzInfo(0))  # type: ignore
-            monkeypatch.setenv(
-                datetime_env_var,
-                expected_get_env[datetime_get_env_var].strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
-            )
-
-        for string_env_var in STRING_ENV_VARS:
-            if string_env_var in ignored_env_vars:
-                continue
-
-            string_get_env_var = string_env_var.name.lower()
-            expected_get_env[string_get_env_var] = ''.join(random.choices(string.ascii_uppercase + string.digits, k=10))
-            monkeypatch.setenv(string_env_var, expected_get_env[string_get_env_var])
-
-        # We need this override so that the actor doesn't fail when connecting to the platform events websocket
-        monkeypatch.delenv(ActorEnvVars.EVENTS_WEBSOCKET_URL)
-        monkeypatch.delenv(ApifyEnvVars.ACTOR_EVENTS_WS_URL)
-        expected_get_env[ActorEnvVars.EVENTS_WEBSOCKET_URL.name.lower()] = None
-        expected_get_env[ApifyEnvVars.ACTOR_EVENTS_WS_URL.name.lower()] = None
-
-        # Adjust expectations for timedelta fields
-        for env_name, env_value in expected_get_env.items():
-            if env_name.endswith('_millis'):
-                expected_get_env[env_name] = timedelta(milliseconds=env_value)
-
-        # Convert dedicated_cpus to float
-        expected_get_env[ApifyEnvVars.DEDICATED_CPUS.name.lower()] = float(
-            expected_get_env[ApifyEnvVars.DEDICATED_CPUS.name.lower()]
-        )
-
-        # Update expectations for legacy configuration
-        for old_name, new_name in legacy_env_vars.items():
-            expected_get_env[old_name.name.lower()] = expected_get_env[new_name.name.lower()]
-
-        await Actor.init()
-        assert Actor.get_env() == expected_get_env
-
-        await Actor.exit()
+async def test_is_at_home_local() -> None:
+    async with Actor as actor:
+        is_at_home = actor.is_at_home()
+        assert is_at_home is False
+
+
+async def test_is_at_home_on_apify(monkeypatch: pytest.MonkeyPatch) -> None:
+    print('setenv')
+    monkeypatch.setenv(ApifyEnvVars.IS_AT_HOME, 'true')
+    async with Actor as actor:
+        is_at_home = actor.is_at_home()
+        assert is_at_home is True
+
+
+async def test_get_env_use_env_vars(monkeypatch: pytest.MonkeyPatch) -> None:
+    ignored_env_vars = {
+        ApifyEnvVars.INPUT_KEY,
+        ApifyEnvVars.MEMORY_MBYTES,
+        ApifyEnvVars.STARTED_AT,
+        ApifyEnvVars.TIMEOUT_AT,
+        ApifyEnvVars.DEFAULT_DATASET_ID,
+        ApifyEnvVars.DEFAULT_KEY_VALUE_STORE_ID,
+        ApifyEnvVars.DEFAULT_REQUEST_QUEUE_ID,
+        ApifyEnvVars.SDK_LATEST_VERSION,
+        ApifyEnvVars.LOG_FORMAT,
+        ApifyEnvVars.LOG_LEVEL,
+        ActorEnvVars.STANDBY_PORT,
+    }
+
+    legacy_env_vars = {
+        ApifyEnvVars.ACT_ID: ActorEnvVars.ID,
+        ApifyEnvVars.ACT_RUN_ID: ActorEnvVars.RUN_ID,
+        ApifyEnvVars.ACTOR_ID: ActorEnvVars.ID,
+        ApifyEnvVars.ACTOR_BUILD_ID: ActorEnvVars.BUILD_ID,
+        ApifyEnvVars.ACTOR_BUILD_NUMBER: ActorEnvVars.BUILD_NUMBER,
+        ApifyEnvVars.ACTOR_RUN_ID: ActorEnvVars.RUN_ID,
+        ApifyEnvVars.ACTOR_TASK_ID: ActorEnvVars.TASK_ID,
+        ApifyEnvVars.CONTAINER_URL: ActorEnvVars.WEB_SERVER_URL,
+        ApifyEnvVars.CONTAINER_PORT: ActorEnvVars.WEB_SERVER_PORT,
+    }
+
+    # Set up random env vars
+    expected_get_env: dict[str, Any] = {}
+    expected_get_env[ApifyEnvVars.LOG_LEVEL.name.lower()] = 'INFO'
+
+    for int_env_var in INTEGER_ENV_VARS:
+        if int_env_var in ignored_env_vars:
+            continue
+
+        int_get_env_var = int_env_var.name.lower()
+        expected_get_env[int_get_env_var] = random.randint(1, 99999)
+        monkeypatch.setenv(int_env_var, f'{expected_get_env[int_get_env_var]}')
+
+    for float_env_var in FLOAT_ENV_VARS:
+        if float_env_var in ignored_env_vars:
+            continue
+
+        float_get_env_var = float_env_var.name.lower()
+        expected_get_env[float_get_env_var] = random.random()
+        monkeypatch.setenv(float_env_var, f'{expected_get_env[float_get_env_var]}')
+
+    for bool_env_var in BOOL_ENV_VARS:
+        if bool_env_var in ignored_env_vars:
+            continue
+
+        bool_get_env_var = bool_env_var.name.lower()
+        expected_get_env[bool_get_env_var] = random.choice([True, False])
+        monkeypatch.setenv(bool_env_var, f'{"true" if expected_get_env[bool_get_env_var] else "false"}')
+
+    for datetime_env_var in DATETIME_ENV_VARS:
+        if datetime_env_var in ignored_env_vars:
+            continue
+
+        datetime_get_env_var = datetime_env_var.name.lower()
+        expected_get_env[datetime_get_env_var] = datetime.now(TzInfo(0))  # type: ignore
+        monkeypatch.setenv(
+            datetime_env_var,
+            expected_get_env[datetime_get_env_var].strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
+        )
+
+    for string_env_var in STRING_ENV_VARS:
+        if string_env_var in ignored_env_vars:
+            continue
+
+        string_get_env_var = string_env_var.name.lower()
+        expected_get_env[string_get_env_var] = ''.join(random.choices(string.ascii_uppercase + string.digits, k=10))
+        monkeypatch.setenv(string_env_var, expected_get_env[string_get_env_var])
+
+    # We need this override so that the actor doesn't fail when connecting to the platform events websocket
+    monkeypatch.delenv(ActorEnvVars.EVENTS_WEBSOCKET_URL)
+    monkeypatch.delenv(ApifyEnvVars.ACTOR_EVENTS_WS_URL)
+    expected_get_env[ActorEnvVars.EVENTS_WEBSOCKET_URL.name.lower()] = None
+    expected_get_env[ApifyEnvVars.ACTOR_EVENTS_WS_URL.name.lower()] = None
+
+    # Adjust expectations for timedelta fields
+    for env_name, env_value in expected_get_env.items():
+        if env_name.endswith('_millis'):
+            expected_get_env[env_name] = timedelta(milliseconds=env_value)
+
+    # Convert dedicated_cpus to float
+    expected_get_env[ApifyEnvVars.DEDICATED_CPUS.name.lower()] = float(
+        expected_get_env[ApifyEnvVars.DEDICATED_CPUS.name.lower()]
+    )
+
+    # Update expectations for legacy configuration
+    for old_name, new_name in legacy_env_vars.items():
+        expected_get_env[old_name.name.lower()] = expected_get_env[new_name.name.lower()]
+
+    await Actor.init()
+    assert Actor.get_env() == expected_get_env
+
+    await Actor.exit()
```

tests/unit/scrapy/requests/test_to_apify_request.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -4,6 +4,8 @@
 from scrapy import Request, Spider
 from scrapy.http.headers import Headers
 
+from crawlee._types import HttpHeaders
+
 from apify.scrapy.requests import to_apify_request
 
 
@@ -36,7 +38,7 @@ def test__to_apify_request__headers(spider: Spider) -> None:
     apify_request = to_apify_request(scrapy_request, spider)
 
     assert apify_request is not None
-    assert apify_request.headers == dict(scrapy_request_headers.to_unicode_dict())
+    assert apify_request.headers == HttpHeaders(scrapy_request_headers.to_unicode_dict())
 
 
 def test__to_apify_request__without_id_and_unique_key(spider: Spider) -> None:
```
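The rewritten assertion compares two `HttpHeaders` instances rather than plain dicts. A quick illustration of the equality it relies on, assuming `HttpHeaders` normalizes header names to lowercase so that differently cased inputs compare equal:

```python
from crawlee._types import HttpHeaders

# Both inputs should normalize to the same underlying mapping.
assert HttpHeaders({'Content-Type': 'text/html'}) == HttpHeaders({'content-type': 'text/html'})
```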

tests/unit/scrapy/requests/test_to_scrapy_request.py

Lines changed: 5 additions & 5 deletions
```diff
@@ -4,9 +4,9 @@
 
 import pytest
 from scrapy import Request, Spider
-from scrapy.http.headers import Headers
 
 from crawlee import Request as CrawleeRequest
+from crawlee._types import HttpHeaders
 
 from apify.scrapy.requests import to_scrapy_request
 
@@ -47,7 +47,7 @@ def test__to_scrapy_request__without_reconstruction_with_optional_fields(spider:
         method='GET',
         unique_key='https://crawlee.dev',
         id='fvwscO2UJLdr10B',
-        headers={'Authorization': 'Bearer access_token'},
+        headers=HttpHeaders({'Authorization': 'Bearer access_token'}),
         user_data={'some_user_data': 'test'},
     )
 
@@ -58,7 +58,7 @@ def test__to_scrapy_request__without_reconstruction_with_optional_fields(spider:
     assert apify_request.method == scrapy_request.method
     assert apify_request.id == scrapy_request.meta.get('apify_request_id')
     assert apify_request.unique_key == scrapy_request.meta.get('apify_request_unique_key')
-    assert Headers(apify_request.headers) == scrapy_request.headers
+    assert apify_request.headers.get('authorization') == scrapy_request.headers.get('authorization').decode()
     assert apify_request.user_data == scrapy_request.meta.get('userData')
 
 
@@ -91,7 +91,7 @@ def test__to_scrapy_request__with_reconstruction_with_optional_fields(spider: Sp
         method='GET',
         id='fvwscO2UJLdr10B',
         unique_key='https://apify.com',
-        headers={'Authorization': 'Bearer access_token'},
+        headers=HttpHeaders({'Authorization': 'Bearer access_token'}),
         user_data={
             'some_user_data': 'hello',
             'scrapy_request': 'gASVJgIAAAAAAAB9lCiMA3VybJSMEWh0dHBzOi8vYXBpZnkuY29tlIwIY2FsbGJhY2uUTowHZXJy\nYmFja5ROjAdoZWFkZXJzlH2UKEMGQWNjZXB0lF2UQz90ZXh0L2h0bWwsYXBwbGljYXRpb24veGh0\nbWwreG1sLGFwcGxpY2F0aW9uL3htbDtxPTAuOSwqLyo7cT0wLjiUYUMPQWNjZXB0LUxhbmd1YWdl\nlF2UQwJlbpRhQwpVc2VyLUFnZW50lF2UQyNTY3JhcHkvMi4xMS4wICgraHR0cHM6Ly9zY3JhcHku\nb3JnKZRhQw9BY2NlcHQtRW5jb2RpbmeUXZRDDWd6aXAsIGRlZmxhdGWUYXWMBm1ldGhvZJSMA0dF\nVJSMBGJvZHmUQwCUjAdjb29raWVzlH2UjARtZXRhlH2UKIwQYXBpZnlfcmVxdWVzdF9pZJSMD2Z2\nd3NjTzJVSkxkcjEwQpSMGGFwaWZ5X3JlcXVlc3RfdW5pcXVlX2tleZSMEWh0dHBzOi8vYXBpZnku\nY29tlIwQZG93bmxvYWRfdGltZW91dJRHQGaAAAAAAACMDWRvd25sb2FkX3Nsb3SUjAlhcGlmeS5j\nb22UjBBkb3dubG9hZF9sYXRlbmN5lEc/tYIIAAAAAHWMCGVuY29kaW5nlIwFdXRmLTiUjAhwcmlv\ncml0eZRLAIwLZG9udF9maWx0ZXKUiYwFZmxhZ3OUXZSMCWNiX2t3YXJnc5R9lHUu\n', # noqa: E501
@@ -105,7 +105,7 @@ def test__to_scrapy_request__with_reconstruction_with_optional_fields(spider: Sp
     assert apify_request.method == scrapy_request.method
     assert apify_request.id == scrapy_request.meta.get('apify_request_id')
     assert apify_request.unique_key == scrapy_request.meta.get('apify_request_unique_key')
-    assert Headers(apify_request.headers) == scrapy_request.headers
+    assert apify_request.headers.get('authorization') == scrapy_request.headers.get('authorization').decode()
     assert apify_request.user_data == scrapy_request.meta.get('userData')
 
 
```
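The updated assertions call `.decode()` because the two header types disagree on value types: Scrapy's `Headers` stores values as bytes, while crawlee's `HttpHeaders` stores `str`. A small sketch under that assumption (with `HttpHeaders` lookups in lowercase, as in the tests above):

```python
from crawlee._types import HttpHeaders
from scrapy.http.headers import Headers

apify_headers = HttpHeaders({'Authorization': 'Bearer access_token'})
scrapy_headers = Headers({'Authorization': 'Bearer access_token'})

assert apify_headers.get('authorization') == 'Bearer access_token'    # str
assert scrapy_headers.get('authorization') == b'Bearer access_token'  # bytes

# Hence the .decode() when comparing across the two types:
assert apify_headers.get('authorization') == scrapy_headers.get('authorization').decode()
```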
