Skip to content

Commit 4ff56d7

Browse files
committed
feat: support wider variety of spider names
1 parent 41af4df commit 4ff56d7

File tree

2 files changed

+46
-2
lines changed

2 files changed

+46
-2
lines changed

src/apify/scrapy/extensions/_httpcache.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import gzip
44
import io
55
import pickle
6+
import re
67
import struct
78
from logging import getLogger
89
from time import time
@@ -47,7 +48,7 @@ def open_spider(self, spider: Spider) -> None:
4748
logger.debug('Using Apify key value cache storage', extra={'spider': spider})
4849
self._spider = spider
4950
self._fingerprinter = spider.crawler.request_fingerprinter
50-
kvs_name = f'httpcache-{spider.name}'
51+
kvs_name = get_kvs_name(spider.name)
5152

5253
async def open_kvs() -> KeyValueStore:
5354
config = Configuration.get_global_configuration()
@@ -177,3 +178,13 @@ def read_gzip_time(gzip_bytes: bytes) -> int:
177178
header_components = struct.unpack('<HBBI2B', header)
178179
mtime: int = header_components[3]
179180
return mtime
181+
182+
183+
def get_kvs_name(spider_name: str, max_length: int = 60) -> str:
    """Get the key value store name for a spider.

    The name is derived from the spider name: every run of characters other than
    ASCII letters and digits is replaced by a single hyphen, and leading and
    trailing hyphens are removed. The result is prefixed with ``httpcache-`` and
    truncated to ``max_length`` characters.

    Args:
        spider_name: Name of the Scrapy spider.
        max_length: Maximum length of the returned name, including the
            ``httpcache-`` prefix. It is expected to be longer than the prefix.
            NOTE(review): the default is meant to stay within the platform's
            storage name length limit - confirm against the Apify documentation.

    Returns:
        The key value store name, e.g. ``httpcache-test-spider``.

    Raises:
        ValueError: If the spider name contains no ASCII letter or digit, so no
            usable name can be derived from it.
    """
    # A single pass is enough: hyphens are part of the negated class, so mixed runs
    # such as '-@-/-' collapse into one hyphen as well.
    slug = re.sub(r'[^a-zA-Z0-9]+', '-', spider_name).strip('-')
    if not slug:
        raise ValueError(f'Unsupported spider name: {spider_name!r} (slug: {slug!r})')
    # Truncation may cut right after a hyphen; strip it so the name never ends with one.
    return f'httpcache-{slug}'[:max_length].rstrip('-')

tests/unit/scrapy/extensions/test_httpcache.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
from time import time
22

3-
from apify.scrapy.extensions._httpcache import from_gzip, read_gzip_time, to_gzip
3+
import pytest
4+
5+
from apify.scrapy.extensions._httpcache import from_gzip, get_kvs_name, read_gzip_time, to_gzip
46

57
FIXTURE_DICT = {'name': 'Alice'}
68

@@ -35,3 +37,34 @@ def test_read_gzip_time_non_zero() -> None:
3537
data_bytes = to_gzip(FIXTURE_DICT, mtime=current_time)
3638

3739
assert read_gzip_time(data_bytes) == current_time
40+
41+
42+
@pytest.mark.parametrize(
    ('spider_name', 'expected_kvs_name'),
    [
        # Names that are already valid are only prefixed.
        ('test', 'httpcache-test'),
        ('123', 'httpcache-123'),
        ('test-spider', 'httpcache-test-spider'),
        # Unsupported characters are turned into a hyphen.
        ('test_spider', 'httpcache-test-spider'),
        ('test spider', 'httpcache-test-spider'),
        ('test👻spider', 'httpcache-test-spider'),
        ('test@spider', 'httpcache-test-spider'),
        # Leading and trailing separators are dropped.
        (' test spider ', 'httpcache-test-spider'),
        ('testspider.com', 'httpcache-testspider-com'),
    ],
)
def test_get_kvs_name(spider_name: str, expected_kvs_name: str) -> None:
    """Check that spider names are converted to the expected key value store names."""
    kvs_name = get_kvs_name(spider_name)

    assert kvs_name == expected_kvs_name
58+
59+
60+
@pytest.mark.parametrize(
    'spider_name',
    [
        '',
        '-',
        '-@-/-',
    ],
)
def test_get_kvs_name_raises(spider_name: str) -> None:
    """Check that spider names without any letter or digit are rejected."""
    # No `assert` here: the call itself is expected to raise, so an assertion on its
    # return value would never be evaluated.
    with pytest.raises(ValueError, match='Unsupported spider name'):
        get_kvs_name(spider_name)

0 commit comments

Comments
 (0)