Skip to content

Commit 84de11a

Browse files
authored
fix: Add storages name validation (#1457)
### Description
- Added name validation for storages in accordance with Apify platform restrictions.

### Issues
- Closes: #1434
- Relates: #1354

### Testing
- Added new tests to verify correct validation.
1 parent 8f3e33b commit 84de11a

22 files changed

+263
-133
lines changed

docs/upgrading/upgrading_to_v1.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -333,3 +333,7 @@ async def main() -> None:
333333

334334
await crawler.run(['https://crawlee.dev/'])
335335
```
336+
337+
### New storage naming restrictions
338+
339+
We've introduced naming restrictions for storages to ensure compatibility with Apify Platform requirements and prevent potential conflicts. Storage names may include only letters (a–z, A–Z), digits (0–9), and hyphens (-), with hyphens allowed only in the middle of the name (for example, my-storage-1).

src/crawlee/storages/_base.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,9 @@ async def open(
4444
4545
Args:
4646
id: The storage ID.
47-
name: The storage name (global scope, persists across runs).
47+
name: The storage name (global scope, persists across runs). Name can only contain letters (a-z, A-Z),
48+
the digits "0" through "9", and the hyphen ("-") but only in the middle of the string
49+
(e.g. "my-value-1").
4850
alias: The storage alias (run scope, creates unnamed storage).
4951
configuration: Configuration object used during the storage creation or restoration process.
5052
storage_client: Underlying storage client to use. If not provided, the default global storage client

src/crawlee/storages/_dataset.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
from ._base import Storage
1414
from ._key_value_store import KeyValueStore
15+
from ._utils import validate_storage_name
1516

1617
if TYPE_CHECKING:
1718
from collections.abc import AsyncIterator
@@ -75,6 +76,8 @@ def __init__(self, client: DatasetClient, id: str, name: str | None) -> None:
7576
id: The unique identifier of the storage.
7677
name: The name of the storage, if available.
7778
"""
79+
validate_storage_name(name)
80+
7881
self._client = client
7982
self._id = id
8083
self._name = name

src/crawlee/storages/_key_value_store.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from crawlee.storage_clients.models import KeyValueStoreMetadata
1616

1717
from ._base import Storage
18+
from ._utils import validate_storage_name
1819

1920
if TYPE_CHECKING:
2021
from collections.abc import AsyncIterator
@@ -84,6 +85,8 @@ def __init__(self, client: KeyValueStoreClient, id: str, name: str | None) -> No
8485
id: The unique identifier of the storage.
8586
name: The name of the storage, if available.
8687
"""
88+
validate_storage_name(name)
89+
8790
self._client = client
8891
self._id = id
8992
self._name = name

src/crawlee/storages/_request_queue.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from crawlee.request_loaders import RequestManager
1414

1515
from ._base import Storage
16+
from ._utils import validate_storage_name
1617

1718
if TYPE_CHECKING:
1819
from collections.abc import Sequence
@@ -80,6 +81,8 @@ def __init__(self, client: RequestQueueClient, id: str, name: str | None) -> Non
8081
id: The unique identifier of the storage.
8182
name: The name of the storage, if available.
8283
"""
84+
validate_storage_name(name)
85+
8386
self._client = client
8487
self._id = id
8588
self._name = name

src/crawlee/storages/_storage_instance_manager.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
99
from crawlee.storage_clients._base import DatasetClient, KeyValueStoreClient, RequestQueueClient
1010

11+
from ._utils import validate_storage_name
12+
1113
if TYPE_CHECKING:
1214
from ._base import Storage
1315

@@ -90,7 +92,9 @@ async def open_storage_instance(
9092
Args:
9193
cls: The storage class to instantiate.
9294
id: Storage ID.
93-
name: Storage name. (global scope, persists across runs).
95+
name: Storage name (global scope, persists across runs). Name can only contain letters (a-z, A-Z),
96+
the digits "0" through "9", and the hyphen ("-") but only in the middle of the string
97+
(e.g. "my-value-1").
9498
alias: Storage alias (run scope, creates unnamed storage).
9599
client_opener_coro: Coroutine to open the storage client when storage instance not found in cache.
96100
storage_client_cache_key: Additional optional key from storage client to differentiate cache entries.
@@ -146,6 +150,10 @@ async def open_storage_instance(
146150
f'Use a different name or drop the existing alias storage first.'
147151
)
148152

153+
# Validate storage name
154+
if name is not None:
155+
validate_storage_name(name)
156+
149157
# Create new instance
150158
client: KeyValueStoreClient | DatasetClient | RequestQueueClient
151159
client = await client_opener_coro

src/crawlee/storages/_utils.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
import re
2+
3+
NAME_REGEX = re.compile(r'^([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9])$')
4+
5+
6+
def validate_storage_name(name: str | None) -> None:
7+
if name and not NAME_REGEX.match(name):
8+
raise ValueError(
9+
f'Invalid storage name "{name}". Name can only contain letters "a" through "z", the digits "0" through '
10+
'"9", and the hyphen ("-") but only in the middle of the string (e.g. "my-value-1")'
11+
)

tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -473,7 +473,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
473473
async def test_adaptive_playwright_crawler_statistics_in_init() -> None:
474474
"""Tests that adaptive crawler uses created AdaptivePlaywrightCrawlerStatistics from inputted Statistics."""
475475
persistence_enabled = True
476-
persist_state_kvs_name = 'some name'
476+
persist_state_kvs_name = 'some-name'
477477
persist_state_key = 'come key'
478478
log_message = 'some message'
479479
periodic_message_logger = logging.getLogger('some logger')

tests/unit/otel/test_crawler_instrumentor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ async def test_crawler_instrumentor_capability(server_url: URL) -> None:
3838

3939
# Generate first telemetry data from `Dataset` public methods.
4040
# `Dataset` is in `instrument_classes` argument, and thus its public methods are instrumented.
41-
dataset = await Dataset.open(name='test_dataset')
41+
dataset = await Dataset.open(name='test-dataset')
4242
await dataset.drop()
4343

4444
# Other traces will be from crawler run.

tests/unit/sessions/test_session_pool.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from collections.abc import AsyncGenerator
1818

1919
MAX_POOL_SIZE = 3
20-
KVS_NAME = 'test_session_pool'
20+
KVS_NAME = 'test-session-pool'
2121
PERSIST_STATE_KEY = 'crawlee_session_pool_state'
2222

2323

0 commit comments

Comments
 (0)