Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion src/crawlee/storages/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,9 @@ async def open(

Args:
id: The storage ID.
name: The storage name (global scope, persists across runs).
name: The storage name (global scope, persists across runs). Name can only contain letters "a" through "z",
the digits "0" through "9", and the hyphen ("-") but only in the middle of the string
(e.g. "my-value-1").
alias: The storage alias (run scope, creates unnamed storage).
configuration: Configuration object used during the storage creation or restoration process.
storage_client: Underlying storage client to use. If not provided, the default global storage client
Expand Down
3 changes: 3 additions & 0 deletions src/crawlee/storages/_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from ._base import Storage
from ._key_value_store import KeyValueStore
from ._utils import validate_storage_name

if TYPE_CHECKING:
from collections.abc import AsyncIterator
Expand Down Expand Up @@ -75,6 +76,8 @@ def __init__(self, client: DatasetClient, id: str, name: str | None) -> None:
id: The unique identifier of the storage.
name: The name of the storage, if available.
"""
validate_storage_name(name)

self._client = client
self._id = id
self._name = name
Expand Down
3 changes: 3 additions & 0 deletions src/crawlee/storages/_key_value_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from crawlee.storage_clients.models import KeyValueStoreMetadata

from ._base import Storage
from ._utils import validate_storage_name

if TYPE_CHECKING:
from collections.abc import AsyncIterator
Expand Down Expand Up @@ -84,6 +85,8 @@ def __init__(self, client: KeyValueStoreClient, id: str, name: str | None) -> No
id: The unique identifier of the storage.
name: The name of the storage, if available.
"""
validate_storage_name(name)

self._client = client
self._id = id
self._name = name
Expand Down
3 changes: 3 additions & 0 deletions src/crawlee/storages/_request_queue.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from crawlee.request_loaders import RequestManager

from ._base import Storage
from ._utils import validate_storage_name

if TYPE_CHECKING:
from collections.abc import Sequence
Expand Down Expand Up @@ -80,6 +81,8 @@ def __init__(self, client: RequestQueueClient, id: str, name: str | None) -> Non
id: The unique identifier of the storage.
name: The name of the storage, if available.
"""
validate_storage_name(name)

self._client = client
self._id = id
self._name = name
Expand Down
10 changes: 9 additions & 1 deletion src/crawlee/storages/_storage_instance_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
from crawlee.storage_clients._base import DatasetClient, KeyValueStoreClient, RequestQueueClient

from ._utils import validate_storage_name

if TYPE_CHECKING:
from ._base import Storage

Expand Down Expand Up @@ -90,7 +92,9 @@ async def open_storage_instance(
Args:
cls: The storage class to instantiate.
id: Storage ID.
name: Storage name. (global scope, persists across runs).
name: Storage name (global scope, persists across runs). Name can only contain letters "a" through "z",
the digits "0" through "9", and the hyphen ("-") but only in the middle of the string
(e.g. "my-value-1").
alias: Storage alias (run scope, creates unnamed storage).
client_opener_coro: Coroutine to open the storage client when storage instance not found in cache.
storage_client_cache_key: Additional optional key from storage client to differentiate cache entries.
Expand Down Expand Up @@ -146,6 +150,10 @@ async def open_storage_instance(
f'Use a different name or drop the existing alias storage first.'
)

# Validate storage name
if name is not None:
validate_storage_name(name)

# Create new instance
client: KeyValueStoreClient | DatasetClient | RequestQueueClient
client = await client_opener_coro
Expand Down
11 changes: 11 additions & 0 deletions src/crawlee/storages/_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import re

NAME_REGEX = re.compile(r'^([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9])$')


def validate_storage_name(name: str | None) -> None:
if name and not NAME_REGEX.match(name):
raise ValueError(
f'Invalid storage name "{name}". Name can only contain letters "a" through "z", the digits "0" through'
'"9", and the hyphen ("-") but only in the middle of the string (e.g. "my-value-1")'
)
Original file line number Diff line number Diff line change
Expand Up @@ -473,7 +473,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
async def test_adaptive_playwright_crawler_statistics_in_init() -> None:
"""Tests that adaptive crawler uses created AdaptivePlaywrightCrawlerStatistics from inputted Statistics."""
persistence_enabled = True
persist_state_kvs_name = 'some name'
persist_state_kvs_name = 'some-name'
persist_state_key = 'come key'
log_message = 'some message'
periodic_message_logger = logging.getLogger('some logger')
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/otel/test_crawler_instrumentor.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ async def test_crawler_instrumentor_capability(server_url: URL) -> None:

# Generate first telemetry data from `Dataset` public methods.
# `Dataset` is in `instrument_classes` argument, and thus its public methods are instrumented.
dataset = await Dataset.open(name='test_dataset')
dataset = await Dataset.open(name='test-dataset')
await dataset.drop()

# Other traces will be from crawler run.
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/sessions/test_session_pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from collections.abc import AsyncGenerator

MAX_POOL_SIZE = 3
KVS_NAME = 'test_session_pool'
KVS_NAME = 'test-session-pool'
PERSIST_STATE_KEY = 'crawlee_session_pool_state'


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,14 @@ def configuration(tmp_path: Path) -> Configuration:
@pytest.fixture
async def dataset_client(configuration: Configuration) -> AsyncGenerator[FileSystemDatasetClient, None]:
"""A fixture for a file system dataset client."""
client = await FileSystemStorageClient().create_dataset_client(name='test_dataset', configuration=configuration)
client = await FileSystemStorageClient().create_dataset_client(name='test-dataset', configuration=configuration)
yield client
await client.drop()


async def test_file_and_directory_creation(configuration: Configuration) -> None:
"""Test that file system dataset creates proper files and directories."""
client = await FileSystemStorageClient().create_dataset_client(name='new_dataset', configuration=configuration)
client = await FileSystemStorageClient().create_dataset_client(name='new-dataset', configuration=configuration)

# Verify files were created
assert client.path_to_dataset.exists()
Expand All @@ -45,7 +45,7 @@ async def test_file_and_directory_creation(configuration: Configuration) -> None
metadata = json.load(f)
client_metadata = await client.get_metadata()
assert metadata['id'] == client_metadata.id
assert metadata['name'] == 'new_dataset'
assert metadata['name'] == 'new-dataset'
assert metadata['item_count'] == 0

await client.drop()
Expand Down
6 changes: 3 additions & 3 deletions tests/unit/storage_clients/_file_system/test_fs_kvs_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,14 @@ def configuration(tmp_path: Path) -> Configuration:
@pytest.fixture
async def kvs_client(configuration: Configuration) -> AsyncGenerator[FileSystemKeyValueStoreClient, None]:
"""A fixture for a file system key-value store client."""
client = await FileSystemStorageClient().create_kvs_client(name='test_kvs', configuration=configuration)
client = await FileSystemStorageClient().create_kvs_client(name='test-kvs', configuration=configuration)
yield client
await client.drop()


async def test_file_and_directory_creation(configuration: Configuration) -> None:
"""Test that file system KVS creates proper files and directories."""
client = await FileSystemStorageClient().create_kvs_client(name='new_kvs', configuration=configuration)
client = await FileSystemStorageClient().create_kvs_client(name='new-kvs', configuration=configuration)

# Verify files were created
assert client.path_to_kvs.exists()
Expand All @@ -44,7 +44,7 @@ async def test_file_and_directory_creation(configuration: Configuration) -> None
with client.path_to_metadata.open() as f:
metadata = json.load(f)
assert metadata['id'] == (await client.get_metadata()).id
assert metadata['name'] == 'new_kvs'
assert metadata['name'] == 'new-kvs'

await client.drop()

Expand Down
6 changes: 3 additions & 3 deletions tests/unit/storage_clients/_file_system/test_fs_rq_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,15 @@ def configuration(tmp_path: Path) -> Configuration:
async def rq_client() -> AsyncGenerator[FileSystemRequestQueueClient, None]:
"""A fixture for a file system request queue client."""
client = await FileSystemStorageClient().create_rq_client(
name='test_request_queue',
name='test-request-queue',
)
yield client
await client.drop()


async def test_file_and_directory_creation() -> None:
"""Test that file system RQ creates proper files and directories."""
client = await FileSystemStorageClient().create_rq_client(name='new_request_queue')
client = await FileSystemStorageClient().create_rq_client(name='new-request-queue')

# Verify files were created
assert client.path_to_rq.exists()
Expand All @@ -46,7 +46,7 @@ async def test_file_and_directory_creation() -> None:
with client.path_to_metadata.open() as f:
metadata = json.load(f)
assert metadata['id'] == (await client.get_metadata()).id
assert metadata['name'] == 'new_request_queue'
assert metadata['name'] == 'new-request-queue'

await client.drop()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
@pytest.fixture
async def dataset_client() -> AsyncGenerator[MemoryDatasetClient, None]:
"""Fixture that provides a fresh memory dataset client for each test."""
client = await MemoryStorageClient().create_dataset_client(name='test_dataset')
client = await MemoryStorageClient().create_dataset_client(name='test-dataset')
yield client
await client.drop()

Expand All @@ -25,7 +25,7 @@ async def test_memory_specific_purge_behavior() -> None:
"""Test memory-specific purge behavior and in-memory storage characteristics."""
# Create dataset and add data
dataset_client1 = await MemoryStorageClient().create_dataset_client(
name='test_purge_dataset',
name='test-purge-dataset',
)
await dataset_client1.push_data({'item': 'initial data'})

Expand All @@ -35,7 +35,7 @@ async def test_memory_specific_purge_behavior() -> None:

# Reopen with same storage client instance
dataset_client2 = await MemoryStorageClient().create_dataset_client(
name='test_purge_dataset',
name='test-purge-dataset',
)

# Verify data was purged (memory storage specific behavior)
Expand Down
6 changes: 3 additions & 3 deletions tests/unit/storage_clients/_memory/test_memory_kvs_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
@pytest.fixture
async def kvs_client() -> AsyncGenerator[MemoryKeyValueStoreClient, None]:
"""Fixture that provides a fresh memory key-value store client for each test."""
client = await MemoryStorageClient().create_kvs_client(name='test_kvs')
client = await MemoryStorageClient().create_kvs_client(name='test-kvs')
yield client
await client.drop()

Expand All @@ -26,7 +26,7 @@ async def test_memory_specific_purge_behavior() -> None:

# Create KVS and add data
kvs_client1 = await MemoryStorageClient().create_kvs_client(
name='test_purge_kvs',
name='test-purge-kvs',
)
await kvs_client1.set_value(key='test-key', value='initial value')

Expand All @@ -37,7 +37,7 @@ async def test_memory_specific_purge_behavior() -> None:

# Reopen with same storage client instance
kvs_client2 = await MemoryStorageClient().create_kvs_client(
name='test_purge_kvs',
name='test-purge-kvs',
)

# Verify value was purged (memory storage specific behavior)
Expand Down
6 changes: 3 additions & 3 deletions tests/unit/storage_clients/_memory/test_memory_rq_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
@pytest.fixture
async def rq_client() -> AsyncGenerator[MemoryRequestQueueClient, None]:
"""Fixture that provides a fresh memory request queue client for each test."""
client = await MemoryStorageClient().create_rq_client(name='test_rq')
client = await MemoryStorageClient().create_rq_client(name='test-rq')
yield client
await client.drop()

Expand All @@ -26,7 +26,7 @@ async def test_memory_specific_purge_behavior() -> None:
"""Test memory-specific purge behavior and in-memory storage characteristics."""
# Create RQ and add data
rq_client1 = await MemoryStorageClient().create_rq_client(
name='test_purge_rq',
name='test-purge-rq',
)
request = Request.from_url(url='https://example.com/initial')
await rq_client1.add_batch_of_requests([request])
Expand All @@ -36,7 +36,7 @@ async def test_memory_specific_purge_behavior() -> None:

# Reopen with same storage client instance
rq_client2 = await MemoryStorageClient().create_rq_client(
name='test_purge_rq',
name='test-purge-rq',
)

# Verify queue was purged (memory storage specific behavior)
Expand Down
12 changes: 6 additions & 6 deletions tests/unit/storage_clients/_sql/test_sql_dataset_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ async def dataset_client(
async with SqlStorageClient() as storage_client:
monkeypatch.setattr(storage_client, '_accessed_modified_update_interval', timedelta(seconds=0))
client = await storage_client.create_dataset_client(
name='test_dataset',
name='test-dataset',
configuration=configuration,
)
yield client
Expand All @@ -57,7 +57,7 @@ async def test_create_tables_with_connection_string(configuration: Configuration

async with SqlStorageClient(connection_string=f'sqlite+aiosqlite:///{storage_dir}') as storage_client:
await storage_client.create_dataset_client(
name='new_dataset',
name='new-dataset',
configuration=configuration,
)

Expand All @@ -75,7 +75,7 @@ async def test_create_tables_with_engine(configuration: Configuration, tmp_path:

async with SqlStorageClient(engine=engine) as storage_client:
await storage_client.create_dataset_client(
name='new_dataset',
name='new-dataset',
configuration=configuration,
)

Expand All @@ -89,7 +89,7 @@ async def test_tables_and_metadata_record(configuration: Configuration) -> None:
"""Test that SQL dataset creates proper tables and metadata records."""
async with SqlStorageClient() as storage_client:
client = await storage_client.create_dataset_client(
name='new_dataset',
name='new-dataset',
configuration=configuration,
)

Expand All @@ -101,12 +101,12 @@ async def test_tables_and_metadata_record(configuration: Configuration) -> None:
assert 'datasets' in tables

async with client.get_session() as session:
stmt = select(DatasetMetadataDb).where(DatasetMetadataDb.name == 'new_dataset')
stmt = select(DatasetMetadataDb).where(DatasetMetadataDb.name == 'new-dataset')
result = await session.execute(stmt)
orm_metadata = result.scalar_one_or_none()
assert orm_metadata is not None
assert orm_metadata.id == client_metadata.id
assert orm_metadata.name == 'new_dataset'
assert orm_metadata.name == 'new-dataset'
assert orm_metadata.item_count == 0

await client.drop()
Expand Down
12 changes: 6 additions & 6 deletions tests/unit/storage_clients/_sql/test_sql_kvs_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ async def kvs_client(
async with SqlStorageClient() as storage_client:
monkeypatch.setattr(storage_client, '_accessed_modified_update_interval', timedelta(seconds=0))
client = await storage_client.create_kvs_client(
name='test_kvs',
name='test-kvs',
configuration=configuration,
)
monkeypatch.setattr(client, '_accessed_modified_update_interval', timedelta(seconds=0))
Expand All @@ -60,7 +60,7 @@ async def test_create_tables_with_connection_string(configuration: Configuration

async with SqlStorageClient(connection_string=f'sqlite+aiosqlite:///{storage_dir}') as storage_client:
await storage_client.create_kvs_client(
name='new_kvs',
name='new-kvs',
configuration=configuration,
)

Expand All @@ -78,7 +78,7 @@ async def test_create_tables_with_engine(configuration: Configuration, tmp_path:

async with SqlStorageClient(engine=engine) as storage_client:
await storage_client.create_kvs_client(
name='new_kvs',
name='new-kvs',
configuration=configuration,
)

Expand All @@ -92,7 +92,7 @@ async def test_tables_and_metadata_record(configuration: Configuration) -> None:
"""Test that SQL key-value store creates proper tables and metadata records."""
async with SqlStorageClient() as storage_client:
client = await storage_client.create_kvs_client(
name='new_kvs',
name='new-kvs',
configuration=configuration,
)

Expand All @@ -104,12 +104,12 @@ async def test_tables_and_metadata_record(configuration: Configuration) -> None:
assert 'key_value_store_records' in tables

async with client.get_session() as session:
stmt = select(KeyValueStoreMetadataDb).where(KeyValueStoreMetadataDb.name == 'new_kvs')
stmt = select(KeyValueStoreMetadataDb).where(KeyValueStoreMetadataDb.name == 'new-kvs')
result = await session.execute(stmt)
orm_metadata = result.scalar_one_or_none()
metadata = KeyValueStoreMetadata.model_validate(orm_metadata)
assert metadata.id == client_metadata.id
assert metadata.name == 'new_kvs'
assert metadata.name == 'new-kvs'

await client.drop()

Expand Down
Loading
Loading