Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion src/crawlee/storages/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,9 @@ async def open(
Args:
id: The storage ID.
name: The storage name (global scope, persists across runs).
name: The storage name (global scope, persists across runs). Name can only contain letters "a" through "z",
the digits "0" through "9", and the hyphen ("-") but only in the middle of the string
(e.g. "my-value-1").
alias: The storage alias (run scope, creates unnamed storage).
configuration: Configuration object used during the storage creation or restoration process.
storage_client: Underlying storage client to use. If not provided, the default global storage client
Expand Down
3 changes: 3 additions & 0 deletions src/crawlee/storages/_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from ._base import Storage
from ._key_value_store import KeyValueStore
from ._utils import validate_storage_name

if TYPE_CHECKING:
from collections.abc import AsyncIterator
Expand Down Expand Up @@ -75,6 +76,8 @@ def __init__(self, client: DatasetClient, id: str, name: str | None) -> None:
id: The unique identifier of the storage.
name: The name of the storage, if available.
"""
validate_storage_name(name)

self._client = client
self._id = id
self._name = name
Expand Down
3 changes: 3 additions & 0 deletions src/crawlee/storages/_key_value_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from crawlee.storage_clients.models import KeyValueStoreMetadata

from ._base import Storage
from ._utils import validate_storage_name

if TYPE_CHECKING:
from collections.abc import AsyncIterator
Expand Down Expand Up @@ -84,6 +85,8 @@ def __init__(self, client: KeyValueStoreClient, id: str, name: str | None) -> No
id: The unique identifier of the storage.
name: The name of the storage, if available.
"""
validate_storage_name(name)

self._client = client
self._id = id
self._name = name
Expand Down
3 changes: 3 additions & 0 deletions src/crawlee/storages/_request_queue.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from crawlee.request_loaders import RequestManager

from ._base import Storage
from ._utils import validate_storage_name

if TYPE_CHECKING:
from collections.abc import Sequence
Expand Down Expand Up @@ -80,6 +81,8 @@ def __init__(self, client: RequestQueueClient, id: str, name: str | None) -> Non
id: The unique identifier of the storage.
name: The name of the storage, if available.
"""
validate_storage_name(name)

self._client = client
self._id = id
self._name = name
Expand Down
10 changes: 9 additions & 1 deletion src/crawlee/storages/_storage_instance_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
from crawlee.storage_clients._base import DatasetClient, KeyValueStoreClient, RequestQueueClient

from ._utils import validate_storage_name

if TYPE_CHECKING:
from ._base import Storage

Expand Down Expand Up @@ -90,7 +92,9 @@ async def open_storage_instance(
Args:
cls: The storage class to instantiate.
id: Storage ID.
name: Storage name. (global scope, persists across runs).
name: Storage name (global scope, persists across runs). Name can only contain letters "a" through "z",
the digits "0" through "9", and the hyphen ("-") but only in the middle of the string
(e.g. "my-value-1").
alias: Storage alias (run scope, creates unnamed storage).
client_opener_coro: Coroutine to open the storage client when storage instance not found in cache.
storage_client_cache_key: Additional optional key from storage client to differentiate cache entries.
Expand Down Expand Up @@ -146,6 +150,10 @@ async def open_storage_instance(
f'Use a different name or drop the existing alias storage first.'
)

# Validate storage name
if name is not None:
validate_storage_name(name)

# Create new instance
client: KeyValueStoreClient | DatasetClient | RequestQueueClient
client = await client_opener_coro
Expand Down
11 changes: 11 additions & 0 deletions src/crawlee/storages/_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import re

NAME_REGEX = re.compile(r'^([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9-]*[a-zA-Z0-9])$')


def validate_storage_name(name: str | None) -> None:
if name and not NAME_REGEX.match(name):
raise ValueError(
f'Invalid storage name "{name}". Name can only contain letters "a" through "z", the digits "0" through'
'"9", and the hyphen ("-") but only in the middle of the string (e.g. "my-value-1")'
)
Original file line number Diff line number Diff line change
Expand Up @@ -473,7 +473,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None:
async def test_adaptive_playwright_crawler_statistics_in_init() -> None:
"""Tests that adaptive crawler uses created AdaptivePlaywrightCrawlerStatistics from inputted Statistics."""
persistence_enabled = True
persist_state_kvs_name = 'some name'
persist_state_kvs_name = 'some-name'
persist_state_key = 'come key'
log_message = 'some message'
periodic_message_logger = logging.getLogger('some logger')
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/otel/test_crawler_instrumentor.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ async def test_crawler_instrumentor_capability(server_url: URL) -> None:

# Generate first telemetry data from `Dataset` public methods.
# `Dataset` is in `instrument_classes` argument, and thus its public methods are instrumented.
dataset = await Dataset.open(name='test_dataset')
dataset = await Dataset.open(name='test-dataset')
await dataset.drop()

# Other traces will be from crawler run.
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/sessions/test_session_pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from collections.abc import AsyncGenerator

MAX_POOL_SIZE = 3
KVS_NAME = 'test_session_pool'
KVS_NAME = 'test-session-pool'
PERSIST_STATE_KEY = 'crawlee_session_pool_state'


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,14 @@ def configuration(tmp_path: Path) -> Configuration:
@pytest.fixture
async def dataset_client(configuration: Configuration) -> AsyncGenerator[FileSystemDatasetClient, None]:
"""A fixture for a file system dataset client."""
client = await FileSystemStorageClient().create_dataset_client(name='test_dataset', configuration=configuration)
client = await FileSystemStorageClient().create_dataset_client(name='test-dataset', configuration=configuration)
yield client
await client.drop()


async def test_file_and_directory_creation(configuration: Configuration) -> None:
"""Test that file system dataset creates proper files and directories."""
client = await FileSystemStorageClient().create_dataset_client(name='new_dataset', configuration=configuration)
client = await FileSystemStorageClient().create_dataset_client(name='new-dataset', configuration=configuration)

# Verify files were created
assert client.path_to_dataset.exists()
Expand All @@ -45,7 +45,7 @@ async def test_file_and_directory_creation(configuration: Configuration) -> None
metadata = json.load(f)
client_metadata = await client.get_metadata()
assert metadata['id'] == client_metadata.id
assert metadata['name'] == 'new_dataset'
assert metadata['name'] == 'new-dataset'
assert metadata['item_count'] == 0

await client.drop()
Expand Down
6 changes: 3 additions & 3 deletions tests/unit/storage_clients/_file_system/test_fs_kvs_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,14 @@ def configuration(tmp_path: Path) -> Configuration:
@pytest.fixture
async def kvs_client(configuration: Configuration) -> AsyncGenerator[FileSystemKeyValueStoreClient, None]:
"""A fixture for a file system key-value store client."""
client = await FileSystemStorageClient().create_kvs_client(name='test_kvs', configuration=configuration)
client = await FileSystemStorageClient().create_kvs_client(name='test-kvs', configuration=configuration)
yield client
await client.drop()


async def test_file_and_directory_creation(configuration: Configuration) -> None:
"""Test that file system KVS creates proper files and directories."""
client = await FileSystemStorageClient().create_kvs_client(name='new_kvs', configuration=configuration)
client = await FileSystemStorageClient().create_kvs_client(name='new-kvs', configuration=configuration)

# Verify files were created
assert client.path_to_kvs.exists()
Expand All @@ -44,7 +44,7 @@ async def test_file_and_directory_creation(configuration: Configuration) -> None
with client.path_to_metadata.open() as f:
metadata = json.load(f)
assert metadata['id'] == (await client.get_metadata()).id
assert metadata['name'] == 'new_kvs'
assert metadata['name'] == 'new-kvs'

await client.drop()

Expand Down
6 changes: 3 additions & 3 deletions tests/unit/storage_clients/_file_system/test_fs_rq_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,15 @@ def configuration(tmp_path: Path) -> Configuration:
async def rq_client() -> AsyncGenerator[FileSystemRequestQueueClient, None]:
"""A fixture for a file system request queue client."""
client = await FileSystemStorageClient().create_rq_client(
name='test_request_queue',
name='test-request-queue',
)
yield client
await client.drop()


async def test_file_and_directory_creation() -> None:
"""Test that file system RQ creates proper files and directories."""
client = await FileSystemStorageClient().create_rq_client(name='new_request_queue')
client = await FileSystemStorageClient().create_rq_client(name='new-request-queue')

# Verify files were created
assert client.path_to_rq.exists()
Expand All @@ -46,7 +46,7 @@ async def test_file_and_directory_creation() -> None:
with client.path_to_metadata.open() as f:
metadata = json.load(f)
assert metadata['id'] == (await client.get_metadata()).id
assert metadata['name'] == 'new_request_queue'
assert metadata['name'] == 'new-request-queue'

await client.drop()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
@pytest.fixture
async def dataset_client() -> AsyncGenerator[MemoryDatasetClient, None]:
"""Fixture that provides a fresh memory dataset client for each test."""
client = await MemoryStorageClient().create_dataset_client(name='test_dataset')
client = await MemoryStorageClient().create_dataset_client(name='test-dataset')
yield client
await client.drop()

Expand All @@ -25,7 +25,7 @@ async def test_memory_specific_purge_behavior() -> None:
"""Test memory-specific purge behavior and in-memory storage characteristics."""
# Create dataset and add data
dataset_client1 = await MemoryStorageClient().create_dataset_client(
name='test_purge_dataset',
name='test-purge-dataset',
)
await dataset_client1.push_data({'item': 'initial data'})

Expand All @@ -35,7 +35,7 @@ async def test_memory_specific_purge_behavior() -> None:

# Reopen with same storage client instance
dataset_client2 = await MemoryStorageClient().create_dataset_client(
name='test_purge_dataset',
name='test-purge-dataset',
)

# Verify data was purged (memory storage specific behavior)
Expand Down
6 changes: 3 additions & 3 deletions tests/unit/storage_clients/_memory/test_memory_kvs_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
@pytest.fixture
async def kvs_client() -> AsyncGenerator[MemoryKeyValueStoreClient, None]:
"""Fixture that provides a fresh memory key-value store client for each test."""
client = await MemoryStorageClient().create_kvs_client(name='test_kvs')
client = await MemoryStorageClient().create_kvs_client(name='test-kvs')
yield client
await client.drop()

Expand All @@ -26,7 +26,7 @@ async def test_memory_specific_purge_behavior() -> None:

# Create KVS and add data
kvs_client1 = await MemoryStorageClient().create_kvs_client(
name='test_purge_kvs',
name='test-purge-kvs',
)
await kvs_client1.set_value(key='test-key', value='initial value')

Expand All @@ -37,7 +37,7 @@ async def test_memory_specific_purge_behavior() -> None:

# Reopen with same storage client instance
kvs_client2 = await MemoryStorageClient().create_kvs_client(
name='test_purge_kvs',
name='test-purge-kvs',
)

# Verify value was purged (memory storage specific behavior)
Expand Down
6 changes: 3 additions & 3 deletions tests/unit/storage_clients/_memory/test_memory_rq_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
@pytest.fixture
async def rq_client() -> AsyncGenerator[MemoryRequestQueueClient, None]:
"""Fixture that provides a fresh memory request queue client for each test."""
client = await MemoryStorageClient().create_rq_client(name='test_rq')
client = await MemoryStorageClient().create_rq_client(name='test-rq')
yield client
await client.drop()

Expand All @@ -26,7 +26,7 @@ async def test_memory_specific_purge_behavior() -> None:
"""Test memory-specific purge behavior and in-memory storage characteristics."""
# Create RQ and add data
rq_client1 = await MemoryStorageClient().create_rq_client(
name='test_purge_rq',
name='test-purge-rq',
)
request = Request.from_url(url='https://example.com/initial')
await rq_client1.add_batch_of_requests([request])
Expand All @@ -36,7 +36,7 @@ async def test_memory_specific_purge_behavior() -> None:

# Reopen with same storage client instance
rq_client2 = await MemoryStorageClient().create_rq_client(
name='test_purge_rq',
name='test-purge-rq',
)

# Verify queue was purged (memory storage specific behavior)
Expand Down
12 changes: 6 additions & 6 deletions tests/unit/storage_clients/_sql/test_sql_dataset_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ async def dataset_client(
async with SqlStorageClient() as storage_client:
monkeypatch.setattr(storage_client, '_accessed_modified_update_interval', timedelta(seconds=0))
client = await storage_client.create_dataset_client(
name='test_dataset',
name='test-dataset',
configuration=configuration,
)
yield client
Expand All @@ -57,7 +57,7 @@ async def test_create_tables_with_connection_string(configuration: Configuration

async with SqlStorageClient(connection_string=f'sqlite+aiosqlite:///{storage_dir}') as storage_client:
await storage_client.create_dataset_client(
name='new_dataset',
name='new-dataset',
configuration=configuration,
)

Expand All @@ -75,7 +75,7 @@ async def test_create_tables_with_engine(configuration: Configuration, tmp_path:

async with SqlStorageClient(engine=engine) as storage_client:
await storage_client.create_dataset_client(
name='new_dataset',
name='new-dataset',
configuration=configuration,
)

Expand All @@ -89,7 +89,7 @@ async def test_tables_and_metadata_record(configuration: Configuration) -> None:
"""Test that SQL dataset creates proper tables and metadata records."""
async with SqlStorageClient() as storage_client:
client = await storage_client.create_dataset_client(
name='new_dataset',
name='new-dataset',
configuration=configuration,
)

Expand All @@ -101,12 +101,12 @@ async def test_tables_and_metadata_record(configuration: Configuration) -> None:
assert 'datasets' in tables

async with client.get_session() as session:
stmt = select(DatasetMetadataDb).where(DatasetMetadataDb.name == 'new_dataset')
stmt = select(DatasetMetadataDb).where(DatasetMetadataDb.name == 'new-dataset')
result = await session.execute(stmt)
orm_metadata = result.scalar_one_or_none()
assert orm_metadata is not None
assert orm_metadata.id == client_metadata.id
assert orm_metadata.name == 'new_dataset'
assert orm_metadata.name == 'new-dataset'
assert orm_metadata.item_count == 0

await client.drop()
Expand Down
12 changes: 6 additions & 6 deletions tests/unit/storage_clients/_sql/test_sql_kvs_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ async def kvs_client(
async with SqlStorageClient() as storage_client:
monkeypatch.setattr(storage_client, '_accessed_modified_update_interval', timedelta(seconds=0))
client = await storage_client.create_kvs_client(
name='test_kvs',
name='test-kvs',
configuration=configuration,
)
monkeypatch.setattr(client, '_accessed_modified_update_interval', timedelta(seconds=0))
Expand All @@ -60,7 +60,7 @@ async def test_create_tables_with_connection_string(configuration: Configuration

async with SqlStorageClient(connection_string=f'sqlite+aiosqlite:///{storage_dir}') as storage_client:
await storage_client.create_kvs_client(
name='new_kvs',
name='new-kvs',
configuration=configuration,
)

Expand All @@ -78,7 +78,7 @@ async def test_create_tables_with_engine(configuration: Configuration, tmp_path:

async with SqlStorageClient(engine=engine) as storage_client:
await storage_client.create_kvs_client(
name='new_kvs',
name='new-kvs',
configuration=configuration,
)

Expand All @@ -92,7 +92,7 @@ async def test_tables_and_metadata_record(configuration: Configuration) -> None:
"""Test that SQL key-value store creates proper tables and metadata records."""
async with SqlStorageClient() as storage_client:
client = await storage_client.create_kvs_client(
name='new_kvs',
name='new-kvs',
configuration=configuration,
)

Expand All @@ -104,12 +104,12 @@ async def test_tables_and_metadata_record(configuration: Configuration) -> None:
assert 'key_value_store_records' in tables

async with client.get_session() as session:
stmt = select(KeyValueStoreMetadataDb).where(KeyValueStoreMetadataDb.name == 'new_kvs')
stmt = select(KeyValueStoreMetadataDb).where(KeyValueStoreMetadataDb.name == 'new-kvs')
result = await session.execute(stmt)
orm_metadata = result.scalar_one_or_none()
metadata = KeyValueStoreMetadata.model_validate(orm_metadata)
assert metadata.id == client_metadata.id
assert metadata.name == 'new_kvs'
assert metadata.name == 'new-kvs'

await client.drop()

Expand Down
Loading
Loading