diff --git a/docs/04_upgrading/upgrading_to_v3.md b/docs/04_upgrading/upgrading_to_v3.md index 23c5d043..6eae9a21 100644 --- a/docs/04_upgrading/upgrading_to_v3.md +++ b/docs/04_upgrading/upgrading_to_v3.md @@ -101,3 +101,21 @@ async def main(): storage_client=custom_storage_client, ) ``` + +## Removed Actor.config property +- `Actor.config` property has been removed. Use `Actor.configuration` instead. + +## Default storage ids in configuration changed to None +- `Configuration.default_key_value_store_id` changed from `'default'` to `None`. +- `Configuration.default_dataset_id` changed from `'default'` to `None`. +- `Configuration.default_request_queue_id` changed from `'default'` to `None`. + +Previously, using the default storage without specifying its `id` in `Configuration` would lead to using a specific storage with id `'default'`. Now it will use a newly created unnamed storage with an `id` assigned by the Apify platform; consecutive calls to get the default storage will return the same storage. 
+ +## Storages + + + +## Storage clients + + diff --git a/src/apify/_configuration.py b/src/apify/_configuration.py index aba566b9..28158b55 100644 --- a/src/apify/_configuration.py +++ b/src/apify/_configuration.py @@ -142,7 +142,7 @@ class Configuration(CrawleeConfiguration): ] = None default_dataset_id: Annotated[ - str, + str | None, Field( validation_alias=AliasChoices( 'actor_default_dataset_id', @@ -150,10 +150,10 @@ class Configuration(CrawleeConfiguration): ), description='Default dataset ID used by the Apify storage client when no ID or name is provided.', ), - ] = 'default' + ] = None default_key_value_store_id: Annotated[ - str, + str | None, Field( validation_alias=AliasChoices( 'actor_default_key_value_store_id', @@ -161,10 +161,10 @@ class Configuration(CrawleeConfiguration): ), description='Default key-value store ID for the Apify storage client when no ID or name is provided.', ), - ] = 'default' + ] = None default_request_queue_id: Annotated[ - str, + str | None, Field( validation_alias=AliasChoices( 'actor_default_request_queue_id', @@ -172,7 +172,7 @@ class Configuration(CrawleeConfiguration): ), description='Default request queue ID for the Apify storage client when no ID or name is provided.', ), - ] = 'default' + ] = None disable_outdated_warning: Annotated[ bool, diff --git a/src/apify/storage_clients/_apify/_dataset_client.py b/src/apify/storage_clients/_apify/_dataset_client.py index e5ec91d0..8b6f3e11 100644 --- a/src/apify/storage_clients/_apify/_dataset_client.py +++ b/src/apify/storage_clients/_apify/_dataset_client.py @@ -124,8 +124,10 @@ async def open( ) apify_datasets_client = apify_client_async.datasets() - # Normalize 'default' alias to None - alias = None if alias == 'default' else alias + # Normalize unnamed default storage in cases where not defined in `configuration.default_dataset_id` to unnamed + # storage aliased as `__default__` + if not any([alias, name, id, configuration.default_dataset_id]): + alias = '__default__' if 
alias: # Check if there is pre-existing alias mapping in the default KVS. @@ -150,6 +152,11 @@ async def open( # If none are provided, try to get the default storage ID from environment variables. elif id is None: id = configuration.default_dataset_id + if not id: + raise ValueError( + 'Dataset "id", "name", or "alias" must be specified, ' + 'or a default dataset ID must be set in the configuration.' + ) # Now create the client for the determined ID apify_dataset_client = apify_client_async.dataset(dataset_id=id) diff --git a/src/apify/storage_clients/_apify/_key_value_store_client.py b/src/apify/storage_clients/_apify/_key_value_store_client.py index 9011d834..79215ba2 100644 --- a/src/apify/storage_clients/_apify/_key_value_store_client.py +++ b/src/apify/storage_clients/_apify/_key_value_store_client.py @@ -115,8 +115,10 @@ async def open( ) apify_kvss_client = apify_client_async.key_value_stores() - # Normalize 'default' alias to None - alias = None if alias == 'default' else alias + # Normalize unnamed default storage in cases where not defined in `configuration.default_key_value_store_id` to + # unnamed storage aliased as `__default__` + if not any([alias, name, id, configuration.default_key_value_store_id]): + alias = '__default__' if alias: # Check if there is pre-existing alias mapping in the default KVS. @@ -142,6 +144,11 @@ async def open( # If none are provided, try to get the default storage ID from environment variables. elif id is None: id = configuration.default_key_value_store_id + if not id: + raise ValueError( + 'KeyValueStore "id", "name", or "alias" must be specified, ' + 'or a default KeyValueStore ID must be set in the configuration.' 
+ ) # Now create the client for the determined ID apify_kvs_client = apify_client_async.key_value_store(key_value_store_id=id) diff --git a/src/apify/storage_clients/_apify/_request_queue_client.py b/src/apify/storage_clients/_apify/_request_queue_client.py index 39556d2d..893f26b9 100644 --- a/src/apify/storage_clients/_apify/_request_queue_client.py +++ b/src/apify/storage_clients/_apify/_request_queue_client.py @@ -200,8 +200,10 @@ async def open( ) apify_rqs_client = apify_client_async.request_queues() - # Normalize 'default' alias to None - alias = None if alias == 'default' else alias + # Normalize unnamed default storage in cases where not defined in `configuration.default_request_queue_id` to + # unnamed storage aliased as `__default__` + if not any([alias, name, id, configuration.default_request_queue_id]): + alias = '__default__' if alias: # Check if there is pre-existing alias mapping in the default KVS. @@ -226,6 +228,11 @@ async def open( # If none are provided, try to get the default storage ID from environment variables. elif id is None: id = configuration.default_request_queue_id + if not id: + raise ValueError( + 'RequestQueue "id", "name", or "alias" must be specified, ' + 'or a default default_request_queue_id ID must be set in the configuration.' + ) # Use suitable client_key to make `hadMultipleClients` response of Apify API useful. # It should persist across migrated or resurrected Actor runs on the Apify platform. diff --git a/src/apify/storage_clients/_apify/_utils.py b/src/apify/storage_clients/_apify/_utils.py index 6d05bff3..ebae80f7 100644 --- a/src/apify/storage_clients/_apify/_utils.py +++ b/src/apify/storage_clients/_apify/_utils.py @@ -76,7 +76,7 @@ async def _get_alias_map(cls) -> dict[str, str]: Returns: Map of aliases and storage ids. 
""" - if not cls._alias_map: + if not cls._alias_map and Configuration.get_global_configuration().is_at_home: default_kvs_client = await _get_default_kvs_client() record = await default_kvs_client.get_record(cls._ALIAS_MAPPING_KEY) @@ -156,7 +156,8 @@ async def _get_default_kvs_client() -> KeyValueStoreClientAsync: min_delay_between_retries_millis=500, timeout_secs=360, ) - + if not configuration.default_key_value_store_id: + raise ValueError("'Configuration.default_key_value_store_id' must be set.") return apify_client_async.key_value_store(key_value_store_id=configuration.default_key_value_store_id) diff --git a/tests/integration/test_apify_storages.py b/tests/integration/test_apify_storages.py index 0cf0c9af..83ad7ebd 100644 --- a/tests/integration/test_apify_storages.py +++ b/tests/integration/test_apify_storages.py @@ -5,7 +5,7 @@ from crawlee import service_locator from crawlee.storages import Dataset, KeyValueStore, RequestQueue -from apify import Configuration +from apify import Actor, Configuration from apify.storage_clients import ApifyStorageClient @@ -32,3 +32,44 @@ async def test_alias_concurrent_creation_local( except AssertionError: for storage in storages: await storage.drop() + + +@pytest.mark.parametrize( + 'storage_type', + [Dataset, KeyValueStore, RequestQueue], +) +async def test_unnamed_default_without_config( + storage_type: Dataset | KeyValueStore | RequestQueue, apify_token: str +) -> None: + """Test that default Apify storage used locally is unnamed storage.""" + service_locator.set_configuration(Configuration(token=apify_token)) + service_locator.set_storage_client(ApifyStorageClient()) + + # Open storage and make sure it has no name and it has id + storage = await storage_type.open() + assert storage.name is None + assert storage.id + + # Make sure the same instance is returned when opened again without name or alias + storage_again = await storage_type.open() + assert storage is storage_again + + await storage.drop() + + 
+@pytest.mark.parametrize( + 'storage_type', + [Dataset, KeyValueStore, RequestQueue], +) +async def test_aliases_not_stored_on_platform_when_local( + storage_type: Dataset | KeyValueStore | RequestQueue, apify_token: str +) -> None: + """Test that default Apify storage used locally is not persisting aliases to Apify based default KVS.""" + service_locator.set_configuration(Configuration(token=apify_token)) + service_locator.set_storage_client(ApifyStorageClient()) + async with Actor(configure_logging=False): + await storage_type.open(alias='test') + default_kvs = await Actor.open_key_value_store(force_cloud=True) + + # The default KVS should be empty + assert len(await default_kvs.list_keys()) == 0