Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
8e2f5d4
Draft for tests
Pijukatel Aug 26, 2025
1d869a4
Updated draft
Pijukatel Aug 27, 2025
08df986
Try to use list_head
Pijukatel Aug 27, 2025
6131fff
Locks not needed with in_progress
Pijukatel Aug 27, 2025
553663a
Add alternate client
Pijukatel Aug 27, 2025
eadab26
WIP
Pijukatel Aug 28, 2025
249f8f5
Find the chacing problem.
Pijukatel Aug 28, 2025
4ada123
Merge remote-tracking branch 'origin/master' into no-locking-queue
Pijukatel Aug 28, 2025
10e0652
Wip changes
Pijukatel Aug 28, 2025
359c46e
Add init cache test, update upgrading guide
Pijukatel Sep 12, 2025
ce090c0
Merge remote-tracking branch 'origin/master' into no-locking-queue
Pijukatel Sep 18, 2025
b511011
Finalize change and add few more tests
Pijukatel Sep 19, 2025
fb32861
Merge remote-tracking branch 'origin/master' into no-locking-queue
Pijukatel Sep 19, 2025
7ec13ef
Remove unnecesary methods from the specialized client
Pijukatel Sep 19, 2025
10bc7e2
Merge remote-tracking branch 'origin/master' into no-locking-queue
Pijukatel Sep 19, 2025
7712410
Rename default_request_queue_apify
Pijukatel Sep 19, 2025
e63f546
Use single and shared literals and rename the RQ client classes
Pijukatel Sep 19, 2025
ffa70ff
Merge remote-tracking branch 'origin/master' into no-locking-queue
Pijukatel Sep 19, 2025
e5bdff2
Update tests
Pijukatel Sep 22, 2025
57cd8ae
Merge remote-tracking branch 'origin/master' into no-locking-queue
Pijukatel Sep 22, 2025
79c02f5
Update upgrading guide
Pijukatel Sep 22, 2025
d29a534
Extract storage related complexity from Actor to dedicated storage cl…
Pijukatel Sep 24, 2025
506b770
Merge remote-tracking branch 'origin/master' into no-locking-queue
Pijukatel Sep 24, 2025
1cc80bb
Update log test
Pijukatel Sep 24, 2025
860b0ec
Rename access to request_queue_access
Pijukatel Sep 24, 2025
e6c6fc5
Update src/apify/_actor.py
Pijukatel Sep 24, 2025
da2f5df
Review comments
Pijukatel Sep 24, 2025
8861c5e
Merge remote-tracking branch 'origin/master' into no-locking-queue
Pijukatel Sep 24, 2025
1e8a834
Review comments
Pijukatel Sep 24, 2025
de941d4
Update based on Crawlee update
Pijukatel Sep 25, 2025
b4a588d
Merge remote-tracking branch 'origin/master' into no-locking-queue
Pijukatel Sep 25, 2025
c5968bc
Use composition instead of inheritance
Pijukatel Sep 25, 2025
49c357e
Polish some docs
Pijukatel Sep 25, 2025
6edb093
More docs polishing
Pijukatel Sep 25, 2025
b17ebef
Track pending_request_count in local metadata estimation
Pijukatel Sep 26, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 27 additions & 4 deletions docs/04_upgrading/upgrading_to_v3.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,28 @@ async def main():

<!-- TODO -->

## Explicit control over storage clients used in Actor
- It is now possible to have full control over which storage clients are used by the `Actor`. To make development of Actors convenient, the `Actor` has two storage clients: one that is used when running on the Apify platform or when opening storages with `force_cloud=True`, and another that is used when running outside the Apify platform. The `Actor` has reasonable defaults, and for the majority of use cases there is no need to change them. However, if you need to use a different storage client, you can set it up through `service_locator` before entering the `Actor` context.

**Now (v3.0):**
```python
from crawlee import service_locator
from apify.storage_clients import ApifyStorageClient, ApifyHybridStorageClient, MemoryStorageClient
from apify import Actor


async def main():
service_locator.set_storage_client(
ApifyHybridStorageClient(
cloud_storage_client=ApifyStorageClient(access="single"),
local_storage_client=MemoryStorageClient()
)
)
async with Actor:
rq = await Actor.open_request_queue()
```


## The default use of optimized ApifyRequestQueueClient

- The default client for working with the Apify platform based `RequestQueue` is now an optimized and simplified client that makes significantly fewer API calls, but does not support multiple consumers working on the same queue. It is cheaper and faster, and is suitable for the majority of use cases.
Expand All @@ -61,12 +83,13 @@ async def main():
**Now (v3.0):**

```python
from apify.storages import RequestQueue
from crawlee import service_locator
from apify.storage_clients import ApifyStorageClient
from apify import Actor

async def main():
# Full client that supports multiple consumers of the Apify Request Queue
rq_shared = await RequestQueue.open(storage_client=ApifyStorageClient(access="shared"))
# Default optimized client that expects only single consumer of the Apify Request Queue
rq_single = await RequestQueue.open(storage_client=ApifyStorageClient())
service_locator.set_storage_client(ApifyStorageClient(access="shared"))
async with Actor:
rq = await Actor.open_request_queue()
```
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ keywords = [
dependencies = [
"apify-client>=2.0.0,<3.0.0",
"apify-shared>=2.0.0,<3.0.0",
"crawlee==0.6.13b42",
"crawlee @ git+https://github.com/apify/crawlee-python.git@include-storag-client-in-additional-cache-key",
"cachetools>=5.5.0",
"cryptography>=42.0.0",
"impit>=0.6.1",
Expand Down
104 changes: 34 additions & 70 deletions src/apify/_actor.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
from apify.log import _configure_logging, logger
from apify.storage_clients import ApifyStorageClient
from apify.storage_clients._file_system import ApifyFileSystemStorageClient
from apify.storages import Dataset, KeyValueStore, RequestQueue
from apify.storage_clients._hybrid_apify._storage_client import ApifyHybridStorageClient

if TYPE_CHECKING:
import logging
Expand All @@ -48,9 +48,9 @@
from typing_extensions import Self

from crawlee.proxy_configuration import _NewUrlFunction
from crawlee.storage_clients import StorageClient

from apify._models import Webhook
from apify.storages import Dataset, KeyValueStore, RequestQueue


MainReturnType = TypeVar('MainReturnType')
Expand Down Expand Up @@ -131,7 +131,6 @@ def __init__(
self._configuration = configuration
self._configure_logging = configure_logging
self._apify_client: ApifyClientAsync | None = None
self._local_storage_client: StorageClient | None = None

self._is_initialized = False

Expand Down Expand Up @@ -234,45 +233,49 @@ def log(self) -> logging.Logger:
"""The logging.Logger instance the Actor uses."""
return logger

def _get_local_storage_client(self) -> StorageClient:
"""Get the local storage client the Actor instance uses."""
if self._local_storage_client:
return self._local_storage_client
def _raise_if_not_initialized(self) -> None:
if not self._is_initialized:
raise RuntimeError('The Actor was not initialized!')

@cached_property
def _storage_client(self) -> ApifyHybridStorageClient:
"""Storage client used by the actor.

Depending on the initialization of the service locator the client can be created in different ways.
"""
try:
# Set implicit default local storage client, unless local storage client was already set.
implicit_storage_client = ApifyFileSystemStorageClient()
# Nothing was set by the user.
implicit_storage_client = ApifyHybridStorageClient(
local_storage_client=ApifyFileSystemStorageClient(), cloud_storage_client=ApifyStorageClient()
)
service_locator.set_storage_client(implicit_storage_client)
self._local_storage_client = implicit_storage_client
except ServiceConflictError:
self.log.debug(
'Storage client in service locator was set explicitly before Actor.init was called.'
'Using the existing storage client as implicit storage client for the Actor.'
)
else:
return implicit_storage_client

self._local_storage_client = service_locator.get_storage_client()
if type(self._local_storage_client) is FileSystemStorageClient:
# User set something in the service locator.
storage_client = service_locator.get_storage_client()
if isinstance(storage_client, ApifyHybridStorageClient):
# The client was manually set to the right type in the service locator. This is the explicit way.
return storage_client

if isinstance(storage_client, ApifyStorageClient):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So if I do service_locator.set_storage_client(ApifyStorageClient()) before Actor init, this will force using the filesystem locally? That doesn't sound too desirable.

Copy link
Contributor Author

@Pijukatel Pijukatel Sep 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, at this point, it is a guess what the user wants. Only a fully explicit setting of the SmartApifyStorageClient tells us what the user is really trying to do. We can guess in the other direction or throw an exception and allow only one of the two options: a fully implicit (default) or a fully explicit client (SmartApifyStorageClient).

It is kind of an edge case, so I am fine with any of those. If you have a strong preference, I will do it that way.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this particular case, I would throw instead of guessing, but include detailed instructions for setting a custom client.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, throwing

# The cloud storage client was manually set in the service locator.
return ApifyHybridStorageClient(cloud_storage_client=storage_client)

# The local storage client was manually set in the service locator
if type(storage_client) is FileSystemStorageClient:
self.log.warning(
f'Using {FileSystemStorageClient.__module__}.{FileSystemStorageClient.__name__} in Actor context is not'
f' recommended and can lead to problems with reading the input file. Use '
f'`apify.storage_clients.FileSystemStorageClient` instead.'
)

return self._local_storage_client

def _raise_if_not_initialized(self) -> None:
if not self._is_initialized:
raise RuntimeError('The Actor was not initialized!')

def _raise_if_cloud_requested_but_not_configured(self, *, force_cloud: bool) -> None:
if not force_cloud:
return

if not self.is_at_home() and self.configuration.token is None:
raise RuntimeError(
'In order to use the Apify cloud storage from your computer, '
'you need to provide an Apify token using the APIFY_TOKEN environment variable.'
)
return ApifyHybridStorageClient(cloud_storage_client=ApifyStorageClient(), local_storage_client=storage_client)

async def init(self) -> None:
"""Initialize the Actor instance.
Expand All @@ -298,22 +301,13 @@ async def init(self) -> None:
if _ActorType._is_any_instance_initialized:
self.log.warning('Repeated Actor initialization detected - this is non-standard usage, proceed with care')

# Create an instance of the cloud storage client, the local storage client is obtained
# from the service locator
self._cloud_storage_client = ApifyStorageClient()

# Make sure that the currently initialized instance is also available through the global `Actor` proxy
cast('Proxy', Actor).__wrapped__ = self

self._is_exiting = False
self._was_final_persist_state_emitted = False

# If the Actor is running on the Apify platform, we set the cloud storage client.
if self.is_at_home():
service_locator.set_storage_client(self._cloud_storage_client)
self._local_storage_client = self._cloud_storage_client
else:
self._get_local_storage_client()
self.log.debug(f'Storage client set to {self._storage_client}')

service_locator.set_event_manager(self.event_manager)

Expand Down Expand Up @@ -470,17 +464,7 @@ async def open_dataset(
An instance of the `Dataset` class for the given ID or name.
"""
self._raise_if_not_initialized()
self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud)

storage_client = self._cloud_storage_client if force_cloud else self._get_local_storage_client()

return await Dataset.open(
id=id,
alias=alias,
name=name,
configuration=self.configuration,
storage_client=storage_client,
)
return await self._storage_client.open_dataset(id=id, name=name, alias=alias, force_cloud=force_cloud)

async def open_key_value_store(
self,
Expand Down Expand Up @@ -509,17 +493,7 @@ async def open_key_value_store(
An instance of the `KeyValueStore` class for the given ID or name.
"""
self._raise_if_not_initialized()
self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud)

storage_client = self._cloud_storage_client if force_cloud else self._get_local_storage_client()

return await KeyValueStore.open(
id=id,
alias=alias,
name=name,
configuration=self.configuration,
storage_client=storage_client,
)
return await self._storage_client.open_key_value_store(id=id, name=name, alias=alias, force_cloud=force_cloud)

async def open_request_queue(
self,
Expand Down Expand Up @@ -550,17 +524,7 @@ async def open_request_queue(
An instance of the `RequestQueue` class for the given ID or name.
"""
self._raise_if_not_initialized()
self._raise_if_cloud_requested_but_not_configured(force_cloud=force_cloud)

storage_client = self._cloud_storage_client if force_cloud else self._get_local_storage_client()

return await RequestQueue.open(
id=id,
alias=alias,
name=name,
configuration=self.configuration,
storage_client=storage_client,
)
return await self._storage_client.open_request_queue(id=id, name=name, alias=alias, force_cloud=force_cloud)

@overload
async def push_data(self, data: dict | list[dict]) -> None: ...
Expand Down
2 changes: 2 additions & 0 deletions src/apify/storage_clients/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@

from ._apify import ApifyStorageClient
from ._file_system import ApifyFileSystemStorageClient as FileSystemStorageClient
from ._hybrid_apify import ApifyHybridStorageClient

__all__ = [
'ApifyHybridStorageClient',
'ApifyStorageClient',
'FileSystemStorageClient',
'MemoryStorageClient',
Expand Down
1 change: 1 addition & 0 deletions src/apify/storage_clients/_hybrid_apify/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from ._storage_client import ApifyHybridStorageClient
Loading