Commit 1f47475

refactor!: Refactor service container and other related components
1 parent a5ff59c commit 1f47475

13 files changed: +148 −181 lines changed
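
The change applies one pattern across all touched files: components stop accepting a `Configuration` object in their constructors and instead resolve it from the global service container. A minimal sketch of that pattern, using only the `service_container` accessors that appear in this diff (`set_configuration`, `get_configuration`):

```python
from crawlee import service_container
from crawlee.configuration import Configuration

# Optionally install a custom configuration before any component reads it.
service_container.set_configuration(Configuration())

# Refactored components now resolve the configuration themselves:
config = service_container.get_configuration()
```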

src/crawlee/_log_config.py

Lines changed: 15 additions & 19 deletions

```diff
@@ -4,13 +4,12 @@
 import logging
 import sys
 import textwrap
-from typing import TYPE_CHECKING, Any
+from typing import Any
 
 from colorama import Fore, Style, just_fix_windows_console
 from typing_extensions import assert_never
 
-if TYPE_CHECKING:
-    from crawlee.configuration import Configuration
+from crawlee import service_container
 
 just_fix_windows_console()
 
@@ -35,35 +34,32 @@
 _LOG_MESSAGE_INDENT = ' ' * 6
 
 
-def get_configured_log_level(configuration: Configuration) -> int:
-    verbose_logging_requested = 'verbose_log' in configuration.model_fields_set and configuration.verbose_log
+def get_configured_log_level() -> int:
+    config = service_container.get_configuration()
 
-    if 'log_level' in configuration.model_fields_set:
-        if configuration.log_level == 'DEBUG':
+    verbose_logging_requested = 'verbose_log' in config.model_fields_set and config.verbose_log
+
+    if 'log_level' in config.model_fields_set:
+        if config.log_level == 'DEBUG':
             return logging.DEBUG
-        if configuration.log_level == 'INFO':
+        if config.log_level == 'INFO':
             return logging.INFO
-        if configuration.log_level == 'WARNING':
+        if config.log_level == 'WARNING':
             return logging.WARNING
-        if configuration.log_level == 'ERROR':
+        if config.log_level == 'ERROR':
             return logging.ERROR
-        if configuration.log_level == 'CRITICAL':
+        if config.log_level == 'CRITICAL':
             return logging.CRITICAL
 
-        assert_never(configuration.log_level)
+        assert_never(config.log_level)
 
     if sys.flags.dev_mode or verbose_logging_requested:
         return logging.DEBUG
 
     return logging.INFO
 
 
-def configure_logger(
-    logger: logging.Logger,
-    configuration: Configuration,
-    *,
-    remove_old_handlers: bool = False,
-) -> None:
+def configure_logger(logger: logging.Logger, *, remove_old_handlers: bool = False) -> None:
     handler = logging.StreamHandler()
     handler.setFormatter(CrawleeLogFormatter())
 
@@ -72,7 +68,7 @@ def configure_logger(
             logger.removeHandler(old_handler)
 
     logger.addHandler(handler)
-    logger.setLevel(get_configured_log_level(configuration))
+    logger.setLevel(get_configured_log_level())
 
 
 class CrawleeLogFormatter(logging.Formatter):
```
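
Both logging helpers drop their `Configuration` parameter and read the service container instead. A usage sketch (note that `crawlee._log_config` is an internal module, so this is for illustration only):

```python
import logging

from crawlee._log_config import configure_logger  # internal helper, shown for illustration

logger = logging.getLogger('my_crawler')

# The log level is now resolved via service_container.get_configuration(),
# so no Configuration argument is passed here anymore.
configure_logger(logger, remove_old_handlers=True)
```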

src/crawlee/basic_crawler/_basic_crawler.py

Lines changed: 12 additions & 21 deletions

```diff
@@ -50,7 +50,6 @@
 
     from crawlee._types import ConcurrencySettings, HttpMethod, JsonSerializable
     from crawlee.base_storage_client._models import DatasetItemsListPage
-    from crawlee.configuration import Configuration
     from crawlee.events._event_manager import EventManager
     from crawlee.http_clients import BaseHttpClient, HttpResponse
     from crawlee.proxy_configuration import ProxyConfiguration, ProxyInfo
@@ -95,9 +94,6 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
     """Maximum number of session rotations per request. The crawler rotates the session if a proxy error occurs
     or if the website blocks the request."""
 
-    configuration: NotRequired[Configuration]
-    """Crawler configuration."""
-
     request_handler_timeout: NotRequired[timedelta]
     """Maximum duration allowed for a single request handler to run."""
 
@@ -172,7 +168,6 @@ def __init__(
         max_request_retries: int = 3,
         max_requests_per_crawl: int | None = None,
         max_session_rotations: int = 10,
-        configuration: Configuration | None = None,
         request_handler_timeout: timedelta = timedelta(minutes=1),
         session_pool: SessionPool | None = None,
         use_session_pool: bool = True,
@@ -200,7 +195,6 @@ def __init__(
                 this value.
             max_session_rotations: Maximum number of session rotations per request. The crawler rotates the session
                 if a proxy error occurs or if the website blocks the request.
-            configuration: Crawler configuration.
             request_handler_timeout: Maximum duration allowed for a single request handler to run.
             use_session_pool: Enable the use of a session pool for managing sessions during crawling.
             session_pool: A custom `SessionPool` instance, allowing the use of non-default configuration.
@@ -235,12 +229,13 @@ def __init__(
         self._max_session_rotations = max_session_rotations
 
         self._request_provider = request_provider
-        self._configuration = configuration or service_container.get_configuration()
+
+        config = service_container.get_configuration()
 
         self._request_handler_timeout = request_handler_timeout
         self._internal_timeout = (
-            self._configuration.internal_timeout
-            if self._configuration.internal_timeout is not None
+            config.internal_timeout
+            if config.internal_timeout is not None
             else max(2 * request_handler_timeout, timedelta(minutes=5))
         )
 
@@ -249,10 +244,8 @@ def __init__(
         self._event_manager = event_manager or service_container.get_event_manager()
         self._snapshotter = Snapshotter(
             self._event_manager,
-            max_memory_size=ByteSize.from_mb(self._configuration.memory_mbytes)
-            if self._configuration.memory_mbytes
-            else None,
-            available_memory_ratio=self._configuration.available_memory_ratio,
+            max_memory_size=ByteSize.from_mb(config.memory_mbytes) if config.memory_mbytes else None,
+            available_memory_ratio=config.available_memory_ratio,
         )
         self._pool = AutoscaledPool(
             system_status=SystemStatus(self._snapshotter),
@@ -269,13 +262,11 @@ def __init__(
 
         if configure_logging:
             root_logger = logging.getLogger()
-            configure_logger(root_logger, self._configuration, remove_old_handlers=True)
+            configure_logger(root_logger, remove_old_handlers=True)
 
             # Silence HTTPX logger
             httpx_logger = logging.getLogger('httpx')
-            httpx_logger.setLevel(
-                logging.DEBUG if get_configured_log_level(self._configuration) <= logging.DEBUG else logging.WARNING
-            )
+            httpx_logger.setLevel(logging.DEBUG if get_configured_log_level() <= logging.DEBUG else logging.WARNING)
 
         if not _logger:
             _logger = logging.getLogger(__name__)
@@ -360,7 +351,7 @@ async def get_request_provider(
     ) -> RequestProvider:
        """Return the configured request provider. If none is configured, open and return the default request queue."""
         if not self._request_provider:
-            self._request_provider = await RequestQueue.open(id=id, name=name, configuration=self._configuration)
+            self._request_provider = await RequestQueue.open(id=id, name=name)
 
         return self._request_provider
 
@@ -371,7 +362,7 @@ async def get_dataset(
         name: str | None = None,
     ) -> Dataset:
         """Return the dataset with the given ID or name. If none is provided, return the default dataset."""
-        return await Dataset.open(id=id, name=name, configuration=self._configuration)
+        return await Dataset.open(id=id, name=name)
 
     async def get_key_value_store(
         self,
@@ -380,7 +371,7 @@ async def get_key_value_store(
         name: str | None = None,
     ) -> KeyValueStore:
         """Return the key-value store with the given ID or name. If none is provided, return the default KVS."""
-        return await KeyValueStore.open(id=id, name=name, configuration=self._configuration)
+        return await KeyValueStore.open(id=id, name=name)
 
     def error_handler(
         self, handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext]
@@ -425,7 +416,7 @@ async def run(
         request_provider = await self.get_request_provider()
         if purge_request_queue and isinstance(request_provider, RequestQueue):
             await request_provider.drop()
-            self._request_provider = await RequestQueue.open(configuration=self._configuration)
+            self._request_provider = await RequestQueue.open()
 
         if requests is not None:
             await self.add_requests(requests)
```
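
Since `BasicCrawler` no longer accepts `configuration=...`, a custom configuration has to be installed in the service container before the crawler is constructed. A hedged sketch (the `HttpCrawler` subclass and the `Configuration(log_level=...)` keyword are assumptions based on the crawlee API of this era, not shown in this diff):

```python
import asyncio

from crawlee import service_container
from crawlee.configuration import Configuration
from crawlee.http_crawler import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    # Previously: HttpCrawler(configuration=Configuration(log_level='DEBUG')).
    # Now the configuration is installed globally before the crawler is built.
    service_container.set_configuration(Configuration(log_level='DEBUG'))
    crawler = HttpCrawler()

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Visiting {context.request.url}')

    await crawler.run(['https://crawlee.dev'])


asyncio.run(main())
```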

src/crawlee/configuration.py

Lines changed: 0 additions & 17 deletions

```diff
@@ -229,20 +229,3 @@ class Configuration(BaseSettings):
         ),
     ] = False
     """This setting is currently unused. For more details, see https://github.com/apify/crawlee-python/issues/670."""
-
-    @classmethod
-    def get_global_configuration(cls) -> Self:
-        """Retrieve the global instance of the configuration."""
-        from crawlee import service_container
-
-        if service_container.get_configuration_if_set() is None:
-            service_container.set_configuration(cls())
-
-        global_instance = service_container.get_configuration()
-
-        if not isinstance(global_instance, cls):
-            raise TypeError(
-                f'Requested global configuration object of type {cls}, but {global_instance.__class__} was found'
-            )
-
-        return global_instance
```
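
With `Configuration.get_global_configuration()` removed, its call sites migrate to the container accessor directly, as the other files in this commit do:

```python
from crawlee import service_container

# Before (removed in this commit):
# config = Configuration.get_global_configuration()

# After:
config = service_container.get_configuration()
```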

src/crawlee/errors.py

Lines changed: 0 additions & 11 deletions

```diff
@@ -16,7 +16,6 @@
     'RequestHandlerError',
     'SessionError',
     'UserDefinedErrorHandlerError',
-    'ServiceConflictError',
 ]
 
 TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext)
@@ -89,13 +88,3 @@ def __init__(self, wrapped_exception: Exception, crawling_context: BasicCrawling
 @docs_group('Errors')
 class ContextPipelineInterruptedError(Exception):
     """May be thrown in the initialization phase of a middleware to signal that the request should not be processed."""
-
-
-@docs_group('Errors')
-class ServiceConflictError(RuntimeError):
-    """Thrown when a service container is getting reconfigured."""
-
-    def __init__(self, service_name: str, new_value: object, old_value: object) -> None:
-        super().__init__(
-            f"Service '{service_name}' was already set (existing value is '{old_value}', new value is '{new_value}')."
-        )
```
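
Removing `ServiceConflictError` is a breaking change for any downstream code that imported or caught it. A hypothetical compatibility shim for code that must span both versions (not part of this repository):

```python
try:
    from crawlee.errors import ServiceConflictError
except ImportError:
    # ServiceConflictError was deleted in this commit. Since the original
    # subclassed RuntimeError, fall back to that for old except clauses.
    ServiceConflictError = RuntimeError
```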

src/crawlee/memory_storage_client/_creation_management.py

Lines changed: 5 additions & 4 deletions

```diff
@@ -10,6 +10,7 @@
 from logging import getLogger
 from typing import TYPE_CHECKING
 
+from crawlee import service_container
 from crawlee._consts import METADATA_FILENAME
 from crawlee._utils.data_processing import maybe_parse_body
 from crawlee._utils.file import json_dumps
@@ -405,17 +406,17 @@ def _determine_storage_path(
     from crawlee.memory_storage_client._request_queue_client import RequestQueueClient
     from crawlee.storages._creation_management import _get_default_storage_id
 
-    configuration = memory_storage_client._configuration  # noqa: SLF001
+    config = service_container.get_configuration()
 
     if issubclass(resource_client_class, DatasetClient):
         storages_dir = memory_storage_client.datasets_directory
-        default_id = _get_default_storage_id(configuration, Dataset)
+        default_id = _get_default_storage_id(config, Dataset)
     elif issubclass(resource_client_class, KeyValueStoreClient):
         storages_dir = memory_storage_client.key_value_stores_directory
-        default_id = _get_default_storage_id(configuration, KeyValueStore)
+        default_id = _get_default_storage_id(config, KeyValueStore)
     elif issubclass(resource_client_class, RequestQueueClient):
         storages_dir = memory_storage_client.request_queues_directory
-        default_id = _get_default_storage_id(configuration, RequestQueue)
+        default_id = _get_default_storage_id(config, RequestQueue)
     else:
         raise TypeError('Invalid resource client class.')
 
```
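`_determine_storage_path` previously reached into the storage client's private `_configuration` attribute (hence the `# noqa: SLF001`); it now asks the service container, so the default storage IDs come from the global configuration. A small sketch of the fields involved (all referenced in this diff; `_determine_storage_path` itself stays private):

```python
from crawlee import service_container

# The default storage IDs used when resolving storage paths now come from
# the container's configuration rather than from the client instance:
config = service_container.get_configuration()
print(config.default_dataset_id, config.default_key_value_store_id, config.default_request_queue_id)
```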

src/crawlee/memory_storage_client/_memory_storage_client.py

Lines changed: 32 additions & 16 deletions

```diff
@@ -10,9 +10,9 @@
 
 from typing_extensions import override
 
+from crawlee import service_container
 from crawlee._utils.docs import docs_group
 from crawlee.base_storage_client import BaseStorageClient
-from crawlee.configuration import Configuration
 from crawlee.memory_storage_client._dataset_client import DatasetClient
 from crawlee.memory_storage_client._dataset_collection_client import DatasetCollectionClient
 from crawlee.memory_storage_client._key_value_store_client import KeyValueStoreClient
@@ -45,13 +45,34 @@ class MemoryStorageClient(BaseStorageClient):
     _TEMPORARY_DIR_NAME = '__CRAWLEE_TEMPORARY'
     """Name of the directory used to temporarily store files during purges."""
 
-    def __init__(self, configuration: Configuration | None = None) -> None:
+    def __init__(
+        self,
+        write_metadata: bool | None = None,
+        persist_storage: bool | None = None,
+        storage_dir: str | None = None,
+        default_request_queue_id: str | None = None,
+        default_key_value_store_id: str | None = None,
+        default_dataset_id: str | None = None,
+    ) -> None:
         """A default constructor.
 
         Args:
-            configuration: Configuration object to use. If None, a default instance will be created.
+            write_metadata: Whether to write metadata to the storage.
+            persist_storage: Whether to persist the storage.
+            storage_dir: Path to the storage directory.
+            default_request_queue_id: The default request queue ID.
+            default_key_value_store_id: The default key-value store ID.
+            default_dataset_id: The default dataset ID.
         """
-        self._explicit_configuration = configuration
+        config = service_container.get_configuration()
+
+        # Set the internal attributes.
+        self._write_metadata = write_metadata or config.write_metadata
+        self._persist_storage = persist_storage or config.persist_storage
+        self._storage_dir = storage_dir or config.storage_dir
+        self._default_request_queue_id = default_request_queue_id or config.default_request_queue_id
+        self._default_key_value_store_id = default_key_value_store_id or config.default_key_value_store_id
+        self._default_dataset_id = default_dataset_id or config.default_dataset_id
 
         self.datasets_handled: list[DatasetClient] = []
         self.key_value_stores_handled: list[KeyValueStoreClient] = []
@@ -60,24 +81,20 @@ def __init__(self, configuration: Configuration | None = None) -> None:
         self._purged_on_start = False  # Indicates whether a purge was already performed on this instance.
         self._purge_lock = asyncio.Lock()
 
-    @property
-    def _configuration(self) -> Configuration:
-        return self._explicit_configuration or Configuration.get_global_configuration()
-
     @property
     def write_metadata(self) -> bool:
         """Whether to write metadata to the storage."""
-        return self._configuration.write_metadata
+        return self._write_metadata
 
     @property
     def persist_storage(self) -> bool:
         """Whether to persist the storage."""
-        return self._configuration.persist_storage
+        return self._persist_storage
 
     @property
     def storage_dir(self) -> str:
         """Path to the storage directory."""
-        return self._configuration.storage_dir
+        return self._storage_dir
 
     @property
     def datasets_directory(self) -> str:
@@ -197,14 +214,14 @@ async def _purge_default_storages(self) -> None:
                     self._TEMPORARY_DIR_NAME
                 ) or key_value_store_folder.name.startswith('__OLD'):
                     await self._batch_remove_files(key_value_store_folder.path)
-                elif key_value_store_folder.name == self._configuration.default_key_value_store_id:
+                elif key_value_store_folder.name == self._default_key_value_store_id:
                     await self._handle_default_key_value_store(key_value_store_folder.path)
 
         # Datasets
         if await asyncio.to_thread(os.path.exists, self.datasets_directory):
             dataset_folders = await asyncio.to_thread(os.scandir, self.datasets_directory)
             for dataset_folder in dataset_folders:
-                if dataset_folder.name == self._configuration.default_dataset_id or dataset_folder.name.startswith(
+                if dataset_folder.name == self._default_dataset_id or dataset_folder.name.startswith(
                     self._TEMPORARY_DIR_NAME
                 ):
                     await self._batch_remove_files(dataset_folder.path)
@@ -213,9 +230,8 @@ async def _purge_default_storages(self) -> None:
         if await asyncio.to_thread(os.path.exists, self.request_queues_directory):
             request_queue_folders = await asyncio.to_thread(os.scandir, self.request_queues_directory)
             for request_queue_folder in request_queue_folders:
-                if (
-                    request_queue_folder.name == self._configuration.default_request_queue_id
-                    or request_queue_folder.name.startswith(self._TEMPORARY_DIR_NAME)
+                if request_queue_folder.name == self._default_request_queue_id or request_queue_folder.name.startswith(
+                    self._TEMPORARY_DIR_NAME
                 ):
                     await self._batch_remove_files(request_queue_folder.path)
```
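
`MemoryStorageClient` now takes individual overrides that fall back to the container's configuration. A usage sketch (assuming the class is re-exported from the `crawlee.memory_storage_client` package). One caveat visible in the diff: the overrides are combined with `or`, so a falsy override such as `persist_storage=False` falls back to the configured value rather than being honored literally.

```python
from crawlee.memory_storage_client import MemoryStorageClient

# Omitted arguments fall back to service_container.get_configuration().
client = MemoryStorageClient(storage_dir='./my_storage', write_metadata=True)
assert client.storage_dir == './my_storage'

# Caveat: persist_storage=False would be replaced by the configured value,
# because the constructor combines overrides with `or`.
```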
