Commit 1cc9401

fix: Set default desired concurrency for non-browser crawlers to 10 (#1419)
- Set default desired concurrency for non-browser crawlers to 10.
- The default desired concurrency for browser crawlers stays at 1.
- The default maximum concurrency is set to 100.
- Based on the Slack discussion https://apify.slack.com/archives/CD0SF6KD4/p1756993901117969.
1 parent 5ada081 commit 1cc9401
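
In practice, the new defaults mean a freshly constructed ConcurrencySettings starts at ten parallel tasks and caps out at a hundred. A minimal sketch of the resulting defaults, assuming the class is imported from the package root as the tests below do:

from crawlee import ConcurrencySettings

settings = ConcurrencySettings()
assert settings.min_concurrency == 1
assert settings.desired_concurrency == 10  # previously fell back to min_concurrency
assert settings.max_concurrency == 100     # previously 200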

File tree

9 files changed, +39 -18 lines changed

src/crawlee/_types.py

Lines changed: 9 additions & 6 deletions
@@ -110,9 +110,9 @@ class ConcurrencySettings:
     def __init__(
         self,
         min_concurrency: int = 1,
-        max_concurrency: int = 200,
+        max_concurrency: int = 100,
         max_tasks_per_minute: float = float('inf'),
-        desired_concurrency: int | None = None,
+        desired_concurrency: int = 10,
     ) -> None:
         """Initialize a new instance.
 
@@ -125,21 +125,24 @@ def __init__(
             desired_concurrency: The desired number of tasks that should be running parallel on the start of the pool,
                 if there is a large enough supply of them. By default, it is `min_concurrency`.
         """
-        if desired_concurrency is not None and desired_concurrency < 1:
-            raise ValueError('desired_concurrency must be 1 or larger')
-
         if min_concurrency < 1:
             raise ValueError('min_concurrency must be 1 or larger')
 
         if max_concurrency < min_concurrency:
             raise ValueError('max_concurrency cannot be less than min_concurrency')
 
+        if desired_concurrency < min_concurrency:
+            raise ValueError('desired_concurrency cannot be less than min_concurrency')
+
+        if desired_concurrency > max_concurrency:
+            raise ValueError('desired_concurrency cannot be greater than max_concurrency')
+
         if max_tasks_per_minute <= 0:
            raise ValueError('max_tasks_per_minute must be positive')
 
         self.min_concurrency = min_concurrency
         self.max_concurrency = max_concurrency
-        self.desired_concurrency = desired_concurrency if desired_concurrency is not None else min_concurrency
+        self.desired_concurrency = desired_concurrency
         self.max_tasks_per_minute = max_tasks_per_minute
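
With desired_concurrency now a plain int, the constructor also validates that it lies between min_concurrency and max_concurrency. A hedged sketch of the behavior the new checks produce:

from crawlee import ConcurrencySettings

ConcurrencySettings(min_concurrency=2, desired_concurrency=5, max_concurrency=10)  # valid

try:
    ConcurrencySettings(desired_concurrency=50, max_concurrency=5)
except ValueError as exc:
    print(exc)  # desired_concurrency cannot be greater than max_concurrency

try:
    ConcurrencySettings(min_concurrency=4, desired_concurrency=2)
except ValueError as exc:
    print(exc)  # desired_concurrency cannot be less than min_concurrency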

src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py

Lines changed: 5 additions & 1 deletion
@@ -12,7 +12,7 @@
 from parsel import Selector
 from typing_extensions import Self, TypeVar, override
 
-from crawlee._types import BasicCrawlingContext, JsonSerializable, RequestHandlerRunResult
+from crawlee._types import BasicCrawlingContext, ConcurrencySettings, JsonSerializable, RequestHandlerRunResult
 from crawlee._utils.docs import docs_group
 from crawlee._utils.wait import wait_for
 from crawlee.crawlers import (
@@ -158,6 +158,10 @@ def __init__(
         self.result_checker = result_checker or (lambda _: True)
         self.result_comparator = result_comparator or create_default_comparator(result_checker)
 
+        # Set default concurrency settings for browser crawlers if not provided
+        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
         super().__init__(statistics=statistics, **kwargs)
 
         # Sub crawlers related.
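
Note that the guard treats an explicitly passed concurrency_settings=None the same as an omitted argument, so both fall back to the browser default. A standalone sketch of just that branch (the kwargs dict here is a hypothetical stand-in for the constructor's **kwargs):

from crawlee._types import ConcurrencySettings

kwargs: dict = {'concurrency_settings': None}
if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
    kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
assert kwargs['concurrency_settings'].desired_concurrency == 1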

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 5 additions & 0 deletions
@@ -12,6 +12,7 @@
 
 from crawlee import service_locator
 from crawlee._request import Request, RequestOptions
+from crawlee._types import ConcurrencySettings
 from crawlee._utils.blocked import RETRY_CSS_SELECTORS
 from crawlee._utils.docs import docs_group
 from crawlee._utils.robots import RobotsTxtFile
@@ -194,6 +195,10 @@ def __init__(
 
         kwargs['http_client'] = PlaywrightHttpClient() if not kwargs.get('http_client') else kwargs['http_client']
 
+        # Set default concurrency settings for browser crawlers if not provided
+        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
         super().__init__(**kwargs)
 
     async def _open_page(
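
Callers who want more browser parallelism can still pass their own settings; the default only applies when nothing is provided. A hedged usage sketch, assuming concurrency_settings is forwarded to BasicCrawler as usual:

from crawlee import ConcurrencySettings
from crawlee.crawlers import PlaywrightCrawler

crawler = PlaywrightCrawler(
    # Explicit settings win over the browser default of desired_concurrency=1.
    concurrency_settings=ConcurrencySettings(desired_concurrency=4, max_concurrency=8),
)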

tests/unit/_autoscaling/test_autoscaled_pool.py

Lines changed: 2 additions & 0 deletions
@@ -135,6 +135,7 @@ async def run() -> None:
         is_finished_function=lambda: future(started_count > 0),
         concurrency_settings=ConcurrencySettings(
             min_concurrency=1,
+            desired_concurrency=1,
             max_concurrency=1,
         ),
     )
@@ -320,6 +321,7 @@ async def run() -> None:
         is_finished_function=lambda: future(done_count >= 4),
         concurrency_settings=ConcurrencySettings(
             min_concurrency=4,
+            desired_concurrency=4,
             max_concurrency=4,
         ),
     )
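
The test updates follow directly from the new validation: with desired_concurrency defaulting to 10, any settings object that caps max_concurrency below 10 would now raise at construction time, so the tests pin desired_concurrency alongside it. A minimal illustration:

from crawlee import ConcurrencySettings

# Would raise after this commit: default desired_concurrency (10) > max_concurrency (1).
# ConcurrencySettings(max_concurrency=1)

# The updated tests pin both values instead:
ConcurrencySettings(desired_concurrency=1, max_concurrency=1)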

tests/unit/crawlers/_basic/test_basic_crawler.py

Lines changed: 10 additions & 7 deletions
@@ -799,7 +799,7 @@ async def test_max_requests_per_crawl() -> None:
 
     # Set max_concurrency to 1 to ensure testing max_requests_per_crawl accurately
     crawler = BasicCrawler(
-        concurrency_settings=ConcurrencySettings(max_concurrency=1),
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
         max_requests_per_crawl=3,
     )
 
@@ -820,7 +820,7 @@ async def test_max_crawl_depth() -> None:
 
     # Set max_concurrency to 1 to ensure testing max_requests_per_crawl accurately
     crawler = BasicCrawler(
-        concurrency_settings=ConcurrencySettings(max_concurrency=1),
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
         max_crawl_depth=2,
     )
 
@@ -859,7 +859,10 @@ async def test_abort_on_error(
 ) -> None:
     starts_urls = []
 
-    crawler = BasicCrawler(concurrency_settings=ConcurrencySettings(max_concurrency=1), abort_on_error=True)
+    crawler = BasicCrawler(
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
+        abort_on_error=True,
+    )
 
     @crawler.router.default_handler
     async def handler(context: BasicCrawlingContext) -> None:
@@ -991,7 +994,7 @@ async def test_crawler_manual_stop() -> None:
     processed_urls = []
 
     # Set max_concurrency to 1 to ensure testing urls are visited one by one in order.
-    crawler = BasicCrawler(concurrency_settings=ConcurrencySettings(max_concurrency=1))
+    crawler = BasicCrawler(concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1))
 
     @crawler.router.default_handler
     async def handler(context: BasicCrawlingContext) -> None:
@@ -1018,8 +1021,8 @@ async def test_crawler_multiple_stops_in_parallel() -> None:
     ]
     processed_urls = []
 
-    # Set max_concurrency to 2 to ensure two urls are being visited in parallel.
-    crawler = BasicCrawler(concurrency_settings=ConcurrencySettings(max_concurrency=2))
+    # Set concurrency to 2 to ensure two urls are being visited in parallel.
+    crawler = BasicCrawler(concurrency_settings=ConcurrencySettings(desired_concurrency=2, max_concurrency=2))
 
     both_handlers_started = asyncio.Barrier(2)  # type:ignore[attr-defined]  # Test is skipped in older Python versions.
     only_one_handler_at_a_time = asyncio.Semaphore(1)
@@ -1298,7 +1301,7 @@ async def test_keep_alive(
         keep_alive=keep_alive,
         max_requests_per_crawl=max_requests_per_crawl,
         # If more request can run in parallel, then max_requests_per_crawl is not deterministic.
-        concurrency_settings=ConcurrencySettings(max_concurrency=1),
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
     )
     mocked_handler = Mock()
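
In the parallel-stop test, pinning desired_concurrency=2 also guarantees the pool aims for two concurrent tasks from the start rather than ramping up from min_concurrency, which is what the reworded comment reflects. A tiny sketch of the pinned settings:

from crawlee import ConcurrencySettings

settings = ConcurrencySettings(desired_concurrency=2, max_concurrency=2)
assert settings.desired_concurrency == settings.max_concurrency == 2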

tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py

Lines changed: 3 additions & 1 deletion
@@ -82,7 +82,9 @@ async def test_enqueue_links_with_max_crawl(server_url: URL, http_client: HttpCl
 
     # Set max_concurrency to 1 to ensure testing max_requests_per_crawl accurately
     crawler = BeautifulSoupCrawler(
-        concurrency_settings=ConcurrencySettings(max_concurrency=1), max_requests_per_crawl=3, http_client=http_client
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
+        max_requests_per_crawl=3,
+        http_client=http_client,
     )
 
     @crawler.router.default_handler

tests/unit/crawlers/_http/test_http_crawler.py

Lines changed: 1 addition & 1 deletion
@@ -381,7 +381,7 @@ async def test_isolation_cookies(http_client: HttpClient, server_url: URL) -> No
         ),
         http_client=http_client,
         max_request_retries=10,
-        concurrency_settings=ConcurrencySettings(max_concurrency=1),
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
     )
 
     @crawler.router.default_handler

tests/unit/crawlers/_parsel/test_parsel_crawler.py

Lines changed: 3 additions & 1 deletion
@@ -103,7 +103,9 @@ async def test_enqueue_links_with_max_crawl(server_url: URL, http_client: HttpCl
 
     # Set max_concurrency to 1 to ensure testing max_requests_per_crawl accurately
     crawler = ParselCrawler(
-        concurrency_settings=ConcurrencySettings(max_concurrency=1), max_requests_per_crawl=3, http_client=http_client
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
+        max_requests_per_crawl=3,
+        http_client=http_client,
     )
 
     @crawler.router.default_handler

tests/unit/crawlers/_playwright/test_playwright_crawler.py

Lines changed: 1 addition & 1 deletion
@@ -335,7 +335,7 @@ async def test_isolation_cookies(*, use_incognito_pages: bool, server_url: URL)
     crawler = PlaywrightCrawler(
         session_pool=SessionPool(max_pool_size=1),
         use_incognito_pages=use_incognito_pages,
-        concurrency_settings=ConcurrencySettings(max_concurrency=1),
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
     )
 
     @crawler.router.default_handler
