Commit 1cc9401

fix: Set default desired concurrency for non-browser crawlers to 10 (#1419)
- Set default desired concurrency for non-browser crawlers to 10.
- The default desired concurrency for browser crawlers stays at 1.
- The default maximum concurrency is set to 100.
- Based on the Slack discussion https://apify.slack.com/archives/CD0SF6KD4/p1756993901117969.
1 parent 5ada081 commit 1cc9401
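
In practice, the new defaults mean a freshly constructed ConcurrencySettings starts at ten parallel tasks and caps out at a hundred. A minimal sketch of the resulting defaults, assuming the class is imported from the package root as the tests below do:

from crawlee import ConcurrencySettings

settings = ConcurrencySettings()
assert settings.min_concurrency == 1
assert settings.desired_concurrency == 10  # previously fell back to min_concurrency
assert settings.max_concurrency == 100     # previously 200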

File tree

9 files changed, +39 -18 lines changed

src/crawlee/_types.py

Lines changed: 9 additions & 6 deletions
@@ -110,9 +110,9 @@ class ConcurrencySettings:
     def __init__(
         self,
         min_concurrency: int = 1,
-        max_concurrency: int = 200,
+        max_concurrency: int = 100,
         max_tasks_per_minute: float = float('inf'),
-        desired_concurrency: int | None = None,
+        desired_concurrency: int = 10,
     ) -> None:
         """Initialize a new instance.
 
@@ -125,21 +125,24 @@ def __init__(
             desired_concurrency: The desired number of tasks that should be running parallel on the start of the pool,
                 if there is a large enough supply of them. By default, it is `min_concurrency`.
         """
-        if desired_concurrency is not None and desired_concurrency < 1:
-            raise ValueError('desired_concurrency must be 1 or larger')
-
         if min_concurrency < 1:
             raise ValueError('min_concurrency must be 1 or larger')
 
         if max_concurrency < min_concurrency:
             raise ValueError('max_concurrency cannot be less than min_concurrency')
 
+        if desired_concurrency < min_concurrency:
+            raise ValueError('desired_concurrency cannot be less than min_concurrency')
+
+        if desired_concurrency > max_concurrency:
+            raise ValueError('desired_concurrency cannot be greater than max_concurrency')
+
         if max_tasks_per_minute <= 0:
            raise ValueError('max_tasks_per_minute must be positive')
 
         self.min_concurrency = min_concurrency
         self.max_concurrency = max_concurrency
-        self.desired_concurrency = desired_concurrency if desired_concurrency is not None else min_concurrency
+        self.desired_concurrency = desired_concurrency
         self.max_tasks_per_minute = max_tasks_per_minute
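
With desired_concurrency now a plain int, the constructor also validates that it lies between min_concurrency and max_concurrency. A hedged sketch of the behavior the new checks produce:

from crawlee import ConcurrencySettings

ConcurrencySettings(min_concurrency=2, desired_concurrency=5, max_concurrency=10)  # valid

try:
    ConcurrencySettings(desired_concurrency=50, max_concurrency=5)
except ValueError as exc:
    print(exc)  # desired_concurrency cannot be greater than max_concurrency

try:
    ConcurrencySettings(min_concurrency=4, desired_concurrency=2)
except ValueError as exc:
    print(exc)  # desired_concurrency cannot be less than min_concurrency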

src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py

Lines changed: 5 additions & 1 deletion
@@ -12,7 +12,7 @@
 from parsel import Selector
 from typing_extensions import Self, TypeVar, override
 
-from crawlee._types import BasicCrawlingContext, JsonSerializable, RequestHandlerRunResult
+from crawlee._types import BasicCrawlingContext, ConcurrencySettings, JsonSerializable, RequestHandlerRunResult
 from crawlee._utils.docs import docs_group
 from crawlee._utils.wait import wait_for
 from crawlee.crawlers import (
@@ -158,6 +158,10 @@ def __init__(
         self.result_checker = result_checker or (lambda _: True)
         self.result_comparator = result_comparator or create_default_comparator(result_checker)
 
+        # Set default concurrency settings for browser crawlers if not provided
+        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
         super().__init__(statistics=statistics, **kwargs)
 
         # Sub crawlers related.
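
Note that the guard treats an explicitly passed concurrency_settings=None the same as an omitted argument, so both fall back to the browser default. A standalone sketch of just that branch (the kwargs dict here is a hypothetical stand-in for the constructor's **kwargs):

from crawlee._types import ConcurrencySettings

kwargs: dict = {'concurrency_settings': None}
if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
    kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
assert kwargs['concurrency_settings'].desired_concurrency == 1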

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 5 additions & 0 deletions
@@ -12,6 +12,7 @@
 
 from crawlee import service_locator
 from crawlee._request import Request, RequestOptions
+from crawlee._types import ConcurrencySettings
 from crawlee._utils.blocked import RETRY_CSS_SELECTORS
 from crawlee._utils.docs import docs_group
 from crawlee._utils.robots import RobotsTxtFile
@@ -194,6 +195,10 @@ def __init__(
 
         kwargs['http_client'] = PlaywrightHttpClient() if not kwargs.get('http_client') else kwargs['http_client']
 
+        # Set default concurrency settings for browser crawlers if not provided
+        if 'concurrency_settings' not in kwargs or kwargs['concurrency_settings'] is None:
+            kwargs['concurrency_settings'] = ConcurrencySettings(desired_concurrency=1)
+
         super().__init__(**kwargs)
 
     async def _open_page(
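
Callers who want more browser parallelism can still pass their own settings; the default only applies when nothing is provided. A hedged usage sketch, assuming concurrency_settings is forwarded to BasicCrawler as usual:

from crawlee import ConcurrencySettings
from crawlee.crawlers import PlaywrightCrawler

crawler = PlaywrightCrawler(
    # Explicit settings win over the browser default of desired_concurrency=1.
    concurrency_settings=ConcurrencySettings(desired_concurrency=4, max_concurrency=8),
)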

tests/unit/_autoscaling/test_autoscaled_pool.py

Lines changed: 2 additions & 0 deletions
@@ -135,6 +135,7 @@ async def run() -> None:
         is_finished_function=lambda: future(started_count > 0),
         concurrency_settings=ConcurrencySettings(
             min_concurrency=1,
+            desired_concurrency=1,
             max_concurrency=1,
         ),
     )
@@ -320,6 +321,7 @@ async def run() -> None:
         is_finished_function=lambda: future(done_count >= 4),
         concurrency_settings=ConcurrencySettings(
             min_concurrency=4,
+            desired_concurrency=4,
             max_concurrency=4,
         ),
     )
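
The test updates follow directly from the new validation: with desired_concurrency defaulting to 10, any settings object that caps max_concurrency below 10 would now raise at construction time, so the tests pin desired_concurrency alongside it. A minimal illustration:

from crawlee import ConcurrencySettings

# Would raise after this commit: default desired_concurrency (10) > max_concurrency (1).
# ConcurrencySettings(max_concurrency=1)

# The updated tests pin both values instead:
ConcurrencySettings(desired_concurrency=1, max_concurrency=1)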

tests/unit/crawlers/_basic/test_basic_crawler.py

Lines changed: 10 additions & 7 deletions
@@ -799,7 +799,7 @@ async def test_max_requests_per_crawl() -> None:
 
     # Set max_concurrency to 1 to ensure testing max_requests_per_crawl accurately
     crawler = BasicCrawler(
-        concurrency_settings=ConcurrencySettings(max_concurrency=1),
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
         max_requests_per_crawl=3,
     )
 
@@ -820,7 +820,7 @@ async def test_max_crawl_depth() -> None:
 
     # Set max_concurrency to 1 to ensure testing max_requests_per_crawl accurately
     crawler = BasicCrawler(
-        concurrency_settings=ConcurrencySettings(max_concurrency=1),
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
         max_crawl_depth=2,
     )
 
@@ -859,7 +859,10 @@ async def test_abort_on_error(
 ) -> None:
     starts_urls = []
 
-    crawler = BasicCrawler(concurrency_settings=ConcurrencySettings(max_concurrency=1), abort_on_error=True)
+    crawler = BasicCrawler(
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
+        abort_on_error=True,
+    )
 
     @crawler.router.default_handler
     async def handler(context: BasicCrawlingContext) -> None:
@@ -991,7 +994,7 @@ async def test_crawler_manual_stop() -> None:
     processed_urls = []
 
     # Set max_concurrency to 1 to ensure testing urls are visited one by one in order.
-    crawler = BasicCrawler(concurrency_settings=ConcurrencySettings(max_concurrency=1))
+    crawler = BasicCrawler(concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1))
 
     @crawler.router.default_handler
     async def handler(context: BasicCrawlingContext) -> None:
@@ -1018,8 +1021,8 @@ async def test_crawler_multiple_stops_in_parallel() -> None:
     ]
     processed_urls = []
 
-    # Set max_concurrency to 2 to ensure two urls are being visited in parallel.
-    crawler = BasicCrawler(concurrency_settings=ConcurrencySettings(max_concurrency=2))
+    # Set concurrency to 2 to ensure two urls are being visited in parallel.
+    crawler = BasicCrawler(concurrency_settings=ConcurrencySettings(desired_concurrency=2, max_concurrency=2))
 
     both_handlers_started = asyncio.Barrier(2)  # type:ignore[attr-defined]  # Test is skipped in older Python versions.
     only_one_handler_at_a_time = asyncio.Semaphore(1)
@@ -1298,7 +1301,7 @@ async def test_keep_alive(
         keep_alive=keep_alive,
         max_requests_per_crawl=max_requests_per_crawl,
         # If more request can run in parallel, then max_requests_per_crawl is not deterministic.
-        concurrency_settings=ConcurrencySettings(max_concurrency=1),
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
     )
     mocked_handler = Mock()
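
In the parallel-stop test, pinning desired_concurrency=2 also guarantees the pool aims for two concurrent tasks from the start rather than ramping up from min_concurrency, which is what the reworded comment reflects. A tiny sketch of the pinned settings:

from crawlee import ConcurrencySettings

settings = ConcurrencySettings(desired_concurrency=2, max_concurrency=2)
assert settings.desired_concurrency == settings.max_concurrency == 2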

tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py

Lines changed: 3 additions & 1 deletion
@@ -82,7 +82,9 @@ async def test_enqueue_links_with_max_crawl(server_url: URL, http_client: HttpCl
 
     # Set max_concurrency to 1 to ensure testing max_requests_per_crawl accurately
     crawler = BeautifulSoupCrawler(
-        concurrency_settings=ConcurrencySettings(max_concurrency=1), max_requests_per_crawl=3, http_client=http_client
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
+        max_requests_per_crawl=3,
+        http_client=http_client,
     )
 
     @crawler.router.default_handler

tests/unit/crawlers/_http/test_http_crawler.py

Lines changed: 1 addition & 1 deletion
@@ -381,7 +381,7 @@ async def test_isolation_cookies(http_client: HttpClient, server_url: URL) -> No
         ),
         http_client=http_client,
         max_request_retries=10,
-        concurrency_settings=ConcurrencySettings(max_concurrency=1),
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
     )
 
     @crawler.router.default_handler

tests/unit/crawlers/_parsel/test_parsel_crawler.py

Lines changed: 3 additions & 1 deletion
@@ -103,7 +103,9 @@ async def test_enqueue_links_with_max_crawl(server_url: URL, http_client: HttpCl
 
     # Set max_concurrency to 1 to ensure testing max_requests_per_crawl accurately
     crawler = ParselCrawler(
-        concurrency_settings=ConcurrencySettings(max_concurrency=1), max_requests_per_crawl=3, http_client=http_client
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
+        max_requests_per_crawl=3,
+        http_client=http_client,
     )
 
     @crawler.router.default_handler

tests/unit/crawlers/_playwright/test_playwright_crawler.py

Lines changed: 1 addition & 1 deletion
@@ -335,7 +335,7 @@ async def test_isolation_cookies(*, use_incognito_pages: bool, server_url: URL)
     crawler = PlaywrightCrawler(
         session_pool=SessionPool(max_pool_size=1),
         use_incognito_pages=use_incognito_pages,
-        concurrency_settings=ConcurrencySettings(max_concurrency=1),
+        concurrency_settings=ConcurrencySettings(desired_concurrency=1, max_concurrency=1),
     )
 
     @crawler.router.default_handler
