Commit 8ca6f0d
chore: Add e2e tests for playwright templates and pip based templates (#1136)
### Description

- Add e2e tests for the Playwright templates.
- Add e2e tests for the pip-based templates.
- Fix a template issue with the pip template that has additional dependencies.
- Run each e2e test combination in CI in its own runner to allow selective re-triggers (the e2e tests run against the real network, so some flakiness is unavoidable).
- Reduce `max_requests_per_crawl` to 10 in all templates, to align with the doc examples and to make the tests faster.

### Issues

- Closes: #1109
1 parent 196555b commit 8ca6f0d

File tree

11 files changed: +136 -38 lines changed

.github/workflows/templates_e2e_tests.yaml

Lines changed: 10 additions & 6 deletions
@@ -2,16 +2,19 @@ name: Templates end-to-end tests
 
 on:
   workflow_dispatch:
-    secrets:
-      APIFY_TEST_USER_API_TOKEN:
-        description: API token of the Python testing user on Apify
-        required: true
+  schedule:
+    - cron: '0 6 * * *'
 
 jobs:
   end_to_end_tests:
     name: End-to-end tests
     strategy:
       fail-fast: false
+      max-parallel: 12
+      matrix:
+        crawler-type: ["playwright_camoufox", "playwright", "parsel", "beautifulsoup"]
+        http-client: [ "httpx", "curl_impersonate"]
+        package-manager: ["pip", "uv", "poetry"]
 
     runs-on: "ubuntu-latest"
     env:

@@ -44,10 +47,11 @@ jobs:
         with:
           python-version: ${{ env.python-version }}
 
+      # Sync the project, but no need to install the browsers into the test runner environment.
       - name: Install Python dependencies
-        run: make install-dev
+        run: make install-sync
 
       - name: Run templates end-to-end tests
-        run: make e2e-templates-tests
+        run: make e2e-templates-tests args="-m ${{ matrix.http-client }} and ${{ matrix.crawler-type }} and ${{ matrix.package-manager }}"
         env:
           APIFY_TEST_USER_API_TOKEN: ${{ secrets.APIFY_TEST_USER_API_TOKEN }}
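
Note: the new matrix expands to 4 crawler types × 2 HTTP clients × 3 package managers = 24 jobs, capped at 12 running in parallel. Each job selects its single matrix cell by pytest marker expression; for the `httpx`/`parsel`/`pip` cell, for instance, the run step renders to `make e2e-templates-tests args="-m httpx and parsel and pip"`, which is what makes selective re-triggers of a single flaky cell possible.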

Makefile

Lines changed: 8 additions & 5 deletions
@@ -1,14 +1,17 @@
-.PHONY: clean install-dev build publish-to-pypi lint type-check unit-tests unit-tests-cov \
-	integration-tests format check-code build-api-reference run-docs
+.PHONY: clean install-sync install-dev build publish-to-pypi lint type-check unit-tests unit-tests-cov \
+	e2e-templates-tests format check-code build-api-reference run-docs
 
 # This is default for local testing, but GitHub workflows override it to a higher value in CI
 E2E_TESTS_CONCURRENCY = 1
 
 clean:
 	rm -rf .mypy_cache .pytest_cache .ruff_cache build dist htmlcov .coverage
 
-install-dev:
+install-sync:
 	uv sync --all-extras
+
+install-dev:
+	make install-sync
 	uv run pre-commit install
 	uv run playwright install

@@ -32,8 +35,8 @@ unit-tests:
 unit-tests-cov:
 	uv run pytest --numprocesses=auto --verbose --cov=src/crawlee --cov-report=html tests/unit
 
-e2e-templates-tests:
-	uv run pytest --numprocesses=$(E2E_TESTS_CONCURRENCY) --verbose tests/e2e/project_template
+e2e-templates-tests $(args):
+	uv run pytest --numprocesses=$(E2E_TESTS_CONCURRENCY) --verbose tests/e2e/project_template "$(args)"
 
 format:
 	uv run ruff check --fix
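
Note: the `e2e-templates-tests` target now accepts an `args` variable that is forwarded to pytest; this is how CI narrows the run to one matrix cell. Locally, something like `make e2e-templates-tests args="-m beautifulsoup and httpx and uv"` runs a single combination, assuming apify-cli is installed and `APIFY_TEST_USER_API_TOKEN` is set, as the test file's comments require.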

src/crawlee/project_template/templates/main_beautifulsoup.py

Lines changed: 1 addition & 1 deletion
@@ -7,6 +7,6 @@
 # % block instantiation
 crawler = BeautifulSoupCrawler(
     request_handler=router,
-    max_requests_per_crawl=50,
+    max_requests_per_crawl=10,
     {{ self.http_client_instantiation() }})
 # % endblock

src/crawlee/project_template/templates/main_parsel.py

Lines changed: 1 addition & 1 deletion
@@ -7,6 +7,6 @@
 # % block instantiation
 crawler = ParselCrawler(
     request_handler=router,
-    max_requests_per_crawl=50,
+    max_requests_per_crawl=10,
     {{ self.http_client_instantiation() }})
 # % endblock

src/crawlee/project_template/templates/main_playwright.py

Lines changed: 1 addition & 1 deletion
@@ -8,6 +8,6 @@
 crawler = PlaywrightCrawler(
     request_handler=router,
     headless=True,
-    max_requests_per_crawl=50,
+    max_requests_per_crawl=10,
     {{ self.http_client_instantiation() }})
 # % endblock

src/crawlee/project_template/{{cookiecutter.project_name}}/Dockerfile

Lines changed: 2 additions & 2 deletions
@@ -72,8 +72,8 @@ RUN echo "Python version:" \
     && cat requirements.txt | \
     # Replace playwright version so that it matches whatever is pre-installed in the image (the `hash` checks if playwright is installed)
     sed "s/^playwright==\(.*\)/playwright==$(hash playwright 2>/dev/null && (playwright --version | cut -d ' ' -f 2) || echo '\1')/" | \
-    # Install everything using pip (ignore dependency checks - the lockfile is correct, period)
-    pip install -r /dev/stdin --no-dependencies \
+    # Install everything using pip
+    pip install -r /dev/stdin \
     && echo "All installed Python packages:" \
     && pip freeze
 # % elif cookiecutter.package_manager == 'manual'
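
Note: the `sed` line pins whatever `playwright` version the requirements file names to the version pre-installed in the image. For a hypothetical `playwright==1.40.0` line in an image shipping Playwright 1.47.0, it rewrites the pin to `playwright==1.47.0`; if `playwright` is not on the `PATH`, `hash` fails and the `echo '\1'` fallback keeps the original pin. Dropping `--no-dependencies` lets pip resolve transitive dependencies of the listed packages, which appears to be part of the fix for the pip template with additional dependencies mentioned in the description.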

src/crawlee/project_template/{{cookiecutter.project_name}}/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ requires-python = ">=3.9,<4.0"
 dependencies = [
     "crawlee[{{ extras|join(',') }}]",
     # % if cookiecutter.crawler_type == 'playwright-camoufox'
-    "camoufox[geoip]>=0.4.5",
+    "camoufox[geoip]~=0.4.5",
     # % endif
     # % if cookiecutter.enable_apify_integration
     "apify",

src/crawlee/project_template/{{cookiecutter.project_name}}/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 # % if cookiecutter.crawler_type == 'playwright-camoufox'
-camoufox
+camoufox[geoip]~=0.4.5
 # % set extras = ['playwright']
 # % else
 # % set extras = [cookiecutter.crawler_type]
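
Note: the requirements file previously listed a bare `camoufox`, without the `geoip` extra or any version bound, so pip-based projects diverged from the pyproject-based ones. This change, presumably the "template issue with pip template that has additional dependencies" fix called out in the description, aligns `requirements.txt` with the constraint in `pyproject.toml` above.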

tests/e2e/conftest.py

Lines changed: 16 additions & 0 deletions
@@ -2,11 +2,27 @@
 from pathlib import Path
 
 import pytest
+from _pytest.config import Config
 from filelock import FileLock
 
 _CRAWLEE_ROOT_PATH = Path(__file__).parent.parent.parent.resolve()
 
 
+def pytest_configure(config: Config) -> None:
+    for marker in [
+        'httpx',
+        'curl_impersonate',
+        'playwright',
+        'playwright_camoufox',
+        'parsel',
+        'beautifulsoup',
+        'uv',
+        'poetry',
+        'pip',
+    ]:
+        config.addinivalue_line('markers', f'{marker}: Integration test parameter marker.')
+
+
 @pytest.fixture(scope='session')
 def crawlee_wheel_path(tmp_path_factory: pytest.TempPathFactory, testrun_uid: str) -> Path:
     """Build the package wheel if it hasn't been built yet, and return the path to the wheel."""

tests/e2e/project_template/test_static_crawlers_templates.py

Lines changed: 41 additions & 12 deletions
@@ -2,24 +2,50 @@
 import re
 import subprocess
 from pathlib import Path
+from typing import Literal
 
 import pytest
 from apify_client import ApifyClientAsync
 from cookiecutter.main import cookiecutter
 
 from crawlee._cli import default_start_url, template_directory
 from crawlee._utils.crypto import crypto_random_object_id
-from tests.e2e.project_template.utils import patch_crawlee_version_in_pyproject_toml_based_project
+from tests.e2e.project_template.utils import patch_crawlee_version_in_project
 
 # To run these tests locally, make sure you have apify-cli installed and available in the path.
 # https://docs.apify.com/cli/docs/installation
 
 
-@pytest.mark.parametrize('http_client', ['httpx', 'curl-impersonate'])
-@pytest.mark.parametrize('crawler_type', ['parsel', 'beautifulsoup'])
-@pytest.mark.parametrize('package_manager', ['uv', 'poetry'])
+@pytest.mark.parametrize(
+    'crawler_type',
+    [
+        pytest.param('playwright-camoufox', marks=pytest.mark.playwright_camoufox),
+        pytest.param('playwright', marks=pytest.mark.playwright),
+        pytest.param('parsel', marks=pytest.mark.parsel),
+        pytest.param('beautifulsoup', marks=pytest.mark.beautifulsoup),
+    ],
+)
+@pytest.mark.parametrize(
+    'http_client',
+    [
+        pytest.param('httpx', marks=pytest.mark.httpx),
+        pytest.param('curl-impersonate', marks=pytest.mark.curl_impersonate),
+    ],
+)
+@pytest.mark.parametrize(
+    'package_manager',
+    [
+        pytest.param('pip', marks=pytest.mark.pip),
+        pytest.param('uv', marks=pytest.mark.uv),
+        pytest.param('poetry', marks=pytest.mark.poetry),
+    ],
+)
 async def test_static_crawler_actor_at_apify(
-    tmp_path: Path, crawlee_wheel_path: Path, package_manager: str, crawler_type: str, http_client: str
+    tmp_path: Path,
+    crawlee_wheel_path: Path,
+    package_manager: Literal['pip', 'uv', 'poetry'],
+    crawler_type: str,
+    http_client: str,
 ) -> None:
     # Generate new actor name
     actor_name = f'crawlee-python-template-e2e-test-{crypto_random_object_id(8).lower()}'

@@ -40,8 +66,8 @@ async def test_static_crawler_actor_at_apify(
         output_dir=tmp_path,
     )
 
-    patch_crawlee_version_in_pyproject_toml_based_project(
-        project_path=tmp_path / actor_name, wheel_path=crawlee_wheel_path
+    patch_crawlee_version_in_project(
+        project_path=tmp_path / actor_name, wheel_path=crawlee_wheel_path, package_manager=package_manager
     )
 
     # Build actor using sequence of cli commands as the user would

@@ -56,16 +82,19 @@
     build_process = subprocess.run(['apify', 'push'], capture_output=True, check=False, cwd=tmp_path / actor_name)  # noqa: ASYNC221, S603, S607
     # Get actor ID from build log
     actor_id_regexp = re.compile(r'https:\/\/console\.apify\.com\/actors\/(.*)#\/builds\/\d*\.\d*\.\d*')
-    # Why is it in stderr and not in stdout???
-    actor_id = re.findall(actor_id_regexp, build_process.stderr.decode())[0]
+
+    if match := re.findall(actor_id_regexp, build_process.stderr.decode()):
+        actor_id = match[0]
+    else:
+        raise AssertionError(f'Failed to find actor id in build log: {build_process.stderr.decode()}')
 
     client = ApifyClientAsync(token=os.getenv('APIFY_TEST_USER_API_TOKEN'))
     actor = client.actor(actor_id)
 
     # Run actor
     try:
         assert build_process.returncode == 0
-        started_run_data = await actor.start()
+        started_run_data = await actor.start(memory_mbytes=8192)
         actor_run = client.run(started_run_data['id'])
 
         finished_run_data = await actor_run.wait_for_finish()

@@ -80,6 +109,6 @@ async def test_static_crawler_actor_at_apify(
     assert finished_run_data
     assert finished_run_data['status'] == 'SUCCEEDED', additional_run_info
     assert (
-        'Crawler.stop() was called with following reason: The crawler has reached its limit of 50 requests per crawl.'
+        'Crawler.stop() was called with following reason: The crawler has reached its limit of 10 requests per crawl.'
     ) in actor_run_log, additional_run_info
-    assert int(re.findall(r'requests_finished\s*│\s*(\d*)', actor_run_log)[-1]) >= 50, additional_run_info
+    assert int(re.findall(r'requests_finished\s*│\s*(\d*)', actor_run_log)[-1]) >= 10, additional_run_info
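
Note: a minimal, self-contained sketch (not part of the commit) of the selection mechanism the matrix relies on. Each `pytest.param` carries a mark, so a `-m` expression picks exactly one cell of the cartesian product; the names below are illustrative only:

import pytest

# The real suite registers these marks in tests/e2e/conftest.py (see above);
# in an unregistered standalone file, pytest would only emit PytestUnknownMarkWarning.


@pytest.mark.parametrize(
    'http_client',
    [
        pytest.param('httpx', marks=pytest.mark.httpx),
        pytest.param('curl-impersonate', marks=pytest.mark.curl_impersonate),
    ],
)
@pytest.mark.parametrize(
    'package_manager',
    [
        pytest.param('pip', marks=pytest.mark.pip),
        pytest.param('uv', marks=pytest.mark.uv),
    ],
)
def test_cell(http_client: str, package_manager: str) -> None:
    # `pytest -m "httpx and uv"` keeps only the httpx+uv cell;
    # the other three cells of the 2 x 2 product are deselected.
    assert http_client and package_manager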
