
Commit f61b1f8

Merge remote-tracking branch 'origin/master' into only-apply-timeout-to-request-handler
2 parents: 4b80365 + 131f1f0

File tree: 24 files changed, +1285 -483 lines

.github/workflows/build_and_deploy_docs.yaml

Lines changed: 5 additions & 1 deletion

@@ -67,6 +67,10 @@ jobs:
         uses: actions/deploy-pages@v4
 
       - name: Invalidate CloudFront cache
-        run: gh workflow run invalidate.yaml --repo apify/apify-docs-private
+        run: |
+          gh workflow run invalidate-cloudfront.yml \
+            --repo apify/apify-docs-private \
+            --field deployment=crawlee-web
+          echo "✅ CloudFront cache invalidation workflow triggered successfully"
         env:
           GITHUB_TOKEN: ${{ secrets.APIFY_SERVICE_ACCOUNT_GITHUB_TOKEN }}

CHANGELOG.md

Lines changed: 6 additions & 2 deletions

@@ -3,17 +3,21 @@
 All notable changes to this project will be documented in this file.
 
 <!-- git-cliff-unreleased-start -->
-## 1.1.1 - **not yet released**
+## 1.1.2 - **not yet released**
+
+
+<!-- git-cliff-unreleased-end -->
+## [1.1.1](https://github.com/apify/crawlee-python/releases/tag/v1.1.1) (2025-12-02)
 
 ### 🐛 Bug Fixes
 
 - Unify separators in `unique_key` construction ([#1569](https://github.com/apify/crawlee-python/pull/1569)) ([af46a37](https://github.com/apify/crawlee-python/commit/af46a3733b059a8052489296e172f005def953f7)) by [@vdusek](https://github.com/vdusek), closes [#1512](https://github.com/apify/crawlee-python/issues/1512)
 - Fix `same-domain` strategy ignoring public suffix ([#1572](https://github.com/apify/crawlee-python/pull/1572)) ([3d018b2](https://github.com/apify/crawlee-python/commit/3d018b21a28a4bee493829783057188d6106a69b)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1571](https://github.com/apify/crawlee-python/issues/1571)
 - Make context helpers work in `FailedRequestHandler` and `ErrorHandler` ([#1570](https://github.com/apify/crawlee-python/pull/1570)) ([b830019](https://github.com/apify/crawlee-python/commit/b830019350830ac33075316061659e2854f7f4a5)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1532](https://github.com/apify/crawlee-python/issues/1532)
 - Fix non-ASCII character corruption in `FileSystemStorageClient` on systems without UTF-8 default encoding ([#1580](https://github.com/apify/crawlee-python/pull/1580)) ([f179f86](https://github.com/apify/crawlee-python/commit/f179f8671b0b6af9264450e4fef7e49d1cecd2bd)) by [@Mantisus](https://github.com/Mantisus), closes [#1579](https://github.com/apify/crawlee-python/issues/1579)
+- Respect `<base>` when enqueuing ([#1590](https://github.com/apify/crawlee-python/pull/1590)) ([de517a1](https://github.com/apify/crawlee-python/commit/de517a1629cc29b20568143eb64018f216d4ba33)) by [@Mantisus](https://github.com/Mantisus), closes [#1589](https://github.com/apify/crawlee-python/issues/1589)
 
 
-<!-- git-cliff-unreleased-end -->
 ## [1.1.0](https://github.com/apify/crawlee-python/releases/tag/v1.1.0) (2025-11-18)
 
 ### 🚀 Features

docs/deployment/code_examples/google/cloud_run_example.py

Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@
 from crawlee.storage_clients import MemoryStorageClient
 
 
-@get('/')
+@get('/') # type: ignore[untyped-decorator]
 async def main() -> str:
     """The crawler entry point that will be called when the HTTP endpoint is accessed."""
     # highlight-start

docs/deployment/code_examples/google/google_example.py

Lines changed: 2 additions & 5 deletions

@@ -6,10 +6,7 @@
 import functions_framework
 from flask import Request, Response
 
-from crawlee.crawlers import (
-    BeautifulSoupCrawler,
-    BeautifulSoupCrawlingContext,
-)
+from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
 from crawlee.storage_clients import MemoryStorageClient
 
 
@@ -51,7 +48,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
     # highlight-end
 
 
-@functions_framework.http
+@functions_framework.http # type: ignore[untyped-decorator]
 def crawlee_run(request: Request) -> Response:
     # You can pass data to your crawler using `request`
     function_id = request.headers['Function-Execution-Id']

docs/guides/code_examples/running_in_web_server/server.py

Lines changed: 2 additions & 2 deletions

@@ -14,7 +14,7 @@
 app = FastAPI(lifespan=lifespan, title='Crawler app')
 
 
-@app.get('/', response_class=HTMLResponse)
+@app.get('/', response_class=HTMLResponse) # type: ignore[untyped-decorator]
 def index() -> str:
     return """
     <!DOCTYPE html>
@@ -32,7 +32,7 @@ def index() -> str:
     """
 
 
-@app.get('/scrape')
+@app.get('/scrape') # type: ignore[untyped-decorator]
 async def scrape_url(request: Request, url: str | None = None) -> dict:
     if not url:
         return {'url': 'missing', 'scrape result': 'no results'}

pyproject.toml

Lines changed: 3 additions & 3 deletions

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "crawlee"
-version = "1.1.1"
+version = "1.1.2"
 description = "Crawlee for Python"
 authors = [{ name = "Apify Technologies s.r.o.", email = "[email protected]" }]
 license = { file = "LICENSE" }
@@ -75,7 +75,7 @@ otel = [
 ]
 sql_postgres = [
     "sqlalchemy[asyncio]>=2.0.0,<3.0.0",
-    "asyncpg>=0.24.0; python_version < '3.14'" # TODO: https://github.com/apify/crawlee-python/issues/1555
+    "asyncpg>=0.24.0"
 ]
 sql_sqlite = [
     "sqlalchemy[asyncio]>=2.0.0,<3.0.0",
@@ -102,7 +102,7 @@ dev = [
     "build<2.0.0", # For e2e tests.
     "dycw-pytest-only<3.0.0",
     "fakeredis[probabilistic,json,lua]<3.0.0",
-    "mypy~=1.18.0",
+    "mypy~=1.19.0",
     "pre-commit<5.0.0",
     "proxy-py<3.0.0",
     "pydoc-markdown<5.0.0",

src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

Lines changed: 8 additions & 2 deletions

@@ -193,9 +193,15 @@ async def extract_links(
         kwargs.setdefault('strategy', 'same-hostname')
 
         links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-        links_iterator = to_absolute_url_iterator(
-            context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+
+        # Get base URL from <base> tag if present
+        extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]'))
+        base_url: str = (
+            str(extracted_base_urls[0])
+            if extracted_base_urls
+            else context.request.loaded_url or context.request.url
         )
+        links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)
 
         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
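
In effect, relative links are now resolved against the page's `<base href>` when one is present, falling back to the loaded URL otherwise, which matches how browsers resolve them. A minimal standalone illustration of that resolution rule using only `urllib.parse.urljoin` (the URLs are made up for the example; this is not the crawlee implementation):

from urllib.parse import urljoin

page_url = 'https://example.com/articles/'      # the URL the crawler loaded
base_href = 'https://cdn.example.com/mirror/'   # value of a <base href="..."> on that page
relative_link = 'item/42'

# Previous behavior: resolve against the loaded URL only.
print(urljoin(page_url, relative_link))   # https://example.com/articles/item/42

# New behavior: when a <base> element is present, resolve against it instead.
print(urljoin(base_href, relative_link))  # https://cdn.example.com/mirror/item/42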

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 6 additions & 3 deletions

@@ -396,9 +396,12 @@ async def extract_links(
         links_iterator: Iterator[str] = iter(
             [url for element in elements if (url := await element.get_attribute('href')) is not None]
         )
-        links_iterator = to_absolute_url_iterator(
-            context.request.loaded_url or context.request.url, links_iterator, logger=context.log
-        )
+
+        # Get base URL from <base> tag if present
+        extracted_base_url = await context.page.evaluate('document.baseURI')
+        base_url: str = extracted_base_url or context.request.loaded_url or context.request.url
+
+        links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)
 
         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
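
The Playwright variant gets the same behavior more directly: `document.baseURI` already reflects a `<base>` element and falls back to the document URL otherwise, so a single `page.evaluate` call suffices. A small standalone sketch of that property, assuming Playwright with a Chromium browser is installed (the inline HTML is illustrative only):

import asyncio

from playwright.async_api import async_playwright


async def main() -> None:
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        # A tiny document that declares a <base> element.
        await page.set_content('<base href="https://cdn.example.com/mirror/"><a href="item/42">link</a>')
        print(await page.evaluate('document.baseURI'))  # https://cdn.example.com/mirror/
        await browser.close()


asyncio.run(main())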

src/crawlee/storage_clients/_sql/_storage_client.py

Lines changed: 0 additions & 9 deletions

@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import sys
 import warnings
 from datetime import timedelta
 from pathlib import Path
@@ -269,14 +268,6 @@ def _get_or_create_engine(self, configuration: Configuration) -> AsyncEngine:
                 'Unsupported database. Supported: sqlite, postgresql. Consider using a different database.'
             )
 
-        # TODO: https://github.com/apify/crawlee-python/issues/1555
-        if 'postgresql' in connection_string and sys.version_info >= (3, 14):
-            raise ValueError(
-                'SqlStorageClient cannot use PostgreSQL with Python 3.14 '
-                'due to asyncpg compatibility limitations. '
-                'Please use Python 3.13 or earlier, or switch to SQLite.'
-            )
-
         self._engine = create_async_engine(
             connection_string,
             future=True,

tests/e2e/project_template/test_static_crawlers_templates.py

Lines changed: 3 additions & 0 deletions

@@ -71,6 +71,9 @@ async def test_static_crawler_actor_at_apify(
         project_path=tmp_path / actor_name, wheel_path=crawlee_wheel_path, package_manager=package_manager
     )
 
+    # Print apify version for debugging purposes in rare cases of CLI failures
+    subprocess.run(['apify', '--version'], check=True) # noqa: ASYNC221, S607
+
     # Build actor using sequence of cli commands as the user would
     subprocess.run( # noqa: ASYNC221, S603
         ['apify', 'login', '-t', os.environ['APIFY_TEST_USER_API_TOKEN']], # noqa: S607
