
Commit f61b1f8

Merge remote-tracking branch 'origin/master' into only-apply-timeout-to-request-handler
2 parents: 4b80365 + 131f1f0

File tree: 24 files changed, +1285 -483 lines

.github/workflows/build_and_deploy_docs.yaml

Lines changed: 5 additions & 1 deletion

@@ -67,6 +67,10 @@ jobs:
         uses: actions/deploy-pages@v4
 
       - name: Invalidate CloudFront cache
-        run: gh workflow run invalidate.yaml --repo apify/apify-docs-private
+        run: |
+          gh workflow run invalidate-cloudfront.yml \
+            --repo apify/apify-docs-private \
+            --field deployment=crawlee-web
+          echo "✅ CloudFront cache invalidation workflow triggered successfully"
         env:
           GITHUB_TOKEN: ${{ secrets.APIFY_SERVICE_ACCOUNT_GITHUB_TOKEN }}

CHANGELOG.md

Lines changed: 6 additions & 2 deletions

@@ -3,17 +3,21 @@
 All notable changes to this project will be documented in this file.
 
 <!-- git-cliff-unreleased-start -->
-## 1.1.1 - **not yet released**
+## 1.1.2 - **not yet released**
+
+
+<!-- git-cliff-unreleased-end -->
+## [1.1.1](https://github.com/apify/crawlee-python/releases/tag/v1.1.1) (2025-12-02)
 
 ### 🐛 Bug Fixes
 
 - Unify separators in `unique_key` construction ([#1569](https://github.com/apify/crawlee-python/pull/1569)) ([af46a37](https://github.com/apify/crawlee-python/commit/af46a3733b059a8052489296e172f005def953f7)) by [@vdusek](https://github.com/vdusek), closes [#1512](https://github.com/apify/crawlee-python/issues/1512)
 - Fix `same-domain` strategy ignoring public suffix ([#1572](https://github.com/apify/crawlee-python/pull/1572)) ([3d018b2](https://github.com/apify/crawlee-python/commit/3d018b21a28a4bee493829783057188d6106a69b)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1571](https://github.com/apify/crawlee-python/issues/1571)
 - Make context helpers work in `FailedRequestHandler` and `ErrorHandler` ([#1570](https://github.com/apify/crawlee-python/pull/1570)) ([b830019](https://github.com/apify/crawlee-python/commit/b830019350830ac33075316061659e2854f7f4a5)) by [@Pijukatel](https://github.com/Pijukatel), closes [#1532](https://github.com/apify/crawlee-python/issues/1532)
 - Fix non-ASCII character corruption in `FileSystemStorageClient` on systems without UTF-8 default encoding ([#1580](https://github.com/apify/crawlee-python/pull/1580)) ([f179f86](https://github.com/apify/crawlee-python/commit/f179f8671b0b6af9264450e4fef7e49d1cecd2bd)) by [@Mantisus](https://github.com/Mantisus), closes [#1579](https://github.com/apify/crawlee-python/issues/1579)
+- Respect `<base>` when enqueuing ([#1590](https://github.com/apify/crawlee-python/pull/1590)) ([de517a1](https://github.com/apify/crawlee-python/commit/de517a1629cc29b20568143eb64018f216d4ba33)) by [@Mantisus](https://github.com/Mantisus), closes [#1589](https://github.com/apify/crawlee-python/issues/1589)
 
 
-<!-- git-cliff-unreleased-end -->
 ## [1.1.0](https://github.com/apify/crawlee-python/releases/tag/v1.1.0) (2025-11-18)
 
 ### 🚀 Features

docs/deployment/code_examples/google/cloud_run_example.py

Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@
 from crawlee.storage_clients import MemoryStorageClient
 
 
-@get('/')
+@get('/') # type: ignore[untyped-decorator]
 async def main() -> str:
     """The crawler entry point that will be called when the HTTP endpoint is accessed."""
     # highlight-start

docs/deployment/code_examples/google/google_example.py

Lines changed: 2 additions & 5 deletions

@@ -6,10 +6,7 @@
 import functions_framework
 from flask import Request, Response
 
-from crawlee.crawlers import (
-    BeautifulSoupCrawler,
-    BeautifulSoupCrawlingContext,
-)
+from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
 from crawlee.storage_clients import MemoryStorageClient
 
 
@@ -51,7 +48,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
     # highlight-end
 
 
-@functions_framework.http
+@functions_framework.http # type: ignore[untyped-decorator]
 def crawlee_run(request: Request) -> Response:
     # You can pass data to your crawler using `request`
     function_id = request.headers['Function-Execution-Id']

docs/guides/code_examples/running_in_web_server/server.py

Lines changed: 2 additions & 2 deletions

@@ -14,7 +14,7 @@
 app = FastAPI(lifespan=lifespan, title='Crawler app')
 
 
-@app.get('/', response_class=HTMLResponse)
+@app.get('/', response_class=HTMLResponse) # type: ignore[untyped-decorator]
 def index() -> str:
     return """
     <!DOCTYPE html>
@@ -32,7 +32,7 @@ def index() -> str:
     """
 
 
-@app.get('/scrape')
+@app.get('/scrape') # type: ignore[untyped-decorator]
 async def scrape_url(request: Request, url: str | None = None) -> dict:
     if not url:
         return {'url': 'missing', 'scrape result': 'no results'}

pyproject.toml

Lines changed: 3 additions & 3 deletions

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "crawlee"
-version = "1.1.1"
+version = "1.1.2"
 description = "Crawlee for Python"
 authors = [{ name = "Apify Technologies s.r.o.", email = "[email protected]" }]
 license = { file = "LICENSE" }
@@ -75,7 +75,7 @@ otel = [
 ]
 sql_postgres = [
     "sqlalchemy[asyncio]>=2.0.0,<3.0.0",
-    "asyncpg>=0.24.0; python_version < '3.14'" # TODO: https://github.com/apify/crawlee-python/issues/1555
+    "asyncpg>=0.24.0"
 ]
 sql_sqlite = [
     "sqlalchemy[asyncio]>=2.0.0,<3.0.0",
@@ -102,7 +102,7 @@ dev = [
     "build<2.0.0", # For e2e tests.
     "dycw-pytest-only<3.0.0",
     "fakeredis[probabilistic,json,lua]<3.0.0",
-    "mypy~=1.18.0",
+    "mypy~=1.19.0",
     "pre-commit<5.0.0",
     "proxy-py<3.0.0",
     "pydoc-markdown<5.0.0",

src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

Lines changed: 8 additions & 2 deletions

@@ -193,9 +193,15 @@ async def extract_links(
         kwargs.setdefault('strategy', 'same-hostname')
 
         links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
-        links_iterator = to_absolute_url_iterator(
-            context.request.loaded_url or context.request.url, links_iterator, logger=context.log
+
+        # Get base URL from <base> tag if present
+        extracted_base_urls = list(self._parser.find_links(parsed_content, 'base[href]'))
+        base_url: str = (
+            str(extracted_base_urls[0])
+            if extracted_base_urls
+            else context.request.loaded_url or context.request.url
         )
+        links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)
 
         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
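
In effect, relative links are now resolved against the page's `<base href>` when one is present, falling back to the loaded URL otherwise, which matches how browsers resolve them. A minimal standalone illustration of that resolution rule using only `urllib.parse.urljoin` (the URLs are made up for the example; this is not the crawlee implementation):

from urllib.parse import urljoin

page_url = 'https://example.com/articles/'      # the URL the crawler loaded
base_href = 'https://cdn.example.com/mirror/'   # value of a <base href="..."> on that page
relative_link = 'item/42'

# Previous behavior: resolve against the loaded URL only.
print(urljoin(page_url, relative_link))   # https://example.com/articles/item/42

# New behavior: when a <base> element is present, resolve against it instead.
print(urljoin(base_href, relative_link))  # https://cdn.example.com/mirror/item/42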

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 6 additions & 3 deletions

@@ -396,9 +396,12 @@ async def extract_links(
         links_iterator: Iterator[str] = iter(
             [url for element in elements if (url := await element.get_attribute('href')) is not None]
         )
-        links_iterator = to_absolute_url_iterator(
-            context.request.loaded_url or context.request.url, links_iterator, logger=context.log
-        )
+
+        # Get base URL from <base> tag if present
+        extracted_base_url = await context.page.evaluate('document.baseURI')
+        base_url: str = extracted_base_url or context.request.loaded_url or context.request.url
+
+        links_iterator = to_absolute_url_iterator(base_url, links_iterator, logger=context.log)
 
         if robots_txt_file:
             skipped, links_iterator = partition(lambda url: robots_txt_file.is_allowed(url), links_iterator)
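
The Playwright variant gets the same behavior more directly: `document.baseURI` already reflects a `<base>` element and falls back to the document URL otherwise, so a single `page.evaluate` call suffices. A small standalone sketch of that property, assuming Playwright with a Chromium browser is installed (the inline HTML is illustrative only):

import asyncio

from playwright.async_api import async_playwright


async def main() -> None:
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        # A tiny document that declares a <base> element.
        await page.set_content('<base href="https://cdn.example.com/mirror/"><a href="item/42">link</a>')
        print(await page.evaluate('document.baseURI'))  # https://cdn.example.com/mirror/
        await browser.close()


asyncio.run(main())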

src/crawlee/storage_clients/_sql/_storage_client.py

Lines changed: 0 additions & 9 deletions

@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import sys
 import warnings
 from datetime import timedelta
 from pathlib import Path
@@ -269,14 +268,6 @@ def _get_or_create_engine(self, configuration: Configuration) -> AsyncEngine:
                 'Unsupported database. Supported: sqlite, postgresql. Consider using a different database.'
             )
 
-        # TODO: https://github.com/apify/crawlee-python/issues/1555
-        if 'postgresql' in connection_string and sys.version_info >= (3, 14):
-            raise ValueError(
-                'SqlStorageClient cannot use PostgreSQL with Python 3.14 '
-                'due to asyncpg compatibility limitations. '
-                'Please use Python 3.13 or earlier, or switch to SQLite.'
-            )
-
         self._engine = create_async_engine(
             connection_string,
             future=True,

tests/e2e/project_template/test_static_crawlers_templates.py

Lines changed: 3 additions & 0 deletions

@@ -71,6 +71,9 @@ async def test_static_crawler_actor_at_apify(
         project_path=tmp_path / actor_name, wheel_path=crawlee_wheel_path, package_manager=package_manager
     )
 
+    # Print apify version for debugging purposes in rare cases of CLI failures
+    subprocess.run(['apify', '--version'], check=True) # noqa: ASYNC221, S607
+
     # Build actor using sequence of cli commands as the user would
     subprocess.run( # noqa: ASYNC221, S603
         ['apify', 'login', '-t', os.environ['APIFY_TEST_USER_API_TOKEN']], # noqa: S607
