diff --git a/.github/workflows/check_pr_title.yaml b/.github/workflows/check_pr_title.yaml new file mode 100644 index 00000000..6970d93c --- /dev/null +++ b/.github/workflows/check_pr_title.yaml @@ -0,0 +1,14 @@ +name: Check PR title + +on: + pull_request_target: + types: [opened, edited, synchronize] + +jobs: + check_pr_title: + name: Check PR title + runs-on: ubuntu-latest + steps: + - uses: amannn/action-semantic-pull-request@v5.5.3 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 05866515..4b84ba92 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -104,16 +104,12 @@ jobs: if: steps.get-release-type.outputs.release_type != 'final' run: python ./scripts/update_version_for_prerelease.py ${{ steps.get-release-type.outputs.release_type }} - - # Build a source distribution and a python3-only wheel - name: Build distribution files + # Builds the package. + - name: Build package run: make build - - # Check whether the package description will render correctly on PyPI - name: Check package rendering on PyPI - run: make twine-check - - - # Publish package to PyPI using their official GitHub action - name: Publish package to PyPI + # Publishes the package to PyPI using PyPA official GitHub action with OIDC authentication. 
+ - name: Publish package to PyPI uses: pypa/gh-action-pypi-publish@release/v1 - # Tag the current commit with the version tag if this is not made from the release event (releases are tagged with the release process) diff --git a/.github/workflows/update_new_issue.yaml b/.github/workflows/update_new_issue.yaml new file mode 100644 index 00000000..ffa81f77 --- /dev/null +++ b/.github/workflows/update_new_issue.yaml @@ -0,0 +1,23 @@ +name: Update new issue + +on: + workflow_call: + +jobs: + label_issues: + name: Label issues + runs-on: ubuntu-latest + permissions: + issues: write + + steps: + # Add the "t-tooling" label to all new issues + - uses: actions/github-script@v7 + with: + script: | + github.rest.issues.addLabels({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + labels: ["t-tooling"] + }) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4caf9f6f..4a0af384 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,8 +19,14 @@ repos: language: system pass_filenames: false - - id: check-changelog - name: Check whether current version is mentioned in changelog + - id: check-changelog-entry + name: Check changelog entry entry: make check-changelog-entry language: system pass_filenames: false + + - id: check-version-conflict + name: Check version conflict + entry: make check-version-conflict + language: system + pass_filenames: false diff --git a/CHANGELOG.md b/CHANGELOG.md index 35f5d80f..34c088bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Changelog -## [1.7.3](../../releases/tag/v1.7.3) - Unreleased +## [2.0.0](../../releases/tag/v2.0.0) - Unreleased ### Added diff --git a/Makefile b/Makefile index d0b15794..ca1f63b8 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: clean install-dev build publish twine-check lint unit-tests integration-tests type-check check-code format check-version-availability check-changelog-entry build-api-reference +.PHONY: 
clean install-dev build publish-to-pypi lint type-check unit-tests unit-tests-cov integration-tests format check-code check-version-availability check-changelog-entry build-api-reference run-doc DIRS_WITH_CODE = src tests scripts @@ -6,48 +6,53 @@ DIRS_WITH_CODE = src tests scripts INTEGRATION_TESTS_CONCURRENCY = 1 clean: - rm -rf build dist .mypy_cache .pytest_cache .ruff_cache src/*.egg-info __pycache__ + rm -rf .mypy_cache .pytest_cache .ruff_cache build dist htmlcov .coverage install-dev: - python3 -m pip install --upgrade pip - pip install --no-cache-dir -e ".[dev,scrapy]" - pre-commit install + python3 -m pip install --upgrade pip poetry + poetry install --all-extras + poetry run pre-commit install build: - python3 -m build + poetry build --no-interaction -vv -publish: - python3 -m twine upload dist/* - -twine-check: - python3 -m twine check dist/* +# APIFY_PYPI_TOKEN_CRAWLEE is expected to be set in the environment +publish-to-pypi: + poetry config pypi-token.pypi "${APIFY_PYPI_TOKEN_CRAWLEE}" + poetry publish --no-interaction -vv lint: - python3 -m ruff check $(DIRS_WITH_CODE) + poetry run ruff format --check $(DIRS_WITH_CODE) + poetry run ruff check $(DIRS_WITH_CODE) + +type-check: + poetry run mypy $(DIRS_WITH_CODE) unit-tests: - python3 -m pytest --numprocesses=auto --verbose -ra --cov=src/apify tests/unit + poetry run pytest --numprocesses=auto --verbose --cov=src/apify tests/unit unit-tests-cov: - python3 -m pytest --numprocesses=auto --verbose -ra --cov=src/apify --cov-report=html tests/unit + poetry run pytest --numprocesses=auto --verbose --cov=src/apify --cov-report=html tests/unit integration-tests: - python3 -m pytest --numprocesses=$(INTEGRATION_TESTS_CONCURRENCY) --verbose -ra tests/integration + poetry run pytest --numprocesses=$(INTEGRATION_TESTS_CONCURRENCY) tests/integration -type-check: - python3 -m mypy $(DIRS_WITH_CODE) +format: + poetry run ruff check --fix $(DIRS_WITH_CODE) + poetry run ruff format $(DIRS_WITH_CODE) +# The check-code 
target runs a series of checks equivalent to those performed by pre-commit hooks +# and the run_checks.yaml GitHub Actions workflow. check-code: lint type-check unit-tests -format: - python3 -m ruff check --fix $(DIRS_WITH_CODE) - python3 -m ruff format $(DIRS_WITH_CODE) - check-version-availability: - python3 scripts/check_version_availability.py + poetry run python scripts/check_version_availability.py check-changelog-entry: - python3 scripts/check_version_in_changelog.py + poetry run python scripts/check_version_in_changelog.py build-api-reference: cd website && ./build_api_reference.sh + +run-doc: build-api-reference + cd website && npm clean-install && npm run start diff --git a/pyproject.toml b/pyproject.toml index f630f420..4e66bf39 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,130 +1,125 @@ -[project] +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +[tool.poetry] name = "apify" version = "2.0.0" description = "Apify SDK for Python" +authors = ["Apify Technologies s.r.o. <support@apify.com>"] +license = "Apache-2.0" readme = "README.md" -license = { text = "Apache Software License" } -authors = [{ name = "Apify Technologies s.r.o.", email = "support@apify.com" }] -keywords = ["apify", "sdk", "actor", "scraping", "automation"] - +packages = [{ include = "apify", from = "src" }] classifiers = [ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Topic :: Software Development :: Libraries", ] - -requires-python = ">=3.9" - -# We use inclusive ordered comparison clause for non-Apify packages intentionally in order to enhance the Apify SDK's -# compatibility with a wide range of external packages. 
This decision was discussed in detail in the following PR: -# https://github.com/apify/apify-sdk-python/pull/154 -dependencies = [ - "apify-client ~= 1.7.1", - "apify-shared ~= 1.1.2", - "aiofiles >= 22.1.0", - "aioshutil >= 1.0", - "colorama >= 0.4.6", - "crawlee >= 0.3.0", - "cryptography >= 39.0.0", - "httpx >= 0.24.0", - "lazy-object-proxy >= 1.10.0", - "psutil >= 5.9.0", - "pyee >= 11.0.0", - "sortedcollections >= 2.0.0", - "typing-extensions >= 4.1.0", - "websockets >= 10.1", -] - -[project.optional-dependencies] -dev = [ - "build ~= 1.2.0", - "filelock ~= 3.15.0", - "mypy ~= 1.10.0", - "pre-commit ~= 3.5.0", - "pydoc-markdown ~= 4.8.0", - "pytest ~= 8.2.0", - "pytest-asyncio ~= 0.23.0", - "pytest-cov ~= 5.0.0", - "pytest-only ~= 2.1.0", - "pytest-timeout ~= 2.3.0", - "pytest-xdist ~= 3.6.0", - "respx ~= 0.21.0", - "ruff ~= 0.5.0", - "setuptools ~= 70.3.0", # setuptools are used by pytest, but not explicitly required - "twine ~= 5.1.0", - "types-aiofiles ~= 24.1.0.20240626", - "types-colorama ~= 0.4.15.20240311", - "types-psutil ~= 6.0.0.20240621", -] -scrapy = [ - "scrapy >= 2.11.0", +keywords = [ + "apify", + "sdk", + "automation", + "chrome", + "crawlee", + "crawler", + "headless", + "scraper", + "scraping", ] -[project.urls] +[tool.poetry.urls] "Homepage" = "https://docs.apify.com/sdk/python/" -"Documentation" = "https://docs.apify.com/sdk/python/" -"Source" = "https://github.com/apify/apify-sdk-python" -"Issue tracker" = "https://github.com/apify/apify-sdk-python/issues" -"Changelog" = "https://github.com/apify/apify-sdk-python/blob/master/CHANGELOG.md" "Apify Homepage" = "https://apify.com" - -[build-system] -requires = ["setuptools ~= 70.3.0", "wheel"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.packages.find] -where = ["src"] -include = ["apify*"] - -[tool.setuptools.package-data] -apify = ["py.typed"] +"Changelog" = "https://docs.apify.com/sdk/python/docs/changelog" +"Documentation" = "https://docs.apify.com/sdk/python/" +"Issue 
Tracker" = "https://github.com/apify/apify-sdk-python/issues" +"Repository" = "https://github.com/apify/apify-sdk-python" + +# We use inclusive ordered comparison clauses for external packages intentionally in order to enhance SDK's +# compatibility with external packages. This decision was discussed in detail in the following PR: +# https://github.com/apify/apify-sdk-python/pull/154. +[tool.poetry.dependencies] +python = "^3.9" +aiofiles = "^22.1.0" +aioshutil = "^1.0" +apify-client = "^1.7.1" +apify-shared = "^1.1.2" +colorama = "^0.4.6" +crawlee = "^0.3.0" +cryptography = "^39.0.0" +httpx = "^0.27.0" +lazy-object-proxy = "^1.10.0" +psutil = "^6.0.0" +pyee = "^11.0.0" +scrapy = { version = "^2.11.0", optional = true } +sortedcollections = "^2.0.0" +typing-extensions = "^4.1.0" +websockets = "^10.1" + +[tool.poetry.group.dev.dependencies] +build = "^1.2.0" +filelock = "^3.15.0" +mypy = "^1.11.0" +pre-commit = "^3.8.0" +pydoc-markdown = "^4.8.0" +pytest = "^8.3.0" +pytest-asyncio = "^0.24.0" +pytest-cov = "^5.0.0" +pytest-only = "^2.1.0" +pytest-timeout = "^2.3.0" +pytest-xdist = "^3.6.0" +respx = "^0.21.0" +ruff = "^0.6.0" +setuptools = "^74.0.0" # setuptools are used by pytest but not explicitly required +twine = "^5.1.0" +types-aiofiles = "^24.1.0.20240626" +types-colorama = "^0.4.15.20240311" +types-psutil = "^6.0.0.20240621" + +[tool.poetry.extras] +scrapy = ["scrapy"] [tool.ruff] -line-length = 150 +line-length = 120 [tool.ruff.lint] select = ["ALL"] ignore = [ - "A002", # Argument is shadowing a Python builtin - "ANN101", # Missing type annotation for `self` in method - "ANN102", # Missing type annotation for `cls` in method - "ANN401", # Dynamically typed expressions (typing.Any) are disallowed in {filename} - "BLE001", # Do not catch blind exception - "C901", # `{name}` is too complex - "COM812", # This rule may cause conflicts when used with the formatter - "D100", # Missing docstring in public module - "D104", # Missing docstring in public package - 
"D107", # Missing docstring in `__init__` - "EM", # flake8-errmsg - "G004", # Logging statement uses f-string - "ISC001", # This rule may cause conflicts when used with the formatter - "FIX", # flake8-fixme - "PGH003", # Use specific rule codes when ignoring type issues - "PLR0911", # Too many return statements - "PLR0913", # Too many arguments in function definition - "PLR0915", # Too many statements - "PTH", # flake8-use-pathlib - "PYI034", # `__aenter__` methods in classes like `{name}` usually return `self` at runtime - "PYI036", # The second argument in `__aexit__` should be annotated with `object` or `BaseException | None` - "S102", # Use of `exec` detected - "S105", # Possible hardcoded password assigned to - "S106", # Possible hardcoded password assigned to argument: "{name}" - "S301", # `pickle` and modules that wrap it can be unsafe when used to deserialize untrusted data, possible security issue - "S303", # Use of insecure MD2, MD4, MD5, or SHA1 hash function - "S311", # Standard pseudo-random generators are not suitable for cryptographic purposes - "TD002", # Missing author in TODO; try: `# TODO(): ...` or `# TODO @: ... 
- "TRY003", # Avoid specifying long messages outside the exception class - - # TODO: Remove this once the following issue is fixed - # https://github.com/apify/apify-sdk-python/issues/150 - "SLF001", # Private member accessed: `{name}` + "ANN101", # Missing type annotation for `self` in method + "ANN102", # Missing type annotation for `{name}` in classmethod + "ANN401", # Dynamically typed expressions (typing.Any) are disallowed in {filename} + "ASYNC109", # Async function definition with a `timeout` parameter + "BLE001", # Do not catch blind exception + "C901", # `{name}` is too complex + "COM812", # This rule may cause conflicts when used with the formatter + "D100", # Missing docstring in public module + "D104", # Missing docstring in public package + "D107", # Missing docstring in `__init__` + "EM", # flake8-errmsg + "G004", # Logging statement uses f-string + "ISC001", # This rule may cause conflicts when used with the formatter + "FIX", # flake8-fixme + "PGH003", # Use specific rule codes when ignoring type issues + "PLR0911", # Too many return statements + "PLR0913", # Too many arguments in function definition + "PLR0915", # Too many statements + "PTH", # flake8-use-pathlib + "PYI034", # `__aenter__` methods in classes like `{name}` usually return `self` at runtime + "PYI036", # The second argument in `__aexit__` should be annotated with `object` or `BaseException | None` + "S102", # Use of `exec` detected + "S105", # Possible hardcoded password assigned to: "{name}" + "S106", # Possible hardcoded password assigned to argument: "{name}" + "S301", # `pickle` and modules that wrap it can be unsafe when used to deserialize untrusted data, possible security issue + "S303", # Use of insecure MD2, MD4, MD5, or SHA1 hash function + "S311", # Standard pseudo-random generators are not suitable for cryptographic purposes + "TD002", # Missing author in TODO; try: `# TODO(<author_name>): ...` or `# TODO @<author_name>: ...`
+ "TRY003", # Avoid specifying long messages outside the exception class ] [tool.ruff.format] @@ -144,29 +139,38 @@ indent-style = "space" "**/{tests}/*" = [ "D", # Everything from the pydocstyle "INP001", # File {filename} is part of an implicit namespace package, add an __init__.py - "PT011", # `pytest.raises({ExceptionType})` is too broad, set the `match` parameter or use a more specific exception "PLR2004", # Magic value used in comparison, consider replacing {value} with a constant variable "S101", # Use of assert detected + "SLF001", # Private member accessed: `{name}` "T20", # flake8-print - "TID252", # Relative imports from parent modules are banned "TRY301", # Abstract `raise` to an inner function + "TID252", # Prefer absolute imports over relative imports from parent modules +] +"**/{docs}/**" = [ + "D", # Everything from the pydocstyle + "INP001", # File {filename} is part of an implicit namespace package, add an __init__.py + "F841", # Local variable {variable} is assigned to but never used ] [tool.ruff.lint.flake8-quotes] docstring-quotes = "double" inline-quotes = "single" -[tool.ruff.lint.isort] -known-local-folder = ["apify"] -known-first-party = ["apify_client", "apify_shared", "crawlee"] +[tool.ruff.lint.flake8-builtins] +builtins-ignorelist = ["id"] [tool.ruff.lint.pydocstyle] convention = "google" -[tool.basedpyright] -typeCheckingMode = "standard" +[tool.ruff.lint.isort] +known-local-folder = ["apify"] +known-first-party = ["apify_client", "apify_shared", "crawlee"] + +[tool.ruff.lint.pylint] +max-branches = 18 [tool.pytest.ini_options] +addopts = "-ra" asyncio_mode = "auto" timeout = 1200 @@ -184,7 +188,21 @@ warn_redundant_casts = true warn_return_any = true warn_unreachable = true warn_unused_ignores = true +exclude = [] [[tool.mypy.overrides]] module = ['scrapy', 'scrapy.*', 'sortedcollections', 'lazy_object_proxy'] ignore_missing_imports = true + +[tool.coverage.report] +exclude_lines = [ + "pragma: no cover", + "if TYPE_CHECKING:", + 
"assert_never()" +] + +[tool.basedpyright] +typeCheckingMode = "standard" + +[tool.ipdb] +context = 7 diff --git a/renovate.json b/renovate.json index ea1447bc..e7f2e483 100644 --- a/renovate.json +++ b/renovate.json @@ -1,9 +1,26 @@ { - "$schema": "https://docs.renovatebot.com/renovate-schema.json", - "extends": [ - "config:recommended" + "extends": ["config:base", ":semanticCommitTypeAll(chore)"], + "pinVersions": false, + "separateMajorMinor": false, + "dependencyDashboard": false, + "semanticCommits": "enabled", + "lockFileMaintenance": { + "enabled": true, + "schedule": ["before 1am on monday"], + "automerge": true, + "automergeType": "branch" + }, + "packageRules": [ + { + "matchPaths": ["pyproject.toml"], + "matchDepTypes": ["devDependencies"], + "matchUpdateTypes": ["major", "minor"], + "groupName": "major/minor dev dependencies", + "groupSlug": "dev-dependencies", + "automerge": true, + "automergeType": "branch" + } ], - "ignorePaths": [ - "website/**" - ] + "schedule": ["before 1am on monday"], + "ignoreDeps": ["apify", "docusaurus-plugin-typedoc-api"] } diff --git a/scripts/check_version_in_changelog.py b/scripts/check_version_in_changelog.py index 63778f86..1bc6cba4 100755 --- a/scripts/check_version_in_changelog.py +++ b/scripts/check_version_in_changelog.py @@ -23,4 +23,6 @@ if re.match(rf'## \[{current_package_version}\].*$', line): break else: - raise RuntimeError(f'There is no entry in the changelog for the current package version ({current_package_version})') + raise RuntimeError( + f'There is no entry in the changelog for the current package version ({current_package_version})' + ) diff --git a/scripts/update_version_for_prerelease.py b/scripts/update_version_for_prerelease.py index ee1882f4..f09dbd3d 100755 --- a/scripts/update_version_for_prerelease.py +++ b/scripts/update_version_for_prerelease.py @@ -29,7 +29,9 @@ # We can only transform a stable release version (X.Y.Z) to a prerelease version (X.Y.ZxxxN) if not re.match(r'^\d+\.\d+\.\d+$', 
current_version): - raise RuntimeError(f'The current version {current_version} does not match the proper semver format for stable releases (X.Y.Z)') + raise RuntimeError( + f'The current version {current_version} does not match the proper semver format for stable releases (X.Y.Z)' + ) # Load the version numbers of the currently published versions from PyPI published_versions = get_published_package_versions() @@ -43,8 +45,7 @@ for version in published_versions: if version.startswith(f'{current_version}{prerelease_prefix}'): prerelease_version = int(version.split(prerelease_prefix)[1]) - if prerelease_version > latest_prerelease: - latest_prerelease = prerelease_version + latest_prerelease = max(prerelease_version, latest_prerelease) # Write the latest prerelease version number to pyproject.toml new_prerelease_version_number = f'{current_version}{prerelease_prefix}{latest_prerelease + 1}' diff --git a/src/apify/_actor.py b/src/apify/_actor.py index d2b85a7b..82a60cfc 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -46,7 +46,8 @@ class _ActorType: def __init__(self, config: Configuration | None = None) -> None: """Create an Actor instance. - Note that you don't have to do this, all the functionality is accessible using the default instance (e.g. `Actor.open_dataset()`). + Note that you don't have to do this, all the functionality is accessible using the default instance + (e.g. `Actor.open_dataset()`). Args: config: The Actor configuration to be used. If not passed, a new Configuration instance will be created. @@ -74,9 +75,8 @@ async def __aenter__(self) -> Self: Automatically initializes the Actor instance when you use it in an `async with ...` statement. - When you exit the `async with` block, the `Actor.exit()` method is called, - and if any exception happens while executing the block code, - the `Actor.fail` method is called. 
+ When you exit the `async with` block, the `Actor.exit()` method is called, and if any exception happens while + executing the block code, the `Actor.fail` method is called. """ await self.init() return self @@ -90,9 +90,8 @@ async def __aexit__( ) -> None: """Exit the Actor, handling any exceptions properly. - When you exit the `async with` block, the `Actor.exit()` method is called, - and if any exception happens while executing the block code, - the `Actor.fail` method is called. + When you exit the `async with` block, the `Actor.exit()` method is called, and if any exception happens while + executing the block code, the `Actor.fail` method is called. """ if not self._is_exiting: if exc_value: @@ -140,14 +139,13 @@ def _raise_if_not_initialized(self) -> None: async def init(self) -> None: """Initialize the Actor instance. - This initializes the Actor instance. - It configures the right storage client based on whether the Actor is running locally or on the Apify platform, - it initializes the event manager for processing Actor events, - and starts an interval for regularly sending `PERSIST_STATE` events, - so that the Actor can regularly persist its state in response to these events. + This initializes the Actor instance. It configures the right storage client based on whether the Actor is + running locally or on the Apify platform, it initializes the event manager for processing Actor events, + and starts an interval for regularly sending `PERSIST_STATE` events, so that the Actor can regularly persist + its state in response to these events. - This method should be called immediately before performing any additional Actor actions, - and it should be called only once. + This method should be called immediately before performing any additional Actor actions, and it should be + called only once. """ if self._is_initialized: raise RuntimeError('The Actor was already initialized!') @@ -185,11 +183,8 @@ async def exit( ) -> None: """Exit the Actor instance. 
- This stops the Actor instance. - It cancels all the intervals for regularly sending `PERSIST_STATE` events, - sends a final `PERSIST_STATE` event, - waits for all the event listeners to finish, - and stops the event manager. + This stops the Actor instance. It cancels all the intervals for regularly sending `PERSIST_STATE` events, + sends a final `PERSIST_STATE` event, waits for all the event listeners to finish, and stops the event manager. Args: exit_code: The exit code with which the Actor should fail (defaults to `0`). @@ -216,7 +211,7 @@ async def finalize() -> None: await self._event_manager.wait_for_all_listeners_to_complete(timeout=event_listeners_timeout) await self._event_manager.__aexit__(None, None, None) - cast(dict, service_container._services).clear() + cast(dict, service_container._services).clear() # noqa: SLF001 await asyncio.wait_for(finalize(), cleanup_timeout.total_seconds()) self._is_initialized = False @@ -239,8 +234,7 @@ async def fail( ) -> None: """Fail the Actor instance. - This performs all the same steps as Actor.exit(), - but it additionally sets the exit code to `1` (by default). + This performs all the same steps as Actor.exit(), but it additionally sets the exit code to `1` (by default). Args: exit_code: The exit code with which the Actor should fail (defaults to `1`). @@ -267,18 +261,20 @@ def new_client( ) -> ApifyClientAsync: """Return a new instance of the Apify API client. - The `ApifyClientAsync` class is provided by the [apify-client](https://github.com/apify/apify-client-python) package, - and it is automatically configured using the `APIFY_API_BASE_URL` and `APIFY_TOKEN` environment variables. + The `ApifyClientAsync` class is provided by the [apify-client](https://github.com/apify/apify-client-python) + package, and it is automatically configured using the `APIFY_API_BASE_URL` and `APIFY_TOKEN` environment + variables. - You can override the token via the available options. 
- That's useful if you want to use the client as a different Apify user than the SDK internals are using. + You can override the token via the available options. That's useful if you want to use the client + as a different Apify user than the SDK internals are using. Args: - token: The Apify API token - api_url: The URL of the Apify API server to which to connect to. Defaults to https://api.apify.com - max_retries: How many times to retry a failed request at most - min_delay_between_retries: How long will the client wait between retrying requests (increases exponentially from this value) - timeout: The socket timeout of the HTTP requests sent to the Apify API + token: The Apify API token. + api_url: The URL of the Apify API server to connect to. Defaults to https://api.apify.com. + max_retries: How many times to retry a failed request at most. + min_delay_between_retries: How long the client will wait between retrying requests + (increases exponentially from this value). + timeout: The socket timeout of the HTTP requests sent to the Apify API. """ token = token or self._configuration.token api_url = api_url or self._configuration.api_base_url @@ -286,7 +282,9 @@ def new_client( token=token, api_url=api_url, max_retries=max_retries, - min_delay_between_retries_millis=int(min_delay_between_retries.total_seconds() * 1000) if min_delay_between_retries is not None else None, + min_delay_between_retries_millis=int(min_delay_between_retries.total_seconds() * 1000) + if min_delay_between_retries is not None + else None, timeout_secs=int(timeout.total_seconds()) if timeout else None, ) @@ -299,20 +297,20 @@ async def open_dataset( ) -> Dataset: """Open a dataset. - Datasets are used to store structured data where each object stored has the same attributes, - such as online store products or real estate offers. - The actual data is stored either on the local filesystem or in the Apify cloud. 
+ Datasets are used to store structured data where each object stored has the same attributes, such as online + store products or real estate offers. The actual data is stored either on the local filesystem or in + the Apify cloud. Args: - id: ID of the dataset to be opened. - If neither `id` nor `name` are provided, the method returns the default dataset associated with the Actor run. - name: Name of the dataset to be opened. - If neither `id` nor `name` are provided, the method returns the default dataset associated with the Actor run. - force_cloud: If set to `True` then the Apify cloud storage is always used. - This way it is possible to combine local and cloud storage. - - Returns: An instance of the `Dataset` class for the given ID or name. - + id: ID of the dataset to be opened. If neither `id` nor `name` are provided, the method returns + the default dataset associated with the Actor run. + name: Name of the dataset to be opened. If neither `id` nor `name` are provided, the method returns + the default dataset associated with the Actor run. + force_cloud: If set to `True` then the Apify cloud storage is always used. This way it is possible + to combine local and cloud storage. + + Returns: + An instance of the `Dataset` class for the given ID or name. """ self._raise_if_not_initialized() @@ -332,19 +330,19 @@ async def open_key_value_store( ) -> KeyValueStore: """Open a key-value store. - Key-value stores are used to store records or files, along with their MIME content type. - The records are stored and retrieved using a unique key. - The actual data is stored either on a local filesystem or in the Apify cloud. + Key-value stores are used to store records or files, along with their MIME content type. The records are stored + and retrieved using a unique key. The actual data is stored either on a local filesystem or in the Apify cloud. Args: - id: ID of the key-value store to be opened. 
- If neither `id` nor `name` are provided, the method returns the default key-value store associated with the Actor run. - name: Name of the key-value store to be opened. - If neither `id` nor `name` are provided, the method returns the default key-value store associated with the Actor run. - force_cloud: If set to `True` then the Apify cloud storage is always used. - This way it is possible to combine local and cloud storage. - - Returns: An instance of the `KeyValueStore` class for the given ID or name. + id: ID of the key-value store to be opened. If neither `id` nor `name` are provided, the method returns + the default key-value store associated with the Actor run. + name: Name of the key-value store to be opened. If neither `id` nor `name` are provided, the method + returns the default key-value store associated with the Actor run. + force_cloud: If set to `True` then the Apify cloud storage is always used. This way it is possible + to combine local and cloud storage. + + Returns: + An instance of the `KeyValueStore` class for the given ID or name. """ self._raise_if_not_initialized() @@ -364,20 +362,21 @@ async def open_request_queue( ) -> RequestQueue: """Open a request queue. - Request queue represents a queue of URLs to crawl, which is stored either on local filesystem or in the Apify cloud. - The queue is used for deep crawling of websites, where you start with several URLs and then - recursively follow links to other pages. The data structure supports both breadth-first - and depth-first crawling orders. + Request queue represents a queue of URLs to crawl, which is stored either on local filesystem or in + the Apify cloud. The queue is used for deep crawling of websites, where you start with several URLs and then + recursively follow links to other pages. The data structure supports both breadth-first and depth-first + crawling orders. Args: - id: ID of the request queue to be opened. 
- If neither `id` nor `name` are provided, the method returns the default request queue associated with the Actor run. - name: Name of the request queue to be opened. - If neither `id` nor `name` are provided, the method returns the default request queue associated with the Actor run. - force_cloud: If set to `True` then the Apify cloud storage is always used. - This way it is possible to combine local and cloud storage. - - Returns: An instance of the `RequestQueue` class for the given ID or name. + id: ID of the request queue to be opened. If neither `id` nor `name` are provided, the method returns + the default request queue associated with the Actor run. + name: Name of the request queue to be opened. If neither `id` nor `name` are provided, the method returns + the default request queue associated with the Actor run. + force_cloud: If set to `True` then the Apify cloud storage is always used. This way it is possible + to combine local and cloud storage. + + Returns: + An instance of the `RequestQueue` class for the given ID or name. """ self._raise_if_not_initialized() @@ -453,27 +452,25 @@ def on(self, event_name: Event, listener: Callable) -> Callable: """Add an event listener to the Actor's event manager. The following events can be emitted: - - `Event.SYSTEM_INFO`: - Emitted every minute, the event data contains info about the resource usage of the Actor. - - `Event.MIGRATING`: - Emitted when the Actor running on the Apify platform is going to be migrated to another worker server soon. - You can use it to persist the state of the Actor and gracefully stop your in-progress tasks, - so that they are not interrupted by the migration.. - - `Event.PERSIST_STATE`: - Emitted in regular intervals (by default 60 seconds) to notify the Actor that it should persist its state, - in order to avoid repeating all work when the Actor restarts. 
- This event is automatically emitted together with the migrating event, - in which case the `isMigrating` flag in the event data is set to True, otherwise the flag is False. - Note that this event is provided merely for your convenience, - you can achieve the same effect using an interval and listening for the migrating event. - - `Event.ABORTING`: - When a user aborts an Actor run on the Apify platform, - they can choose to abort it gracefully, to allow the Actor some time before getting terminated. - This graceful abort emits the aborting event, which you can use to clean up the Actor state. + + - `Event.SYSTEM_INFO`: Emitted every minute; the event data contains information about the Actor's resource + usage. + + - `Event.MIGRATING`: Emitted when the Actor on the Apify platform is about to be migrated to another worker + server. Use this event to persist the Actor's state and gracefully stop in-progress tasks, preventing + disruption. + + - `Event.PERSIST_STATE`: Emitted regularly (default: 60 seconds) to notify the Actor to persist its state, + preventing work repetition after a restart. This event is emitted together with the `MIGRATING` event, where + the `isMigrating` flag in the event data is `True`; otherwise, the flag is `False`. This event is for + convenience; the same effect can be achieved by setting an interval and listening for the `MIGRATING` event. + + - `Event.ABORTING`: Emitted when a user aborts an Actor run on the Apify platform, allowing the Actor time + to clean up its state if the abort is graceful. Args: - event_name: The Actor event for which to listen to. - listener: The function which is to be called when the event is emitted (can be async). + event_name: The Actor event to listen for. + listener: The function to be called when the event is emitted (can be async). 
""" self._raise_if_not_initialized() @@ -485,22 +482,23 @@ def off(self, event_name: Event, listener: Callable | None = None) -> None: Args: event_name: The Actor event for which to remove listeners. - listener: The listener which is supposed to be removed. If not passed, all listeners of this event are removed. + listener: The listener which is supposed to be removed. If not passed, all listeners of this event + are removed. """ self._raise_if_not_initialized() self._event_manager.off(event=event_name, listener=listener) def is_at_home(self) -> bool: - """Return `True` when the Actor is running on the Apify platform, and `False` otherwise (for example when running locally).""" + """Return `True` when the Actor is running on the Apify platform, and `False` otherwise (e.g. local run).""" return self._configuration.is_at_home def get_env(self) -> dict: """Return a dictionary with information parsed from all the `APIFY_XXX` environment variables. - For a list of all the environment variables, - see the [Actor documentation](https://docs.apify.com/actors/development/environment-variables). - If some variables are not defined or are invalid, the corresponding value in the resulting dictionary will be None. + For a list of all the environment variables, see the + [Actor documentation](https://docs.apify.com/actors/development/environment-variables). If some variables + are not defined or are invalid, the corresponding value in the resulting dictionary will be None. """ self._raise_if_not_initialized() @@ -546,21 +544,24 @@ async def start( run_input: The input to pass to the Actor run. token: The Apify API token to use for this request (defaults to the `APIFY_TOKEN` environment variable). content_type: The content type of the input. - build: Specifies the Actor build to run. It can be either a build tag or build number. - By default, the run uses the build specified in the default run configuration for the Actor (typically latest). 
- memory_mbytes: Memory limit for the run, in megabytes. - By default, the run uses a memory limit specified in the default run configuration for the Actor. - timeout: Optional timeout for the run, in seconds. - By default, the run uses timeout specified in the default run configuration for the Actor. - wait_for_finish: The maximum number of seconds the server waits for the run to finish. By default, it is 0, the maximum value is 300. - webhooks: Optional ad-hoc webhooks (https://docs.apify.com/webhooks/ad-hoc-webhooks) associated with the Actor run which can be used to - receive a notification, e.g. when the Actor finished or failed. If you already have a webhook set up for the Actor or task, - you do not have to add it again here. Each webhook is represented by a dictionary containing these items: - * ``event_types``: list of ``WebhookEventType`` values which trigger the webhook - * ``request_url``: URL to which to send the webhook HTTP request - * ``payload_template`` (optional): Optional template for the request payload - - Returns: Info about the started Actor run + build: Specifies the Actor build to run. It can be either a build tag or build number. By default, + the run uses the build specified in the default run configuration for the Actor (typically latest). + memory_mbytes: Memory limit for the run, in megabytes. By default, the run uses a memory limit specified + in the default run configuration for the Actor. + timeout: Optional timeout for the run, in seconds. By default, the run uses timeout specified in + the default run configuration for the Actor. + wait_for_finish: The maximum number of seconds the server waits for the run to finish. By default, + it is 0, the maximum value is 300. + webhooks: Optional ad-hoc webhooks (https://docs.apify.com/webhooks/ad-hoc-webhooks) associated with + the Actor run which can be used to receive a notification, e.g. when the Actor finished or failed. 
+ If you already have a webhook set up for the Actor or task, you do not have to add it again here. + Each webhook is represented by a dictionary containing these items: + * `event_types`: list of `WebhookEventType` values which trigger the webhook + * `request_url`: URL to which to send the webhook HTTP request + * `payload_template` (optional): Optional template for the request payload + + Returns: + Info about the started Actor run """ self._raise_if_not_initialized() @@ -584,17 +585,20 @@ async def abort( status_message: str | None = None, gracefully: bool | None = None, ) -> dict: - """Abort given Actor run on the Apify platform using the current user account (determined by the `APIFY_TOKEN` environment variable). + """Abort given Actor run on the Apify platform using the current user account. + + The user account is determined by the `APIFY_TOKEN` environment variable. Args: run_id: The ID of the Actor run to be aborted. token: The Apify API token to use for this request (defaults to the `APIFY_TOKEN` environment variable). status_message: Status message of the Actor to be set on the platform. - gracefully: If True, the Actor run will abort gracefully. - It will send ``aborting`` and ``persistStates`` events into the run and force-stop the run after 30 seconds. - It is helpful in cases where you plan to resurrect the run later. + gracefully: If True, the Actor run will abort gracefully. It will send `aborting` and `persistState` + events into the run and force-stop the run after 30 seconds. It is helpful in cases where you plan + to resurrect the run later. - Returns: Info about the aborted Actor run + Returns: + Info about the aborted Actor run. """ self._raise_if_not_initialized() @@ -627,17 +631,20 @@ async def call( run_input: The input to pass to the Actor run. token: The Apify API token to use for this request (defaults to the `APIFY_TOKEN` environment variable). content_type: The content type of the input. - build: Specifies the Actor build to run. 
It can be either a build tag or build number. By default, the run uses the build specified in - the default run configuration for the Actor (typically latest). - memory_mbytes: Memory limit for the run, in megabytes. - By default, the run uses a memory limit specified in the default run configuration for the Actor. - timeout: Optional timeout for the run, in seconds. - By default, the run uses timeout specified in the default run configuration for the Actor. - webhooks: Optional webhooks (https://docs.apify.com/webhooks) associated with the Actor run, which can be used to receive a notification, - e.g. when the Actor finished or failed. If you already have a webhook set up for the Actor, you do not have to add it again here. - wait: The maximum number of seconds the server waits for the run to finish. If not provided, waits indefinitely. - - Returns: Info about the started Actor run + build: Specifies the Actor build to run. It can be either a build tag or build number. By default, + the run uses the build specified in the default run configuration for the Actor (typically latest). + memory_mbytes: Memory limit for the run, in megabytes. By default, the run uses a memory limit specified + in the default run configuration for the Actor. + timeout: Optional timeout for the run, in seconds. By default, the run uses timeout specified in + the default run configuration for the Actor. + webhooks: Optional webhooks (https://docs.apify.com/webhooks) associated with the Actor run, which can + be used to receive a notification, e.g. when the Actor finished or failed. If you already have + a webhook set up for the Actor, you do not have to add it again here. + wait: The maximum number of seconds the server waits for the run to finish. If not provided, + waits indefinitely. + + Returns: + Info about the started Actor run. """ self._raise_if_not_initialized() @@ -669,25 +676,28 @@ async def call_task( It waits indefinitely, unless the wait argument is provided. 
- Note that an Actor task is a saved input configuration and options for an Actor. - If you want to run an Actor directly rather than an Actor task, please use the `Actor.call` + Note that an Actor task is a saved input configuration and options for an Actor. If you want to run an Actor + directly rather than an Actor task, please use the `Actor.call` method instead. Args: task_id: The ID of the Actor to be run. task_input: Overrides the input to pass to the Actor run. token: The Apify API token to use for this request (defaults to the `APIFY_TOKEN` environment variable). content_type: The content type of the input. - build: Specifies the Actor build to run. It can be either a build tag or build number. - By default, the run uses the build specified in the default run configuration for the Actor (typically latest). - memory_mbytes: Memory limit for the run, in megabytes. - By default, the run uses a memory limit specified in the default run configuration for the Actor. - timeout: Optional timeout for the run, in seconds. - By default, the run uses timeout specified in the default run configuration for the Actor. - webhooks: Optional webhooks (https://docs.apify.com/webhooks) associated with the Actor run, which can be used to receive a notification, - e.g. when the Actor finished or failed. If you already have a webhook set up for the Actor, you do not have to add it again here. - wait: The maximum number of seconds the server waits for the run to finish. If not provided, waits indefinitely. - - Returns: Info about the started Actor run + build: Specifies the Actor build to run. It can be either a build tag or build number. By default, + the run uses the build specified in the default run configuration for the Actor (typically latest). + memory_mbytes: Memory limit for the run, in megabytes. By default, the run uses a memory limit specified + in the default run configuration for the Actor. + timeout: Optional timeout for the run, in seconds. 
By default, the run uses timeout specified in + the default run configuration for the Actor. + webhooks: Optional webhooks (https://docs.apify.com/webhooks) associated with the Actor run, which can + be used to receive a notification, e.g. when the Actor finished or failed. If you already have + a webhook set up for the Actor, you do not have to add it again here. + wait: The maximum number of seconds the server waits for the run to finish. If not provided, waits + indefinitely. + + Returns: + Info about the started Actor run. """ self._raise_if_not_initialized() @@ -713,15 +723,16 @@ async def metamorph( ) -> None: """Transform this Actor run to an Actor run of a different Actor. - The platform stops the current Actor container and starts a new container with the new Actor instead. - All the default storages are preserved, - and the new input is stored under the `INPUT-METAMORPH-1` key in the same default key-value store. + The platform stops the current Actor container and starts a new container with the new Actor instead. All + the default storages are preserved, and the new input is stored under the `INPUT-METAMORPH-1` key in the same + default key-value store. Args: target_actor_id: ID of the target Actor that the run should be transformed into run_input: The input to pass to the new run. target_actor_build: The build of the target Actor. It can be either a build tag or build number. - By default, the run uses the build specified in the default run configuration for the target Actor (typically the latest build). + By default, the run uses the build specified in the default run configuration for the target Actor + (typically the latest build). content_type: The content type of the input. custom_after_sleep: How long to sleep for after the metamorph, to wait for the container to be stopped. """ @@ -797,8 +808,8 @@ async def add_webhook( This webhook lets you receive a notification when the Actor run finished or failed. 
- Note that webhooks are only supported for Actors running on the Apify platform. - When running the Actor locally, the function will print a warning and have no effect. + Note that webhooks are only supported for Actors running on the Apify platform. When running the Actor locally, + the function will print a warning and have no effect. For more information about Apify Actor webhooks, please see the [documentation](https://docs.apify.com/webhooks). @@ -808,9 +819,11 @@ async def add_webhook( payload_template: Specification of the payload that will be sent to request_url ignore_ssl_errors: Whether the webhook should ignore SSL errors returned by request_url do_not_retry: Whether the webhook should retry sending the payload to request_url upon failure. - idempotency_key: A unique identifier of a webhook. You can use it to ensure that you won't create the same webhook multiple times. + idempotency_key: A unique identifier of a webhook. You can use it to ensure that you won't create + the same webhook multiple times. - Returns: The created webhook + Returns: + The created webhook. """ self._raise_if_not_initialized() @@ -844,7 +857,8 @@ async def set_status_message( status_message: The status message to set to the run. is_terminal: Set this flag to True if this is the final status message of the Actor run. - Returns: The updated Actor run object + Returns: + The updated Actor run object. 
""" self._raise_if_not_initialized() @@ -864,7 +878,8 @@ async def set_status_message( async def create_proxy_configuration( self, *, - actor_proxy_input: dict | None = None, # this is the raw proxy input from the actor run input, it is not spread or snake_cased in here + actor_proxy_input: dict + | None = None, # this is the raw proxy input from the actor run input, it is not spread or snake_cased in here password: str | None = None, groups: list[str] | None = None, country_code: str | None = None, @@ -873,21 +888,24 @@ async def create_proxy_configuration( ) -> ProxyConfiguration | None: """Create a ProxyConfiguration object with the passed proxy configuration. - Configures connection to a proxy server with the provided options. - Proxy servers are used to prevent target websites from blocking your crawlers based on IP address rate limits or blacklists. + Configures connection to a proxy server with the provided options. Proxy servers are used to prevent target + websites from blocking your crawlers based on IP address rate limits or blacklists. For more details and code examples, see the `ProxyConfiguration` class. Args: - actor_proxy_input: Proxy configuration field from the Actor input, if input has such input field. - If you pass this argument, all the other arguments will be inferred from it. - password: Password for the Apify Proxy. If not provided, will use os.environ['APIFY_PROXY_PASSWORD'], if available. + actor_proxy_input: Proxy configuration field from the Actor input, if input has such input field. If you + pass this argument, all the other arguments will be inferred from it. + password: Password for the Apify Proxy. If not provided, will use os.environ['APIFY_PROXY_PASSWORD'], + if available. groups: Proxy groups which the Apify Proxy should use, if provided. country_code: Country which the Apify Proxy should use, if provided. proxy_urls: Custom proxy server URLs which should be rotated through. 
new_url_function: Function which returns a custom proxy URL to be used. - Returns: ProxyConfiguration object with the passed configuration, or None, if no proxy should be used based on the configuration. + Returns: + ProxyConfiguration object with the passed configuration, or None, if no proxy should be used based + on the configuration. """ self._raise_if_not_initialized() diff --git a/src/apify/_configuration.py b/src/apify/_configuration.py index e65c7102..ad410d18 100644 --- a/src/apify/_configuration.py +++ b/src/apify/_configuration.py @@ -161,7 +161,8 @@ class Configuration(CrawleeConfiguration): Field( alias='apify_sdk_latest_version', deprecated=True, - description='Specifies the most recent release version of the Apify SDK for Javascript. Used for checking for updates.', + description='Specifies the most recent release version of the Apify SDK for Javascript. Used for ' + 'checking for updates.', ), ] = None diff --git a/src/apify/_crypto.py b/src/apify/_crypto.py index 499beaa0..c97549fc 100644 --- a/src/apify/_crypto.py +++ b/src/apify/_crypto.py @@ -37,8 +37,15 @@ def public_encrypt(value: str, *, public_key: rsa.RSAPublicKey) -> dict: password_bytes = key_bytes + initialized_vector_bytes - # NOTE: Auth Tag is appended to the end of the encrypted data, it has length of 16 bytes and ensures integrity of the data. - cipher = Cipher(algorithms.AES(key_bytes), modes.GCM(initialized_vector_bytes, min_tag_length=ENCRYPTION_AUTH_TAG_LENGTH)) + # NOTE: Auth Tag is appended to the end of the encrypted data, it has length of 16 bytes and ensures integrity + # of the data. 
+ cipher = Cipher( + algorithms.AES(key_bytes), + modes.GCM( + initialized_vector_bytes, + min_tag_length=ENCRYPTION_AUTH_TAG_LENGTH, + ), + ) encryptor = cipher.encryptor() encrypted_value_bytes = encryptor.update(value_bytes) + encryptor.finalize() encrypted_password_bytes = public_key.encrypt( @@ -94,7 +101,9 @@ def private_decrypt( initialization_vector_bytes = password_bytes[ENCRYPTION_KEY_LENGTH:] try: - cipher = Cipher(algorithms.AES(encryption_key_bytes), modes.GCM(initialization_vector_bytes, authentication_tag_bytes)) + cipher = Cipher( + algorithms.AES(encryption_key_bytes), modes.GCM(initialization_vector_bytes, authentication_tag_bytes) + ) decryptor = cipher.decryptor() decipher_bytes = decryptor.update(encrypted_data_bytes) + decryptor.finalize() except InvalidTagException as exc: @@ -124,21 +133,21 @@ def _load_public_key(public_key_file_base64: str) -> rsa.RSAPublicKey: return public_key -def decrypt_input_secrets(private_key: rsa.RSAPrivateKey, input: Any) -> Any: +def decrypt_input_secrets(private_key: rsa.RSAPrivateKey, input_data: Any) -> Any: """Decrypt input secrets.""" - if not isinstance(input, dict): - return input + if not isinstance(input_data, dict): + return input_data - for key, value in input.items(): + for key, value in input_data.items(): if isinstance(value, str): match = ENCRYPTED_INPUT_VALUE_REGEXP.fullmatch(value) if match: encrypted_password = match.group(1) encrypted_value = match.group(2) - input[key] = private_decrypt( + input_data[key] = private_decrypt( encrypted_password, encrypted_value, private_key=private_key, ) - return input + return input_data diff --git a/src/apify/_platform_event_manager.py b/src/apify/_platform_event_manager.py index 0eb0dda6..ed4c2034 100644 --- a/src/apify/_platform_event_manager.py +++ b/src/apify/_platform_event_manager.py @@ -11,7 +11,14 @@ from apify_shared.utils import ignore_docs from crawlee.events._event_manager import EventManager, EventManagerOptions from 
crawlee.events._local_event_manager import LocalEventManager -from crawlee.events._types import Event, EventAbortingData, EventExitData, EventMigratingData, EventPersistStateData, EventSystemInfoData +from crawlee.events._types import ( + Event, + EventAbortingData, + EventExitData, + EventMigratingData, + EventPersistStateData, + EventSystemInfoData, +) from apify._log import logger @@ -146,7 +153,9 @@ async def __aenter__(self) -> Self: # Run tasks but don't await them if self._config.actor_events_ws_url: - self._process_platform_messages_task = asyncio.create_task(self._process_platform_messages(self._config.actor_events_ws_url)) + self._process_platform_messages_task = asyncio.create_task( + self._process_platform_messages(self._config.actor_events_ws_url) + ) is_connected = await self._connected_to_platform_websocket if not is_connected: raise RuntimeError('Error connecting to platform events websocket!') @@ -181,7 +190,10 @@ async def _process_platform_messages(self, ws_url: str) -> None: parsed_message = event_data_adapter.validate_json(message) if isinstance(parsed_message, UnknownEvent): - logger.info(f'Unknown message received: event_name={parsed_message.name}, event_data={parsed_message.data}') + logger.info( + f'Unknown message received: event_name={parsed_message.name}, ' + f'event_data={parsed_message.data}' + ) continue self.emit( diff --git a/src/apify/_proxy_configuration.py b/src/apify/_proxy_configuration.py index 47347c8b..cfa9bed9 100644 --- a/src/apify/_proxy_configuration.py +++ b/src/apify/_proxy_configuration.py @@ -72,17 +72,15 @@ class ProxyInfo(CrawleeProxyInfo): """Provides information about a proxy connection that is used for requests.""" groups: list[str] = field(default_factory=list) - """An array of proxy groups to be used by the [Apify Proxy](https://docs.apify.com/proxy). - If not provided, the proxy will select the groups automatically. 
- """ + """An array of proxy groups to be used by the [Apify Proxy](https://docs.apify.com/proxy). If not provided, + the proxy will select the groups automatically.""" country_code: str | None = None - """If set and relevant proxies are available in your Apify account, all proxied requests will - use IP addresses that are geolocated to the specified country. For example `GB` for IPs - from Great Britain. Note that online services often have their own rules for handling - geolocation and thus the country selection is a best attempt at geolocation, rather than - a guaranteed hit. This parameter is optional, by default, each proxied request is assigned - an IP address from a random country. The country code needs to be a two letter ISO country code. + """If set and relevant proxies are available in your Apify account, all proxied requests will use IP addresses + that are geolocated to the specified country. For example `GB` for IPs from Great Britain. Note that online + services often have their own rules for handling geolocation and thus the country selection is a best attempt + at geolocation, rather than a guaranteed hit. This parameter is optional, by default, each proxied request is + assigned an IP address from a random country. The country code needs to be a two letter ISO country code. See the [full list of available country codes](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2#Officially_assigned_code_elements). This parameter is optional, by default, the proxy uses all available proxy servers from all countries. """ @@ -91,13 +89,13 @@ class ProxyInfo(CrawleeProxyInfo): class ProxyConfiguration(CrawleeProxyConfiguration): """Configures a connection to a proxy server with the provided options. - Proxy servers are used to prevent target websites from blocking your crawlers based on IP address rate limits or blacklists. - The default servers used by this class are managed by [Apify Proxy](https://docs.apify.com/proxy). 
- To be able to use Apify Proxy, you need an Apify account and access to the selected proxies. If you provide no configuration option, - the proxies will be managed automatically using a smart algorithm. + Proxy servers are used to prevent target websites from blocking your crawlers based on IP address rate limits or + blacklists. The default servers used by this class are managed by [Apify Proxy](https://docs.apify.com/proxy). + To be able to use Apify Proxy, you need an Apify account and access to the selected proxies. If you provide + no configuration option, the proxies will be managed automatically using a smart algorithm. - If you want to use your own proxies, use the `proxy_urls` or `new_url_function` constructor options. - Your list of proxy URLs will be rotated by the configuration, if this option is provided. + If you want to use your own proxies, use the `proxy_urls` or `new_url_function` constructor options. Your list + of proxy URLs will be rotated by the configuration, if this option is provided. """ _configuration: Configuration @@ -115,10 +113,13 @@ def __init__( _actor_config: Configuration | None = None, _apify_client: ApifyClientAsync | None = None, ) -> None: - """Create a ProxyConfiguration instance. It is highly recommended to use `Actor.create_proxy_configuration()` instead of this. + """Create a ProxyConfiguration instance. + + It is highly recommended to use `Actor.create_proxy_configuration()` instead of this. Args: - password: Password for the Apify Proxy. If not provided, will use os.environ['APIFY_PROXY_PASSWORD'], if available. + password: Password for the Apify Proxy. If not provided, will use os.environ['APIFY_PROXY_PASSWORD'], + if available. groups: Proxy groups which the Apify Proxy should use, if provided. country_code: Country which the Apify Proxy should use, if provided. proxy_urls: Custom proxy server URLs which should be rotated through. 
@@ -145,14 +146,17 @@ def __init__( if proxy_urls and any('apify.com' in url for url in proxy_urls): logger.warning( - 'Some Apify proxy features may work incorrectly. Please consider setting up Apify properties instead of `proxy_urls`.\n' + 'Some Apify proxy features may work incorrectly. Please consider setting up Apify properties ' + 'instead of `proxy_urls`.\n' 'See https://sdk.apify.com/docs/guides/proxy-management#apify-proxy-configuration' ) self._uses_apify_proxy = not (proxy_urls or new_url_function or tiered_proxy_urls) super().__init__( - proxy_urls=[f'http://{_actor_config.proxy_hostname}:{_actor_config.proxy_port}'] if self._uses_apify_proxy else proxy_urls, + proxy_urls=[f'http://{_actor_config.proxy_hostname}:{_actor_config.proxy_port}'] + if self._uses_apify_proxy + else proxy_urls, new_url_function=new_url_function, tiered_proxy_urls=tiered_proxy_urls, ) @@ -170,13 +174,13 @@ def __init__( self._country_code = country_code async def initialize(self) -> None: - """Load the Apify Proxy password if the API token is provided and check access to Apify Proxy and provided proxy groups. + """Load the Apify Proxy password if the API token is provided and check access to Apify Proxy and proxy groups. - Only called if Apify Proxy configuration is used. - Also checks if country has access to Apify Proxy groups if the country code is provided. + Only called if Apify Proxy configuration is used. Also checks if country has access to Apify Proxy groups + if the country code is provided. - You should use the Actor.create_proxy_configuration function - to create a pre-initialized `ProxyConfiguration` instance instead of calling this manually. + You should use the Actor.create_proxy_configuration function to create a pre-initialized + `ProxyConfiguration` instance instead of calling this manually. """ if self._uses_apify_proxy: await self._maybe_fetch_password() @@ -190,18 +194,19 @@ async def new_proxy_info( ) -> ProxyInfo | None: """Create a new ProxyInfo object. 
- Use it if you want to work with a rich representation of a proxy URL. - If you need the URL string only, use `ProxyConfiguration.new_url`. + Use it if you want to work with a rich representation of a proxy URL. If you need the URL string only, + use `ProxyConfiguration.new_url`. Args: session_id: Represents the identifier of a proxy session (https://docs.apify.com/proxy#sessions). - All the HTTP requests going through the proxy with the same session identifier - will use the same target proxy server (i.e. the same IP address). - The identifier must not be longer than 50 characters and include only the following: `0-9`, `a-z`, `A-Z`, `"."`, `"_"` and `"~"`. - request: request for which the proxy info is being issued, used in proxy tier handling - proxy_tier: allows forcing the proxy tier to be used - - Returns: Dictionary that represents information about the proxy and its configuration. + All the HTTP requests going through the proxy with the same session identifier will use the same + target proxy server (i.e. the same IP address). The identifier must not be longer than 50 characters + and include only the following: `0-9`, `a-z`, `A-Z`, `"."`, `"_"` and `"~"`. + request: request for which the proxy info is being issued, used in proxy tier handling. + proxy_tier: allows forcing the proxy tier to be used. + + Returns: + Dictionary that represents information about the proxy and its configuration. """ if session_id is not None: _check(session_id, label='session_id', max_length=SESSION_ID_MAX_LENGTH, pattern=APIFY_PROXY_VALUE_REGEX) @@ -250,17 +255,17 @@ async def _maybe_fetch_password(self) -> None: if self._password: if self._password != password: logger.warning( - 'The Apify Proxy password you provided belongs to' - ' a different user than the Apify token you are using. Are you sure this is correct?' + 'The Apify Proxy password you provided belongs to a different user than the Apify ' + 'token you are using. Are you sure this is correct?' 
) else: self._password = password if not self._password: raise ValueError( - 'Apify Proxy password must be provided using the "password" constructor argument' - f' or the "{ApifyEnvVars.PROXY_PASSWORD}" environment variable.' - f' If you add the "{ApifyEnvVars.TOKEN}" environment variable, the password will be automatically inferred.' + 'Apify Proxy password must be provided using the "password" constructor argument ' + f'or the "{ApifyEnvVars.PROXY_PASSWORD}" environment variable. If you add ' + f'the "{ApifyEnvVars.TOKEN}" environment variable, the password will be automatically inferred.' ) async def _check_access(self) -> None: @@ -288,8 +293,8 @@ async def _check_access(self) -> None: self.is_man_in_the_middle = status['isManInTheMiddle'] else: logger.warning( - 'Apify Proxy access check timed out. Watch out for errors with status code 407. ' - "If you see some, it most likely means you don't have access to either all or some of the proxies you're trying to use." + 'Apify Proxy access check timed out. Watch out for errors with status code 407. If you see some, it ' + 'most likely means you do not have access to either all or some of the proxies you are trying to use.' 
) def _get_username(self, session_id: int | str | None = None) -> str: diff --git a/src/apify/apify_storage_client/_dataset_client.py b/src/apify/apify_storage_client/_dataset_client.py index dd10ced8..ad211d04 100644 --- a/src/apify/apify_storage_client/_dataset_client.py +++ b/src/apify/apify_storage_client/_dataset_client.py @@ -48,7 +48,7 @@ async def list_items( self, *, offset: int | None = 0, - limit: int | None = BaseDatasetClient._LIST_ITEMS_LIMIT, + limit: int | None = BaseDatasetClient._LIST_ITEMS_LIMIT, # noqa: SLF001 clean: bool = False, desc: bool = False, fields: list[str] | None = None, diff --git a/src/apify/apify_storage_client/_key_value_store_client.py b/src/apify/apify_storage_client/_key_value_store_client.py index d02d18cb..48ab72cb 100644 --- a/src/apify/apify_storage_client/_key_value_store_client.py +++ b/src/apify/apify_storage_client/_key_value_store_client.py @@ -5,7 +5,12 @@ from typing_extensions import override -from crawlee.base_storage_client import BaseKeyValueStoreClient, KeyValueStoreListKeysPage, KeyValueStoreMetadata, KeyValueStoreRecord +from crawlee.base_storage_client import ( + BaseKeyValueStoreClient, + KeyValueStoreListKeysPage, + KeyValueStoreMetadata, + KeyValueStoreRecord, +) if TYPE_CHECKING: from collections.abc import AsyncIterator diff --git a/src/apify/apify_storage_client/_key_value_store_collection_client.py b/src/apify/apify_storage_client/_key_value_store_collection_client.py index 27f76f37..cf22821e 100644 --- a/src/apify/apify_storage_client/_key_value_store_collection_client.py +++ b/src/apify/apify_storage_client/_key_value_store_collection_client.py @@ -40,4 +40,11 @@ async def list( offset: int | None = None, desc: bool = False, ) -> KeyValueStoreListPage: - return KeyValueStoreListPage.model_validate(await self._client.list(unnamed=unnamed, limit=limit, offset=offset, desc=desc)) + return KeyValueStoreListPage.model_validate( + await self._client.list( + unnamed=unnamed, + limit=limit, + offset=offset, 
+ desc=desc, + ) + ) diff --git a/src/apify/scrapy/middlewares/apify_proxy.py b/src/apify/scrapy/middlewares/apify_proxy.py index 3120f972..3e64508e 100644 --- a/src/apify/scrapy/middlewares/apify_proxy.py +++ b/src/apify/scrapy/middlewares/apify_proxy.py @@ -49,14 +49,18 @@ def from_crawler(cls: type[ApifyHttpProxyMiddleware], crawler: Crawler) -> Apify proxy_settings: dict | None = crawler.settings.get('APIFY_PROXY_SETTINGS') if proxy_settings is None: - Actor.log.warning('ApifyHttpProxyMiddleware is not going to be used. Object "proxyConfiguration" is probably missing in the Actor input.') + Actor.log.warning( + 'ApifyHttpProxyMiddleware is not going to be used. Object "proxyConfiguration" is probably missing ' + 'in the Actor input.' + ) raise NotConfigured use_apify_proxy = proxy_settings.get('useApifyProxy', False) if use_apify_proxy is not True: Actor.log.warning( - 'ApifyHttpProxyMiddleware is not going to be used. Actor input field "proxyConfiguration.useApifyProxy" is probably set to False.' + 'ApifyHttpProxyMiddleware is not going to be used. Actor input field ' + '"proxyConfiguration.useApifyProxy" is probably set to False.' ) raise NotConfigured @@ -106,7 +110,10 @@ def process_exception( ) if isinstance(exception, TunnelError): - Actor.log.warning(f'ApifyHttpProxyMiddleware: TunnelError occurred for request="{request}", reason="{exception}", skipping...') + Actor.log.warning( + f'ApifyHttpProxyMiddleware: TunnelError occurred for request="{request}", ' + f'reason="{exception}", skipping...' + ) return request return None @@ -129,7 +136,9 @@ async def _get_new_proxy_url(self: ApifyHttpProxyMiddleware) -> ParseResult: # If the proxy configuration is still not available, raise an error. However, this should not happen due # to the checks in the `from_crawler` method. if proxy_cfg is None: - Actor.log.error('Creation of proxy configuration failed. 
Check the field "proxyConfiguration" in the Actor input.') + Actor.log.error( + 'Creation of proxy configuration failed. Check the field "proxyConfiguration" in the Actor input.' + ) raise NotConfigured # Store the proxy configuration for future use diff --git a/src/apify/scrapy/requests.py b/src/apify/scrapy/requests.py index 6d2fd348..7287a69a 100644 --- a/src/apify/scrapy/requests.py +++ b/src/apify/scrapy/requests.py @@ -60,7 +60,7 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest else: unique_key = crypto_random_object_id(8) - if scrapy_request.meta.get('apify_request_id'): # noqa: SIM108 + if scrapy_request.meta.get('apify_request_id'): request_id = scrapy_request.meta['apify_request_id'] else: request_id = unique_key_to_request_id(unique_key) @@ -78,7 +78,9 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> CrawleeRequest if isinstance(scrapy_request.headers, Headers): apify_request.headers = dict(scrapy_request.headers.to_unicode_dict()) else: - Actor.log.warning(f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}') + Actor.log.warning( + f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}' + ) # Serialize the Scrapy Request and store it in the apify_request. 
# - This process involves converting the Scrapy Request object into a dictionary, encoding it to base64, @@ -140,7 +142,8 @@ def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request: # Update the meta field with the meta field from the apify_request meta = scrapy_request.meta or {} meta.update({'apify_request_id': apify_request.id, 'apify_request_unique_key': apify_request.unique_key}) - scrapy_request._meta = meta # scrapy_request.meta is a property, so we have to set it like this + # scrapy_request.meta is a property, so we have to set it like this + scrapy_request._meta = meta # noqa: SLF001 # If the apify_request comes directly from the Request Queue, typically start URLs else: @@ -161,7 +164,8 @@ def to_scrapy_request(apify_request: CrawleeRequest, spider: Spider) -> Request: scrapy_request.headers = Headers(apify_request.headers) else: Actor.log.warning( - f'apify_request[headers] is not an instance of the dict class, apify_request[headers] = {apify_request.headers}', + 'apify_request[headers] is not an instance of the dict class, ' + f'apify_request[headers] = {apify_request.headers}', ) # Add optional 'userData' field diff --git a/src/apify/scrapy/scheduler.py b/src/apify/scrapy/scheduler.py index db8f6ad0..36d149c9 100644 --- a/src/apify/scrapy/scheduler.py +++ b/src/apify/scrapy/scheduler.py @@ -130,7 +130,9 @@ def next_request(self: ApifyScheduler) -> Request | None: traceback.print_exc() raise - Actor.log.debug(f'[{call_id}]: a new apify_request from the scheduler was fetched (apify_request={apify_request})') + Actor.log.debug( + f'[{call_id}]: a new apify_request from the scheduler was fetched (apify_request={apify_request})' + ) if apify_request is None: return None @@ -148,6 +150,7 @@ def next_request(self: ApifyScheduler) -> Request | None: scrapy_request = to_scrapy_request(apify_request, spider=self.spider) Actor.log.debug( - f'[{call_id}]: apify_request was transformed to the scrapy_request which is gonna be returned 
(scrapy_request={scrapy_request})', + f'[{call_id}]: apify_request was transformed to the scrapy_request which is gonna be returned ' + f'(scrapy_request={scrapy_request})', ) return scrapy_request diff --git a/src/apify/scrapy/utils.py b/src/apify/scrapy/utils.py index dbd43a2b..5afadb82 100644 --- a/src/apify/scrapy/utils.py +++ b/src/apify/scrapy/utils.py @@ -10,7 +10,8 @@ from scrapy.utils.python import to_bytes except ImportError as exc: raise ImportError( - 'To use this module, you need to install the "scrapy" extra. For example, if you use pip, run "pip install apify[scrapy]".', + 'To use this module, you need to install the "scrapy" extra. For example, if you use pip, run ' + '"pip install apify[scrapy]".' ) from exc diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 040bc71a..c1c2d6bb 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -45,7 +45,7 @@ def _reset_and_patch_default_instances() -> None: # because `httpx.AsyncClient` in `ApifyClientAsync` tries to reuse the same event loop across requests, # but `pytest-asyncio` closes the event loop after each test, # and uses a new one for the next test. -@pytest.fixture() +@pytest.fixture def apify_client_async() -> ApifyClientAsync: api_token = os.getenv(TOKEN_ENV_VAR) api_url = os.getenv(API_URL_ENV_VAR) @@ -91,9 +91,8 @@ def sdk_wheel_path(tmp_path_factory: pytest.TempPathFactory, testrun_uid: str) - def actor_base_source_files(sdk_wheel_path: Path) -> dict[str, str | bytes]: """Create a dictionary of the base source files for a testing Actor. - It takes the files from `tests/integration/actor_source_base`, - builds the Apify SDK wheel from the current codebase, - and adds them all together in a dictionary. + It takes the files from `tests/integration/actor_source_base`, builds the Apify SDK wheel from + the current codebase, and adds them all together in a dictionary. 
""" source_files: dict[str, str | bytes] = {} @@ -113,11 +112,17 @@ def actor_base_source_files(sdk_wheel_path: Path) -> dict[str, str | bytes]: sdk_wheel_file_name = sdk_wheel_path.name source_files[sdk_wheel_file_name] = sdk_wheel_path.read_bytes() - source_files['requirements.txt'] = str(source_files['requirements.txt']).replace('APIFY_SDK_WHEEL_PLACEHOLDER', f'./{sdk_wheel_file_name}') + source_files['requirements.txt'] = str(source_files['requirements.txt']).replace( + 'APIFY_SDK_WHEEL_PLACEHOLDER', f'./{sdk_wheel_file_name}' + ) current_major_minor_python_version = '.'.join([str(x) for x in sys.version_info[:2]]) - integration_tests_python_version = os.getenv('INTEGRATION_TESTS_PYTHON_VERSION') or current_major_minor_python_version - source_files['Dockerfile'] = str(source_files['Dockerfile']).replace('BASE_IMAGE_VERSION_PLACEHOLDER', integration_tests_python_version) + integration_tests_python_version = ( + os.getenv('INTEGRATION_TESTS_PYTHON_VERSION') or current_major_minor_python_version + ) + source_files['Dockerfile'] = str(source_files['Dockerfile']).replace( + 'BASE_IMAGE_VERSION_PLACEHOLDER', integration_tests_python_version + ) return source_files @@ -134,8 +139,11 @@ def __call__( ) -> Awaitable[ActorClientAsync]: ... -@pytest.fixture() -async def make_actor(actor_base_source_files: dict[str, str | bytes], apify_client_async: ApifyClientAsync) -> AsyncIterator[ActorFactory]: +@pytest.fixture +async def make_actor( + actor_base_source_files: dict[str, str | bytes], + apify_client_async: ApifyClientAsync, +) -> AsyncIterator[ActorFactory]: """A fixture for returning a temporary Actor factory.""" actor_clients_for_cleanup: list[ActorClientAsync] = [] @@ -148,7 +156,8 @@ async def _make_actor( ) -> ActorClientAsync: """Create a temporary Actor from the given main function or source file(s). - The Actor will be uploaded to the Apify Platform, built there, and after the test finishes, it will be automatically deleted. 
+ The Actor will be uploaded to the Apify Platform, built there, and after the test finishes, it will + be automatically deleted. You have to pass exactly one of the `main_func`, `main_py` and `source_files` arguments. @@ -158,7 +167,8 @@ async def _make_actor( main_py: The `src/main.py` file of the Actor. source_files: A dictionary of the source files of the Actor. - Returns: A resource client for the created Actor. + Returns: + A resource client for the created Actor. """ if not (main_func or main_py or source_files): raise TypeError('One of `main_func`, `main_py` or `source_files` arguments must be specified') @@ -189,7 +199,8 @@ async def _make_actor( assert source_files is not None - # Copy the source files dict from the fixture so that we're not overwriting it, and merge the passed argument in it + # Copy the source files dict from the fixture so that we're not overwriting it, and merge the passed + # argument in it. actor_source_files = actor_base_source_files.copy() actor_source_files.update(source_files) diff --git a/tests/integration/test_actor_api_helpers.py b/tests/integration/test_actor_api_helpers.py index 589528fa..a0788a69 100644 --- a/tests/integration/test_actor_api_helpers.py +++ b/tests/integration/test_actor_api_helpers.py @@ -136,7 +136,9 @@ async def main_outer() -> None: inner_actor_id = (await inner_actor.get() or {})['id'] test_value = crypto_random_object_id() - outer_run_result = await outer_actor.call(run_input={'test_value': test_value, 'inner_actor_id': inner_actor_id}) + outer_run_result = await outer_actor.call( + run_input={'test_value': test_value, 'inner_actor_id': inner_actor_id} + ) assert outer_run_result is not None assert outer_run_result['status'] == 'SUCCEEDED' @@ -177,7 +179,9 @@ async def main_outer() -> None: inner_actor_id = (await inner_actor.get() or {})['id'] test_value = crypto_random_object_id() - outer_run_result = await outer_actor.call(run_input={'test_value': test_value, 'inner_actor_id': inner_actor_id}) + 
outer_run_result = await outer_actor.call( + run_input={'test_value': test_value, 'inner_actor_id': inner_actor_id} + ) assert outer_run_result is not None assert outer_run_result['status'] == 'SUCCEEDED' @@ -316,7 +320,9 @@ async def main_outer() -> None: inner_actor_id = (await inner_actor.get() or {})['id'] test_value = crypto_random_object_id() - outer_run_result = await outer_actor.call(run_input={'test_value': test_value, 'inner_actor_id': inner_actor_id}) + outer_run_result = await outer_actor.call( + run_input={'test_value': test_value, 'inner_actor_id': inner_actor_id} + ) assert outer_run_result is not None assert outer_run_result['status'] == 'SUCCEEDED' @@ -425,7 +431,9 @@ async def main_client() -> None: server_actor_initialized = await server_actor.last_run().key_value_store().get_record('INITIALIZED') await asyncio.sleep(1) - client_actor_run_result = await client_actor.call(run_input={'server_actor_container_url': server_actor_container_url}) + client_actor_run_result = await client_actor.call( + run_input={'server_actor_container_url': server_actor_container_url} + ) assert client_actor_run_result is not None assert client_actor_run_result['status'] == 'SUCCEEDED' diff --git a/tests/integration/test_actor_create_proxy_configuration.py b/tests/integration/test_actor_create_proxy_configuration.py index 50c5f78b..bd841a58 100644 --- a/tests/integration/test_actor_create_proxy_configuration.py +++ b/tests/integration/test_actor_create_proxy_configuration.py @@ -61,7 +61,10 @@ async def main() -> None: } ) assert proxy_configuration is not None - assert await proxy_configuration.new_url() == f'http://groups-{"+".join(groups)},country-{country_code}:{proxy_url_suffix}' + assert ( + await proxy_configuration.new_url() + == f'http://groups-{"+".join(groups)},country-{country_code}:{proxy_url_suffix}' + ) await Actor.exit() diff --git a/tests/integration/test_actor_events.py b/tests/integration/test_actor_events.py index f1a89ace..01b45bbe 100644 --- 
a/tests/integration/test_actor_events.py +++ b/tests/integration/test_actor_events.py @@ -60,8 +60,12 @@ async def log_event(data: Any) -> None: assert run_result is not None assert run_result['status'] == 'SUCCEEDED' dataset_items_page = await actor.last_run().dataset().list_items() - persist_state_events = [item for item in dataset_items_page.items if item['event_type'] == ActorEventTypes.PERSIST_STATE] - system_info_events = [item for item in dataset_items_page.items if item['event_type'] == ActorEventTypes.SYSTEM_INFO] + persist_state_events = [ + item for item in dataset_items_page.items if item['event_type'] == ActorEventTypes.PERSIST_STATE + ] + system_info_events = [ + item for item in dataset_items_page.items if item['event_type'] == ActorEventTypes.SYSTEM_INFO + ] assert len(persist_state_events) > 2 assert len(system_info_events) > 0 diff --git a/tests/integration/test_actor_log.py b/tests/integration/test_actor_log.py index dcfe8d8d..dcbb06ed 100644 --- a/tests/integration/test_actor_log.py +++ b/tests/integration/test_actor_log.py @@ -73,7 +73,9 @@ async def main() -> None: assert run_log_lines.pop(0) == 'ACTOR: Creating Docker container.' assert run_log_lines.pop(0) == 'ACTOR: Starting Docker container.' assert run_log_lines.pop(0) == '[apify] INFO Initializing Actor...' 
- assert run_log_lines.pop(0).startswith(f'[apify] INFO System info ({{"apify_sdk_version": "{__version__}", "apify_client_version": "') + assert run_log_lines.pop(0).startswith( + f'[apify] INFO System info ({{"apify_sdk_version": "{__version__}", "apify_client_version": "' + ) assert run_log_lines.pop(0) == '[apify] DEBUG Debug message' assert run_log_lines.pop(0) == '[apify] INFO Info message' assert run_log_lines.pop(0) == '[apify] WARN Warning message' diff --git a/tests/unit/actor/test_actor_create_proxy_configuration.py b/tests/unit/actor/test_actor_create_proxy_configuration.py index 8dd4db95..5111b452 100644 --- a/tests/unit/actor/test_actor_create_proxy_configuration.py +++ b/tests/unit/actor/test_actor_create_proxy_configuration.py @@ -18,7 +18,7 @@ DUMMY_PASSWORD = 'DUMMY_PASSWORD' -@pytest.fixture() +@pytest.fixture def patched_apify_client(apify_client_async_patcher: ApifyClientAsyncPatcher) -> ApifyClientAsync: apify_client_async_patcher.patch('user', 'get', return_value={'proxy': {'password': DUMMY_PASSWORD}}) return ApifyClientAsync() @@ -134,7 +134,10 @@ async def test_create_proxy_configuration_actor_proxy_input( } ) assert proxy_configuration is not None - assert await proxy_configuration.new_url() == f'http://groups-{"+".join(groups)},country-{country_code}:{DUMMY_PASSWORD}@proxy.apify.com:8000' + assert ( + await proxy_configuration.new_url() + == f'http://groups-{"+".join(groups)},country-{country_code}:{DUMMY_PASSWORD}@proxy.apify.com:8000' + ) assert len(patched_apify_client.calls['user']['get']) == 2 # type: ignore assert len(route.calls) == 2 diff --git a/tests/unit/actor/test_actor_env_helpers.py b/tests/unit/actor/test_actor_env_helpers.py index 36a5268f..0d583388 100644 --- a/tests/unit/actor/test_actor_env_helpers.py +++ b/tests/unit/actor/test_actor_env_helpers.py @@ -7,7 +7,15 @@ from pydantic_core import TzInfo -from apify_shared.consts import BOOL_ENV_VARS, DATETIME_ENV_VARS, FLOAT_ENV_VARS, INTEGER_ENV_VARS, STRING_ENV_VARS, 
ActorEnvVars, ApifyEnvVars +from apify_shared.consts import ( + BOOL_ENV_VARS, + DATETIME_ENV_VARS, + FLOAT_ENV_VARS, + INTEGER_ENV_VARS, + STRING_ENV_VARS, + ActorEnvVars, + ApifyEnvVars, +) from apify import Actor @@ -30,7 +38,7 @@ async def test_is_at_home_on_apify(self, monkeypatch: pytest.MonkeyPatch) -> Non class TestGetEnv: - async def test_get_env_use_env_vars(self, monkeypatch: pytest.MonkeyPatch) -> None: # noqa: PLR0912 + async def test_get_env_use_env_vars(self, monkeypatch: pytest.MonkeyPatch) -> None: ignored_env_vars = { ApifyEnvVars.INPUT_KEY, ApifyEnvVars.MEMORY_MBYTES, @@ -87,7 +95,10 @@ async def test_get_env_use_env_vars(self, monkeypatch: pytest.MonkeyPatch) -> No datetime_get_env_var = datetime_env_var.name.lower() expected_get_env[datetime_get_env_var] = datetime.now(TzInfo(0)) # type: ignore - monkeypatch.setenv(datetime_env_var, expected_get_env[datetime_get_env_var].strftime('%Y-%m-%dT%H:%M:%S.%fZ')) + monkeypatch.setenv( + datetime_env_var, + expected_get_env[datetime_get_env_var].strftime('%Y-%m-%dT%H:%M:%S.%fZ'), + ) for string_env_var in STRING_ENV_VARS: if string_env_var in ignored_env_vars: @@ -109,7 +120,9 @@ async def test_get_env_use_env_vars(self, monkeypatch: pytest.MonkeyPatch) -> No expected_get_env[env_name] = timedelta(milliseconds=env_value) # Convert dedicated_cpus to float - expected_get_env[ApifyEnvVars.DEDICATED_CPUS.name.lower()] = float(expected_get_env[ApifyEnvVars.DEDICATED_CPUS.name.lower()]) + expected_get_env[ApifyEnvVars.DEDICATED_CPUS.name.lower()] = float( + expected_get_env[ApifyEnvVars.DEDICATED_CPUS.name.lower()] + ) # Update expectations for legacy configuration for old_name, new_name in legacy_env_vars.items(): diff --git a/tests/unit/actor/test_actor_helpers.py b/tests/unit/actor/test_actor_helpers.py index 0d6a08d0..33997d80 100644 --- a/tests/unit/actor/test_actor_helpers.py +++ b/tests/unit/actor/test_actor_helpers.py @@ -118,7 +118,9 @@ async def test_actor_add_webhook_not_work_locally( caplog: 
pytest.LogCaptureFixture, ) -> None: async with Actor: - await Actor.add_webhook(event_types=[WebhookEventType.ACTOR_BUILD_ABORTED], request_url='https://example.com') + await Actor.add_webhook( + event_types=[WebhookEventType.ACTOR_BUILD_ABORTED], request_url='https://example.com' + ) assert len(caplog.records) == 1 assert caplog.records[0].levelname == 'ERROR' diff --git a/tests/unit/actor/test_actor_key_value_store.py b/tests/unit/actor/test_actor_key_value_store.py index 5d855f36..b6ece50d 100644 --- a/tests/unit/actor/test_actor_key_value_store.py +++ b/tests/unit/actor/test_actor_key_value_store.py @@ -78,7 +78,7 @@ async def test_get_input_with_secrets( encrypted_secret = public_encrypt(secret_string, public_key=PUBLIC_KEY) input_with_secret = { 'foo': 'bar', - 'secret': f'{ENCRYPTED_INPUT_VALUE_PREFIX}:{encrypted_secret["encrypted_password"]}:{encrypted_secret["encrypted_value"]}', + 'secret': f'{ENCRYPTED_INPUT_VALUE_PREFIX}:{encrypted_secret["encrypted_password"]}:{encrypted_secret["encrypted_value"]}', # noqa: E501 } await memory_storage_client.key_value_stores().get_or_create(id='default') diff --git a/tests/unit/actor/test_actor_log.py b/tests/unit/actor/test_actor_log.py index 9103c7c3..6d0454e1 100644 --- a/tests/unit/actor/test_actor_log.py +++ b/tests/unit/actor/test_actor_log.py @@ -15,7 +15,11 @@ class TestActorLog: - async def test_actor_log(self: TestActorLog, caplog: pytest.LogCaptureFixture, monkeypatch: pytest.MonkeyPatch) -> None: + async def test_actor_log( + self: TestActorLog, + caplog: pytest.LogCaptureFixture, + monkeypatch: pytest.MonkeyPatch, + ) -> None: caplog.set_level(logging.DEBUG, logger='apify') monkeypatch.setenv('APIFY_IS_AT_HOME', '1') diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 2c441883..fd74d99e 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -19,7 +19,7 @@ from pathlib import Path -@pytest.fixture() +@pytest.fixture def reset_default_instances() -> Callable[[], None]: def 
reset() -> None: from crawlee.storages._creation_management import ( @@ -51,7 +51,11 @@ def reset() -> None: # To isolate the tests, we need to reset the used singletons before each test case # We also set the MemoryStorageClient to use a temp path @pytest.fixture(autouse=True) -def _reset_and_patch_default_instances(monkeypatch: pytest.MonkeyPatch, tmp_path: Path, reset_default_instances: Callable[[], None]) -> None: +def _reset_and_patch_default_instances( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, + reset_default_instances: Callable[[], None], +) -> None: # This forces the MemoryStorageClient to use tmp_path for its storage dir monkeypatch.setenv(ApifyEnvVars.LOCAL_STORAGE_DIR, str(tmp_path)) @@ -119,7 +123,9 @@ def patch( original_submethod = getattr(client_method_return_type, submethod, None) if not original_submethod: - raise ValueError(f'apify_client.{client_method_return_type.__name__} does not contain method "{submethod}"!') + raise ValueError( + f'apify_client.{client_method_return_type.__name__} does not contain method "{submethod}"!' 
+ ) if is_async is None: is_async = inspect.iscoroutinefunction(original_submethod) @@ -163,12 +169,12 @@ def getattr_override(apify_client_instance: Any, attr_name: str) -> Any: self.monkeypatch.setattr(ApifyClientAsync, '__getattr__', getattr_override, raising=False) -@pytest.fixture() +@pytest.fixture def apify_client_async_patcher(monkeypatch: pytest.MonkeyPatch) -> ApifyClientAsyncPatcher: return ApifyClientAsyncPatcher(monkeypatch) -@pytest.fixture() +@pytest.fixture def memory_storage_client() -> MemoryStorageClient: configuration = CrawleeConfiguration() configuration.persist_storage = True diff --git a/tests/unit/scrapy/middlewares/test_apify_proxy.py b/tests/unit/scrapy/middlewares/test_apify_proxy.py index 379e1b45..6bddcb74 100644 --- a/tests/unit/scrapy/middlewares/test_apify_proxy.py +++ b/tests/unit/scrapy/middlewares/test_apify_proxy.py @@ -16,14 +16,14 @@ class DummySpider(Spider): name = 'dummy_spider' -@pytest.fixture() +@pytest.fixture def middleware() -> ApifyHttpProxyMiddleware: """Fixture to create an Apify HTTP proxy middleware.""" proxy_settings = {'useApifyProxy': True} return ApifyHttpProxyMiddleware(proxy_settings) -@pytest.fixture() +@pytest.fixture def crawler(monkeypatch: pytest.MonkeyPatch) -> Crawler: """Fixture to create a Scrapy crawler.""" crawler = Crawler(DummySpider) @@ -31,19 +31,19 @@ def crawler(monkeypatch: pytest.MonkeyPatch) -> Crawler: return crawler -@pytest.fixture() +@pytest.fixture def spider() -> DummySpider: """Fixture to create a "dummy" Scrapy spider.""" return DummySpider() -@pytest.fixture() +@pytest.fixture def dummy_request() -> Request: """Fixture to create a "dummy" Scrapy spider.""" return Request('https://example.com') -@pytest.fixture() +@pytest.fixture def proxy_configuration() -> ProxyConfiguration: """Fixture to create an Apify ProxyConfiguration object.""" return ProxyConfiguration() diff --git a/tests/unit/scrapy/pipelines/test_actor_dataset_push.py 
b/tests/unit/scrapy/pipelines/test_actor_dataset_push.py index 7c89999d..0eb59599 100644 --- a/tests/unit/scrapy/pipelines/test_actor_dataset_push.py +++ b/tests/unit/scrapy/pipelines/test_actor_dataset_push.py @@ -24,13 +24,13 @@ class TitleItem(Item): title = Field() -@pytest.fixture() +@pytest.fixture def spider() -> DummySpider: """Fixture to create a "dummy" Scrapy spider.""" return DummySpider() -@pytest.fixture() +@pytest.fixture def pipeline() -> ActorDatasetPushPipeline: """Fixture to create an Actor dataset push pipeline.""" return ActorDatasetPushPipeline() diff --git a/tests/unit/scrapy/requests/test_to_apify_request.py b/tests/unit/scrapy/requests/test_to_apify_request.py index 0116f5ec..a05a1116 100644 --- a/tests/unit/scrapy/requests/test_to_apify_request.py +++ b/tests/unit/scrapy/requests/test_to_apify_request.py @@ -11,7 +11,7 @@ class DummySpider(Spider): name = 'dummy_spider' -@pytest.fixture() +@pytest.fixture def spider() -> DummySpider: """Fixture to create a "dummy" Scrapy spider.""" return DummySpider() diff --git a/tests/unit/scrapy/requests/test_to_scrapy_request.py b/tests/unit/scrapy/requests/test_to_scrapy_request.py index 8c9ebe4f..4cc69196 100644 --- a/tests/unit/scrapy/requests/test_to_scrapy_request.py +++ b/tests/unit/scrapy/requests/test_to_scrapy_request.py @@ -15,7 +15,7 @@ class DummySpider(Spider): name = 'dummy_spider' -@pytest.fixture() +@pytest.fixture def spider() -> DummySpider: """Fixture to create a "dummy" Scrapy spider.""" return DummySpider() @@ -23,7 +23,13 @@ def spider() -> DummySpider: def test__to_scrapy_request__without_reconstruction(spider: Spider) -> None: # Without reconstruction of encoded Scrapy request - apify_request = CrawleeRequest(url='https://example.com', method='GET', unique_key='https://example.com', id='fvwscO2UJLdr10B', user_data={}) + apify_request = CrawleeRequest( + url='https://example.com', + method='GET', + unique_key='https://example.com', + id='fvwscO2UJLdr10B', + user_data={}, + ) 
scrapy_request = to_scrapy_request(apify_request, spider) diff --git a/tests/unit/test_event_manager.py b/tests/unit/test_event_manager.py index 80977e97..48d139b9 100644 --- a/tests/unit/test_event_manager.py +++ b/tests/unit/test_event_manager.py @@ -27,7 +27,10 @@ async def test_lifecycle_local(self, caplog: pytest.LogCaptureFixture) -> None: assert len(caplog.records) == 1 assert caplog.records[0].levelno == logging.DEBUG - assert caplog.records[0].message == 'APIFY_ACTOR_EVENTS_WS_URL env var not set, no events from Apify platform will be emitted.' + assert ( + caplog.records[0].message + == 'APIFY_ACTOR_EVENTS_WS_URL env var not set, no events from Apify platform will be emitted.' + ) async def test_event_handling_local(self) -> None: async with EventManager() as event_manager: @@ -164,7 +167,7 @@ async def send_platform_event(event_name: Event, data: Any = None) -> None: if data: message['data'] = data - websockets.broadcast(connected_ws_clients, json.dumps(message)) + websockets.broadcast(connected_ws_clients, json.dumps(message)) # type: ignore async with websockets.server.serve(handler, host='localhost') as ws_server: # When you don't specify a port explicitly, the websocket connection is opened on a random free port. 
diff --git a/tests/unit/test_proxy_configuration.py b/tests/unit/test_proxy_configuration.py index 7074e395..485d8342 100644 --- a/tests/unit/test_proxy_configuration.py +++ b/tests/unit/test_proxy_configuration.py @@ -67,7 +67,10 @@ def test__fails_with_invalid_arguments(self: TestProxyConfiguration) -> None: ProxyConfiguration(country_code=invalid_country_code) # type: ignore with pytest.raises(ValueError, match='Exactly one of .* must be specified'): - ProxyConfiguration(proxy_urls=['http://proxy.com:1111'], new_url_function=lambda session_id=None, request=None: 'http://proxy.com:2222') + ProxyConfiguration( + proxy_urls=['http://proxy.com:1111'], + new_url_function=lambda session_id=None, request=None: 'http://proxy.com:2222', + ) with pytest.raises(ValueError, match='Cannot combine custom proxies with Apify Proxy'): ProxyConfiguration(proxy_urls=['http://proxy.com:1111'], groups=['GROUP1']) @@ -76,7 +79,9 @@ def test__fails_with_invalid_arguments(self: TestProxyConfiguration) -> None: ProxyConfiguration(proxy_urls=['bad-url']) with pytest.raises(ValueError, match='Cannot combine custom proxies with Apify Proxy'): - ProxyConfiguration(new_url_function=lambda session_id=None, request=None: 'http://proxy.com:2222', groups=['GROUP1']) + ProxyConfiguration( + new_url_function=lambda session_id=None, request=None: 'http://proxy.com:2222', groups=['GROUP1'] + ) class TestProxyConfigurationNewUrl: @@ -347,7 +352,7 @@ async def test_new_proxy_info_rotates_urls_with_sessions(self: TestProxyConfigur assert proxy_info.url == proxy_urls[0] -@pytest.fixture() +@pytest.fixture def patched_apify_client(apify_client_async_patcher: ApifyClientAsyncPatcher) -> ApifyClientAsync: apify_client_async_patcher.patch( 'user',