diff --git a/.github/workflows/docs-ci.yml b/.github/workflows/docs-ci.yml index 511b7c2..929abd7 100644 --- a/.github/workflows/docs-ci.yml +++ b/.github/workflows/docs-ci.yml @@ -1,10 +1,10 @@ -name: CI Documentation +name: CI Documentation and Code style on: [push, pull_request] jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 strategy: max-parallel: 4 @@ -20,11 +20,8 @@ jobs: with: python-version: ${{ matrix.python-version }} - - name: Give permission to run scripts - run: chmod +x ./docs/scripts/doc8_style_check.sh - - name: Install Dependencies - run: pip install -e .[docs] + run: pip install -e .[docs,testing] - name: Check Sphinx Documentation build minimally working-directory: ./docs @@ -34,4 +31,5 @@ jobs: working-directory: ./docs run: ./scripts/doc8_style_check.sh - + - name: Check for Code style errors + run: make check-ci diff --git a/.github/workflows/pypi-release.yml b/.github/workflows/pypi-release.yml index 9585730..d2206c8 100644 --- a/.github/workflows/pypi-release.yml +++ b/.github/workflows/pypi-release.yml @@ -21,10 +21,10 @@ on: jobs: build-pypi-distribs: name: Build and publish library to PyPI - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v4 with: @@ -47,7 +47,7 @@ jobs: name: Create GH release needs: - build-pypi-distribs - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - name: Download built archives @@ -67,7 +67,7 @@ jobs: name: Create PyPI release needs: - create-gh-release - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - name: Download built archives diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 2262567..d2e581b 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,13 @@ Release notes ============= + +Version 32.1.0 - (2024-12-06) +----------------------------- + +- Compute file checksums from streaming the file content in chunks to avoid running out of memory + + Version 32.0.0 - (2024-09-05) ----------------------------- diff --git a/Makefile b/Makefile index cc36c35..9840741 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ # ScanCode is a trademark of nexB Inc. # SPDX-License-Identifier: Apache-2.0 # See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/nexB/skeleton for support or download. +# See https://github.com/aboutcode-org/skeleton for support or download. # See https://aboutcode.org for more information about nexB OSS projects. # @@ -33,11 +33,19 @@ valid: isort black check: @echo "-> Run pycodestyle (PEP8) validation" - @${ACTIVATE} pycodestyle --max-line-length=100 --exclude=.eggs,venv,lib,thirdparty,docs,migrations,settings.py,.cache . + @${ACTIVATE} pycodestyle --max-line-length=100 --exclude=.eggs,venv,lib,thirdparty,docs,scripts,tests,migrations,settings.py,.cache . @echo "-> Run isort imports ordering validation" - @${ACTIVATE} isort --sl --check-only -l 100 setup.py src tests . + @${ACTIVATE} isort --sl -l 100 src tests setup.py --check-only @echo "-> Run black validation" - @${ACTIVATE} black --check --check -l 100 src tests setup.py + @${ACTIVATE} black --check -l 100 src tests setup.py + +check-ci: + @echo "-> Run pycodestyle (PEP8) validation" + pycodestyle --max-line-length=100 --exclude=.eggs,venv,lib,thirdparty,docs,scripts,tests,migrations,settings.py,.cache . + @echo "-> Run isort imports ordering validation" + isort --sl -l 100 src tests setup.py --check-only + @echo "-> Run black validation" + black --check -l 100 src tests setup.py clean: @echo "-> Clean the Python env" diff --git a/azure-pipelines.yml b/azure-pipelines.yml index d5510e6..9b85823 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -13,7 +13,7 @@ jobs: parameters: job_name: ubuntu20_cpython image_name: ubuntu-20.04 - python_versions: ["3.8", "3.9", "3.10", "3.11"] + python_versions: ["3.9", "3.10", "3.11", "3.12"] test_suites: all: venv/bin/pytest -n 2 -vvs @@ -21,7 +21,7 @@ jobs: parameters: job_name: ubuntu22_cpython image_name: ubuntu-22.04 - python_versions: ["3.8", "3.9", "3.10", "3.11"] + python_versions: ["3.9", "3.10", "3.11", "3.12"] test_suites: all: venv/bin/pytest -n 2 -vvs @@ -29,7 +29,7 @@ jobs: parameters: job_name: macos12_cpython image_name: macOS-12 - python_versions: ["3.8", "3.9", "3.10", "3.11"] + python_versions: ["3.9", "3.10", "3.11", "3.12"] test_suites: all: venv/bin/pytest -n 2 -vvs @@ -37,7 +37,7 @@ jobs: parameters: job_name: macos13_cpython image_name: macOS-13 - python_versions: ["3.8", "3.9", "3.10", "3.11"] + python_versions: ["3.9", "3.10", "3.11", "3.12"] test_suites: all: venv/bin/pytest -n 2 -vvs @@ -45,7 +45,7 @@ jobs: parameters: job_name: win2019_cpython image_name: windows-2019 - python_versions: ["3.8", "3.9", "3.10", "3.11"] + python_versions: ["3.9", "3.10", "3.11", "3.12"] test_suites: all: venv\Scripts\pytest -n 2 -vvs @@ -53,7 +53,7 @@ jobs: parameters: job_name: win2022_cpython image_name: windows-2022 - python_versions: ["3.8", "3.9", "3.10", "3.11"] + python_versions: ["3.9", "3.10", "3.11", "3.12"] test_suites: all: venv\Scripts\pytest -n 2 -vvs @@ -63,9 +63,9 @@ jobs: - template: etc/ci/azure-posix.yml parameters: - job_name: ubuntu20_test_all_supported_click_versions - image_name: ubuntu-20.04 - python_versions: ["3.8", "3.9", "3.10", "3.11"] + job_name: ubuntu22_test_all_supported_click_versions + image_name: ubuntu-22.04 + python_versions: ["3.9", "3.10", "3.11", "3.12"] test_suites: click_versions: | for clk_ver in 8.1.3 8.1.2 8.1.1 8.1.0 8.0.4 8.0.2 8.0.3 8.0.1 7.1.2 7.1.1 7.1 6.7; @@ -80,9 +80,9 @@ jobs: - template: etc/ci/azure-posix.yml parameters: - job_name: ubuntu20_cpython_latest_from_pip - image_name: ubuntu-20.04 - python_versions: ["3.8", "3.9", "3.10", "3.11"] + job_name: ubuntu22_cpython_latest_from_pip + image_name: ubuntu-22.04 + python_versions: ["3.9", "3.10", "3.11", "3.12"] test_suites: all: | venv/bin/pip install --upgrade-strategy eager --force-reinstall --upgrade -e . @@ -93,7 +93,7 @@ jobs: parameters: job_name: win2019_cpython_latest_from_pip image_name: windows-2019 - python_versions: ["3.8", "3.9", "3.10", "3.11"] + python_versions: ["3.9", "3.10", "3.11", "3.12"] test_suites: all: | venv\Scripts\pip install --upgrade-strategy eager --force-reinstall --upgrade -e . diff --git a/docs/Makefile b/docs/Makefile index d0c3cbf..788b039 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -5,6 +5,7 @@ # from the environment for the first two. SPHINXOPTS ?= SPHINXBUILD ?= sphinx-build +SPHINXAUTOBUILD = sphinx-autobuild SOURCEDIR = source BUILDDIR = build @@ -14,6 +15,13 @@ help: .PHONY: help Makefile +# Run the development server using sphinx-autobuild +docs: + @echo + @echo "Starting up the docs server..." + @echo + $(SPHINXAUTOBUILD) --port 8000 --watch ${SOURCEDIR} $(SOURCEDIR) "$(BUILDDIR)/html" $(SPHINXOPTS) $(O) + # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile diff --git a/docs/make.bat b/docs/make.bat index 6247f7e..4a3c1a4 100644 --- a/docs/make.bat +++ b/docs/make.bat @@ -7,11 +7,16 @@ REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) +if "%SPHINXAUTOBUILD%" == "" ( + set SPHINXAUTOBUILD=sphinx-autobuild +) set SOURCEDIR=source set BUILDDIR=build if "%1" == "" goto help +if "%1" == "docs" goto docs + %SPHINXBUILD% >NUL 2>NUL if errorlevel 9009 ( echo. @@ -28,6 +33,13 @@ if errorlevel 9009 ( %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% goto end +:docs +@echo +@echo Starting up the docs server... +@echo +%SPHINXAUTOBUILD% --port 8000 --watch %SOURCEDIR% %SOURCEDIR% %BUILDDIR%\html %SPHINXOPTS% %O% +goto end + :help %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% diff --git a/docs/scripts/doc8_style_check.sh b/docs/scripts/doc8_style_check.sh old mode 100644 new mode 100755 diff --git a/docs/source/_static/theme_overrides.css b/docs/source/_static/theme_overrides.css index 9662d63..5863ccf 100644 --- a/docs/source/_static/theme_overrides.css +++ b/docs/source/_static/theme_overrides.css @@ -1,353 +1,26 @@ -body { - color: #000000; -} - -p { - margin-bottom: 10px; -} - -.wy-plain-list-disc, .rst-content .section ul, .rst-content .toctree-wrapper ul, article ul { - margin-bottom: 10px; -} - -.custom_header_01 { - color: #cc0000; - font-size: 22px; - font-weight: bold; - line-height: 50px; -} - -h1, h2, h3, h4, h5, h6 { - margin-bottom: 20px; - margin-top: 20px; -} - -h5 { - font-size: 18px; - color: #000000; - font-style: italic; - margin-bottom: 10px; -} - -h6 { - font-size: 15px; - color: #000000; - font-style: italic; - margin-bottom: 10px; -} - -/* custom admonitions */ -/* success */ -.custom-admonition-success .admonition-title { - color: #000000; - background: #ccffcc; - border-radius: 5px 5px 0px 0px; -} -div.custom-admonition-success.admonition { - color: #000000; - background: #ffffff; - border: solid 1px #cccccc; - border-radius: 5px; - box-shadow: 1px 1px 5px 3px #d8d8d8; - margin: 20px 0px 30px 0px; -} - -/* important */ -.custom-admonition-important .admonition-title { - color: #000000; - background: #ccffcc; - border-radius: 5px 5px 0px 0px; - border-bottom: solid 1px #000000; -} -div.custom-admonition-important.admonition { - color: #000000; - background: #ffffff; - border: solid 1px #cccccc; - border-radius: 5px; - box-shadow: 1px 1px 5px 3px #d8d8d8; - margin: 20px 0px 30px 0px; -} - -/* caution */ -.custom-admonition-caution .admonition-title { - color: #000000; - background: #ffff99; - border-radius: 5px 5px 0px 0px; - border-bottom: solid 1px #e8e8e8; -} -div.custom-admonition-caution.admonition { - color: #000000; - background: #ffffff; - border: solid 1px #cccccc; - border-radius: 5px; - box-shadow: 1px 1px 5px 3px #d8d8d8; - margin: 20px 0px 30px 0px; -} - -/* note */ -.custom-admonition-note .admonition-title { - color: #ffffff; - background: #006bb3; - border-radius: 5px 5px 0px 0px; -} -div.custom-admonition-note.admonition { - color: #000000; - background: #ffffff; - border: solid 1px #cccccc; - border-radius: 5px; - box-shadow: 1px 1px 5px 3px #d8d8d8; - margin: 20px 0px 30px 0px; -} - -/* todo */ -.custom-admonition-todo .admonition-title { - color: #000000; - background: #cce6ff; - border-radius: 5px 5px 0px 0px; - border-bottom: solid 1px #99ccff; -} -div.custom-admonition-todo.admonition { - color: #000000; - background: #ffffff; - border: solid 1px #99ccff; - border-radius: 5px; - box-shadow: 1px 1px 5px 3px #d8d8d8; - margin: 20px 0px 30px 0px; -} - -/* examples */ -.custom-admonition-examples .admonition-title { - color: #000000; - background: #ffe6cc; - border-radius: 5px 5px 0px 0px; - border-bottom: solid 1px #d8d8d8; -} -div.custom-admonition-examples.admonition { - color: #000000; - background: #ffffff; - border: solid 1px #cccccc; - border-radius: 5px; - box-shadow: 1px 1px 5px 3px #d8d8d8; - margin: 20px 0px 30px 0px; -} - +/* this is the container for the pages */ .wy-nav-content { max-width: 100%; - padding-right: 100px; - padding-left: 100px; - background-color: #f2f2f2; -} - -div.rst-content { - background-color: #ffffff; - border: solid 1px #e5e5e5; - padding: 20px 40px 20px 40px; -} - -.rst-content .guilabel { - border: 1px solid #ffff99; - background: #ffff99; - font-size: 100%; - font-weight: normal; - border-radius: 4px; - padding: 2px 0px; - margin: auto 2px; - vertical-align: middle; -} - -.rst-content kbd { - font-family: SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",Courier,monospace; - border: solid 1px #d8d8d8; - background-color: #f5f5f5; - padding: 0px 3px; - border-radius: 3px; -} - -.wy-nav-content-wrap a { - color: #0066cc; - text-decoration: none; -} -.wy-nav-content-wrap a:hover { - color: #0099cc; - text-decoration: underline; -} - -.wy-nav-top a { - color: #ffffff; -} - -/* Based on numerous similar approaches e.g., https://github.com/readthedocs/sphinx_rtd_theme/issues/117 and https://rackerlabs.github.io/docs-rackspace/tools/rtd-tables.html -- but remove form-factor limits to enable table wrap on full-size and smallest-size form factors */ -.wy-table-responsive table td { - white-space: normal !important; -} - -.rst-content table.docutils td, -.rst-content table.docutils th { - padding: 5px 10px 5px 10px; -} -.rst-content table.docutils td p, -.rst-content table.docutils th p { - font-size: 14px; - margin-bottom: 0px; -} -.rst-content table.docutils td p cite, -.rst-content table.docutils th p cite { - font-size: 14px; - background-color: transparent; -} - -.colwidths-given th { - border: solid 1px #d8d8d8 !important; -} -.colwidths-given td { - border: solid 1px #d8d8d8 !important; -} - -/*handles single-tick inline code*/ -.wy-body-for-nav cite { - color: #000000; - background-color: transparent; - font-style: normal; - font-family: "Courier New"; - font-size: 13px; - padding: 3px 3px 3px 3px; -} - -.rst-content pre.literal-block, .rst-content div[class^="highlight"] pre, .rst-content .linenodiv pre { - font-family: SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",Courier,monospace; - font-size: 13px; - overflow: visible; - white-space: pre-wrap; - color: #000000; -} - -.rst-content pre.literal-block, .rst-content div[class^='highlight'] { - background-color: #f8f8f8; - border: solid 1px #e8e8e8; -} - -/* This enables inline code to wrap. */ -code, .rst-content tt, .rst-content code { - white-space: pre-wrap; - padding: 2px 3px 1px; - border-radius: 3px; - font-size: 13px; - background-color: #ffffff; -} - -/* use this added class for code blocks attached to bulleted list items */ -.highlight-top-margin { - margin-top: 20px !important; -} - -/* change color of inline code block */ -span.pre { - color: #e01e5a; -} - -.wy-body-for-nav blockquote { - margin: 1em 0; - padding-left: 1em; - border-left: 4px solid #ddd; - color: #000000; -} - -/* Fix the unwanted top and bottom padding inside a nested bulleted/numbered list */ -.rst-content .section ol p, .rst-content .section ul p { - margin-bottom: 0px; -} - -/* add spacing between bullets for legibility */ -.rst-content .section ol li, .rst-content .section ul li { - margin-bottom: 5px; -} - -.rst-content .section ol li:first-child, .rst-content .section ul li:first-child { - margin-top: 5px; -} - -/* but exclude the toctree bullets */ -.rst-content .toctree-wrapper ul li, .rst-content .toctree-wrapper ul li:first-child { + padding: 0px 40px 0px 0px; margin-top: 0px; - margin-bottom: 0px; } -/* remove extra space at bottom of multine list-table cell */ -.rst-content .line-block { - margin-left: 0px; - margin-bottom: 0px; - line-height: 24px; +.wy-nav-content-wrap { + border-right: solid 1px; } -/* fix extra vertical spacing in page toctree */ -.rst-content .toctree-wrapper ul li ul, article ul li ul { - margin-top: 0; - margin-bottom: 0; -} - -/* this is used by the genindex added via layout.html (see source/_templates/) to sidebar toc */ -.reference.internal.toc-index { - color: #d9d9d9; -} - -.reference.internal.toc-index.current { - background-color: #ffffff; - color: #000000; - font-weight: bold; -} - -.toc-index-div { - border-top: solid 1px #000000; - margin-top: 10px; - padding-top: 5px; -} - -.indextable ul li { - font-size: 14px; - margin-bottom: 5px; -} - -/* The next 2 fix the poor vertical spacing in genindex.html (the alphabetized index) */ -.indextable.genindextable { - margin-bottom: 20px; -} - -div.genindex-jumpbox { - margin-bottom: 10px; -} - -/* rst image classes */ - -.clear-both { - clear: both; - } - -.float-left { - float: left; - margin-right: 20px; -} - -img { - border: solid 1px #e8e8e8; -} - -/* These are custom and need to be defined in conf.py to access in all pages, e.g., '.. role:: red' */ -.img-title { - color: #000000; - /* neither padding nor margin works for vertical spacing bc it's a span -- line-height does, sort of */ - line-height: 3.0; - font-style: italic; - font-weight: 600; -} - -.img-title-para { - color: #000000; - margin-top: 20px; - margin-bottom: 0px; - font-style: italic; - font-weight: 500; -} - -.red { - color: red; +div.rst-content { + max-width: 1300px; + border: 0; + padding: 10px 80px 10px 80px; + margin-left: 50px; +} + +@media (max-width: 768px) { + div.rst-content { + max-width: 1300px; + border: 0; + padding: 0px 10px 10px 10px; + margin-left: 0px; + } } diff --git a/docs/source/conf.py b/docs/source/conf.py index 918d62c..8c88fa2 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -30,6 +30,10 @@ extensions = [ "sphinx.ext.intersphinx", "sphinx_reredirects", + "sphinx_rtd_theme", + "sphinx_rtd_dark_mode", + "sphinx.ext.extlinks", + "sphinx_copybutton", ] @@ -39,11 +43,14 @@ # This points to aboutcode.readthedocs.io # In case of "undefined label" ERRORS check docs on intersphinx to troubleshoot -# Link was created at commit - https://github.com/nexB/aboutcode/commit/faea9fcf3248f8f198844fe34d43833224ac4a83 +# Link was created at commit - https://github.com/aboutcode-org/aboutcode/commit/faea9fcf3248f8f198844fe34d43833224ac4a83 intersphinx_mapping = { "aboutcode": ("https://aboutcode.readthedocs.io/en/latest/", None), - "scancode-workbench": ("https://scancode-workbench.readthedocs.io/en/develop/", None), + "scancode-workbench": ( + "https://scancode-workbench.readthedocs.io/en/develop/", + None, + ), } @@ -78,7 +85,9 @@ "conf_py_path": "/docs/source/", # path in the checkout to the docs root } -html_css_files = ["_static/theme_overrides.css"] +html_css_files = [ + "theme_overrides.css", +] # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. @@ -104,6 +113,4 @@ # -- Options for LaTeX output ------------------------------------------------- -latex_elements = { - 'classoptions': ',openany,oneside' -} \ No newline at end of file +latex_elements = {"classoptions": ",openany,oneside"} diff --git a/docs/source/contribute/contrib_doc.rst b/docs/source/contribute/contrib_doc.rst index 13882e1..5640db2 100644 --- a/docs/source/contribute/contrib_doc.rst +++ b/docs/source/contribute/contrib_doc.rst @@ -12,7 +12,7 @@ To get started, create or identify a working directory on your local machine. Open that directory and execute the following command in a terminal session:: - git clone https://github.com/nexB/skeleton.git + git clone https://github.com/aboutcode-org/skeleton.git That will create an ``/skeleton`` directory in your working directory. Now you can install the dependencies in a virtualenv:: diff --git a/etc/scripts/check_thirdparty.py b/etc/scripts/check_thirdparty.py index b052f25..2daded9 100644 --- a/etc/scripts/check_thirdparty.py +++ b/etc/scripts/check_thirdparty.py @@ -5,7 +5,7 @@ # ScanCode is a trademark of nexB Inc. # SPDX-License-Identifier: Apache-2.0 # See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/nexB/skeleton for support or download. +# See https://github.com/aboutcode-org/skeleton for support or download. # See https://aboutcode.org for more information about nexB OSS projects. # import click @@ -17,7 +17,8 @@ @click.option( "-d", "--dest", - type=click.Path(exists=True, readable=True, path_type=str, file_okay=False), + type=click.Path(exists=True, readable=True, + path_type=str, file_okay=False), required=True, help="Path to the thirdparty directory to check.", ) diff --git a/etc/scripts/fetch_thirdparty.py b/etc/scripts/fetch_thirdparty.py index eedf05c..3f9ff52 100644 --- a/etc/scripts/fetch_thirdparty.py +++ b/etc/scripts/fetch_thirdparty.py @@ -5,7 +5,7 @@ # ScanCode is a trademark of nexB Inc. # SPDX-License-Identifier: Apache-2.0 # See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/nexB/skeleton for support or download. +# See https://github.com/aboutcode-org/skeleton for support or download. # See https://aboutcode.org for more information about nexB OSS projects. # @@ -55,7 +55,8 @@ "-d", "--dest", "dest_dir", - type=click.Path(exists=True, readable=True, path_type=str, file_okay=False), + type=click.Path(exists=True, readable=True, + path_type=str, file_okay=False), metavar="DIR", default=utils_thirdparty.THIRDPARTY_DIR, show_default=True, @@ -224,7 +225,8 @@ def fetch_thirdparty( environments = None if wheels: evts = itertools.product(python_versions, operating_systems) - environments = [utils_thirdparty.Environment.from_pyver_and_os(pyv, os) for pyv, os in evts] + environments = [utils_thirdparty.Environment.from_pyver_and_os( + pyv, os) for pyv, os in evts] # Collect PyPI repos repos = [] @@ -260,13 +262,14 @@ def fetch_thirdparty( repos=repos, ) if not fetched: - wheels_or_sdist_not_found[f"{name}=={version}"].append(environment) + wheels_or_sdist_not_found[f"{name}=={version}"].append( + environment) if TRACE: print(f" NOT FOUND") if (sdists or (f"{name}=={version}" in wheels_or_sdist_not_found and name in sdist_only) - ): + ): if TRACE: print(f" ==> Fetching sdist: {name}=={version}") @@ -289,7 +292,8 @@ def fetch_thirdparty( sdist_missing = sdists and "sdist" in dists and not name in wheel_only if sdist_missing: mia.append(f"SDist missing: {nv} {dists}") - wheels_missing = wheels and any(d for d in dists if d != "sdist") and not name in sdist_only + wheels_missing = wheels and any( + d for d in dists if d != "sdist") and not name in sdist_only if wheels_missing: mia.append(f"Wheels missing: {nv} {dists}") @@ -299,7 +303,8 @@ def fetch_thirdparty( raise Exception(mia) print(f"==> FETCHING OR CREATING ABOUT AND LICENSE FILES") - utils_thirdparty.fetch_abouts_and_licenses(dest_dir=dest_dir, use_cached_index=use_cached_index) + utils_thirdparty.fetch_abouts_and_licenses( + dest_dir=dest_dir, use_cached_index=use_cached_index) utils_thirdparty.clean_about_files(dest_dir=dest_dir) # check for problems diff --git a/etc/scripts/gen_requirements.py b/etc/scripts/gen_requirements.py index 07e26f7..2b65ae8 100644 --- a/etc/scripts/gen_requirements.py +++ b/etc/scripts/gen_requirements.py @@ -5,7 +5,7 @@ # ScanCode is a trademark of nexB Inc. # SPDX-License-Identifier: Apache-2.0 # See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/nexB/skeleton for support or download. +# See https://github.com/aboutcode-org/skeleton for support or download. # See https://aboutcode.org for more information about nexB OSS projects. # import argparse diff --git a/etc/scripts/gen_requirements_dev.py b/etc/scripts/gen_requirements_dev.py index 12cc06d..5db1c48 100644 --- a/etc/scripts/gen_requirements_dev.py +++ b/etc/scripts/gen_requirements_dev.py @@ -5,7 +5,7 @@ # ScanCode is a trademark of nexB Inc. # SPDX-License-Identifier: Apache-2.0 # See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/nexB/skeleton for support or download. +# See https://github.com/aboutcode-org/skeleton for support or download. # See https://aboutcode.org for more information about nexB OSS projects. # import argparse diff --git a/etc/scripts/utils_dejacode.py b/etc/scripts/utils_dejacode.py index c42e6c9..652252d 100644 --- a/etc/scripts/utils_dejacode.py +++ b/etc/scripts/utils_dejacode.py @@ -5,7 +5,7 @@ # ScanCode is a trademark of nexB Inc. # SPDX-License-Identifier: Apache-2.0 # See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/nexB/skeleton for support or download. +# See https://github.com/aboutcode-org/skeleton for support or download. # See https://aboutcode.org for more information about nexB OSS projects. # import io @@ -33,7 +33,8 @@ def can_do_api_calls(): if not DEJACODE_API_KEY and DEJACODE_API_URL: - print("DejaCode DEJACODE_API_KEY and DEJACODE_API_URL not configured. Doing nothing") + print( + "DejaCode DEJACODE_API_KEY and DEJACODE_API_URL not configured. Doing nothing") return False else: return True @@ -68,7 +69,8 @@ def get_package_data(distribution): return results[0] elif len_results > 1: - print(f"More than 1 entry exists, review at: {DEJACODE_API_URL_PACKAGES}") + print( + f"More than 1 entry exists, review at: {DEJACODE_API_URL_PACKAGES}") else: print("Could not find package:", distribution.download_url) @@ -149,7 +151,8 @@ def find_latest_dejacode_package(distribution): # there was no exact match, find the latest version # TODO: consider the closest version rather than the latest # or the version that has the best data - with_versions = [(packaging_version.parse(p["version"]), p) for p in packages] + with_versions = [(packaging_version.parse(p["version"]), p) + for p in packages] with_versions = sorted(with_versions) latest_version, latest_package_version = sorted(with_versions)[-1] print( diff --git a/etc/scripts/utils_requirements.py b/etc/scripts/utils_requirements.py index 0fc25a3..1c50239 100644 --- a/etc/scripts/utils_requirements.py +++ b/etc/scripts/utils_requirements.py @@ -5,7 +5,7 @@ # ScanCode is a trademark of nexB Inc. # SPDX-License-Identifier: Apache-2.0 # See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/nexB/skeleton for support or download. +# See https://github.com/aboutcode-org/skeleton for support or download. # See https://aboutcode.org for more information about nexB OSS projects. # @@ -102,7 +102,8 @@ def lock_dev_requirements( all_req_nvs = get_required_name_versions(all_req_lines) dev_only_req_nvs = {n: v for n, v in all_req_nvs if n not in main_names} - new_reqs = "\n".join(f"{n}=={v}" for n, v in sorted(dev_only_req_nvs.items())) + new_reqs = "\n".join( + f"{n}=={v}" for n, v in sorted(dev_only_req_nvs.items())) with open(dev_requirements_file, "w") as fo: fo.write(new_reqs) @@ -113,10 +114,12 @@ def get_installed_reqs(site_packages_dir): as a text. """ if not os.path.exists(site_packages_dir): - raise Exception(f"site_packages directory: {site_packages_dir!r} does not exists") + raise Exception( + f"site_packages directory: {site_packages_dir!r} does not exists") # Also include these packages in the output with --all: wheel, distribute, # setuptools, pip - args = ["pip", "freeze", "--exclude-editable", "--all", "--path", site_packages_dir] + args = ["pip", "freeze", "--exclude-editable", + "--all", "--path", site_packages_dir] return subprocess.check_output(args, encoding="utf-8") diff --git a/etc/scripts/utils_thirdparty.py b/etc/scripts/utils_thirdparty.py index addf8e5..569b605 100644 --- a/etc/scripts/utils_thirdparty.py +++ b/etc/scripts/utils_thirdparty.py @@ -5,7 +5,7 @@ # ScanCode is a trademark of nexB Inc. # SPDX-License-Identifier: Apache-2.0 # See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/nexB/skeleton for support or download. +# See https://github.com/aboutcode-org/skeleton for support or download. # See https://aboutcode.org for more information about nexB OSS projects. # import email @@ -245,9 +245,11 @@ def download_wheel(name, version, environment, dest_dir=THIRDPARTY_DIR, repos=tu package = repo.get_package_version(name=name, version=version) if not package: if TRACE_DEEP: - print(f" download_wheel: No package in {repo.index_url} for {name}=={version}") + print( + f" download_wheel: No package in {repo.index_url} for {name}=={version}") continue - supported_wheels = list(package.get_supported_wheels(environment=environment)) + supported_wheels = list( + package.get_supported_wheels(environment=environment)) if not supported_wheels: if TRACE_DEEP: print( @@ -291,7 +293,8 @@ def download_sdist(name, version, dest_dir=THIRDPARTY_DIR, repos=tuple()): if not package: if TRACE_DEEP: - print(f" download_sdist: No package in {repo.index_url} for {name}=={version}") + print( + f" download_sdist: No package in {repo.index_url} for {name}=={version}") continue sdist = package.sdist if not sdist: @@ -300,7 +303,8 @@ def download_sdist(name, version, dest_dir=THIRDPARTY_DIR, repos=tuple()): continue if TRACE_DEEP: - print(f" download_sdist: Getting sdist from index (or cache): {sdist.download_url}") + print( + f" download_sdist: Getting sdist from index (or cache): {sdist.download_url}") fetched_sdist_filename = package.sdist.download(dest_dir=dest_dir) if fetched_sdist_filename: @@ -533,7 +537,8 @@ def get_best_download_url(self, repos=tuple()): repos = DEFAULT_PYPI_REPOS for repo in repos: - package = repo.get_package_version(name=self.name, version=self.version) + package = repo.get_package_version( + name=self.name, version=self.version) if not package: if TRACE: print( @@ -772,7 +777,8 @@ def load_remote_about_data(self): if notice_text: about_data["notice_text"] = notice_text except RemoteNotFetchedException: - print(f"Failed to fetch NOTICE file: {self.notice_download_url}") + print( + f"Failed to fetch NOTICE file: {self.notice_download_url}") return self.load_about_data(about_data) def get_checksums(self, dest_dir=THIRDPARTY_DIR): @@ -821,9 +827,11 @@ def fetch_license_files(self, dest_dir=THIRDPARTY_DIR, use_cached_index=False): Fetch license files if missing in `dest_dir`. Return True if license files were fetched. """ - urls = LinksRepository.from_url(use_cached_index=use_cached_index).links + urls = LinksRepository.from_url( + use_cached_index=use_cached_index).links errors = [] - extra_lic_names = [l.get("file") for l in self.extra_data.get("licenses", {})] + extra_lic_names = [l.get("file") + for l in self.extra_data.get("licenses", {})] extra_lic_names += [self.extra_data.get("license_file")] extra_lic_names = [ln for ln in extra_lic_names if ln] lic_names = [f"{key}.LICENSE" for key in self.get_license_keys()] @@ -834,7 +842,8 @@ def fetch_license_files(self, dest_dir=THIRDPARTY_DIR, use_cached_index=False): try: # try remotely first - lic_url = get_license_link_for_filename(filename=filename, urls=urls) + lic_url = get_license_link_for_filename( + filename=filename, urls=urls) fetch_and_save( path_or_url=lic_url, @@ -845,7 +854,7 @@ def fetch_license_files(self, dest_dir=THIRDPARTY_DIR, use_cached_index=False): if TRACE: print(f"Fetched license from remote: {lic_url}") - except: + except Exception: try: # try licensedb second lic_url = f"{LICENSEDB_API_URL}/{filename}" @@ -858,7 +867,7 @@ def fetch_license_files(self, dest_dir=THIRDPARTY_DIR, use_cached_index=False): if TRACE: print(f"Fetched license from licensedb: {lic_url}") - except: + except Exception: msg = f'No text for license {filename} in expression "{self.license_expression}" from {self}' print(msg) errors.append(msg) @@ -911,7 +920,8 @@ def load_pkginfo_data(self, dest_dir=THIRDPARTY_DIR): c for c in classifiers if c.startswith("License") ] license_expression = get_license_expression(declared_license) - other_classifiers = [c for c in classifiers if not c.startswith("License")] + other_classifiers = [ + c for c in classifiers if not c.startswith("License")] holder = raw_data["Author"] holder_contact = raw_data["Author-email"] @@ -953,7 +963,8 @@ def update(self, data, overwrite=False, keep_extra=True): package_url = data.get("package_url") if package_url: purl_from_data = packageurl.PackageURL.from_string(package_url) - purl_from_self = packageurl.PackageURL.from_string(self.package_url) + purl_from_self = packageurl.PackageURL.from_string( + self.package_url) if purl_from_data != purl_from_self: print( f"Invalid dist update attempt, no same same purl with dist: " @@ -1003,7 +1014,8 @@ def get_license_link_for_filename(filename, urls): if not path_or_url: raise Exception(f"Missing link to file: {filename}") if not len(path_or_url) == 1: - raise Exception(f"Multiple links to file: {filename}: \n" + "\n".join(path_or_url)) + raise Exception( + f"Multiple links to file: {filename}: \n" + "\n".join(path_or_url)) return path_or_url[0] @@ -1290,7 +1302,7 @@ def is_pure(self): def is_pure_wheel(filename): try: return Wheel.from_filename(filename).is_pure() - except: + except Exception: return False @@ -1397,7 +1409,8 @@ def packages_from_dir(cls, directory): """ base = os.path.abspath(directory) - paths = [os.path.join(base, f) for f in os.listdir(base) if f.endswith(EXTENSIONS)] + paths = [os.path.join(base, f) + for f in os.listdir(base) if f.endswith(EXTENSIONS)] if TRACE_ULTRA_DEEP: print("packages_from_dir: paths:", paths) @@ -1458,7 +1471,8 @@ def dists_from_paths_or_urls(cls, paths_or_urls): dists = [] if TRACE_ULTRA_DEEP: print(" ###paths_or_urls:", paths_or_urls) - installable = [f for f in paths_or_urls if f.endswith(EXTENSIONS_INSTALLABLE)] + installable = [f for f in paths_or_urls if f.endswith( + EXTENSIONS_INSTALLABLE)] for path_or_url in installable: try: dist = Distribution.from_path_or_url(path_or_url) @@ -1476,7 +1490,8 @@ def dists_from_paths_or_urls(cls, paths_or_urls): ) except InvalidDistributionFilename: if TRACE_DEEP: - print(f" Skipping invalid distribution from: {path_or_url}") + print( + f" Skipping invalid distribution from: {path_or_url}") continue return dists @@ -1525,7 +1540,8 @@ class Environment: implementation = attr.ib( type=str, default="cp", - metadata=dict(help="Python implementation supported by this environment."), + metadata=dict( + help="Python implementation supported by this environment."), repr=False, ) @@ -1539,7 +1555,8 @@ class Environment: platforms = attr.ib( type=list, default=attr.Factory(list), - metadata=dict(help="List of platform tags supported by this environment."), + metadata=dict( + help="List of platform tags supported by this environment."), repr=False, ) @@ -1623,7 +1640,8 @@ class PypiSimpleRepository: fetched_package_normalized_names = attr.ib( type=set, default=attr.Factory(set), - metadata=dict(help="A set of already fetched package normalized names."), + metadata=dict( + help="A set of already fetched package normalized names."), ) use_cached_index = attr.ib( @@ -1654,10 +1672,12 @@ def _get_package_versions_map(self, name): self.packages[normalized_name] = versions except RemoteNotFetchedException as e: if TRACE: - print(f"failed to fetch package name: {name} from: {self.index_url}:\n{e}") + print( + f"failed to fetch package name: {name} from: {self.index_url}:\n{e}") if not versions and TRACE: - print(f"WARNING: package {name} not found in repo: {self.index_url}") + print( + f"WARNING: package {name} not found in repo: {self.index_url}") return versions @@ -1842,7 +1862,8 @@ def get(self, path_or_url, as_text=True, force=False): if force or not os.path.exists(cached): if TRACE_DEEP: print(f" FILE CACHE MISS: {path_or_url}") - content = get_file_content(path_or_url=path_or_url, as_text=as_text) + content = get_file_content( + path_or_url=path_or_url, as_text=as_text) wmode = "w" if as_text else "wb" with open(cached, wmode) as fo: fo.write(content) @@ -1864,7 +1885,8 @@ def get_file_content(path_or_url, as_text=True): if path_or_url.startswith("https://"): if TRACE_DEEP: print(f"Fetching: {path_or_url}") - _headers, content = get_remote_file_content(url=path_or_url, as_text=as_text) + _headers, content = get_remote_file_content( + url=path_or_url, as_text=as_text) return content elif path_or_url.startswith("file://") or ( @@ -1930,7 +1952,8 @@ def get_remote_file_content( ) else: - raise RemoteNotFetchedException(f"Failed HTTP request from {url} with {status}") + raise RemoteNotFetchedException( + f"Failed HTTP request from {url} with {status}") if headers_only: return response.headers, None @@ -2021,7 +2044,8 @@ def get_other_dists(_package, _dist): # if has key data we may look to improve later, but we can move on if local_dist.has_key_metadata(): local_dist.save_about_and_notice_files(dest_dir=dest_dir) - local_dist.fetch_license_files(dest_dir=dest_dir, use_cached_index=use_cached_index) + local_dist.fetch_license_files( + dest_dir=dest_dir, use_cached_index=use_cached_index) continue # lets try to get from another dist of the same local package @@ -2033,7 +2057,8 @@ def get_other_dists(_package, _dist): # if has key data we may look to improve later, but we can move on if local_dist.has_key_metadata(): local_dist.save_about_and_notice_files(dest_dir=dest_dir) - local_dist.fetch_license_files(dest_dir=dest_dir, use_cached_index=use_cached_index) + local_dist.fetch_license_files( + dest_dir=dest_dir, use_cached_index=use_cached_index) continue # try to get another version of the same package that is not our version @@ -2044,7 +2069,8 @@ def get_other_dists(_package, _dist): ] other_local_version = other_local_packages and other_local_packages[-1] if other_local_version: - latest_local_dists = list(other_local_version.get_distributions()) + latest_local_dists = list( + other_local_version.get_distributions()) for latest_local_dist in latest_local_dists: latest_local_dist.load_about_data(dest_dir=dest_dir) if not latest_local_dist.has_key_metadata(): @@ -2070,7 +2096,8 @@ def get_other_dists(_package, _dist): # if has key data we may look to improve later, but we can move on if local_dist.has_key_metadata(): local_dist.save_about_and_notice_files(dest_dir=dest_dir) - local_dist.fetch_license_files(dest_dir=dest_dir, use_cached_index=use_cached_index) + local_dist.fetch_license_files( + dest_dir=dest_dir, use_cached_index=use_cached_index) continue # try to get a latest version of the same package that is not our version @@ -2111,7 +2138,8 @@ def get_other_dists(_package, _dist): # if local_dist.has_key_metadata() or not local_dist.has_key_metadata(): local_dist.save_about_and_notice_files(dest_dir) - lic_errs = local_dist.fetch_license_files(dest_dir, use_cached_index=use_cached_index) + lic_errs = local_dist.fetch_license_files( + dest_dir, use_cached_index=use_cached_index) if not local_dist.has_key_metadata(): print(f"Unable to add essential ABOUT data for: {local_dist}") @@ -2259,7 +2287,8 @@ def find_problems( for dist in package.get_distributions(): dist.load_about_data(dest_dir=dest_dir) - abpth = os.path.abspath(os.path.join(dest_dir, dist.about_filename)) + abpth = os.path.abspath(os.path.join( + dest_dir, dist.about_filename)) if not dist.has_key_metadata(): print(f" Missing key ABOUT data in file://{abpth}") if "classifiers" in dist.extra_data: diff --git a/setup.cfg b/setup.cfg index 163d21b..353b78f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -37,7 +37,7 @@ zip_safe = false setup_requires = setuptools_scm[toml] >= 4 -python_requires = >=3.8 +python_requires = >=3.9 install_requires = attrs >= 18.1, !=20.1.0 @@ -67,4 +67,7 @@ docs = sphinx-rtd-theme>=1.0.0 sphinx-reredirects >= 0.1.2 doc8>=0.11.2 + sphinx-autobuild + sphinx-rtd-dark-mode>=1.3.0 + sphinx-copybutton diff --git a/src/commoncode/cliutils.py b/src/commoncode/cliutils.py index 1638088..8ec419c 100644 --- a/src/commoncode/cliutils.py +++ b/src/commoncode/cliutils.py @@ -10,9 +10,7 @@ import sys import click - -# FIXME: this is NOT API -from click._termui_impl import ProgressBar +from click._termui_impl import ProgressBar # FIXME: this is NOT API from click.termui import style from click.types import BoolParamType from click.utils import echo diff --git a/src/commoncode/command.py b/src/commoncode/command.py index 2570a05..419da26 100644 --- a/src/commoncode/command.py +++ b/src/commoncode/command.py @@ -203,7 +203,7 @@ def close_pipe(p): # Ensure process death otherwise proc.wait may hang in some cases # NB: this will run only on POSIX OSes supporting signals os.kill(proc.pid, signal.SIGKILL) # NOQA - except: + except Exception: pass # This may slow things down a tad on non-POSIX Oses but is safe: diff --git a/src/commoncode/fileset.py b/src/commoncode/fileset.py index 9ebab26..f0298ea 100644 --- a/src/commoncode/fileset.py +++ b/src/commoncode/fileset.py @@ -161,7 +161,7 @@ def load(location): assert os.path.exists(fn) and os.path.isfile(fn), msg mode = "r" with open(fn, mode) as f: - return [l.strip() for l in f if l and l.strip()] + return [line.strip() for line in f if line and line.strip()] def includes_excludes(patterns, message): diff --git a/src/commoncode/fileutils.py b/src/commoncode/fileutils.py index 123a717..3f6f9ec 100644 --- a/src/commoncode/fileutils.py +++ b/src/commoncode/fileutils.py @@ -557,7 +557,7 @@ def _rm_handler(function, path, excinfo): # NOQA elif function == os.remove: try: delete(path, _err_handler=None) - except: + except Exception: pass if os.path.exists(path): diff --git a/src/commoncode/hash.py b/src/commoncode/hash.py index 954d3f4..5e71fde 100644 --- a/src/commoncode/hash.py +++ b/src/commoncode/hash.py @@ -8,6 +8,7 @@ import binascii import hashlib +import os import sys from functools import partial @@ -25,40 +26,83 @@ Checksums are operating on files. """ +# This is ~16 MB +FILE_CHUNK_SIZE = 2**24 + def _hash_mod(bitsize, hmodule): """ - Return a hashing class returning hashes with a `bitsize` bit length. The - interface of this class is similar to the hash module API. + Return a hasher class that returns hashes with a ``bitsize`` bit length. The interface of this + class is similar to the hash module API. """ - class hasher(object): - def __init__(self, msg=None): + class hasher(Hashable): + """A hasher class that behaves like a hashlib module.""" + + def __init__(self, msg=None, **kwargs): + """ + Return a hasher, populated with an initial ``msg`` bytes string. + Close on the bitsize and hmodule + """ + # length of binary digest for this hash self.digest_size = bitsize // 8 - self.h = msg and hmodule(msg).digest()[: self.digest_size] or None - def digest(self): - return bytes(self.h) + # binh = binary hasher module + self.binh = hmodule() - def hexdigest(self): - return self.h and binascii.hexlify(self.h).decode("utf-8") + # msg_len = length in bytes of the message hashed + self.msg_len = 0 - def b64digest(self): - return self.h and urlsafe_b64encode(self.h).decode("utf-8") + if msg: + self.update(msg) - def intdigest(self): - return self.h and int(bin_to_num(self.h)) + def update(self, msg=None): + """ + Update this hash with a ``msg`` bytes string. + """ + if msg: + self.binh.update(msg) + self.msg_len += len(msg) return hasher -# for FIPS support +class Hashable: + """ + A mixin for hashers that provides the base methods. + """ + + def digest(self): + """ + Return a bytes string digest for this hash. + """ + if not self.msg_len: + return + return self.binh.digest()[: self.digest_size] + + def hexdigest(self): + """ + Return a string hex digest for this hash. + """ + return self.msg_len and binascii.hexlify(self.digest()).decode("utf-8") + + def b64digest(self): + """ + Return a string base64 digest for this hash. + """ + return self.msg_len and urlsafe_b64encode(self.digest()).decode("utf-8") + + def intdigest(self): + """ + Return a int digest for this hash. + """ + return self.msg_len and int(bin_to_num(self.digest())) + + +# for FIPS support, we declare that "usedforsecurity" is False sys_v0 = sys.version_info[0] sys_v1 = sys.version_info[1] -if sys_v0 == 3 and sys_v1 >= 9: - md5_hasher = partial(hashlib.md5, usedforsecurity=False) -else: - md5_hasher = hashlib.md5 +md5_hasher = partial(hashlib.md5, usedforsecurity=False) # Base hashers for each bit size @@ -82,31 +126,65 @@ def get_hasher(bitsize): return _hashmodules_by_bitsize[bitsize] -class sha1_git_hasher(object): +class sha1_git_hasher(Hashable): """ Hash content using the git blob SHA1 convention. + See https://git-scm.com/book/en/v2/Git-Internals-Git-Objects#_object_storage """ - def __init__(self, msg=None): + def __init__(self, msg=None, total_length=0, **kwargs): + """ + Initialize a sha1_git_hasher with an optional ``msg`` byte string. The ``total_length`` of + all content that will be hashed, combining the ``msg`` length plus any later call to + update() with additional messages. + + Here ``total_length`` is total length in bytes of all the messages (chunks) hashed + in contrast to ``msg_len`` which is the length in bytes for the optional message. + """ self.digest_size = 160 // 8 - self.h = msg and self._compute(msg) or None + self.msg_len = 0 - def _compute(self, msg): - # note: bytes interpolation is new in Python 3.5 - git_blob_msg = b"blob %d\0%s" % (len(msg), msg) - return hashlib.sha1(git_blob_msg).digest() + if msg: + self.msg_len = msg_len = len(msg) - def digest(self): - return bytes(self.h) + if not total_length: + total_length = msg_len + else: + if total_length < msg_len: + raise ValueError( + f"Initial msg length: {msg_len} " + f"cannot be larger than the the total_length: {self.total_length}" + ) - def hexdigest(self): - return self.h and binascii.hexlify(self.h).decode("utf-8") + if not total_length: + raise ValueError("total_length cannot be zero") - def b64digest(self): - return self.h and urlsafe_b64encode(self.h).decode("utf-8") + self.total_length = total_length + self.binh = get_hasher(bitsize=160)(total_length=total_length) - def intdigest(self): - return self.h and int(bin_to_num(self.h)) + self._hash_header() + if msg: + self.update(msg) + + def _hash_header(self): + # note: bytes interpolation is new in Python 3.5 + git_blob_header = b"blob %d\0" % (self.total_length) + self.binh.update(msg=git_blob_header) + + def update(self, msg=None): + """ + Update this hash with a ``msg`` bytes string. + """ + if msg: + msg_len = len(msg) + if (msg_len + self.msg_len) > self.total_length: + raise ValueError( + f"Actual combined msg lengths: initial: {self.msg_len} plus added: {msg_len} " + f"cannot be larger than the the total_length: {self.total_length}" + ) + + self.binh.update(msg) + self.msg_len += msg_len _hashmodules_by_name = { @@ -119,25 +197,60 @@ def intdigest(self): } +def get_hasher_instance_by_name(name, total_length=0): + """ + Return a hasher instance for a checksum algorithm ``name`` with a planned ``total_length`` of + bytes to hash. + """ + try: + hm = _hashmodules_by_name[name] + return hm(total_length=total_length) + except KeyError: + raise ValueError(f"Unknown checksum algorithm: {name!r}") + + +def get_file_size(location): + return os.path.getsize(location) + + def checksum(location, name, base64=False): """ - Return a checksum of `bitsize` length from the content of the file at - `location`. The checksum is a hexdigest or base64-encoded is `base64` is - True. + Return a checksum from the content of the file at ``location`` using the ``name`` checksum + algorithm. The checksum is a string as a hexdigest or is base64-encoded is ``base64`` is True. """ if not filetype.is_file(location): return - hasher = _hashmodules_by_name[name] - # fixme: we should read in chunks? - with open(location, "rb") as f: - hashable = f.read() + total_length = get_file_size(location) + chunks = binary_chunks(location) + return checksum_from_chunks(chunks=chunks, total_length=total_length, name=name, base64=base64) + - hashed = hasher(hashable) +def checksum_from_chunks(chunks, name, total_length=0, base64=False): + """ + Return a checksum from the content of the iterator of byte strings ``chunks`` with a + ``total_length`` combined length using the ``name`` checksum algorithm. The returned checksum is + a string as a hexdigest or is base64-encoded is ``base64`` is True. + """ + hasher = get_hasher_instance_by_name(name=name, total_length=total_length) + for chunk in chunks: + hasher.update(chunk) if base64: - return hashed.b64digest() + return hasher.b64digest() + return hasher.hexdigest() + - return hashed.hexdigest() +def binary_chunks(location, size=FILE_CHUNK_SIZE): + """ + Read file at ``location`` as binary and yield bytes of up to ``size`` length in bytes, + defaulting to 2**24 bytes, e.g., about 16 MB. + """ + with open(location, "rb") as f: + while True: + chunk = f.read(size) + if not chunk: + break + yield chunk def md5(location): @@ -166,19 +279,24 @@ def sha1_git(location): def multi_checksums(location, checksum_names=("md5", "sha1", "sha256", "sha512", "sha1_git")): """ - Return a mapping of hexdigest checksums keyed by checksum name from the content - of the file at `location`. Use the `checksum_names` list of checksum names. - The mapping is guaranted to contains all the requested names as keys. - If the location is not a file, the values are None. + Return a mapping of hexdigest checksum strings keyed by checksum algorithm name from hashing the + content of the file at ``location``. Use the ``checksum_names`` list of checksum names. The + mapping is guaranted to contains all the requested names as keys. If the location is not a file, + or if the file is empty, the values are None. + + The purpose of this function is to avoid read the same file multiple times + to compute different checksums. """ - results = dict([(name, None) for name in checksum_names]) if not filetype.is_file(location): - return results - - # fixme: we should read in chunks? - with open(location, "rb") as f: - hashable = f.read() - - for name in checksum_names: - results[name] = _hashmodules_by_name[name](hashable).hexdigest() - return results + return {name: None for name in checksum_names} + file_size = get_file_size(location) + hashers = { + name: get_hasher_instance_by_name(name=name, total_length=file_size) + for name in checksum_names + } + + for chunk in binary_chunks(location): + for hasher in hashers.values(): + hasher.update(msg=chunk) + + return {name: hasher.hexdigest() for name, hasher in hashers.items()} diff --git a/src/commoncode/resource.py b/src/commoncode/resource.py index 6ce1536..5a0fa34 100644 --- a/src/commoncode/resource.py +++ b/src/commoncode/resource.py @@ -511,7 +511,8 @@ def _create_resources_from_paths(self, root, paths): ) if not newpar: raise Exception( - f"ERROR: Codebase._create_resources_from_paths: cannot create parent for: {parent_path!r}" + "ERROR: Codebase._create_resources_from_paths:" + f" cannot create parent for: {parent_path!r}" ) parent = newpar @@ -1686,7 +1687,7 @@ def _get_scan_data_helper(self, location): """ try: return json.loads(location) - except: + except Exception: location = abspath(normpath(expanduser(location))) with open(location) as f: scan_data = json.load(f) @@ -1842,7 +1843,7 @@ def _populate(self, scan_data): ########################################################## for attr_name in self.codebase_attributes: value = scan_data.get(attr_name) - if value == None: + if not value: continue setattr(self.attributes, attr_name, value) diff --git a/src/commoncode/system.py b/src/commoncode/system.py index 0566a6c..ea6f7d3 100644 --- a/src/commoncode/system.py +++ b/src/commoncode/system.py @@ -113,12 +113,13 @@ def has_case_sensitive_fs(): case sensitive by default, newer macOS use APFS which is no longer case sensitive by default. - From https://developer.apple.com/library/archive/documentation/FileManagement/Conceptual/APFS_Guide/FAQ/FAQ.html - How does Apple File System handle filenames? - APFS accepts only valid UTF-8 encoded filenames for creation, and preserves - both case and normalization of the filename on disk in all variants. APFS, - like HFS+, is case-sensitive on iOS and is available in case-sensitive and - case-insensitive variants on macOS, with case-insensitive being the default. + From + https://developer.apple.com/library/archive/documentation/FileManagement/Conceptual/APFS_Guide/FAQ/FAQ.html + How does Apple File System handle filenames? + APFS accepts only valid UTF-8 encoded filenames for creation, and preserves + both case and normalization of the filename on disk in all variants. APFS, + like HFS+, is case-sensitive on iOS and is available in case-sensitive and + case-insensitive variants on macOS, with case-insensitive being the default. """ return not os.path.exists(__file__.upper()) diff --git a/src/commoncode/text.py b/src/commoncode/text.py index ab56b0a..04df60f 100644 --- a/src/commoncode/text.py +++ b/src/commoncode/text.py @@ -38,7 +38,7 @@ def lines(s): splitlines. """ # FIXME: leverage new Pythin 3.8 scopeing rules - return [l.strip() for l in s.splitlines() if l.strip()] + return [line.strip() for line in s.splitlines() if line.strip()] def foldcase(text): diff --git a/src/commoncode/timeutils.py b/src/commoncode/timeutils.py index bab8dad..45caf59 100644 --- a/src/commoncode/timeutils.py +++ b/src/commoncode/timeutils.py @@ -83,7 +83,7 @@ def tstamp2time(stamp): # deal with optional microsec try: microsec = dt_ms[1] - except: + except Exception: microsec = None if microsec: microsec = int(microsec) diff --git a/src/commoncode/version.py b/src/commoncode/version.py index a99ac8d..70746e8 100644 --- a/src/commoncode/version.py +++ b/src/commoncode/version.py @@ -228,7 +228,8 @@ def get_nupkg_nv(filename): """ Return a NameVersion tuple parsed from the .nupkg NuGet archive `filename`. - For example (taken from https://stackoverflow.com/questions/51662737/regex-to-parse-package-name-and-version-number-from-nuget-package-filenames/51662926): + For example (taken from + https://stackoverflow.com/questions/51662737/regex-to-parse-package-name-and-version-number-from-nuget-package-filenames/51662926): >>> get_nupkg_nv('knockoutjs.3.4.2.nupkg') NameVersion(name='knockoutjs', version='3.4.2') >>> get_nupkg_nv('log4net.2.0.8.nupkg') diff --git a/tests/test_hash.py b/tests/test_hash.py index 27d2865..52e36bf 100644 --- a/tests/test_hash.py +++ b/tests/test_hash.py @@ -6,10 +6,12 @@ # See https://aboutcode.org for more information about nexB OSS projects. # +import hashlib import os from commoncode.hash import b64sha1 from commoncode.hash import checksum +from commoncode.hash import checksum_from_chunks from commoncode.hash import get_hasher from commoncode.hash import md5 from commoncode.hash import multi_checksums @@ -174,3 +176,21 @@ def test_sha1_git_checksum(self): test_file = self.get_test_loc(test_file) # test that we match the git hash-object assert sha1_git(test_file) == expected_sha1_git + + def test_checksum_from_chunks_can_stream_gigabytes(self): + chunk_16mb = b"0" * 16000000 + chunks_3dot2gb = (chunk_16mb for _ in range(200)) + result = checksum_from_chunks( + chunks=chunks_3dot2gb, total_length=16000000 * 200, name="sha1_git" + ) + assert result == "494caf26c43c4473f6e930b0f5c2ecf8121bcf24" + + def test_checksum_from_chunks_from_stream_is_same_as_plain(self): + chunk = b"0" * 16000 + chunks = (chunk for _ in range(100)) + result1 = checksum_from_chunks(chunks=chunks, name="sha256") + + result2 = hashlib.sha256() + for _ in range(100): + result2.update(chunk) + assert result1 == result2.hexdigest() diff --git a/tests/test_skeleton_codestyle.py b/tests/test_skeleton_codestyle.py deleted file mode 100644 index 95fcb9f..0000000 --- a/tests/test_skeleton_codestyle.py +++ /dev/null @@ -1,36 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# ScanCode is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/nexB/skeleton for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import configparser -import subprocess -import unittest - - -class BaseTests(unittest.TestCase): - def test_skeleton_codestyle(self): - """ - This test shouldn't run in proliferated repositories. - """ - setup_cfg = configparser.ConfigParser() - setup_cfg.read("setup.cfg") - if setup_cfg["metadata"]["name"] != "skeleton": - return - - args = "venv/bin/black --check -l 100 setup.py etc tests" - try: - subprocess.check_output(args.split()) - except subprocess.CalledProcessError as e: - print("===========================================================") - print(e.output) - print("===========================================================") - raise Exception( - "Black style check failed; please format the code using:\n" - " python -m black -l 100 setup.py etc tests", - e.output, - ) from e