diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000..52e191d --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,42 @@ +name: "CodeQL" + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + schedule: + # 17:00 on Friday (UTC) + - cron: "00 17 * * 5" + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + security-events: write + + strategy: + fail-fast: false + matrix: + language: [ 'python' ] + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: ${{ matrix.language }} + queries: +security-and-quality + + - name: Autobuild + uses: github/codeql-action/autobuild@v3 + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:${{ matrix.language }}" diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000..b049b7c --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,43 @@ +name: CI + +on: + push: + branches: [ "master", "ci-*" ] + pull_request: + branches: [ "master" ] + schedule: + # 17:00 on Friday (UTC) + - cron: "00 17 * * 5" + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13'] + fail-fast: false + + steps: + - uses: actions/checkout@v4 + - name: Set up python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --editable '.[develop,docs]' + shell: bash + + - name: Check linting and type annotations + run: | + python -m flake8 src tests + python -m mypy src tests + shell: bash + + - name: Run tests + if: success() || failure() + run: | + python -m pytest -v -s src tests + shell: bash diff --git a/readthedocs.yml b/.readthedocs.yml similarity index 74% rename from readthedocs.yml rename to .readthedocs.yml index 0c49bb5..140fc04 100644 --- a/readthedocs.yml +++ b/.readthedocs.yml @@ -1,5 +1,3 @@ -formats: - - none build: image: latest python: @@ -7,5 +5,7 @@ python: # Note that pip_install is buggy, but setup_py_install will not take into # account any dependencies from setup.py. *All* dependencies must be # declared in docs/rtd_environment.yml - setup_py_install: true - pip_install: false + #setup_py_install: true + pip_install: true + extra_requirements: + - dev diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 3dfbdcc..0000000 --- a/.travis.yml +++ /dev/null @@ -1,17 +0,0 @@ -# Config file for automatic testing at travis-ci.org - -language: python -python: - - 3.6 - - 3.5 - - 3.4 - - 3.3 - -install: - - pip install -e .[dev] - - pip install coveralls - -script: - - py.test --doctest-modules --cov=doi src tests -after_success: - - coveralls diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..dd8885f --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,7 @@ +v0.2 +==== + +- Remove support for python 3.4 and lower. +- Add type annotations. +- Simplify `validate_doi` to just raise a `404` error in case + something went wrong. diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 1c1de3d..4886204 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -13,7 +13,7 @@ Types of Contributions Report Bugs ~~~~~~~~~~~ -Report bugs at https://github.com/alejandrogallo/doi/issues. +Report bugs at https://github.com/papis/python-doi/issues. If you are reporting a bug, please include: @@ -37,7 +37,7 @@ articles, and such. Submit Feedback ~~~~~~~~~~~~~~~ -The best way to send feedback is to file an issue at https://github.com/alejandrogallo/doi/issues. +The best way to send feedback is to file an issue at https://github.com/papis/python-doi/issues. If you are proposing a feature: @@ -51,7 +51,7 @@ Get Started! In short, -1. Clone the repository from ``git@github.com:alejandrogallo/doi.git`` +1. Clone the repository from ``git@github.com:papis/python-doi.git`` 2. Fork the repo on GitHub to your personal account. 3. Add your fork as a remote. 4. Pull in the latest changes from the master branch. @@ -77,6 +77,6 @@ Before you submit a pull request, check that it meets these guidelines: 2. If the pull request adds functionality, the docs should be updated. Put your new functionality into a function with a docstring, and add the feature to the list in README.rst. -3. Check https://travis-ci.org/alejandrogallo/doi/pull_requests +3. Check https://travis-ci.org/papis/python-doi/pull_requests and make sure that the tests pass for all supported Python versions. diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 2d4499e..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,11 +0,0 @@ -graft src -include AUTHORS.rst -include CONTRIBUTING.rst -include LICENSE -include README.rst - -recursive-include tests * -recursive-exclude * __pycache__ -recursive-exclude * *.py[co] - -recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif diff --git a/Makefile b/Makefile index c300533..7e98982 100644 --- a/Makefile +++ b/Makefile @@ -1,460 +1,30 @@ +PYTHON?=python -X dev -# File: common-makefile/src/version.m4 -MAKEFILE_VERSION = v0.0.1-21-g392d792 -MAKEFILE_DATE = 31-07-2017 15:47 -MAKEFILE_AUTHOR = Alejandro Gallo -MAKEFILE_URL = https://github.com/alejandrogallo/python-makefile -MAKEFILE_LICENSE = GPLv3 +all: help - - - -## < /dev/null) -# If messages should have color -WITH_COLOR ?= 1 - -ifneq ($(strip $(QUIET)),0) -FD_OUTPUT = 2>&1 > /dev/null -else -FD_OUTPUT = -endif - -ifdef DEBUG -DBG_FLAG = -DBG_FILE ?= .makefile-dbg -$(shell date | $(SED) "p; s/./=/g" > $(DBG_FILE)) -else -DBG_FLAG = @ -DBG_FILE = -endif - -define log-debug ->> $(or $(DBG_FILE),/dev/null) echo -endef - -# Print commands like [CMD] -define print-cmd-name -"[$(COLOR_LB) \ -$(shell \ - if test "$(1)" = g++; then \ - echo -n GXX; \ - elif test "$(1)" = gcc; then \ - echo -n GCC; \ - elif test "$(1)" = icc; then \ - echo -n ICC; \ - elif test "$(1)" = cc; then \ - echo -n CC; \ - elif test "$(1)" = povray; then \ - echo -n POV; \ - elif test "$(1)" = perl; then \ - echo -n PL; \ - elif test "$(1)" = perl5; then \ - echo -n PL5; \ - elif test "$(1)" = ruby; then \ - echo -n RB; \ - elif test "$(1)" = ruby2; then \ - echo -n RB2; \ - elif test "$(1)" = python; then \ - echo -n PY; \ - elif test "$(1)" = python2; then \ - echo -n PY2; \ - elif test "$(1)" = python3; then \ - echo -n PY3; \ - elif test "$(1)" = pdflatex; then \ - echo -n pdfTeX; \ - elif test "$(1)" = bash; then \ - echo -n BASH; \ - elif test "$(1)" = gnuplot; then \ - echo -n GPT; \ - elif test "$(1)" = mupdf; then \ - echo -n muPDF; \ - else \ - echo -n "$(1)" | tr a-z A-Z ; \ - fi -)\ -$(COLOR_E)]" -endef - -ifndef QQUIET - -ifeq ($(strip $(WITH_COLOR)),1) -# Red -COLOR_R ?= $(if $(TPUT),$(shell $(TPUT) setaf 1),"\033[0;31m") -# Green -COLOR_G ?= $(if $(TPUT),$(shell $(TPUT) setaf 2),"\033[0;32m") -# Yellow -COLOR_Y ?= $(if $(TPUT),$(shell $(TPUT) setaf 3),"\033[0;33m") -# Dark blue -COLOR_DB ?= $(if $(TPUT),$(shell $(TPUT) setaf 4),"\033[0;34m") -# Lila -COLOR_L ?= $(if $(TPUT),$(shell $(TPUT) setaf 5),"\033[0;35m") -# Light blue -COLOR_LB ?= $(if $(TPUT),$(shell $(TPUT) setaf 6),"\033[0;36m") -# Empty color -COLOR_E ?= $(if $(TPUT),$(shell $(TPUT) sgr0),"\033[0m") -ARROW ?= @echo "$(COLOR_L)===>$(COLOR_E)" -else -ARROW ?= @echo "===>" -endif #WITH_COLOR - -ECHO ?= @echo - -else -ARROW := @ > /dev/null echo -ECHO := @ > /dev/null echo -endif #QQUIET - - - - - - -# File: ctags.m4 - - -# ==================================== -# Ctags generation for latex documents -# ==================================== -# -# Generate a tags file so that you can navigate through the tags using -# compatible editors such as emacs or (n)vi(m). -# -tags: ## Create python exhuberant ctags - $(CTAGS) --language-force=python -R * - - - -# File: install.m4 - - -# Old-style requirements file -REQUIREMENTS ?= requirements.txt -# Command to be run when make `install` is run -INSTALL_COMMAND ?= $(PYTHON) setup.py install -# Command to be run when make `install-local` is run -INSTALL_LOCAL_COMMAND ?= $(PYTHON) setup.py install --user -# Command to be run when make `install-dev` is run -INSTALL_DEV_COMMAND ?= $(PYTHON) setup.py develop -# Command to be run when make `install-dev-local` is run -INSTALL_DEV_LOCAL_COMMAND ?= $(PYTHON) setup.py develop --user -# Command to be run when make `uninstall` is run -UNINSTALL_COMMAND ?= $(PIP) uninstall $(shell $(PYTHON) setup.py --name) -# Command to be run when make `install-deps` is run -INSTALL_DEPS_COMMAND ?= $(PIP) install -r requirements.txt -# Command to be run when make `install-deps-local` is run -INSTALL_DEPS_LOCAL_COMMAND ?= $(PIP) install --user -r requirements.txt -install-dev-local: ## Install developement version locally - $(ARROW) Installing development version locally - $(DBG_FLAG)$(INSTALL_DEV_LOCAL_COMMAND) - -install-dev: ## Install developement version - $(ARROW) Installing development version - $(DBG_FLAG)$(INSTALL_DEV_COMMAND) - -install-local: ## Install the package locally - $(ARROW) Installing locally - $(DBG_FLAG)$(INSTALL_LOCAL_COMMAND) - -install: ## Install the package - $(ARROW) Installing... - $(DBG_FLAG)$(INSTALL_COMMAND) - -uninstall: ## Uninstall the package - $(ARROW) Uninstalling... - $(DBG_FLAG)$(UNINSTALL_COMMAND) - -install-deps-local: ## Install python requirements locally - $(ARROW) Installing dependencies... - $(DBG_FLAG)$(INSTALL_DEPS_LOCAL_COMMAND) - -install-deps: ## Install python requirements - $(ARROW) Installing dependencies... - $(DBG_FLAG)$(INSTALL_DEPS_COMMAND) - - - -# File: lint.m4 - - -# Linter program -PY_LINTER ?= flake8 -# ============ -# Check syntax -# ============ -# -# It checks the syntax (lints) of all the tex sources using the program in the -# TEX_LINTER variable. -# -lint: ## Check syntax of sources - $(PY_LINTER) - - - -# File: doc.m4 - - -doc: ## Create documentation - make -C doc/ html - -doc-%: - make -C doc/ $* - -update-gh-pages: ## Update github pages - @echo "Warning: Black magic in action" - git push origin $$(git subtree split --prefix doc/build/html/ master):gh-pages --force - - - - -# File: test.m4 - - -# Command to run for `make test` -TEST_COMMAND ?= $(PYTHON) setup.py test -test: ## Run the tests - $(DBG_FLAG)$(TEST_COMMAND) - - - -# File: virtualenv.m4 - - -ENV ?= -ENV_FOLDER ?= env -ENV_PIP ?= $(ENV_FOLDER)/bin/pip -ENV_PYTHON ?= $(ENV_FOLDER)/bin/python -VIRTUALENV ?= virtualenv - -ifdef ENV -PYTHON = $(ENV_PYTHON) -PIP = $(ENV_PIP) -DEPENDENCIES += virtualenv -DIST_DEPENDENCIES += virtualenv -endif - -virtualenv: $(ENV_FOLDER) ## Create the python virtual environment -$(ENV_FOLDER): - $(ARROW) "Creating virtual environment in '$(ENV_FOLDER)' \ - with python executable '$(PYTHON)'" - $(DBG_FLAG)$(VIRTUALENV) -p $(PYTHON) $(ENV_FOLDER) - - - - -# File: common-makefile/src/update.m4 - - -MAKEFILE_UPDATE_URL ?= https://raw.githubusercontent.com/alejandrogallo/python-makefile/master/dist/Makefile - - -# =============================== -# Update the makefile from source -# =============================== -# -# You can always get the latest `Makefile` version using this target. You may -# override the `MAKEFILE_UPDATE_URL` to any path where you save your own -# personal makefile -# -update: ## Update the makefile from the repository - $(ARROW) "Getting makefile from $(MAKEFILE_UPDATE_URL)" - $(DBG_FLAG)wget $(MAKEFILE_UPDATE_URL) -O Makefile - - - - -# File: common-makefile/src/clean.m4 - - -# Remove command flags -RM_FLAGS ?= -rf - -# Default clean file to be cleaned -DEFAULT_CLEAN_FILES ?= - -# Files to be cleaned -CLEAN_FILES ?= $(DEFAULT_CLEAN_FILES) - -# ============= -# Main cleaning -# ============= -# -# This does a main cleaning of the produced auxiliary files. Before using it -# check which files are going to be cleaned up. -# -clean: ## Remove build and temporary files - $(ARROW) Cleaning up... - $(DBG_FLAG) {\ - for file in $(CLEAN_FILES); do \ - test -e $$file && { \ - $(RM) $(RM_FLAGS) $$file && \ - echo $(call print-cmd-name,RM) "$$file";\ - } || : ; \ - done \ - } - - - - -# File: common-makefile/src/print-variable.m4 - - -# This is used for printing defined variables from Some other scripts. For -# instance if you want to know the value of the `PDF_VIEWER` defined in the -# Makefile, then you would do -# ``` -# make print-PDF_VIEWER -# ``` -# and this would output `PDF_VIEWER=mupdf` for instance. -FORCE: -print-%: - $(DBG_FLAG)echo '$*=$($*)' - -# ===================================== -# Print a variable used by the Makefile -# ===================================== -# -# For debugging purposes it is useful to print out some variables that the -# makefile is using, for that just type `make print` and you will be prompted -# to insert the name of the variable that you want to know. -# -FORCE: -print: ## Print a variable - $(DBG_FLAG)read -p "Variable to print: " variable && \ - $(MAKE) --no-print-directory print-$$variable - - - - -# File: common-makefile/src/help.m4 - - - -# ================ -# Print quick help -# ================ -# -# It prints a quick help in the terminal -help: ## Prints help for targets with comments - $(DBG_FLAG)$(or $(AWK),awk) ' \ - BEGIN {FS = ":.*?## "}; \ - /^## *<=1.10", +] + +[project] +name = "python-doi" +version = "0.2.0" +description = "Python package to work with Document Object Identifiers (DOIs)" +readme = "README.rst" +keywords = [ + "doi", +] +license = { text = "GPL-3.0-or-later" } +maintainers = [{ name = "Alejandro Gallo", email = "aamsgallo@gmail.com" }] +authors = [{ name = "Alejandro Gallo", email = "aamsgallo@gmail.com" }] +requires-python = ">=3.8" +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", + "Natural Language :: English", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Topic :: Utilities", +] + +[project.optional-dependencies] +develop = [ + "flake8", + "flake8-bugbear", + "Flake8-pyproject", + "flake8-quotes", + "mypy>=0.7", + "pep8-naming", + "pytest", + "pytest-cov", + "python-coveralls", +] +docs = [ + "sphinx>=4", + "sphinx_rtd_theme>=1", +] +# For solving client-side challenges on DDoS-protected sites +# (eg those using CloudFlare) +challenges = [ + "cloudscraper", +] + +[project.urls] +Repository = "https://github.com/papis/python-doi" + +[tool.hatch.build.targets.sdist] +exclude = [".github", "docs/build"] + +[tool.hatch.build.targets.wheel] +packages = ["src/doi"] + +[tool.flake8] +select = ["B", "D", "E", "F", "N", "Q", "W"] +extend-ignore = ["B019", "E123", "N818", "W503"] +max-line-length = 88 +inline-quotes = "double" +multiline-quotes = "double" + +[tool.pytest.ini_options] +addopts = [ + "--doctest-modules", + "--cov=src/doi", + "--ignore=docs", +] +markers = [ + "net: marks tests that call use the net" +] + +[tool.mypy] +strict = true +show_column_numbers = true +hide_error_codes = false +pretty = true +warn_unused_ignores = false diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index d7fc0e9..0000000 --- a/setup.cfg +++ /dev/null @@ -1,9 +0,0 @@ -[bdist_wheel] -universal = 1 - -[flake8] -exclude = docs - -[tool:pytest] -collect_ignore = ['setup.py'] - diff --git a/setup.py b/setup.py deleted file mode 100644 index 06915f0..0000000 --- a/setup.py +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -"""The setup script.""" - -from setuptools import setup, find_packages - - -def get_version(filename): - """Extract the package version""" - with open(filename) as in_fh: - for line in in_fh: - if line.startswith('__version__'): - return line.split('=')[1].strip()[1:-1] - raise ValueError("Cannot extract version from %s" % filename) - - -with open('README.rst') as readme_file: - readme = readme_file.read() - -requirements = [] - -dev_requirements = [ - 'coverage', 'pytest', 'pytest-cov==2.5.0', 'twine', 'pep8', - 'flake8', 'wheel', 'sphinx', 'sphinx-autobuild', 'sphinx_rtd_theme'] - -version = get_version('./src/doi/__init__.py') - -setup( - author="Alejandro Gallo", - author_email='aamsgallo@gmail.com', - classifiers=[ - 'Development Status :: 2 - Pre-Alpha', - 'Intended Audience :: Developers', - 'License :: OSI Approved :: GNU General Public License v3 (GPLv3)', - 'Natural Language :: English', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - ], - description="Python package to work with Document Object Identifier (doi)", - install_requires=requirements, - extras_require={ - 'dev': dev_requirements, - }, - license="GNU General Public License v3", - long_description=readme, - include_package_data=True, - keywords='doi', - name='python-doi', - packages=find_packages(where="src"), - package_dir={"": "src"}, - url='https://github.com/alejandrogallo/doi', - version=version, - zip_safe=False, -) diff --git a/src/doi/__init__.py b/src/doi/__init__.py index 24b69b8..cd19507 100644 --- a/src/doi/__init__.py +++ b/src/doi/__init__.py @@ -1,113 +1,97 @@ import re +import sys import logging +from typing import Optional -__version__ = '0.1.0' +__version__ = "0.2.0" +logger = logging.getLogger("doi") # type: logging.Logger -logger = logging.getLogger("doi") - -def pdf_to_doi(filepath, maxlines=float('inf')): - """Try to get doi from a filepath, it looks for a regex in the binary - data and returns the first doi found, in the hopes that this doi +def pdf_to_doi(filepath: str, maxlines: Optional[int] = None) -> Optional[str]: + """Try to get DOI from a filepath. It looks for a regex in the binary + data and returns the first DOI found, in the hopes that this DOI is the correct one. - :param filepath: Path to the pdf file - :type filepath: str + :param filepath: Path to the pdf file. :param maxlines: Maximum number of lines that should be checked - For some documnets, it would spend a long time trying to look for - a doi, and dois in the middle of documents don't tend to be the correct - doi of the document. - :type maxlines: int - :returns: DOI or None - :rtype: str or None + For some documents, it could spend a long time trying to look for + a DOI, and DOIs in the middle of documents don't tend to be the correct + DOI of the document. + :returns: DOI or ``None``. """ - with open(filepath, 'rb') as fd: + if maxlines is None: + maxlines = sys.maxsize + + with open(filepath, "rb") as fd: for j, line in enumerate(fd): - doi = find_doi_in_text(line.decode('ascii', errors='ignore')) + doi = find_doi_in_text(line.decode("ascii", errors="ignore")) if doi: return doi if j > maxlines: return None - else: - return None - - -def validate_doi(doi): - """We check that the DOI can be resolved by official means. If so, we - return the resolved URL, otherwise, we return None (which means the DOI is - invalid). + return None - http://www.doi.org/factsheets/DOIProxy.html - :param doi: Doi identificator - :type doi: str - :returns: It returns the url assigned to the doi if everything went right - :rtype: str +def validate_doi(doi: str) -> Optional[str]: + """We check that the DOI can be resolved by + `official means `_. If so, we + return the resolved URL, otherwise, we return ``None`` (which means the + DOI is invalid). - :raises ValueError: Whenever the doi is not valid + :param doi: Identifier. + :returns: The URL assigned to the DOI or ``None``. """ - from urllib.error import HTTPError, URLError + from urllib.error import HTTPError import urllib.request import urllib.parse import json url = "https://doi.org/api/handles/{doi}".format(doi=doi) - logger.debug('handle url %s' % url) + logger.debug("handle url %s", url) request = urllib.request.Request(url) try: result = json.loads(urllib.request.urlopen(request).read().decode()) - if 'values' in result: - url = [v['data']['value'] - for v in result['values'] if v.get('type') == 'URL'] - return url[0] if url else None except HTTPError: - raise ValueError('HTTP 404: DOI not found') - except URLError as e: - raise ValueError(e) - - response_code = int(result["responseCode"]) - if response_code in [1, 200]: - # HTTP 200 all ok - logger.debug('HTTP 200: valid doi') - elif response_code == 2: - raise ValueError('HTTP 500: Interal DOI server error') - elif response_code == 100: - raise ValueError('HTTP 404: DOI not found') + raise ValueError("HTTP 404: DOI not found") else: - raise ValueError('Something unexpected happened') + urls = [v["data"]["value"] + for v in result["values"] if v.get("type") == "URL"] + return urls[0] if urls else None -def get_clean_doi(doi): - """Check if doi is actually a url and in that case just get - the exact doi. +def get_clean_doi(doi: str) -> str: + """Check if the DOI is actually a URL and in that case just get + the exact DOI. - :doi: String containing a doi - :returns: The pure doi + :param doi: String containing a DOI. + :returns: The extracted DOI. """ - doi = re.sub(r'%2F', '/', doi) + doi = re.sub(r"%2F", "/", doi) # For pdfs - doi = re.sub(r'\)>', ' ', doi) - doi = re.sub(r'\)/S/URI', ' ', doi) - doi = re.sub(r'(/abstract)', '', doi) - doi = re.sub(r'\)$', '', doi) + doi = re.sub(r"\)>", " ", doi) + doi = re.sub(r"\)/S/URI", " ", doi) + doi = re.sub(r"(/abstract)", "", doi) + doi = re.sub(r"\)$", "", doi) return doi -def find_doi_in_text(text): - """ - Try to find a doi in a text +def find_doi_in_text(text: str) -> Optional[str]: + """Try to find a DOI in a text. + + :param text: Text in which to look for DOI. + :returns: A DOI, if found, otherwise ``None``. """ text = get_clean_doi(text) forbidden_doi_characters = r'"\s%$^\'<>@,;:#?&' # Sometimes it is in the javascript defined var_doi = re.compile( - r'doi(.org)?' - r'\s*(=|:|/|\()\s*' - r'("|\')?' - r'(?P[^{fc}]+)' - r'("|\'|\))?' + r"doi(.org)?" + r"\s*(=|:|/|\()\s*" + r"(\"|')?" + r"(?P[^{fc}]+)" + r"(\"|'|\))?" .format( fc=forbidden_doi_characters ), re.I @@ -118,20 +102,25 @@ def find_doi_in_text(text): try: m = next(miter) if m: - doi = m.group('doi') + doi = m.group("doi") return get_clean_doi(doi) except StopIteration: pass return None -def get_real_url_from_doi(doi): +def get_real_url_from_doi(doi: str) -> Optional[str]: + """Get a URL corresponding to a DOI. + + :param doi: Identifier. + :returns: A URL for the DOI. If the DOI is invalid, return ``None``. + """ url = validate_doi(doi) - if not url: + if url is None: return url - m = re.match('.*linkinghub\.elsevier.*/pii/([A-Z0-9]+).*', url, re.I) + m = re.match(r".*linkinghub\.elsevier.*/pii/([A-Z0-9]+).*", url, re.I) if m: - return ('https://www.sciencedirect.com/science/article/abs/pii/{pii}' + return ("https://www.sciencedirect.com/science/article/abs/pii/{pii}" .format(pii=m.group(1))) return url diff --git a/src/doi/py.typed b/src/doi/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_doi.py b/tests/test_doi.py index be2acce..1049b8b 100644 --- a/tests/test_doi.py +++ b/tests/test_doi.py @@ -1,87 +1,164 @@ -"""Tests for `doi` package.""" - import os -from pkg_resources import parse_version + +import requests +try: + import cloudscraper +except ImportError: + cloudscraper = None +from urllib.parse import urlparse, urlunparse +from warnings import warn + +import pytest from doi import ( - validate_doi, find_doi_in_text, __version__, pdf_to_doi, + validate_doi, find_doi_in_text, pdf_to_doi, get_real_url_from_doi ) -def test_valid_version(): - """Check that the package defines a valid __version__""" - assert parse_version(__version__) >= parse_version("0.1.0") - - -def test_validate_doi(): - data = [ - ('10.1063/1.5081715', - 'http://aip.scitation.org/doi/10.1063/1.5081715'), - ('10.1007%2FBF01451751', - 'http://link.springer.com/10.1007/BF01451751'), - ('10.1103/PhysRevLett.49.57', - 'https://link.aps.org/doi/10.1103/PhysRevLett.49.57'), - ('10.1080/14786442408634457', - 'https://www.tandfonline.com/doi/full/10.1080/14786442408634457'), - ('10.1021/jp003647e', 'https://pubs.acs.org/doi/10.1021/jp003647e'), - ('10.1016/S0009-2614(97)04014-1', - 'http://linkinghub.elsevier.com/retrieve/pii/S0009261497040141'), +def simplify_url(u): + return urlparse(u)._replace(query='', fragment='') + + +def resolve_redirects(u): + # Unconditionally upgrade to https, since some resolvers seem to require it + # If removed, it'd make sense to canonicalize in simplify_url instead to + # prevent spurious test failures + u = urlunparse(urlparse(u)._replace(scheme='https')) + + if cloudscraper: + scraper = cloudscraper.create_scraper() + return simplify_url(scraper.get(u).url) + + # Try emulating a browser to not get blocked + h = {'User-Agent': 'Mozilla/5.0'} + resp = requests.get(u, headers=h) + return simplify_url(resp.url) + + +def normalize_eq(u, v, expect_diff=False): + if u == v: + return True + if not expect_diff: + warn(f"{u} textually differs from {v}, please update the relevant case.\n" + "Attempting to recover by resolving redirects") + return (simplify_url(u) == simplify_url(v) + or resolve_redirects(u) == resolve_redirects(v) + ) + + +def listmin(param): + if isinstance(param, list): + return min(param) + return "" + + +@pytest.mark.net +@pytest.mark.parametrize( + "needs_cloudscraper, urls", ids=listmin, argvalues= + [ + (True, + ["http://pubs.aip.org/aip/jcp/article/150/7/074102/197572/Exact-two-component-equation-of-motion-coupled", # noqa: E501 + "http://pubs.aip.org/jcp/article/150/7/074102/197572/Exact-two-component-equation-of-motion-coupled", # noqa: E501 + "http://aip.scitation.org/doi/10.1063/1.5081715" + ]), + ] +) +def test_redirect(needs_cloudscraper, urls) -> None: + base = urls[0] + if needs_cloudscraper and cloudscraper is None: + pytest.skip(f"cloudscraper needed to solve CloudFlare challenge on {base}") + for other in urls[1:]: + assert normalize_eq(base, other, expect_diff=True) + + +@pytest.mark.net +@pytest.mark.parametrize( + "doi,url", + [ + ("10.1063/1.5081715", + "https://pubs.aip.org/jcp/article/150/7/074102/197572/Exact-two-component-equation-of-motion-coupled"), # noqa: E501 + ("10.1007%2FBF01451751", + "http://link.springer.com/10.1007/BF01451751"), + ("10.1103/PhysRevLett.49.57", + "https://link.aps.org/doi/10.1103/PhysRevLett.49.57"), + ("10.1080/14786442408634457", + "https://www.tandfonline.com/doi/full/10.1080/14786442408634457"), + ("10.1021/jp003647e", + "https://pubs.acs.org/doi/10.1021/jp003647e"), + ("10.1016/S0009-2614(97)04014-1", + "https://linkinghub.elsevier.com/retrieve/pii/S0009261497040141"), ] - for doi, url in data: - assert(url == validate_doi(doi)) - - for doi in ['', 'asdf']: - try: - validate_doi(doi) - except ValueError as e: - assert(str(e) == 'HTTP 404: DOI not found') - -def test_get_real_url_from_doi(): - data = [ - ('10.1016/S0009-2614(97)04014-1', - 'https://www.sciencedirect.com/science/' - 'article/abs/pii/S0009261497040141'), +) +def test_validate_doi(doi, url) -> None: + assert normalize_eq(url, validate_doi(doi)) + + +@pytest.mark.parametrize( + "doi", + [ + "", + "asdf" + ] +) +def test_validate_invalid_doi(doi) -> None: + try: + validate_doi(doi) + except ValueError as e: + assert str(e) == "HTTP 404: DOI not found" + + +@pytest.mark.net +@pytest.mark.parametrize( + "doi,url", + [ + ("10.1016/S0009-2614(97)04014-1", + "https://www.sciencedirect.com/science/" + "article/abs/pii/S0009261497040141"), ] - for doi, url in data: - assert(url == get_real_url_from_doi(doi)) - - -def test_find_doi_in_line(): - test_data = [ - ('http://dx.doi.org/10.1063/1.881498', '10.1063/1.881498'), - ('http://dx.doi.org/10.1063%2F1.881498', '10.1063/1.881498'), - (2*'qer '+'var doi = "12345/12345.3"', '12345/12345.3'), - (2*'qer '+"var doi = '12345/12345.3';fas", '12345/12345.3'), - (2*'qer '+"var DoI = 12345%2F12345.3", '12345/12345.3'), - (2*'qer '+"var DoI : 12345%2F12345.3", '12345/12345.3'), - ('http://scitation.org/doi/10.1063/1.881498', '10.1063/1.881498'), - ('org/doi(10.1063/1.881498)', '10.1063/1.881498'), - ('/scitation.org/doi/10.1063/1.881498?234saf=34', '10.1063/1.881498'), - ('/scitation.org/doi/10.1063/1.88149 8?234saf=34', '10.1063/1.88149'), - ('/scitation.org/doi/10.1063/1.uniau12?as=234', - '10.1063/1.uniau12'), - ('https://doi.org/10.1093/analys/anw053' , '10.1093/analys/anw053'), - ('http://.scitation.org/doi/10.1063/1.mart(88)1498?asdfwer' , - '10.1063/1.mart(88)1498'), - ('@ibook{doi:10.1002/9780470125915.ch2,', '10.1002/9780470125915.ch2'), - ('application/pdf' - 'doi:10.1063/1.5079474', - '10.1063/1.5079474'), - ('<(DOI:10.1002/9780470915.CH2)/S/URI,', '10.1002/9780470915.CH2'), - ('URL<(DOI:10.1002/9780470125915.CH2,', '10.1002/9780470125915.CH2'), - (r'A<>/' - r'Border[0 0 0]/M(D:20181022082356+0530)/Rect[147.40158 594.36926' - r'347.24957 605.36926]/Subtype/Link/Type/A', - '10.1016/j.comptc.2018.10.004'), - ('doi(10.1038/s41535-018-0103-6;)', '10.1038/s41535-018-0103-6'), + "doi:10.1063/1.5079474", + "10.1063/1.5079474"), + ("<(DOI:10.1002/9780470915.CH2)/S/URI,", "10.1002/9780470915.CH2"), + ("URL<(DOI:10.1002/9780470125915.CH2,", "10.1002/9780470125915.CH2"), + (r"A<>/" + r"Border[0 0 0]/M(D:20181022082356+0530)/Rect[147.40158 594.36926" + r"347.24957 605.36926]/Subtype/Link/Type/A", + "10.1016/j.comptc.2018.10.004"), + ("doi(10.1038/s41535-018-0103-6;)", "10.1038/s41535-018-0103-6"), ] - for url, doi in test_data: - assert(find_doi_in_text(url) == doi) +) +def test_find_doi_in_line(url, doi) -> None: + assert find_doi_in_text(url) == doi + +def test_doi_from_pdf() -> None: + f = os.path.join(os.path.dirname(__file__), "resources", "doc.pdf") -def test_doi_from_pdf(): - f = os.path.join(os.path.dirname(__file__), 'resources', 'doc.pdf') - assert(os.path.exists(f)) - assert(pdf_to_doi(f) == '10.1103/PhysRevLett.50.1998') + assert os.path.exists(f) + assert pdf_to_doi(f) == "10.1103/PhysRevLett.50.1998"