Skip to content

Commit 3d296b2

Browse files
authored
Improve DOI validation, remove dependency (#326)
1 parent f75ae6a commit 3d296b2

File tree

2 files changed

+34
-7
lines changed

2 files changed

+34
-7
lines changed

pyproject.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ dependencies = [
2727
"click",
2828
"feedparser",
2929
"pydantic>=2.0",
30-
"python-doi",
3130
"python-dotenv",
3231
"requests",
3332
"ruamel-yaml>=0.17.21",

src/pyosmeta/utils_clean.py

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@
77
from datetime import datetime
88
from typing import Any
99

10-
import doi
1110
import requests
1211
import unidecode
12+
from requests.exceptions import HTTPError
1313

1414
from .logging import logger
1515

@@ -147,14 +147,20 @@ def check_url(url: str) -> bool:
147147

148148
try:
149149
response = requests.get(url, timeout=30)
150-
return response.status_code == 200
150+
response.raise_for_status()
151+
return True
151152
except Exception: # pragma: no cover
152153
return False
153154

154155

155156
def is_doi(archive) -> str | None:
156157
"""Check if the DOI is valid and return the DOI link.
157158
159+
We check that the DOI can be resolved by
160+
`official means <http://www.doi.org/factsheets/DOIProxy.html>`_. If so, we
161+
return the resolved URL; otherwise, we return ``None`` (which means the
162+
DOI is invalid).
163+
158164
Parameters
159165
----------
160166
archive : str
@@ -166,10 +172,32 @@ def is_doi(archive) -> str | None:
166172
The DOI link in the form `https://doi.org/10.1234/zenodo.12345678` or `None`
167173
if the DOI is invalid.
168174
"""
175+
# If archive is a URL, extract the DOI record
176+
if archive.startswith("http"):
177+
match = re.search(
178+
r"https?://(?:dx\.)?doi\.org/(10\.\d{4,9}/[-._;()/:A-Z0-9]+)",
179+
archive,
180+
re.IGNORECASE,
181+
)
182+
doi = match.group(1) if match else archive
183+
else:
184+
doi = archive
185+
url = f"https://doi.org/api/handles/{doi}"
186+
169187
try:
170-
return doi.validate_doi(archive)
171-
except ValueError:
172-
pass
188+
response = requests.get(url, timeout=30)
189+
response.raise_for_status()
190+
result = response.json()
191+
except HTTPError:
192+
# HTTP error status (e.g. 404: DOI not found)
193+
return None
194+
else:
195+
urls = [
196+
v["data"]["value"]
197+
for v in result["values"]
198+
if v.get("type") == "URL"
199+
]
200+
return urls[0] if urls else None
173201

174202

175203
def clean_archive(archive):
@@ -186,7 +214,7 @@ def clean_archive(archive):
186214
If the archive link is a URL, it will be returned as is with a check that
187215
it resolves but is not required to be a valid DOI. If the archive link is
188216
a DOI, it will be validated and returned as a URL in the form
189-
`https://doi.org/10.1234/zenodo.12345678` using the `python-doi` package.
217+
`https://doi.org/10.1234/zenodo.12345678`.
190218
191219
"""
192220
archive = archive.strip() # Remove leading/trailing whitespace

0 commit comments

Comments
 (0)