77from datetime import datetime
88from typing import Any
99
10- import doi
1110import requests
1211import unidecode
12+ from requests .exceptions import HTTPError
1313
1414from .logging import logger
1515
@@ -147,14 +147,20 @@ def check_url(url: str) -> bool:
147147
148148 try :
149149 response = requests .get (url , timeout = 30 )
150- return response .status_code == 200
150+ response .raise_for_status ()
151+ return True
151152 except Exception : # pragma: no cover
152153 return False
153154
154155
155156def is_doi (archive ) -> str | None :
156157 """Check if the DOI is valid and return the DOI link.
157158
159+ We check that the DOI can be resolved by
160+ `official means <http://www.doi.org/factsheets/DOIProxy.html>`_. If so, we
161+ return the resolved URL; otherwise, we return ``None`` (which means the
162+ DOI is invalid).
163+
158164 Parameters
159165 ----------
160166 archive : str
@@ -166,10 +172,32 @@ def is_doi(archive) -> str | None:
166172 The DOI link in the form `https://doi.org/10.1234/zenodo.12345678` or `None`
167173 if the DOI is invalid.
168174 """
175+ # If archive is a URL, extract the DOI record
176+ if archive .startswith ("http" ):
177+ match = re .search (
178+ r"https?://(?:dx\.)?doi\.org/(10\.\d{4,9}/[-._;()/:A-Z0-9]+)" ,
179+ archive ,
180+ re .IGNORECASE ,
181+ )
182+ doi = match .group (1 ) if match else archive
183+ else :
184+ doi = archive
185+ url = f"https://doi.org/api/handles/{ doi } "
186+
169187 try :
170- return doi .validate_doi (archive )
171- except ValueError :
172- pass
188+ response = requests .get (url , timeout = 30 )
189+ response .raise_for_status ()
190+ result = response .json ()
191+ except HTTPError :
192+ # HTTP 404: DOI not found
193+ return None
194+ else :
195+ urls = [
196+ v ["data" ]["value" ]
197+ for v in result ["values" ]
198+ if v .get ("type" ) == "URL"
199+ ]
200+ return urls [0 ] if urls else None
173201
174202
175203def clean_archive (archive ):
@@ -186,7 +214,7 @@ def clean_archive(archive):
186214 If the archive link is a URL, it will be returned as is with a check that
187215 it resolves but is not required to be a valid DOI. If the archive link is
188216 a DOI, it will be validated and returned as a URL in the form
189- `https://doi.org/10.1234/zenodo.12345678` using the `python-doi` package .
217+ `https://doi.org/10.1234/zenodo.12345678`.
190218
191219 """
192220 archive = archive .strip () # Remove leading/trailing whitespace
0 commit comments