From cfe03964db2e7f39116ce6dfa9529d8096d63fd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karl=20Kr=C3=A4gelin?= Date: Mon, 13 Jan 2025 10:35:09 +0100 Subject: [PATCH 1/9] accept both `http://d-nb.info/gnd/` and `https://d-nb.info/gnd/` as pasted GND URIs and normalize them --- idutils/normalizers.py | 6 ++++-- idutils/utils.py | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/idutils/normalizers.py b/idutils/normalizers.py index a4c8b07..aec6f95 100644 --- a/idutils/normalizers.py +++ b/idutils/normalizers.py @@ -54,8 +54,10 @@ def normalize_orcid(val): def normalize_gnd(val): """Normalize a GND identifier.""" - if val.startswith(gnd_resolver_url): - val = val[len(gnd_resolver_url) :] + if val.startswith("http://" + gnd_resolver_url): + val = val[len("http://" + gnd_resolver_url) :] + elif val.startswith("https://" + gnd_resolver_url): + val = val[len("https://" + gnd_resolver_url) :] if val.lower().startswith("gnd:"): val = val[len("gnd:") :] return "gnd:{0}".format(val) diff --git a/idutils/utils.py b/idutils/utils.py index 86f99c0..fe6bbc9 100644 --- a/idutils/utils.py +++ b/idutils/utils.py @@ -82,7 +82,7 @@ """ gnd_regexp = re.compile( - r"(gnd:|GND:)?(" + r"(gnd:|GND:|http://d-nb.info/gnd/|https://d-nb.info/gnd/)?(" r"(1|10)\d{7}[0-9X]|" r"[47]\d{6}-\d|" r"[1-9]\d{0,7}-[0-9X]|" @@ -91,7 +91,7 @@ ) """See https://www.wikidata.org/wiki/Property:P227.""" -gnd_resolver_url = "http://d-nb.info/gnd/" +gnd_resolver_url = "d-nb.info/gnd/" urn_resolver_url = "https://nbn-resolving.org/" From 17d8660a3626ac26086db5744528875ee34d86f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karl=20Kr=C3=A4gelin?= Date: Mon, 13 Jan 2025 10:59:52 +0100 Subject: [PATCH 2/9] re-use gnd_resolver_url var in regex --- idutils/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/idutils/utils.py b/idutils/utils.py index fe6bbc9..0a3ce2f 100644 --- a/idutils/utils.py +++ b/idutils/utils.py @@ -81,8 +81,10 @@ 
https://support.orcid.org/hc/en-us/articles/360006897674-Structure-of-the-ORCID-Identifier """ +gnd_resolver_url = "d-nb.info/gnd/" + gnd_regexp = re.compile( - r"(gnd:|GND:|http://d-nb.info/gnd/|https://d-nb.info/gnd/)?(" + rf"(gnd:|GND:|http://{re.escape(gnd_resolver_url)}|https://{re.escape(gnd_resolver_url)})?(" r"(1|10)\d{7}[0-9X]|" r"[47]\d{6}-\d|" r"[1-9]\d{0,7}-[0-9X]|" @@ -91,7 +93,6 @@ ) """See https://www.wikidata.org/wiki/Property:P227.""" -gnd_resolver_url = "d-nb.info/gnd/" urn_resolver_url = "https://nbn-resolving.org/" From 0aad9a435463c347949b42ec4b18e2c9ec4efcf3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karl=20Kr=C3=A4gelin?= Date: Wed, 29 Jan 2025 13:50:38 +0100 Subject: [PATCH 3/9] use the regex in the normalize function --- idutils/normalizers.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/idutils/normalizers.py b/idutils/normalizers.py index aec6f95..b1ef1ab 100644 --- a/idutils/normalizers.py +++ b/idutils/normalizers.py @@ -54,13 +54,8 @@ def normalize_orcid(val): def normalize_gnd(val): """Normalize a GND identifier.""" - if val.startswith("http://" + gnd_resolver_url): - val = val[len("http://" + gnd_resolver_url) :] - elif val.startswith("https://" + gnd_resolver_url): - val = val[len("https://" + gnd_resolver_url) :] - if val.lower().startswith("gnd:"): - val = val[len("gnd:") :] - return "gnd:{0}".format(val) + m = gnd_regexp.match(val) + return f"gnd:{m.group(2)}" def normalize_urn(val): From 5e2a5c90762c136a302f0ed0611b0af800e26f2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karl=20Kr=C3=A4gelin?= Date: Wed, 29 Jan 2025 13:51:16 +0100 Subject: [PATCH 4/9] utils: remove var and improve regex --- idutils/utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/idutils/utils.py b/idutils/utils.py index 0a3ce2f..1c77e75 100644 --- a/idutils/utils.py +++ b/idutils/utils.py @@ -81,10 +81,8 @@ https://support.orcid.org/hc/en-us/articles/360006897674-Structure-of-the-ORCID-Identifier """ 
-gnd_resolver_url = "d-nb.info/gnd/" - gnd_regexp = re.compile( - rf"(gnd:|GND:|http://{re.escape(gnd_resolver_url)}|https://{re.escape(gnd_resolver_url)})?(" + r"(gnd:|GND:|https?://d-nb\.info/gnd/)?(" r"(1|10)\d{7}[0-9X]|" r"[47]\d{6}-\d|" r"[1-9]\d{0,7}-[0-9X]|" From 0e8c06e918152407273c3581a778537b38ba2779 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karl=20Kr=C3=A4gelin?= Date: Wed, 29 Jan 2025 13:51:42 +0100 Subject: [PATCH 5/9] validators: don't use var --- idutils/validators.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/idutils/validators.py b/idutils/validators.py index df6e96f..37bf4fa 100644 --- a/idutils/validators.py +++ b/idutils/validators.py @@ -13,7 +13,6 @@ """Utility file containing ID validators.""" - import unicodedata from urllib.parse import urlparse @@ -237,8 +236,8 @@ def is_pmcid(val): def is_gnd(val): """Test if argument is a GND Identifier.""" - if val.startswith(gnd_resolver_url): - val = val[len(gnd_resolver_url) :] + if val.startswith("d-nb.info/gnd/"): + val = val[len("d-nb.info/gnd/") :] return gnd_regexp.match(val) From 6944dab63b678467b4ba8c5906a51cf85575672d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karl=20Kr=C3=A4gelin?= Date: Wed, 29 Jan 2025 14:48:05 +0100 Subject: [PATCH 6/9] utils: adapt to correct url match regex --- idutils/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/idutils/utils.py b/idutils/utils.py index 1c77e75..c7d461d 100644 --- a/idutils/utils.py +++ b/idutils/utils.py @@ -83,7 +83,7 @@ gnd_regexp = re.compile( r"(gnd:|GND:|https?://d-nb\.info/gnd/)?(" - r"(1|10)\d{7}[0-9X]|" + r"1[012]?\d{7}[0-9X]|" r"[47]\d{6}-\d|" r"[1-9]\d{0,7}-[0-9X]|" r"3\d{7}[0-9X]" From 8901e7c95b3afc21b44601b954a96b2e8d5cd9f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karl=20Kr=C3=A4gelin?= Date: Fri, 31 Jan 2025 07:42:05 +0100 Subject: [PATCH 7/9] validators: remove additional check --- idutils/validators.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/idutils/validators.py 
b/idutils/validators.py index 37bf4fa..e38a8fc 100644 --- a/idutils/validators.py +++ b/idutils/validators.py @@ -236,8 +236,6 @@ def is_pmcid(val): def is_gnd(val): """Test if argument is a GND Identifier.""" - if val.startswith("d-nb.info/gnd/"): - val = val[len("d-nb.info/gnd/") :] return gnd_regexp.match(val) From e8229a933fbdcdd79fa81fd865096ecd30379967 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karl=20Kr=C3=A4gelin?= Date: Fri, 31 Jan 2025 07:42:48 +0100 Subject: [PATCH 8/9] utils: improve regex to match for IDs without http prefix in order to remove additional check in validators --- idutils/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/idutils/utils.py b/idutils/utils.py index c7d461d..bdf26b8 100644 --- a/idutils/utils.py +++ b/idutils/utils.py @@ -82,7 +82,7 @@ """ gnd_regexp = re.compile( - r"(gnd:|GND:|https?://d-nb\.info/gnd/)?(" + r"(gnd:|GND:|https?://d-nb\.info/gnd/|d-nb\.info/gnd/)?(" r"1[012]?\d{7}[0-9X]|" r"[47]\d{6}-\d|" r"[1-9]\d{0,7}-[0-9X]|" From 2f042b987fb1c2d43aa5029ef752f29158f8778d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karl=20Kr=C3=A4gelin?= Date: Fri, 31 Jan 2025 08:16:05 +0100 Subject: [PATCH 9/9] validators: adhere to pydocstyle --- idutils/validators.py | 1 - 1 file changed, 1 deletion(-) diff --git a/idutils/validators.py b/idutils/validators.py index e38a8fc..d6d41d1 100644 --- a/idutils/validators.py +++ b/idutils/validators.py @@ -236,7 +236,6 @@ def is_pmcid(val): def is_gnd(val): """Test if argument is a GND Identifier.""" - return gnd_regexp.match(val)