Skip to content

Commit a1c10cd

Browse files
committed
fix multiple author parsing #53
1 parent 8df6132 commit a1c10cd

File tree

2 files changed

+62
-29
lines changed

2 files changed

+62
-29
lines changed

codemeta/common.py

Lines changed: 41 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -531,13 +531,14 @@ def add_triple(g: Graph, res: Union[URIRef, BNode],key, value, args: AttribDict,
531531

532532
def add_authors(g: Graph, res: Union[URIRef, BNode], value, property=SDO.author, single_author = False, **kwargs):
533533
"""Parse and add authors and their e-mail addresses"""
534+
534535
if single_author:
535-
names = [value.strip()]
536+
names = [value.strip()] #mails/urls/affiliations may be tucked away with the name, will be extracted later
536537
mails = [ kwargs.get('mail',"") ]
537538
urls = [ kwargs.get('url',"") ]
538539
orgs = [ kwargs.get('organization',"") ]
539540
else:
540-
names = value.strip().split(",")
541+
names = list(split_comma_rfc822(value.strip())) #mails/urls/affiliations may be tucked away with the name, will be extracted later
541542
mails = kwargs.get("mail","").strip().split(",")
542543
urls = kwargs.get('url',"").strip().split(",")
543544
orgs = kwargs.get('organization',"").strip().split(",")
@@ -561,7 +562,7 @@ def add_authors(g: Graph, res: Union[URIRef, BNode], value, property=SDO.author,
561562
org = None
562563

563564
if not mail:
564-
#mails and urls may be tucked away with the name
565+
#mails, urls and affiliations may be tucked away with the name
565566
# npm allows strings like "Barney Rubble <b@rubble.com> (http://barnyrubble.tumblr.com/)"
566567
# we do the same, alternatively we allow affiliations:
567568
# "Barney Rubble <b@rubble.com> (Barney's Chocolate Factory)"
@@ -576,7 +577,7 @@ def add_authors(g: Graph, res: Union[URIRef, BNode], value, property=SDO.author,
576577
else:
577578
org = extra
578579

579-
firstname, lastname = parse_human_name(name.strip())
580+
firstname, lastname = parse_human_name(name.strip(" \"\t\n")) #this also strips quotes that might be here for RFC-822-like entries
580581

581582
author = URIRef(generate_uri(firstname + "-" + lastname, kwargs.get('baseuri'), prefix="person"))
582583
if skip_duplicates:
@@ -600,7 +601,7 @@ def add_authors(g: Graph, res: Union[URIRef, BNode], value, property=SDO.author,
600601
if org:
601602
orgres = URIRef(generate_uri(org, kwargs.get('baseuri'), prefix="org"))
602603
g.add((orgres, RDF.type, SDO.Organization))
603-
g.add((orgres, SDO.name, Literal(org.strip())))
604+
g.add((orgres, SDO.name, Literal(org.strip("() ")))) # needed to cleanup after the regexp and to prevent other accidents
604605
g.add((author, SDO.affiliation, orgres))
605606

606607
if property in ORDEREDLIST_PROPERTIES:
@@ -692,6 +693,41 @@ def get_last_component(uri):
692693
else:
693694
return uri[index+1:]
694695

696+
def split_comma_rfc822(s):
    """Split a comma-separated list of authors with optional mail addresses.

    A comma only separates two entries when it is not enclosed in double
    quotes or in one of the bracket pairs ``()``, ``[]``, ``<>``, ``{}``, so
    entries such as ``"Doe, John" <john@example.org>`` or
    ``John Doe (Foo, Inc.)`` stay in one piece.

    Yields each entry with surrounding whitespace stripped. Empty entries
    between two adjacent commas are yielded as empty strings; nothing is
    yielded after a trailing comma or for an empty input string.
    """
    closers = {")": "(", "]": "[", ">": "<", "}": "{"}
    openers = frozenset(closers.values())
    stack = []  # currently open quote/bracket characters, innermost last
    begin = 0  # start index of the entry currently being scanned
    for i, c in enumerate(s):
        if stack and stack[-1] == '"':
            # Inside a quoted string everything is literal until the closing
            # quote; brackets in there must not open a new nesting level.
            if c == '"':
                stack.pop()
        elif c == "," and not stack:
            yield s[begin:i].strip()
            begin = i + 1
        elif c == '"' or c in openers:
            stack.append(c)
        elif c in closers:
            # Only a closer matching the innermost opener pops it; stray
            # closers are ignored.
            if stack and stack[-1] == closers[c]:
                stack.pop()

    if begin < len(s):
        yield s[begin:].strip()
727+
728+
729+
730+
695731
def reconcile(g: Graph, res: URIRef, args: AttribDict):
696732
"""Reconcile possible conflicts in the graph and issue warnings."""
697733
IDENTIFIER = g.value(res, SDO.identifier)

codemeta/parsers/python.py

Lines changed: 21 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -301,26 +301,24 @@ def parse_python(
301301
baseuri=args.baseuri,
302302
)
303303
elif key == "Author" and pkg: # distutils
304+
only_mail = pkg.metadata.get("Author-email", "")
305+
if not re.match(r'^[\w._%+-]+@[\w.-]+(\.[\w]+)+$',only_mail):
306+
only_mail = ""
304307
add_authors(
305308
g,
306309
res,
307310
value,
308311
single_author=args.single_author,
309-
mail=pkg.metadata.get("Author-email", ""),
312+
mail=only_mail,
310313
baseuri=args.baseuri,
311314
)
312315
elif key == "Author-email" and pkg: # importlib.metadata
316+
#this contains both the name and the email (name <email>) and may contain multiple such pairs separated by a comma
313317
add_authors(
314318
g,
315319
res,
316-
value=(pkg.metadata.get("Author-email", "").rsplit("<")[0])
317-
.strip()
318-
.removeprefix('"')
319-
.removesuffix('"'),
320-
single_author=True,
321-
mail=pkg.metadata.get("Author-email", "")
322-
.rsplit("<")[-1]
323-
.rstrip(">"),
320+
value,
321+
single_author=False,
324322
baseuri=args.baseuri,
325323
)
326324
elif key == "maintainers" and isinstance(value, (list, tuple)): # pyproject
@@ -338,29 +336,28 @@ def parse_python(
338336
baseuri=args.baseuri,
339337
)
340338
elif key == "Maintainer" and pkg: # distutils
339+
only_mail = pkg.metadata.get("Maintainer-email", "")
340+
if not re.match(r'^[\w._%+-]+@[\w.-]+(\.[\w]+)+$',only_mail):
341+
only_mail = ""
341342
add_authors(
342343
g,
343344
res,
344345
value,
345346
property=SDO.maintainer,
346-
single_author=args.single_author,
347-
mail=pkg.metadata.get("Maintainer-email", ""),
347+
single_author=False,
348+
mail=only_mail,
348349
baseuri=args.baseuri,
349350
)
350351
elif key == "Maintainer-email" and pkg: # importlib.metadata
351-
add_authors(
352-
g,
353-
res,
354-
value=(pkg.metadata.get("Maintainer-email", "").rsplit("<")[0])
355-
.strip()
356-
.removeprefix('"')
357-
.removesuffix('"'),
358-
single_author=True,
359-
mail=pkg.metadata.get("Maintainer-email", "")
360-
.rsplit("<")[-1]
361-
.rstrip(">"),
362-
baseuri=args.baseuri,
363-
)
352+
#this contains both the name and the email (name <email>) and may contain multiple such pairs separated by a comma
353+
add_authors(
354+
g,
355+
res,
356+
value,
357+
property=SDO.maintainer,
358+
single_author=False,
359+
baseuri=args.baseuri,
360+
)
364361
elif key == "Project-URL":
365362
if "," in value:
366363
label, url = value.split(",", 1) # according to spec

0 commit comments

Comments
 (0)