@@ -531,13 +531,14 @@ def add_triple(g: Graph, res: Union[URIRef, BNode],key, value, args: AttribDict,
531531
532532def add_authors (g : Graph , res : Union [URIRef , BNode ], value , property = SDO .author , single_author = False , ** kwargs ):
533533 """Parse and add authors and their e-mail addresses"""
534+
534535 if single_author :
535- names = [value .strip ()]
536+ names = [value .strip ()] #mails/urls/affiliations may be tucked away with the name, will be extracted later
536537 mails = [ kwargs .get ('mail' ,"" ) ]
537538 urls = [ kwargs .get ('url' ,"" ) ]
538539 orgs = [ kwargs .get ('organization' ,"" ) ]
539540 else :
540- names = value .strip (). split ( "," )
541+ names = list ( split_comma_rfc822 ( value .strip ())) #mails/urls/affiliations may be tucked away with the name, will be extracted later
541542 mails = kwargs .get ("mail" ,"" ).strip ().split ("," )
542543 urls = kwargs .get ('url' ,"" ).strip ().split ("," )
543544 orgs = kwargs .get ('organization' ,"" ).strip ().split ("," )
@@ -561,7 +562,7 @@ def add_authors(g: Graph, res: Union[URIRef, BNode], value, property=SDO.author,
561562 org = None
562563
563564 if not mail :
564- #mails and urls may be tucked away with the name
565+ #mails, urls and affiliations may be tucked away with the name
565566 # npm allows strings like "Barney Rubble <[email protected] > (http://barnyrubble.tumblr.com/)" 566567 # we do the same, alternatively we allow affiliations:
567568 # "Barney Rubble <[email protected] > (Barney's Chocolate Factory)" @@ -576,7 +577,7 @@ def add_authors(g: Graph, res: Union[URIRef, BNode], value, property=SDO.author,
576577 else :
577578 org = extra
578579
579- firstname , lastname = parse_human_name (name .strip ())
580+ firstname , lastname = parse_human_name (name .strip (" \" \t \n " )) #this also strips quotes that might be here for RFC-822-like entries
580581
581582 author = URIRef (generate_uri (firstname + "-" + lastname , kwargs .get ('baseuri' ), prefix = "person" ))
582583 if skip_duplicates :
@@ -600,7 +601,7 @@ def add_authors(g: Graph, res: Union[URIRef, BNode], value, property=SDO.author,
600601 if org :
601602 orgres = URIRef (generate_uri (org , kwargs .get ('baseuri' ), prefix = "org" ))
602603 g .add ((orgres , RDF .type , SDO .Organization ))
603- g .add ((orgres , SDO .name , Literal (org .strip () )))
604+ g .add ((orgres , SDO .name , Literal (org .strip ("() " )))) # needed to cleanup after the regexp and to prevent other accidents
604605 g .add ((author , SDO .affiliation , orgres ))
605606
606607 if property in ORDEREDLIST_PROPERTIES :
@@ -692,6 +693,41 @@ def get_last_component(uri):
692693 else :
693694 return uri [index + 1 :]
694695
def split_comma_rfc822(s):
    """Split a comma-separated list of authors, RFC-822 style.

    Commas only act as separators at the top level: a comma inside a
    double-quoted string or inside ``()``, ``[]``, ``<>`` or ``{}`` is kept
    as part of the entry, so that e.g.
    ``"Rubble, Barney" <barney@example.org>, Fred (Slate, Inc.)`` yields two
    entries rather than four.

    Args:
        s: The string to split.

    Yields:
        Each entry with surrounding whitespace stripped. Quotes and brackets
        are left in place. Nothing is yielded for an empty string, and no
        trailing empty entry is yielded when the string ends with a comma.

    Note:
        Backslash-escaped quotes inside a quoted string are not interpreted.
        An unbalanced opening quote or bracket suppresses all further
        splitting; unmatched closing brackets are ignored.
    """
    closers = {")": "(", "]": "[", ">": "<", "}": "{"}
    openers = frozenset(closers.values())
    stack = []  # currently open quotes/brackets, innermost last
    begin = 0  # start offset of the entry being scanned
    for i, c in enumerate(s):
        if stack and stack[-1] == '"':
            # Inside a quoted string everything is literal until the closing
            # quote; brackets in there must not affect the nesting state.
            if c == '"':
                stack.pop()
        elif c == "," and not stack:
            yield s[begin:i].strip()
            begin = i + 1
        elif c == '"' or c in openers:
            stack.append(c)
        elif c in closers and stack and stack[-1] == closers[c]:
            # Only a closer matching the innermost opener pops the stack.
            stack.pop()

    if begin < len(s):
        yield s[begin:].strip()
727+
728+
729+
730+
695731def reconcile (g : Graph , res : URIRef , args : AttribDict ):
696732 """Reconcile possible conflicts in the graph and issue warnings."""
697733 IDENTIFIER = g .value (res , SDO .identifier )
0 commit comments