Skip to content

Commit 2fe8e98

Browse files
fixed multiple creators and dates (#34)
* support multiple creators and fix dates
1 parent 5c4c975 commit 2fe8e98

File tree

2 files changed

+69
-7
lines changed

2 files changed

+69
-7
lines changed

caltechdata_api/md_to_json.py

Lines changed: 62 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,7 @@ def expand_special_keys(key, value):
3333
{
3434
"nameIdentifier": orcid,
3535
"nameIdentifierScheme": "ORCID",
36+
"schemeUri": f"https://orcid.org/{value}",
3637
}
3738
]
3839
return value
@@ -54,14 +55,21 @@ def parse_readme_to_json(readme_path):
5455
raise ValueError('README.md needs to start with "# Title"')
5556
else:
5657
json_data["titles"] = [{"title": title_line.replace("# ", "")}]
58+
59+
contributors = []
60+
identifiers = []
61+
item_list = []
5762

5863
section_pattern = re.compile(r"^##\s+(.*)$")
5964
key_value_pattern = re.compile(r"^-\s+(.*?):\s+(.*)$")
6065
link_pattern = re.compile(r"\[.*?\]\((.*?)\)")
6166

6267
for line_number, line in enumerate(lines, 1):
6368
if not line.strip():
64-
if current_object and current_section:
69+
if item_list and current_section:
70+
json_data[current_section] = item_list
71+
item_list = []
72+
elif current_object and current_section:
6573
if current_section == "types":
6674
json_data[current_section] = current_object
6775
elif len(current_object) == 1:
@@ -70,14 +78,43 @@ def parse_readme_to_json(readme_path):
7078
json_data[current_section] = value
7179
else:
7280
json_data[current_section].append(current_object)
81+
elif current_section in ["creators", "contributors"]:
82+
contributors.append(current_object)
83+
current_object = {}
84+
elif current_section == "identifiers":
85+
identifiers.append(current_object)
86+
current_object = {}
7387
else:
7488
json_data[current_section].append(current_object)
7589
current_object = {}
7690
continue
7791

7892
section_match = section_pattern.match(line)
7993
if section_match:
80-
if current_section and current_object:
94+
if item_list:
95+
json_data[current_section] = item_list
96+
elif current_object:
97+
if current_section in json_data:
98+
if isinstance(json_data[current_section], list):
99+
json_data[current_section].append(current_object)
100+
elif isinstance(json_data[current_section], dict):
101+
json_data[current_section].update(current_object)
102+
else:
103+
json_data[current_section] = (
104+
[current_object]
105+
if current_section != "types"
106+
else current_object
107+
)
108+
current_object = {}
109+
110+
elif contributors and current_section in ["creators", "contributors"]:
111+
json_data[current_section] = contributors
112+
contributors = []
113+
elif identifiers and current_section == "identifiers":
114+
json_data[current_section] = identifiers
115+
identifiers = []
116+
117+
elif current_section and current_object:
81118
if current_section == "types":
82119
json_data[current_section] = current_object
83120
elif len(current_object) == 1:
@@ -100,19 +137,38 @@ def parse_readme_to_json(readme_path):
100137

101138
if key in ["affiliation", "nameIdentifiers"]:
102139
value = expand_special_keys(key, value)
140+
elif (
141+
key == "nameType"
142+
and current_object
143+
and current_section in ["creators", "contributors"]
144+
):
145+
contributors.append(current_object)
146+
current_object = {}
147+
elif current_section in ["subjects"]:
148+
item_list.append({key: value})
149+
elif current_section == "dates":
150+
if key == "date":
151+
current_object["date"] = value
152+
elif key == "dateType":
153+
current_object["dateType"] = value
154+
item_list.append(current_object)
155+
current_object = {}
103156
else:
104157
link_match = link_pattern.search(value)
105158
if link_match:
106159
value = link_match.group(1)
107-
108-
current_object[key] = value
160+
current_object[key] = value
109161

110162
elif line.strip() and not section_match:
111163
raise ReadmeFormatException(
112164
f"Incorrect format detected at line {line_number}: {line}"
113165
)
114166

115-
if current_section and current_object:
167+
if contributors and current_section in ["creators", "contributors"]:
168+
json_data[current_section] = contributors
169+
elif identifiers and current_section == "identifiers":
170+
json_data[current_section] = identifiers
171+
elif current_section and current_object:
116172
if current_section == "types":
117173
json_data[current_section] = current_object
118174
elif len(current_object) == 1:
@@ -126,9 +182,8 @@ def parse_readme_to_json(readme_path):
126182

127183
return json_data
128184

129-
130185
if __name__ == "__main__":
131-
readme_path = "exampleREADME.md"
186+
readme_path = "/Users/elizabethwon/downloads/exampleREADME.md"
132187
try:
133188
json_data = parse_readme_to_json(readme_path)
134189
output_json_path = "output1.json"

templates/README.md

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,13 @@
88
- Given Name: Given Name
99
- Name Identifiers: [https://orcid.org/0000-0002-1825-0097](https://orcid.org/0000-0002-1825-0097)
1010

11+
- Name Type: Personal
12+
- Affiliation: [https://ror.org/04wxnsj81](https://ror.org/04wxnsj81)
13+
- Name: Name2
14+
- Family Name: Family Name 2
15+
- Given Name: Given Name 2
16+
- Name Identifiers: [https://orcid.org/0000-0002-1825-0097](https://orcid.org/0000-0002-1825-0097)
17+
1118
## Descriptions
1219
- Description: Description
1320
- Description Type: Abstract

0 commit comments

Comments
 (0)