Skip to content

Commit 7ce86bd

Browse files
Create md_to_json.py
1 parent 730f97e commit 7ce86bd

File tree

1 file changed

+196
-0
lines changed

1 file changed

+196
-0
lines changed

tests/md_to_json.py

Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
import re
2+
import json
3+
import requests
4+
5+
6+
class ReadmeFormatException(Exception):
    """Signal that a README could not be read or is not laid out as expected."""
8+
9+
10+
def camel_case(s):
    """Convert a string to camelCase.

    Words are separated by any run of whitespace, underscores or hyphens.
    The first letter of the result is lower-cased and the first letter of
    every following word is upper-cased.

    Words written in a single case ("name", "TYPE") are normalised to
    "Name" / "Type". Words that already mix cases ("nameType") keep their
    inner capitals, so passing an already camelCase string returns it
    unchanged instead of flattening it to "nametype".

    Args:
        s: The text to convert, e.g. a README heading or list-item key.

    Returns:
        The camelCase form of ``s``, or ``""`` when ``s`` contains no words.
    """

    def _capitalise(word):
        # Single-case words are safe to normalise; mixed-case words carry
        # meaningful inner capitals that str.title()/capitalize() would drop.
        if word.islower() or word.isupper():
            return word.capitalize()
        return word[0].upper() + word[1:]

    words = [word for word in re.split(r"[\s_-]+", s) if word]
    if not words:
        return ""
    joined = "".join(_capitalise(word) for word in words)
    return joined[0].lower() + joined[1:]
14+
15+
16+
def _extract_identifier(value, host):
    """Return the identifier that follows ``host/`` in *value*, or None."""
    # Stop at whitespace or any Markdown link delimiter so the identifier is
    # clean whether the URL is bare, in the link text, or in the link target.
    match = re.search(re.escape(host) + r"/([^\s\[\]()]+)", value)
    return match.group(1) if match else None


def expand_special_keys(key, value):
    """Expand special keys into their structured format (affiliation, nameIdentifiers).

    Args:
        key: The camelCase key of the README list item.
        value: The raw text of the item. For the two special keys it must
            contain a ``ror.org/<id>`` or ``orcid.org/<id>`` URL, either bare
            or inside a Markdown link.

    Returns:
        For ``"affiliation"``: a one-element list with the ROR identifier,
        the scheme ``"ROR"`` and the organisation name returned by the ROR
        API. For ``"nameIdentifiers"``: a one-element list with the ORCID iD,
        the scheme ``"ORCID"`` and a ``schemeUri`` built from the iD.
        For every other key, ``value`` unchanged.

    Raises:
        ValueError: If a special key's value lacks the expected ROR/ORCID URL.
        requests.RequestException: If the ROR lookup fails, times out or
            returns an HTTP error status.
    """
    if key == "affiliation":
        ror = _extract_identifier(value, "ror.org")
        if ror is None:
            raise ValueError("Affiliation Identifier is not a ROR")
        # A timeout keeps a stalled connection from hanging the conversion.
        response = requests.get(
            f"https://api.ror.org/organizations/{ror}", timeout=30
        )
        # Surface HTTP errors directly instead of a KeyError on "name" below.
        response.raise_for_status()
        return [
            {
                "affiliationIdentifier": ror,
                "affiliationIdentifierScheme": "ROR",
                "name": response.json()["name"],
            }
        ]
    if key == "nameIdentifiers":
        orcid = _extract_identifier(value, "orcid.org")
        if orcid is None:
            raise ValueError("Name Identifier is not an ORCID")
        return [
            {
                "nameIdentifier": orcid,
                "nameIdentifierScheme": "ORCID",
                # Built from the extracted iD, not the raw Markdown text.
                "schemeUri": f"https://orcid.org/{orcid}",
            }
        ]
    return value
40+
41+
42+
def parse_readme_to_json(readme_path):
# Parse the README file at `readme_path` into a metadata dictionary.
# The first line must start with "#" and is stored under json_data["titles"].
# Each later "## Heading" line opens a section keyed by camel_case(heading),
# and "- key: value" lines add data to the section that is currently open.
# Returns the assembled json_data dict.
# Raises ReadmeFormatException when the file cannot be read, or when a
# non-blank line is not a section heading and is not a key/value item inside
# a section. Raises ValueError when the first line does not start with "#".
# NOTE(review): indentation is not visible in this view, so the exact nesting
# of the branches below should be confirmed against the original file.
43+
try:
44+
with open(readme_path, "r") as file:
45+
# Splitting on "\n" keeps blank lines as empty strings; they act as
# separators in the loop below.
lines = file.read().split("\n")
46+
except IOError as e:
47+
raise ReadmeFormatException(f"Failed to open or read the file: {e}")
48+
49+
json_data = {}
50+
current_section = None
51+
current_object = {}
52+
53+
# The first line is removed from `lines` and treated as the title.
title_line = lines.pop(0)
54+
if title_line.startswith("#") == False:
55+
raise ValueError('README.md needs to start with "# Title"')
56+
else:
57+
# str.replace removes every "# " occurrence in the line, not only the
# leading marker.
json_data["titles"] = [{"title": title_line.replace("# ", "")}]
58+
59+
# Accumulators for multi-entry sections; they are copied into json_data when
# the section is closed.
contributors = []
60+
identifiers = []
61+
item_list = []
62+
63+
# "## Heading" opens a new section; group 1 is the heading text.
section_pattern = re.compile(r"^##\s+(.*)$")
64+
# "- key: value" list item; groups are (key, value).
key_value_pattern = re.compile(r"^-\s+(.*?):\s+(.*)$")
65+
# Markdown link "[text](target)"; group 1 is the target.
link_pattern = re.compile(r"\[.*?\]\((.*?)\)")
66+
67+
# Numbering starts at 1 on the line after the title, because the title line
# was already popped from `lines`.
for line_number, line in enumerate(lines, 1):
68+
# Blank line: store whatever has been collected for the current section.
if not line.strip():
69+
if item_list and current_section:
70+
json_data[current_section] = item_list
71+
item_list = []
72+
elif current_object and current_section:
73+
if current_section == "types":
74+
json_data[current_section] = current_object
75+
elif len(current_object) == 1:
76+
key, value = next(iter(current_object.items()))
77+
if key in ["language", "publicationYear", "publisher", "version"]:
78+
# These single-key entries are stored as a bare value, replacing the
# section's initial list.
json_data[current_section] = value
79+
else:
80+
json_data[current_section].append(current_object)
81+
elif current_section in ["creators", "contributors"]:
82+
contributors.append(current_object)
83+
current_object = {}
84+
elif current_section == "identifiers":
85+
identifiers.append(current_object)
86+
current_object = {}
87+
else:
88+
json_data[current_section].append(current_object)
89+
current_object = {}
90+
continue
91+
92+
# Section heading: store pending data for the previous section, then switch.
section_match = section_pattern.match(line)
93+
if section_match:
94+
if item_list:
95+
json_data[current_section] = item_list
96+
elif current_object:
97+
if current_section in json_data:
98+
if isinstance(json_data[current_section], list):
99+
json_data[current_section].append(current_object)
100+
elif isinstance(json_data[current_section], dict):
101+
json_data[current_section].update(current_object)
102+
else:
103+
json_data[current_section] = (
104+
[current_object]
105+
if current_section != "types"
106+
else current_object
107+
)
108+
current_object = {}
109+
110+
elif contributors and current_section in ["creators", "contributors"]:
111+
json_data[current_section] = contributors
112+
contributors = []
113+
elif identifiers and current_section == "identifiers":
114+
json_data[current_section] = identifiers
115+
identifiers = []
116+
117+
# NOTE(review): an earlier branch of this heading handler already tests
# current_object; if both belong to the same if/elif chain this branch can
# never run. Confirm against the original indentation.
elif current_section and current_object:
118+
if current_section == "types":
119+
json_data[current_section] = current_object
120+
elif len(current_object) == 1:
121+
key, value = next(iter(current_object.items()))
122+
if key in ["language", "publicationYear", "publisher", "version"]:
123+
# NOTE(review): the blank-line handler stores these values with plain
# assignment, while this branch uses .append. Confirm which is intended.
json_data[current_section].append(value)
124+
else:
125+
json_data[current_section].append(current_object)
126+
else:
127+
json_data[current_section].append(current_object)
128+
current_object = {}
# Open the new section: "types" starts as a dict, every other section as a list.
129+
current_section = camel_case(section_match.group(1))
130+
json_data[current_section] = [] if current_section != "types" else {}
131+
continue
132+
133+
# Key/value item; only accepted once a section has been opened.
key_value_match = key_value_pattern.match(line)
134+
if key_value_match and current_section:
135+
key, value = key_value_match.groups()
136+
key = camel_case(key)
137+
138+
# affiliation / nameIdentifiers values are replaced by the structured list
# returned from expand_special_keys.
if key in ["affiliation", "nameIdentifiers"]:
139+
value = expand_special_keys(key, value)
140+
# A nameType key while an object is already being built starts the next
# creator/contributor, so the finished object is stored first.
elif (
141+
key == "nameType"
142+
and current_object
143+
and current_section in ["creators", "contributors"]
144+
):
145+
contributors.append(current_object)
146+
current_object = {}
147+
# Each subjects item is added to item_list as its own single-key dict.
elif current_section in ["subjects"]:
148+
item_list.append({key: value})
149+
# In dates, the object is added to item_list when its dateType line is seen.
elif current_section == "dates":
150+
if key == "date":
151+
current_object["date"] = value
152+
elif key == "dateType":
153+
current_object["dateType"] = value
154+
item_list.append(current_object)
155+
current_object = {}
156+
# Any other value: if it contains a Markdown link, keep only the link target.
else:
157+
link_match = link_pattern.search(value)
158+
if link_match:
159+
value = link_match.group(1)
160+
161+
# Record the (possibly transformed) value on the object being built.
current_object[key] = value
162+
163+
# Any other non-blank line is rejected as malformed.
elif line.strip() and not section_match:
164+
raise ReadmeFormatException(
165+
f"Incorrect format detected at line {line_number}: {line}"
166+
)
167+
168+
# After the last line: store whatever is still pending for the final section.
if contributors and current_section in ["creators", "contributors"]:
169+
json_data[current_section] = contributors
170+
elif identifiers and current_section == "identifiers":
171+
json_data[current_section] = identifiers
172+
elif current_section and current_object:
173+
if current_section == "types":
174+
json_data[current_section] = current_object
175+
elif len(current_object) == 1:
176+
key, value = next(iter(current_object.items()))
177+
if key in ["language", "publicationYear", "publisher", "version"]:
178+
# NOTE(review): this uses .append, whereas the blank-line handler assigns the
# bare value. Confirm which is intended.
json_data[current_section].append(value)
179+
else:
180+
json_data[current_section].append(current_object)
181+
else:
182+
json_data[current_section].append(current_object)
183+
184+
return json_data
185+
186+
187+
if __name__ == "__main__":
    import sys

    # Usage: md_to_json.py [README_PATH [OUTPUT_JSON_PATH]]
    # Both paths are optional; the previously hard-coded locations remain the
    # defaults so running the script without arguments behaves as before.
    cli_args = sys.argv[1:]
    readme_path = (
        cli_args[0] if cli_args else "/Users/elizabethwon/downloads/exampleREADME.md"
    )
    output_json_path = cli_args[1] if len(cli_args) > 1 else "output1.json"

    try:
        json_data = parse_readme_to_json(readme_path)
    except (ReadmeFormatException, ValueError) as e:
        # The parser raises ValueError for a missing title or a malformed
        # ROR/ORCID value; report it like any other format problem instead of
        # ending with a traceback.
        print(f"Error parsing README file: {e}")
    else:
        with open(output_json_path, "w") as json_file:
            json.dump(json_data, json_file, indent=4)
        print(f"Converted JSON saved to {output_json_path}")

0 commit comments

Comments
 (0)