Skip to content

Commit 1eec824

Browse files
committed
Updates to README parsing
1 parent 2c5557f commit 1eec824

File tree

3 files changed

+70
-73
lines changed

3 files changed

+70
-73
lines changed

caltechdata_api/cli.py

Lines changed: 8 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -96,8 +96,6 @@ def get_funding_entries():
9696

9797
def validate_funder_identifier(funder_identifier):
9898
response = requests.get(f"https://api.ror.org/organizations/{funder_identifier}")
99-
print(response.status_code)
100-
print(response.url)
10199
if response.status_code == 200:
102100
return True
103101
else:
@@ -310,7 +308,6 @@ def upload_data_from_file():
310308

311309
if filename == "README.md":
312310
data = parse_readme_to_json(filename)
313-
print(json.dumps(data))
314311
return data
315312
else:
316313
try:
@@ -362,7 +359,8 @@ def create_record():
362359
response = caltechdata_write(
363360
existing_data, token, production=False, publish=False
364361
)
365-
print(response)
362+
rec_id = response
363+
print(f'You can view and publish this record at https://data.caltechlibrary.dev/uploads/{rec_id}')
366364
break
367365
else:
368366
print("Going back to the main menu.")
@@ -422,7 +420,8 @@ def create_record():
422420
response = caltechdata_write(
423421
metadata, token, production=False, publish=False
424422
)
425-
print(response)
423+
rec_id = response
424+
print(f'You can view and publish this record at https://data.caltechlibrary.dev/uploads/{rec_id}')
426425
with open(response + ".json", "w") as file:
427426
json.dump(metadata, file, indent=2)
428427
break
@@ -472,12 +471,14 @@ def edit_record():
472471
response = caltechdata_edit(
473472
record_id, metadata, token, production=False, publish=False
474473
)
475-
print(response)
474+
rec_id = response
475+
print(f'You can view and publish this record at https://data.caltechlibrary.dev/uploads/{rec_id}')
476476
elif choice == "n":
477477
response = caltechdata_edit(
478478
record_id, metadata, token, production=False, publish=False
479479
)
480-
print(response)
480+
rec_id = response
481+
print(f'You can view and publish this record at https://data.caltechlibrary.dev/uploads/{rec_id}')
481482
else:
482483
print("Invalid choice. Please enter 'metadata' or 'files'.")
483484

caltechdata_api/md_to_json.py

Lines changed: 25 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
import re
22
import json
3+
import requests
34

45

56
class ReadmeFormatException(Exception):
@@ -15,13 +16,17 @@ def camel_case(s):
1516
def expand_special_keys(key, value):
1617
"""Expand special keys into their structured format (affiliation, nameIdentifiers)."""
1718
if key == "affiliation":
18-
return [{"affiliationIdentifier": value, "affiliationIdentifierScheme": "ROR"}]
19+
if 'ror.org' not in value:
20+
raise ValueError('Affiliation Identifier is not a ROR')
21+
ror = value.split('ror.org/')[1].split(']')[0]
22+
response = requests.get(f'https://api.ror.org/organizations/{ror}').json()
23+
return [{"affiliationIdentifier": ror, "affiliationIdentifierScheme": "ROR","name":response['name']}]
1924
elif key == "nameIdentifiers":
25+
orcid = value.split('orcid.org/')[1].split(']')[0]
2026
return [
2127
{
22-
"nameIdentifier": value,
28+
"nameIdentifier": orcid,
2329
"nameIdentifierScheme": "ORCID",
24-
"schemeUri": f"https://orcid.org/{value}",
2530
}
2631
]
2732
return value
@@ -38,6 +43,12 @@ def parse_readme_to_json(readme_path):
3843
current_section = None
3944
current_object = {}
4045

46+
title_line = lines.pop(0)
47+
if title_line.startswith('#') == False:
48+
raise ValueError('README.md needs to start with "# Title"')
49+
else:
50+
json_data['titles'] = [{'title':title_line.replace("# ","")}]
51+
4152
section_pattern = re.compile(r"^##\s+(.*)$")
4253
key_value_pattern = re.compile(r"^-\s+(.*?):\s+(.*)$")
4354
link_pattern = re.compile(r"\[.*?\]\((.*?)\)")
@@ -50,7 +61,7 @@ def parse_readme_to_json(readme_path):
5061
elif len(current_object) == 1:
5162
key, value = next(iter(current_object.items()))
5263
if key in ["language", "publicationYear", "publisher", "version"]:
53-
json_data[current_section].append(value)
64+
json_data[current_section]=value
5465
else:
5566
json_data[current_section].append(current_object)
5667
else:
@@ -83,7 +94,6 @@ def parse_readme_to_json(readme_path):
8394

8495
if key in ["affiliation", "nameIdentifiers"]:
8596
value = expand_special_keys(key, value)
86-
print(value)
8797
else:
8898
link_match = link_pattern.search(value)
8999
if link_match:
@@ -110,13 +120,13 @@ def parse_readme_to_json(readme_path):
110120

111121
return json_data
112122

113-
114-
readme_path = "/Users/elizabethwon/downloads/exampleREADME.md"
115-
try:
116-
json_data = parse_readme_to_json(readme_path)
117-
output_json_path = "output1.json"
118-
with open(output_json_path, "w") as json_file:
119-
json.dump(json_data, json_file, indent=4)
120-
print(f"Converted JSON saved to {output_json_path}")
121-
except ReadmeFormatException as e:
122-
print(f"Error parsing README file: {e}")
123+
if __name__ == '__main__':
124+
readme_path = "exampleREADME.md"
125+
try:
126+
json_data = parse_readme_to_json(readme_path)
127+
output_json_path = "output1.json"
128+
with open(output_json_path, "w") as json_file:
129+
json.dump(json_data, json_file, indent=4)
130+
print(f"Converted JSON saved to {output_json_path}")
131+
except ReadmeFormatException as e:
132+
print(f"Error parsing README file: {e}")

templates/README.md

Lines changed: 37 additions & 51 deletions
Original file line number | Diff line number | Diff line change
@@ -1,17 +1,4 @@
1-
2-
## Identifiers
3-
- Identifier: 1924MNRAS..84..308E
4-
- Identifier Type: bibcode
5-
6-
## Contributors
7-
- Name Type: Personal
8-
- Affiliation: [https://ror.org/04wxnsj81](https://ror.org/04wxnsj81)
9-
10-
- Name: Contributor Name
11-
- Family Name: Family Name
12-
- Given Name: Given Name
13-
- Contributor Type: ContactPerson
14-
- Name Identifiers: [https://orcid.org/0000-0002-1825-0097](https://orcid.org/0000-0002-1825-0097)
1+
# This is the title of your submission to CaltechDATA
152

163
## Creators
174
- Name Type: Personal
@@ -21,35 +8,41 @@
218
- Given Name: Given Name
229
- Name Identifiers: [https://orcid.org/0000-0002-1825-0097](https://orcid.org/0000-0002-1825-0097)
2310

11+
## Descriptions
12+
- Description: Description
13+
- Description Type: Abstract
14+
15+
## Types
16+
- Resource Type General: Dataset
17+
- Resource Type: Dataset
18+
19+
## Rights List
20+
- Rights: Creative Commons Zero v1.0 Universal
21+
- Rights URI: https://creativecommons.org/publicdomain/zero/1.0/legalcode
22+
23+
## Publication Year
24+
- Publication Year: 2024
25+
26+
## Publisher
27+
- Publisher: CaltechDATA
28+
2429
## Dates
2530
- Date: 2014-10-01
2631
- Date Type: Created
2732
- Date: 2012-05-22/2016-12-21
2833
- Date Type: Collected
2934

30-
## Descriptions
31-
- Description: Description
32-
- Description Type: Abstract
33-
34-
## Formats
35-
- Format: format
35+
## Subjects
36+
- Subject: subject1
37+
- Subject: subject2
3638

3739
## Funding References
3840
- Award Title: Measurement of Column-Averaged CO2
3941
- Funder Name: National Aeronautics and Space Administration
40-
- Funder Identifier Type: GRID
41-
- Funder Identifier: grid.238252.c
42+
- Funder Identifier Type: ROR
43+
- Funder Identifier: https://ror.org/027ka1x80
4244
- Award Number: NAG5-12247
4345

44-
## Language:
45-
- Language: eng
46-
47-
## Publication Year
48-
- Publication Year: 2017
49-
50-
## Publisher
51-
- Publisher: Publisher
52-
5346
## Related Identifiers
5447
- Related Identifier: [http://www.url.org/](http://www.url.org/)
5548
- Related Identifier Type: URL
@@ -58,25 +51,18 @@
5851
- Related Identifier Type: DOI
5952
- Relation Type: IsDocumentedBy
6053

61-
## Types
62-
- Resource Type General: Dataset
63-
- Resource Type: Dataset
64-
65-
## Rights List
66-
- Rights: Rights Name
67-
- Rights URI: Rights List
68-
69-
## Subjects
70-
- Subject: subject1
71-
- Subject: subject2
72-
73-
## Titles
74-
- Title: Title
75-
- Title: Alternative Title
76-
- Title Type: AlternativeTitle
77-
7854
## Version
79-
- Version: 0
55+
- Version: 1
56+
57+
## Identifiers
58+
- Identifier: 1924MNRAS..84..308E
59+
- Identifier Type: bibcode
8060

81-
## Schema Version
82-
- Schema Version: [http://datacite.org/schema/kernel-4](http://datacite.org/schema/kernel-4)
61+
## Contributors
62+
- Name Type: Personal
63+
- Affiliation: [https://ror.org/04wxnsj81](https://ror.org/04wxnsj81)
64+
- Name: Contributor Name
65+
- Family Name: Family Name
66+
- Given Name: Given Name
67+
- Contributor Type: ContactPerson
68+
- Name Identifiers: [https://orcid.org/0000-0002-1825-0097](https://orcid.org/0000-0002-1825-0097)

0 commit comments

Comments
 (0)