Skip to content

Commit 30d9b7b

Browse files
committed
#51 extracted subject and keywords
1 parent f1cd139 commit 30d9b7b

File tree

3 files changed

+833
-1
lines changed

3 files changed

+833
-1
lines changed
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import json
2+
import os
3+
4+
# Folder containing the Dataverse JSON dump files.
# NOTE: relative paths are resolved against the current working directory,
# so the script is expected to be launched from its own folder.
folder_path = "../tests/test_resources/dataverse_dump_02_03_2025"

# Text file that receives the de-duplicated subjects and keywords.
output_file = "../tests/test_resources/dataverse_dump_02_03_2025/extracted/unique_subjects_keywords.txt"


def _collect_from_fields(fields, subjects, keywords):
    """Add every subject and keyword found in *fields* to the given sets.

    Args:
        fields: Iterable of citation-block field dicts, each read through
            its "typeName" and "value" keys.
        subjects: Set updated in place with the "subject" values.
        keywords: Set updated in place with each keyword's
            ``keywordValue.value`` string.
    """
    for field in fields:
        if not isinstance(field, dict):
            continue

        type_name = field.get("typeName")
        value = field.get("value")

        if type_name == "subject":
            if isinstance(value, list):
                subjects.update(value)  # Several subjects at once
            elif value:
                subjects.add(value)  # Single subject

        elif type_name == "keyword":
            # "value" may be present but null; treat that like "no keywords"
            # instead of failing with a TypeError.
            for kw in value or []:
                if not isinstance(kw, dict):
                    continue  # Malformed entry: nothing to extract
                keyword_value = (kw.get("keywordValue") or {}).get("value")
                if keyword_value:
                    keywords.add(keyword_value)


def extract_subjects_and_keywords(source_dir):
    """Scan the ``*.json`` files in *source_dir* for subjects and keywords.

    Only direct children of *source_dir* whose name ends with ".json" are
    read; the fields are looked up under
    ``data -> metadataBlocks -> citation -> fields``.

    Args:
        source_dir: Directory holding the JSON files.

    Returns:
        A ``(subjects, keywords)`` tuple of sets with the unique values.

    Raises:
        OSError: If the directory or one of its files cannot be read.
        json.JSONDecodeError: If a ``*.json`` file is not valid JSON.
    """
    subjects = set()
    keywords = set()

    for filename in os.listdir(source_dir):
        if not filename.endswith(".json"):  # Process only JSON files
            continue

        file_path = os.path.join(source_dir, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            json_data = json.load(file)

        fields = (
            json_data.get("data", {})
            .get("metadataBlocks", {})
            .get("citation", {})
            .get("fields", [])
        )
        _collect_from_fields(fields, subjects, keywords)

    return subjects, keywords


def main(source_dir=None, destination=None):
    """Write the sorted unique subjects and keywords to a text file.

    Args:
        source_dir: Directory to scan; defaults to the module-level
            ``folder_path``.
        destination: Path of the report file; defaults to the module-level
            ``output_file``.
    """
    source_dir = folder_path if source_dir is None else source_dir
    destination = output_file if destination is None else destination

    subjects, keywords = extract_subjects_and_keywords(source_dir)

    # Sort unique values for better readability
    output_text = "Unique Subjects:\n" + "\n".join(sorted(subjects)) + "\n\n"
    output_text += "Unique Keywords:\n" + "\n".join(sorted(keywords)) + "\n"

    # The target sub-directory may not exist yet; open(..., "w") would
    # otherwise fail with FileNotFoundError.
    destination_dir = os.path.dirname(destination)
    if destination_dir:
        os.makedirs(destination_dir, exist_ok=True)

    with open(destination, "w", encoding="utf-8") as f:
        f.write(output_text)

    print(f"\nUnique subjects and keywords have been written to {destination}")


if __name__ == "__main__":
    main()

pcor_tools/tests/test_dataverse_access.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,14 @@ def test_extract_doi_list_from_dataset_result(self):
4343
persistent_id = doi
4444
dataset_metadata = dataverse_access.get_dataset_metadata_by_persistent_id(persistent_id)
4545
temp_dataset_id = dataset_metadata['data']['datasetId']
46+
title = dataset_metadata['data']['metadataBlocks']['citation']['fields'][0]['value']
47+
if title is None:
48+
filename = temp_dataset_id
49+
else:
50+
filename = title.replace(' ', '_').replace('/', '_').replace(':', '_').replace(',', '_').replace('.', '_')
4651
logger.info('Dataset Metadata: %s', dataset_metadata)
4752

48-
with open(f'test_resources/dataverse_dump/{temp_dataset_id}.json', 'w') as f:
53+
with open(f'test_resources/dataverse_dump_02_03_2025/{filename}.json', 'w') as f:
4954
json.dump(dataset_metadata, f, indent=4)
5055

5156
self.assertIsNotNone(doi_list)

0 commit comments

Comments
 (0)