|
"""Collect the unique subjects and keywords found in a folder of JSON files.

Each ``*.json`` file in ``folder_path`` is parsed, the entries under
``data -> metadataBlocks -> citation -> fields`` are scanned for the
``subject`` and ``keyword`` fields, and the sorted unique values are written
to ``output_file`` as plain text.
"""

import json
import os

# Folder containing JSON files
folder_path = "../tests/test_resources/dataverse_dump_02_03_2025"

# Output file
output_file = "../tests/test_resources/dataverse_dump_02_03_2025/extracted/unique_subjects_keywords.txt"


def _dig(node, *keys):
    """Follow *keys* through nested dicts; return None if any level is not a dict."""
    for key in keys:
        if not isinstance(node, dict):
            return None
        node = node.get(key)
    return node


def extract_subjects_and_keywords(json_data):
    """Return the subjects and keywords contained in one parsed JSON document.

    Args:
        json_data: The object produced by ``json.load`` for a single file.

    Returns:
        A ``(subjects, keywords)`` tuple of sets of non-empty strings. Both
        sets are empty when the document lacks the expected structure.
    """
    subjects = set()
    keywords = set()

    fields = _dig(json_data, "data", "metadataBlocks", "citation", "fields")
    if not isinstance(fields, list):
        return subjects, keywords

    for field in fields:
        if not isinstance(field, dict):
            continue
        type_name = field.get("typeName")
        value = field.get("value")

        if type_name == "subject":
            # A subject may be a single value or a list of values.
            candidates = value if isinstance(value, list) else [value]
            # Keep only non-empty strings so the result is always sortable
            # and never yields blank lines in the report.
            subjects.update(s for s in candidates if isinstance(s, str) and s)

        elif type_name == "keyword":
            # An explicit null "value" (or any non-list) carries no keywords.
            entries = value if isinstance(value, list) else []
            for entry in entries:
                # Entries without a usable keywordValue.value are skipped
                # instead of raising on a null or non-dict level.
                keyword = _dig(entry, "keywordValue", "value")
                if isinstance(keyword, str) and keyword:
                    keywords.add(keyword)

    return subjects, keywords


def collect_from_folder(folder):
    """Merge the subjects and keywords of every ``*.json`` file in *folder*.

    Args:
        folder: Path of the directory to scan (not recursive).

    Returns:
        A ``(subjects, keywords)`` tuple of sets covering all files.

    Raises:
        FileNotFoundError: If *folder* does not exist.
        json.JSONDecodeError: If a ``*.json`` file is not valid JSON.
    """
    subjects = set()
    keywords = set()

    for filename in os.listdir(folder):
        if not filename.endswith(".json"):  # Process only JSON files
            continue
        file_path = os.path.join(folder, filename)

        with open(file_path, "r", encoding="utf-8") as file:
            json_data = json.load(file)

        file_subjects, file_keywords = extract_subjects_and_keywords(json_data)
        subjects |= file_subjects
        keywords |= file_keywords

    return subjects, keywords


def format_report(subjects, keywords):
    """Render the two collections as the text report, each sorted for readability."""
    return (
        "Unique Subjects:\n" + "\n".join(sorted(subjects)) + "\n\n"
        + "Unique Keywords:\n" + "\n".join(sorted(keywords)) + "\n"
    )


def write_report(text, path):
    """Write *text* to *path* as UTF-8, creating missing parent directories."""
    directory = os.path.dirname(path)
    if directory:
        # The target folder may not exist yet; without this, open() raises
        # FileNotFoundError.
        os.makedirs(directory, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        f.write(text)


def main():
    """Scan ``folder_path`` and write the report to ``output_file``."""
    subjects, keywords = collect_from_folder(folder_path)
    write_report(format_report(subjects, keywords), output_file)
    print(f"\nUnique subjects and keywords have been written to {output_file}")


if __name__ == "__main__":
    main()
0 commit comments