This repository was archived by the owner on Apr 23, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtsv2json.py
More file actions
90 lines (75 loc) · 3.65 KB
/
tsv2json.py
File metadata and controls
90 lines (75 loc) · 3.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#! /usr/bin/env python3
import os
import csv
import json
import sys
# Input and output folders are resolved relative to the current working
# directory, so the script must be started from the repository root.
cwd = os.getcwd()
tsv_dir = os.path.join(cwd, "tsv")
json_dir = os.path.join(cwd, "json")

# README.md content from this marker line onwards is left out of the
# generated "description".
NOTE_MARKER = "> **Note**"

# Every TSV row is normalised to exactly these five columns:
# term, label, description, sameAs, source.
COLUMN_COUNT = 5


def _read_description(folder_path: str) -> str:
    """Return the README.md text that precedes the note marker.

    Lines are collected until the first one that starts with
    ``NOTE_MARKER``; that line and everything after it are dropped.
    Returns an empty string when the folder has no README.md.
    """
    readme_path = os.path.join(folder_path, "README.md")
    if not os.path.isfile(readme_path):
        return ""
    kept = []
    with open(readme_path, "r", encoding="utf-8") as readme_file:
        for line in readme_file:
            if line.startswith(NOTE_MARKER):
                break
            kept.append(line)
    return "".join(kept)


def _merge_tsv(filepath: str, language: str, terms: dict) -> None:
    """Merge the rows of one TSV file into *terms* (modified in place).

    The first row is always treated as a header and skipped.  Short rows
    are padded with ``None`` and long rows are truncated so that each row
    has exactly ``COLUMN_COUNT`` values.  The first file that mentions a
    term supplies its ``sameAs`` and ``source``; later files only add
    their ``label`` and ``description`` under *language*.
    """
    # newline="" is what the csv module expects for its input files.
    with open(filepath, "r", encoding="utf-8", newline="") as tsv_file:
        reader = csv.reader(tsv_file, delimiter="\t")
        # Unconditional header skip; the default keeps an empty file from
        # raising StopIteration.
        next(reader, None)
        for row in reader:
            if not row:
                # A blank line would otherwise become a term keyed by None
                # ("null" in the JSON output).
                continue
            padded = (row + [None] * COLUMN_COUNT)[:COLUMN_COUNT]
            term, label, description, same_as, source = padded
            entry = terms.get(term)
            if entry is None:
                terms[term] = {
                    "term": term,
                    "label": {language: label},
                    "description": {language: description},
                    "sameAs": same_as,
                    "source": source,
                }
            else:
                entry["label"][language] = label
                entry["description"][language] = description


def _convert_folder(folder_path: str, folder_name: str) -> dict:
    """Build the JSON-ready dictionary for one subfolder of TSV files.

    The language key of each file is the part of its name (without the
    ``.tsv`` extension) after the last underscore.
    """
    tsv_data = {
        "id": folder_name,
        "description": _read_description(folder_path),
        "terms": {},
    }
    # Sorted so that the term order, and the file whose sameAs/source is
    # kept, do not depend on the arbitrary order of os.listdir().
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".tsv"):
            continue
        language = os.path.splitext(filename)[0].split("_")[-1]
        _merge_tsv(os.path.join(folder_path, filename), language, tsv_data["terms"])
    return tsv_data


def convert_all(source_dir: str, target_dir: str) -> None:
    """Write ``<target_dir>/<name>.json`` for every subfolder of *source_dir*.

    Entries of *source_dir* that are not directories are ignored.
    *target_dir* is created when it does not exist, and existing JSON
    files are overwritten.
    """
    os.makedirs(target_dir, exist_ok=True)
    for folder_name in sorted(os.listdir(source_dir)):
        folder_path = os.path.join(source_dir, folder_name)
        if not os.path.isdir(folder_path):
            continue
        print("Processing: " + folder_name)
        tsv_data = _convert_folder(folder_path, folder_name)
        json_str = json.dumps(tsv_data, indent=2, ensure_ascii=False)
        json_path = os.path.join(target_dir, folder_name + ".json")
        # ensure_ascii=False emits raw non-ASCII characters, so the file
        # is written explicitly as UTF-8 rather than in the locale encoding.
        with open(json_path, "w", encoding="utf-8") as json_file:
            json_file.write(json_str)


if __name__ == "__main__":
    convert_all(tsv_dir, json_dir)