Skip to content

Commit e3bbf94

Browse files
authored
Merge pull request #33 from Knowledge-Graph-Hub/make_semantic_graph
Add script to produce humanized graph
2 parents 7f71818 + 0a013a5 commit e3bbf94

File tree

1 file changed

+250
-0
lines changed

1 file changed

+250
-0
lines changed

scripts/make_humanized_graph.py

Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Humanize Knowledge Graph edges by replacing IDs with human-readable names.
4+
Usage: python3 make_humanized_graph.py [debug_limit]
5+
"""
6+
7+
import sys
8+
import csv
9+
from pathlib import Path
10+
11+
12+
def find_column_index(header, pattern):
    """Return the index of the column in *header* exactly equal to *pattern*.

    Returns -1 when no such column exists (mirrors str.find-style sentinel).
    """
    try:
        return header.index(pattern)
    except ValueError:
        return -1
18+
19+
20+
def humanize_predicate(pred, id_to_name):
    """Humanize a predicate CURIE.

    If *pred* contains a ':' the prefix is dropped and underscores become
    spaces; otherwise the name is looked up in *id_to_name*, falling back
    to the predicate itself.
    """
    _prefix, sep, remainder = pred.partition(':')
    if sep:
        # CURIE form, e.g. "biolink:related_to" -> "related to"
        return remainder.replace('_', ' ')
    return id_to_name.get(pred, pred)
28+
29+
30+
def humanize_source(source):
    """Humanize a knowledge-source CURIE by dropping its prefix.

    "infores:omim" -> "omim"; values without a ':' pass through unchanged.
    """
    _prefix, sep, remainder = source.partition(':')
    return remainder if sep else source
35+
36+
37+
def humanize_category(category):
    """Humanize a category CURIE for display.

    Drops any CURIE prefix, splits CamelCase into spaced lowercase words
    (e.g. "biolink:GeneProduct" -> "gene product"), and prepends an
    indefinite article ("a"/"an").

    Returns "" for empty input, and also for a bare prefix such as
    "biolink:" (the original code raised IndexError on that input when
    indexing category[0]).
    """
    import re  # local import, as in the original; re caches compiled patterns

    if not category:
        return ""

    if ':' in category:
        category = category.split(':', 1)[1]
        if not category:
            # Nothing left after the prefix (e.g. "biolink:"); avoid
            # IndexError on category[0] below.
            return ""

    # Convert CamelCase to spaces (e.g., "GeneProduct" -> "Gene Product"),
    # then lowercase the whole phrase.
    category = re.sub(r'([a-z])([A-Z])', r'\1 \2', category).lower()

    # Choose the article from the (already lowercase) first letter.
    article = "an" if category[0] in 'aeiou' else "a"
    return f"{article} {category}"
54+
55+
56+
def main():
    """Convert the KG edges TSV into a human-readable, de-duplicated TSV.

    Reads data/kg-alzheimers_nodes.tsv and data/kg-alzheimers_edges.tsv,
    replaces node IDs with human-readable names (annotated with their
    category when one is available), humanizes predicates and knowledge
    sources, and writes the unique resulting edges to
    data/kg-alzheimers_humanized_edges.tsv.

    An optional integer command-line argument limits processing to the
    first N edges (debug mode). Exits with status 1 on bad arguments or
    missing input files.
    """
    # Parse optional debug limit from the command line.
    debug_limit = 0
    if len(sys.argv) > 1:
        try:
            debug_limit = int(sys.argv[1])
        except ValueError:
            print(f"Invalid debug limit: {sys.argv[1]}")
            sys.exit(1)

    # File paths
    data_dir = Path("data")
    edges_file = data_dir / "kg-alzheimers_edges.tsv"
    nodes_file = data_dir / "kg-alzheimers_nodes.tsv"
    output_file = data_dir / "kg-alzheimers_humanized_edges.tsv"

    # Fail fast if any required input is missing.
    if not data_dir.exists():
        print(f"Error: '{data_dir}' directory not found")
        sys.exit(1)
    if not edges_file.exists():
        print(f"Error: '{edges_file}' not found")
        sys.exit(1)
    if not nodes_file.exists():
        print(f"Error: '{nodes_file}' not found")
        sys.exit(1)

    print("Loading node names into memory...")

    # Locate the needed columns in the nodes-file header.
    # newline='' and an explicit encoding are the csv-module-recommended
    # way to open delimited files.
    with open(nodes_file, 'r', newline='', encoding='utf-8') as f:
        nodes_header = next(csv.reader(f, delimiter='\t'))

    id_col_idx = find_column_index(nodes_header, "id")
    name_col_idx = find_column_index(nodes_header, "name")
    category_col_idx = find_column_index(nodes_header, "category")

    if id_col_idx == -1:
        print("Error: Could not find 'id' column in nodes file")
        sys.exit(1)
    if name_col_idx == -1:
        print("Error: Could not find 'name' column in nodes file")
        sys.exit(1)

    # Categories are optional; warn but continue without them.
    has_categories = category_col_idx != -1
    if not has_categories:
        print("Warning: Could not find 'category' column in nodes file")
        print("Node categories will not be included in the output")

    print(
        f"In nodes file: ID column is {id_col_idx+1}, Name column is {name_col_idx+1}")
    if has_categories:
        print(f"Category column is {category_col_idx+1}")

    # Locate the needed columns in the edges-file header.
    with open(edges_file, 'r', newline='', encoding='utf-8') as f:
        edges_header = next(csv.reader(f, delimiter='\t'))

    subject_col_idx = find_column_index(edges_header, "subject")
    predicate_col_idx = find_column_index(edges_header, "predicate")
    object_col_idx = find_column_index(edges_header, "object")
    source_col_idx = find_column_index(
        edges_header, "primary_knowledge_source")

    if subject_col_idx == -1 or predicate_col_idx == -1 or object_col_idx == -1:
        print("Error: Could not find required columns in edges file")
        print("Looking for 'subject', 'object', and 'predicate' columns")
        sys.exit(1)

    # The knowledge source is optional; warn but continue without it.
    has_source = source_col_idx != -1
    if not has_source:
        print("Warning: Could not find 'primary_knowledge_source' column in edges file")
        print("The source information will not be included in the output")

    print(f"In edges file: Subject column is {subject_col_idx+1}, "
          f"Predicate column is {predicate_col_idx+1}, "
          f"Object column is {object_col_idx+1}")
    if has_source:
        print(f"Knowledge source column is {source_col_idx+1}")

    # Load node ID -> name and (optionally) ID -> category mappings.
    id_to_name = {}
    id_to_category = {}

    with open(nodes_file, 'r', newline='', encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader)  # Skip header
        for row in reader:
            # Ignore short/malformed rows that lack the required columns.
            if len(row) > max(id_col_idx, name_col_idx):
                node_id = row[id_col_idx]
                node_name = row[name_col_idx]

                node_category = ""
                if has_categories and len(row) > category_col_idx:
                    node_category = row[category_col_idx]

                if node_id:
                    id_to_name[node_id] = node_name
                    if node_category:
                        id_to_category[node_id] = node_category

    print(f"Loaded {len(id_to_name)} node mappings")
    if has_categories:
        print(f"Loaded {len(id_to_category)} node categories")

    # Show mode info
    if debug_limit > 0:
        print(f"Processing first {debug_limit} edges (debug mode)...")
    else:
        print("Processing all edges (production mode)...")

    unique_edges = set()  # humanized rows already written, for de-duplication
    duplicate_count = 0
    count = 0
    written_count = 0

    # Single pass over the edges file. Header and rows are written under
    # one open() in 'w' mode, replacing the original's write-header-then-
    # reopen-in-append double open of the output file.
    with open(edges_file, 'r', newline='', encoding='utf-8') as f_in, \
            open(output_file, 'w', newline='', encoding='utf-8') as f_out:
        reader = csv.reader(f_in, delimiter='\t')
        writer = csv.writer(f_out, delimiter='\t')

        if has_source:
            writer.writerow(["subject", "predicate", "object", "source"])
        else:
            writer.writerow(["subject", "predicate", "object"])

        next(reader)  # Skip the input header

        for row in reader:
            if debug_limit > 0 and count >= debug_limit:
                break

            # Incremented before any continue so skipped rows still count
            # as "processed".
            count += 1

            # Skip short/malformed rows lacking the required columns.
            if len(row) <= max(subject_col_idx, predicate_col_idx, object_col_idx):
                continue

            subject_id = row[subject_col_idx]
            predicate = row[predicate_col_idx]
            object_id = row[object_col_idx]

            # Basic names; fall back to the raw ID when unknown.
            subject_name = id_to_name.get(subject_id, subject_id)
            object_name = id_to_name.get(object_id, object_id)

            # Append category information when available, e.g.
            # "APP (a gene)".
            if has_categories:
                subject_cat = humanize_category(
                    id_to_category.get(subject_id, ""))
                if subject_cat:
                    subject_name = f"{subject_name} ({subject_cat})"

                object_cat = humanize_category(
                    id_to_category.get(object_id, ""))
                if object_cat:
                    object_name = f"{object_name} ({object_cat})"

            predicate = humanize_predicate(predicate, id_to_name)

            # Include the knowledge source only when the column exists and
            # this row is long enough to carry it.
            if has_source and len(row) > source_col_idx:
                output_row = [subject_name, predicate, object_name,
                              humanize_source(row[source_col_idx])]
            else:
                output_row = [subject_name, predicate, object_name]

            # Skip edges whose humanized form was already written.
            edge_key = tuple(output_row)
            if edge_key in unique_edges:
                duplicate_count += 1
                continue

            unique_edges.add(edge_key)
            writer.writerow(output_row)
            written_count += 1

    print(f"Processed {count} edges")
    print(f"Found {duplicate_count} duplicate edges")
    print(f"{written_count} unique humanized edges have been saved to {output_file}")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)