#!/usr/bin/env python3
"""
Humanize Knowledge Graph edges by replacing IDs with human-readable names.
Usage: python3 make_humanized_graph.py [debug_limit]
"""

import sys
import csv
import re
from pathlib import Path


def find_column_index(header, pattern):
    """Return the index of the header column that exactly matches pattern, or -1."""
    for i, col in enumerate(header):
        if col == pattern:
            return i
    return -1
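
# A quick sanity check (hypothetical header values):
#   find_column_index(["id", "name", "category"], "name")    -> 1
#   find_column_index(["id", "name", "category"], "missing") -> -1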


def humanize_predicate(pred, id_to_name):
    """Strip a CURIE prefix and convert underscores to spaces; fall back to a
    node-name lookup for bare IDs."""
    if ':' in pred:
        # Strip prefix and convert underscores to spaces
        return pred.split(':', 1)[1].replace('_', ' ')
    elif pred in id_to_name:
        return id_to_name[pred]
    return pred
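
# For example (hypothetical values):
#   humanize_predicate("biolink:interacts_with", {})  -> "interacts with"
#   humanize_predicate("P123", {"P123": "regulates"}) -> "regulates"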


def humanize_source(source):
    """Humanize a knowledge source by removing its CURIE prefix (e.g. 'infores:')."""
    if ':' in source:
        return source.split(':', 1)[1]
    return source
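
# For example (hypothetical value): humanize_source("infores:ctd") -> "ctd"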


def humanize_category(category):
    """Humanize a category by removing its prefix and formatting it for display."""
    if not category:
        return ""

    if ':' in category:
        category = category.split(':', 1)[1]
        if not category:
            # Guard against a bare prefix such as "biolink:"
            return ""

    # Split CamelCase into words (e.g., "GeneProduct" -> "gene product")
    category = re.sub(r'([a-z])([A-Z])', r'\1 \2', category).lower()

    # Prefix with the appropriate indefinite article
    if category[0] in 'aeiou':
        return f"an {category}"
    else:
        return f"a {category}"
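
# For example (hypothetical value):
#   humanize_category("biolink:GeneProduct") -> "a gene product"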


def main():
    # Parse debug limit if provided
    debug_limit = 0
    if len(sys.argv) > 1:
        try:
            debug_limit = int(sys.argv[1])
        except ValueError:
            print(f"Invalid debug limit: {sys.argv[1]}")
            sys.exit(1)

    # File paths
    data_dir = Path("data")
    edges_file = data_dir / "kg-alzheimers_edges.tsv"
    nodes_file = data_dir / "kg-alzheimers_nodes.tsv"
    output_file = data_dir / "kg-alzheimers_humanized_edges.tsv"

    # Check that the input files exist
    if not data_dir.exists():
        print(f"Error: '{data_dir}' directory not found")
        sys.exit(1)
    if not edges_file.exists():
        print(f"Error: '{edges_file}' not found")
        sys.exit(1)
    if not nodes_file.exists():
        print(f"Error: '{nodes_file}' not found")
        sys.exit(1)

    print("Loading node names into memory...")

    # Find column indices in the nodes file
    with open(nodes_file, 'r', newline='') as f:
        nodes_header = next(csv.reader(f, delimiter='\t'))

    id_col_idx = find_column_index(nodes_header, "id")
    name_col_idx = find_column_index(nodes_header, "name")
    category_col_idx = find_column_index(nodes_header, "category")

    if id_col_idx == -1:
        print("Error: Could not find 'id' column in nodes file")
        sys.exit(1)
    if name_col_idx == -1:
        print("Error: Could not find 'name' column in nodes file")
        sys.exit(1)

    has_categories = category_col_idx != -1
    if not has_categories:
        print("Warning: Could not find 'category' column in nodes file")
        print("Node categories will not be included in the output")

    print(f"In nodes file: ID column is {id_col_idx + 1}, "
          f"Name column is {name_col_idx + 1}")
    if has_categories:
        print(f"Category column is {category_col_idx + 1}")

    # Find column indices in the edges file
    with open(edges_file, 'r', newline='') as f:
        edges_header = next(csv.reader(f, delimiter='\t'))

    subject_col_idx = find_column_index(edges_header, "subject")
    predicate_col_idx = find_column_index(edges_header, "predicate")
    object_col_idx = find_column_index(edges_header, "object")
    source_col_idx = find_column_index(edges_header, "primary_knowledge_source")

    if subject_col_idx == -1 or predicate_col_idx == -1 or object_col_idx == -1:
        print("Error: Could not find required columns in edges file")
        print("Looking for 'subject', 'predicate', and 'object' columns")
        sys.exit(1)

    has_source = source_col_idx != -1
    if not has_source:
        print("Warning: Could not find 'primary_knowledge_source' column in edges file")
        print("The source information will not be included in the output")

    print(f"In edges file: Subject column is {subject_col_idx + 1}, "
          f"Predicate column is {predicate_col_idx + 1}, "
          f"Object column is {object_col_idx + 1}")
    if has_source:
        print(f"Knowledge source column is {source_col_idx + 1}")

    # Load node ID -> name and ID -> category mappings
    id_to_name = {}
    id_to_category = {}

    with open(nodes_file, 'r', newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        next(reader)  # Skip header
        for row in reader:
            if len(row) > max(id_col_idx, name_col_idx):
                node_id = row[id_col_idx]
                node_name = row[name_col_idx]

                # Get the category if available
                node_category = ""
                if has_categories and len(row) > category_col_idx:
                    node_category = row[category_col_idx]

                if node_id:
                    id_to_name[node_id] = node_name
                    if node_category:
                        id_to_category[node_id] = node_category

    print(f"Loaded {len(id_to_name)} node mappings")
    if has_categories:
        print(f"Loaded {len(id_to_category)} node categories")
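
    # At this point id_to_name maps CURIEs to labels, e.g. (hypothetical entries):
    #   {"HGNC:620": "APP", "MONDO:0004975": "Alzheimer disease"}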

    # Show mode info
    if debug_limit > 0:
        print(f"Processing first {debug_limit} edges (debug mode)...")
    else:
        print("Processing all edges (production mode)...")

    # Write the header for the output file
    with open(output_file, 'w', newline='') as f_out:
        writer = csv.writer(f_out, delimiter='\t')
        if has_source:
            writer.writerow(["subject", "predicate", "object", "source"])
        else:
            writer.writerow(["subject", "predicate", "object"])

    # Track unique edges so duplicates are written only once
    unique_edges = set()
    duplicate_count = 0

    # Process the edges file
    count = 0
    written_count = 0

    with open(edges_file, 'r', newline='') as f_in, \
            open(output_file, 'a', newline='') as f_out:
        reader = csv.reader(f_in, delimiter='\t')
        writer = csv.writer(f_out, delimiter='\t')
        next(reader)  # Skip header; we've already written the output header

        for row in reader:
            if debug_limit > 0 and count >= debug_limit:
                break

            count += 1  # Increment before any continue so processed edges are counted

            if len(row) > max(subject_col_idx, predicate_col_idx, object_col_idx):
                subject_id = row[subject_col_idx]
                predicate = row[predicate_col_idx]
                object_id = row[object_col_idx]

                # Map IDs to names, falling back to the raw ID
                subject_name = id_to_name.get(subject_id, subject_id)
                object_name = id_to_name.get(object_id, object_id)

                # Append category information if available
                if has_categories:
                    if id_to_category.get(subject_id):
                        humanized_category = humanize_category(id_to_category[subject_id])
                        if humanized_category:
                            subject_name = f"{subject_name} ({humanized_category})"

                    if id_to_category.get(object_id):
                        humanized_category = humanize_category(id_to_category[object_id])
                        if humanized_category:
                            object_name = f"{object_name} ({humanized_category})"

                # Humanize the predicate
                predicate = humanize_predicate(predicate, id_to_name)

                # Build the output row, including the source if available
                if has_source and len(row) > source_col_idx:
                    source = humanize_source(row[source_col_idx])
                    output_row = [subject_name, predicate, object_name, source]
                else:
                    output_row = [subject_name, predicate, object_name]

                # Skip duplicate edges
                edge_key = tuple(output_row)
                if edge_key in unique_edges:
                    duplicate_count += 1
                    continue

                # Record the edge and write it out
                unique_edges.add(edge_key)
                writer.writerow(output_row)
                written_count += 1

    print(f"Processed {count} edges")
    print(f"Found {duplicate_count} duplicate edges")
    print(f"{written_count} unique humanized edges have been saved to {output_file}")


if __name__ == "__main__":
    main()
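
# Example invocations (assumes the KG TSVs live under ./data):
#   $ python3 make_humanized_graph.py 1000   # debug mode: first 1000 edges
#   $ python3 make_humanized_graph.py        # production mode: all edges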