Skip to content

Commit 0a013a5

Browse files
committed
Ensure unique edges
1 parent: c996c12 · commit: 0a013a5

File tree

1 file changed

+23
-5
lines changed

1 file changed

+23
-5
lines changed

scripts/make_humanized_graph.py

Lines changed: 23 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -177,8 +177,14 @@ def main():
177177
else:
178178
writer.writerow(["subject", "predicate", "object"])
179179

180+
# Set to track unique edges
181+
unique_edges = set()
182+
duplicate_count = 0
183+
180184
# Process edges file
181185
count = 0
186+
written_count = 0
187+
182188
with open(edges_file, 'r') as f_in, open(output_file, 'a', newline='') as f_out:
183189
reader = csv.reader(f_in, delimiter='\t')
184190
writer = csv.writer(f_out, delimiter='\t')
@@ -188,6 +194,8 @@ def main():
188194
if debug_limit > 0 and count >= debug_limit:
189195
break
190196

197+
count += 1 # Increment before possible continue to correctly count processed edges
198+
191199
if len(row) > max(subject_col_idx, predicate_col_idx, object_col_idx):
192200
subject_id = row[subject_col_idx]
193201
predicate = row[predicate_col_idx]
@@ -214,18 +222,28 @@ def main():
214222
# Humanize predicate
215223
predicate = humanize_predicate(predicate, id_to_name)
216224

225+
# Create output row based on whether source is available
217226
if has_source and len(row) > source_col_idx:
218227
source = row[source_col_idx]
219228
source = humanize_source(source)
220-
writer.writerow(
221-
[subject_name, predicate, object_name, source])
229+
output_row = [subject_name, predicate, object_name, source]
222230
else:
223-
writer.writerow([subject_name, predicate, object_name])
231+
output_row = [subject_name, predicate, object_name]
232+
233+
# Check if this is a duplicate edge
234+
edge_key = tuple(output_row)
235+
if edge_key in unique_edges:
236+
duplicate_count += 1
237+
continue # Skip writing this edge
224238

225-
count += 1
239+
# Add to set of unique edges and write to output
240+
unique_edges.add(edge_key)
241+
writer.writerow(output_row)
242+
written_count += 1
226243

227244
print(f"Processed {count} edges")
228-
print(f"{limit_msg} humanized edges have been saved to {output_file}")
245+
print(f"Found {duplicate_count} duplicate edges")
246+
print(f"{written_count} unique humanized edges have been saved to {output_file}")
229247

230248

231249
if __name__ == "__main__":

0 commit comments

Comments
 (0)