@@ -177,8 +177,14 @@ def main():
177177 else :
178178 writer .writerow (["subject" , "predicate" , "object" ])
179179
180+ # Set to track unique edges
181+ unique_edges = set ()
182+ duplicate_count = 0
183+
180184 # Process edges file
181185 count = 0
186+ written_count = 0
187+
182188 with open (edges_file , 'r' ) as f_in , open (output_file , 'a' , newline = '' ) as f_out :
183189 reader = csv .reader (f_in , delimiter = '\t ' )
184190 writer = csv .writer (f_out , delimiter = '\t ' )
@@ -188,6 +194,8 @@ def main():
188194 if debug_limit > 0 and count >= debug_limit :
189195 break
190196
197+ count += 1 # Increment before possible continue to correctly count processed edges
198+
191199 if len (row ) > max (subject_col_idx , predicate_col_idx , object_col_idx ):
192200 subject_id = row [subject_col_idx ]
193201 predicate = row [predicate_col_idx ]
@@ -214,18 +222,28 @@ def main():
214222 # Humanize predicate
215223 predicate = humanize_predicate (predicate , id_to_name )
216224
225+ # Create output row based on whether source is available
217226 if has_source and len (row ) > source_col_idx :
218227 source = row [source_col_idx ]
219228 source = humanize_source (source )
220- writer .writerow (
221- [subject_name , predicate , object_name , source ])
229+ output_row = [subject_name , predicate , object_name , source ]
222230 else :
223- writer .writerow ([subject_name , predicate , object_name ])
231+ output_row = [subject_name , predicate , object_name ]
232+
233+ # Check if this is a duplicate edge
234+ edge_key = tuple (output_row )
235+ if edge_key in unique_edges :
236+ duplicate_count += 1
237+ continue # Skip writing this edge
224238
225- count += 1
239+ # Add to set of unique edges and write to output
240+ unique_edges .add (edge_key )
241+ writer .writerow (output_row )
242+ written_count += 1
226243
227244 print (f"Processed { count } edges" )
228- print (f"{ limit_msg } humanized edges have been saved to { output_file } " )
245+ print (f"Found { duplicate_count } duplicate edges" )
246+ print (f"{ written_count } unique humanized edges have been saved to { output_file } " )
229247
230248
231249if __name__ == "__main__" :
0 commit comments