@@ -183,30 +183,36 @@ def yield_token_label_pair(self, doc, lists=False):
 
             token_counter += 1
 
+
 def get_document_hashes(dataset):
     """Get the hashes for every doc in a dataset and return as set
     """
     return set([doc["_input_hash"] for doc in dataset])
 
+
 def check_all_equal(lst):
     """Check that all items in a list are equal and return True or False
     """
     return not lst or lst.count(lst[0]) == len(lst)
 
+
 def hash_matches(doc, hash):
     """Check whether the hash of the passed doc matches the passed hash
     """
     return doc["_input_hash"] == hash
 
+
 def get_doc_by_hash(dataset, hash):
     """Return a doc from a dataset where hash matches doc["_input_hash"]
     Assumes there will only be one match!
     """
     return [doc for doc in dataset if doc["_input_hash"] == hash][0]
 
+
 def get_tokens(doc):
     return [token["text"] for token in doc["tokens"]]
 
+
 def check_inputs(annotated_data):
     """Checks whether two prodigy datasets contain the same docs (evaluated by
     doc["_input_hash"] and whether those docs contain the same tokens. This is
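Note: the helpers in this hunk all key off prodigy's `_input_hash`. A minimal sketch of how they behave, using made-up docs and hash values (real prodigy docs carry many more keys):

    docs = [
        {"_input_hash": 111, "tokens": [{"text": "Smith"}, {"text": "2001"}]},
        {"_input_hash": 222, "tokens": [{"text": "Jones"}, {"text": "1999"}]},
    ]

    get_document_hashes(docs)   # {111, 222}
    check_all_equal([1, 1, 1])  # True (an empty list also returns True)
    hash_matches(docs[0], 111)  # True
    get_doc_by_hash(docs, 222)  # the second doc; assumes exactly one match
    get_tokens(docs[0])         # ["Smith", "2001"]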
@@ -231,7 +237,9 @@ def check_inputs(annotated_data):
             diff = set(doc_hashes[i]) ^ set(doc_hashes[j])
 
             if diff:
-                msg.fail(f"Docs {diff} unequal between dataset {i} and {j}", exits=1)
+                msg.fail(
+                    f"Docs {diff} unequal between dataset {i} and {j}", exits=1
+                )
 
     # Check that the tokens between the splitting and parsing docs match
 
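Note: the `^` here is set symmetric difference, so `diff` collects the hashes present in one dataset but not the other. A quick sketch with invented hashes:

    {111, 222, 333} ^ {111, 222, 444}  # {333, 444}: docs missing from one side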
@@ -245,19 +253,27 @@ def check_inputs(annotated_data):
 
     return True
 
+
 def sort_docs_list(lst):
     """Sort a list of prodigy docs by input hash
     """
-    return sorted(lst, key=lambda k: k['_input_hash'])
+    return sorted(lst, key=lambda k: k["_input_hash"])
+
 
 def combine_token_label_pairs(pairs):
     """Combines a list of [(token, label), (token, label)] to give
     (token,label,label).
     """
     return pairs[0][0:] + tuple(pair[1] for pair in pairs[1:])
 
+
 @plac.annotations(
-    input_files=("Comma separated list of paths to jsonl files containing prodigy docs.", "positional", None, str),
+    input_files=(
+        "Comma separated list of paths to jsonl files containing prodigy docs.",
+        "positional",
+        None,
+        str,
+    ),
     output_file=("Path to output tsv file.", "positional", None, str),
     respect_lines=(
        "Respect line endings? Or parse entire document in a single string?",
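Note: `combine_token_label_pairs` keeps the whole first pair and appends the label from every subsequent pair. A sketch with invented labels:

    combine_token_label_pairs([("Smith", "B-author"), ("Smith", "O")])
    # ("Smith", "B-author", "O")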
@@ -343,16 +359,22 @@ def prodigy_to_tsv(
     # NOTE: Use of reduce to handle pairs_list of unknown length
 
     if len(pairs_list) > 1:
-        merged_pairs = (combine_token_label_pairs(pairs) for pairs in reduce(zip, pairs_list))
-        example_pairs = [combine_token_label_pairs(pairs) for i, pairs in enumerate(reduce(zip, pairs_list)) if i < 15]
+        merged_pairs = (
+            combine_token_label_pairs(pairs) for pairs in reduce(zip, pairs_list)
+        )
+        example_pairs = [
+            combine_token_label_pairs(pairs)
+            for i, pairs in enumerate(reduce(zip, pairs_list))
+            if i < 15
+        ]
     else:
         merged_pairs = pairs_list[0]
         example_pairs = merged_pairs[0:14]
 
     with open(output_file, "w") as fb:
         writer = csv.writer(fb, delimiter="\t")
         # Write DOCSTART and a blank line
-        #writer.writerows([("DOCSTART", None), (None, None)])
+        # writer.writerows([("DOCSTART", None), (None, None)])
         writer.writerows(merged_pairs)
 
     # Print out the first ten rows as a sense check
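Note: with two input datasets, `reduce(zip, pairs_list)` is just `zip(pairs_list[0], pairs_list[1])`, pairing each token's annotations across datasets before they are merged into one output row. A sketch assuming two datasets with invented labels:

    from functools import reduce

    splitting = [("Smith", "B-author"), ("2001", "O")]
    parsing = [("Smith", "O"), ("2001", "B-year")]

    rows = [combine_token_label_pairs(p) for p in reduce(zip, [splitting, parsing])]
    # [("Smith", "B-author", "O"), ("2001", "O", "B-year")]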