@@ -269,34 +269,18 @@ def multiprocess_annotation_records(input_annotations: dict):
269269 else :
270270 record_chunks = input_annotations
271271
272+ counter = 0
272273 for record_chunk in record_chunks :
273274 rec_que .put (record_chunk )
274275 annotation_process_pool_results .append (annotations_process_pool .starmap_async (_process_annotation_records , [(rec_que .get (),)], error_callback = logging .error ))
276+ counter += 1
275277
276- for result in annotation_process_pool_results :
277- result_data = result .get (timeout = TIMEOUT )
278+ for result in annotation_process_pool_results :
279+ result_data = result .get (timeout = TIMEOUT )
278280
279- _cui2ptt_pos , _cui2ptt_tsp = result_data [0 ][0 ], result_data [0 ][1 ]
280-
281- for cui , patient_id_count_vals in _cui2ptt_pos .items ():
282- if cui not in cui2ptt_pos .keys ():
283- cui2ptt_pos [cui ] = patient_id_count_vals
284- else :
285- for patient_id , count in patient_id_count_vals .items ():
286- if patient_id not in cui2ptt_pos [cui ].keys ():
287- cui2ptt_pos [cui ][patient_id ] = count
288- else :
289- cui2ptt_pos [cui ][patient_id ] += count
290-
291- for cui , patient_id_timestamps in _cui2ptt_tsp .items ():
292- if cui not in cui2ptt_tsp .keys ():
293- cui2ptt_tsp [cui ] = patient_id_timestamps
294- else :
295- for patient_id , timestamp in patient_id_timestamps .items ():
296- if patient_id not in cui2ptt_tsp [cui ].keys ():
297- cui2ptt_tsp [cui ][patient_id ] = timestamp
298- else :
299- cui2ptt_tsp [cui ][patient_id ] = timestamp
281+ _cui2ptt_pos , _cui2ptt_tsp = result_data [0 ][0 ], result_data [0 ][1 ]
282+ cui2ptt_pos .update (_cui2ptt_pos )
283+ cui2ptt_tsp .update (_cui2ptt_tsp )
300284
301285 except Exception as exception :
302286 time = datetime .now ()
@@ -372,13 +356,6 @@ def multiprocess_annotation_records(input_annotations: dict):
372356 global_doc2ptt = json .loads (global_doc2ptt )
373357
374358if INPUT_ANNOTATIONS_RECORDS_FILE_NAME_PATTERN :
375-
376- # cui2ptt_pos.jsonl each line is a dictionary of cui and the value is a dictionary of patients with a count {<cui>: {<patient_id>:<count>, ...}}\n...
377- cui2ptt_pos = defaultdict (Counter ) # store the count of a SNOMED term for a patient
378-
379- # cui2ptt_tsp.jsonl each line is a dictionary of cui and the value is a dictionary of patients with a timestamp {<cui>: {<patient_id>:<tsp>, ...}}\n...
380- cui2ptt_tsp = defaultdict (lambda : defaultdict (int )) # store the first mention timestamp of a SNOMED term for a patient
381-
382359 # read each of the patient record files one by one
383360 for root , sub_directories , files in os .walk (INPUT_FOLDER_PATH ):
384361 for file_name in files :
@@ -390,42 +367,21 @@ def multiprocess_annotation_records(input_annotations: dict):
390367 with open (f_path , mode = "r+" ) as f :
391368 contents = json .loads (f .read ())
392369
393- _cui2ptt_pos , _cui2ptt_tsp = multiprocess_annotation_records (contents )
370+ cui2ptt_pos , cui2ptt_tsp = multiprocess_annotation_records (contents )
371+ with open (os .path .join (OUTPUT_FOLDER_PATH , "cui2ptt_pos.jsonl" ), "a+" , encoding = "utf-8" ) as outfile :
372+ for k ,v in cui2ptt_pos .items ():
373+ o = {k : v }
374+ json_obj = json .loads (json .dumps (o ))
375+ json .dump (json_obj , outfile , ensure_ascii = False , indent = None , separators = (',' ,':' ))
376+ print ('' , file = outfile )
377+
378+ with open (os .path .join (OUTPUT_FOLDER_PATH , "cui2ptt_tsp.jsonl" ), "a+" , encoding = "utf-8" ) as outfile :
379+ for k ,v in cui2ptt_tsp .items ():
380+ o = {k : v }
381+ json_obj = json .loads (json .dumps (o ))
382+ json .dump (json_obj , outfile , ensure_ascii = False , indent = None , separators = (',' ,':' ))
383+ print ('' , file = outfile )
394384
395385 with open (log_file_path , "a+" ) as log_file :
396386 time = datetime .now ()
397- log_file .write ("\n " + str (time ) + ": processed file " + str (file_name ))
398-
399- for cui , patient_id_count_vals in _cui2ptt_pos .items ():
400- if cui not in cui2ptt_pos .keys ():
401- cui2ptt_pos [cui ] = patient_id_count_vals
402- else :
403- for patient_id , count in patient_id_count_vals .items ():
404- if patient_id not in cui2ptt_pos [cui ]:
405- cui2ptt_pos [cui ][patient_id ] = count
406- else :
407- cui2ptt_pos [cui ][patient_id ] += count
408-
409- for cui , patient_id_timestamps in _cui2ptt_tsp .items ():
410- if cui not in cui2ptt_tsp .keys ():
411- cui2ptt_tsp [cui ] = patient_id_timestamps
412- else :
413- for patient_id , timestamp in patient_id_timestamps .items ():
414- if patient_id not in cui2ptt_pos [cui ].keys ():
415- cui2ptt_tsp [cui ][patient_id ] = timestamp
416- else :
417- cui2ptt_tsp [cui ][patient_id ] = timestamp
418-
419- with open (os .path .join (OUTPUT_FOLDER_PATH , "cui2ptt_pos.jsonl" ), "a+" , encoding = "utf-8" ) as outfile :
420- for k ,v in cui2ptt_pos .items ():
421- o = {k : v }
422- json_obj = json .loads (json .dumps (o ))
423- json .dump (json_obj , outfile , ensure_ascii = False , indent = None , separators = (',' ,':' ))
424- print ('' , file = outfile )
425-
426- with open (os .path .join (OUTPUT_FOLDER_PATH , "cui2ptt_tsp.jsonl" ), "a+" , encoding = "utf-8" ) as outfile :
427- for k ,v in cui2ptt_tsp .items ():
428- o = {k : v }
429- json_obj = json .loads (json .dumps (o ))
430- json .dump (json_obj , outfile , ensure_ascii = False , indent = None , separators = (',' ,':' ))
431- print ('' , file = outfile )
387+ log_file .write ("\n " + str (time ) + ": processed file " + str (file_name ))
0 commit comments