Skip to content

Commit f59090e

Browse files
committed
NiFi: reverted cohort export script (ann section).
1 parent 7ea2ff7 commit f59090e

File tree

1 file changed

+22
-66
lines changed

1 file changed

+22
-66
lines changed

nifi/user-scripts/cogstack_cohort_generate_data.py

Lines changed: 22 additions & 66 deletions
Original file line number | Diff line number | Diff line change
@@ -269,34 +269,18 @@ def multiprocess_annotation_records(input_annotations: dict):
269269
else:
270270
record_chunks = input_annotations
271271

272+
counter = 0
272273
for record_chunk in record_chunks:
273274
rec_que.put(record_chunk)
274275
annotation_process_pool_results.append(annotations_process_pool.starmap_async(_process_annotation_records, [(rec_que.get(),)], error_callback=logging.error))
276+
counter += 1
275277

276-
for result in annotation_process_pool_results:
277-
result_data = result.get(timeout=TIMEOUT)
278+
for result in annotation_process_pool_results:
279+
result_data = result.get(timeout=TIMEOUT)
278280

279-
_cui2ptt_pos, _cui2ptt_tsp = result_data[0][0], result_data[0][1]
280-
281-
for cui, patient_id_count_vals in _cui2ptt_pos.items():
282-
if cui not in cui2ptt_pos.keys():
283-
cui2ptt_pos[cui] = patient_id_count_vals
284-
else:
285-
for patient_id, count in patient_id_count_vals.items():
286-
if patient_id not in cui2ptt_pos[cui].keys():
287-
cui2ptt_pos[cui][patient_id] = count
288-
else:
289-
cui2ptt_pos[cui][patient_id] += count
290-
291-
for cui, patient_id_timestamps in _cui2ptt_tsp.items():
292-
if cui not in cui2ptt_tsp.keys():
293-
cui2ptt_tsp[cui] = patient_id_timestamps
294-
else:
295-
for patient_id, timestamp in patient_id_timestamps.items():
296-
if patient_id not in cui2ptt_tsp[cui].keys():
297-
cui2ptt_tsp[cui][patient_id] = timestamp
298-
else:
299-
cui2ptt_tsp[cui][patient_id] = timestamp
281+
_cui2ptt_pos, _cui2ptt_tsp = result_data[0][0], result_data[0][1]
282+
cui2ptt_pos.update(_cui2ptt_pos)
283+
cui2ptt_tsp.update(_cui2ptt_tsp)
300284

301285
except Exception as exception:
302286
time = datetime.now()
@@ -372,13 +356,6 @@ def multiprocess_annotation_records(input_annotations: dict):
372356
global_doc2ptt = json.loads(global_doc2ptt)
373357

374358
if INPUT_ANNOTATIONS_RECORDS_FILE_NAME_PATTERN:
375-
376-
# cui2ptt_pos.jsonl each line is a dictionary of cui and the value is a dictionary of patients with a count {<cui>: {<patient_id>:<count>, ...}}\n...
377-
cui2ptt_pos = defaultdict(Counter) # store the count of a SNOMED term for a patient
378-
379-
# cui2ptt_tsp.jsonl each line is a dictionary of cui and the value is a dictionary of patients with a timestamp {<cui>: {<patient_id>:<tsp>, ...}}\n...
380-
cui2ptt_tsp = defaultdict(lambda: defaultdict(int)) # store the first mention timestamp of a SNOMED term for a patient
381-
382359
# read each of the patient record files one by one
383360
for root, sub_directories, files in os.walk(INPUT_FOLDER_PATH):
384361
for file_name in files:
@@ -390,42 +367,21 @@ def multiprocess_annotation_records(input_annotations: dict):
390367
with open(f_path, mode="r+") as f:
391368
contents = json.loads(f.read())
392369

393-
_cui2ptt_pos, _cui2ptt_tsp = multiprocess_annotation_records(contents)
370+
cui2ptt_pos, cui2ptt_tsp = multiprocess_annotation_records(contents)
371+
with open(os.path.join(OUTPUT_FOLDER_PATH, "cui2ptt_pos.jsonl"), "a+", encoding="utf-8") as outfile:
372+
for k,v in cui2ptt_pos.items():
373+
o = {k: v}
374+
json_obj = json.loads(json.dumps(o))
375+
json.dump(json_obj, outfile, ensure_ascii=False, indent=None, separators=(',',':'))
376+
print('', file=outfile)
377+
378+
with open(os.path.join(OUTPUT_FOLDER_PATH, "cui2ptt_tsp.jsonl"), "a+", encoding="utf-8") as outfile:
379+
for k,v in cui2ptt_tsp.items():
380+
o = {k: v}
381+
json_obj = json.loads(json.dumps(o))
382+
json.dump(json_obj, outfile, ensure_ascii=False, indent=None, separators=(',',':'))
383+
print('', file=outfile)
394384

395385
with open(log_file_path, "a+") as log_file:
396386
time = datetime.now()
397-
log_file.write("\n" + str(time) + ": processed file " + str(file_name))
398-
399-
for cui, patient_id_count_vals in _cui2ptt_pos.items():
400-
if cui not in cui2ptt_pos.keys():
401-
cui2ptt_pos[cui] = patient_id_count_vals
402-
else:
403-
for patient_id, count in patient_id_count_vals.items():
404-
if patient_id not in cui2ptt_pos[cui]:
405-
cui2ptt_pos[cui][patient_id] = count
406-
else:
407-
cui2ptt_pos[cui][patient_id] += count
408-
409-
for cui, patient_id_timestamps in _cui2ptt_tsp.items():
410-
if cui not in cui2ptt_tsp.keys():
411-
cui2ptt_tsp[cui] = patient_id_timestamps
412-
else:
413-
for patient_id, timestamp in patient_id_timestamps.items():
414-
if patient_id not in cui2ptt_pos[cui].keys():
415-
cui2ptt_tsp[cui][patient_id] = timestamp
416-
else:
417-
cui2ptt_tsp[cui][patient_id] = timestamp
418-
419-
with open(os.path.join(OUTPUT_FOLDER_PATH, "cui2ptt_pos.jsonl"), "a+", encoding="utf-8") as outfile:
420-
for k,v in cui2ptt_pos.items():
421-
o = {k: v}
422-
json_obj = json.loads(json.dumps(o))
423-
json.dump(json_obj, outfile, ensure_ascii=False, indent=None, separators=(',',':'))
424-
print('', file=outfile)
425-
426-
with open(os.path.join(OUTPUT_FOLDER_PATH, "cui2ptt_tsp.jsonl"), "a+", encoding="utf-8") as outfile:
427-
for k,v in cui2ptt_tsp.items():
428-
o = {k: v}
429-
json_obj = json.loads(json.dumps(o))
430-
json.dump(json_obj, outfile, ensure_ascii=False, indent=None, separators=(',',':'))
431-
print('', file=outfile)
387+
log_file.write("\n" + str(time) + ": processed file " + str(file_name))

0 commit comments

Comments
 (0)