from clients import logger
from file_level_validation import file_level_validation
from errors import NoOperationPermissions, InvalidHeaders
+from utils_for_recordprocessor import get_csv_content_dict_reader


def process_csv_to_fhir(incoming_message_body: dict) -> None:
    """
    For each row of the csv, attempts to transform into FHIR format, sends a message to kinesis,
    and documents the outcome for each row in the ack file.
    """
-    encoder = "utf-8"  # default encoding
    try:
        interim_message_body = file_level_validation(incoming_message_body=incoming_message_body)
    except (InvalidHeaders, NoOperationPermissions, Exception):  # pylint: disable=broad-exception-caught
@@ -32,53 +32,72 @@ def process_csv_to_fhir(incoming_message_body: dict) -> None:
    csv_reader = interim_message_body.get("csv_dict_reader")

    target_disease = map_target_disease(vaccine)
+    logger.info("Processing csv to fhir for file: %s", file_key)
    row_count = 0
+    encoder = "utf-8"  # default encoding
    try:
        row_count = process_rows(file_id, vaccine, supplier, file_key, allowed_operations,
                                 created_at_formatted_string, csv_reader, target_disease)
    except Exception as error:  # pylint: disable=broad-exception-caught
        new_encoder = "cp1252"
        print(f"Error processing: {error}.")
-        # check if it's a decode error, ie error.args[0] begins with "'utf-8' codec can't decode byte"
-        if error.args[0].startswith("'utf-8' codec can't decode byte"):
+        # check if it's a decode error, so the file can be re-read with a fallback encoding
+        if isinstance(error, UnicodeDecodeError):
            print(f"Encode error at row {row_count} with {encoder}. Switch to {new_encoder}")
-            print(f"Detected decode error: {error.args[0]}")
-            # if we are here, re-read the file with correct encoding and ignore the processed rows
-            # if error.args[0] == "'utf-8' codec can't decode byte 0xe9 in position 2996: invalid continuation byte":
-            # cp1252
-            row_count += process_rows_retry(file_id, vaccine, supplier, file_key,
-                                            allowed_operations, created_at_formatted_string,
-                                            "cp1252", start_row=row_count)
+            encoder = new_encoder
+            # Re-read the file with the fallback encoding and skip the rows already processed
+            row_count = process_rows_retry(file_id, vaccine, supplier, file_key,
+                                           allowed_operations, created_at_formatted_string,
+                                           encoder, target_disease, row_count)
        else:
            logger.error(f"Non-decode error: {error}. Cannot retry.")
            raise error from error

    logger.info("Total rows processed: %s", row_count)
-    update_audit_table_status(file_key, file_id, FileStatus.PREPROCESSED)


def process_rows_retry(file_id, vaccine, supplier, file_key, allowed_operations,
-                       created_at_formatted_string, encoder, target_disease, start_row=0) -> int:
-    new_reader = get_csv_content_dict_reader(file_key, encoding=encoder)
-    return process_rows(file_id, vaccine, supplier, file_key, allowed_operations,
-                        created_at_formatted_string, new_reader, start_row)
+                       created_at_formatted_string, encoder, target_disease,
+                       total_rows_processed_count=0) -> int:
+    """
+    Retry processing rows with a different encoding, skipping rows that have already been processed.
+    """
+    logger.info("Retrying with encoding %s from row %s", encoder, total_rows_processed_count)
+    new_reader = get_csv_content_dict_reader(file_key, encoder=encoder)
+
+    return process_rows(file_id, vaccine, supplier, file_key, allowed_operations,
+                        created_at_formatted_string, new_reader, target_disease,
+                        total_rows_processed_count)


def process_rows(file_id, vaccine, supplier, file_key, allowed_operations, created_at_formatted_string,
-                 csv_reader, target_disease, start_row=0) -> int:
+                 csv_reader, target_disease,
+                 total_rows_processed_count=0) -> int:
    """
    Processes each row in the csv_reader starting from start_row.
    """

    row_count = 0
+    start_row = total_rows_processed_count  # rows up to this point have already been sent
    for row in csv_reader:
-        if row_count >= start_row:
-            row_count += 1
+        row_count += 1
+        if row_count > start_row:
            row_id = f"{file_id}^{row_count}"
            logger.info("MESSAGE ID : %s", row_id)

+            # Log progress periodically so long-running files can be monitored
+            if total_rows_processed_count % 1000 == 0:
+                logger.info("Rows processed so far: %s", total_rows_processed_count)
+
+            # Process the row to obtain the details needed for the message_body and ack file
            details_from_processing = process_row(target_disease, allowed_operations, row)

+            # Create the message body for sending
            outgoing_message_body = {
                "row_id": row_id,
                "file_key": file_key,
@@ -89,8 +108,9 @@ def process_rows(file_id, vaccine, supplier, file_key, allowed_operations, creat
            }

            send_to_kinesis(supplier, outgoing_message_body, vaccine)
-
-    return row_count
+            total_rows_processed_count += 1
+    logger.info("Total rows processed: %s", total_rows_processed_count)
+    return total_rows_processed_count


def main(event: str) -> None:
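
The change above implements an encoding fallback: read with utf-8 first, and if the content turns out to be cp1252, re-read the file and skip the rows that were already sent. A minimal, self-contained sketch of that pattern follows; the function name, the in-memory byte source and the plain csv.DictReader are illustrative stand-ins, not the real get_csv_content_dict_reader or the S3 plumbing.

import csv
import io


def read_rows_with_fallback(raw_bytes: bytes, primary: str = "utf-8", fallback: str = "cp1252"):
    """Sketch: yield csv rows, re-reading with a fallback encoding if decoding fails part-way through."""
    rows_yielded = 0
    try:
        # TextIOWrapper decodes lazily, so a bad byte only surfaces once the reader reaches it
        text = io.TextIOWrapper(io.BytesIO(raw_bytes), encoding=primary)
        for row in csv.DictReader(text):
            rows_yielded += 1
            yield row
    except UnicodeDecodeError:
        # Re-read from the start with the fallback encoding and skip the rows already yielded
        text = io.TextIOWrapper(io.BytesIO(raw_bytes), encoding=fallback)
        for index, row in enumerate(csv.DictReader(text), start=1):
            if index > rows_yielded:
                yield row

For example, list(read_rows_with_fallback(b"PERSON_SURNAME\nRen\xe9e\n")) falls back to cp1252 because the 0xe9 byte is rejected by utf-8 ("invalid continuation byte"), which is the same failure mode the retry in process_csv_to_fhir is guarding against.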
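send_to_kinesis itself is not part of this diff; purely for orientation, here is a hedged sketch of what a per-row put onto a Kinesis data stream could look like with boto3. The stream-name environment variable and the partition-key choice are assumptions, not taken from this codebase.

import json
import os

import boto3

kinesis_client = boto3.client("kinesis")


def send_to_kinesis(supplier: str, outgoing_message_body: dict, vaccine: str) -> None:
    """Sketch only: put one row's message onto a Kinesis data stream."""
    stream_name = os.environ["KINESIS_STREAM_NAME"]  # assumed configuration key
    kinesis_client.put_record(
        StreamName=stream_name,
        Data=json.dumps(outgoing_message_body).encode("utf-8"),
        # Partitioning by supplier and vaccine keeps related rows on one shard, in order (an assumption)
        PartitionKey=f"{supplier}_{vaccine}",
    )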