11"""
22to run
33python clean_sde_dump.py inputs/sde_init_dump_04_28_2025.csv outputs/input.csv -v
4+ python3 scripts/sde_dump_processing/clean_sde_dump.py \
5+ /Users/dhanursharma/Everything/sde/full_text_dump/dev_sde_index_cmr_related_urls_sample_2025-08-19.csv \
6+ /Users/dhanursharma/Everything/sde/full_text_dump/processed_dev_sde_index_cmr_related_urls_sample_2025-08-19.csv -v
47
58to compress
697z a -t7z -m0=lzma2 -mx=9 -mmt=on -md=512m outputs/output.7z outputs/input.csv
@@ -128,13 +131,15 @@ def assemble_records_generator(input_filename, skip_header=False, progress_inter
         raise
 
 
-def process_records_generator(raw_records_iterator, delimiter="༜", expected_fields=None, batch_size=50000):
+def process_records_generator(raw_records_iterator, delimiter="༜", expected_fields=None, batch_size=50000, header=None):
     """
     Processes records from an iterator, yielding valid processed records.
-    Requires expected_fields to be specified.
+    Requires expected_fields and a header to be specified.
     """
     if expected_fields is None:
         raise ValueError("process_records_generator requires 'expected_fields' to be specified.")
+    if header is None:
+        raise ValueError("process_records_generator requires a 'header' list to identify target fields.")
 
     processed_count = 0
     invalid_records = 0
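
With `header=None` rejected up front, the updated contract is easiest to see in isolation; a minimal sketch with toy data (the record and header below are illustrative, not from the real dump):

```python
# Hypothetical usage of the updated process_records_generator signature.
records = iter(["42༜some title༜line one\nline two"])  # toy single-record iterator
header = ["id", "title", "text"]                       # toy header; the real one is read from the file

rows = process_records_generator(records, "༜", expected_fields=3, header=header)
print(next(rows))  # ['42', 'some title', 'line one\\nline two']
```

One subtlety: because the function is a generator, omitting `header` raises the ValueError on the first `next()` call, not at call time.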
@@ -144,23 +149,34 @@ def process_records_generator(raw_records_iterator, delimiter="༜", expected_fi
 
     logger.info(f"Starting record processing (expecting {expected_fields} fields)...")
 
+    # --- Find indices of target full-text fields from the header ---
+    # Default to -1; if a field is not found, its processing will be skipped.
+    text_field_index = -1
+    data_product_desc_index = -1
+    try:
+        text_field_index = header.index("text")
+        logger.info(f"Target field 'text' found in header at index: {text_field_index}")
+    except ValueError:
+        logger.info("Target field 'text' not found in header. It will be skipped if not present.")
+    try:
+        data_product_desc_index = header.index("data_product_desc")
+        logger.info(f"Target field 'data_product_desc' found in header at index: {data_product_desc_index}")
+    except ValueError:
+        logger.info("Target field 'data_product_desc' not found in header. It will be skipped if not present.")
+
     for i, record in enumerate(raw_records_iterator):
         input_record_index = i + 1
         # Split into AT MOST expected_fields parts. The last part will contain the rest.
         parts = record.split(delimiter, expected_fields - 1)
 
         if len(parts) == expected_fields:
-            # Replace literal newlines with '\\n' string in the text field
-            # The text field is the last one, index expected_fields - 1
-            text_field_index = expected_fields - 1
-            if text_field_index < len(parts):  # Ensure index exists
+            # Replace newlines in the 'text' field if it was found
+            if text_field_index != -1 and text_field_index < len(parts):
                 parts[text_field_index] = parts[text_field_index].replace("\n", "\\n")
-            else:
-                # This case should technically not happen if len(parts) == expected_fields
-                # but adding a safeguard.
-                logger.warning(
-                    f"Record ~{input_record_index}: Field index {text_field_index} out of bounds unexpectedly."
-                )
+
+            # Replace newlines in the 'data_product_desc' field if it was found
+            if data_product_desc_index != -1 and data_product_desc_index < len(parts):
+                parts[data_product_desc_index] = parts[data_product_desc_index].replace("\n", "\\n")
 
             processed_count += 1
             yield parts  # Yield the valid processed record
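
The maxsplit-plus-targeted-escaping logic in this hunk is easier to see on a toy record; a standalone sketch (illustrative header and values, assuming the same "༜" delimiter):

```python
# A record whose last field contains both newlines and a stray delimiter.
header = ["id", "data_product_desc", "url", "text"]
record = "42༜multi\nline desc༜https://example.com༜body with\nnewlines༜stray delimiter"

expected_fields = len(header)
parts = record.split("༜", expected_fields - 1)  # at most 4 parts; extras stay inside 'text'
assert len(parts) == expected_fields

for name in ("text", "data_product_desc"):      # escape newlines only in the target fields
    idx = header.index(name)
    parts[idx] = parts[idx].replace("\n", "\\n")

print(parts)
# ['42', 'multi\\nline desc', 'https://example.com', 'body with\\nnewlines༜stray delimiter']
```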
@@ -265,6 +281,21 @@ def process_large_csv(input_filename, output_filename, delimiter="༜", verbose=
             first_line = next(infile).rstrip("\n")
             header = first_line.split(delimiter)
             expected_fields = len(header)
+            # try:
+            #     sourcestr15_index = header.index('sourcestr15')
+            #     print("\n--- SCRIPT DEBUGGING INFO ---")
+            #     print(f"Total fields found in header: {expected_fields}")
+            #     print(f"0-based index of 'sourcestr15': {sourcestr15_index}")
+
+            #     number_to_subtract = expected_fields - sourcestr15_index
+            #     print(f"THE NUMBER TO SUBTRACT IS: {number_to_subtract}")
+            #     print("-----------------------------\n")
+
+            # except ValueError:
+            #     print("\n--- SCRIPT DEBUGGING ERROR ---")
+            #     print("CRITICAL: 'sourcestr15' was NOT found in the input file's header.")
+            #     print("This means the cleaning cannot be done.")
+            #     print("-------------------------------\n")
             logger.info(f"Detected header: {header}")
             logger.info(f"Expecting {expected_fields} fields based on header.")
             if expected_fields == 0:
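
For reference, the quantity the commented-out probe computes, `expected_fields - sourcestr15_index`, is the count of fields from 'sourcestr15' through the end of the row; a worked example with an illustrative header:

```python
# Illustrative header only; the real one comes from the dump file.
header = ["id", "title", "sourcestr15", "url", "text"]
expected_fields = len(header)                    # 5
sourcestr15_index = header.index("sourcestr15")  # 2
print(expected_fields - sourcestr15_index)       # 3 -> fields at indices 2, 3, 4
```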
@@ -285,8 +316,8 @@ def process_large_csv(input_filename, output_filename, delimiter="༜", verbose=
         # Create iterators/generators
         # Pass skip_header=True to avoid processing the header line again
         raw_records_iter = assemble_records_generator(input_filename, skip_header=True)
-        # Pass the dynamically determined expected_fields
-        processed_records_iter = process_records_generator(raw_records_iter, delimiter, expected_fields)
+        # Pass the dynamically determined expected_fields and the header
+        processed_records_iter = process_records_generator(raw_records_iter, delimiter, expected_fields, header=header)
 
         # Write to output file by consuming the processed records iterator, using the detected header
         records_written = write_output_file(output_filename, processed_records_iter, header)
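
Taken together, these calls form a lazy generator pipeline: no input line is read until `write_output_file` iterates `processed_records_iter`. A hedged sketch of the same pattern with stand-in stages (function names and bodies below are illustrative, not the script's actual API):

```python
# Minimal read -> clean -> write generator pipeline, mirroring the wiring above.
def read_records(path):
    with open(path, encoding="utf-8") as f:
        next(f)                          # skip the header line
        for line in f:
            yield line.rstrip("\n")

def clean_records(rows, delimiter, n_fields):
    for row in rows:
        parts = row.split(delimiter, n_fields - 1)
        if len(parts) == n_fields:       # only structurally valid rows pass through
            yield parts

def write_records(path, rows, header, delimiter):
    written = 0
    with open(path, "w", encoding="utf-8") as f:
        f.write(delimiter.join(header) + "\n")
        for parts in rows:               # consuming here pulls the whole pipeline
            f.write(delimiter.join(parts) + "\n")
            written += 1
    return written
```

Because each stage holds only one record at a time, peak memory stays flat regardless of input size.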