Commit fca1423

Updated dump processing script
1 parent c4776d8 commit fca1423

File tree

1 file changed: +45 -14 lines changed

scripts/sde_dump_processing/clean_sde_dump.py

Lines changed: 45 additions & 14 deletions
@@ -1,6 +1,9 @@
 """
 to run
 python clean_sde_dump.py inputs/sde_init_dump_04_28_2025.csv outputs/input.csv -v
+python3 scripts/sde_dump_processing/clean_sde_dump.py \
+/Users/dhanursharma/Everything/sde/full_text_dump/dev_sde_index_cmr_related_urls_sample_2025-08-19.csv \
+/Users/dhanursharma/Everything/sde/full_text_dump/processed_dev_sde_index_cmr_related_urls_sample_2025-08-19.csv -v
 
 to compress
 7z a -t7z -m0=lzma2 -mx=9 -mmt=on -md=512m outputs/output.7z outputs/input.csv
@@ -128,13 +131,15 @@ def assemble_records_generator(input_filename, skip_header=False, progress_inter
         raise
 
 
-def process_records_generator(raw_records_iterator, delimiter="༜", expected_fields=None, batch_size=50000):
+def process_records_generator(raw_records_iterator, delimiter="༜", expected_fields=None, batch_size=50000, header=None):
     """
     Processes records from an iterator, yielding valid processed records.
-    Requires expected_fields to be specified.
+    Requires expected_fields and a header to be specified.
     """
     if expected_fields is None:
         raise ValueError("process_records_generator requires 'expected_fields' to be specified.")
+    if header is None:
+        raise ValueError("process_records_generator requires a 'header' list to identify target fields.")
 
     processed_count = 0
     invalid_records = 0
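
A minimal usage sketch of the new contract (assuming process_records_generator can be imported from clean_sde_dump.py; the header and record below are invented for illustration): both expected_fields and header must now be supplied, otherwise the generator raises ValueError before any records are processed.

from clean_sde_dump import process_records_generator  # assumed import path

header = ["id", "data_product_desc", "text"]  # hypothetical header, not from the dump
raw_records = iter(["1༜a short description༜full text body"])  # hypothetical record

processed = process_records_generator(
    raw_records,
    delimiter="༜",
    expected_fields=len(header),
    header=header,  # omitting this now raises ValueError
)
print(list(processed))  # [['1', 'a short description', 'full text body']]
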
@@ -144,23 +149,34 @@ def process_records_generator(raw_records_iterator, delimiter="༜", expected_fi
 
     logger.info(f"Starting record processing (expecting {expected_fields} fields)...")
 
+    # --- Find indices of target full-text fields from the header ---
+    # Default to -1; if a field is not found, its processing will be skipped.
+    text_field_index = -1
+    data_product_desc_index = -1
+    try:
+        text_field_index = header.index("text")
+        logger.info(f"Target field 'text' found in header at index: {text_field_index}")
+    except ValueError:
+        logger.info("Target field 'text' not found in header. It will be skipped if not present.")
+    try:
+        data_product_desc_index = header.index("data_product_desc")
+        logger.info(f"Target field 'data_product_desc' found in header at index: {data_product_desc_index}")
+    except ValueError:
+        logger.info("Target field 'data_product_desc' not found in header. It will be skipped if not present.")
+
     for i, record in enumerate(raw_records_iterator):
         input_record_index = i + 1
         # Split into AT MOST expected_fields parts. The last part will contain the rest.
         parts = record.split(delimiter, expected_fields - 1)
 
         if len(parts) == expected_fields:
-            # Replace literal newlines with '\\n' string in the text field
-            # The text field is the last one, index expected_fields - 1
-            text_field_index = expected_fields - 1
-            if text_field_index < len(parts):  # Ensure index exists
+            # Replace newlines in the 'text' field if it was found
+            if text_field_index != -1 and text_field_index < len(parts):
                 parts[text_field_index] = parts[text_field_index].replace("\n", "\\n")
-            else:
-                # This case should technically not happen if len(parts) == expected_fields
-                # but adding a safeguard.
-                logger.warning(
-                    f"Record ~{input_record_index}: Field index {text_field_index} out of bounds unexpectedly."
-                )
+
+            # Replace newlines in the 'data_product_desc' field if it was found
+            if data_product_desc_index != -1 and data_product_desc_index < len(parts):
+                parts[data_product_desc_index] = parts[data_product_desc_index].replace("\n", "\\n")
 
             processed_count += 1
             yield parts  # Yield the valid processed record
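
To make the split-and-escape step above concrete, a standalone sketch (the delimiter, header, and record are illustrative): str.split(delimiter, expected_fields - 1) caps the number of parts, so a stray delimiter inside the trailing free-text field stays in that last part, and newlines are escaped only in the fields actually located in the header.

delimiter = "༜"
header = ["id", "data_product_desc", "text"]  # hypothetical header
expected_fields = len(header)

# A record whose description holds a newline and whose text holds a stray delimiter.
record = "42༜multi-line\ndescription༜body text with a stray ༜ inside"

parts = record.split(delimiter, expected_fields - 1)  # at most 3 parts
assert len(parts) == expected_fields

for name in ("text", "data_product_desc"):
    idx = header.index(name) if name in header else -1
    if idx != -1 and idx < len(parts):
        parts[idx] = parts[idx].replace("\n", "\\n")

print(parts)
# ['42', 'multi-line\\ndescription', 'body text with a stray ༜ inside']
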
@@ -265,6 +281,21 @@ def process_large_csv(input_filename, output_filename, delimiter="༜", verbose=
         first_line = next(infile).rstrip("\n")
         header = first_line.split(delimiter)
         expected_fields = len(header)
+        # try:
+        #     sourcestr15_index = header.index('sourcestr15')
+        #     print("\n--- SCRIPT DEBUGGING INFO ---")
+        #     print(f"Total fields found in header: {expected_fields}")
+        #     print(f"0-based index of 'sourcestr15': {sourcestr15_index}")
+
+        #     number_to_subtract = expected_fields - sourcestr15_index
+        #     print(f"THE NUMBER TO SUBTRACT IS: {number_to_subtract}")
+        #     print("-----------------------------\n")
+
+        # except ValueError:
+        #     print("\n--- SCRIPT DEBUGGING ERROR ---")
+        #     print("CRITICAL: 'sourcestr15' was NOT found in the input file's header.")
+        #     print("This means the cleaning cannot be done.")
+        #     print("-------------------------------\n")
         logger.info(f"Detected header: {header}")
         logger.info(f"Expecting {expected_fields} fields based on header.")
         if expected_fields == 0:
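
The commented-out block above is a one-off check; with a hypothetical header, the quantity it reports is simply the count of fields from 'sourcestr15' through the end of the row.

# Hypothetical header; 'sourcestr15' sits third from the end here.
header = ["id", "title", "sourcestr15", "data_product_desc", "text"]
expected_fields = len(header)
sourcestr15_index = header.index("sourcestr15")
print(expected_fields - sourcestr15_index)  # 3: fields from 'sourcestr15' through the end
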
@@ -285,8 +316,8 @@ def process_large_csv(input_filename, output_filename, delimiter="༜", verbose=
     # Create iterators/generators
     # Pass skip_header=True to avoid processing the header line again
     raw_records_iter = assemble_records_generator(input_filename, skip_header=True)
-    # Pass the dynamically determined expected_fields
-    processed_records_iter = process_records_generator(raw_records_iter, delimiter, expected_fields)
+    # Pass the dynamically determined expected_fields and the header
+    processed_records_iter = process_records_generator(raw_records_iter, delimiter, expected_fields, header=header)
 
     # Write to output file by consuming the processed records iterator, using the detected header
     records_written = write_output_file(output_filename, processed_records_iter, header)
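
Taken together, the last two hunks mean the header detected inside process_large_csv is now threaded through to the record processor. If the entry point were driven from Python rather than the CLI shown in the docstring, the call might look like this (assuming process_large_csv is importable, the paths are placeholders, and verbose mirrors the -v flag):

from clean_sde_dump import process_large_csv  # assumed import path

# Placeholder paths; the real invocations are listed in the module docstring.
process_large_csv(
    "inputs/sde_init_dump_04_28_2025.csv",
    "outputs/input.csv",
    delimiter="༜",
    verbose=True,  # assumed to mirror the -v flag
)
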
