def process_large_csv(input_filename, output_filename):
    """Re-assemble '༜'-delimited records from a dump file and write them as CSV.

    The input is a text dump in which each logical record spans one or more
    physical lines: a line beginning with ``/SDE/`` or ``/SDE-TDAMM/`` starts a
    new record, and any following line that lacks those prefixes is a
    continuation of the current record.  Each complete record holds exactly
    8 fields separated by the '༜' character; embedded newlines in the 7th
    (free-text) field are escaped to a literal ``\\n`` before writing.

    Parameters
    ----------
    input_filename : str
        Path of the dump file to read (assumed UTF-8).
    output_filename : str
        Path of the CSV file to write (UTF-8, ``newline=""`` per csv docs).

    Records that do not split into exactly 8 fields are skipped with a
    warning printed to stdout.
    """
    # Local import: this chunk does not show the file's import block, so the
    # dependency is brought into scope here to keep the function self-contained.
    import csv

    def _emit_record(writer, record):
        # Split into exactly 8 fields.  maxsplit=7 ensures any additional
        # occurrences of '༜' (e.g. inside the text field) remain intact in
        # the final field rather than producing spurious extra columns.
        parts = record.split("༜", 7)
        if len(parts) == 8:
            # Escape literal newlines inside the free-text field so each CSV
            # row stays on one physical line after writing.
            parts[6] = parts[6].replace("\n", "\\n")
            writer.writerow(parts)
        else:
            # Unexpected formatting: report and skip the record.
            print("Warning: Expected 8 fields, got", len(parts))

    # Open the input file for reading and the output file for writing.
    # We assume UTF-8 encoding (adjust if necessary).
    with open(input_filename, encoding="utf-8") as infile, open(
        output_filename, "w", encoding="utf-8", newline=""
    ) as outfile:
        # NOTE(review): the original writer configuration was elided by the
        # diff hunk this code was recovered from; a plain csv.writer is
        # assumed here — confirm against the original file.
        writer = csv.writer(outfile)

        current_record = ""
        for line in infile:
            # Remove only the trailing newline; other whitespace is data.
            line = line.rstrip("\n")
            # Skip leading noise until the first record marker is found.
            if not current_record and not line.startswith(("/SDE/", "/SDE-TDAMM/")):
                continue
            if line.startswith(("/SDE/", "/SDE-TDAMM/")):
                # A new record begins: flush the one accumulated so far.
                if current_record:
                    _emit_record(writer, current_record)
                current_record = line
            else:
                # Continuation line: rejoin it to the current record with the
                # newline that the physical line break represented.
                current_record += "\n" + line

        # After the loop, flush the last accumulated record.  (Bug fix: the
        # original post-loop split used the delimiter with a trailing space,
        # "༜ ", so the final record never split into 8 fields.)
        if current_record:
            _emit_record(writer, current_record)
if __name__ == "__main__":
    # Replace with your actual file names.
    process_large_csv("./inputs/dump_delimeter.csv", "./outputs/cleaned_dump_delimeter.csv")