3
3
4
4
def process_large_csv (input_filename , output_filename ):
5
5
# Open the input file for reading and the output file for writing.
6
- # We assume UTF-8 encoding (adjust if necessary).
7
6
with open (input_filename , encoding = "utf-8" ) as infile , open (
8
7
output_filename , "w" , encoding = "utf-8" , newline = ""
9
8
) as outfile :
@@ -14,33 +13,25 @@ def process_large_csv(input_filename, output_filename):
14
13
15
14
current_record = ""
16
15
for line in infile :
17
- # Remove the trailing newline from the line.
18
16
line = line .rstrip ("\n " )
19
- # If the line starts with /SDE/, it signals the beginning of a new row.
20
- if line .startswith ("/SDE/" ):
21
- # If we already have a record accumulated, process it.
17
+ # Skip lines until the first record is found.
18
+ if not current_record and not (line .startswith ("/SDE/" ) or line .startswith ("/SDE-TDAMM/" )):
19
+ continue
20
+ if line .startswith ("/SDE/" ) or line .startswith ("/SDE-TDAMM/" ):
22
21
if current_record :
23
- # Split the record into exactly 8 fields.
24
- # Using maxsplit=7 ensures that any additional occurrences of '火'
25
- # (for example in the text field) remain intact.
26
- parts = current_record .split ("火" , 7 )
27
- # Optional: normalize the text field if needed.
28
- # For example, replace literal newline characters within the text with "\n".
22
+ parts = current_record .split ("༜" , 7 )
29
23
if len (parts ) == 8 :
30
24
parts [6 ] = parts [6 ].replace ("\n " , "\\ n" )
31
25
writer .writerow (parts )
32
26
else :
33
- # Handle unexpected formatting issues (e.g. log or skip the record).
34
27
print ("Warning: Expected 8 fields, got" , len (parts ))
35
- # Start a new record with the current line.
36
28
current_record = line
37
29
else :
38
- # Otherwise, this line is a continuation of the current record.
39
30
current_record += "\n " + line
40
31
41
32
# After the loop, process the last accumulated record.
42
33
if current_record :
43
- parts = current_record .split ("火 " , 7 )
34
+ parts = current_record .split ("༜ " , 7 )
44
35
if len (parts ) == 8 :
45
36
parts [6 ] = parts [6 ].replace ("\n " , "\\ n" )
46
37
writer .writerow (parts )
@@ -50,5 +41,4 @@ def process_large_csv(input_filename, output_filename):
50
41
51
42
if __name__ == "__main__":
    # Adjust these paths to point at the real input/output locations.
    source_path = "./inputs/dump_delimeter.csv"
    destination_path = "./outputs/cleaned_dump_delimeter.csv"
    process_large_csv(source_path, destination_path)
0 commit comments