Skip to content

Commit 4072349

Browse files
committed
add initial processing script for full text dumps
1 parent 2f18832 commit 4072349

File tree

1 file changed

+54
-0
lines changed

1 file changed

+54
-0
lines changed
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import csv
2+
3+
4+
# Character that separates the fields of a record in the raw dump.
_FIELD_SEP = "火"
# Prefix that marks the first line of every record in the raw dump.
_RECORD_PREFIX = "/SDE/"
# Column names written as the first output row, in field order.
_HEADER = ["id", "url1", "title", "collection", "treepath", "sourcestr56", "text", "sourcebool3"]
# Position of the free-text column within a record.
_TEXT_INDEX = 6


def _split_record(record):
    """Split one raw record into its fields, keeping separators inside the text.

    The text column is the second-to-last field, so a plain left-to-right
    split would cut it at the first separator it contains. Instead, the
    leading fields are split from the left and the final field is peeled off
    from the right; whatever remains in between is the text, untouched.

    Returns the list of fields. It has fewer than ``len(_HEADER)`` entries
    when the record does not contain enough separators.
    """
    fields = record.split(_FIELD_SEP, _TEXT_INDEX)
    if len(fields) == _TEXT_INDEX + 1:
        text, sep, last = fields[_TEXT_INDEX].rpartition(_FIELD_SEP)
        if sep:
            fields[_TEXT_INDEX:] = [text, last]
    return fields


def _write_record(writer, record):
    """Write one accumulated record as a CSV row, or warn if it is malformed."""
    if not record:
        # Nothing was accumulated (e.g. the very first marker line).
        return
    fields = _split_record(record)
    if len(fields) == len(_HEADER):
        # Keep each output row's text on one logical line by replacing real
        # newlines with the two-character sequence backslash + "n".
        fields[_TEXT_INDEX] = fields[_TEXT_INDEX].replace("\n", "\\n")
        writer.writerow(fields)
    else:
        # Malformed records are reported on stdout and skipped.
        print("Warning: Expected 8 fields, got", len(fields))


def process_large_csv(input_filename, output_filename):
    """Convert a raw multi-line dump into a regular CSV file.

    The input is read line by line as UTF-8. A line starting with ``/SDE/``
    begins a new record; every other line continues the record in progress.
    Each record is split on ``火`` into the eight columns named in the header
    row, newlines inside the text column are replaced by a literal ``\\n``,
    and the result is written with ``csv.writer``. Records that do not yield
    eight fields are reported with a warning on stdout and skipped.

    Args:
        input_filename: Path of the raw dump to read.
        output_filename: Path of the CSV file to create (overwritten if it
            already exists).
    """
    with open(input_filename, encoding="utf-8") as infile, open(
        output_filename, "w", encoding="utf-8", newline=""
    ) as outfile:
        writer = csv.writer(outfile)
        writer.writerow(_HEADER)

        # Lines of the record in progress. Collecting them in a list and
        # joining once per record avoids repeated string concatenation.
        # NOTE: the initial empty string means any lines that precede the
        # first marker line are gathered into a record of their own whose
        # content starts with a newline.
        pending = [""]
        for raw_line in infile:
            line = raw_line.rstrip("\n")
            if line.startswith(_RECORD_PREFIX):
                # A new record starts: flush the one collected so far.
                _write_record(writer, "\n".join(pending))
                pending = [line]
            else:
                pending.append(line)

        # Flush the final record, which has no following marker line.
        _write_record(writer, "\n".join(pending))
49+
50+
51+
if __name__ == "__main__":
    # Default run converts the sample input under ./tests. For a real dump,
    # swap in your own paths, e.g. "./dump.csv" -> "./cleaned_dump.csv".
    source_path = "./tests/original.csv"
    target_path = "./actual_script_output.csv"
    process_large_csv(source_path, target_path)

0 commit comments

Comments
 (0)