Skip to content

Commit c650645

Browse files
committed
add processing scripts for csv
1 parent 4072349 commit c650645

File tree

3 files changed

+97
-17
lines changed

3 files changed

+97
-17
lines changed

scripts/sde_dump_processing/clean_sde_dump.py

Lines changed: 7 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33

44
def process_large_csv(input_filename, output_filename):
55
# Open the input file for reading and the output file for writing.
6-
# We assume UTF-8 encoding (adjust if necessary).
76
with open(input_filename, encoding="utf-8") as infile, open(
87
output_filename, "w", encoding="utf-8", newline=""
98
) as outfile:
@@ -14,33 +13,25 @@ def process_large_csv(input_filename, output_filename):
1413

1514
current_record = ""
1615
for line in infile:
17-
# Remove the trailing newline from the line.
1816
line = line.rstrip("\n")
19-
# If the line starts with /SDE/, it signals the beginning of a new row.
20-
if line.startswith("/SDE/"):
21-
# If we already have a record accumulated, process it.
17+
# Skip lines until the first record is found.
18+
if not current_record and not (line.startswith("/SDE/") or line.startswith("/SDE-TDAMM/")):
19+
continue
20+
if line.startswith("/SDE/") or line.startswith("/SDE-TDAMM/"):
2221
if current_record:
23-
# Split the record into exactly 8 fields.
24-
# Using maxsplit=7 ensures that any additional occurrences of '火'
25-
# (for example in the text field) remain intact.
26-
parts = current_record.split("火", 7)
27-
# Optional: normalize the text field if needed.
28-
# For example, replace literal newline characters within the text with "\n".
22+
parts = current_record.split("༜", 7)
2923
if len(parts) == 8:
3024
parts[6] = parts[6].replace("\n", "\\n")
3125
writer.writerow(parts)
3226
else:
33-
# Handle unexpected formatting issues (e.g. log or skip the record).
3427
print("Warning: Expected 8 fields, got", len(parts))
35-
# Start a new record with the current line.
3628
current_record = line
3729
else:
38-
# Otherwise, this line is a continuation of the current record.
3930
current_record += "\n" + line
4031

4132
# After the loop, process the last accumulated record.
4233
if current_record:
43-
parts = current_record.split("火", 7)
34+
parts = current_record.split("༜", 7)
4435
if len(parts) == 8:
4536
parts[6] = parts[6].replace("\n", "\\n")
4637
writer.writerow(parts)
@@ -50,5 +41,4 @@ def process_large_csv(input_filename, output_filename):
5041

5142
if __name__ == "__main__":
5243
# Replace with your actual file names.
53-
# process_large_csv("./dump.csv", "./cleaned_dump.csv")
54-
process_large_csv("./tests/original.csv", "./actual_script_output.csv")
44+
process_large_csv("./inputs/dump_delimeter.csv", "./outputs/cleaned_dump_delimeter.csv")
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import csv
2+
import sys
3+
4+
5+
def analyze_output_csv(filename):
    """Validate that every data row of *filename* has exactly 8 columns.

    Row 1 is treated as a header; data rows are numbered starting at 2 so
    warnings line up with the companion row-extraction script. A warning is
    printed for each malformed row, followed by a summary.

    Returns a ``(total_rows, correct_rows, incorrect_rows)`` tuple so callers
    can check the result programmatically (previously nothing was returned).
    """
    # Raise the CSV field size limit: the text column can be very large.
    csv.field_size_limit(sys.maxsize)

    total_rows = 0
    correct_rows = 0
    incorrect_rows = 0

    with open(filename, encoding="utf-8") as f:
        reader = csv.reader(f)

        # Attempt to read a header row (None if the file is empty).
        header = next(reader, None)
        if header:
            print("Header row:", header)

        # start=2: the header occupies line 1, so data rows begin at 2.
        for row_index, row in enumerate(reader, start=2):
            total_rows += 1

            # Check for exactly 8 columns.
            if len(row) != 8:
                incorrect_rows += 1
                # Include a short preview so the offending row can be located
                # (the original message promised "Row data" but printed none).
                print(f"[WARNING] Row {row_index} has {len(row)} columns (expected 8). Row data: {row[:3]}")
                continue

            # Ensure the last column is strictly 'true' or 'false'.
            if row[7] not in ("true", "false"):
                incorrect_rows += 1
                print(f"[WARNING] Row {row_index} last column not 'true' or 'false': {row[0]}, {row[7][:100]}")
                continue

            correct_rows += 1

    print("\n=== ANALYSIS COMPLETE ===")
    print(f"Total data rows (excluding header): {total_rows}")
    print(f"Correctly formatted rows: {correct_rows}")
    print(f"Incorrectly formatted rows: {incorrect_rows}")
    return total_rows, correct_rows, incorrect_rows
42+
43+
44+
def main():
    """CLI entry point: analyze the file given as argv[1], or the default."""
    # NOTE: `sys` is already imported at module level; the original also
    # re-imported it locally, which was redundant.
    filename = sys.argv[1] if len(sys.argv) > 1 else "outputs/cleaned_dump_delimeter.csv"
    # Print the file actually being analyzed — the original f-string had no
    # placeholder and printed a literal "(unknown)" instead of the filename.
    print(f"Analyzing: {filename}")
    analyze_output_csv(filename)


if __name__ == "__main__":
    main()
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import csv
2+
import sys
3+
4+
5+
def write_id_and_url_for_row(filename, row_number, output_file):
    """Copy a single data row from *filename* into *output_file* as CSV.

    *row_number* uses the same numbering as the validation script: the header
    is line 1, so the first data row is row 2. If the row does not exist, a
    short plain-text message is written to *output_file* instead.
    """
    # Increase the CSV field size limit so very large fields won't raise.
    csv.field_size_limit(sys.maxsize)

    with open(filename, encoding="utf-8") as f:
        reader = csv.reader(f)

        # Skip the header row (None if the file is empty).
        next(reader, None)

        # IMPORTANT: start=2 to match the validation script's row numbering.
        for current_row_index, row in enumerate(reader, start=2):
            if current_row_index == row_number:
                with open(output_file, "w", encoding="utf-8", newline="") as out_f:
                    writer = csv.writer(out_f)
                    writer.writerow(row)
                return

    # If we get here, that row_number didn't exist. The original f-string had
    # no placeholder and wrote a literal "(unknown)" instead of the filename.
    with open(output_file, "w", encoding="utf-8") as out_f:
        out_f.write(f"Row {row_number} does not exist in {filename}.\n")
26+
27+
28+
def main():
    """Example driver: extract one specific row from the cleaned dump."""
    source_csv = "outputs/cleaned_dump_delimeter.csv"
    destination = "outputs/row_output.csv"
    # Row number in the validation script's numbering (header is line 1).
    target_row = 175655
    write_id_and_url_for_row(source_csv, target_row, destination)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)