Skip to content

Commit c650645

Browse files
committed
add processing scripts for csv
1 parent 4072349 commit c650645

File tree

3 files changed

+97
-17
lines changed

3 files changed

+97
-17
lines changed

scripts/sde_dump_processing/clean_sde_dump.py

Lines changed: 7 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33

44
def process_large_csv(input_filename, output_filename):
55
# Open the input file for reading and the output file for writing.
6-
# We assume UTF-8 encoding (adjust if necessary).
76
with open(input_filename, encoding="utf-8") as infile, open(
87
output_filename, "w", encoding="utf-8", newline=""
98
) as outfile:
@@ -14,33 +13,25 @@ def process_large_csv(input_filename, output_filename):
1413

1514
current_record = ""
1615
for line in infile:
17-
# Remove the trailing newline from the line.
1816
line = line.rstrip("\n")
19-
# If the line starts with /SDE/, it signals the beginning of a new row.
20-
if line.startswith("/SDE/"):
21-
# If we already have a record accumulated, process it.
17+
# Skip lines until the first record is found.
18+
if not current_record and not (line.startswith("/SDE/") or line.startswith("/SDE-TDAMM/")):
19+
continue
20+
if line.startswith("/SDE/") or line.startswith("/SDE-TDAMM/"):
2221
if current_record:
23-
# Split the record into exactly 8 fields.
24-
# Using maxsplit=7 ensures that any additional occurrences of '火'
25-
# (for example in the text field) remain intact.
26-
parts = current_record.split("火", 7)
27-
# Optional: normalize the text field if needed.
28-
# For example, replace literal newline characters within the text with "\n".
22+
parts = current_record.split("༜", 7)
2923
if len(parts) == 8:
3024
parts[6] = parts[6].replace("\n", "\\n")
3125
writer.writerow(parts)
3226
else:
33-
# Handle unexpected formatting issues (e.g. log or skip the record).
3427
print("Warning: Expected 8 fields, got", len(parts))
35-
# Start a new record with the current line.
3628
current_record = line
3729
else:
38-
# Otherwise, this line is a continuation of the current record.
3930
current_record += "\n" + line
4031

4132
# After the loop, process the last accumulated record.
4233
if current_record:
43-
parts = current_record.split("火", 7)
34+
parts = current_record.split("༜", 7)
4435
if len(parts) == 8:
4536
parts[6] = parts[6].replace("\n", "\\n")
4637
writer.writerow(parts)
@@ -50,5 +41,4 @@ def process_large_csv(input_filename, output_filename):
5041

5142
if __name__ == "__main__":
5243
# Replace with your actual file names.
53-
# process_large_csv("./dump.csv", "./cleaned_dump.csv")
54-
process_large_csv("./tests/original.csv", "./actual_script_output.csv")
44+
process_large_csv("./inputs/dump_delimeter.csv", "./outputs/cleaned_dump_delimeter.csv")
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import csv
2+
import sys
3+
4+
5+
def analyze_output_csv(filename):
    """Validate that every data row of *filename* has exactly 8 columns.

    Row 1 is treated as a header; data rows are numbered starting at 2 so
    warnings line up with the companion row-extraction script. A warning is
    printed for each malformed row, followed by a summary.

    Returns a ``(total_rows, correct_rows, incorrect_rows)`` tuple so callers
    can check the result programmatically (previously nothing was returned).
    """
    # Raise the CSV field size limit: the text column can be very large.
    csv.field_size_limit(sys.maxsize)

    total_rows = 0
    correct_rows = 0
    incorrect_rows = 0

    with open(filename, encoding="utf-8") as f:
        reader = csv.reader(f)

        # Attempt to read a header row (None if the file is empty).
        header = next(reader, None)
        if header:
            print("Header row:", header)

        # start=2: the header occupies line 1, so data rows begin at 2.
        for row_index, row in enumerate(reader, start=2):
            total_rows += 1

            # Check for exactly 8 columns.
            if len(row) != 8:
                incorrect_rows += 1
                # Include a short preview so the offending row can be located
                # (the original message promised "Row data" but printed none).
                print(f"[WARNING] Row {row_index} has {len(row)} columns (expected 8). Row data: {row[:3]}")
                continue

            # Ensure the last column is strictly 'true' or 'false'.
            if row[7] not in ("true", "false"):
                incorrect_rows += 1
                print(f"[WARNING] Row {row_index} last column not 'true' or 'false': {row[0]}, {row[7][:100]}")
                continue

            correct_rows += 1

    print("\n=== ANALYSIS COMPLETE ===")
    print(f"Total data rows (excluding header): {total_rows}")
    print(f"Correctly formatted rows: {correct_rows}")
    print(f"Incorrectly formatted rows: {incorrect_rows}")
    return total_rows, correct_rows, incorrect_rows
42+
43+
44+
def main():
    """CLI entry point: analyze the file given as argv[1], or the default."""
    # NOTE: `sys` is already imported at module level; the original also
    # re-imported it locally, which was redundant.
    filename = sys.argv[1] if len(sys.argv) > 1 else "outputs/cleaned_dump_delimeter.csv"
    # Print the file actually being analyzed — the original f-string had no
    # placeholder and printed a literal "(unknown)" instead of the filename.
    print(f"Analyzing: {filename}")
    analyze_output_csv(filename)


if __name__ == "__main__":
    main()
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import csv
2+
import sys
3+
4+
5+
def write_id_and_url_for_row(filename, row_number, output_file):
    """Copy a single data row from *filename* into *output_file* as CSV.

    *row_number* uses the same numbering as the validation script: the header
    is line 1, so the first data row is row 2. If the row does not exist, a
    short plain-text message is written to *output_file* instead.
    """
    # Increase the CSV field size limit so very large fields won't raise.
    csv.field_size_limit(sys.maxsize)

    with open(filename, encoding="utf-8") as f:
        reader = csv.reader(f)

        # Skip the header row (None if the file is empty).
        next(reader, None)

        # IMPORTANT: start=2 to match the validation script's row numbering.
        for current_row_index, row in enumerate(reader, start=2):
            if current_row_index == row_number:
                with open(output_file, "w", encoding="utf-8", newline="") as out_f:
                    writer = csv.writer(out_f)
                    writer.writerow(row)
                return

    # If we get here, that row_number didn't exist. The original f-string had
    # no placeholder and wrote a literal "(unknown)" instead of the filename.
    with open(output_file, "w", encoding="utf-8") as out_f:
        out_f.write(f"Row {row_number} does not exist in {filename}.\n")
26+
27+
28+
def main():
    """Example driver: extract one specific row from the cleaned dump."""
    source_csv = "outputs/cleaned_dump_delimeter.csv"
    destination = "outputs/row_output.csv"
    # Row number in the validation script's numbering (header is line 1).
    target_row = 175655
    write_id_and_url_for_row(source_csv, target_row, destination)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)