Add script for converting campaign data to parquet

jcrist · jcrist · commit 3f202782729d · 2023-04-19T16:21:31.000-05:00
diff --git a/.gitignore b/.gitignore
@@ -137,4 +137,5 @@ Icon?
 # Tutorial and examples artifacts
 geography.db
 palmer_penguins.ddb*
+scratch/
 *.log
diff --git a/scripts/prepare_campaign_finance_data.py b/scripts/prepare_campaign_finance_data.py
@@ -0,0 +1,81 @@
+from pathlib import Path
+from urllib.request import urlretrieve
+from zipfile import ZipFile
+
+import ibis
+
+# Download and unzip the 2018 individual contributions data
+url = (
+    "https://cg-519a459a-0ea3-42c2-b7bc-fa1143481f74.s3-us-gov-west-1."
+    "amazonaws.com/bulk-downloads/2018/indiv18.zip"
+)
+
+root_dir = Path(__file__).resolve().parent.parent
+data_dir = root_dir.joinpath("data")
+data_dir.mkdir(exist_ok=True)
+
+zip_path = data_dir.joinpath("indiv18.zip")
+csv_path = data_dir.joinpath("itcont.txt")
+parquet_path = data_dir.joinpath("itcont.parquet")
+
+if not zip_path.exists():
+    print("Downloading indiv18.zip...")
+    urlretrieve(url, zip_path)
+else:
+    print("indiv18.zip already downloaded")
+
+if not csv_path.exists():
+    print("Extracting itcont.txt...")
+    with ZipFile(zip_path) as zip_file:
+        zip_file.extract("itcont.txt", path=data_dir)
+else:
+    print("itcont.txt already extracted")
+
+if not parquet_path.exists():
+    print("Generating itcont.parquet...")
+    # Read in the CSV
+    t = ibis.read_csv(csv_path)
+
+    # The CSV doesn't have a header, we need to manually add titles
+    header = [
+        "CMTE_ID",
+        "AMNDT_IND",
+        "RPT_TP",
+        "TRANSACTION_PGI",
+        "IMAGE_NUM",
+        "TRANSACTION_TP",
+        "ENTITY_TP",
+        "NAME",
+        "CITY",
+        "STATE",
+        "ZIP_CODE",
+        "EMPLOYER",
+        "OCCUPATION",
+        "TRANSACTION_DT",
+        "TRANSACTION_AMT",
+        "OTHER_ID",
+        "TRAN_ID",
+        "FILE_NUM",
+        "MEMO_CD",
+        "MEMO_TEXT",
+        "SUB_ID",
+    ]
+    t = t.relabel(dict(zip(t.columns, header)))
+
+    # For the analysis, we're only going to use a few of the columns. To save
+    # bandwidth, lets select out only the columns we'll be using.
+    columns = [
+        "CMTE_ID",
+        "TRANSACTION_PGI",
+        "ENTITY_TP",
+        "CITY",
+        "STATE",
+        "TRANSACTION_DT",
+        "TRANSACTION_AMT",
+    ]
+    t = t[columns]
+
+    # Write out a parquet file
+    t.to_parquet(parquet_path, compression="zstd")
+else:
+    print("itcont.parquet already exists")