Skip to content
This repository was archived by the owner on Jul 15, 2024. It is now read-only.

Commit 3f20278

Browse files
committed
Add script for converting campaign data to parquet
1 parent 85c84ad commit 3f20278

File tree

2 files changed

+82
-0
lines changed

2 files changed

+82
-0
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,4 +137,5 @@ Icon?
137137
# Tutorial and examples artifacts
138138
geography.db
139139
palmer_penguins.ddb*
140+
scratch/
140141
*.log
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
"""Download the 2018 FEC individual-contributions data and convert it to parquet.

Fetches ``indiv18.zip`` from the FEC bulk-download S3 bucket, extracts
``itcont.txt``, and writes a zstd-compressed ``itcont.parquet`` containing
only the columns the downstream analysis needs.  Each step is skipped when
its output already exists, so the script is safe to re-run.
"""

from pathlib import Path
from urllib.request import urlretrieve
from zipfile import ZipFile

import ibis

# 2018 individual contributions bulk download (FEC S3 bucket).
URL = (
    "https://cg-519a459a-0ea3-42c2-b7bc-fa1143481f74.s3-us-gov-west-1."
    "amazonaws.com/bulk-downloads/2018/indiv18.zip"
)

# itcont.txt ships without a header row; these are the column titles
# (order matters — they are zipped positionally against t.columns).
HEADER = [
    "CMTE_ID",
    "AMNDT_IND",
    "RPT_TP",
    "TRANSACTION_PGI",
    "IMAGE_NUM",
    "TRANSACTION_TP",
    "ENTITY_TP",
    "NAME",
    "CITY",
    "STATE",
    "ZIP_CODE",
    "EMPLOYER",
    "OCCUPATION",
    "TRANSACTION_DT",
    "TRANSACTION_AMT",
    "OTHER_ID",
    "TRAN_ID",
    "FILE_NUM",
    "MEMO_CD",
    "MEMO_TEXT",
    "SUB_ID",
]

# For the analysis, we're only going to use a few of the columns. To save
# bandwidth, select out only the columns we'll be using.
COLUMNS = [
    "CMTE_ID",
    "TRANSACTION_PGI",
    "ENTITY_TP",
    "CITY",
    "STATE",
    "TRANSACTION_DT",
    "TRANSACTION_AMT",
]


def _download(url: str, zip_path: Path) -> None:
    """Fetch the zip archive unless it is already on disk."""
    if not zip_path.exists():
        print("Downloading indiv18.zip...")
        # Download to a temporary name first so an interrupted transfer
        # doesn't leave a partial file that a later run mistakes for a
        # complete archive; the final rename is atomic on POSIX.
        part_path = zip_path.with_name(zip_path.name + ".part")
        urlretrieve(url, part_path)
        part_path.replace(zip_path)
    else:
        print("indiv18.zip already downloaded")


def _extract(zip_path: Path, csv_path: Path, data_dir: Path) -> None:
    """Extract itcont.txt from the archive unless it is already extracted."""
    if not csv_path.exists():
        print("Extracting itcont.txt...")
        with ZipFile(zip_path) as zip_file:
            zip_file.extract("itcont.txt", path=data_dir)
    else:
        print("itcont.txt already extracted")


def _to_parquet(csv_path: Path, parquet_path: Path) -> None:
    """Read the headerless CSV, apply column titles, and write parquet."""
    if not parquet_path.exists():
        print("Generating itcont.parquet...")
        # Read in the CSV
        t = ibis.read_csv(csv_path)

        # The CSV doesn't have a header, we need to manually add titles.
        t = t.relabel(dict(zip(t.columns, HEADER)))

        # Keep only the columns the analysis uses.
        t = t[COLUMNS]

        # Write out a parquet file
        t.to_parquet(parquet_path, compression="zstd")
    else:
        print("itcont.parquet already exists")


def main() -> None:
    """Run the download → extract → convert pipeline into ../data."""
    root_dir = Path(__file__).resolve().parent.parent
    data_dir = root_dir.joinpath("data")
    data_dir.mkdir(exist_ok=True)

    zip_path = data_dir.joinpath("indiv18.zip")
    csv_path = data_dir.joinpath("itcont.txt")
    parquet_path = data_dir.joinpath("itcont.parquet")

    _download(URL, zip_path)
    _extract(zip_path, csv_path, data_dir)
    _to_parquet(csv_path, parquet_path)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)