-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclean_trials.py
More file actions
31 lines (28 loc) · 982 Bytes
/
clean_trials.py
File metadata and controls
31 lines (28 loc) · 982 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import csv
# Path of the input CSV that clean_csv() opens for reading.
RAW_FILE = "clinical_trials.csv"
# Path of the output CSV that clean_csv() writes the header and kept rows to.
CLEAN_FILE = "clinical_trials_clean.csv"
def _is_clean_row(row, num_cols):
    """Return True when *row* has ``num_cols`` cells and none starts with '<'."""
    if len(row) != num_cols:
        return False
    # NOTE(review): any cell whose stripped text begins with '<' rejects the
    # whole row. That catches HTML/script fragments but also plain values such
    # as "<5" -- confirm this is the intended filter for this data set.
    return not any(cell.strip().startswith('<') for cell in row)


def clean_csv(raw_file=None, clean_file=None):
    """Copy a CSV file, dropping rows that are malformed or contain markup.

    The first row is treated as the header and is always kept. A data row is
    kept only if it has the same number of cells as the header and no cell
    (after stripping whitespace) starts with '<'. Progress and a summary are
    printed to stdout.

    Args:
        raw_file: Path of the CSV to read. Defaults to ``RAW_FILE``.
        clean_file: Path of the CSV to write. Defaults to ``CLEAN_FILE``.

    Returns:
        None. If the input contains no rows at all, a message is printed and
        no output file is created.
    """
    # Resolve the defaults at call time so the module constants stay the
    # single source of truth for the no-argument call.
    src = RAW_FILE if raw_file is None else raw_file
    dst = CLEAN_FILE if clean_file is None else clean_file

    print(f"Reading {src}...")
    with open(src, newline='', encoding='utf-8') as infile:
        reader = csv.reader(infile)
        # Pull only the header eagerly; the remaining rows are consumed
        # lazily instead of materializing the whole file as a list first.
        header = next(reader, None)
        if header is None:
            print("No data found.")
            return

        num_cols = len(header)
        print(f"Header has {num_cols} columns.")

        cleaned = [header]
        bad_rows = 0
        for row in reader:
            if _is_clean_row(row, num_cols):
                cleaned.append(row)
            else:
                bad_rows += 1

    print(f"Kept {len(cleaned)-1} rows. Removed {bad_rows} malformed or HTML/script rows.")

    # The output is written only after the input has been read completely, so
    # a failure while reading never leaves a partially written result behind.
    with open(dst, 'w', newline='', encoding='utf-8') as outfile:
        csv.writer(outfile).writerows(cleaned)
    print(f"Cleaned CSV saved as {dst}")
# Run the cleaning pass only when this file is executed as a script;
# importing the module must not touch any files.
if __name__ == "__main__":
    clean_csv()