-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclean_data_from_date.py
More file actions
136 lines (106 loc) · 4.58 KB
/
clean_data_from_date.py
File metadata and controls
136 lines (106 loc) · 4.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
'''
https://nextcentury.atlassian.net/browse/ITM-973
This script should be used when prod needs to be cleaned of testing data from a certain date.
Usage ex. `python clean_data_from_date.py 06/24/2025`
It deletes the surveyResults and userScenarioResults documents whose start time falls on that date, plus the participantLog entries for the participant IDs found in those documents.
'''
import sys
from pymongo import MongoClient
from decouple import config
from datetime import datetime
import re
# parse the date parameter
def parse_date_string(date_str):
    """Parse a ``mm/dd/yyyy`` command-line date into a ``datetime``.

    Args:
        date_str: Date text such as ``"06/24/2025"``.

    Returns:
        A naive ``datetime`` at midnight of the given day.

    Raises:
        ValueError: If ``date_str`` does not match ``mm/dd/yyyy``. The
            original ``strptime`` error is attached as ``__cause__``.
    """
    try:
        return datetime.strptime(date_str, "%m/%d/%Y")
    except ValueError as err:
        # Chain explicitly so the underlying strptime failure stays visible
        # in tracebacks instead of showing up as an error-during-handling.
        raise ValueError(
            f"Invalid date format: {date_str}. Expected format: mm/dd/yyyy"
        ) from err
# pull out the date to match format stored in mongo db
def extract_date_from_string(date_string):
    """Extract the calendar day from a stored timestamp string.

    Searches ``date_string`` for a ``<Day> <Mon> <dd> <yyyy>`` fragment
    (for example ``"Tue Jun 24 2025"``); any text before or after that
    fragment is ignored.

    Args:
        date_string: Value read from a document's start-time field.

    Returns:
        A naive ``datetime`` at midnight of the matched day, or ``None``
        when the value is empty, is not a string, contains no matching
        fragment, names an unknown month, or describes an impossible
        calendar date (e.g. ``Feb 30``).
    """
    # Non-string values would make re.search raise TypeError; treat them
    # like any other unparseable value so one odd document cannot abort
    # the whole run.
    if not date_string or not isinstance(date_string, str):
        return None

    pattern = r'([A-Za-z]{3})\s+([A-Za-z]{3})\s+(\d{1,2})\s+(\d{4})'
    match = re.search(pattern, date_string)
    if match is None:
        return None

    months = {
        'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
        'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12,
    }
    month = months.get(match.group(2))
    if month is None:
        return None

    day = int(match.group(3))
    year = int(match.group(4))
    try:
        return datetime(year, month, day)
    except ValueError:
        # Syntactically fine but not a real date (day 0/31+, year 0000, ...).
        return None
def delete_data_from_date(date):
    """Delete one day's worth of data from the ``dashboard`` database.

    Scans ``userScenarioResults`` (``startTime``) and ``surveyResults``
    (``results.startTime``), deletes every document whose parsed start date
    equals the requested day, collects the participant IDs found on those
    documents, and then deletes the matching ``participantLog`` entries.
    A summary is printed at the end.

    Args:
        date: Target day as a ``mm/dd/yyyy`` string.

    Raises:
        ValueError: If ``date`` is not in ``mm/dd/yyyy`` format.
        Exception: Anything raised while reading config or talking to
            MongoDB propagates; the client is closed first.
    """
    target_date = parse_date_string(date)
    target_day = target_date.date()
    print(f"Deleting data from date: {target_date.strftime('%Y-%m-%d')}")

    mongo_url = config('MONGO_URL')
    client = MongoClient(mongo_url)
    try:
        db = client['dashboard']

        # collections to wipe
        text_based = db['userScenarioResults']
        survey_results = db['surveyResults']
        participant_log = db['participantLog']

        participant_ids_to_delete = set()

        # userScenarioResults
        print("\nProcessing userScenarioResults...")
        text_delete_count = 0
        for doc in text_based.find({}):
            if 'startTime' not in doc:
                continue
            doc_date = extract_date_from_string(doc['startTime'])
            if not doc_date or doc_date.date() != target_day:
                continue
            if 'participantID' in doc:
                participant_ids_to_delete.add(str(doc['participantID']))
            # Count what the server actually removed, not what we attempted.
            result = text_based.delete_one({'_id': doc['_id']})
            text_delete_count += result.deleted_count
        print(f"Deleted {text_delete_count} documents from userScenarioResults")

        # surveyResults
        print("\nProcessing surveyResults...")
        survey_delete_count = 0
        for doc in survey_results.find({}):
            results = doc.get('results')
            # Skip documents whose 'results' is missing or not a sub-document
            # rather than failing on the key lookup below.
            if not isinstance(results, dict) or 'startTime' not in results:
                continue
            doc_date = extract_date_from_string(results['startTime'])
            if not doc_date or doc_date.date() != target_day:
                continue
            if 'pid' in results:
                participant_ids_to_delete.add(str(results['pid']))
            result = survey_results.delete_one({'_id': doc['_id']})
            survey_delete_count += result.deleted_count
        print(f"Deleted {survey_delete_count} documents from surveyResults")

        # p log
        print(f"\nDeleting {len(participant_ids_to_delete)} participants from participantLog...")
        participant_delete_count = 0
        for pid in participant_ids_to_delete:
            # participantLog is queried with an integer ParticipantID; an ID
            # that is not an integer string cannot match, so skip it instead
            # of aborting after the other collections were already cleaned.
            try:
                numeric_pid = int(pid)
            except ValueError:
                print(f"Skipping non-numeric participant ID: {pid}")
                continue
            result = participant_log.delete_many({'ParticipantID': numeric_pid})
            participant_delete_count += result.deleted_count
        print(f"Deleted {participant_delete_count} documents from participantLog")

        print("DELETION SUMMARY")
        print(f"Date: {target_date.strftime('%m-%d-%Y')}")
        print(f"userScenarioResults: {text_delete_count} documents deleted")
        print(f"surveyResults: {survey_delete_count} documents deleted")
        print(f"participantLog: {participant_delete_count} documents deleted")
        print(f"Total participant IDs processed: {len(participant_ids_to_delete)}")
        if participant_ids_to_delete:
            print(f"Participant IDs: {', '.join(sorted(participant_ids_to_delete))}")
    finally:
        # Release the connection even when a query or delete fails midway.
        client.close()
def main():
    """Command-line entry point: validate the argument and run the cleanup.

    Expects exactly one argument, the target date as ``mm/dd/yyyy``.

    Returns:
        Process exit status: ``0`` on success, ``2`` for a missing/extra
        argument or a badly formatted date, ``1`` if the deletion itself
        failed.
    """
    if len(sys.argv) != 2:
        print("Usage: python clean_data_from_date.py <mm/dd/yyyy>")
        print("Example: python clean_data_from_date.py 06/24/2025")
        return 2

    date = sys.argv[1]

    # Validate up front so a typo is reported before any connection is made.
    try:
        parse_date_string(date)
    except ValueError as e:
        print(f"Error: {e}")
        return 2

    # Top-level boundary: report any failure and signal it via the exit code
    # so calling shells/automation do not mistake a failed run for success.
    try:
        delete_data_from_date(date)
    except Exception as e:
        print(f"\nError during deletion: {e}")
        return 1

    print("\nDeletion completed successfully!")
    return 0
# Run only when executed as a script; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())