
Commit 7bf4f12

stuff
1 parent 2550b5a commit 7bf4f12

10 files changed: 375 additions & 34 deletions


.github/workflows/deploy-prod.yml

Lines changed: 1 addition & 5 deletions
@@ -103,11 +103,6 @@ jobs:
      - build
    environment: "AWS PROD"
    steps:
-     - name: Set up Node for testing
-       uses: actions/setup-node@v4
-       with:
-         node-version: 22.x
-
      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
@@ -131,5 +126,6 @@ jobs:
        env:
          HUSKY: "0"
          VITE_RUN_ENVIRONMENT: prod
+
      - name: Call the health check script
        run: make prod_health_check

.github/workflows/deploy-qa.yml

Lines changed: 14 additions & 14 deletions
@@ -106,25 +106,11 @@ jobs:
        env:
          HUSKY: "0"

-     - name: Set up Node for testing
-       uses: actions/setup-node@v4
-       with:
-         node-version: 22.x
-         cache: "yarn"
-
      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: 1.12.2

-     - name: Restore Yarn Cache
-       uses: actions/cache@v4
-       with:
-         path: node_modules
-         key: yarn-modules-${{ runner.arch }}-${{ runner.os }}-${{ hashFiles('**/yarn.lock') }}-dev
-         restore-keys: |
-           yarn-modules-${{ runner.arch }}-${{ runner.os }}-
-
      - name: Download Build files
        uses: actions/download-artifact@v4
        with:
@@ -143,6 +129,20 @@ jobs:
          HUSKY: "0"
          VITE_RUN_ENVIRONMENT: dev

+     - name: Set up Node for testing
+       uses: actions/setup-node@v4
+       with:
+         node-version: 22.x
+         cache: "yarn"
+
+     - name: Restore Yarn Cache
+       uses: actions/cache@v4
+       with:
+         path: node_modules
+         key: yarn-modules-${{ runner.arch }}-${{ runner.os }}-${{ hashFiles('**/yarn.lock') }}-dev
+         restore-keys: |
+           yarn-modules-${{ runner.arch }}-${{ runner.os }}-
+
      - name: Run health check
        run: make dev_health_check

.github/workflows/manual-prod.yml

Lines changed: 1 addition & 5 deletions
@@ -101,11 +101,6 @@ jobs:
      - build
    environment: "AWS PROD"
    steps:
-     - name: Setup Node
-       uses: actions/setup-node@v4
-       with:
-         node-version: 22.x
-
      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
@@ -119,6 +114,7 @@ jobs:
        uses: actions/download-artifact@v4
        with:
          name: build-prod
+
      - uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::298118738376:role/GitHubActionsRole

notebooks/read_archived_s3.ipynb

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!pip install s3fs pandas"
+    "%pip install s3fs pandas"
    ]
   },
   {
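
Note: %pip is the IPython magic that installs packages into the environment of the running kernel, whereas !pip shells out to whichever pip happens to be first on PATH and can target a different interpreter. A minimal illustration of the two notebook-cell forms (standard IPython behavior, not part of this commit):

    # Inside a notebook cell:
    %pip install s3fs pandas   # installs into the running kernel's environment
    !pip install s3fs pandas   # installs with whatever pip the shell finds first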

notebooks/read_audit_log_s3.ipynb

Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ba97204d",
   "metadata": {},
   "source": [
    "## Read Audit Logs from S3\n",
    "Make sure that you have signed into AWS and set up a profile.\n",
    "\n",
    "Also, create `config.py` with contents:\n",
    "```python\n",
    "AUDIT_LOG_S3_BUCKET = \"<bucket name of archival data bucket>\"\n",
    "AWS_PROFILE = \"<AWS_PROFILE HERE>\"\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "22f8f0a6",
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install s3fs pandas pyarrow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "84a7370f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import config\n",
    "import pyarrow.fs as fs\n",
    "import pyarrow.dataset as ds\n",
    "assert config.AUDIT_LOG_S3_BUCKET\n",
    "assert config.AWS_PROFILE\n",
    "os.environ[\"AWS_PROFILE\"] = config.AWS_PROFILE\n",
    "\n",
    "module_to_read = \"tickets\"\n",
    "s3_path = f\"s3://{config.AUDIT_LOG_S3_BUCKET}/module={module_to_read}/year=2025/month=08/\"\n",
    "s3, path = fs.FileSystem.from_uri(s3_path)\n",
    "dataset = ds.dataset(path, partitioning=\"hive\", filesystem=s3, format=\"parquet\")\n",
    "\n",
    "table = dataset.to_table()\n",
    "\n",
    "print(f\"Successfully read {table.num_rows} rows.\")\n",
    "table.to_pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "328af1fa",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
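
Note: the code cell above prunes data by baking module, year, and month into the S3 prefix. As an alternative, here is a hedged sketch (same config.py placeholders; the partition column names module/year/month are taken from the path template above) that opens the bucket root and lets pyarrow prune on the hive partition columns instead:

    import os
    import config                    # the config.py described in the first cell
    import pyarrow.fs as fs
    import pyarrow.dataset as ds

    os.environ["AWS_PROFILE"] = config.AWS_PROFILE

    # Open the bucket root; with hive partitioning, module/year/month appear
    # as regular columns that can be filtered on.
    s3, path = fs.FileSystem.from_uri(f"s3://{config.AUDIT_LOG_S3_BUCKET}/")
    dataset = ds.dataset(path, partitioning="hive", filesystem=s3, format="parquet")

    # The partition filter is applied before row data is downloaded, so only
    # the matching prefixes are actually read from S3.
    table = dataset.to_table(
        filter=(ds.field("module") == "tickets") & (ds.field("year") == 2025)
    )
    df = table.to_pandas()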

onetime/auditlog-migration.py

Lines changed: 196 additions & 0 deletions
@@ -0,0 +1,196 @@
import boto3
import json
import logging
import os
from decimal import Decimal

# --- Configuration ---
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

AWS_REGION = os.environ.get("AWS_REGION", "us-east-1")
DYNAMODB_TABLE_NAME = "infra-core-api-audit-log"
FIREHOSE_STREAM_NAME = "infra-core-api-audit-log-stream"

# The top-level attributes to include in the JSON record for Firehose
REQUIRED_KEYS = ["module", "createdAt", "actor", "message", "requestId", "target"]

# Kinesis Data Firehose has a batch limit of 500 records per call.
FIREHOSE_BATCH_SIZE = 500

# Primary key attributes of the DynamoDB table (used to delete items after a successful send).
DYNAMODB_PRIMARY_KEY_ATTRIBUTES = ["module", "createdAt"]


class DecimalEncoder(json.JSONEncoder):
    """A helper class to convert DynamoDB's Decimal type to standard int/float for JSON."""

    def default(self, obj):
        if isinstance(obj, Decimal):
            return int(obj) if obj % 1 == 0 else float(obj)
        return super(DecimalEncoder, self).default(obj)


def _process_and_delete_batch(
    firehose_batch, dynamodb_items_batch, firehose_client, dynamodb_table
):
    """
    Sends a batch of records to Firehose and deletes the successful ones from DynamoDB.
    Returns the count of records successfully sent and deleted.
    """
    if not firehose_batch:
        return 0, 0

    sent_count = 0
    deleted_count = 0

    try:
        # 1. Send the entire batch to Firehose
        response = firehose_client.put_record_batch(
            DeliveryStreamName=FIREHOSE_STREAM_NAME, Records=firehose_batch
        )

        failed_put_count = response.get("FailedPutCount", 0)
        if failed_put_count > 0:
            logging.warning(
                f"{failed_put_count} of {len(firehose_batch)} records failed to be "
                "sent to Firehose in this batch. They will not be deleted."
            )

        # 2. Identify successful records and prepare their keys for deletion
        keys_to_delete = []
        for i, record_response in enumerate(response.get("RequestResponses", [])):
            original_item = dynamodb_items_batch[i]
            request_id = original_item.get("requestId", "N/A")

            if "ErrorCode" in record_response:
                # This record failed, log the error and skip deletion
                logging.error(
                    f"Failed to send record {request_id} to Firehose: "
                    f"{record_response.get('ErrorCode')} - {record_response.get('ErrorMessage')}"
                )
            else:
                # This record succeeded, add its primary key to the deletion list
                try:
                    primary_key = {
                        key: original_item[key]
                        for key in DYNAMODB_PRIMARY_KEY_ATTRIBUTES
                    }
                    keys_to_delete.append(primary_key)
                except KeyError as ke:
                    logging.error(
                        f"Sent record {request_id} but cannot delete. "
                        f"Missing primary key attribute in item: {ke}. "
                        "Please check DYNAMODB_PRIMARY_KEY_ATTRIBUTES configuration."
                    )

        sent_count = len(keys_to_delete)

        # 3. Use a batch_writer to efficiently delete all successful records from DynamoDB
        if keys_to_delete:
            with dynamodb_table.batch_writer() as batch:
                for key in keys_to_delete:
                    batch.delete_item(Key=key)
            deleted_count = len(keys_to_delete)
            logging.info(
                f"Successfully sent and deleted {deleted_count} records in this batch."
            )

    except Exception as e:
        logging.error(f"A fatal error occurred while processing a batch: {e}")
        # In case of a total batch failure, nothing is sent or deleted.
        return 0, 0

    return sent_count, deleted_count


def process_send_and_delete_logs():
    """
    Scans a DynamoDB table, sends items in batches to Kinesis Data Firehose,
    and deletes successfully processed items from the DynamoDB table.
    """
    if not DYNAMODB_PRIMARY_KEY_ATTRIBUTES:
        logging.error(
            "Configuration error: DYNAMODB_PRIMARY_KEY_ATTRIBUTES is not set."
        )
        return

    try:
        dynamodb = boto3.resource("dynamodb", region_name=AWS_REGION)
        table = dynamodb.Table(DYNAMODB_TABLE_NAME)
        firehose = boto3.client("firehose", region_name=AWS_REGION)

        total_sent_count = 0
        total_deleted_count = 0
        logging.info(
            f"Starting scan of '{DYNAMODB_TABLE_NAME}' to process in batches of {FIREHOSE_BATCH_SIZE}..."
        )

        firehose_batch = []
        dynamodb_items_batch = []
        scan_kwargs = {}
        done = False
        start_key = None

        while not done:
            if start_key:
                scan_kwargs["ExclusiveStartKey"] = start_key

            response = table.scan(**scan_kwargs)
            items = response.get("Items", [])

            for item in items:
                # Build the record for the Firehose batch
                output_record = {key: item.get(key) for key in REQUIRED_KEYS}
                payload = (json.dumps(output_record, cls=DecimalEncoder) + "\n").encode(
                    "utf-8"
                )
                firehose_batch.append({"Data": payload})

                # Keep the original item to get its primary key for deletion later
                dynamodb_items_batch.append(item)

                # If the batch is full, process it
                if len(firehose_batch) >= FIREHOSE_BATCH_SIZE:
                    sent, deleted = _process_and_delete_batch(
                        firehose_batch, dynamodb_items_batch, firehose, table
                    )
                    total_sent_count += sent
                    total_deleted_count += deleted
                    # Clear the batches for the next set of records
                    firehose_batch = []
                    dynamodb_items_batch = []

            start_key = response.get("LastEvaluatedKey", None)
            done = start_key is None

        # Process any remaining records that are left over after the loop finishes
        if firehose_batch:
            logging.info(
                f"Processing the final batch of {len(firehose_batch)} records..."
            )
            sent, deleted = _process_and_delete_batch(
                firehose_batch, dynamodb_items_batch, firehose, table
            )
            total_sent_count += sent
            total_deleted_count += deleted

        logging.info(
            f"Scan complete. Total records sent: {total_sent_count}. "
            f"Total records deleted: {total_deleted_count}."
        )

    except (
        boto3.client("dynamodb").exceptions.ResourceNotFoundException,
        dynamodb.meta.client.exceptions.ResourceNotFoundException,
    ):
        logging.error(f"Error: DynamoDB table '{DYNAMODB_TABLE_NAME}' not found.")
    except firehose.exceptions.ResourceNotFoundException:
        logging.error(f"Error: Firehose stream '{FIREHOSE_STREAM_NAME}' not found.")
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}")


if __name__ == "__main__":
    process_send_and_delete_logs()
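
Note: for illustration, a small standalone sketch (hypothetical item values) of the newline-delimited JSON payload the script builds for each Firehose record; the field names come from REQUIRED_KEYS above, and the Decimal handling mirrors DecimalEncoder:

    import json
    from decimal import Decimal

    # Mirrors DecimalEncoder above: whole-number Decimals from DynamoDB become
    # ints, fractional ones become floats, so the payload is plain JSON.
    class DecimalEncoder(json.JSONEncoder):
        def default(self, obj):
            if isinstance(obj, Decimal):
                return int(obj) if obj % 1 == 0 else float(obj)
            return super().default(obj)

    # Hypothetical audit-log item as a DynamoDB scan might return it.
    item = {
        "module": "tickets",
        "createdAt": Decimal("1722470400"),
        "actor": "someone@example.com",
        "message": "Ticket scanned",
        "requestId": "7c0a1c9e-example",
        "target": "ticket-42",
    }

    required_keys = ["module", "createdAt", "actor", "message", "requestId", "target"]
    output_record = {key: item.get(key) for key in required_keys}

    # One Firehose record: newline-terminated JSON, so the delivery stream writes
    # line-delimited objects that the audit-log notebook can read back later.
    payload = (json.dumps(output_record, cls=DecimalEncoder) + "\n").encode("utf-8")
    print(payload)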
