5 changes: 5 additions & 0 deletions .gitignore
@@ -12,7 +12,12 @@ __pycache__
# config
config/local.json
.envrc
.poetry

# logs
gobble.log
s3_upload.log

# ides
.cursor
.vscode
3 changes: 2 additions & 1 deletion config/template.json
@@ -6,5 +6,6 @@
"dir": null,
"refresh_interval_days": 7
},
"DATADOG_TRACE_ENABLED": false
"DATADOG_TRACE_ENABLED": false,
"file_retention_days": 180
}
1 change: 1 addition & 0 deletions devops/deploy.sh
@@ -23,6 +23,7 @@ echo "Deploying Gobble..."
echo "View stack log here: https://$AWS_REGION.console.aws.amazon.com/cloudformation/home?region=$AWS_REGION"

aws cloudformation deploy --stack-name $STACK_NAME \
--tags service=gobble env=prod \
--template-file cloudformation.json \
--capabilities CAPABILITY_NAMED_IAM \
--no-fail-on-empty-changeset
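
To confirm the new stack tags land after a deploy, a quick reviewer-side check (not part of this diff); the region and stack name below are placeholders standing in for the script's $AWS_REGION and $STACK_NAME, and default AWS credentials are assumed:

import boto3

# Hypothetical values standing in for the deploy script's $AWS_REGION / $STACK_NAME.
cf = boto3.client("cloudformation", region_name="us-east-1")
stack = cf.describe_stacks(StackName="gobble")["Stacks"][0]
# Expect the tags added above: service=gobble and env=prod.
print(stack.get("Tags", []))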
2 changes: 1 addition & 1 deletion poetry.lock

Some generated files are not rendered by default.

46 changes: 46 additions & 0 deletions src/disk.py
@@ -1,6 +1,7 @@
import csv
import os
import pathlib
from datetime import datetime, timedelta
from util import output_dir_path
from ddtrace import tracer

@@ -11,6 +12,9 @@
logger = set_up_logging(__name__)
tracer.enabled = CONFIG["DATADOG_TRACE_ENABLED"]

# Number of days to retain event files before cleanup
FILE_RETENTION_DAYS = CONFIG.get("file_retention_days", 180)

CSV_FILENAME = "events.csv"
CSV_FIELDS = [
"service_date",
@@ -50,3 +54,45 @@ def write_event(event: dict):
if not file_exists:
writer.writeheader()
writer.writerow(event)


def _scan_and_cleanup(path: pathlib.Path, cutoff: datetime) -> int:
"""Recursively scan directory and delete CSV files older than cutoff.

Args:
path: Directory path to scan
cutoff: Datetime cutoff - files modified before this will be deleted

Returns:
Number of files deleted
"""
deleted = 0
with os.scandir(path) as it:
for entry in it:
try:
if entry.is_dir(follow_symlinks=False):
deleted += _scan_and_cleanup(entry.path, cutoff)
elif entry.is_file(follow_symlinks=False) and entry.name == CSV_FILENAME:
if datetime.fromtimestamp(entry.stat().st_mtime) < cutoff:
os.unlink(entry.path)
deleted += 1
logger.info(f"Deleted old file: {entry.path}")
except Exception as e:
logger.warning(f"Skipping {entry.path}: {e}")
return deleted


def cleanup_old_files(reference_time: datetime = None):
"""Delete CSV files older than the configured retention period.

Args:
reference_time: The datetime to use as reference for calculating cutoff.
Defaults to current time if not provided.
"""
logger.info("Starting cleanup of old files")
if reference_time is None:
reference_time = datetime.now()
cutoff = reference_time - timedelta(days=FILE_RETENTION_DAYS)

deleted = _scan_and_cleanup(DATA_DIR, cutoff)
logger.info(f"Completed cleanup — deleted {deleted} file(s)")
6 changes: 5 additions & 1 deletion src/s3_upload.py
@@ -9,7 +9,7 @@
import logging

from config import CONFIG
from disk import DATA_DIR
from disk import DATA_DIR, cleanup_old_files
from logger import set_up_logging
from util import EASTERN_TIME, service_date

@@ -45,6 +45,7 @@ def _compress_and_upload_file(fp: str):
def upload_todays_events_to_s3():
"""Upload today's events to the TM s3 bucket."""
start_time = time.time()
start_datetime = datetime.datetime.now()

logger.info("Beginning upload of recent events to s3.")
pull_date = service_date(datetime.datetime.now(EASTERN_TIME))
@@ -62,6 +63,9 @@ def upload_todays_events_to_s3():
end_time = time.time()
logger.info(f"Uploaded {len(files_updated_today)} files to s3, took {end_time - start_time} seconds.")

# Clean up old files to free up disk space
cleanup_old_files(reference_time=start_datetime)


if __name__ == "__main__":
logger = set_up_logging(__file__)