5 changes: 5 additions & 0 deletions .gitignore
@@ -12,7 +12,12 @@ __pycache__
# config
config/local.json
.envrc
.poetry

# logs
gobble.log
s3_upload.log

# ides
.cursor
.vscode
3 changes: 2 additions & 1 deletion config/template.json
@@ -6,5 +6,6 @@
"dir": null,
"refresh_interval_days": 7
},
"DATADOG_TRACE_ENABLED": false
"DATADOG_TRACE_ENABLED": false,
"file_retention_days": 180
}
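
Reviewer note: a sketch of how this new retention knob could presumably be overridden locally, since config/local.json is gitignored above and appears to mirror the template. The 30-day value and the elided surrounding keys are illustrative only, not part of this PR.

{
    ...,
    "file_retention_days": 30
}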
1 change: 1 addition & 0 deletions devops/deploy.sh
@@ -23,6 +23,7 @@ echo "Deploying Gobble..."
echo "View stack log here: https://$AWS_REGION.console.aws.amazon.com/cloudformation/home?region=$AWS_REGION"

aws cloudformation deploy --stack-name $STACK_NAME \
    --tags service=gobble env=prod \
    --template-file cloudformation.json \
    --capabilities CAPABILITY_NAMED_IAM \
    --no-fail-on-empty-changeset
2 changes: 1 addition & 1 deletion poetry.lock

Some generated files are not rendered by default.

46 changes: 46 additions & 0 deletions src/disk.py
@@ -1,6 +1,7 @@
import csv
import os
import pathlib
from datetime import datetime, timedelta
from util import output_dir_path
from ddtrace import tracer

@@ -11,6 +12,9 @@
logger = set_up_logging(__name__)
tracer.enabled = CONFIG["DATADOG_TRACE_ENABLED"]

# Number of days to retain event files before cleanup
FILE_RETENTION_DAYS = CONFIG.get("file_retention_days", 180)

CSV_FILENAME = "events.csv"
CSV_FIELDS = [
"service_date",
@@ -50,3 +54,45 @@ def write_event(event: dict):
        if not file_exists:
            writer.writeheader()
        writer.writerow(event)


def _scan_and_cleanup(path: pathlib.Path, cutoff: datetime) -> int:
    """Recursively scan directory and delete CSV files older than cutoff.

    Args:
        path: Directory path to scan
        cutoff: Datetime cutoff - files modified before this will be deleted

    Returns:
        Number of files deleted
    """
    deleted = 0
    with os.scandir(path) as it:
        for entry in it:
            try:
                if entry.is_dir(follow_symlinks=False):
                    deleted += _scan_and_cleanup(entry.path, cutoff)
                elif entry.is_file(follow_symlinks=False) and entry.name == CSV_FILENAME:
                    if datetime.fromtimestamp(entry.stat().st_mtime) < cutoff:
                        os.unlink(entry.path)
                        deleted += 1
                        logger.info(f"Deleted old file: {entry.path}")
            except Exception as e:
                logger.warning(f"Skipping {entry.path}: {e}")
    return deleted


def cleanup_old_files(reference_time: datetime = None):
    """Delete CSV files older than the configured retention period.

    Args:
        reference_time: The datetime to use as reference for calculating cutoff.
            Defaults to current time if not provided.
    """
    logger.info("Starting cleanup of old files")
    if reference_time is None:
        reference_time = datetime.now()
    cutoff = reference_time - timedelta(days=FILE_RETENTION_DAYS)

    deleted = _scan_and_cleanup(DATA_DIR, cutoff)
    logger.info(f"Completed cleanup — deleted {deleted} file(s)")
6 changes: 5 additions & 1 deletion src/s3_upload.py
@@ -9,7 +9,7 @@
import logging

from config import CONFIG
from disk import DATA_DIR
from disk import DATA_DIR, cleanup_old_files
from logger import set_up_logging
from util import EASTERN_TIME, service_date

@@ -45,6 +45,7 @@ def _compress_and_upload_file(fp: str):
def upload_todays_events_to_s3():
"""Upload today's events to the TM s3 bucket."""
start_time = time.time()
start_datetime = datetime.datetime.now()

logger.info("Beginning upload of recent events to s3.")
pull_date = service_date(datetime.datetime.now(EASTERN_TIME))
@@ -62,6 +63,9 @@ def upload_todays_events_to_s3():
    end_time = time.time()
    logger.info(f"Uploaded {len(files_updated_today)} files to s3, took {end_time - start_time} seconds.")

    # Clean up old files to free disk space.
    cleanup_old_files(reference_time=start_datetime)


if __name__ == "__main__":
    logger = set_up_logging(__file__)