Skip to content

Commit 8826610

Browse files
authored
Setup firehose stream to archive data (#267)
1 parent da7407b commit 8826610

File tree

24 files changed

+2207
-413
lines changed

24 files changed

+2207
-413
lines changed

infracost-usage.yml

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,25 @@ resource_usage:
2121
monthly_select_data_scanned_gb: 0 # Monthly data scanned by S3 Select in GB.
2222
monthly_select_data_returned_gb: 0 # Monthly data returned by S3 Select in GB.
2323
early_delete_gb: 0 # If an archive is deleted within 1 month of being uploaded, you will be charged an early deletion fee per GB.
24+
module.ttl_archiver.aws_s3_bucket.this:
25+
aws_s3_bucket:
26+
object_tags: 50000 # Total object tags. Only for AWS provider V3.
27+
standard: # Usages of S3 Standard:
28+
storage_gb: 30 # Total storage in GB.
29+
monthly_tier_1_requests: 10000 # Monthly PUT, COPY, POST, LIST requests (Tier 1).
30+
monthly_tier_2_requests: 125000 # Monthly GET, SELECT, and all other requests (Tier 2).
31+
monthly_select_data_scanned_gb: 20 # Monthly data scanned by S3 Select in GB.
32+
monthly_select_data_returned_gb: 2 # Monthly data returned by S3 Select in GB.
33+
intelligent_tiering: # Usages of S3 Intelligent - Tiering:
34+
frequent_access_storage_gb: 30 # Total storage for Frequent Access Tier in GB.
35+
infrequent_access_storage_gb: 0 # Total storage for Infrequent Access Tier in GB.
36+
monitored_objects: 0 # Total objects monitored by the Intelligent Tiering.
37+
monthly_tier_1_requests: 0 # Monthly PUT, COPY, POST, LIST requests (Tier 1).
38+
monthly_tier_2_requests: 0 # Monthly GET, SELECT, and all other requests (Tier 2).
39+
monthly_lifecycle_transition_requests: 0 # Monthly Lifecycle Transition requests.
40+
monthly_select_data_scanned_gb: 0 # Monthly data scanned by S3 Select in GB.
41+
monthly_select_data_returned_gb: 0 # Monthly data returned by S3 Select in GB.
42+
early_delete_gb: 0 # If an archive is deleted within 1 month of being uploaded, you will be charged an early deletion fee per GB.
2443

2544
resource_type_default_usage:
2645
aws_acmpca_certificate_authority:
@@ -204,7 +223,7 @@ resource_type_default_usage:
204223
aws_kinesisanalyticsv2_application_snapshot:
205224
durable_application_backup_gb: 208 # Total amount of durable application backups in GB.
206225
aws_kinesis_firehose_delivery_stream:
207-
monthly_data_ingested_gb: 142 # Monthly data ingested by the Delivery Stream in GB.
226+
monthly_data_ingested_gb: 20 # Monthly data ingested by the Delivery Stream in GB.
208227
aws_kinesis_stream:
209228
monthly_on_demand_data_in_gb: 62 # Monthly data ingested by the stream in GB.
210229
monthly_on_demand_data_out_gb: 125 # Monthly data egressed by the stream in GB total, (not per consumer application).
@@ -282,8 +301,8 @@ resource_type_default_usage:
282301
storage_gb: 10 # Total storage in GB.
283302
monthly_tier_1_requests: 10000 # Monthly PUT, COPY, POST, LIST requests (Tier 1).
284303
monthly_tier_2_requests: 125000 # Monthly GET, SELECT, and all other requests (Tier 2).
285-
monthly_select_data_scanned_gb: 2500 # Monthly data scanned by S3 Select in GB.
286-
monthly_select_data_returned_gb: 7250 # Monthly data returned by S3 Select in GB.
304+
monthly_select_data_scanned_gb: 0 # Monthly data scanned by S3 Select in GB.
305+
monthly_select_data_returned_gb: 0 # Monthly data returned by S3 Select in GB.
287306
intelligent_tiering: # Usages of S3 Intelligent - Tiering:
288307
frequent_access_storage_gb: 30 # Total storage for Frequent Access Tier in GB.
289308
infrequent_access_storage_gb: 0 # Total storage for Infrequent Access Tier in GB.

notebooks/.gitignore

Lines changed: 219 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,219 @@
1+
# Byte-compiled / optimized / DLL files
2+
__pycache__/
3+
*.py[codz]
4+
*$py.class
5+
6+
# C extensions
7+
*.so
8+
9+
# Distribution / packaging
10+
.Python
11+
build/
12+
develop-eggs/
13+
dist/
14+
downloads/
15+
eggs/
16+
.eggs/
17+
lib/
18+
lib64/
19+
parts/
20+
sdist/
21+
var/
22+
wheels/
23+
share/python-wheels/
24+
*.egg-info/
25+
.installed.cfg
26+
*.egg
27+
MANIFEST
28+
29+
# PyInstaller
30+
# Usually these files are written by a python script from a template
31+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
32+
*.manifest
33+
*.spec
34+
35+
# Installer logs
36+
pip-log.txt
37+
pip-delete-this-directory.txt
38+
39+
# Unit test / coverage reports
40+
htmlcov/
41+
.tox/
42+
.nox/
43+
.coverage
44+
.coverage.*
45+
.cache
46+
nosetests.xml
47+
coverage.xml
48+
*.cover
49+
*.py.cover
50+
.hypothesis/
51+
.pytest_cache/
52+
cover/
53+
54+
# Translations
55+
*.mo
56+
*.pot
57+
58+
# Django stuff:
59+
*.log
60+
local_settings.py
61+
db.sqlite3
62+
db.sqlite3-journal
63+
64+
# Flask stuff:
65+
instance/
66+
.webassets-cache
67+
68+
# Scrapy stuff:
69+
.scrapy
70+
71+
# Sphinx documentation
72+
docs/_build/
73+
74+
# PyBuilder
75+
.pybuilder/
76+
target/
77+
78+
# Jupyter Notebook
79+
.ipynb_checkpoints
80+
81+
# IPython
82+
profile_default/
83+
ipython_config.py
84+
85+
# pyenv
86+
# For a library or package, you might want to ignore these files since the code is
87+
# intended to run in multiple environments; otherwise, check them in:
88+
# .python-version
89+
90+
# pipenv
91+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
93+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
94+
# install all needed dependencies.
95+
#Pipfile.lock
96+
97+
# UV
98+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99+
# This is especially recommended for binary packages to ensure reproducibility, and is more
100+
# commonly ignored for libraries.
101+
#uv.lock
102+
103+
# poetry
104+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105+
# This is especially recommended for binary packages to ensure reproducibility, and is more
106+
# commonly ignored for libraries.
107+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108+
#poetry.lock
109+
#poetry.toml
110+
111+
# pdm
112+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115+
#pdm.lock
116+
#pdm.toml
117+
.pdm-python
118+
.pdm-build/
119+
120+
# pixi
121+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122+
#pixi.lock
123+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124+
# in the .venv directory. It is recommended not to include this directory in version control.
125+
.pixi
126+
127+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128+
__pypackages__/
129+
130+
# Celery stuff
131+
celerybeat-schedule
132+
celerybeat.pid
133+
134+
# Redis
135+
*.rdb
136+
*.aof
137+
*.pid
138+
139+
# RabbitMQ
140+
mnesia/
141+
rabbitmq/
142+
rabbitmq-data/
143+
144+
# ActiveMQ
145+
activemq-data/
146+
147+
# SageMath parsed files
148+
*.sage.py
149+
150+
# Environments
151+
.env
152+
.envrc
153+
.venv
154+
env/
155+
venv/
156+
ENV/
157+
env.bak/
158+
venv.bak/
159+
160+
# Spyder project settings
161+
.spyderproject
162+
.spyproject
163+
164+
# Rope project settings
165+
.ropeproject
166+
167+
# mkdocs documentation
168+
/site
169+
170+
# mypy
171+
.mypy_cache/
172+
.dmypy.json
173+
dmypy.json
174+
175+
# Pyre type checker
176+
.pyre/
177+
178+
# pytype static type analyzer
179+
.pytype/
180+
181+
# Cython debug symbols
182+
cython_debug/
183+
184+
# PyCharm
185+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
186+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
187+
# and can be added to the global gitignore or merged into this file. For a more nuclear
188+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
189+
#.idea/
190+
191+
# Abstra
192+
# Abstra is an AI-powered process automation framework.
193+
# Ignore directories containing user credentials, local state, and settings.
194+
# Learn more at https://abstra.io/docs
195+
.abstra/
196+
197+
# Visual Studio Code
198+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
199+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
200+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
201+
# you could uncomment the following to ignore the entire vscode folder
202+
# .vscode/
203+
204+
# Ruff stuff:
205+
.ruff_cache/
206+
207+
# PyPI configuration file
208+
.pypirc
209+
210+
# Marimo
211+
marimo/_static/
212+
marimo/_lsp/
213+
__marimo__/
214+
215+
# Streamlit
216+
.streamlit/secrets.toml
217+
218+
# Config
219+
config.py

notebooks/README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Notebooks
2+
IPYNB notebooks for ACM tasks
3+
4+
## Getting Started
5+
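Create the file `config.py` with the following contents (the `read_archived_s3.ipynb` notebook additionally requires `ARCHIVE_S3_BUCKET`, set to the name of the archival data bucket):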
6+
```python
7+
AWS_PROFILE="<your authenticated AWS profile>"
8+
```

notebooks/read_archived_s3.ipynb

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "ba97204d",
6+
"metadata": {},
7+
"source": [
8+
"## Read Archived Logs from S3\n",
9+
"Make sure that you have signed into AWS and setup a profile.\n",
10+
"\n",
11+
"Also, create `config.py` with contents:\n",
12+
"```python\n",
13+
"ARCHIVE_S3_BUCKET = \"<bucket name of archival data bucket>\"\n",
14+
"AWS_PROFILE = \"<AWS_PROFILE HERE>\"\n",
15+
"```"
16+
]
17+
},
18+
{
19+
"cell_type": "code",
20+
"execution_count": null,
21+
"id": "22f8f0a6",
22+
"metadata": {},
23+
"outputs": [],
24+
"source": [
25+
"!pip install s3fs pandas"
26+
]
27+
},
28+
{
29+
"cell_type": "code",
30+
"execution_count": null,
31+
"id": "84a7370f",
32+
"metadata": {},
33+
"outputs": [],
34+
"source": [
35+
"import pandas as pd\n",
36+
"import config\n",
37+
"\n",
38+
"assert config.ARCHIVE_S3_BUCKET\n",
39+
"assert config.AWS_PROFILE\n",
40+
"module_to_read = \"ExampleTableWithStream\"\n",
41+
"# configure this s3 path correctly based on the date you're looking for, etc.\n",
42+
"s3_path = f\"s3://{config.ARCHIVE_S3_BUCKET}/resource={module_to_read}/**/*.gz\"\n",
43+
"\n",
44+
"df = pd.read_json(\n",
45+
" s3_path,\n",
46+
" lines=True,\n",
47+
" compression=\"gzip\",\n",
48+
" storage_options={\"profile\": config.AWS_PROFILE, \"expand\": True},\n",
49+
")\n",
50+
"\n",
51+
"print(df.head())"
52+
]
53+
},
54+
{
55+
"cell_type": "code",
56+
"execution_count": null,
57+
"id": "328af1fa",
58+
"metadata": {},
59+
"outputs": [],
60+
"source": []
61+
}
62+
],
63+
"metadata": {
64+
"kernelspec": {
65+
"display_name": "base",
66+
"language": "python",
67+
"name": "python3"
68+
},
69+
"language_info": {
70+
"codemirror_mode": {
71+
"name": "ipython",
72+
"version": 3
73+
},
74+
"file_extension": ".py",
75+
"mimetype": "text/x-python",
76+
"name": "python",
77+
"nbconvert_exporter": "python",
78+
"pygments_lexer": "ipython3",
79+
"version": "3.13.5"
80+
}
81+
},
82+
"nbformat": 4,
83+
"nbformat_minor": 5
84+
}

package.json

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,14 @@
55
"type": "module",
66
"workspaces": [
77
"src/api",
8-
"src/ui"
8+
"src/ui",
9+
"src/archival"
910
],
1011
"packageManager": "[email protected]",
1112
"scripts": {
1213
"postinstall": "npm run setup",
1314
"setup": "git config blame.ignoreRevsFile .git-blame-ignore-revs",
14-
"build": "concurrently --names 'api,ui' 'yarn workspace infra-core-api run build' 'yarn workspace infra-core-ui run build'",
15+
"build": "concurrently --names 'api,ui,archival' 'yarn workspace infra-core-api run build' 'yarn workspace infra-core-ui run build' 'yarn workspace infra-core-archival run build'",
1516
"postbuild": "node src/api/createLambdaPackage.js && yarn lockfile-manage",
1617
"dev": "cross-env DISABLE_AUDIT_LOG=true concurrently --names 'api,ui' 'yarn workspace infra-core-api run dev' 'yarn workspace infra-core-ui run dev'",
1718
"lockfile-manage": "synp --with-workspace --source-file yarn.lock",
@@ -38,6 +39,7 @@
3839
"@eslint/eslintrc": "^3.3.1",
3940
"@eslint/js": "^9.33.0",
4041
"@playwright/test": "^1.54.2",
42+
"@smithy/types": "^4.3.2",
4143
"@tsconfig/node22": "^22.0.1",
4244
"@types/ioredis-mock": "^8.2.5",
4345
"@types/node": "^24.3.0",
@@ -92,4 +94,4 @@
9294
"pdfjs-dist": "^4.8.69",
9395
"form-data": "^4.0.4"
9496
}
95-
}
97+
}

src/api/build.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
/* eslint-disable no-console */
12
import esbuild from "esbuild";
23
import { resolve } from "path";
34
import { copy } from "esbuild-plugin-copy";

0 commit comments

Comments
 (0)