Skip to content

Commit e7ca2ab

Browse files
committed
add workflow to deploy diarization
1 parent e549230 commit e7ca2ab

25 files changed

+143
-515
lines changed

.github/workflows/python-app.yml

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
# CI workflow: installs the project with Poetry, lints it with flake8, and
# runs the diarization entry point on pushes and pull requests.
name: Build/run tgov

on:
  push:
    branches: [ "main", "deploy-lambda" ]
  pull_request:
    branches: [ "main" ]

permissions:
  contents: read

env:
  # The run steps below reference these two variables; without a definition
  # they expand to empty strings and the venv/pip commands fail.
  POETRY_VERSION: "1.3.2"
  # Must be writable by the unprivileged runner user.
  POETRY_VENV: /tmp/poetry-venv

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v4
    - name: Set up Python 3.11
      uses: actions/setup-python@v5
      with:
        python-version: "3.11"
    - name: Install dependencies
      run: |
        # Hosted runners execute steps as a non-root user, so apt needs sudo.
        # No apt cache cleanup: the runner VM is discarded after the job.
        sudo apt-get update
        sudo apt-get install --no-install-suggests --no-install-recommends --yes python3-venv gcc libpython3-dev
        python3 -m venv "${POETRY_VENV}"
        "${POETRY_VENV}/bin/pip" install -U pip setuptools
        "${POETRY_VENV}/bin/pip" install "poetry==${POETRY_VERSION}"
        # PATH changes made with `export` do not survive into later steps,
        # so poetry is always called by its full path.
        "${POETRY_VENV}/bin/poetry" config virtualenvs.create false
        "${POETRY_VENV}/bin/poetry" install --no-interaction
    - name: Lint with flake8
      run: |
        # flake8 is installed explicitly so linting does not depend on the
        # project's own dependency groups.
        python -m pip install flake8
        # stop the build if there are Python syntax errors or undefined names
        python -m flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        python -m flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Run Diarization
      env:
        # Same role as `ENV PYTHONPATH=/app` in the Dockerfile: makes the
        # repository root importable.
        PYTHONPATH: ${{ github.workspace }}
      run: |
        # `poetry run` uses the same environment `poetry install` populated,
        # whichever interpreter that turned out to be.
        "${POETRY_VENV}/bin/poetry" run python src/run_diarization.py

Dockerfile

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
# Image that installs the project with a pinned Poetry and runs the
# diarization script as its default command.
FROM python:3.12-slim AS build
ARG POETRY_VERSION=1.3.2
ENV POETRY_VENV=/opt/poetry-venv

# OS build prerequisites plus a dedicated virtualenv that holds Poetry.
# The apt lists are removed and pip's download cache is disabled in this same
# layer so neither ends up in the image.
# NOTE(review): python3-venv / libpython3-dev are the Debian packages; confirm
# they are still needed alongside the interpreter shipped in the base image.
RUN apt-get update && \
    apt-get install --no-install-suggests --no-install-recommends --yes \
        gcc \
        libpython3-dev \
        python3-venv && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* && \
    python3 -m venv "${POETRY_VENV}" && \
    "${POETRY_VENV}/bin/pip" install --no-cache-dir -U pip setuptools && \
    "${POETRY_VENV}/bin/pip" install --no-cache-dir "poetry==${POETRY_VERSION}"

# Appended (not prepended) so the build steps can find `poetry` without
# changing which `python` comes first on PATH during the build.
ENV PATH="${PATH}:${POETRY_VENV}/bin"
WORKDIR /app
COPY poetry.lock pyproject.toml ./
COPY src /app/src
COPY db /app/db
COPY README.md /app/README.md

# Non-interactive install so the build cannot block on a prompt.
RUN poetry config virtualenvs.create false && \
    poetry install --no-interaction --no-ansi

ENV PYTHONPATH=/app
# Prepended for runtime: `python` in CMD now resolves to POETRY_VENV first.
ENV PATH="${POETRY_VENV}/bin:${PATH}"

CMD ["python", "src/run_diarization.py"]
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

notebooks/videos.ipynb

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,8 @@
115115
"# Create output directory if it doesn't exist\n",
116116
"VIDEO_DIRECTORY = Path(\"../data/video\")\n",
117117
"VIDEO_DIRECTORY.mkdir(parents=True, exist_ok=True)\n",
118+
"print(file_name)\n",
119+
"print(video_url)\n",
118120
"\n",
119121
"# Define output path for the video\n",
120122
"output_path = VIDEO_DIRECTORY / f\"{file_name}.mp4\"\n",
@@ -181,26 +183,15 @@
181183
"metadata": {},
182184
"source": [
183185
"### Convert the video file into a transcript\n",
184-
"This step requires a huggingface login and api_token.\n",
185-
"You will also need to agree to terms on each of the following models:\n",
186-
"- guillaumekln/faster-whisper\n",
187-
"- \n"
186+
"This step requires a huggingface login and api_token (?)"
188187
]
189188
},
190189
{
191190
"cell_type": "code",
192191
"execution_count": null,
193192
"metadata": {},
194193
"outputs": [],
195-
"source": [
196-
"from src.videos import transcribe_video\n",
197-
"\n",
198-
"video_file = \"../data/video/regular_council_meeting___2025_02_26.mp4\"\n",
199-
"\n",
200-
"transcription_dir = Path(\"../data/transcripts\")\n",
201-
"\n",
202-
"transcription = await transcribe_video(video_file, transcription_dir)"
203-
]
194+
"source": []
204195
},
205196
{
206197
"cell_type": "code",
@@ -216,13 +207,20 @@
216207
"\n",
217208
"transcription = await transcribe_video_with_diarization(video_file, transcription_dir)"
218209
]
210+
},
211+
{
212+
"cell_type": "code",
213+
"execution_count": null,
214+
"metadata": {},
215+
"outputs": [],
216+
"source": []
219217
}
220218
],
221219
"metadata": {
222220
"kernelspec": {
223-
"display_name": "TGOV Scraper",
221+
"display_name": ".venv",
224222
"language": "python",
225-
"name": "tgov-scraper"
223+
"name": "python3"
226224
},
227225
"language_info": {
228226
"codemirror_mode": {
@@ -234,7 +232,7 @@
234232
"name": "python",
235233
"nbconvert_exporter": "python",
236234
"pygments_lexer": "ipython3",
237-
"version": "3.11.9"
235+
"version": "3.11.10"
238236
}
239237
},
240238
"nbformat": 4,

push_image.sh

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
#!/bin/bash
# Builds the linux/amd64 image and pushes it to ECR under two tags:
# a Unix-timestamp tag and `latest`.

# -e: stop on first failure; -u: unset variables are errors; -x: trace;
# pipefail: a failing `aws` in the login pipe is not hidden by `docker login`.
set -euxo pipefail

readonly registry="340531845404.dkr.ecr.us-east-2.amazonaws.com"
readonly repository="${registry}/tgov"

aws ecr get-login-password --region us-east-2 --profile kendall \
  | docker login --username AWS --password-stdin "${registry}"
docker buildx build --platform linux/amd64 --provenance=false -t tgov_linux .

# Assigned separately from `export` so a failing `date` is not masked.
tag="$(date +%s)"
export tag

docker tag tgov_linux "${repository}:${tag}"
docker tag tgov_linux "${repository}:latest"
docker push "${repository}:${tag}"
docker push "${repository}:latest"

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,11 @@ jupyter = "^1.1.1"
2525
jupyter-nbextensions-configurator = "^0.6.4"
2626
python-dotenv = "^1.0.1"
2727
aiofiles = "^24.1.0"
28-
faster-whisper = "^1.1.1"
2928
prefect = "^3.3.0"
3029
boto3 = "^1.37.24"
3130
dyntastic = "^0.18.0"
3231
dateparser = "^1.2.1"
32+
whisperx = "^3.3.4"
3333

3434

3535
[tool.poetry.group.dev.dependencies]

0 commit comments

Comments
 (0)