codefortulsa · groovecoder · Apr 2, 2025 · Mar 31, 2025 · Apr 1, 2025 · Apr 1, 2025
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,6 @@
 # Environment variables
 .env
+.envrc
 
 data/
 models/
@@ -16,6 +17,7 @@ __pycache__/
 *.py[cod]
 *.so
 .Python
+.python-version
 build/
 develop-eggs/
 dist/

diff --git a/README.md b/README.md
@@ -14,17 +14,35 @@ poetry install --no-root
 poetry self add poetry-plugin-shell
 poetry shell
 
-# Install Jupyter kernel for this environment (needed for Jupyter notebooks)
-python -m ipykernel install --user --name=tgov-scraper --display-name="TGOV Scraper"
+# Set up pre-commit hooks
+poetry run pre-commit install
+
+# Verify pre-commit hooks are working
+poetry run pre-commit run --all-files
+
+# See notebook_precommit.md for more details on how notebook outputs are automatically stripped
 ```
 
 ## Running
+### Jupyter notebooks
 
 ```bash
+# Install Jupyter kernel for this environment (needed for Jupyter notebooks)
+python -m ipykernel install --user --name=tgov-scraper --display-name="TGOV Scraper"
+
 jupyter notebook
 ```
 
-## Running Tests
+### Prefect flows
+See https://docs.prefect.io/get-started
+
+```bash
+prefect server start                      # to start the persistent server
+
+python -m flows.translate_meetings        # to run a specific flow
+```
+
+### Tests
 
 ```bash
 # Run all tests
@@ -39,12 +57,16 @@ pytest -v
 
 ## Project Structure
 
+- `data/`: local data artifacts
+- `flows/`: Prefect flows
+- `notebooks/`: Jupyter notebooks for analysis and exploration
+- `scripts/`: one-off scripts for downloading, conversions, etc.
 - `src/`: Source code for the scraper
   - `models/`: Pydantic models for data representation
-- 'scripts`: one off scripts for downloading, conversions, etc
+- `tasks/`: Prefect tasks
 - `tests/`: Test files
 - `notebooks/`: Jupyter notebooks for analysis and exploration
-- `data/`: output from notebooks 
+- `data/`: output from notebooks
 
 
 ## Running the transcription scripts

diff --git a/data/meetings.csv b/data/meetings.csv
diff --git a/flows/__init__.py b/flows/__init__.py
diff --git a/flows/translate_meetings.py b/flows/translate_meetings.py
@@ -0,0 +1,17 @@
+from prefect import flow
+
+from tasks.meetings import create_meetings_csv
+
+
+@flow(log_prints=True)
+async def translate_meetings():  # flow entry point; only the first pipeline step is implemented so far
+    await create_meetings_csv()  # task imported from tasks.meetings
+    # TODO: await download_videos()
+    # TODO: await transcribe_videos()
+    # TODO: await diarize_transcriptions()
+    # TODO: await translate_transcriptions()
+    # TODO: await create_subtitled_video_pages()
+
+if __name__ == "__main__":
+    import asyncio
+    asyncio.run(translate_meetings())  # drive the async flow to completion when run as a script
diff --git a/NOTEBOOK_GUIDELINES.md → notebook_precommit.md b/NOTEBOOK_GUIDELINES.md → notebook_precommit.md
diff --git a/notebooks/meetings.ipynb b/notebooks/meetings.ipynb
diff --git a/notebooks/vtt_subtitles.ipynb b/notebooks/vtt_subtitles.ipynb
@@ -70,17 +70,17 @@
     "\n",
     "# Import from the new subtitles module\n",
     "from src.subtitles import create_track, load_transcript\n",
-    "from src.models.subtitles import TrackFormat\n",
+    "from src.models.subtitles import SubtitleTrack\n",
     "\n",
     "# Path to the transcript file\n",
     "transcript_file = Path(\n",
     "    \"../data/transcripts/regular_council_meeting___2025_02_26.diarized.json\"\n",
     ")\n",
     "\n",
     "# Create VTT track\n",
-    "vtt_track = create_track(\n",
+    "vtt_track: SubtitleTrack = create_track(\n",
     "    transcript_data=transcript_file,\n",
-    "    track_format='vtt',\n",
+    "    format='vtt',\n",
     "    max_duration=5.0,\n",
     "    include_speaker_prefix=False,\n",
     ")\n",

diff --git a/pyproject.toml b/pyproject.toml
@@ -2,11 +2,11 @@
 name = "tgov scraper"
 version = "0.1.0"
 description = "A set of scripts and notebooks for exploring Tulsa Government Access Television"
-authors = ["jdungan <[email protected]>"]
+authors = ["jdungan <[email protected]>", "groovecoder <[email protected]>"]
 readme = "README.md"
 
 [tool.poetry.dependencies]
-python = "3.11.*"
+python = ">=3.11,<3.13"
 selectolax = "^0.3.28"
 aiohttp = "^3.11.13"
 pytest-asyncio = "^0.25.3"
@@ -25,14 +25,17 @@ jupyter-nbextensions-configurator = "^0.6.4"
 python-dotenv = "^1.0.1"
 aiofiles = "^24.1.0"
 faster-whisper = "^1.1.1"
+prefect = "^3.3.0"
+boto3 = "^1.37.24"
 
 
 [tool.poetry.group.dev.dependencies]
-jupyter = "^1.1.1"
+ipdb = "^0.13.13"
 ipykernel = "^6.29.5"
-pytest = "^8.0.0"
-pre-commit = "^4.2.0"
+jupyter = "^1.1.1"
 nbstripout = "^0.8.1"
+pre-commit = "^4.2.0"
+pytest = "^8.0.0"
 
 [build-system]
 requires = ["poetry-core"]

diff --git a/scripts/download_m3u8.py b/scripts/download_m3u8.py