codefortulsa · jdungan · Mar 14, 2025 · Mar 13, 2025 · Mar 14, 2025 · Mar 14, 2025
diff --git a/.gitignore b/.gitignore
@@ -5,6 +5,7 @@ data/audio/
 data/video/
 data/transcripts/
 models/
+notebooks/.ipynb_checkpoints/
 
 # Python
 __pycache__/

diff --git a/README.md b/README.md
@@ -15,26 +15,26 @@ poetry self add poetry-plugin-shell
 poetry shell
 
 # Install Jupyter kernel for this environment (needed for Jupyter notebooks)
-poetry run python -m ipykernel install --user --name=tgov-scraper --display-name="TGOV Scraper"
+python -m ipykernel install --user --name=tgov-scraper --display-name="TGOV Scraper"
 ```
 
 ## Running
 
 ```bash
-poetry run jupyter notebook
+jupyter notebook
 ```
 
 ## Running Tests
 
 ```bash
 # Run all tests
-poetry run pytest
+pytest
 
 # Run specific tests
-poetry run pytest tests/test_meetings.py
+pytest tests/test_meetings.py
 
 # Run tests with verbose output
-poetry run pytest -v
+pytest -v
 ```
 
 ## Project Structure

diff --git a/notebooks/roll_call.ipynb b/notebooks/roll_call.ipynb
@@ -0,0 +1,329 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### This notebook is for focusing on a roll call to see how it is transcribed\n",
+    "\n",
+    "Recognizing short words by different speakers is difficult.  This notebook focuses in a roll call vote to see if changing model parameters can improve it.  \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "import pandas as pd\n",
+    "sys.path.append(\"../\")\n",
+    "from pathlib import Path"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### use ffmpeg to get a section of a meeting\n",
+    "This 30 second clip is a roll call vote"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Clip successfully extracted to: ../data/video/regular_council_meeting___2025_02_26_clip_4-50_to_5-20.mp4\n"
+     ]
+    }
+   ],
+   "source": [
+    "import subprocess\n",
+    "from pathlib import Path\n",
+    "\n",
+    "# Input and output file paths\n",
+    "input_file = Path(\"../data/video/regular_council_meeting___2025_02_26.mp4\")\n",
+    "clip_file = Path(\"../data/video/regular_council_meeting___2025_02_26_clip_4-50_to_5-20.mp4\")\n",
+    "\n",
+    "# Parameters for clip extraction\n",
+    "start_time = \"4:50\"\n",
+    "duration = \"30\"  # 30 seconds\n",
+    "\n",
+    "# Run FFmpeg command\n",
+    "result = subprocess.run(\n",
+    "    [\n",
+    "        \"ffmpeg\",\n",
+    "        \"-i\",\n",
+    "        str(input_file),\n",
+    "        \"-ss\",\n",
+    "        start_time,\n",
+    "        \"-t\",\n",
+    "        duration,\n",
+    "        \"-c\",\n",
+    "        \"copy\",  # Copy codec (fast but might not be frame accurate)\n",
+    "        \"-avoid_negative_ts\",\n",
+    "        \"1\",\n",
+    "        str(clip_file),\n",
+    "        \"-y\",  # Overwrite if exists\n",
+    "    ],\n",
+    "    capture_output=True,\n",
+    "    text=True,\n",
+    ")\n",
+    "\n",
+    "# Check if command was successful\n",
+    "if result.returncode == 0:\n",
+    "    print(f\"Clip successfully extracted to: {clip_file}\")\n",
+    "else:\n",
+    "    print(f\"Error extracting clip: {result.stderr}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### experiment with model parameters\n",
+    "\n",
+    "using these setting actually made the results worse:\n",
+    "- min_speakers=3,  # Specify at least 3 speakers\n",
+    "- max_speakers=15,  # Limit to at most 10 speakers\n",
+    "- diarize_min_duration=0.1,  # Shorter minimum segment duration\n",
+    "I also tested with medium, and large versions but the results using tiny were the same\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:src.videos:Transcribing video with speaker diarization: ../data/video/regular_council_meeting___2025_02_26_clip_4-50_to_5-20.mp4\n",
+      "INFO:src.videos:Output will be saved to: ../data/transcripts/regular_council_meeting___2025_02_26_clip_4-50_to_5-20.diarized.json\n",
+      "INFO:src.huggingface:Auto-detected device: cpu\n",
+      "INFO:src.huggingface:Auto-selected compute_type: int8\n",
+      "INFO:src.huggingface:Loading WhisperX model: tiny on cpu with int8 precision\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "168afa65d3ae4108af591eb1993fe482",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "89d35faecb8e447db3ccb95407e2a775",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "config.json:   0%|          | 0.00/2.25k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f616039556ee46aaaee2f975f016aeb0",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "vocabulary.txt:   0%|          | 0.00/460k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "50bd4e88d6084638b91847587cc9ed0a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model.bin:   0%|          | 0.00/75.5M [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../Library/Caches/pypoetry/virtualenvs/tgov_scraper-zRR99ne3-py3.11/lib/python3.11/site-packages/whisperx/assets/pytorch_model.bin`\n",
+      "INFO:src.huggingface:Loading diarization pipeline\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "No language specified, language will be first be detected for each audio file (increases inference time).\n",
+      ">>Performing voice activity detection using Pyannote...\n",
+      "Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.\n",
+      "Model was trained with torch 1.10.0+cu102, yours is 2.4.1. Bad things might happen unless you revert torch to 1.x.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:src.huggingface:WhisperX model loaded in 4.50 seconds\n",
+      "INFO:src.videos:Running initial transcription with batch size 8...\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Detected language: en (0.99) in first 30s of audio...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:src.videos:Detected language: en\n",
+      "INFO:src.videos:Loading alignment model for detected language: en\n",
+      "INFO:src.videos:Aligning transcription with audio...\n",
+      "INFO:src.videos:Running speaker diarization...\n",
+      "/Users/owner/Library/Caches/pypoetry/virtualenvs/tgov_scraper-zRR99ne3-py3.11/lib/python3.11/site-packages/pyannote/audio/models/blocks/pooling.py:104: UserWarning: std(): degrees of freedom is <= 0. Correction should be strictly less than the reduction factor (input numel divided by output numel). (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/native/ReduceOps.cpp:1808.)\n",
+      "  std = sequences.std(dim=-1, correction=1)\n",
+      "INFO:src.videos:Assigning speakers to transcription...\n",
+      "INFO:src.videos:Processing transcription segments...\n",
+      "INFO:src.videos:Diarized transcription completed in 30.03 seconds\n",
+      "INFO:src.videos:Detailed JSON saved to: ../data/transcripts/regular_council_meeting___2025_02_26_clip_4-50_to_5-20.diarized.json\n"
+     ]
+    }
+   ],
+   "source": [
+    "from src.videos import transcribe_video_with_diarization\n",
+    "\n",
+    "transcription_dir = Path(\"../data/transcripts\")\n",
+    "\n",
+    "transcript_data = await transcribe_video_with_diarization(\n",
+    "    clip_file,\n",
+    "    transcription_dir,\n",
+    "    model_size=\"tiny\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5d97ff70c1c3409da83c10c478f2bfaa",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HTML(value='<h3>Meeting Script</h3><hr><p><b>[00:00:00] SPEAKER_01:</b><br>Thank you, Mr. Huffinds. Any counci…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "def format_timestamp(seconds: float) -> str:\n",
+    "    \"\"\"Convert seconds to HH:MM:SS format\"\"\"\n",
+    "    hours = int(seconds // 3600)\n",
+    "    minutes = int((seconds % 3600) // 60)\n",
+    "    secs = int(seconds % 60)\n",
+    "    return f\"{hours:02d}:{minutes:02d}:{secs:02d}\"\n",
+    "\n",
+    "\n",
+    "from ipywidgets import HTML, VBox, Layout\n",
+    "from textwrap import fill\n",
+    "\n",
+    "# Create formatted HTML output\n",
+    "html_output = [\"<h3>Meeting Script</h3>\"]\n",
+    "html_output.append(\"<hr>\")\n",
+    "\n",
+    "current_speaker = None\n",
+    "current_text = []\n",
+    "current_start = None\n",
+    "\n",
+    "for segment in transcript_data[\"segments\"]:\n",
+    "    if current_speaker != segment[\"speaker\"]:\n",
+    "        # Output previous speaker's text\n",
+    "        if current_speaker:\n",
+    "            timestamp = format_timestamp(current_start)\n",
+    "            wrapped_text = fill(\" \".join(current_text), width=80)\n",
+    "            html_output.append(f\"<p><b>[{timestamp}] {current_speaker}:</b><br>\")\n",
+    "            html_output.append(f\"{wrapped_text}</p>\")\n",
+    "            html_output.append(\"<hr>\")\n",
+    "\n",
+    "        # Start new speaker\n",
+    "        current_speaker = segment[\"speaker\"]\n",
+    "        current_text = [segment[\"text\"].strip()]\n",
+    "        current_start = segment[\"start\"]\n",
+    "    else:\n",
+    "        # Continue current speaker\n",
+    "        current_text.append(segment[\"text\"].strip())\n",
+    "\n",
+    "# Output final speaker\n",
+    "if current_speaker:\n",
+    "    timestamp = format_timestamp(current_start)\n",
+    "    wrapped_text = fill(\" \".join(current_text), width=80)\n",
+    "    html_output.append(f\"<p><b>[{timestamp}] {current_speaker}:</b><br>\")\n",
+    "    html_output.append(f\"{wrapped_text}</p>\")\n",
+    "    html_output.append(\"<hr>\")\n",
+    "\n",
+    "# Display formatted output\n",
+    "display(\n",
+    "    HTML(\n",
+    "        value=\"\".join(html_output),\n",
+    "        layout=Layout(width=\"100%\", border=\"1px solid gray\", padding=\"10px\"),\n",
+    "    )\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "TGOV Scraper",
+   "language": "python",
+   "name": "tgov-scraper"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/pyproject.toml b/pyproject.toml
@@ -6,7 +6,7 @@ authors = ["jdungan <johnadungan@gmail.com>"]
 readme = "README.md"
 
 [tool.poetry.dependencies]
-python = "^3.10"
+python = "3.11.*"
 selectolax = "^0.3.28"
 aiohttp = "^3.11.13"
 pytest-asyncio = "^0.25.3"
@@ -28,6 +28,10 @@ python-dotenv = "^1.0.1"
 aiofiles = "^24.1.0"
 
 
+[tool.poetry.group.dev.dependencies]
+jupyter = "^1.1.1"
+ipykernel = "^6.29.5"
+
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"