Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ data/audio/
data/video/
data/transcripts/
models/
notebooks/.ipynb_checkpoints/

# Python
__pycache__/
Expand Down
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,26 +15,26 @@ poetry self add poetry-plugin-shell
poetry shell

# Install Jupyter kernel for this environment (needed for Jupyter notebooks)
poetry run python -m ipykernel install --user --name=tgov-scraper --display-name="TGOV Scraper"
python -m ipykernel install --user --name=tgov-scraper --display-name="TGOV Scraper"
```

## Running

```bash
poetry run jupyter notebook
jupyter notebook
```

## Running Tests

```bash
# Run all tests
poetry run pytest
pytest

# Run specific tests
poetry run pytest tests/test_meetings.py
pytest tests/test_meetings.py

# Run tests with verbose output
poetry run pytest -v
pytest -v
```

## Project Structure
Expand Down
329 changes: 329 additions & 0 deletions notebooks/roll_call.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,329 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### This notebook is for focusing on a roll call to see how it is transcribed\n",
"\n",
"Recognizing short words by different speakers is difficult. This notebook focuses in a roll call vote to see if changing model parameters can improve it. \n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import pandas as pd\n",
"sys.path.append(\"../\")\n",
"from pathlib import Path"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### use ffmpeg to get a section of a meeting\n",
"This 30 second clip is a roll call vote"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Clip successfully extracted to: ../data/video/regular_council_meeting___2025_02_26_clip_4-50_to_5-20.mp4\n"
]
}
],
"source": [
"import subprocess\n",
"from pathlib import Path\n",
"\n",
"# Input and output file paths\n",
"input_file = Path(\"../data/video/regular_council_meeting___2025_02_26.mp4\")\n",
"clip_file = Path(\"../data/video/regular_council_meeting___2025_02_26_clip_4-50_to_5-20.mp4\")\n",
"\n",
"# Parameters for clip extraction\n",
"start_time = \"4:50\"\n",
"duration = \"30\" # 30 seconds\n",
"\n",
"# Run FFmpeg command\n",
"result = subprocess.run(\n",
" [\n",
" \"ffmpeg\",\n",
" \"-i\",\n",
" str(input_file),\n",
" \"-ss\",\n",
" start_time,\n",
" \"-t\",\n",
" duration,\n",
" \"-c\",\n",
" \"copy\", # Copy codec (fast but might not be frame accurate)\n",
" \"-avoid_negative_ts\",\n",
" \"1\",\n",
" str(clip_file),\n",
" \"-y\", # Overwrite if exists\n",
" ],\n",
" capture_output=True,\n",
" text=True,\n",
")\n",
"\n",
"# Check if command was successful\n",
"if result.returncode == 0:\n",
" print(f\"Clip successfully extracted to: {clip_file}\")\n",
"else:\n",
" print(f\"Error extracting clip: {result.stderr}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### experiment with model parameters\n",
"\n",
"using these setting actually made the results worse:\n",
"- min_speakers=3, # Specify at least 3 speakers\n",
"- max_speakers=15, # Limit to at most 10 speakers\n",
"- diarize_min_duration=0.1, # Shorter minimum segment duration\n",
"I also tested with medium, and large versions but the results using tiny were the same\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:src.videos:Transcribing video with speaker diarization: ../data/video/regular_council_meeting___2025_02_26_clip_4-50_to_5-20.mp4\n",
"INFO:src.videos:Output will be saved to: ../data/transcripts/regular_council_meeting___2025_02_26_clip_4-50_to_5-20.diarized.json\n",
"INFO:src.huggingface:Auto-detected device: cpu\n",
"INFO:src.huggingface:Auto-selected compute_type: int8\n",
"INFO:src.huggingface:Loading WhisperX model: tiny on cpu with int8 precision\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "168afa65d3ae4108af591eb1993fe482",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"tokenizer.json: 0%| | 0.00/2.20M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "89d35faecb8e447db3ccb95407e2a775",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"config.json: 0%| | 0.00/2.25k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f616039556ee46aaaee2f975f016aeb0",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"vocabulary.txt: 0%| | 0.00/460k [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "50bd4e88d6084638b91847587cc9ed0a",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"model.bin: 0%| | 0.00/75.5M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../Library/Caches/pypoetry/virtualenvs/tgov_scraper-zRR99ne3-py3.11/lib/python3.11/site-packages/whisperx/assets/pytorch_model.bin`\n",
"INFO:src.huggingface:Loading diarization pipeline\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"No language specified, language will be first be detected for each audio file (increases inference time).\n",
">>Performing voice activity detection using Pyannote...\n",
"Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.\n",
"Model was trained with torch 1.10.0+cu102, yours is 2.4.1. Bad things might happen unless you revert torch to 1.x.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:src.huggingface:WhisperX model loaded in 4.50 seconds\n",
"INFO:src.videos:Running initial transcription with batch size 8...\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Detected language: en (0.99) in first 30s of audio...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:src.videos:Detected language: en\n",
"INFO:src.videos:Loading alignment model for detected language: en\n",
"INFO:src.videos:Aligning transcription with audio...\n",
"INFO:src.videos:Running speaker diarization...\n",
"/Users/owner/Library/Caches/pypoetry/virtualenvs/tgov_scraper-zRR99ne3-py3.11/lib/python3.11/site-packages/pyannote/audio/models/blocks/pooling.py:104: UserWarning: std(): degrees of freedom is <= 0. Correction should be strictly less than the reduction factor (input numel divided by output numel). (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/native/ReduceOps.cpp:1808.)\n",
" std = sequences.std(dim=-1, correction=1)\n",
"INFO:src.videos:Assigning speakers to transcription...\n",
"INFO:src.videos:Processing transcription segments...\n",
"INFO:src.videos:Diarized transcription completed in 30.03 seconds\n",
"INFO:src.videos:Detailed JSON saved to: ../data/transcripts/regular_council_meeting___2025_02_26_clip_4-50_to_5-20.diarized.json\n"
]
}
],
"source": [
"from src.videos import transcribe_video_with_diarization\n",
"\n",
"transcription_dir = Path(\"../data/transcripts\")\n",
"\n",
"transcript_data = await transcribe_video_with_diarization(\n",
" clip_file,\n",
" transcription_dir,\n",
" model_size=\"tiny\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5d97ff70c1c3409da83c10c478f2bfaa",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HTML(value='<h3>Meeting Script</h3><hr><p><b>[00:00:00] SPEAKER_01:</b><br>Thank you, Mr. Huffinds. Any counci…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"def format_timestamp(seconds: float) -> str:\n",
" \"\"\"Convert seconds to HH:MM:SS format\"\"\"\n",
" hours = int(seconds // 3600)\n",
" minutes = int((seconds % 3600) // 60)\n",
" secs = int(seconds % 60)\n",
" return f\"{hours:02d}:{minutes:02d}:{secs:02d}\"\n",
"\n",
"\n",
"from ipywidgets import HTML, VBox, Layout\n",
"from textwrap import fill\n",
"\n",
"# Create formatted HTML output\n",
"html_output = [\"<h3>Meeting Script</h3>\"]\n",
"html_output.append(\"<hr>\")\n",
"\n",
"current_speaker = None\n",
"current_text = []\n",
"current_start = None\n",
"\n",
"for segment in transcript_data[\"segments\"]:\n",
" if current_speaker != segment[\"speaker\"]:\n",
" # Output previous speaker's text\n",
" if current_speaker:\n",
" timestamp = format_timestamp(current_start)\n",
" wrapped_text = fill(\" \".join(current_text), width=80)\n",
" html_output.append(f\"<p><b>[{timestamp}] {current_speaker}:</b><br>\")\n",
" html_output.append(f\"{wrapped_text}</p>\")\n",
" html_output.append(\"<hr>\")\n",
"\n",
" # Start new speaker\n",
" current_speaker = segment[\"speaker\"]\n",
" current_text = [segment[\"text\"].strip()]\n",
" current_start = segment[\"start\"]\n",
" else:\n",
" # Continue current speaker\n",
" current_text.append(segment[\"text\"].strip())\n",
"\n",
"# Output final speaker\n",
"if current_speaker:\n",
" timestamp = format_timestamp(current_start)\n",
" wrapped_text = fill(\" \".join(current_text), width=80)\n",
" html_output.append(f\"<p><b>[{timestamp}] {current_speaker}:</b><br>\")\n",
" html_output.append(f\"{wrapped_text}</p>\")\n",
" html_output.append(\"<hr>\")\n",
"\n",
"# Display formatted output\n",
"display(\n",
" HTML(\n",
" value=\"\".join(html_output),\n",
" layout=Layout(width=\"100%\", border=\"1px solid gray\", padding=\"10px\"),\n",
" )\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "TGOV Scraper",
"language": "python",
"name": "tgov-scraper"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
6 changes: 5 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ authors = ["jdungan <johnadungan@gmail.com>"]
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.10"
python = "3.11.*"
selectolax = "^0.3.28"
aiohttp = "^3.11.13"
pytest-asyncio = "^0.25.3"
Expand All @@ -28,6 +28,10 @@ python-dotenv = "^1.0.1"
aiofiles = "^24.1.0"


[tool.poetry.group.dev.dependencies]
jupyter = "^1.1.1"
ipykernel = "^6.29.5"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"