Skip to content

Commit c547000

Browse files
authored
[#13] Add audio preprocessing for uploaded recordings (#33)
* #13 Add implementation plan for audio preprocessing
* #13 Add preprocess_audio field to schema and new dependencies
* #13 Add audio preprocessor service with high-pass, noise reduction, loudness normalization
* #13 Wire audio preprocessor into transcription pipeline
* #13 Accept preprocess_audio form field in upload endpoint
* #13 Add audio preprocessing toggle to upload form
* #13 Add tests for audio preprocessing and preprocess_audio persistence
* #13 Move preprocessed file cleanup to finally block
* #13 Skip audio preprocessor tests when numpy is not installed

  CI does not install ML dependencies. Use pytest.importorskip to gracefully skip when numpy/soundfile/noisereduce are unavailable.
* #13 Document preprocessing stage in progress UI and CLAUDE.md
* #13 Convert non-WAV audio formats via ffmpeg before preprocessing

  soundfile (libsndfile) cannot read m4a/AAC files, causing preprocessing to fail on common upload formats. Convert unsupported formats to WAV using ffmpeg before applying filters.
1 parent 99d1247 commit c547000

File tree

14 files changed

+317
-3
lines changed

14 files changed

+317
-3
lines changed

CLAUDE.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ data/meetings/{id}/ # Per-meeting: metadata.json, transcript.json, audio file
4747

4848
## Key Flows
4949

50-
**Transcription pipeline:** Upload (POST /api/meetings) -> save audio + metadata.json (status=PROCESSING) -> create JobInfo -> spawn daemon thread -> WhisperX transcribe -> align timestamps -> PyAnnote diarize -> save transcript.json -> update metadata (status=READY)
50+
**Transcription pipeline:** Upload (POST /api/meetings) -> save audio + metadata.json (status=PROCESSING) -> create JobInfo -> spawn daemon thread -> preprocess audio (if enabled: high-pass filter, noise reduction, loudness normalization) -> WhisperX transcribe -> align timestamps -> PyAnnote diarize -> save transcript.json -> update metadata (status=READY)
5151

5252
**Frontend polling:** transcript-viewer.js polls GET /api/jobs/{jobId} every 3s -> shows progress bar -> auto-navigates on completion
5353

backend/routers/meetings.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ async def create_meeting(
8181
meeting_type: str = Form("other"),
8282
language: str = Form("auto"),
8383
num_speakers: str = Form("auto"),
84+
preprocess_audio: str = Form("true"),
8485
):
8586
# Validate file extension
8687
ext = Path(file.filename).suffix.lower()
@@ -122,13 +123,16 @@ async def create_meeting(
122123
except ValueError:
123124
pass
124125

126+
effective_preprocess = preprocess_audio.lower() not in ("false", "0", "no")
127+
125128
metadata = MeetingMetadata(
126129
id=meeting_id,
127130
title=effective_title,
128131
type=mt,
129132
audio_filename=audio_filename,
130133
language=effective_language,
131134
num_speakers=effective_num_speakers,
135+
preprocess_audio=effective_preprocess,
132136
status=MeetingStatus.PROCESSING,
133137
job_id=job.id,
134138
)

backend/schemas.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ class JobStatus(str, Enum):
2828

2929
class JobStage(str, Enum):
3030
UPLOADING = "uploading"
31+
PREPROCESSING = "preprocessing"
3132
TRANSCRIBING = "transcribing"
3233
ALIGNING = "aligning"
3334
DIARIZING = "diarizing"
@@ -56,6 +57,7 @@ class MeetingMetadata(BaseModel):
5657
status: MeetingStatus = MeetingStatus.PROCESSING
5758
language: str = "auto"
5859
num_speakers: int | None = None
60+
preprocess_audio: bool = True
5961
job_id: str | None = None
6062
speakers: dict[str, str] = Field(default_factory=dict)
6163
error: str | None = None
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
from __future__ import annotations
2+
3+
import logging
4+
import subprocess
5+
from pathlib import Path
6+
7+
logger = logging.getLogger(__name__)
8+
9+
HIGHPASS_CUTOFF_HZ = 80
10+
NOISE_PROP_DECREASE = 0.75
11+
TARGET_LUFS = -23.0
12+
13+
SOUNDFILE_FORMATS = {".wav", ".flac", ".ogg", ".aiff", ".aif"}
14+
15+
16+
def _convert_to_wav(audio_path: Path) -> Path:
    """Convert non-WAV audio to WAV using ffmpeg. Returns path to the converted file."""
    target = audio_path.parent / "audio_converted.wav"
    # 16 kHz mono is the sample format the downstream speech models consume.
    command = [
        "ffmpeg",
        "-y",  # overwrite any stale conversion from a previous run
        "-i", str(audio_path),
        "-ar", "16000",  # resample to 16 kHz
        "-ac", "1",      # downmix to mono
        str(target),
    ]
    # capture_output keeps ffmpeg's chatty stderr out of the service logs;
    # check=True raises CalledProcessError if the conversion fails.
    subprocess.run(command, check=True, capture_output=True)
    return target
25+
26+
27+
def preprocess_audio(audio_path: Path) -> Path:
    """Apply audio preprocessing: high-pass filter, noise reduction, loudness normalization.

    Formats libsndfile cannot read (e.g. m4a/AAC) are first converted to WAV
    via ffmpeg; that temporary conversion is removed in a ``finally`` block so
    it cannot leak when a processing step raises.

    Args:
        audio_path: Path to the uploaded audio file. The original is never modified.

    Returns:
        Path to the preprocessed WAV file, saved as ``audio_preprocessed.wav``
        alongside the original.
    """
    # Heavy audio/ML deps are imported lazily so this module can be imported
    # (and tests collected) in environments without them installed.
    import numpy as np
    import soundfile as sf
    from scipy.signal import butter, sosfilt

    logger.info("Preprocessing audio: %s", audio_path.name)

    converted_path: Path | None = None
    try:
        if audio_path.suffix.lower() not in SOUNDFILE_FORMATS:
            logger.info("Converting %s to WAV via ffmpeg", audio_path.suffix)
            converted_path = _convert_to_wav(audio_path)
            read_path = converted_path
        else:
            read_path = audio_path

        data, sample_rate = sf.read(read_path, dtype="float64")

        # Convert stereo to mono if needed
        if data.ndim > 1:
            data = np.mean(data, axis=1)

        # 1. High-pass filter (80 Hz, 4th-order Butterworth) to cut low-frequency rumble
        sos = butter(4, HIGHPASS_CUTOFF_HZ, btype="high", fs=sample_rate, output="sos")
        data = sosfilt(sos, data)

        # 2. Noise reduction (conservative: prop_decrease < 1 avoids degrading clean audio)
        import noisereduce as nr

        data = nr.reduce_noise(
            y=data,
            sr=sample_rate,
            prop_decrease=NOISE_PROP_DECREASE,
            stationary=False,
        )

        # 3. Loudness normalization to -23 LUFS
        import pyloudnorm as pyln

        meter = pyln.Meter(sample_rate)
        loudness = meter.integrated_loudness(data)

        # Silence (or near-silence) yields -inf integrated loudness; skip
        # normalization rather than scaling by an infinite gain.
        if not np.isinf(loudness):
            data = pyln.normalize.loudness(data, loudness, TARGET_LUFS)

        # Save preprocessed copy alongside the original
        output_path = audio_path.parent / "audio_preprocessed.wav"
        sf.write(str(output_path), data, sample_rate)
    finally:
        # Fix: previously this cleanup ran only on success, leaking the
        # ffmpeg-converted temp file whenever a processing step raised.
        if converted_path and converted_path.exists():
            converted_path.unlink()

    logger.info("Preprocessed audio saved: %s", output_path.name)
    return output_path

backend/services/transcriber.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,13 +39,21 @@ def _run_transcription(meeting_id: str, job_id: str):
3939
"""Run transcription in a background thread."""
4040
meeting_dir = MEETINGS_DIR / meeting_id
4141
metadata_path = meeting_dir / "metadata.json"
42+
preprocessed_path = None
4243

4344
try:
4445
with open(metadata_path) as f:
4546
metadata = MeetingMetadata(**json.load(f))
4647

4748
audio_path = meeting_dir / metadata.audio_filename
4849

50+
if metadata.preprocess_audio:
51+
job_queue.update_job(job_id, status=JobStatus.PROCESSING, stage="preprocessing", progress=5)
52+
from backend.services.audio_preprocessor import preprocess_audio
53+
54+
preprocessed_path = preprocess_audio(audio_path)
55+
audio_path = preprocessed_path
56+
4957
job_queue.update_job(job_id, status=JobStatus.PROCESSING, stage="transcribing", progress=10)
5058

5159
# Import heavy deps only when needed
@@ -217,6 +225,10 @@ def _patched_torch_load(*args, **kwargs):
217225
except Exception:
218226
pass
219227

228+
finally:
229+
if preprocessed_path and preprocessed_path.exists():
230+
preprocessed_path.unlink()
231+
220232

221233
def start_transcription(meeting_id: str, job_id: str):
222234
"""Start transcription in a background thread."""
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
# Plan: Audio Preprocessing
2+
3+
**Story**: #13
4+
**Spec**: `docs/specs/transcription-quality-improvements.md` — US-1
5+
**Branch**: `feature/13-audio-preprocessing`
6+
**Date**: 2026-03-14
7+
**Mode**: Standard — core logic is straightforward audio/numpy operations
8+
9+
## Technical Decisions
10+
11+
### TD-1: Preprocessing as a separate service module
12+
- **Context**: Preprocessing is a distinct pipeline stage
13+
- **Decision**: New `backend/services/audio_preprocessor.py`
14+
- **Alternatives considered**: Inline in transcriber.py — rejected for separation of concerns
15+
16+
### TD-2: Preprocessed file stored alongside original as WAV
17+
- **Context**: Original must be preserved (T3); preprocessed audio needs to be on disk for WhisperX
18+
- **Decision**: Save as `audio_preprocessed.wav` in meeting directory, clean up after transcription
19+
- **Alternatives considered**: In-memory numpy array — rejected because WhisperX `load_audio` expects a file path
20+
21+
## Files to Create or Modify
22+
23+
- `backend/services/audio_preprocessor.py` — new: high-pass filter, noise reduction, loudness normalization
24+
- `backend/schemas.py` — add `preprocess_audio: bool = True` to MeetingMetadata
25+
- `backend/routers/meetings.py` — accept `preprocess_audio` form field
26+
- `backend/services/transcriber.py` — call preprocessor, use preprocessed audio path
27+
- `frontend/js/components/upload.js` — add preprocessing toggle checkbox
28+
- `frontend/js/api.js` — pass `preprocess_audio` in createMeeting
29+
- `requirements.txt` — add noisereduce and pyloudnorm
30+
- `tests/unit/test_audio_preprocessor.py` — new: unit tests
31+
- `tests/integration/test_meetings.py` — test preprocess_audio persistence
32+
33+
## Approach per AC
34+
35+
### AC 1: Audio undergoes preprocessing (high-pass, noise reduction, loudness normalization)
36+
Load audio with soundfile, apply 80Hz butterworth high-pass (scipy), noisereduce with prop_decrease=0.75 (T1/BR-3), pyloudnorm to -23 LUFS. Save as WAV working copy.
37+
38+
### AC 2: Preprocessing enabled by default
39+
`preprocess_audio` field defaults to `True` in MeetingMetadata.
40+
41+
### AC 3: Upload form toggle to disable preprocessing
42+
Checkbox in upload form, checked by default.
43+
44+
### AC 4: Conservative noise reduction doesn't degrade clean audio
45+
Enforced by prop_decrease=0.75 setting.
46+
47+
### AC 5: `preprocess_audio` field persisted in MeetingMetadata
48+
New Pydantic field with default True.
49+
50+
### AC 6: New dependencies
51+
Add noisereduce>=3.0.0 and pyloudnorm>=0.1.1 to requirements.txt.
52+
53+
## Commit Sequence
54+
55+
1. Add preprocess_audio field to schema + requirements
56+
2. Add audio_preprocessor.py service
57+
3. Wire preprocessor into transcription pipeline
58+
4. Add preprocess_audio to upload endpoint
59+
5. Add preprocessing toggle to frontend
60+
6. Add tests
61+
62+
## Risks and Trade-offs
63+
64+
- soundfile and scipy are transitive deps of whisperx — no explicit addition needed
65+
- Preprocessed file is always WAV regardless of input format
66+
67+
## Deviations from Spec
68+
69+
- None anticipated
70+
71+
## Deviations from Plan
72+
73+
_Populated after implementation._

frontend/css/styles.css

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -353,6 +353,26 @@ body {
353353
border-color: var(--primary);
354354
}
355355

356+
.form-checkbox label {
357+
display: flex;
358+
align-items: center;
359+
gap: 8px;
360+
cursor: pointer;
361+
font-size: 14px;
362+
color: var(--text);
363+
}
364+
365+
.form-checkbox input[type="checkbox"] {
366+
width: auto;
367+
margin: 0;
368+
accent-color: var(--primary);
369+
}
370+
371+
.form-hint {
372+
font-size: 12px;
373+
color: var(--text-muted);
374+
}
375+
356376
/* Audio Player */
357377
.audio-player {
358378
background: var(--bg-surface);

frontend/js/api.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,14 @@ const API = {
1111
return res.json();
1212
},
1313

14-
async createMeeting(file, title, meetingType, language, numSpeakers) {
14+
async createMeeting(file, title, meetingType, language, numSpeakers, preprocessAudio = true) {
1515
const form = new FormData();
1616
form.append('file', file);
1717
form.append('title', title || '');
1818
form.append('meeting_type', meetingType || 'other');
1919
form.append('language', language || 'auto');
2020
form.append('num_speakers', numSpeakers || 'auto');
21+
form.append('preprocess_audio', preprocessAudio ? 'true' : 'false');
2122
const res = await fetch('/api/meetings', { method: 'POST', body: form });
2223
if (!res.ok) {
2324
const err = await res.json();

frontend/js/components/transcript-viewer.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,7 @@ async function updateProgress(meetingId, jobId) {
337337

338338
const stageLabels = {
339339
uploading: 'Uploading...',
340+
preprocessing: 'Preprocessing audio...',
340341
transcribing: 'Transcribing audio...',
341342
aligning: 'Aligning timestamps...',
342343
diarizing: 'Identifying speakers...',

frontend/js/components/upload.js

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,13 @@ function renderUpload(container) {
6161
<input type="text" id="speakers-input" placeholder="Auto" value="">
6262
</div>
6363
</div>
64+
<div class="form-group form-checkbox">
65+
<label>
66+
<input type="checkbox" id="preprocess-checkbox" checked>
67+
Audio preprocessing
68+
<span class="form-hint">High-pass filter, noise reduction, and loudness normalization</span>
69+
</label>
70+
</div>
6471
<button type="submit" id="upload-btn" class="btn btn-primary btn-large" disabled>
6572
Upload & Transcribe
6673
</button>
@@ -140,8 +147,9 @@ async function handleUpload(e) {
140147
const type = document.getElementById('type-select').value;
141148
const language = document.getElementById('language-select').value;
142149
const numSpeakers = document.getElementById('speakers-input').value.trim() || 'auto';
150+
const preprocessAudio = document.getElementById('preprocess-checkbox').checked;
143151
requestNotificationPermission();
144-
const result = await API.createMeeting(selectedFile, title, type, language, numSpeakers);
152+
const result = await API.createMeeting(selectedFile, title, type, language, numSpeakers, preprocessAudio);
145153
App.navigate(`/meetings/${result.meeting_id}`);
146154
} catch (err) {
147155
showToast(err.message, 'error');

0 commit comments

Comments
 (0)