Skip to content

Commit c547000

Browse files
authored
[#13] Add audio preprocessing for uploaded recordings (#33)
* #13 Add implementation plan for audio preprocessing
* #13 Add preprocess_audio field to schema and new dependencies
* #13 Add audio preprocessor service with high-pass, noise reduction, loudness normalization
* #13 Wire audio preprocessor into transcription pipeline
* #13 Accept preprocess_audio form field in upload endpoint
* #13 Add audio preprocessing toggle to upload form
* #13 Add tests for audio preprocessing and preprocess_audio persistence
* #13 Move preprocessed file cleanup to finally block
* #13 Skip audio preprocessor tests when numpy is not installed

  CI does not install ML dependencies. Use pytest.importorskip to gracefully skip when numpy/soundfile/noisereduce are unavailable.
* #13 Document preprocessing stage in progress UI and CLAUDE.md
* #13 Convert non-WAV audio formats via ffmpeg before preprocessing

  soundfile (libsndfile) cannot read m4a/AAC files, causing preprocessing to fail on common upload formats. Convert unsupported formats to WAV using ffmpeg before applying filters.
1 parent 99d1247 commit c547000

File tree

14 files changed

+317
-3
lines changed

14 files changed

+317
-3
lines changed

CLAUDE.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ data/meetings/{id}/ # Per-meeting: metadata.json, transcript.json, audio file
4747

4848
## Key Flows
4949

50-
**Transcription pipeline:** Upload (POST /api/meetings) -> save audio + metadata.json (status=PROCESSING) -> create JobInfo -> spawn daemon thread -> WhisperX transcribe -> align timestamps -> PyAnnote diarize -> save transcript.json -> update metadata (status=READY)
50+
**Transcription pipeline:** Upload (POST /api/meetings) -> save audio + metadata.json (status=PROCESSING) -> create JobInfo -> spawn daemon thread -> preprocess audio (if enabled: high-pass filter, noise reduction, loudness normalization) -> WhisperX transcribe -> align timestamps -> PyAnnote diarize -> save transcript.json -> update metadata (status=READY)
5151

5252
**Frontend polling:** transcript-viewer.js polls GET /api/jobs/{jobId} every 3s -> shows progress bar -> auto-navigates on completion
5353

backend/routers/meetings.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ async def create_meeting(
8181
meeting_type: str = Form("other"),
8282
language: str = Form("auto"),
8383
num_speakers: str = Form("auto"),
84+
preprocess_audio: str = Form("true"),
8485
):
8586
# Validate file extension
8687
ext = Path(file.filename).suffix.lower()
@@ -122,13 +123,16 @@ async def create_meeting(
122123
except ValueError:
123124
pass
124125

126+
effective_preprocess = preprocess_audio.lower() not in ("false", "0", "no")
127+
125128
metadata = MeetingMetadata(
126129
id=meeting_id,
127130
title=effective_title,
128131
type=mt,
129132
audio_filename=audio_filename,
130133
language=effective_language,
131134
num_speakers=effective_num_speakers,
135+
preprocess_audio=effective_preprocess,
132136
status=MeetingStatus.PROCESSING,
133137
job_id=job.id,
134138
)

backend/schemas.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ class JobStatus(str, Enum):
2828

2929
class JobStage(str, Enum):
3030
UPLOADING = "uploading"
31+
PREPROCESSING = "preprocessing"
3132
TRANSCRIBING = "transcribing"
3233
ALIGNING = "aligning"
3334
DIARIZING = "diarizing"
@@ -56,6 +57,7 @@ class MeetingMetadata(BaseModel):
5657
status: MeetingStatus = MeetingStatus.PROCESSING
5758
language: str = "auto"
5859
num_speakers: int | None = None
60+
preprocess_audio: bool = True
5961
job_id: str | None = None
6062
speakers: dict[str, str] = Field(default_factory=dict)
6163
error: str | None = None
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
from __future__ import annotations
2+
3+
import logging
4+
import subprocess
5+
from pathlib import Path
6+
7+
logger = logging.getLogger(__name__)
8+
9+
HIGHPASS_CUTOFF_HZ = 80
10+
NOISE_PROP_DECREASE = 0.75
11+
TARGET_LUFS = -23.0
12+
13+
SOUNDFILE_FORMATS = {".wav", ".flac", ".ogg", ".aiff", ".aif"}
14+
15+
16+
def _convert_to_wav(audio_path: Path) -> Path:
    """Convert non-WAV audio to WAV using ffmpeg. Returns path to the converted file."""
    target = audio_path.parent / "audio_converted.wav"
    # 16 kHz mono is the sample format the downstream speech models consume.
    command = [
        "ffmpeg",
        "-y",  # overwrite any stale conversion from a previous run
        "-i", str(audio_path),
        "-ar", "16000",  # resample to 16 kHz
        "-ac", "1",      # downmix to mono
        str(target),
    ]
    # capture_output keeps ffmpeg's chatty stderr out of the service logs;
    # check=True raises CalledProcessError if the conversion fails.
    subprocess.run(command, check=True, capture_output=True)
    return target
25+
26+
27+
def preprocess_audio(audio_path: Path) -> Path:
    """Apply audio preprocessing: high-pass filter, noise reduction, loudness normalization.

    Formats libsndfile cannot read (e.g. m4a/AAC) are first converted to WAV
    via ffmpeg; that temporary conversion is removed in a ``finally`` block so
    it cannot leak when a processing step raises.

    Args:
        audio_path: Path to the uploaded audio file. The original is never modified.

    Returns:
        Path to the preprocessed WAV file, saved as ``audio_preprocessed.wav``
        alongside the original.
    """
    # Heavy audio/ML deps are imported lazily so this module can be imported
    # (and tests collected) in environments without them installed.
    import numpy as np
    import soundfile as sf
    from scipy.signal import butter, sosfilt

    logger.info("Preprocessing audio: %s", audio_path.name)

    converted_path: Path | None = None
    try:
        if audio_path.suffix.lower() not in SOUNDFILE_FORMATS:
            logger.info("Converting %s to WAV via ffmpeg", audio_path.suffix)
            converted_path = _convert_to_wav(audio_path)
            read_path = converted_path
        else:
            read_path = audio_path

        data, sample_rate = sf.read(read_path, dtype="float64")

        # Convert stereo to mono if needed
        if data.ndim > 1:
            data = np.mean(data, axis=1)

        # 1. High-pass filter (80 Hz, 4th-order Butterworth) to cut low-frequency rumble
        sos = butter(4, HIGHPASS_CUTOFF_HZ, btype="high", fs=sample_rate, output="sos")
        data = sosfilt(sos, data)

        # 2. Noise reduction (conservative: prop_decrease < 1 avoids degrading clean audio)
        import noisereduce as nr

        data = nr.reduce_noise(
            y=data,
            sr=sample_rate,
            prop_decrease=NOISE_PROP_DECREASE,
            stationary=False,
        )

        # 3. Loudness normalization to -23 LUFS
        import pyloudnorm as pyln

        meter = pyln.Meter(sample_rate)
        loudness = meter.integrated_loudness(data)

        # Silence (or near-silence) yields -inf integrated loudness; skip
        # normalization rather than scaling by an infinite gain.
        if not np.isinf(loudness):
            data = pyln.normalize.loudness(data, loudness, TARGET_LUFS)

        # Save preprocessed copy alongside the original
        output_path = audio_path.parent / "audio_preprocessed.wav"
        sf.write(str(output_path), data, sample_rate)
    finally:
        # Fix: previously this cleanup ran only on success, leaking the
        # ffmpeg-converted temp file whenever a processing step raised.
        if converted_path and converted_path.exists():
            converted_path.unlink()

    logger.info("Preprocessed audio saved: %s", output_path.name)
    return output_path

backend/services/transcriber.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,13 +39,21 @@ def _run_transcription(meeting_id: str, job_id: str):
3939
"""Run transcription in a background thread."""
4040
meeting_dir = MEETINGS_DIR / meeting_id
4141
metadata_path = meeting_dir / "metadata.json"
42+
preprocessed_path = None
4243

4344
try:
4445
with open(metadata_path) as f:
4546
metadata = MeetingMetadata(**json.load(f))
4647

4748
audio_path = meeting_dir / metadata.audio_filename
4849

50+
if metadata.preprocess_audio:
51+
job_queue.update_job(job_id, status=JobStatus.PROCESSING, stage="preprocessing", progress=5)
52+
from backend.services.audio_preprocessor import preprocess_audio
53+
54+
preprocessed_path = preprocess_audio(audio_path)
55+
audio_path = preprocessed_path
56+
4957
job_queue.update_job(job_id, status=JobStatus.PROCESSING, stage="transcribing", progress=10)
5058

5159
# Import heavy deps only when needed
@@ -217,6 +225,10 @@ def _patched_torch_load(*args, **kwargs):
217225
except Exception:
218226
pass
219227

228+
finally:
229+
if preprocessed_path and preprocessed_path.exists():
230+
preprocessed_path.unlink()
231+
220232

221233
def start_transcription(meeting_id: str, job_id: str):
222234
"""Start transcription in a background thread."""
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
# Plan: Audio Preprocessing
2+
3+
**Story**: #13
4+
**Spec**: `docs/specs/transcription-quality-improvements.md` — US-1
5+
**Branch**: `feature/13-audio-preprocessing`
6+
**Date**: 2026-03-14
7+
**Mode**: Standard — core logic is straightforward audio/numpy operations
8+
9+
## Technical Decisions
10+
11+
### TD-1: Preprocessing as a separate service module
12+
- **Context**: Preprocessing is a distinct pipeline stage
13+
- **Decision**: New `backend/services/audio_preprocessor.py`
14+
- **Alternatives considered**: Inline in transcriber.py — rejected for separation of concerns
15+
16+
### TD-2: Preprocessed file stored alongside original as WAV
17+
- **Context**: Original must be preserved (T3); preprocessed audio needs to be on disk for WhisperX
18+
- **Decision**: Save as `audio_preprocessed.wav` in meeting directory, clean up after transcription
19+
- **Alternatives considered**: In-memory numpy array — rejected because WhisperX `load_audio` expects a file path
20+
21+
## Files to Create or Modify
22+
23+
- `backend/services/audio_preprocessor.py` — new: high-pass filter, noise reduction, loudness normalization
24+
- `backend/schemas.py` — add `preprocess_audio: bool = True` to MeetingMetadata
25+
- `backend/routers/meetings.py` — accept `preprocess_audio` form field
26+
- `backend/services/transcriber.py` — call preprocessor, use preprocessed audio path
27+
- `frontend/js/components/upload.js` — add preprocessing toggle checkbox
28+
- `frontend/js/api.js` — pass `preprocess_audio` in createMeeting
29+
- `requirements.txt` — add noisereduce and pyloudnorm
30+
- `tests/unit/test_audio_preprocessor.py` — new: unit tests
31+
- `tests/integration/test_meetings.py` — test preprocess_audio persistence
32+
33+
## Approach per AC
34+
35+
### AC 1: Audio undergoes preprocessing (high-pass, noise reduction, loudness normalization)
36+
Load audio with soundfile, apply 80Hz butterworth high-pass (scipy), noisereduce with prop_decrease=0.75 (T1/BR-3), pyloudnorm to -23 LUFS. Save as WAV working copy.
37+
38+
### AC 2: Preprocessing enabled by default
39+
`preprocess_audio` field defaults to `True` in MeetingMetadata.
40+
41+
### AC 3: Upload form toggle to disable preprocessing
42+
Checkbox in upload form, checked by default.
43+
44+
### AC 4: Conservative noise reduction doesn't degrade clean audio
45+
Enforced by prop_decrease=0.75 setting.
46+
47+
### AC 5: `preprocess_audio` field persisted in MeetingMetadata
48+
New Pydantic field with default True.
49+
50+
### AC 6: New dependencies
51+
Add noisereduce>=3.0.0 and pyloudnorm>=0.1.1 to requirements.txt.
52+
53+
## Commit Sequence
54+
55+
1. Add preprocess_audio field to schema + requirements
56+
2. Add audio_preprocessor.py service
57+
3. Wire preprocessor into transcription pipeline
58+
4. Add preprocess_audio to upload endpoint
59+
5. Add preprocessing toggle to frontend
60+
6. Add tests
61+
62+
## Risks and Trade-offs
63+
64+
- soundfile and scipy are transitive deps of whisperx — no explicit addition needed
65+
- Preprocessed file is always WAV regardless of input format
66+
67+
## Deviations from Spec
68+
69+
- None anticipated
70+
71+
## Deviations from Plan
72+
73+
_Populated after implementation._

frontend/css/styles.css

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -353,6 +353,26 @@ body {
353353
border-color: var(--primary);
354354
}
355355

356+
.form-checkbox label {
357+
display: flex;
358+
align-items: center;
359+
gap: 8px;
360+
cursor: pointer;
361+
font-size: 14px;
362+
color: var(--text);
363+
}
364+
365+
.form-checkbox input[type="checkbox"] {
366+
width: auto;
367+
margin: 0;
368+
accent-color: var(--primary);
369+
}
370+
371+
.form-hint {
372+
font-size: 12px;
373+
color: var(--text-muted);
374+
}
375+
356376
/* Audio Player */
357377
.audio-player {
358378
background: var(--bg-surface);

frontend/js/api.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,14 @@ const API = {
1111
return res.json();
1212
},
1313

14-
async createMeeting(file, title, meetingType, language, numSpeakers) {
14+
async createMeeting(file, title, meetingType, language, numSpeakers, preprocessAudio = true) {
1515
const form = new FormData();
1616
form.append('file', file);
1717
form.append('title', title || '');
1818
form.append('meeting_type', meetingType || 'other');
1919
form.append('language', language || 'auto');
2020
form.append('num_speakers', numSpeakers || 'auto');
21+
form.append('preprocess_audio', preprocessAudio ? 'true' : 'false');
2122
const res = await fetch('/api/meetings', { method: 'POST', body: form });
2223
if (!res.ok) {
2324
const err = await res.json();

frontend/js/components/transcript-viewer.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,7 @@ async function updateProgress(meetingId, jobId) {
337337

338338
const stageLabels = {
339339
uploading: 'Uploading...',
340+
preprocessing: 'Preprocessing audio...',
340341
transcribing: 'Transcribing audio...',
341342
aligning: 'Aligning timestamps...',
342343
diarizing: 'Identifying speakers...',

frontend/js/components/upload.js

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,13 @@ function renderUpload(container) {
6161
<input type="text" id="speakers-input" placeholder="Auto" value="">
6262
</div>
6363
</div>
64+
<div class="form-group form-checkbox">
65+
<label>
66+
<input type="checkbox" id="preprocess-checkbox" checked>
67+
Audio preprocessing
68+
<span class="form-hint">High-pass filter, noise reduction, and loudness normalization</span>
69+
</label>
70+
</div>
6471
<button type="submit" id="upload-btn" class="btn btn-primary btn-large" disabled>
6572
Upload & Transcribe
6673
</button>
@@ -140,8 +147,9 @@ async function handleUpload(e) {
140147
const type = document.getElementById('type-select').value;
141148
const language = document.getElementById('language-select').value;
142149
const numSpeakers = document.getElementById('speakers-input').value.trim() || 'auto';
150+
const preprocessAudio = document.getElementById('preprocess-checkbox').checked;
143151
requestNotificationPermission();
144-
const result = await API.createMeeting(selectedFile, title, type, language, numSpeakers);
152+
const result = await API.createMeeting(selectedFile, title, type, language, numSpeakers, preprocessAudio);
145153
App.navigate(`/meetings/${result.meeting_id}`);
146154
} catch (err) {
147155
showToast(err.message, 'error');

0 commit comments

Comments
 (0)