CCExtractor · Rahul-2k4 · Jan 2, 2026 · Jan 2, 2026 · Jan 2, 2026 · Jan 2, 2026
@@ -53,3 +53,9 @@ monkeytype.sqlite3
 
 # Test related data
 temp/
+
+# MCP & Claude sensitive files (do NOT commit tokens)
+.claude/
+*.env
+mcp-*.log
+ecosystem.config.js
@@ -0,0 +1,130 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Project Overview
+
+CCExtractor Sample Platform - Flask web application for managing regression tests, sample uploads, and CI/CD for the CCExtractor project. Validates PRs by running CCExtractor against sample media files on GCP VMs (Linux/Windows).
+
+## Tech Stack
+
+- **Backend**: Flask 3.1, SQLAlchemy 1.4, MySQL (SQLite for tests)
+- **Cloud**: GCP Compute Engine (test VMs), Google Cloud Storage (samples)
+- **CI/CD**: GitHub Actions, GitHub API (PyGithub)
+- **Testing**: nose2, Flask-Testing, coverage
+
+## Commands
+
+```bash
+# Setup
+virtualenv venv && source venv/bin/activate
+pip install -r requirements.txt
+pip install -r test-requirements.txt
+
+# Run tests
+TESTING=True nose2
+
+# Linting & type checking
+pycodestyle ./ --config=./.pycodestylerc
+pydocstyle ./
+mypy .
+isort . --check-only
+
+# Database migrations
+export FLASK_APP=/path/to/run.py
+flask db upgrade      # Apply migrations
+flask db migrate      # Generate new migration
+
+# Update regression test results
+python manage.py update /path/to/ccextractor
+```
+
+## Architecture
+
+### Module Structure
+Each module in `mod_*/` follows: `__init__.py`, `controllers.py` (routes), `models.py` (ORM), `forms.py` (WTForms)
+
+| Module | Purpose |
+|--------|---------|
+| `mod_ci` | GitHub webhooks, GCP VM orchestration, test execution |
+| `mod_regression` | Regression test definitions, categories, expected outputs |
+| `mod_test` | Test runs, results, progress tracking |
+| `mod_sample` | Sample file management, tags, extra files |
+| `mod_upload` | HTTP/FTP upload handling |
+| `mod_auth` | User auth, roles (admin/user/contributor/tester) |
+| `mod_customized` | Custom test runs for forks |
+
+### Key Models & Relationships
+```
+Sample (sha hash) -> RegressionTest (command, expected_rc) -> RegressionTestOutput
+                                    |
+Fork (GitHub repo) -> Test (platform, commit) -> TestResult -> TestResultFile
+                                              -> TestProgress (status tracking)
+```
+
+### CI Flow
+1. GitHub webhook (`/start-ci`) receives PR/push events
+2. Waits for GitHub Actions build artifacts
+3. `gcp_instance()` provisions Linux/Windows VMs
+4. VMs run CCExtractor, report to `progress_reporter()`
+5. Results compared against expected outputs
+6. `comment_pr()` posts results to GitHub
+
+## Critical Files
+
+- `run.py` - Flask app entry, blueprint registration
+- `mod_ci/controllers.py` - CI orchestration (2500+ lines)
+- `mod_regression/models.py` - Test definitions
+- `mod_test/models.py` - Test execution models
+- `database.py` - SQLAlchemy setup, custom types
+- `tests/base.py` - Test fixtures, mock helpers
+
+## GSoC 2026 Focus Areas (from Carlos)
+
+### Priority 1: Regression Test Suite
+The main blocker for CCExtractor Rust migration is test coverage. Current needs:
+- Add regression tests for uncovered caption types/containers
+- Import FFmpeg and VLC official video libraries as test samples
+- Systematic sample analysis using ffprobe, mkvtoolnix, CCExtractor output
+- Goal: Trust SP enough that passing tests = safe to merge
+
+### Priority 2: Sample Platform Improvements
+Low-coverage modules needing work:
+- `mod_upload` (44% coverage) - FTP upload, progress tracking
+- `mod_test` (58% coverage) - diff generation, error scenarios
+- `mod_sample` (61% coverage) - Issue linking, tag management
+
+### Contribution Strategy
+1. Start with unit tests for low-coverage modules
+2. Add integration tests for CI flow
+3. Help document sample metadata systematically
+4. Enable confident C code removal by proving test coverage
+
+## Code Style
+
+- Type hints required (mypy enforced)
+- Docstrings required (pydocstyle enforced)
+- PEP8 (pycodestyle enforced)
+- Imports sorted with isort
+
+## MCP Setup (GSoC 2026)
+
+**Configured servers** (`~/.claude/settings.json`):
+- `github` – repo/PR/issue management (needs `GITHUB_PERSONAL_ACCESS_TOKEN` env var)
+- `context7` – up-to-date library docs
+- `filesystem` – scoped to `/home/rahul/projects/gsoc`
+
+**Security**:
+- Token stored in `~/.profile`, never committed
+- MCP paths added to `.gitignore`
+- pm2 config at `~/ecosystem.config.js` for auto-restart
+
+**Commands**:
+```bash
+# Start MCP servers
+pm2 start ~/ecosystem.config.js
+pm2 logs
+
+# Resume Claude session
+claude --resume
+```
@@ -27,7 +27,8 @@ def run():
         Category('DVB', 'Samples that contain DVB subtitles'),
         Category('DVD', 'Samples that contain DVD subtitles'),
         Category('MP4', 'Samples that are stored in the MP4 format'),
-        Category('General', 'General regression samples')
+        Category('General', 'General regression samples'),
+        Category('Output Formats', 'Tests for specific output format generation')
     ]
     entries.extend(categories)
 
@@ -42,16 +43,20 @@ def run():
 
     regression_tests = [
         RegressionTest(1, '-autoprogram -out=ttxt -latin1', InputType.file, OutputType.file, 3, 10),
-        RegressionTest(2, '-autoprogram -out=ttxt -latin1 -ucla', InputType.file, OutputType.file, 1, 10)
+        RegressionTest(2, '-autoprogram -out=ttxt -latin1 -ucla', InputType.file, OutputType.file, 1, 10),
+        RegressionTest(1, '-out=webvtt', InputType.file, OutputType.file, 6, 0, True,
+                       'Validates WebVTT header generation on empty-caption input')
     ]
     entries.extend(regression_tests)
 
     gen_data = GeneralData('last_commit', '71dffd6eb30c1f4b5cf800307de845072ce33262')
     entries.append(gen_data)
 
+    WEBVTT_TEST_ID = 3
     regression_test_output = [
-        RegressionTestOutput(1, "test1", "srt", "test1.srt"),
-        RegressionTestOutput(2, "test2", "srt", "test2.srt")
+        RegressionTestOutput(1, "test1", ".srt", "test1.srt"),
+        RegressionTestOutput(2, "test2", ".srt", "test2.srt"),
+        RegressionTestOutput(WEBVTT_TEST_ID, "WEBVTT\r\n\r\n", ".webvtt", "sample1.webvtt")
     ]
     entries.extend(regression_test_output)
 

@@ -0,0 +1,28 @@
+# Golden File Provenance
+
+This document tracks the generation details for regression test golden files.
+
+## sample1.webvtt
+
+| Field | Value |
+|-------|-------|
+| Generated | 2026-01-02 |
+| CCExtractor Version | 0.96.3 |
+| Binary | ccextractorwinfull.exe |
+| Platform | Windows x64 |
+| Source Commit | Release build from windows/x64/Release-Full |
+| Command | `ccextractorwinfull.exe sample1.ts -out=webvtt -o sample1.webvtt` |
+| Input Sample | sample1.ts (no embedded closed captions) |
+| Expected Output | WebVTT header only (`WEBVTT` line + blank line; stored with CRLF line endings in the regression test data) |
+
+### Reproduction Steps
+
+```bash
+ccextractor install/sample_files/sample1.ts -out=webvtt -o install/sample_files/sample1.webvtt
+```
+
+### Notes
+
+- sample1.ts contains no closed caption data, so output is header-only
+- This test validates WebVTT header generation, not full cue formatting
+- For full WebVTT validation, a sample with embedded captions should be added
@@ -0,0 +1,2 @@
+WEBVTT
+
@@ -0,0 +1,112 @@
+"""Add WebVTT regression test
+
+Revision ID: c1a2b3d4e5f6
+Revises: b3ed927671bd
+Create Date: 2026-01-04 21:05:00.000000
+
+"""
+import sqlalchemy as sa
+from alembic import op
+from sqlalchemy import text
+
+# revision identifiers, used by Alembic.
+revision = 'c1a2b3d4e5f6'
+down_revision = 'b3ed927671bd'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    conn = op.get_bind()
+
+    # 1. Insert "Output Formats" category if not exists
+    existing_cat = conn.execute(
+        text("SELECT id FROM category WHERE name = 'Output Formats'")
+    ).fetchone()
+
+    if existing_cat is None:
+        conn.execute(
+            text("INSERT INTO category (name, description) VALUES ('Output Formats', 'Tests for specific output format generation')")
+        )
+        category_id = conn.execute(text("SELECT id FROM category WHERE name = 'Output Formats'")).fetchone()[0]
+    else:
+        category_id = existing_cat[0]
+
+    # 2. Check if WebVTT regression test already exists
+    existing_test = conn.execute(
+        text("SELECT id FROM regression_test WHERE command = '-out=webvtt' AND sample_id = 1")
+    ).fetchone()
+
+    if existing_test is None:
+        # 3. Insert the WebVTT regression test (sample_id=1 is sample1.ts)
+        conn.execute(
+            text("""
+                INSERT INTO regression_test (sample_id, command, input_type, output_type, expected_rc, active, description)
+                VALUES (1, '-out=webvtt', 'file', 'file', 0, 1, 'Validates WebVTT header generation on empty-caption input')
+            """)
+        )
+        test_id = conn.execute(
+            text("SELECT id FROM regression_test WHERE command = '-out=webvtt' AND sample_id = 1")
+        ).fetchone()[0]
+
+        # 4. Insert RegressionTestOutput with the golden content
+        conn.execute(
+            text("""
+                INSERT INTO regression_test_output (regression_id, correct, correct_extension, expected_filename)
+                VALUES (:test_id, 'WEBVTT\r\n\r\n', '.webvtt', 'sample1.webvtt')
+            """),
+            {"test_id": test_id}
+        )
+
+        # 5. Link test to category
+        conn.execute(
+            text("""
+                INSERT INTO regression_test_category (regression_id, category_id)
+                VALUES (:test_id, :cat_id)
+            """),
+            {"test_id": test_id, "cat_id": category_id}
+        )
+
+
+def downgrade():
+    conn = op.get_bind()
+
+    # Get the WebVTT test ID
+    test_row = conn.execute(
+        text("SELECT id FROM regression_test WHERE command = '-out=webvtt' AND sample_id = 1")
+    ).fetchone()
+
+    if test_row is not None:
+        test_id = test_row[0]
+
+        # Delete in reverse order of dependencies
+        conn.execute(
+            text("DELETE FROM regression_test_category WHERE regression_id = :test_id"),
+            {"test_id": test_id}
+        )
+        conn.execute(
+            text("DELETE FROM regression_test_output WHERE regression_id = :test_id"),
+            {"test_id": test_id}
+        )
+        conn.execute(
+            text("DELETE FROM regression_test WHERE id = :test_id"),
+            {"test_id": test_id}
+        )
+
+    # Check if "Output Formats" category has any remaining tests
+    cat_row = conn.execute(
+        text("SELECT id FROM category WHERE name = 'Output Formats'")
+    ).fetchone()
+
+    if cat_row is not None:
+        category_id = cat_row[0]
+        remaining = conn.execute(
+            text("SELECT COUNT(*) FROM regression_test_category WHERE category_id = :cat_id"),
+            {"cat_id": category_id}
+        ).fetchone()[0]
+
+        if remaining == 0:
+            conn.execute(
+                text("DELETE FROM category WHERE id = :cat_id"),
+                {"cat_id": category_id}
+            )
@@ -11,7 +11,7 @@
 
 import datetime
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Type
+from typing import Any, ClassVar, Dict, List, Optional, Type
 
 from sqlalchemy import (Boolean, Column, DateTime, ForeignKey, Integer, String,
                         Text)
@@ -91,7 +91,7 @@ class PendingDeletion(Base):
     retry_count = Column(Integer, nullable=False, default=0)
 
     # Max retries before we give up and just try to force delete
-    MAX_RETRIES = 5
+    MAX_RETRIES: ClassVar[int] = 5
 
     def __init__(self, vm_name, operation_name, created_at=None) -> None:
         """

@@ -207,11 +207,14 @@ <h6>There are no tests executed in this category.</h6>
 
                                 // Update sample progress display
                                 if (data.sample_progress) {
-                                    var progressText = document.getElementById('sample-progress-text');
-                                    if (progressText) {
-                                        progressText.textContent = data.sample_progress.current + ' / ' + 
-                                            data.sample_progress.total + ' samples (' + 
-                                            data.sample_progress.percentage + '%)';
+                                    var $progressText = $('#sample-progress-text');
+                                    if ($progressText.length) {
+                                        var text = data.sample_progress.current + ' / ' +
+                                            data.sample_progress.total + ' samples';
+                                        if (data.sample_progress.total > 0) {
+                                            text += ' (' + data.sample_progress.percentage + '%)';
+                                        }
+                                        $progressText.text(text);
                                     }
                                 }
 

@@ -319,7 +319,8 @@ def setUp(self):
             Category("DVB", "Samples that contain DVB subtitles"),
             Category("DVD", "Samples that contain DVD subtitles"),
             Category("MP4", "Samples that are stored in the MP4 format"),
-            Category("General", "General regression samples")
+            Category("General", "General regression samples"),
+            Category("Output Formats", "Tests for specific output format generation")
         ]
         g.db.add_all(categories)
         g.db.commit()
@@ -340,16 +341,20 @@ def setUp(self):
 
         regression_tests = [
             RegressionTest(1, "-autoprogram -out=ttxt -latin1 -2", InputType.file, OutputType.file, 3, 10),
-            RegressionTest(2, "-autoprogram -out=ttxt -latin1 -ucla", InputType.file, OutputType.file, 1, 10)
+            RegressionTest(2, "-autoprogram -out=ttxt -latin1 -ucla", InputType.file, OutputType.file, 1, 10),
+            RegressionTest(1, "-out=webvtt", InputType.file, OutputType.file, 6, 0, True,
+                           "Validates WebVTT header generation on empty-caption input")
         ]
         g.db.add_all(regression_tests)
         g.db.commit()
 
         categories[0].regression_tests.append(regression_tests[0])
         categories[2].regression_tests.append(regression_tests[1])
+        categories[5].regression_tests.append(regression_tests[2])
         regression_test_outputs = [
             RegressionTestOutput(1, "sample_out1", ".srt", ""),
-            RegressionTestOutput(2, "sample_out2", ".srt", "")
+            RegressionTestOutput(2, "sample_out2", ".srt", ""),
+            RegressionTestOutput(3, "WEBVTT\r\n\r\n", ".webvtt", "sample1.webvtt")
         ]
         g.db.add_all(regression_test_outputs)
         g.db.commit()

@@ -96,6 +96,7 @@ def test_init_handles_os_error(self):
                         self.assertEqual(log_config._consoleLogger, mock_sh())
                         # File logger should be None
                         self.assertIsNone(log_config._fileLogger)
+                        self.assertIsNone(log_config.file_logger)
 
     def test_create_logger(self):
         """Test logger creation."""