diff --git a/.gitignore b/.gitignore index 4f8ca2d8d..86a9bf32b 100644 --- a/.gitignore +++ b/.gitignore @@ -53,3 +53,9 @@ monkeytype.sqlite3 # Test related data temp/ + +# MCP & Claude sensitive files (do NOT commit tokens) +.claude/ +*.env +mcp-*.log +ecosystem.config.js diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 000000000..d1e1285b3 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,130 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +CCExtractor Sample Platform - Flask web application for managing regression tests, sample uploads, and CI/CD for the CCExtractor project. Validates PRs by running CCExtractor against sample media files on GCP VMs (Linux/Windows). + +## Tech Stack + +- **Backend**: Flask 3.1, SQLAlchemy 1.4, MySQL (SQLite for tests) +- **Cloud**: GCP Compute Engine (test VMs), Google Cloud Storage (samples) +- **CI/CD**: GitHub Actions, GitHub API (PyGithub) +- **Testing**: nose2, Flask-Testing, coverage + +## Commands + +```bash +# Setup +virtualenv venv && source venv/bin/activate +pip install -r requirements.txt +pip install -r test-requirements.txt + +# Run tests +TESTING=True nose2 + +# Linting & type checking +pycodestyle ./ --config=./.pycodestylerc +pydocstyle ./ +mypy . +isort . 
--check-only + +# Database migrations +export FLASK_APP=/path/to/run.py +flask db upgrade # Apply migrations +flask db migrate # Generate new migration + +# Update regression test results +python manage.py update /path/to/ccextractor +``` + +## Architecture + +### Module Structure +Each module in `mod_*/` follows the same layout: `__init__.py`, `controllers.py` (routes), `models.py` (ORM), `forms.py` (WTForms) + +| Module | Purpose | +|--------|---------| +| `mod_ci` | GitHub webhooks, GCP VM orchestration, test execution | +| `mod_regression` | Regression test definitions, categories, expected outputs | +| `mod_test` | Test runs, results, progress tracking | +| `mod_sample` | Sample file management, tags, extra files | +| `mod_upload` | HTTP/FTP upload handling | +| `mod_auth` | User auth, roles (admin/user/contributor/tester) | +| `mod_customized` | Custom test runs for forks | + +### Key Models & Relationships +``` +Sample (sha hash) -> RegressionTest (command, expected_rc) -> RegressionTestOutput + | +Fork (GitHub repo) -> Test (platform, commit) -> TestResult -> TestResultFile + -> TestProgress (status tracking) +``` + +### CI Flow +1. GitHub webhook (`/start-ci`) receives PR/push events +2. Waits for GitHub Actions build artifacts +3. `gcp_instance()` provisions Linux/Windows VMs +4. VMs run CCExtractor, report to `progress_reporter()` +5. Results compared against expected outputs +6. `comment_pr()` posts results to GitHub + +## Critical Files + +- `run.py` - Flask app entry, blueprint registration +- `mod_ci/controllers.py` - CI orchestration (2500+ lines) +- `mod_regression/models.py` - Test definitions +- `mod_test/models.py` - Test execution models +- `database.py` - SQLAlchemy setup, custom types +- `tests/base.py` - Test fixtures, mock helpers + +## GSoC 2026 Focus Areas (from Carlos) + +### Priority 1: Regression Test Suite +The main blocker for the CCExtractor Rust migration is test coverage. 
Current needs: +- Add regression tests for uncovered caption types/containers +- Import FFmpeg and VLC official video libraries as test samples +- Systematic sample analysis using ffprobe, mkvtoolnix, CCExtractor output +- Goal: Trust the Sample Platform (SP) enough that passing tests = safe to merge + +### Priority 2: Sample Platform Improvements +Low-coverage modules needing work: +- `mod_upload` (44% coverage) - FTP upload, progress tracking +- `mod_test` (58% coverage) - diff generation, error scenarios +- `mod_sample` (61% coverage) - Issue linking, tag management + +### Contribution Strategy +1. Start with unit tests for low-coverage modules +2. Add integration tests for CI flow +3. Help document sample metadata systematically +4. Enable confident C code removal by proving test coverage + +## Code Style + +- Type hints required (mypy enforced) +- Docstrings required (pydocstyle enforced) +- PEP8 (pycodestyle enforced) +- Imports sorted with isort + +## MCP Setup (GSoC 2026) + +**Configured servers** (`~/.claude/settings.json`): +- `github` – repo/PR/issue management (needs `GITHUB_PERSONAL_ACCESS_TOKEN` env var) +- `context7` – up-to-date library docs +- `filesystem` – scoped to `/home/rahul/projects/gsoc` + +**Security**: +- Token stored in `~/.profile`, never committed +- MCP paths added to `.gitignore` +- pm2 config at `~/ecosystem.config.js` for auto-restart + +**Commands**: +```bash +# Start MCP servers +pm2 start ~/ecosystem.config.js +pm2 logs + +# Resume Claude session +claude --resume +``` diff --git a/install/sample_db.py b/install/sample_db.py index ec5f05465..cee70f444 100644 --- a/install/sample_db.py +++ b/install/sample_db.py @@ -27,7 +27,8 @@ def run(): Category('DVB', 'Samples that contain DVB subtitles'), Category('DVD', 'Samples that contain DVD subtitles'), Category('MP4', 'Samples that are stored in the MP4 format'), - Category('General', 'General regression samples') + Category('General', 'General regression samples'), + Category('Output Formats', 'Tests for 
specific output format generation') ] entries.extend(categories) @@ -42,16 +43,20 @@ def run(): regression_tests = [ RegressionTest(1, '-autoprogram -out=ttxt -latin1', InputType.file, OutputType.file, 3, 10), - RegressionTest(2, '-autoprogram -out=ttxt -latin1 -ucla', InputType.file, OutputType.file, 1, 10) + RegressionTest(2, '-autoprogram -out=ttxt -latin1 -ucla', InputType.file, OutputType.file, 1, 10), + RegressionTest(1, '-out=webvtt', InputType.file, OutputType.file, 6, 0, True, + 'Validates WebVTT header generation on empty-caption input') ] entries.extend(regression_tests) gen_data = GeneralData('last_commit', '71dffd6eb30c1f4b5cf800307de845072ce33262') entries.append(gen_data) + WEBVTT_TEST_ID = 3 regression_test_output = [ - RegressionTestOutput(1, "test1", "srt", "test1.srt"), - RegressionTestOutput(2, "test2", "srt", "test2.srt") + RegressionTestOutput(1, "test1", ".srt", "test1.srt"), + RegressionTestOutput(2, "test2", ".srt", "test2.srt"), + RegressionTestOutput(WEBVTT_TEST_ID, "WEBVTT\r\n\r\n", ".webvtt", "sample1.webvtt") ] entries.extend(regression_test_output) diff --git a/install/sample_files/GOLDEN_FILE_PROVENANCE.md b/install/sample_files/GOLDEN_FILE_PROVENANCE.md new file mode 100644 index 000000000..c7a447f57 --- /dev/null +++ b/install/sample_files/GOLDEN_FILE_PROVENANCE.md @@ -0,0 +1,28 @@ +# Golden File Provenance + +This document tracks the generation details for regression test golden files. 
+ +## sample1.webvtt + +| Field | Value | +|-------|-------| +| Generated | 2026-01-02 | +| CCExtractor Version | 0.96.3 | +| Binary | ccextractorwinfull.exe | +| Platform | Windows x64 | +| Source Commit | Release build from windows/x64/Release-Full | +| Command | `ccextractorwinfull.exe sample1.ts -out=webvtt -o sample1.webvtt` | +| Input Sample | sample1.ts (no embedded closed captions) | +| Expected Output | WebVTT header only (WEBVTT + blank line) | + +### Reproduction Steps + +```bash +ccextractor install/sample_files/sample1.ts -out=webvtt -o install/sample_files/sample1.webvtt +``` + +### Notes + +- sample1.ts contains no closed caption data, so output is header-only +- This test validates WebVTT header generation, not full cue formatting +- For full WebVTT validation, a sample with embedded captions should be added diff --git a/install/sample_files/sample1.webvtt b/install/sample_files/sample1.webvtt new file mode 100644 index 000000000..dd7db57c7 --- /dev/null +++ b/install/sample_files/sample1.webvtt @@ -0,0 +1,2 @@ +WEBVTT + diff --git a/migrations/versions/c1a2b3d4e5f6_add_webvtt_regression_test.py b/migrations/versions/c1a2b3d4e5f6_add_webvtt_regression_test.py new file mode 100644 index 000000000..0af4d6dd4 --- /dev/null +++ b/migrations/versions/c1a2b3d4e5f6_add_webvtt_regression_test.py @@ -0,0 +1,112 @@ +"""Add WebVTT regression test + +Revision ID: c1a2b3d4e5f6 +Revises: b3ed927671bd +Create Date: 2026-01-04 21:05:00.000000 + +""" +import sqlalchemy as sa +from alembic import op +from sqlalchemy import text + +# revision identifiers, used by Alembic. +revision = 'c1a2b3d4e5f6' +down_revision = 'b3ed927671bd' +branch_labels = None +depends_on = None + + +def upgrade(): + conn = op.get_bind() + + # 1. 
Insert "Output Formats" category if not exists + existing_cat = conn.execute( + text("SELECT id FROM category WHERE name = 'Output Formats'") + ).fetchone() + + if existing_cat is None: + conn.execute( + text("INSERT INTO category (name, description) VALUES ('Output Formats', 'Tests for specific output format generation')") + ) + category_id = conn.execute(text("SELECT id FROM category WHERE name = 'Output Formats'")).fetchone()[0] + else: + category_id = existing_cat[0] + + # 2. Check if WebVTT regression test already exists + existing_test = conn.execute( + text("SELECT id FROM regression_test WHERE command = '-out=webvtt' AND sample_id = 1") + ).fetchone() + + if existing_test is None: + # 3. Insert the WebVTT regression test (sample_id=1 is sample1.ts) + conn.execute( + text(""" + INSERT INTO regression_test (sample_id, command, input_type, output_type, expected_rc, active, description) + VALUES (1, '-out=webvtt', 'file', 'file', 0, 1, 'Validates WebVTT header generation on empty-caption input') + """) + ) + test_id = conn.execute( + text("SELECT id FROM regression_test WHERE command = '-out=webvtt' AND sample_id = 1") + ).fetchone()[0] + + # 4. Insert RegressionTestOutput with the golden content + conn.execute( + text(""" + INSERT INTO regression_test_output (regression_id, correct, correct_extension, expected_filename) + VALUES (:test_id, 'WEBVTT\r\n\r\n', '.webvtt', 'sample1.webvtt') + """), + {"test_id": test_id} + ) + + # 5. 
Link test to category + conn.execute( + text(""" + INSERT INTO regression_test_category (regression_id, category_id) + VALUES (:test_id, :cat_id) + """), + {"test_id": test_id, "cat_id": category_id} + ) + + +def downgrade(): + conn = op.get_bind() + + # Get the WebVTT test ID + test_row = conn.execute( + text("SELECT id FROM regression_test WHERE command = '-out=webvtt' AND sample_id = 1") + ).fetchone() + + if test_row is not None: + test_id = test_row[0] + + # Delete in reverse order of dependencies + conn.execute( + text("DELETE FROM regression_test_category WHERE regression_id = :test_id"), + {"test_id": test_id} + ) + conn.execute( + text("DELETE FROM regression_test_output WHERE regression_id = :test_id"), + {"test_id": test_id} + ) + conn.execute( + text("DELETE FROM regression_test WHERE id = :test_id"), + {"test_id": test_id} + ) + + # Check if "Output Formats" category has any remaining tests + cat_row = conn.execute( + text("SELECT id FROM category WHERE name = 'Output Formats'") + ).fetchone() + + if cat_row is not None: + category_id = cat_row[0] + remaining = conn.execute( + text("SELECT COUNT(*) FROM regression_test_category WHERE category_id = :cat_id"), + {"cat_id": category_id} + ).fetchone()[0] + + if remaining == 0: + conn.execute( + text("DELETE FROM category WHERE id = :cat_id"), + {"cat_id": category_id} + ) diff --git a/mod_ci/models.py b/mod_ci/models.py index 1b5c9ebfb..dfbbd430e 100644 --- a/mod_ci/models.py +++ b/mod_ci/models.py @@ -11,7 +11,7 @@ import datetime from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Type +from typing import Any, ClassVar, Dict, List, Optional, Type from sqlalchemy import (Boolean, Column, DateTime, ForeignKey, Integer, String, Text) @@ -91,7 +91,7 @@ class PendingDeletion(Base): retry_count = Column(Integer, nullable=False, default=0) # Max retries before we give up and just try to force delete - MAX_RETRIES = 5 + MAX_RETRIES: ClassVar[int] = 5 def __init__(self, vm_name, 
operation_name, created_at=None) -> None: """ diff --git a/templates/test/by_id.html b/templates/test/by_id.html index 92aa37e7f..1990f7a84 100644 --- a/templates/test/by_id.html +++ b/templates/test/by_id.html @@ -207,11 +207,14 @@
There are no tests executed in this category.
// Update sample progress display if (data.sample_progress) { - var progressText = document.getElementById('sample-progress-text'); - if (progressText) { - progressText.textContent = data.sample_progress.current + ' / ' + - data.sample_progress.total + ' samples (' + - data.sample_progress.percentage + '%)'; + var $progressText = $('#sample-progress-text'); + if ($progressText.length) { + var text = data.sample_progress.current + ' / ' + + data.sample_progress.total + ' samples'; + if (data.sample_progress.total > 0) { + text += ' (' + data.sample_progress.percentage + '%)'; + } + $progressText.text(text); } } diff --git a/tests/base.py b/tests/base.py index f15c81ab6..3bf644cc3 100644 --- a/tests/base.py +++ b/tests/base.py @@ -319,7 +319,8 @@ def setUp(self): Category("DVB", "Samples that contain DVB subtitles"), Category("DVD", "Samples that contain DVD subtitles"), Category("MP4", "Samples that are stored in the MP4 format"), - Category("General", "General regression samples") + Category("General", "General regression samples"), + Category("Output Formats", "Tests for specific output format generation") ] g.db.add_all(categories) g.db.commit() @@ -340,16 +341,20 @@ def setUp(self): regression_tests = [ RegressionTest(1, "-autoprogram -out=ttxt -latin1 -2", InputType.file, OutputType.file, 3, 10), - RegressionTest(2, "-autoprogram -out=ttxt -latin1 -ucla", InputType.file, OutputType.file, 1, 10) + RegressionTest(2, "-autoprogram -out=ttxt -latin1 -ucla", InputType.file, OutputType.file, 1, 10), + RegressionTest(1, "-out=webvtt", InputType.file, OutputType.file, 6, 0, True, + "Validates WebVTT header generation on empty-caption input") ] g.db.add_all(regression_tests) g.db.commit() categories[0].regression_tests.append(regression_tests[0]) categories[2].regression_tests.append(regression_tests[1]) + categories[5].regression_tests.append(regression_tests[2]) regression_test_outputs = [ RegressionTestOutput(1, "sample_out1", ".srt", ""), - RegressionTestOutput(2, 
"sample_out2", ".srt", "") + RegressionTestOutput(2, "sample_out2", ".srt", ""), + RegressionTestOutput(3, "WEBVTT\r\n\r\n", ".webvtt", "sample1.webvtt") ] g.db.add_all(regression_test_outputs) g.db.commit() diff --git a/tests/test_log_configuration.py b/tests/test_log_configuration.py old mode 100755 new mode 100644 index dec5a3386..f69b4a1c4 --- a/tests/test_log_configuration.py +++ b/tests/test_log_configuration.py @@ -96,6 +96,7 @@ def test_init_handles_os_error(self): self.assertEqual(log_config._consoleLogger, mock_sh()) # File logger should be None self.assertIsNone(log_config._fileLogger) + self.assertIsNone(log_config.file_logger) def test_create_logger(self): """Test logger creation.""" diff --git a/tests/test_regression/test_controllers.py b/tests/test_regression/test_controllers.py index 83eb40394..22c50c7a4 100644 --- a/tests/test_regression/test_controllers.py +++ b/tests/test_regression/test_controllers.py @@ -227,7 +227,7 @@ def test_add_test(self): expected_rc=25, submit=True, )) - self.assertNotEqual(RegressionTest.query.filter(RegressionTest.id == 3).first(), None) + self.assertNotEqual(RegressionTest.query.filter(RegressionTest.id == 4).first(), None) def test_add_test_empty_erc(self): """Check it will not add a regression test with empty Expected Runtime Code.""" @@ -243,7 +243,7 @@ def test_add_test_empty_erc(self): category_id=1, submit=True, )) - self.assertEqual(RegressionTest.query.filter(RegressionTest.id == 3).first(), None) + self.assertEqual(RegressionTest.query.filter(RegressionTest.id == 4).first(), None) def test_category_deletion_without_login(self): """Check if it will move to the login page.""" diff --git a/tests/test_regression/test_update_regression.py b/tests/test_regression/test_update_regression.py index 7b2219b38..49cf92525 100644 --- a/tests/test_regression/test_update_regression.py +++ b/tests/test_regression/test_update_regression.py @@ -39,7 +39,7 @@ def test_update_expected_results_(self, mock_test, mock_os): 
mock_os.path.isfile.return_value = True expected = True - num_tests = 2 # store number of mock regression tests we have + num_tests = 3 # store number of mock regression tests we have response = update_expected_results('valid/path')