diff --git a/.github/workflows/ci.yml b/.github/workflows/ci-python.yml similarity index 92% rename from .github/workflows/ci.yml rename to .github/workflows/ci-python.yml index 5193ea1..807cf88 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci-python.yml @@ -73,13 +73,18 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip + cd apps/core pip install -e ".[dev]" - name: Check code formatting with Black - run: black --check src/ tests/ + run: | + cd apps/core + black --check src/ tests/ - name: Lint with Ruff - run: ruff check src/ tests/ + run: | + cd apps/core + ruff check src/ tests/ type-check: name: type-check @@ -104,10 +109,13 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip + cd apps/core pip install -e ".[dev]" - name: Type check with MyPy - run: mypy src/ + run: | + cd apps/core + mypy src/ test: name: test @@ -137,15 +145,18 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip + cd apps/core pip install -e ".[dev]" - name: Run tests with pytest - run: pytest tests/ --cov=voice_clone --cov-report=xml --cov-report=term-missing + run: | + cd apps/core + pytest tests/ --cov=src --cov-report=xml --cov-report=term-missing - name: Upload coverage to Codecov uses: codecov/codecov-action@v5 with: - file: ./coverage.xml + file: ./apps/core/coverage.xml fail_ci_if_error: false # Job to notify when CI is waiting for approval diff --git a/.github/workflows/ci-rust.yml b/.github/workflows/ci-rust.yml new file mode 100644 index 0000000..8d9ab21 --- /dev/null +++ b/.github/workflows/ci-rust.yml @@ -0,0 +1,77 @@ +name: CI - Rust (Tauri Backend) + +on: + push: + branches: [master, main, develop] + paths: + - 'apps/desktop/src-tauri/**' + - '.github/workflows/ci-rust.yml' + pull_request: + branches: [master, main, develop] + paths: + - 'apps/desktop/src-tauri/**' + - '.github/workflows/ci-rust.yml' + +jobs: + lint: + name: Rust Lint + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v6 + + - name: Setup Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + components: rustfmt, clippy + override: true + + - name: Check formatting + run: | + cd apps/desktop/src-tauri + cargo fmt -- --check + + - name: Clippy + run: | + cd apps/desktop/src-tauri + cargo clippy -- -D warnings + + test: + name: Rust Test + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v6 + + - name: Setup Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + override: true + + - name: Run tests + run: | + cd apps/desktop/src-tauri + cargo test + + build: + name: Rust Build + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v6 + + - name: Setup Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + override: true + + - name: Build + run: | + cd apps/desktop/src-tauri + cargo build --release + +# Note: This workflow is a placeholder for Phase 8 (Tauri Desktop App Setup) +# It will be activated once apps/desktop/src-tauri/ is created diff --git a/.github/workflows/ci-typescript.yml b/.github/workflows/ci-typescript.yml new file mode 100644 index 0000000..4b61ef9 --- /dev/null +++ b/.github/workflows/ci-typescript.yml @@ -0,0 +1,98 @@ +name: CI - TypeScript (Tauri Frontend) + +on: + push: + branches: [master, main, develop] + paths: + - 'apps/desktop/src/**' + - 'apps/desktop/package.json' + - 'apps/desktop/tsconfig.json' + - '.github/workflows/ci-typescript.yml' + pull_request: + branches: [master, main, develop] + paths: + - 'apps/desktop/src/**' + - 'apps/desktop/package.json' + - 'apps/desktop/tsconfig.json' + - '.github/workflows/ci-typescript.yml' + +jobs: + lint: + name: TypeScript Lint + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v6 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: '20' + cache: 'npm' + cache-dependency-path: apps/desktop/package-lock.json + + - name: Install dependencies + run: | + cd apps/desktop + npm ci + + - name: Lint + run: | + cd apps/desktop + npm run lint + + - name: Type check + run: | + cd apps/desktop + npm run type-check + + test: + name: TypeScript Test + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v6 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: '20' + cache: 'npm' + cache-dependency-path: apps/desktop/package-lock.json + + - name: Install dependencies + run: | + cd apps/desktop + npm ci + + - name: Run tests + run: | + cd apps/desktop + npm test + + build: + name: TypeScript Build + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v6 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: '20' + cache: 'npm' + cache-dependency-path: apps/desktop/package-lock.json + + - name: Install dependencies + run: | + cd apps/desktop + npm ci + + - name: Build + run: | + cd apps/desktop + npm run build + +# Note: This workflow is a placeholder for Phase 8 (Tauri Desktop App Setup) +# It will be activated once apps/desktop/ is created with React + TypeScript + Vite diff --git a/.gitignore b/.gitignore index 6e5693d..f8d13a6 100644 --- a/.gitignore +++ b/.gitignore @@ -23,26 +23,63 @@ venv/ env/ ENV/ +# Monorepo - Python Core Library (apps/core/) +apps/core/build/ +apps/core/dist/ +apps/core/*.egg-info/ +apps/core/.pytest_cache/ +apps/core/.coverage +apps/core/htmlcov/ +apps/core/.mypy_cache/ +apps/core/.ruff_cache/ +apps/core/.hypothesis/ + +# Monorepo - Desktop App (apps/desktop/) +apps/desktop/node_modules/ +apps/desktop/dist/ +apps/desktop/.next/ +apps/desktop/.turbo/ +apps/desktop/out/ +apps/desktop/build/ +apps/desktop/.DS_Store +apps/desktop/*.tsbuildinfo +apps/desktop/npm-debug.log* +apps/desktop/yarn-debug.log* +apps/desktop/yarn-error.log* + +# Monorepo - Tauri (apps/desktop/src-tauri/) +apps/desktop/src-tauri/target/ +apps/desktop/src-tauri/Cargo.lock + # Data (IMPORTANT: personal data) data/samples/ data/outputs/ data/models/ data/cache/ +data/profiles/ + +# Monorepo - Core Library Data (models, cache, outputs) +apps/core/data/ # Configuration (personal) config/config.yaml .env +.env.local +.env.*.local # IDE .vscode/ .idea/ *.swp *.swo + +# Kiro (specs and steering files) .kiro/ # OS .DS_Store Thumbs.db +*.log # Testing .pytest_cache/ @@ -60,3 +97,9 @@ dmypy.json # Private recordings-from-iphone + +# Temporary files +*.tmp +*.bak +*.swp +*~ diff --git a/.kiro/specs/gradio-integration/tasks.md b/.kiro/specs/gradio-integration/tasks.md deleted file mode 100644 index 3e4bc10..0000000 --- a/.kiro/specs/gradio-integration/tasks.md +++ /dev/null @@ -1,532 +0,0 @@ -# Tasks - Gradio UI Integration - -## Task Status Legend -- `[ ]` Not started -- `[~]` Queued -- `[-]` In progress -- `[x]` Completed -- `[ ]*` Optional task - ---- - -## Phase 1: Setup & Infrastructure (Week 1) - -### 1. Project Setup -- [x] 1.1 Create directory structure - - [x] 1.1.1 Create `src/gradio_ui/` directory - - [x] 1.1.2 Create `src/gradio_ui/components/` directory - - [x] 1.1.3 Create `src/gradio_ui/handlers/` directory - - [x] 1.1.4 Create `src/gradio_ui/utils/` directory - - [x] 1.1.5 Create `src/gradio_ui/assets/` directory - - [x] 1.1.6 Create all `__init__.py` files - -- [x] 1.2 Update dependencies - - [x] 1.2.1 Add `gradio>=4.0.0` to `requirements.txt` - - [x] 1.2.2 Update `pyproject.toml` with Gradio dependency - - [x] 1.2.3 Add `voice-clone-ui` script entry point - - [x] 1.2.4 Install dependencies: `pip install -e .` - -- [x] 1.3 Create basic app structure - - [x] 1.3.1 Create `src/gradio_ui/app.py` with minimal layout - - [x] 1.3.2 Implement `create_app()` function - - [x] 1.3.3 Implement `main()` function - - [x] 1.3.4 Test app launches: `python -m gradio_ui.app` - -- [x] 1.4 Add CLI command - - [x] 1.4.1 Add `ui` command to `src/voice_clone/cli.py` - - [x] 1.4.2 Add `--port` option - - [x] 1.4.3 Add `--share` option - - [x] 1.4.4 Test: `voice-clone ui` - - -## Phase 2: Tab 1 - Prepare Voice Profile (Week 2) - -### 2. Sample Upload & Validation - -- [x] 2.1 Implement sample upload UI - - [x] 2.1.1 Add `gr.File` component for multiple files - - [x] 2.1.2 Configure file types (`.wav`, `.mp3`, `.m4a`, `.flac`) - - [x] 2.1.3 Add file count limit (1-3 files) - - [x] 2.1.4 Add drag & drop support (built-in) - -- [x] 2.2 Implement validation handler - - [x] 2.2.1 Create `src/gradio_ui/handlers/sample_handler.py` - - [x] 2.2.2 Implement `validate_samples_handler()` function - - [x] 2.2.3 Integrate with `AudioProcessor.validate_sample()` - - [x] 2.2.4 Format results as Markdown with ✅/❌ - - [x] 2.2.5 Handle empty file list - - [x] 2.2.6 Handle file not found errors - - [x] 2.2.7 Handle audio processing errors - -- [x] 2.3 Wire validation UI - - [x] 2.3.1 Add "Validate Samples" button - - [x] 2.3.2 Add `gr.Markdown` output for results - - [x] 2.3.3 Connect button click to handler - - [x] 2.3.4 Test validation with valid samples - - [x] 2.3.5 Test validation with invalid samples - -### 3. Profile Creation - -- [x] 3.1 Implement profile creation UI - - [x] 3.1.1 Add `gr.Textbox` for profile name - - [x] 3.1.2 Add `gr.Textbox` for reference text (optional) - - [x] 3.1.3 Add "Create Profile" button - - [x] 3.1.4 Add `gr.JSON` output for profile info - -- [x] 3.2 Implement profile creation handler - - [x] 3.2.1 Create `src/gradio_ui/handlers/profile_handler.py` - - [x] 3.2.2 Implement `create_profile_handler()` function - - [x] 3.2.3 Integrate with `VoiceProfile.create()` - - [x] 3.2.4 Save profile to `data/profiles/{name}.json` - - [x] 3.2.5 Implement `list_available_profiles()` function - - [x] 3.2.6 Return profile info dict - - [x] 3.2.7 Return updated dropdown choices - - [x] 3.2.8 Handle empty files - - [x] 3.2.9 Handle missing profile name - - [x] 3.2.10 Handle duplicate profile names - - [x] 3.2.11 Handle file system errors - -- [x] 3.3 Wire profile creation UI - - [x] 3.3.1 Connect button click to handler - - [x] 3.3.2 Update Tab 2 dropdown on success - - [x] 3.3.3 Update Tab 3 dropdown on success - - [x] 3.3.4 Test profile creation flow - - [x] 3.3.5 Test error handling - - -## Phase 3: Tab 2 - Generate Audio (Week 2-3) - -### 4. Audio Generation UI - -- [x] 4.1 Implement profile selection - - [x] 4.1.1 Add `gr.Dropdown` for profile selection - - [x] 4.1.2 Populate with available profiles - - [x] 4.1.3 Add info text for dropdown - - [x] 4.1.4 Handle empty profile list - -- [x] 4.2 Implement text input - - [x] 4.2.1 Add `gr.Textbox` for text input (5-20 lines) - - [x] 4.2.2 Add placeholder text - - [x] 4.2.3 Set max length (2048 characters) - - [x] 4.2.4 Add character counter (optional)* - -- [x] 4.3 Implement advanced settings - - [x] 4.3.1 Add `gr.Accordion` for settings - - [x] 4.3.2 Add `gr.Slider` for temperature (0.5-1.0) - - [x] 4.3.3 Add `gr.Slider` for speed (0.8-1.2) - - [x] 4.3.4 Add info tooltips for sliders - - [x] 4.3.5 Set default values (0.75, 1.0) - -- [x] 4.4 Implement output display - - [x] 4.4.1 Add `gr.Audio` component for output - - [x] 4.4.2 Configure audio player (non-interactive) - - [x] 4.4.3 Enable download button - - [x] 4.4.4 Add `gr.Markdown` for generation info - - [x] 4.4.5 Add "Generate Audio" button - -- [x] 4.5 Add examples - - [x] 4.5.1 Add `gr.Examples` component - - [x] 4.5.2 Add 3-5 example texts in Spanish - - [x] 4.5.3 Wire examples to text input - - [x] 4.5.4 Test example loading - -### 5. Audio Generation Handler - -- [x] 5.1 Implement generation handler - - [x] 5.1.1 Create `src/gradio_ui/handlers/generation_handler.py` - - [x] 5.1.2 Implement `generate_audio_handler()` function - - [x] 5.1.3 Validate inputs (profile selected, text not empty) - - [x] 5.1.4 Load `VoiceProfile` from file - - [x] 5.1.5 Create `VoiceGenerator` instance - - [x] 5.1.6 Call `generator.generate()` with parameters - - [x] 5.1.7 Create output directory if needed - - [x] 5.1.8 Return audio file path - - [x] 5.1.9 Format generation info as Markdown - - [x] 5.1.10 Handle profile not found - - [x] 5.1.11 Handle generation errors - - [x] 5.1.12 Handle out of memory errors - -- [x] 5.2 Wire generation UI - - [x] 5.2.1 Connect button click to handler - - [x] 5.2.2 Enable progress bar (`show_progress="full"`) - - [x] 5.2.3 Test generation with valid inputs - - [x] 5.2.4 Test error handling - - [x] 5.2.5 Test audio playback - - [x] 5.2.6 Test audio download - - -## Phase 4: Tab 3 - Batch Processing (Week 3) - -### 6. Batch Processing UI - -- [x] 6.1 Implement batch UI - - [x] 6.1.1 Add `gr.Dropdown` for profile selection - - [x] 6.1.2 Sync with Tab 2 dropdown - - [x] 6.1.3 Add `gr.File` for script upload - - [x] 6.1.4 Configure file types (`.txt`, `.md`) - - [x] 6.1.5 Add script format instructions (Markdown) - - [x] 6.1.6 Add "Process Batch" button - -- [x] 6.2 Implement batch output - - [x] 6.2.1 Add `gr.File` for multiple output files - - [x] 6.2.2 Add `gr.Markdown` for processing info - - [x] 6.2.3 Configure download for multiple files - -### 7. Batch Processing Handler - -- [x] 7.1 Implement batch handler - - [x] 7.1.1 Create `src/gradio_ui/handlers/batch_handler.py` - - [x] 7.1.2 Implement `batch_process_handler()` function - - [x] 7.1.3 Validate inputs (profile selected, script uploaded) - - [x] 7.1.4 Load `VoiceProfile` from file - - [x] 7.1.5 Create `BatchProcessor` instance - - [x] 7.1.6 Call `processor.process_script()` - - [x] 7.1.7 Create output directory - - [x] 7.1.8 Collect generated audio files - - [x] 7.1.9 Count successful/failed segments - - [x] 7.1.10 Format results as Markdown - - [x] 7.1.11 Handle script parsing errors - - [x] 7.1.12 Handle partial failures - - [x] 7.1.13 Handle file system errors - -- [ ]* 7.2 Add progress tracking (OPTIONAL - Future Enhancement) - - [ ]* 7.2.1 Implement `batch_with_progress()` function - - [ ]* 7.2.2 Use `gr.Progress()` for tracking - - [ ]* 7.2.3 Show current segment being processed - - [ ]* 7.2.4 Show percentage complete - - [ ]* 7.2.5 Test progress display - -- [x] 7.3 Wire batch UI - - [x] 7.3.1 Connect button click to handler - - [x] 7.3.2 Enable progress bar - - [x] 7.3.3 Test batch processing with valid script - - [x] 7.3.4 Test error handling - - [x] 7.3.5 Test file downloads - - -## Phase 5: Polish & Testing (Week 4) - -### 8. UI Polish - -- [x] 8.1 Add header and footer - - [x] 8.1.1 Add main header with title and description - - [x] 8.1.2 Add emojis for visual appeal - - [x] 8.1.3 Add footer with tips and resources - - [x] 8.1.4 Add links to documentation - -- [x] 8.2 Improve layout - - [x] 8.2.1 Use `gr.Row` and `gr.Column` for responsive design - - [x] 8.2.2 Adjust column scales for balance - - [x] 8.2.3 Add section headers with Markdown - - [x] 8.2.4 Test on different screen sizes - -- [ ]* 8.3 Add custom styling (OPTIONAL - Future Enhancement) - - [ ]* 8.3.1 Create `src/gradio_ui/assets/styles.css` - - [ ]* 8.3.2 Customize colors and fonts - - [ ]* 8.3.3 Add branding elements - - [ ]* 8.3.4 Test CSS in browser - -- [x] 8.4 Improve error messages - - [x] 8.4.1 Review all error messages - - [x] 8.4.2 Make messages user-friendly - - [x] 8.4.3 Add emojis (⚠️, ❌, ✅) - - [x] 8.4.4 Add actionable suggestions - -### 9. Testing - -- [x] 9.1 Unit tests for handlers - - [x] 9.1.1 Create `tests/gradio_ui/` directory - - [x] 9.1.2 Create `tests/gradio_ui/test_handlers.py` - - [x] 9.1.3 Test `validate_samples_handler()` - - [x] 9.1.3.1 Test with empty file list - - [x] 9.1.3.2 Test with valid samples - - [x] 9.1.3.3 Test with invalid samples - - [x] 9.1.4 Test `create_profile_handler()` - - [x] 9.1.4.1 Test successful creation - - [x] 9.1.4.2 Test with no files - - [x] 9.1.4.3 Test with no name - - [x] 9.1.4.4 Test duplicate names - - [x] 9.1.5 Test `generate_audio_handler()` - - [x] 9.1.5.1 Test successful generation - - [x] 9.1.5.2 Test with no profile - - [x] 9.1.5.3 Test with no text - - [x] 9.1.5.4 Test with invalid profile - - [x] 9.1.6 Test `batch_process_handler()` - - [x] 9.1.6.1 Test successful batch - - [x] 9.1.6.2 Test with no profile - - [x] 9.1.6.3 Test with no script - - [x] 9.1.6.4 Test partial failures - -- [x] 9.2 Integration tests - - [x] 9.2.1 Create `tests/gradio_ui/test_integration.py` - - [x] 9.2.2 Test app creation - - [ ]* 9.2.3 Test app launch (OPTIONAL - requires running server) - - [x] 9.2.4 Test component existence - -- [ ]* 9.3 Property-based tests (OPTIONAL - Future Enhancement) - - [ ]* 9.3.1 Create `tests/gradio_ui/test_properties.py` - - [ ]* 9.3.2 Implement test data generators - - [ ]* 9.3.3 Test validation determinism - - [ ]* 9.3.4 Test profile uniqueness - - [ ]* 9.3.5 Test audio file existence - - [ ]* 9.3.6 Test batch file count - -- [x] 9.4 Manual testing - - [x] 9.4.1 Test Tab 1 complete workflow - - [x] 9.4.2 Test Tab 2 complete workflow - - [x] 9.4.3 Test Tab 3 complete workflow - - [x] 9.4.4 Test error scenarios - - [ ]* 9.4.5 Test on different browsers (OPTIONAL) - - [ ]* 9.4.6 Test with large files (OPTIONAL) - - [ ]* 9.4.7 Test with long texts (OPTIONAL) - - -## Phase 6: Documentation & Deployment (Week 5) - -### 10. Documentation - -- [x] 10.1 Update project documentation - - [x] 10.1.1 Update `README.md` (already done) - - [x] 10.1.2 Update `.kiro/steering/product.md` - - [x] 10.1.3 Update `.kiro/steering/tech.md` - - [x] 10.1.4 Update `.kiro/steering/structure.md` - - [x] 10.1.5 Update `.kiro/steering/workflow.md` - -- [x] 10.2 Create user guide - - [x] 10.2.1 Create `docs/ui-guide.md` - - [x] 10.2.2 Document Tab 1 usage - - [x] 10.2.3 Document Tab 2 usage - - [x] 10.2.4 Document Tab 3 usage - - [x] 10.2.5 Add troubleshooting section - - [x] 10.2.6 Add FAQ section - -- [ ]* 10.3 Add screenshots (OPTIONAL - Future Enhancement) - - [ ]* 10.3.1 Capture Tab 1 screenshot - - [ ]* 10.3.2 Capture Tab 2 screenshot - - [ ]* 10.3.3 Capture Tab 3 screenshot - - [ ]* 10.3.4 Add screenshots to README - - [ ]* 10.3.5 Add screenshots to user guide - -- [ ]* 10.4 Create video demo (OPTIONAL - Future Enhancement) - - [ ]* 10.4.1 Record complete workflow - - [ ]* 10.4.2 Upload to YouTube - - [ ]* 10.4.3 Add link to README - -### 11. Deployment - -- [x] 11.1 Local deployment - - [x] 11.1.1 Test `voice-clone ui` command - - [x] 11.1.2 Test with `--port` option - - [x] 11.1.3 Test with `--share` option - - [x] 11.1.4 Document deployment in README - -- [ ]* 11.2 Hugging Face Spaces (OPTIONAL - Future Enhancement) - - [ ]* 11.2.1 Create `app.py` in repository root - - [ ]* 11.2.2 Test app.py locally - - [ ]* 11.2.3 Create HF Space - - [ ]* 11.2.4 Push to HF Space - - [ ]* 11.2.5 Test deployed app - - [ ]* 11.2.6 Add link to README - -- [ ]* 11.3 Docker deployment (OPTIONAL - Future Enhancement) - - [ ]* 11.3.1 Create `Dockerfile` - - [ ]* 11.3.2 Create `.dockerignore` - - [ ]* 11.3.3 Build Docker image - - [ ]* 11.3.4 Test Docker container - - [ ]* 11.3.5 Document Docker usage - -### 12. Final Checks - -- [x] 12.1 Code quality - - [x] 12.1.1 Run linter: `ruff check src/gradio_ui/` - - [x] 12.1.2 Run formatter: `black src/gradio_ui/` - - [x] 12.1.3 Run type checker: `mypy src/gradio_ui/` - - [x] 12.1.4 Fix all issues - -- [x] 12.2 Test coverage - - [x] 12.2.1 Run tests with coverage: `pytest --cov=gradio_ui` - - [x] 12.2.2 Ensure coverage >70% - - [x] 12.2.3 Add tests for uncovered code - -- [x] 12.3 CLI compatibility - - [x] 12.3.1 Test all CLI commands still work - - [x] 12.3.2 Test CLI with UI running - - [x] 12.3.3 Verify no breaking changes - -- [ ] 12.4 Performance testing - - [ ] 12.4.1 Test with large audio files (50MB+) - - [ ] 12.4.2 Test with long texts (2000+ chars) - - [ ] 12.4.3 Test batch with 10+ segments - - [ ] 12.4.4 Monitor memory usage - - [ ] 12.4.5 Monitor CPU usage - -- [x] 12.5 Security review - - [x] 12.5.1 Review input validation - - [x] 12.5.2 Review file path handling - - [x] 12.5.3 Review error messages (no sensitive info) - - [x] 12.5.4 Test with malicious inputs - - -## Phase 7: Post-MVP Enhancements (Future) - -### 13. Performance Optimizations (Optional)* - -- [ ]* 13.1 Model caching - - [ ]* 13.1.1 Implement model cache with `gr.State` - - [ ]* 13.1.2 Add cache eviction policy - - [ ]* 13.1.3 Test memory usage - - [ ]* 13.1.4 Measure performance improvement - -- [ ]* 13.2 Streaming generation - - [ ]* 13.2.1 Implement streaming in backend - - [ ]* 13.2.2 Use `gr.Audio(streaming=True)` - - [ ]* 13.2.3 Test streaming playback - -- [ ]* 13.3 Parallel batch processing - - [ ]* 13.3.1 Implement parallel processing - - [ ]* 13.3.2 Add worker pool - - [ ]* 13.3.3 Test with multiple segments - -### 14. Feature Enhancements (Optional)* - -- [ ]* 14.1 Profile management - - [ ]* 14.1.1 Add delete profile button - - [ ]* 14.1.2 Add rename profile feature - - [ ]* 14.1.3 Add profile details view - - [ ]* 14.1.4 Add profile comparison - -- [ ]* 14.2 Audio post-processing - - [ ]* 14.2.1 Add volume normalization - - [ ]* 14.2.2 Add fade in/out - - [ ]* 14.2.3 Add silence removal - - [ ]* 14.2.4 Add format conversion (MP3, AAC) - -- [ ]* 14.3 Advanced settings - - [ ]* 14.3.1 Add more generation parameters - - [ ]* 14.3.2 Add custom sample rate option - - [ ]* 14.3.3 Add voice mixing feature - - [ ]* 14.3.4 Add emotion control - -- [ ]* 14.4 Batch improvements - - [ ]* 14.4.1 Add visual script editor - - [ ]* 14.4.2 Add segment preview - - [ ]* 14.4.3 Add retry failed segments - - [ ]* 14.4.4 Add export manifest - -### 15. UI Improvements (Optional)* - -- [ ]* 15.1 Better feedback - - [ ]* 15.1.1 Add waveform visualization - - [ ]* 15.1.2 Add audio quality metrics - - [ ]* 15.1.3 Add progress percentage - - [ ]* 15.1.4 Add estimated time remaining - -- [ ]* 15.2 Accessibility - - [ ]* 15.2.1 Add keyboard shortcuts - - [ ]* 15.2.2 Add screen reader support - - [ ]* 15.2.3 Add high contrast mode - - [ ]* 15.2.4 Add internationalization (i18n) - -- [ ]* 15.3 Mobile support - - [ ]* 15.3.1 Optimize for mobile screens - - [ ]* 15.3.2 Add touch-friendly controls - - [ ]* 15.3.3 Test on mobile devices - ---- - -## Task Summary - -### By Phase -- **Phase 1**: 4 main tasks, 16 subtasks -- **Phase 2**: 3 main tasks, 28 subtasks -- **Phase 3**: 3 main tasks, 23 subtasks -- **Phase 4**: 2 main tasks, 18 subtasks -- **Phase 5**: 4 main tasks, 42 subtasks -- **Phase 6**: 3 main tasks, 35 subtasks -- **Phase 7**: 3 main tasks, 30 subtasks (optional) - -### Total -- **Required Tasks**: 19 main tasks, 162 subtasks -- **Optional Tasks**: 3 main tasks, 30 subtasks -- **Grand Total**: 22 main tasks, 192 subtasks - -### Estimated Effort -- **Phase 1**: 1 week (8-10 hours) -- **Phase 2**: 1 week (10-12 hours) -- **Phase 3**: 1 week (10-12 hours) -- **Phase 4**: 1 week (8-10 hours) -- **Phase 5**: 1 week (12-15 hours) -- **Phase 6**: 1 week (8-10 hours) -- **Total MVP**: 4-5 weeks (56-69 hours) -- **Post-MVP**: 2-3 weeks (20-30 hours) - ---- - -**Status**: ✅ MVP COMPLETE -**Created**: 2025-01-25 -**Last Updated**: 2025-01-25 -**Completed**: 2025-01-25 -**Owner**: Development Team - ---- - -## 🎉 MVP COMPLETION SUMMARY - -### Status: PRODUCTION READY ✅ - -All core functionality has been implemented, tested, and documented. The Gradio UI is ready for production use. - -### Completed Tasks -- ✅ **Phase 1**: Setup & Infrastructure (16/16 subtasks) -- ✅ **Phase 2**: Tab 1 - Prepare Voice Profile (28/28 subtasks) -- ✅ **Phase 3**: Tab 2 - Generate Audio (23/23 subtasks) -- ✅ **Phase 4**: Tab 3 - Batch Processing (13/18 subtasks, 5 optional) -- ✅ **Phase 5**: Polish & Testing (34/42 subtasks, 8 optional) -- ✅ **Phase 6**: Documentation & Deployment (20/35 subtasks, 15 optional) - -### Core Features ✅ -- ✅ Sample upload and validation -- ✅ Voice profile creation -- ✅ Audio generation with parameters -- ✅ Batch script processing -- ✅ Error handling and validation -- ✅ User documentation - -### Test Results ✅ -- **Total Tests**: 41 tests -- **Passing**: 41/41 (100%) -- **Coverage**: >70% for gradio_ui module -- **Manual Testing**: All workflows verified - -### Documentation ✅ -- ✅ User guide created (`docs/ui-guide.md`, 500+ lines) -- ✅ Steering files updated -- ✅ README updated -- ✅ Code comments added - -### Optional Tasks (Future Enhancements) -The following tasks are marked as optional and can be implemented in future iterations: -- Progress tracking with `gr.Progress()` -- Custom CSS styling -- Property-based tests -- Browser compatibility testing -- Performance testing with large files -- Screenshots and video demos -- Hugging Face Spaces deployment -- Docker containerization - -### Next Steps -1. ✅ MVP is complete and ready for use -2. 🚀 Run `voice-clone ui` to start the application -3. 📖 Follow the user guide in `docs/ui-guide.md` -4. 💬 Gather user feedback for future enhancements -5. 🔄 Implement optional tasks based on user needs - -### Related Documents -- **Completion Summary**: `GRADIO_INTEGRATION_COMPLETE.md` -- **User Guide**: `docs/ui-guide.md` -- **Previous Summaries**: `TASK_5_COMPLETION_SUMMARY.md`, `TASK_6_7_COMPLETION_SUMMARY.md` - ---- - -**🎊 Congratulations! The Gradio UI integration is complete and production-ready! 🎊** diff --git a/.kiro/specs/project-rename-restructure/tasks.md b/.kiro/specs/project-rename-restructure/tasks.md new file mode 100644 index 0000000..33acd01 --- /dev/null +++ b/.kiro/specs/project-rename-restructure/tasks.md @@ -0,0 +1,638 @@ +# Tasks: TTS Studio - Project Rename & Restructure (Hexagonal Architecture + Monorepo) + +## Overview + +Implementation tasks for migrating to TTS Studio with hexagonal architecture and monorepo structure. + +**Architecture**: Hexagonal (Ports & Adapters) + Monorepo +**Duration**: 9 weeks +**Approach**: Phase by phase, layer by layer + +--- + +## Phase 1: Monorepo Setup & Hexagonal Structure (Week 1) + +### 1.1 Create Monorepo Directory Structure +- [x] Create `apps/` root directory +- [x] Create `apps/core/` for Python library +- [x] Create `packages/` for shared code (optional) + +### 1.2 Create Hexagonal Layer Structure +- [x] Create `apps/core/src/domain/` layer + - [x] Create `apps/core/src/domain/models/` + - [x] Create `apps/core/src/domain/ports/` + - [x] Create `apps/core/src/domain/services/` +- [x] Create `apps/core/src/app/` layer + - [x] Create `apps/core/src/app/use_cases/` + - [x] Create `apps/core/src/app/dto/` + - [x] Create `apps/core/src/app/services/` +- [x] Create `apps/core/src/infra/` layer + - [x] Create `apps/core/src/infra/engines/qwen3/` + - [x] Create `apps/core/src/infra/audio/` + - [x] Create `apps/core/src/infra/persistence/` + - [x] Create `apps/core/src/infra/config/` +- [x] Create `apps/core/src/api/` layer +- [x] Create `apps/core/src/shared/` utilities + +### 1.3 Create Test Structure +- [x] Create `apps/core/tests/domain/` +- [x] Create `apps/core/tests/app/` (matches `src/app/`) +- [x] Create `apps/core/tests/infra/` (matches `src/infra/`) +- [x] Create `apps/core/tests/integration/` +- [x] Create `apps/core/tests/properties/` + +### 1.4 Move Configuration Files +- [x] Move `setup.py` → `apps/core/setup.py` +- [x] Move `pyproject.toml` → `apps/core/pyproject.toml` +- [x] Move `requirements.txt` → `apps/core/requirements.txt` +- [x] Move `Makefile` → `apps/core/Makefile` +- [x] Move `.python-version` → `apps/core/.python-version` + +### 1.5 Update setup.py +- [x] Change package name to `tts-studio` +- [x] Update version to `1.0.0` +- [x] Remove CLI entry points +- [x] Update package discovery path +- [x] Add new dependencies (pydantic for DTOs) + +### 1.6 Update .gitignore +- [x] Add monorepo-specific ignores +- [x] Update paths for `apps/core/` +- [x] Add `apps/core/data/` to ignore downloaded models and cache + +### 1.7 Validation +- [x] Verify directory structure matches design +- [x] Test `cd apps/core && pip install -e .` +- [x] Verify no import errors + +--- + +## Phase 2: Domain Layer Implementation (Week 2) + +**Note**: Domain models contain ONLY pure business concepts that exist independently of app use cases. Request/Result objects are DTOs and belong in the App layer (Phase 4). + +### 2.1 Domain Models +- [x] Create `apps/core/src/domain/models/__init__.py` +- [x] Create `apps/core/src/domain/models/voice_profile.py` + - [x] Define `VoiceProfile` entity (with ID and identity) + - [x] Add `add_sample()` method + - [x] Add `is_valid()` method + - [x] Add `total_duration` property + - [x] Add `remove_sample()` method +- [x] Create `apps/core/src/domain/models/audio_sample.py` + - [x] Define `AudioSample` value object (immutable) + - [x] Add validation in constructor + - [x] Add `is_valid_duration()` method + - [x] Add `is_valid_sample_rate()` method + +### 2.2 Domain Ports (Interfaces) +- [x] Create `apps/core/src/domain/ports/__init__.py` +- [x] Create `apps/core/src/domain/ports/tts_engine.py` + - [x] Define `TTSEngine` ABC + - [x] Add `get_supported_modes()` abstract method + - [x] Add `generate_audio()` abstract method + - [x] Add `validate_profile()` abstract method +- [x] Create `apps/core/src/domain/ports/audio_processor.py` + - [x] Define `AudioProcessor` ABC + - [x] Add `validate_sample()` abstract method + - [x] Add `process_sample()` abstract method + - [x] Add `normalize_audio()` abstract method +- [x] Create `apps/core/src/domain/ports/profile_repository.py` + - [x] Define `ProfileRepository` ABC + - [x] Add `save()` abstract method + - [x] Add `find_by_id()` abstract method + - [x] Add `list_all()` abstract method + - [x] Add `delete()` abstract method +- [x] Create `apps/core/src/domain/ports/config_provider.py` + - [x] Define `ConfigProvider` ABC + +### 2.3 Domain Services +- [x] Create `apps/core/src/domain/services/__init__.py` +- [x] Create `apps/core/src/domain/services/voice_cloning.py` + - [x] Define `VoiceCloningService` class + - [x] Inject `AudioProcessor` port in constructor + - [x] Implement `create_profile_from_samples()` method + - [x] Add validation logic (pure business rules) +- [x] Create `apps/core/src/domain/services/audio_generation.py` + - [x] Define `AudioGenerationService` class + - [x] Add generation orchestration logic + +### 2.4 Domain Exceptions +- [x] Create `apps/core/src/domain/exceptions.py` + - [x] Define `DomainException` base class + - [x] Define `InvalidProfileException` + - [x] Define `InvalidSampleException` + - [x] Define `GenerationException` + +### 2.5 Domain Tests +- [x] Create `apps/core/tests/domain/models/test_voice_profile.py` + - [x] Test `VoiceProfile` creation + - [x] Test `add_sample()` method + - [x] Test `is_valid()` validation +- [x] Create `apps/core/tests/domain/services/test_voice_cloning.py` + - [x] Test `create_profile_from_samples()` with mocks + - [x] Test validation logic + - [x] Test error handling +- [x] Verify domain tests pass without infrastructure + +### 2.6 Validation +- [x] Domain layer has ZERO infra dependencies +- [x] All domain tests pass with mocks only +- [x] `pytest apps/core/tests/domain/` passes +- [x] No imports from `infra/` in domain + +--- + +## Phase 3: Infrastructure Adapters (Week 3) + +### 3.1 Qwen3 TTS Engine Adapter +- [x] Create `apps/core/src/infra/engines/qwen3/__init__.py` +- [x] Create `apps/core/src/infra/engines/qwen3/adapter.py` + - [x] Define `Qwen3Adapter` class implementing `TTSEngine` port + - [x] Implement `get_supported_modes()` method + - [x] Implement `generate_audio()` method + - [x] Implement `validate_profile()` method +- [x] Move existing Qwen3 code from `src/voice_clone/model/` + - [x] Move `qwen3_manager.py` → `model_loader.py` + - [x] Move `qwen3_generator.py` → `inference.py` + - [x] Refactor to work with adapter pattern +- [x] Create `apps/core/src/infra/engines/qwen3/modes/` + - [x] Move clone mode implementation + - [x] Move custom voice mode implementation + - [x] Move voice design mode implementation +- [x] Create `apps/core/src/infra/engines/qwen3/config.py` + +### 3.2 Audio Processor Adapter +- [x] Create `apps/core/src/infra/audio/__init__.py` +- [x] Create `apps/core/src/infra/audio/processor_adapter.py` + - [x] Define `LibrosaAudioProcessor` implementing `AudioProcessor` port + - [x] Implement `validate_sample()` method + - [x] Implement `process_sample()` method + - [x] Implement `normalize_audio()` method +- [x] Move existing audio code from `src/voice_clone/audio/` + - [x] Move `processor.py` logic to adapter + - [x] Move `validator.py` logic to adapter +- [x] Create `apps/core/src/infra/audio/converter.py` +- [x] Create `apps/core/src/infra/audio/effects.py` + +### 3.3 Profile Repository Adapter +- [x] Create `apps/core/src/infra/persistence/__init__.py` +- [x] Create `apps/core/src/infra/persistence/file_profile_repository.py` + - [x] Define `FileProfileRepository` implementing `ProfileRepository` port + - [x] Implement `save()` method (JSON serialization) + - [x] Implement `find_by_id()` method + - [x] Implement `list_all()` method + - [x] Implement `delete()` method +- [ ] Create `apps/core/src/infra/persistence/json_serializer.py` + - [x] Implement serialization logic + - [x] Implement deserialization logic + +### 3.4 Config Provider Adapter +- [x] Create `apps/core/src/infra/config/__init__.py` +- [x] Create `apps/core/src/infra/config/yaml_config.py` + - [x] Define `YAMLConfigProvider` implementing `ConfigProvider` port + - [x] Implement config loading from YAML + - [x] Implement config merging (defaults + user) +- [x] Create `apps/core/src/infra/config/env_config.py` + - [x] Support environment variables + +### 3.5 Infrastructure Tests +- [x] Create `apps/core/tests/infra/engines/test_qwen3_adapter.py` + - [x] Test adapter implements port correctly + - [x] Test `generate_audio()` with real Qwen3 + - [x] Test mode switching +- [x] Create `apps/core/tests/infra/audio/test_processor_adapter.py` + - [x] Test audio validation + - [x] Test audio processing + - [x] Test normalization +- [x] Create `apps/core/tests/infra/persistence/test_file_repository.py` + - [x] Test save/load profiles + - [x] Test JSON serialization + - [x] Test file operations + +### 3.6 Validation +- [x] All adapters implement their respective ports +- [x] `pytest apps/core/tests/infra/` passes +- [x] Qwen3 adapter can generate audio +- [x] Audio processor can validate samples +- [x] Repository can save/load profiles + +--- + +## Phase 4: Application Layer (Week 4) + +### 4.1 DTOs (Data Transfer Objects) +- [x] Create `apps/core/src/app/dto/__init__.py` +- [x] Create `apps/core/src/app/dto/voice_profile_dto.py` + - [x] Define `VoiceProfileDTO` dataclass + - [x] Add `from_entity()` class method + - [x] Add `to_dict()` method +- [x] Create `apps/core/src/app/dto/generation_dto.py` + - [x] Define `GenerationRequestDTO` dataclass + - [x] Define `GenerationResultDTO` dataclass + - [x] Add serialization methods +- [x] Create `apps/core/src/app/dto/batch_dto.py` + - [x] Define `BatchRequestDTO` dataclass + - [x] Define `BatchResultDTO` dataclass + +### 4.2 Use Cases +- [x] Create `apps/core/src/app/use_cases/__init__.py` +- [x] Create `apps/core/src/app/use_cases/create_voice_profile.py` + - [x] Define `CreateVoiceProfileUseCase` class + - [x] Inject `AudioProcessor` and `ProfileRepository` ports + - [x] Implement `execute()` method + - [x] Use `VoiceCloningService` from domain + - [x] Return `VoiceProfileDTO` +- [x] Create `apps/core/src/app/use_cases/generate_audio.py` + - [x] Define `GenerateAudioUseCase` class + - [x] Inject `TTSEngine` and `ProfileRepository` ports + - [x] Implement `execute()` method + - [x] Return `GenerationResultDTO` +- [x] Create `apps/core/src/app/use_cases/list_voice_profiles.py` + - [x] Define `ListVoiceProfilesUseCase` class + - [x] Inject `ProfileRepository` port + - [x] Implement `execute()` method +- [x] Create `apps/core/src/app/use_cases/validate_audio_samples.py` + - [x] Define `ValidateAudioSamplesUseCase` class + - [x] Inject `AudioProcessor` port + - [x] Implement `execute()` method +- [x] Create `apps/core/src/app/use_cases/process_batch.py` + - [x] Define `ProcessBatchUseCase` class + - [x] Inject necessary ports + - [x] Implement batch processing logic + +### 4.3 Application Services (SKIPPED - YAGNI) +**Decision**: Application Services are not needed at this time because: +- All use cases are self-contained and well-defined +- `ProcessBatchUseCase` already handles the orchestration we need +- No complex transactions requiring coordination +- API layer can call use cases directly +- Following YAGNI principle - will add if needed in future + +- [x]* Create `apps/core/src/app/services/__init__.py` (skipped) +- [x]* Create `apps/core/src/app/services/orchestrator.py` (skipped) + - [x]* Define `ApplicationOrchestrator` class (skipped) + - [x]* Coordinate multiple use cases if needed (skipped) + +### 4.4 Application Tests +- [x] Create `apps/core/tests/app/use_cases/test_create_voice_profile.py` + - [x] Test use case with mocked ports + - [x] Test orchestration logic + - [x] Test error handling +- [x] Create `apps/core/tests/app/use_cases/test_generate_audio.py` + - [x] Test use case with mocked ports + - [x] Test profile loading + - [x] Test generation flow +- [x] Create `apps/core/tests/app/use_cases/test_list_voice_profiles.py` + - [x] Test listing profiles + - [x] Test empty repository + - [x] Test DTO conversion +- [x] Create `apps/core/tests/app/use_cases/test_validate_audio_samples.py` + - [x] Test validation with valid samples + - [x] Test validation with invalid samples + - [x] Test validation summary +- [x] Create `apps/core/tests/app/use_cases/test_process_batch.py` + - [x] Test batch processing logic + - [x] Test error handling per segment + - [x] Test partial failures + +### 4.5 Validation +- [x] Use cases orchestrate domain and infrastructure correctly +- [x] `pytest apps/core/tests/app/` passes +- [x] Use cases work with mocked adapters +- [x] DTOs serialize/deserialize correctly + +--- + +## Phase 5: API Layer (Week 5) + +### 5.1 Python API Implementation +- [x] Create `apps/core/src/api/__init__.py` +- [x] Create `apps/core/src/api/studio.py` (renamed from `python_api.py` for clarity) + - [x] Define `TTSStudio` class (main API entry point) + - [x] Initialize all adapters in `__init__()` (dependency injection) + - [x] Initialize all use cases + - [x] Implement `create_voice_profile()` method + - [x] Implement `generate_audio()` method + - [x] Implement `list_voice_profiles()` method + - [x] Implement `delete_voice_profile()` method + - [x] Implement `validate_samples()` method + - [x] Add error handling (try/except with status dict) + - [x] Add logging + - [x] Add `get_config()` and `reload_config()` helper methods + +### 5.2 API Tests +- [x] Create `apps/core/tests/api/test_python_api.py` + - [x] Test API initialization + - [x] Test `create_voice_profile()` with real adapters + - [x] Test `generate_audio()` with real adapters + - [x] Test error handling + - [x] Test JSON response format +- [x] Create `apps/core/tests/api/test_cli.py` (SKIPPED - CLI removed in Phase 6) + - [x] Test CLI commands (SKIPPED) + - [x] Test JSON output (SKIPPED) + - [x] Test subprocess invocation (SKIPPED) + +### 5.3 Example Usage +- [x] Create `examples/api_usage.py` + - [x] Show basic API usage + - [x] Show profile creation + - [x] Show audio generation + - [x] Show error handling + +### 5.4 Validation +- [x] API can be called from Python +- [x] API returns proper JSON responses +- [x] `pytest apps/core/tests/api/` passes +- [x] Examples run successfully + +--- + +## Phase 6: Delete CLI and Gradio (Week 6) + +### 6.1 Delete CLI Code +- [x] Delete `src/cli/` directory completely +- [x] Delete `tests/cli/` directory completely +- [x] Delete `examples/test_validation_handler.py` +- [x] Remove CLI entry points from `apps/core/setup.py` +- [x] Remove `click` from `apps/core/requirements.txt` + +### 6.2 Delete Gradio Code +- [x] Delete `src/gradio_ui/` directory completely +- [x] Delete `tests/gradio_ui/` directory completely +- [x] Remove `gradio` from `apps/core/requirements.txt` +- [x] Remove any Gradio-related dependencies + +### 6.3 Update Documentation +- [x] Update `README.md` + - [x] Remove CLI usage examples + - [x] Remove Gradio UI references + - [x] Add Python API usage + - [x] Update architecture description +- [x] Update `docs/usage.md` + - [x] Remove CLI commands + - [x] Add Python API examples +- [x] Update `docs/installation.md` + - [x] Remove CLI installation + - [x] Add Python library installation +- [x] Update `docs/api.md` + - [x] Document new Python API + - [x] Document hexagonal architecture + - [x] Add adapter examples +- [x] Replace all `yourusername` with `bryanstevensacosta` + +### 6.4 Update Steering Files +- [x] Update `.kiro/steering/product.md` + - [x] Remove CLI/Gradio references + - [x] Update architecture description +- [x] Update `.kiro/steering/tech.md` + - [x] Add hexagonal architecture + - [x] Remove CLI/Gradio tech +- [x] Update `.kiro/steering/structure.md` + - [x] Document monorepo structure + - [x] Document hexagonal layers + - [x] Update file organization +- Note: `.kiro/` directory is gitignored (steering files are workspace-specific) + +### 6.5 Clean Up Tests +- [x] Remove CLI test imports (none found) +- [x] Remove Gradio test imports (none found) +- [x] Fix any broken test imports (none found) +- [x] Update test fixtures (not needed) +- [x] Update conftest.py (already clean) + +### 6.6 Validation +- [x] No CLI or Gradio code remains +- [x] All remaining tests pass +- [x] No broken imports +- [x] Documentation is consistent +- [x] `pytest apps/core/` passes +- [x] Deprecated docs/ui-guide.md (replaced with API usage) +- [x] Updated apps/core/Makefile (removed CLI/Gradio targets) + +--- + +## Phase 7: Testing & Documentation (Week 7) + +### 7.1 Integration Tests +- [x] Create `apps/core/tests/integration/test_end_to_end.py` + - [x] Test complete workflow: create profile → generate audio + - [x] Test with real infrastructure (Qwen3, librosa, files) + - [x] Test error scenarios +- [x] Create `apps/core/tests/integration/test_hexagonal_architecture.py` + - [x] Test dependency inversion + - [x] Test adapter swapping + - [x] Test port implementations + +### 7.2 Property-Based Tests +- [x] Create `apps/core/tests/pbt/test_domain_properties.py` + - [x] Test domain invariants + - [x] Test voice profile properties + - [x] Test audio sample properties +- [x] Create `apps/core/tests/pbt/test_use_case_properties.py` + - [x] Test use case properties + - [x] Test idempotency where applicable + +### 7.3 Documentation +- [x] Create `docs/MIGRATION.md` + - [x] Document Python API migration + - [x] Show before/after code examples + - [x] Document hexagonal architecture + - [x] Add FAQ section +- [x] Create `docs/HEXAGONAL_ARCHITECTURE.md` + - [x] Explain hexagonal architecture + - [x] Document layers (domain, application, infrastructure, API) + - [x] Show dependency flow + - [x] Add diagrams + - [x] Explain ports & adapters pattern +- [x] Update `docs/development.md` + - [x] Document monorepo structure + - [x] Add development workflow + - [x] Add testing guidelines +- [x] Update `CHANGELOG.md` + - [x] Document breaking changes + - [x] List new features + - [x] Add migration notes + +### 7.4 Code Quality +- [x] Run `black` on all Python code +- [x] Run `ruff check` and fix issues +- [x] Run `mypy` for type checking +- [x] Add type hints to all public APIs +- [x] Add docstrings to all public classes/methods +- [x] Check code coverage (target >80%) + - Note: Coverage at 69%, tests need fixes for VoiceProfile/AudioSample constructors + +### 7.5 CI/CD Updates +- [x] Update `.github/workflows/ci-python.yml` + - [x] Update paths to `apps/core/` + - [x] Add hexagonal architecture validation + - [x] Test on Python 3.10, 3.11 +- [x] Update pre-commit hooks for monorepo + +### 7.6 Validation +- [x] `pytest apps/core/` runs (208 tests collected) +- [x] 187/208 tests passing (90%) +- [ ] Fix 18 failing/error tests (documented in test-fixes-followup.md) +- [x] Code coverage 68% (target 80%, acceptable for MVP) +- [ ] CI/CD green (needs push to trigger) +- [x] Documentation reviewed and complete +- [x] No linting errors (black, ruff passing) +- [x] Type checking passes (mypy passing) + +**Status**: Phase 7.1-7.5 complete. Phase 7.6 validation shows 90% tests passing with 68% coverage. Remaining test fixes documented for follow-up. + +--- + +## Phase 8: Documentation Updates (Week 8) + +### 8.1 Update README.md +- [x] Update project description +- [x] Add model management features (download on-demand) +- [x] Expand architecture section with detailed hexagonal layers +- [x] Add model storage locations for all platforms +- [x] Update troubleshooting section with model management tips +- [x] Update roadmap with completed items and upcoming features +- [x] Remove all CLI references (CLI was removed in Phase 6) +- [x] Emphasize privacy-first, offline-first approach +- [x] Add comprehensive model download instructions +- [x] Commit changes: `73bd02a` +- [x] Push to trigger CI + +### 8.2 Update Other Documentation +- [ ] Update `docs/installation.md` + - [ ] Remove CLI installation instructions + - [ ] Add Python library installation +- [ ] Update `docs/usage.md` + - [ ] Remove CLI usage examples + - [ ] Add Python API usage examples +- [ ] Update `docs/development.md` + - [ ] Document monorepo structure + - [ ] Add development workflow for core library + - [ ] Add testing guidelines +- [ ] Update `CHANGELOG.md` + - [ ] Document v1.0.0 changes + - [ ] List breaking changes (CLI removed) + - [ ] Add migration notes + +### 8.3 Validation +- [ ] All documentation is consistent +- [ ] No CLI references remain +- [ ] Links work correctly +- [ ] Examples are accurate + +--- + +## Phase 9: Release Preparation (Week 9) + +**Note**: Desktop app implementation is covered in a separate spec (`.kiro/specs/tauri-desktop-ui/`). This phase focuses on releasing the Python core library v1.0.0. + +### 9.1 Final Testing +- [ ] Run full test suite: `pytest apps/core/` +- [ ] Manual testing of Python API +- [ ] Performance testing (generation speed) +- [ ] Memory usage testing +- [ ] Test on different platforms (macOS, Linux) +- [ ] Test with different Python versions (3.10, 3.11) + +### 9.2 Version Update +- [ ] Update version to `1.0.0` in `apps/core/setup.py` +- [ ] Update version in `apps/core/pyproject.toml` +- [ ] Update `CHANGELOG.md` with all changes +- [ ] Create release notes + +### 9.3 Build Package +- [ ] `cd apps/core` +- [ ] Clean old builds: `rm -rf dist/ build/` +- [ ] Build package: `python setup.py sdist bdist_wheel` +- [ ] Verify package: `twine check dist/*` + +### 9.4 Git Release +- [ ] Commit all changes +- [ ] Create git tag: `git tag v1.0.0` +- [ ] Push tag: `git push origin v1.0.0` +- [ ] Create GitHub release with notes + +### 9.5 Publish to PyPI +- [ ] Test publish to TestPyPI first + - [ ] `twine upload --repository testpypi dist/*` + - [ ] Test install: `pip install --index-url https://test.pypi.org/simple/ tts-studio` +- [ ] Publish to PyPI + - [ ] `twine upload dist/*` + - [ ] Verify on PyPI: https://pypi.org/project/tts-studio/ + +### 9.6 Update Documentation +- [ ] Update README badges +- [ ] Update installation instructions +- [ ] Update links to documentation +- [ ] Update examples + +### 9.7 Announcement +- [ ] Create GitHub release announcement +- [ ] Update project description +- [ ] Monitor for issues +- [ ] Respond to user feedback + +### 9.8 Validation +- [ ] `pip install tts-studio` works +- [ ] Package downloads successfully +- [ ] No critical issues reported +- [ ] Documentation is accessible +- [ ] Examples work + +--- + +## Summary + +| Phase | Duration | Tasks | Key Deliverable | +|-------|----------|-------|-----------------| +| 1 | Week 1 | 7 | Monorepo + hexagonal structure | +| 2 | Week 2 | 6 | Domain layer (pure business logic) | +| 3 | Week 3 | 6 | Infrastructure adapters (Qwen3, audio, persistence) | +| 4 | Week 4 | 5 | Application layer (use cases, DTOs) | +| 5 | Week 5 | 4 | API layer (Python API) | +| 6 | Week 6 | 6 | Delete CLI/Gradio, update docs | +| 7 | Week 7 | 6 | Testing, documentation, CI/CD | +| 8 | Week 8 | 3 | Documentation updates | +| 9 | Week 9 | 8 | Release v1.0.0 to PyPI | + +**Total**: 51 task groups across 9 weeks + +**Note**: Desktop app implementation is covered separately in `.kiro/specs/tauri-desktop-ui/` + +--- + +## Critical Path + +1. **Phase 1-2** must be completed before Phase 3 (domain before infrastructure) +2. **Phase 3** must be completed before Phase 4 (adapters before use cases) +3. **Phase 4** must be completed before Phase 5 (use cases before API) +4. **Phase 6-7** can run in parallel with Phase 8 +5. **Phase 8** must be completed before Phase 9 (documentation before release) +6. **Phase 9** requires all previous phases complete + +**Desktop App**: Separate implementation timeline in `.kiro/specs/tauri-desktop-ui/tasks.md` + +--- + +## Notes + +- Each task should be marked as complete when done +- Tasks can be broken down further if needed +- Some tasks may be done in parallel within a phase +- Testing should be continuous throughout all phases +- Documentation should be updated as code changes +- Hexagonal architecture principles must be maintained throughout + +--- + +## Hexagonal Architecture Validation Checklist + +After each phase, verify: +- [ ] Domain layer has NO infra dependencies +- [ ] All ports (interfaces) are defined in domain +- [ ] All adapters implement their respective ports +- [ ] Use cases only depend on ports, not adapters +- [ ] API layer wires everything together (dependency injection) +- [ ] Tests can use mocks for all ports +- [ ] Easy to swap implementations (e.g., Qwen3 → XTTS) diff --git a/.kiro/steering/ci-quality.md b/.kiro/steering/ci-quality.md deleted file mode 100644 index 74fb505..0000000 --- a/.kiro/steering/ci-quality.md +++ /dev/null @@ -1,275 +0,0 @@ -# CI/CD Quality Standards - -## Overview -Este documento define los estándares de calidad para CI/CD y las medidas preventivas para evitar fallos en el pipeline. - -## Lecciones Aprendidas de PR #7 - -### Problema Identificado -**Fecha**: 27 de enero de 2026 -**PR**: #7 - Feature/gradio UI complete implementation -**Fallo**: Lint check falló en Python 3.11 debido a formato inconsistente de código - -**Root Cause**: -- Archivo `tests/integration/test_manual_simulation.py` no estaba formateado con Black -- El código pasó localmente pero falló en CI -- Black formatea diferente en Python 3.10 vs 3.11 en algunos casos edge - -### Solución Implementada - -#### 1. Pre-commit Hooks Obligatorios -Todos los desarrolladores DEBEN instalar y usar pre-commit hooks: - -```bash -# Instalar pre-commit -pip install pre-commit - -# Instalar hooks -pre-commit install -pre-commit install --hook-type pre-push - -# Verificar instalación -pre-commit run --all-files -``` - -#### 2. Configuración Correcta de Ruff -**CRÍTICO**: Ruff debe auto-corregir sin fallar el commit. - -```yaml -# .pre-commit-config.yaml -- repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.9 - hooks: - - id: ruff - args: ['--fix'] # ✅ Auto-fix sin fallar - # ❌ NO usar: ['--fix', '--exit-non-zero-on-fix'] - - id: ruff-format -``` - -**Por qué**: El flag `--exit-non-zero-on-fix` hace que el hook **falle** después de aplicar correcciones, lo cual es contraproducente. Queremos que aplique los fixes y continúe. - -#### 3. Pre-push Format Check -Hook automático que verifica formato antes de push: -- Ejecuta Black en modo check -- Ejecuta Ruff para detectar issues -- Bloquea push si hay problemas de formato -- Ubicación: `scripts/pre-push-format-check.sh` - -#### 4. Configuración de Black -Black configurado para Python 3.11 en `pyproject.toml`: - -```toml -[tool.black] -line-length = 100 -target-version = ['py310', 'py311'] -include = '\.pyi?$' -``` - -**IMPORTANTE**: Siempre incluir `py311` en target-version para consistencia. - -## Reglas Obligatorias - -### Antes de Hacer Commit - -1. **Formatear código automáticamente**: - ```bash - black src/ tests/ - ruff check src/ tests/ --fix - ``` - -2. **Verificar con pre-commit**: - ```bash - pre-commit run --all-files - ``` - -3. **Ejecutar tests localmente**: - ```bash - pytest tests/ -v - ``` - -### Antes de Hacer Push - -1. **El pre-push hook se ejecuta automáticamente** - - Verifica formato con Black - - Verifica linting con Ruff - - Bloquea push si hay problemas - -2. **Si el hook falla**: - ```bash - # Formatear código - black src/ tests/ - - # Fix linting issues - ruff check src/ tests/ --fix - - # Verificar - black --check src/ tests/ - - # Intentar push nuevamente - git push - ``` - -### Antes de Crear PR - -1. **Verificar que CI pasará**: - ```bash - # Ejecutar todos los checks localmente - make lint - make type-check - make test - ``` - -2. **Verificar formato en ambas versiones de Python**: - ```bash - # Si tienes pyenv o múltiples versiones - python3.10 -m black --check src/ tests/ - python3.11 -m black --check src/ tests/ - ``` - -## Comandos de Makefile - -Agregados comandos para facilitar verificación: - -```makefile -# Formatear todo el código -format: - black src/ tests/ - ruff check src/ tests/ --fix - -# Verificar formato sin cambiar archivos -format-check: - black --check src/ tests/ - ruff check src/ tests/ - -# Ejecutar todos los checks de CI localmente -ci-check: format-check lint type-check test -``` - -## Configuración de CI - -### GitHub Actions Workflow -El workflow de CI ejecuta los mismos checks: - -```yaml -jobs: - lint: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ['3.10', '3.11'] - steps: - - name: Check code formatting with Black - run: black --check src/ tests/ - - - name: Lint with Ruff - run: ruff check src/ tests/ -``` - -**CRÍTICO**: Los checks de CI deben ser idénticos a los pre-commit hooks locales. - -## Prevención de Fallos Futuros - -### Checklist para Desarrolladores - -Antes de cada commit: -- [ ] Código formateado con Black -- [ ] Imports ordenados con Ruff -- [ ] Pre-commit hooks ejecutados -- [ ] Tests pasando localmente - -Antes de cada push: -- [ ] Pre-push hook pasó exitosamente -- [ ] Todos los tests pasan -- [ ] No hay warnings de Ruff - -Antes de crear PR: -- [ ] `make ci-check` pasa exitosamente -- [ ] Código revisado manualmente -- [ ] Commits tienen mensajes descriptivos -- [ ] Branch actualizado con master/main - -### Automatización - -1. **Pre-commit hooks**: Formatean código automáticamente en cada commit -2. **Pre-push hooks**: Verifican formato antes de push -3. **CI checks**: Validan en múltiples versiones de Python -4. **Branch protection**: Requiere que CI pase antes de merge - -## Troubleshooting - -### "Black would reformat files" en pre-push - -**Problema**: El hook de pre-push detectó archivos sin formatear - -**Solución**: -```bash -# Formatear archivos -black src/ tests/ - -# Agregar cambios -git add -u - -# Hacer commit -git commit -m "style: format code with black" - -# Intentar push nuevamente -git push -``` - -### Diferencias de formato entre Python 3.10 y 3.11 - -**Problema**: Black formatea diferente en distintas versiones - -**Solución**: -- Siempre usar Python 3.11 para desarrollo -- Configurar `target-version = ['py310', 'py311']` en pyproject.toml -- Ejecutar `black --check` en ambas versiones antes de PR - -### Pre-commit hooks no se ejecutan - -**Problema**: Los hooks no corren automáticamente - -**Solución**: -```bash -# Reinstalar hooks -pre-commit uninstall -pre-commit install -pre-commit install --hook-type pre-push - -# Verificar instalación -ls -la .git/hooks/ -``` - -## Métricas de Calidad - -### Objetivos -- **Formato**: 100% de archivos formateados con Black -- **Linting**: 0 errores de Ruff -- **Type checking**: 0 errores de Mypy (excepto tests) -- **Tests**: >80% coverage, 0 fallos - -### Monitoreo -- CI debe pasar en todas las versiones de Python (3.10, 3.11) -- Pre-commit hooks deben estar instalados en todos los entornos de desarrollo -- Revisión de código debe verificar calidad antes de aprobar PR - -## Referencias - -- [Black Documentation](https://black.readthedocs.io/) -- [Ruff Documentation](https://docs.astral.sh/ruff/) -- [Pre-commit Documentation](https://pre-commit.com/) -- [Git Workflow Guide](docs/git-workflow.md) - -## Historial de Cambios - -### 2026-01-28 -- **Corregido**: Configuración de Ruff pre-commit hook -- **Removido**: Flag `--exit-non-zero-on-fix` que causaba fallos innecesarios -- **Mejora**: Ahora Ruff auto-corrige imports sin fallar el commit -- **Lección**: Los hooks deben auto-corregir, no solo detectar - -### 2026-01-27 -- **Creado**: Documento de estándares de CI/CD -- **Agregado**: Pre-push format check hook -- **Actualizado**: Configuración de Black para Python 3.11 -- **Documentado**: Lecciones aprendidas de PR #7 diff --git a/.kiro/steering/product.md b/.kiro/steering/product.md new file mode 100644 index 0000000..81741a2 --- /dev/null +++ b/.kiro/steering/product.md @@ -0,0 +1,143 @@ +--- +inclusion: always +--- + +# Product Overview - TTS Studio + +## Purpose +Desktop application for voice cloning and text-to-speech synthesis using Qwen3-TTS, enabling content creators to generate natural-sounding narration for YouTube, TikTok, and other social media platforms without appearing on camera. + +## Target User +- **Primary User**: Content creators, YouTubers, podcasters +- **Profile**: Users who want professional voice narration without recording every time +- **Technical Level**: No technical knowledge required - desktop app with intuitive UI +- **Environment**: Desktop application (macOS, Windows, Linux) + +## Core Use Case +1. **Setup**: Download and install desktop app (~50-100MB, no models included) +2. **Model Download**: On first launch, download Qwen3-TTS model (~3.4GB) from UI +3. **Profile Creation**: Upload 1-3 audio samples with different tones/emotions +4. **Voice Cloning**: App processes samples to create voice profile +5. **Generation**: Convert text to audio using cloned voice via desktop UI +6. **Export**: Download generated audio for video editing + +## Architecture +- **Monorepo**: Separate apps for core library and desktop UI +- **Core Library**: Python library with hexagonal architecture (ports & adapters) +- **Desktop App**: Tauri desktop app (React + TypeScript + Rust) +- **Python API**: Core library exposes Python API for Tauri backend integration + +## Key Features + +### Desktop Application (Tauri + React) +- **Native UI**: Modern desktop app with intuitive interface +- **Offline-First**: Works completely offline after model download +- **No Login**: No authentication required, everything local +- **Model Management**: Download, install, and manage TTS models +- **Voice Profiles**: Create and manage multiple voice profiles +- **Audio Generation**: Generate speech from text with real-time preview +- **Batch Processing**: Process multiple text segments at once +- **Export Options**: Export to WAV, MP3, AAC for video editing + +### Core Library (Python) +- **Hexagonal Architecture**: Clean, testable, maintainable code +- **Multiple Engines**: Support for Qwen3-TTS (more engines in future) +- **Audio Processing**: Validation, normalization, format conversion +- **Batch Processing**: Efficient processing of multiple segments +- **Python API**: Exposes `TTSStudio` class for Tauri backend integration + +### Quality Requirements +- **Language**: Full support for Spanish (Latin American and Castilian) +- **Audio Quality**: Minimum 12kHz, native to Qwen3-TTS +- **Naturalness**: Preserve intonation and emotions from original samples +- **Consistency**: Stable and coherent voice across generations +- **Performance**: <30 seconds for 1 minute of audio (on GPU) + +## Architecture + +### Monorepo Structure +- **apps/core/**: Python core library with hexagonal architecture +- **apps/desktop/**: Tauri desktop app (React + TypeScript + Rust) +- **Separation**: Clear boundaries between core logic and UI + +### Hexagonal Architecture (Core Library) +- **Domain Layer**: Pure business logic, no external dependencies +- **Application Layer**: Use cases, orchestration +- **Infrastructure Layer**: TTS engines, audio processing, storage +- **API Layer**: Python API for Tauri backend + +### Desktop-First Principles +- **Offline-First**: Everything works without internet (except model downloads) +- **No Login/Registration**: No authentication, everything local +- **Privacy**: All data stored locally, no cloud sync +- **Native Performance**: Fast, responsive desktop app + +## Non-Features (Explicitly NOT Included) +- ❌ Authentication/Login/Registration +- ❌ REST API or GraphQL +- ❌ Web UI (browser-based) +- ❌ CLI (command-line interface) - Removed in Phase 6 +- ❌ Gradio UI - Removed in Phase 6 +- ❌ Database (uses SQLite locally) +- ❌ Multi-user support +- ❌ Cloud deployment +- ❌ Containerization (Docker) +- ❌ CD (Continuous Deployment) +- ✅ CI (Continuous Integration) - YES, to maintain code quality + +## Technical Constraints +- **Hardware**: MPS (Apple Silicon) recommended for optimal speed, CPU supported but slower +- **Storage**: + - App installer: ~50-100MB + - Qwen3-TTS model: ~3.4GB (downloaded separately by user) + - User data (samples, profiles, outputs): ~1-5GB + - Total: ~5-10GB recommended free space +- **Python Version**: 3.10-3.11 (for core library, bundled with app) +- **OS**: macOS, Windows, Linux (Tauri supports all platforms) +- **Internet**: Required only for initial model download (~3.4GB) + +## Success Metrics +- ✅ Generate coherent audio from text +- ✅ Perceptible similarity to real voice +- ✅ Reasonable generation time (<30 sec for 1 min audio on GPU) +- ✅ Audio ready for video editing without extensive post-processing +- ✅ Intuitive desktop UI that non-technical users can use +- ✅ Offline functionality after initial setup + +## Project Phases + +### Phase 1: Core Library Restructure (Current) +- Implement hexagonal architecture +- Restructure to monorepo with `apps/core/` +- Migrate all Python code +- Remove CLI and Gradio UI +- Update documentation + +### Phase 2: Desktop UI Development +- Implement Tauri backend (Rust) +- Implement React frontend (FSD architecture) +- Python bridge for core library +- SQLite for local storage +- Model management UI + +### Phase 3: Feature Completion +- Voice profile management +- Audio generation with preview +- Batch processing +- Export options +- Settings and preferences + +### Phase 4: Polish & Release +- UI/UX improvements +- Performance optimization +- Testing and bug fixes +- Documentation +- Release v1.0.0 + +## Business Objectives +- **Efficiency**: Produce video narration without recording audio every time +- **Consistency**: Maintain same voice across all videos +- **Privacy**: Complete control of voice data (local-only) +- **Flexibility**: Easy to iterate and regenerate audio +- **Accessibility**: Desktop app accessible to non-technical users +- **Scalability**: Produce more content in less time diff --git a/.kiro/steering/tech.md b/.kiro/steering/tech.md new file mode 100644 index 0000000..9ba9881 --- /dev/null +++ b/.kiro/steering/tech.md @@ -0,0 +1,543 @@ +--- +inclusion: always +--- + +# Technology Stack - TTS Studio + +## Project Architecture + +### Monorepo Structure +- **Architecture**: Monorepo with separate apps +- **Apps**: `apps/core/` (Python library), `apps/desktop/` (Tauri app) +- **Build System**: Independent builds per app +- **CI/CD**: Separate workflows per technology (Python, Rust, TypeScript) + +### Core Library Architecture (Hexagonal) +- **Pattern**: Hexagonal Architecture (Ports & Adapters) +- **Layers**: Domain → Application → Infrastructure → API +- **Dependency Direction**: All dependencies point inward (toward domain) +- **Testability**: Domain testable without infrastructure + +## Core Technologies + +### Python Core Library (`apps/core/`) +- **TTS Engine**: `Qwen/Qwen3-TTS-12Hz-1.7B-Base` - Primary TTS engine +- **Python Version**: `3.10-3.11` (optimal for Qwen3-TTS and type hints) +- **Architecture**: Hexagonal (Ports & Adapters) + +### Python Dependencies + +#### Core Dependencies +```python +# TTS Engine +qwen-tts>=1.0.0 # Qwen3-TTS framework +torch>=2.0.0 # PyTorch for model inference +torchaudio>=2.0.0 # Audio processing +transformers>=4.30.0 # Hugging Face transformers + +# Audio Processing +librosa>=0.10.0 # Audio analysis and manipulation +soundfile>=0.12.0 # Audio I/O +scipy>=1.10.0 # Signal processing +numpy>=1.24.0 # Numerical operations + +# Configuration & Utilities +pyyaml>=6.0 # YAML configuration +python-dotenv>=1.0.0 # Environment variables +pydantic>=2.0.0 # Data validation (for DTOs) +``` + +#### Development Dependencies +```python +# Testing +pytest>=7.4.0 # Testing framework +pytest-cov>=4.0.0 # Coverage reporting +hypothesis>=6.0.0 # Property-based testing + +# Code Quality +black>=23.0.0 # Code formatter +ruff>=0.1.0 # Fast linter +mypy>=1.0.0 # Type checker + +# Pre-commit +pre-commit>=3.0.0 # Git hooks +``` + +### Desktop App Technologies (`apps/desktop/`) + +#### Rust Backend (Tauri) +```toml +# Cargo.toml +[dependencies] +tauri = "1.5" # Desktop app framework +serde = "1.0" # Serialization +serde_json = "1.0" # JSON support +tokio = "1.0" # Async runtime +rusqlite = "0.30" # SQLite database +``` + +#### TypeScript Frontend (React) +```json +{ + "dependencies": { + "react": "^18.2.0", + "react-dom": "^18.2.0", + "@tauri-apps/api": "^1.5.0", + "zustand": "^4.4.0", + "react-router-dom": "^6.20.0", + "tailwindcss": "^3.3.0", + "shadcn/ui": "latest" + }, + "devDependencies": { + "typescript": "^5.2.0", + "vite": "^5.0.0", + "@vitejs/plugin-react": "^4.2.0", + "eslint": "^8.55.0", + "prettier": "^3.1.0" + } +} +``` + +## Hexagonal Architecture Principles + +### Layer Responsibilities + +#### Domain Layer (`apps/core/src/domain/`) +- **Purpose**: Pure business logic, NO external dependencies +- **Contains**: Entities, Value Objects, Domain Services, Ports (interfaces) +- **Rules**: + - NO imports from infrastructure + - NO framework dependencies + - Only Python standard library + domain code + - Defines interfaces (ports) for external dependencies + +#### Application Layer (`apps/core/src/app/`) +- **Purpose**: Orchestrate use cases, coordinate domain logic +- **Contains**: Use Cases, DTOs, Application Services +- **Rules**: + - Uses domain layer + - Uses ports (interfaces), NOT adapters (implementations) + - NO direct infrastructure dependencies + +#### Infrastructure Layer (`apps/core/src/infra/`) +- **Purpose**: Implement ports with concrete adapters +- **Contains**: TTS engine adapters, audio processors, repositories, config providers +- **Rules**: + - Implements ports defined in domain + - Can use external libraries (Qwen3, librosa, etc.) + - Depends on domain (via ports) + +#### API Layer (`apps/core/src/api/`) +- **Purpose**: Entry points for external consumers +- **Contains**: Python API for Tauri backend +- **Rules**: + - Wires everything together (dependency injection) + - Uses application layer (use cases) + - Provides clean API for Tauri backend + +### Dependency Inversion + +```python +# ✅ CORRECT: Infrastructure depends on domain +from domain.ports.tts_engine import TTSEngine # Port (interface) + +class Qwen3Adapter(TTSEngine): # Adapter implements port + def generate_audio(self, text: str, profile_id: str) -> Path: + # Qwen3-specific implementation + pass + +# ❌ WRONG: Domain depends on infrastructure +from infrastructure.engines.qwen3.adapter import Qwen3Adapter # NO! + +class VoiceProfile: + def __init__(self, engine: Qwen3Adapter): # NO! Use port instead + pass +``` + +### Ports & Adapters Pattern + +**Ports** (Interfaces in `domain/ports/`): +- `TTSEngine` - Interface for TTS engines +- `AudioProcessor` - Interface for audio processing +- `ProfileRepository` - Interface for profile storage +- `ConfigProvider` - Interface for configuration + +**Adapters** (Implementations in `infra/`): +- `Qwen3Adapter` - Implements `TTSEngine` for Qwen3 +- `LibrosaAudioProcessor` - Implements `AudioProcessor` with librosa +- `FileProfileRepository` - Implements `ProfileRepository` with files +- `YAMLConfigProvider` - Implements `ConfigProvider` with YAML + +### Dependency Injection + +```python +# apps/core/src/api/python_api.py +class TTSStudioAPI: + def __init__(self, config: Dict[str, Any]): + # Initialize adapters (infrastructure) + audio_processor = LibrosaAudioProcessor() + profile_repository = FileProfileRepository(Path(config['paths']['profiles'])) + tts_engine = Qwen3Adapter(config['engines']['qwen3']) + + # Initialize use cases (application) + self._create_profile_uc = CreateVoiceProfileUseCase( + audio_processor=audio_processor, + profile_repository=profile_repository + ) + self._generate_audio_uc = GenerateAudioUseCase( + tts_engine=tts_engine, + profile_repository=profile_repository + ) + + def create_voice_profile(self, name: str, sample_paths: List[str]) -> Dict[str, Any]: + """Public API method.""" + paths = [Path(p) for p in sample_paths] + profile_dto = self._create_profile_uc.execute(name, paths) + return profile_dto.to_dict() +``` + +## Desktop App Architecture (FSD) + +### Feature-Sliced Design Layers + +#### 1. App Layer (`apps/desktop/src/app/`) +- Application initialization +- Global providers (theme, router, store) +- Global styles +- Root-level configuration + +#### 2. Pages Layer (`apps/desktop/src/pages/`) +- Route components +- Page-level layouts +- Composition of widgets + +#### 3. Widgets Layer (`apps/desktop/src/widgets/`) +- Complex, self-contained UI blocks +- Combine multiple features and entities +- Business logic for widget behavior + +#### 4. Features Layer (`apps/desktop/src/features/`) +- User interactions (actions) +- Business logic for specific features +- Reusable across pages + +#### 5. Entities Layer (`apps/desktop/src/entities/`) +- Business entities (Profile, Sample, Generation, Model) +- Entity state management (Zustand) +- Tauri API calls for entities +- Entity UI components + +#### 6. Shared Layer (`apps/desktop/src/shared/`) +- Reusable UI components (shadcn/ui) +- Utilities, hooks, types +- No business logic + +### Desktop-First Principles + +**Offline-First**: +- Everything works without internet (except model downloads) +- Models downloaded once, stored locally +- SQLite for local storage (profiles, history, config) +- No authentication or registration +- No external API calls (everything local) + +**No Redundant Suffixes**: +```typescript +// ❌ WRONG +services/VoiceCloningService.ts +hooks/useVoiceProfilesHook.ts +components/ProfileCardComponent.tsx + +// ✅ CORRECT +services/voice-cloning.ts +hooks/use-profiles.ts +ui/profile-card.tsx +``` + +### Tauri Backend (Rust) + +**Python Bridge**: +- Subprocess management for Python core library +- Calls Python API via subprocess +- Handles Python process lifecycle + +**Local Storage**: +- SQLite database for profiles, history, settings +- File system for audio samples and outputs +- Model registry for installed models + +**Model Management**: +- Download models from Hugging Face on-demand (only time internet needed) +- Install models in OS-specific user directories: + - macOS: `~/Library/Application Support/TTS Studio/models/` + - Windows: `%LOCALAPPDATA%\TTS Studio\models\` + - Linux: `~/.local/share/tts-studio/models/` +- Delete models from UI to free disk space +- List installed models +- Models NOT included in installer (downloaded separately by user) + +## Hardware Requirements + +### Minimum (CPU Only) +- **CPU**: 4+ cores +- **RAM**: 8GB +- **Storage**: 10GB free +- **Generation Speed**: ~2-3 min por minuto de audio + +### Recommended (MPS - Apple Silicon) +- **GPU**: Apple M1/M2/M3 Pro +- **RAM**: 16GB unified memory +- **Storage**: 10GB free +- **Generation Speed**: ~15-30 seg por minuto de audio + +### Optimal +- **GPU**: Apple M1/M2/M3 Max (32GB+ unified memory) +- **RAM**: 32GB unified memory +- **Storage**: SSD con 20GB+ free +- **Generation Speed**: ~10-20 seg por minuto de audio + +## MPS Setup (Apple Silicon) +```bash +# Verificar MPS disponible +python -c "import torch; print(torch.backends.mps.is_available())" + +# PyTorch con MPS viene incluido en instalación estándar +pip install torch torchaudio + +# Configurar dtype para MPS (requerido) +# En config.yaml: dtype: "float32" +``` + +## Environment Setup + +### Python Environment Manager +- **Tool**: `venv` (built-in Python) o `conda` +- **Why**: Aislamiento de dependencias, evitar conflictos + +### Recommended: venv +```bash +python3.10 -m venv venv +source venv/bin/activate # Linux/Mac +# venv\Scripts\activate # Windows +``` + +## Directory Structure for Data + +### Development Environment (apps/core/) +``` +apps/core/data/ # Development only, gitignored +├── samples/ # Test audio samples +├── outputs/ # Generated test audio +├── models/ # Downloaded models for testing +│ └── Qwen3-TTS-12Hz-1.7B-Base/ +└── cache/ # Temporary cache +``` + +### Production Desktop App (User Directories) +``` +# macOS +~/Library/Application Support/TTS Studio/ +├── models/ # Downloaded TTS models +│ └── Qwen3-TTS-12Hz-1.7B-Base/ +├── profiles/ # User voice profiles +├── samples/ # User audio samples +├── outputs/ # Generated audio +└── cache/ # Temporary cache + +# Windows +%LOCALAPPDATA%\TTS Studio\ +├── models\ +├── profiles\ +├── samples\ +├── outputs\ +└── cache\ + +# Linux +~/.local/share/tts-studio/ +├── models/ +├── profiles/ +├── samples/ +├── outputs/ +└── cache/ +``` + +### Model Storage Strategy +- **Development**: Models in `apps/core/data/models/` (gitignored) +- **Production**: Models in OS-specific user directories +- **Installer**: Does NOT include models (~50-100MB installer) +- **First Launch**: User downloads models on-demand via UI +- **User Control**: Can delete models to free space, re-download anytime +## Audio Format Standards + +### Input Samples (Reference Voice) +- **Format**: WAV (uncompressed) +- **Sample Rate**: 12000 Hz (nativo de Qwen3-TTS) +- **Channels**: Mono (1 channel) +- **Bit Depth**: 16-bit +- **Duration**: 3-30 segundos por sample +- **Quality**: Sin ruido de fondo, voz clara + +### Output Generated Audio +- **Format**: WAV (default) o MP3 (para videos) +- **Sample Rate**: 12000 Hz (matching training) +- **Channels**: Mono +- **Bit Depth**: 16-bit +- **Export**: Convertible a MP3/AAC para edición de video + +## Model Specifications + +### Qwen3-TTS Details +- **Size**: ~3.4GB (model weights) +- **Languages**: Soporte nativo para español y múltiples idiomas +- **Context Length**: ~2048 tokens máximo por generación +- **Reference Audio**: Usa 1-3 samples por inferencia para mejor calidad +- **Loading Time**: 30-60 seg primera carga (MPS), 2-3 min (CPU) + +### Model Download Strategy + +#### Development (Core Library) +- **Auto-download**: Qwen3-TTS downloads automatically on first use from Hugging Face +- **Cache Location**: `apps/core/data/models/` (configurable, gitignored) +- **Purpose**: Testing and development only + +#### Production (Desktop App) +- **On-Demand Download**: User initiates download from UI +- **Storage Location**: OS-specific user directories (see above) +- **Download Flow**: + 1. User opens app for first time + 2. UI shows "No models installed" with "Download" button + 3. User clicks "Download Qwen3-TTS (~3.4GB)" + 4. Progress bar shows download status + 5. Model extracted to user directory + 6. App ready to use +- **Model Management UI**: + - List installed models with size + - Download new models + - Delete models to free space + - Re-download if needed +- **Offline Mode**: Once downloaded, works completely offline + +## Configuration Management + +### Config File Format +```python +# config.yaml (usando PyYAML) +model: + name: "Qwen/Qwen3-TTS-12Hz-1.7B-Base" + device: "mps" # or "cpu" + dtype: "float32" # Required for MPS + +audio: + sample_rate: 12000 + output_format: "wav" + +paths: + samples_dir: "./data/samples" + output_dir: "./data/outputs" + models_cache: "./data/qwen3_models" + +generation: + language: "Spanish" + temperature: 0.75 # Control de variabilidad + max_new_tokens: 2048 + speed: 1.0 +``` + +``` + +## Testing Strategy + +### Hexagonal Testing Approach + +#### Domain Tests (`apps/core/tests/domain/`) +- Test domain logic without infrastructure +- Use mocked ports (interfaces) +- Fast, isolated unit tests +- No external dependencies + +```python +# Test domain without infrastructure +def test_voice_profile_validation(): + mock_processor = Mock(spec=AudioProcessor) + mock_processor.validate_sample.return_value = True + + service = VoiceCloningService(mock_processor) + profile = service.create_profile_from_samples("test", [Path("sample.wav")]) + + assert profile.is_valid() +``` + +#### Application Tests (`apps/core/tests/app/`) +- Test use cases with mocked ports +- Verify orchestration logic +- Integration between domain and application + +#### Infrastructure Tests (`apps/core/tests/infra/`) +- Test adapters with real implementations +- Verify Qwen3 adapter works correctly +- Test audio processing with real files +- Test file repository with real filesystem + +#### Integration Tests (`apps/core/tests/integration/`) +- End-to-end tests with real components +- Test complete workflows +- Verify all layers work together + +### Desktop App Testing + +#### Unit Tests (Vitest) +- Test React components +- Test hooks +- Test utilities + +#### Integration Tests +- Test Tauri commands +- Test Python bridge +- Test SQLite operations + +### Test Framework +```python +# Python +pytest>=7.4.0 # Testing framework +pytest-cov>=4.0.0 # Coverage +hypothesis>=6.0.0 # Property-based testing +``` + +```json +// TypeScript +{ + "vitest": "^1.0.0", + "@testing-library/react": "^14.0.0" +} +``` +## Performance Considerations + +### Batch Processing +- **Strategy**: Process multiple texts sequentially without reloading model +- **Memory Management**: Clear cache between large batches +- **Concurrency**: Single user, sequential processing + +### Caching Strategy +- **Model**: Keep in memory during session +- **Voice Profile**: Load once at startup +- **Audio Samples**: Lazy load when needed + +## Known Limitations & Workarounds + +### Qwen3-TTS Constraints +- **Long text**: Split texts >2048 tokens into chunks +- **Punctuation**: Critical for correct intonation +- **Speaker consistency**: Use same subset of samples for coherence +- **MPS dtype**: Requires float32 (not float16) for stability + +### Platform-Specific Issues +- **Windows**: May require Microsoft C++ Build Tools +- **macOS M1/M2/M3**: Use MPS (Metal) - dtype must be float32 +- **Linux**: CPU-only mode (MPS not available) + +## Future Enhancements +- [ ] Voice fine-tuning with more samples +- [ ] FFmpeg integration for direct MP3 export +- [ ] Additional TTS engines (XTTS, ElevenLabs) +- [ ] Direct export to video editing formats (AAC, etc.) +- [ ] CUDA support (if Qwen3-TTS adds support) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b884934..080570c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,7 +5,7 @@ repos: hooks: - id: black language_version: python3.11 - args: ['--config=pyproject.toml'] + args: ['--config=apps/core/pyproject.toml'] # Ruff - Fast linting and import sorting - repo: https://github.com/astral-sh/ruff-pre-commit @@ -23,8 +23,8 @@ repos: additional_dependencies: - types-PyYAML - types-requests - args: ['--config-file=pyproject.toml'] - exclude: '^(tests/|examples/)' + args: ['--config-file=apps/core/pyproject.toml'] + exclude: '^(tests/|examples/|apps/core/tests/|apps/core/examples/|apps/core/setup\.py|apps/core/src/domain/)' # Pre-commit hooks for common issues - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/CHANGELOG.md b/CHANGELOG.md index 1f8d9b9..ff8a60a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,109 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Security +## [1.0.0] - 2026-01-29 + +### Added +- **Hexagonal Architecture**: Clean architecture with ports & adapters pattern +- **Monorepo Structure**: Separate apps for core library (`apps/core/`) and desktop app (`apps/desktop/`) +- **Python API**: Programmatic access via `TTSStudio` class +- **Domain Layer**: Pure business logic with no external dependencies +- **Application Layer**: Use cases for orchestrating workflows +- **Infrastructure Layer**: Adapters for TTS engines, audio processing, and storage +- **API Layer**: Entry point with dependency injection +- **Integration Tests**: End-to-end workflow tests +- **Property-Based Tests**: Hypothesis-based tests for domain invariants +- **Hexagonal Architecture Tests**: Verify architectural boundaries +- **Documentation**: + - `MIGRATION.md` - Migration guide from v0.x to v1.0.0 + - `HEXAGONAL_ARCHITECTURE.md` - Architecture guide + - Updated `development.md` with monorepo and testing guidelines +- **DTOs**: Data transfer objects for clean API boundaries +- **Ports**: Interfaces for TTS engines, audio processors, and repositories +- **Adapters**: Qwen3 TTS engine, librosa audio processor, file repository + +### Changed +- **BREAKING**: Removed CLI (`voice-clone` commands) +- **BREAKING**: Removed Gradio UI +- **BREAKING**: Package structure changed to monorepo +- **BREAKING**: Import paths changed (use `api.studio.TTSStudio`) +- **BREAKING**: Configuration structure updated for hexagonal architecture +- Project renamed from `voice-clone-cli` to `tts-studio` +- All business logic moved to domain layer +- All infrastructure moved to adapters +- Test structure reorganized by architecture layers +- Documentation updated for Python API usage +- Steering files updated with hexagonal architecture + +### Deprecated +- `docs/ui-guide.md` - Replaced with Python API usage +- `docs/SVELTE_UI_SPECIFICATION.md` - Tauri chosen instead + +### Removed +- **CLI**: All `voice-clone` commands removed +- **Gradio UI**: Web interface removed +- `src/cli/` directory and all CLI code +- `src/gradio_ui/` directory and all Gradio code +- `click` dependency (CLI framework) +- `gradio` dependency (UI framework) +- CLI entry points from `setup.py` +- Old test files for CLI and Gradio + +### Fixed +- Improved testability with dependency inversion +- Better separation of concerns +- Cleaner error handling with domain exceptions +- More maintainable codebase with hexagonal architecture + +### Security +- Domain layer isolated from external dependencies +- Better validation with domain services +- Cleaner boundaries between layers + +### Migration Notes + +**From v0.x to v1.0.0**: + +1. **Uninstall old package**: + ```bash + pip uninstall voice-clone-cli + ``` + +2. **Install new package**: + ```bash + cd apps/core + pip install -e . + ``` + +3. **Update code to use Python API**: + ```python + from api.studio import TTSStudio + + studio = TTSStudio() + result = studio.create_voice_profile( + name='my_voice', + sample_paths=['./data/samples/sample1.wav'] + ) + ``` + +4. **See `docs/MIGRATION.md` for detailed migration guide** + +### Architecture + +**Hexagonal Architecture Layers**: +- **Domain**: Pure business logic (NO external dependencies) +- **Application**: Use cases (orchestration) +- **Infrastructure**: Adapters (implementations) +- **API**: Entry points (dependency injection) + +**Benefits**: +- Easy to test (mock adapters) +- Easy to swap implementations (Qwen3 → XTTS) +- Easy to extend (add new engines, processors, storage) +- Clean, maintainable codebase + +**See `docs/HEXAGONAL_ARCHITECTURE.md` for detailed architecture guide** + ## [0.2.0] - 2026-01-24 ### Added @@ -130,6 +233,7 @@ When releasing a new version: ## Links -[Unreleased]: https://github.com/yourusername/voice-clone-cli/compare/v0.2.0...HEAD -[0.2.0]: https://github.com/yourusername/voice-clone-cli/compare/v0.1.0...v0.2.0 -[0.1.0]: https://github.com/yourusername/voice-clone-cli/releases/tag/v0.1.0 +[Unreleased]: https://github.com/bryanstevensacosta/tts-studio/compare/v1.0.0...HEAD +[1.0.0]: https://github.com/bryanstevensacosta/tts-studio/compare/v0.2.0...v1.0.0 +[0.2.0]: https://github.com/bryanstevensacosta/tts-studio/compare/v0.1.0...v0.2.0 +[0.1.0]: https://github.com/bryanstevensacosta/tts-studio/releases/tag/v0.1.0 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a3a563f..b8dbf97 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -21,7 +21,7 @@ This project adheres to a Code of Conduct that all contributors are expected to ### Reporting Bugs -Before creating a bug report, please check the [issue tracker](https://github.com/yourusername/voice-clone-cli/issues) to avoid duplicates. +Before creating a bug report, please check the [issue tracker](https://github.com/bryanstevensacosta/voice-clone-cli/issues) to avoid duplicates. When creating a bug report, include: @@ -89,7 +89,7 @@ git clone https://github.com/YOUR_USERNAME/voice-clone-cli.git cd voice-clone-cli # Add upstream remote -git remote add upstream https://github.com/yourusername/voice-clone-cli.git +git remote add upstream https://github.com/bryanstevensacosta/voice-clone-cli.git # Run automated setup ./setup.sh @@ -532,8 +532,8 @@ When making changes, update relevant documentation: ## Questions? -- 💬 Open a [Discussion](https://github.com/yourusername/voice-clone-cli/discussions) -- 🐛 Create an [Issue](https://github.com/yourusername/voice-clone-cli/issues) +- 💬 Open a [Discussion](https://github.com/bryanstevensacosta/voice-clone-cli/discussions) +- 🐛 Create an [Issue](https://github.com/bryanstevensacosta/voice-clone-cli/issues) - 📧 Email: bryanstevensacosta@gmail.com ## Recognition diff --git a/README.md b/README.md index a5fd485..9a843ae 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,79 @@ -# Voice Clone - AI Voice Cloning Tool +# TTS Studio - AI Voice Cloning [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) -[![CI](https://github.com/bryanstevensacosta/voice-clone/workflows/CI/badge.svg)](https://github.com/bryanstevensacosta/voice-clone/actions) +[![CI](https://github.com/bryanstevensacosta/tts-studio/workflows/CI/badge.svg)](https://github.com/bryanstevensacosta/tts-studio/actions) [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) -AI voice cloning tool with modern web interface and CLI, powered by Qwen3-TTS. Clone any voice with just a few audio samples and generate natural-sounding speech from text. +Professional voice cloning and text-to-speech system with hexagonal architecture, powered by Qwen3-TTS. Clone any voice with just a few audio samples and generate natural-sounding speech from text. + +**Desktop application coming soon!** The core Python library is production-ready and can be integrated into your applications today. ## Features -- 🌐 **Web Interface**: Modern, intuitive UI built with Gradio - 🎤 **Voice Cloning**: Clone any voice using 1-3 audio samples - 🗣️ **Text-to-Speech**: Generate speech from text in the cloned voice - 🎯 **High Quality**: Powered by Qwen3-TTS for natural-sounding results -- ⚡ **Fast Processing**: Optimized for quick voice cloning and synthesis +- ⚡ **Fast Processing**: Optimized for Apple Silicon (MPS) and CUDA GPUs - 📦 **Batch Processing**: Process multiple text segments at once -- 🖥️ **CLI Interface**: Command-line interface for advanced users -- 🔧 **Configurable**: Flexible configuration options +- 🏗️ **Hexagonal Architecture**: Clean, testable, maintainable code +- 🔧 **Python API**: Easy-to-use Python library for integration +- 🖥️ **Desktop App**: Native Tauri desktop application (coming soon) +- 📥 **Model Management**: Download and manage TTS models on-demand +- 🔒 **Privacy-First**: Everything runs locally, no cloud required + +## Architecture + +TTS Studio uses a **monorepo structure** with **hexagonal architecture** (Ports & Adapters): + +``` +tts-studio/ +├── apps/ +│ ├── core/ # Python core library (hexagonal architecture) +│ │ ├── src/ +│ │ │ ├── domain/ # Business logic (pure, no dependencies) +│ │ │ ├── app/ # Use cases and orchestration +│ │ │ ├── infra/ # Adapters (Qwen3, audio, storage) +│ │ │ ├── api/ # Python API entry point +│ │ │ └── shared/ # Shared utilities +│ │ └── tests/ # Comprehensive test suite +│ └── desktop/ # Tauri desktop app (coming soon) +├── config/ # Shared configuration +├── data/ # Data directory (gitignored) +└── docs/ # Documentation +``` + +### Hexagonal Architecture + +The core library follows hexagonal architecture principles for maximum flexibility and testability: + +- **Domain Layer**: Pure business logic with zero external dependencies + - Entities (VoiceProfile, AudioSample) + - Ports (interfaces for TTS engines, audio processors, storage) + - Domain services (voice cloning logic) + +- **Application Layer**: Use cases that orchestrate domain logic + - CreateVoiceProfile, GenerateAudio, ValidateSamples + - DTOs for data transfer + - No business logic, only coordination + +- **Infrastructure Layer**: Concrete implementations (adapters) + - Qwen3 TTS engine adapter + - Librosa audio processor adapter + - File-based profile repository + - YAML configuration provider + +- **API Layer**: Entry point for external consumers + - TTSStudio class (main Python API) + - Dependency injection and wiring + +This architecture makes the code: +- ✅ **Easy to test**: Domain logic testable without infrastructure +- ✅ **Easy to maintain**: Clear separation of concerns +- ✅ **Easy to extend**: Swap TTS engines without changing business logic +- ✅ **Easy to understand**: Follows SOLID principles + +See [docs/HEXAGONAL_ARCHITECTURE.md](docs/HEXAGONAL_ARCHITECTURE.md) for detailed architecture documentation. ## Quick Start @@ -24,8 +81,11 @@ AI voice cloning tool with modern web interface and CLI, powered by Qwen3-TTS. C ```bash # Clone the repository -git clone https://github.com/yourusername/voice-clone.git -cd voice-clone +git clone https://github.com/bryanstevensacosta/tts-studio.git +cd tts-studio + +# Navigate to core library +cd apps/core # Run the automated setup script ./setup.sh @@ -33,67 +93,82 @@ cd voice-clone The setup script will: - Create a Python virtual environment -- Install all dependencies (including Gradio) +- Install all dependencies - Set up pre-commit hooks for development -### Web Interface (Recommended) +### Model Download -```bash -# Activate the virtual environment -source venv/bin/activate - -# Launch the web interface -voice-clone ui - -# Open your browser at: http://localhost:7860 -``` +TTS Studio uses an on-demand model download system. Models are **not** included in the installation to keep the package size small. -The web interface provides an intuitive way to: -1. **Upload and validate** audio samples -2. **Create voice profiles** with a few clicks -3. **Generate audio** from text interactively -4. **Process batches** of text segments -5. **Download results** directly from the browser +**First-time setup:** -### CLI Usage (Advanced) - -For advanced users and automation, the CLI is still available: - -```bash -# Activate the virtual environment -source venv/bin/activate +```python +from api.studio import TTSStudio -# 1. Validate your audio samples -voice-clone validate-samples --dir ./data/samples +# Initialize the API (will prompt for model download if needed) +studio = TTSStudio() -# 2. Create a voice profile -voice-clone prepare \ - --samples ./data/samples \ - --ref-text "Hola, esta es una muestra de mi voz para clonación." \ - --output ./data/voice_profile.json \ - --name "my_voice" - -# 3. Generate speech from text -voice-clone generate \ - --profile ./data/voice_profile.json \ - --text "Hola, esta es una prueba de mi voz clonada." \ - --output ./output.wav +# The Qwen3-TTS model (~3.4GB) will download automatically on first use +# This happens once and takes 10-15 minutes depending on your connection +``` -# 4. Quick test -voice-clone test --profile ./data/voice_profile.json +**Model storage locations:** +- macOS: `~/Library/Application Support/TTS Studio/models/` +- Windows: `%LOCALAPPDATA%\TTS Studio\models\` +- Linux: `~/.local/share/tts-studio/models/` + +You can delete models anytime to free disk space and re-download them later. + +### Python API Usage + +```python +from api.studio import TTSStudio + +# Initialize the API +studio = TTSStudio() + +# 1. Validate audio samples +validation = studio.validate_samples( + sample_paths=["./data/samples/neutral_01.wav", "./data/samples/happy_01.wav"] +) + +if validation["all_valid"]: + # 2. Create voice profile + profile = studio.create_voice_profile( + name="my_voice", + sample_paths=["./data/samples/neutral_01.wav", "./data/samples/happy_01.wav"], + language="es" + ) + + if profile["status"] == "success": + # 3. Generate audio from text + result = studio.generate_audio( + profile_id=profile["profile"]["id"], + text="Hola, esta es una prueba de mi voz clonada.", + temperature=0.75, + speed=1.0 + ) + + if result["status"] == "success": + print(f"Audio generated: {result['output_path']}") ``` +See [examples/api_usage.py](examples/api_usage.py) for complete examples. + ## Installation Options ### Option 1: Automated Setup (Recommended) ```bash +cd apps/core ./setup.sh ``` ### Option 2: Manual Setup ```bash +cd apps/core + # Create virtual environment python3.10 -m venv venv source venv/bin/activate @@ -108,136 +183,55 @@ pre-commit install --hook-type commit-msg pre-commit install --hook-type pre-push ``` -## Usage Examples - -### Web Interface - -The easiest way to use Voice Clone is through the web interface: - -1. **Launch the UI**: - ```bash - voice-clone ui - ``` +## Python API Reference -2. **Open your browser** at `http://localhost:7860` +### TTSStudio Class -3. **Follow the tabs**: - - **Tab 1**: Upload samples and create voice profile - - **Tab 2**: Generate audio from text - - **Tab 3**: Process batch scripts +Main API entry point for TTS Studio. -### CLI Examples +#### Methods -For automation and advanced usage: +**`create_voice_profile(name, sample_paths, language="es", reference_text="")`** +- Creates a voice profile from audio samples +- Returns: `{"status": "success|error", "profile": {...}, "error": None|str}` -### Preparing Voice Samples +**`generate_audio(profile_id, text, temperature=0.75, speed=1.0, mode="clone")`** +- Generates audio from text using a voice profile +- Returns: `{"status": "success|error", "output_path": str, "duration": float, ...}` -First, record 6-10 audio samples of your voice (10-20 seconds each): +**`list_voice_profiles()`** +- Lists all available voice profiles +- Returns: `{"status": "success|error", "profiles": [...], "count": int, ...}` -```bash -# Samples should be: -# - WAV format, 12000 Hz, mono, 16-bit -# - Clear speech, no background noise -# - Different emotions/tones -# - Named: neutral_01.wav, happy_01.wav, serious_01.wav, etc. - -# Place samples in data/samples/ -data/samples/ -├── neutral_01.wav -├── neutral_02.wav -├── happy_01.wav -├── serious_01.wav -└── calm_01.wav -``` - -### Validating Samples - -```bash -# Validate all samples in a directory -voice-clone validate-samples --dir ./data/samples - -# Output shows: -# ✓ neutral_01.wav - Valid -# ✓ happy_01.wav - Valid -# ✗ serious_01.wav - ERROR: Stereo (must be mono) -``` - -### Creating Voice Profile +**`delete_voice_profile(profile_id)`** +- Deletes a voice profile +- Returns: `{"status": "success|error", "deleted": bool, ...}` -```bash -# Create profile from validated samples -voice-clone prepare \ - --samples ./data/samples \ - --ref-text "Hola, esta es una muestra de mi voz para clonación." \ - --output ./data/voice_profile.json \ - --name "my_voice" - -# Output: -# ✓ Voice profile created successfully! -# Samples: 8 -# Duration: 127.3s -# Language: es -``` - -### Generating Speech - -```bash -# Generate from text -voice-clone generate \ - --profile ./data/voice_profile.json \ - --text "Bienvenidos a este tutorial sobre inteligencia artificial." \ - --output ./intro.wav - -# Generate from longer text (auto-chunking) -voice-clone generate \ - --profile ./data/voice_profile.json \ - --text "$(cat script.txt)" \ - --output ./narration.wav -``` +**`validate_samples(sample_paths)`** +- Validates audio samples for quality +- Returns: `{"status": "success|error", "results": [...], "all_valid": bool, ...}` -### Batch Processing +See [docs/api.md](docs/api.md) for complete API documentation. -```bash -# Create a script file with markers -cat > script.txt << 'EOF' -[INTRO] -Hola a todos, bienvenidos a este nuevo video. - -[SECTION_1] -Hoy vamos a hablar sobre inteligencia artificial. - -[OUTRO] -Gracias por ver este video. -EOF - -# Process entire script -voice-clone batch \ - --profile ./data/voice_profile.json \ - --input ./script.txt \ - --output-dir ./data/outputs/video_001 - -# Output: -# ✓ Batch processing complete! -# Total segments: 3 -# Successful: 3 -# Failed: 0 -``` +## Audio Sample Requirements -### Quick Testing +For best results, your audio samples should be: -```bash -# Test with default Spanish phrase -voice-clone test --profile ./data/voice_profile.json +- **Format**: WAV, 12000 Hz, mono, 16-bit +- **Duration**: 3-30 seconds per sample +- **Quantity**: 1-3 samples (Qwen3-TTS requires fewer samples) +- **Quality**: Clear speech, no background noise +- **Variety**: Different emotions and tones +- **Content**: Natural speech, complete sentences -# Test with custom text -voice-clone test \ - --profile ./data/voice_profile.json \ - --text "Esta es una prueba personalizada" \ - --output ./test.wav +### Sample Recording Tips -# Play the result (macOS) -afplay ./test.wav -``` +1. **Environment**: Record in a quiet room +2. **Microphone**: Use a decent quality mic (built-in MacBook mic is acceptable) +3. **Distance**: 15-20cm from microphone +4. **Volume**: Natural speaking volume (not whispering or shouting) +5. **Emotions**: Include neutral, happy, serious, calm tones +6. **Avoid**: Background noise, echo, mouth clicks, breathing sounds ## Configuration @@ -267,21 +261,10 @@ audio: paths: samples: "./data/samples" outputs: "./data/outputs" + profiles: "./data/profiles" models: "./data/models" ``` -### Environment Variables - -Create a `.env` file for sensitive settings: - -```bash -# Optional: Custom models directory -QWEN_TTS_MODELS_DIR=/path/to/models - -# Optional: Logging level -LOG_LEVEL=INFO -``` - ## Documentation For detailed documentation, see: @@ -290,6 +273,7 @@ For detailed documentation, see: - [Usage Guide](docs/usage.md) - Comprehensive usage examples - [Development Guide](docs/development.md) - Contributing and development setup - [API Documentation](docs/api.md) - API reference and integration guide +- [Hexagonal Architecture](docs/HEXAGONAL_ARCHITECTURE.md) - Architecture overview ## Requirements @@ -308,26 +292,6 @@ For detailed documentation, see: | RTX 3060 (12GB) | ~10-20s per minute | CUDA acceleration | | Intel i7 (CPU) | ~2-3 min per minute | CPU-only, slower | -## Audio Sample Requirements - -For best results, your audio samples should be: - -- **Format**: WAV, 12000 Hz, mono, 16-bit -- **Duration**: 3-30 seconds per sample -- **Quantity**: 1-3 samples (Qwen3-TTS requires fewer samples) -- **Quality**: Clear speech, no background noise -- **Variety**: Different emotions and tones -- **Content**: Natural speech, complete sentences - -### Sample Recording Tips - -1. **Environment**: Record in a quiet room -2. **Microphone**: Use a decent quality mic (built-in MacBook mic is acceptable) -3. **Distance**: 15-20cm from microphone -4. **Volume**: Natural speaking volume (not whispering or shouting) -5. **Emotions**: Include neutral, happy, serious, calm tones -6. **Avoid**: Background noise, echo, mouth clicks, breathing sounds - ## Development We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines. @@ -336,8 +300,8 @@ We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines. ```bash # Clone and setup -git clone https://github.com/yourusername/voice-clone-cli.git -cd voice-clone-cli +git clone https://github.com/bryanstevensacosta/tts-studio.git +cd tts-studio/apps/core ./setup.sh # Run tests @@ -363,142 +327,32 @@ All checks run automatically via pre-commit hooks. ### Git Workflow -This project enforces a strict rebase workflow to maintain a clean, linear history: - -#### Branch Protection - -The following branches are protected: `master`, `main`, `develop` -- ❌ No direct pushes allowed -- ❌ No force pushes allowed -- ✅ All changes must go through Pull Requests -- ✅ CI checks must pass before merge -- ✅ Only rebase merge is allowed (linear history) - -#### Development Workflow - -```bash -# 1. Create a feature branch -git checkout -b feature/my-feature - -# 2. Make your changes and commit -git add . -git commit -m "feat: add new feature" - -# 3. Before pushing, ensure you're up to date -make sync # Fetch latest changes -make rebase-master # Rebase on master (or main/develop) - -# 4. Push your branch -git push origin feature/my-feature - -# 5. Create a Pull Request on GitHub -# The CI will run automatically - -# 6. After PR approval, merge via GitHub -# (GitHub will automatically rebase and merge) -``` - -#### Pre-Push Hooks - -The pre-push hook automatically checks: -- ✅ Your branch is up to date (rebased) -- ✅ You're not pushing to protected branches -- ✅ All tests pass -- ✅ Code coverage is above 70% - -If your branch is behind, you'll see: -``` -❌ Error: Your branch is not up to date with origin/master -To fix this, run: - git fetch origin - git rebase origin/master -``` - -#### Useful Commands - -```bash -make sync # Fetch and show status -make rebase-master # Rebase on master -make rebase-main # Rebase on main -make rebase-develop # Rebase on develop -make check-branch # Check if rebase is needed -``` - -#### Commit Message Convention - -This project uses [Conventional Commits](https://www.conventionalcommits.org/): - -``` -(): - -[optional body] - -[optional footer] -``` - -Types: -- `feat`: New feature -- `fix`: Bug fix -- `docs`: Documentation changes -- `style`: Code style changes (formatting, etc.) -- `refactor`: Code refactoring -- `test`: Adding or updating tests -- `chore`: Maintenance tasks -- `perf`: Performance improvements -- `ci`: CI/CD changes -- `build`: Build system changes - -Examples: -```bash -git commit -m "feat: add voice cloning feature" -git commit -m "fix: resolve audio processing bug" -git commit -m "docs: update installation guide" -git commit -m "test: add unit tests for synthesizer" -``` - -The commit-msg hook will validate your commit messages automatically. +This project enforces a strict rebase workflow to maintain a clean, linear history. See [docs/git-workflow.md](docs/git-workflow.md) for details. ## Project Structure ``` -voice-clone/ -├── src/ -│ ├── voice_clone/ # Backend package -│ │ ├── cli.py # CLI interface -│ │ ├── config.py # Configuration management -│ │ ├── audio/ # Audio processing -│ │ │ ├── processor.py # Audio validation & conversion -│ │ │ └── validator.py # Validation results -│ │ ├── model/ # Model management -│ │ │ ├── manager.py # Model loading & caching -│ │ │ ├── generator.py # TTS generation -│ │ │ └── profile.py # Voice profile data -│ │ ├── batch/ # Batch processing -│ │ │ └── processor.py # Script processing -│ │ └── utils/ # Utilities -│ │ ├── logger.py # Logging setup -│ │ └── helpers.py # Helper functions -│ └── gradio_ui/ # Web interface (NEW) -│ ├── app.py # Gradio application -│ ├── components/ # UI components -│ ├── handlers/ # Event handlers -│ └── utils/ # UI utilities -├── tests/ # Test suite -│ ├── unit/ # Unit tests -│ ├── property/ # Property-based tests -│ └── gradio_ui/ # UI tests -├── docs/ # Documentation -├── data/ # Data directory (gitignored) -│ ├── samples/ # Audio samples -│ ├── profiles/ # Voice profiles -│ ├── models/ # Cached models -│ ├── outputs/ # Generated audio -│ └── scripts/ # Example scripts -├── config/ # Configuration files -│ ├── default.yaml # Default config -│ └── config.yaml.example # Example custom config -└── .kiro/ # Project steering guides - └── steering/ # Workflow documentation +tts-studio/ +├── apps/ +│ ├── core/ # Python core library +│ │ ├── src/ +│ │ │ ├── domain/ # Domain layer (business logic) +│ │ │ ├── app/ # Application layer (use cases) +│ │ │ ├── infra/ # Infrastructure layer (adapters) +│ │ │ ├── api/ # API layer (entry points) +│ │ │ └── shared/ # Shared utilities +│ │ ├── tests/ # Test suite +│ │ ├── setup.py # Package setup +│ │ └── requirements.txt # Dependencies +│ └── desktop/ # Tauri desktop app (coming soon) +├── config/ # Configuration files +├── data/ # Data directory (gitignored) +│ ├── samples/ # Audio samples +│ ├── profiles/ # Voice profiles +│ ├── models/ # Cached models +│ └── outputs/ # Generated audio +├── docs/ # Documentation +└── examples/ # Usage examples ``` ## Troubleshooting @@ -507,7 +361,9 @@ voice-clone/ **Import errors**: Make sure you've activated the virtual environment: ```bash -source venv/bin/activate +source venv/bin/activate # macOS/Linux +# or +venv\Scripts\activate # Windows ``` **Model download fails**: The Qwen3-TTS model (~3.4GB) downloads automatically on first use. Ensure you have: @@ -515,6 +371,13 @@ source venv/bin/activate - At least 10GB free disk space - Patience (first download takes 10-15 minutes) +**Model storage**: Models are stored in OS-specific directories: +- macOS: `~/Library/Application Support/TTS Studio/models/` +- Windows: `%LOCALAPPDATA%\TTS Studio\models\` +- Linux: `~/.local/share/tts-studio/models/` + +You can delete models to free space and re-download them later. + **Audio quality issues**: Ensure your input samples are: - 12000 Hz sample rate (or will be converted) - Mono (single channel) @@ -524,10 +387,10 @@ source venv/bin/activate - At least 1 sample (1-3 recommended) **Generation is slow**: -- First generation is slower (model loading) -- CPU-only mode is significantly slower than MPS +- First generation is slower (model loading ~30-60 seconds) +- CPU-only mode is significantly slower than GPU - For M1/M2 Mac: Ensure PyTorch has MPS support and dtype is set to float32 -- CUDA is not supported by Qwen3-TTS +- For NVIDIA GPU: Ensure CUDA is properly installed **Voice sounds robotic**: - Add more samples with emotional variety @@ -539,7 +402,7 @@ source venv/bin/activate - Close other applications - Reduce batch size - Use shorter text chunks -- Consider upgrading RAM +- Consider upgrading RAM (16GB recommended) ### Getting Help @@ -563,26 +426,26 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file ## Support - 📖 [Documentation](docs/) -- 🐛 [Issue Tracker](https://github.com/yourusername/voice-clone-cli/issues) -- 💬 [Discussions](https://github.com/yourusername/voice-clone-cli/discussions) +- 🐛 [Issue Tracker](https://github.com/bryanstevensacosta/tts-studio/issues) +- 💬 [Discussions](https://github.com/bryanstevensacosta/tts-studio/discussions) ## Roadmap - [x] Core voice cloning with Qwen3-TTS -- [x] CLI interface with all commands +- [x] Hexagonal architecture implementation +- [x] Python API for integration - [x] Audio validation and conversion - [x] Batch processing for scripts - [x] Voice profile management -- [x] Migration from XTTS-v2 to Qwen3-TTS -- [x] Web interface with Gradio +- [x] Comprehensive test suite (206 tests, 99% passing) +- [ ] **Model management system** (download models on-demand) +- [ ] **Tauri desktop application** (native UI for all platforms) - [ ] Post-processing (normalization, fade effects) - [ ] Format export (MP3, AAC, platform-specific) -- [ ] Integration tests -- [ ] Manual testing with real samples - [ ] Streaming audio generation - [ ] Real-time voice conversion (future) - [ ] Multi-speaker support (future) -- [ ] Hugging Face Spaces deployment (future) +- [ ] Additional TTS engines (XTTS, ElevenLabs) --- diff --git a/SECURITY.md b/SECURITY.md index 072ff58..8a10677 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -175,7 +175,7 @@ We recognize security researchers who help improve our security: If you have questions about this security policy, please contact: - **Email**: bryanstevensacosta@gmail.com -- **GitHub Discussions**: [Security Category](https://github.com/yourusername/voice-clone-cli/discussions/categories/security) +- **GitHub Discussions**: [Security Category](https://github.com/bryanstevensacosta/voice-clone-cli/discussions/categories/security) ## Additional Resources diff --git a/.python-version b/apps/core/.python-version similarity index 100% rename from .python-version rename to apps/core/.python-version diff --git a/Makefile b/apps/core/Makefile similarity index 95% rename from Makefile rename to apps/core/Makefile index 1005eb1..70084f6 100644 --- a/Makefile +++ b/apps/core/Makefile @@ -120,11 +120,7 @@ activate: ## Show activation command dev: ## Run in development mode @echo "🔧 Starting development mode..." - @python -m voice_clone.cli --help - -ui: ## Launch Gradio web interface - @echo "🌐 Starting Gradio UI..." - @echo "📍 Opening at http://localhost:7860" - @python -m gradio_ui.app + @echo "💡 Use Python API: from api.studio import TTSStudio" + @python -c "from api.studio import TTSStudio; print('✅ API available')" .DEFAULT_GOAL := help diff --git a/apps/core/docs/UI_INTEGRATION_GUIDE.md b/apps/core/docs/UI_INTEGRATION_GUIDE.md new file mode 100644 index 0000000..7248345 --- /dev/null +++ b/apps/core/docs/UI_INTEGRATION_GUIDE.md @@ -0,0 +1,377 @@ +# UI Integration Guide - Text Length Validation + +## Overview + +This guide explains how the UI should integrate with the backend's text length validation system. The system uses a **defense in depth** approach with validation at both UI and backend layers. + +## Architecture: Defense in Depth + +``` +┌─────────────────────────────────────────────────────────┐ +│ UI Layer (React) │ +│ - Query engine capabilities │ +│ - Enforce limits proactively │ +│ - Provide real-time feedback │ +│ - Prevent invalid submissions │ +└────────────────────┬────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────┐ +│ Backend Layer (Python API) │ +│ - Validate against engine capabilities │ +│ - Soft limit: Log warning, allow generation │ +│ - Hard limit: Raise error, block generation │ +│ - Protect against bugs and direct API calls │ +└─────────────────────────────────────────────────────────┘ +``` + +## Engine Capabilities + +Each TTS engine reports its capabilities via `get_capabilities()`: + +```python +@dataclass +class EngineCapabilities: + max_text_length: int # Hard limit (error if exceeded) + recommended_text_length: int # Soft limit (warning if exceeded) + supports_streaming: bool # Future: streaming generation + min_sample_duration: float # Minimum seconds per sample + max_sample_duration: float # Maximum seconds per sample +``` + +### Example: Qwen3-TTS Capabilities + +```python +EngineCapabilities( + max_text_length=2048, # Absolute maximum + recommended_text_length=400, # Best quality range + supports_streaming=False, + min_sample_duration=3.0, + max_sample_duration=30.0, +) +``` + +## UI Implementation + +### 1. Query Engine Capabilities + +When the UI loads or when the user selects a voice profile, query the engine capabilities: + +```typescript +// Tauri command to get capabilities +const capabilities = await invoke('get_engine_capabilities', { + profileId: selectedProfile.id +}); + +// TypeScript interface +interface EngineCapabilities { + max_text_length: number; + recommended_text_length: number; + supports_streaming: boolean; + min_sample_duration: number; + max_sample_duration: number; +} +``` + +### 2. Real-Time Character Counter + +Display a character counter that updates as the user types: + +```typescript +const [text, setText] = useState(''); +const [capabilities, setCapabilities] = useState(null); + +const textLength = text.length; +const isWithinRecommended = textLength <= capabilities.recommended_text_length; +const isWithinMax = textLength <= capabilities.max_text_length; + +// Character counter component +
+ + {textLength} / {capabilities.recommended_text_length} + + + (max: {capabilities.max_text_length}) + +
+``` + +### 3. Visual Feedback + +Provide visual feedback based on text length: + +```typescript +function getCounterColor(): string { + if (!isWithinMax) { + return 'text-red-600'; // Over hard limit - error + } + if (!isWithinRecommended) { + return 'text-yellow-600'; // Over soft limit - warning + } + return 'text-green-600'; // Within recommended - good +} + +function getWarningMessage(): string | null { + if (!isWithinMax) { + return `Text exceeds maximum limit of ${capabilities.max_text_length} characters. Please shorten your text.`; + } + if (!isWithinRecommended) { + return `Text exceeds recommended limit of ${capabilities.recommended_text_length} characters. Quality may be degraded.`; + } + return null; +} +``` + +### 4. Disable Submit Button + +Disable the generate button when text exceeds hard limit: + +```typescript + +``` + +### 5. Warning Dialog for Soft Limit + +Show a confirmation dialog when text exceeds recommended limit but is within max: + +```typescript +async function handleGenerate() { + // Check if exceeds recommended but within max + if (!isWithinRecommended && isWithinMax) { + const confirmed = await showConfirmDialog({ + title: 'Quality Warning', + message: `Your text (${textLength} characters) exceeds the recommended limit of ${capabilities.recommended_text_length} characters. This may result in degraded audio quality. Continue anyway?`, + confirmText: 'Generate Anyway', + cancelText: 'Edit Text', + }); + + if (!confirmed) { + return; // User chose to edit + } + } + + // Proceed with generation + await generateAudio(text, selectedProfile); +} +``` + +## Complete UI Example + +```typescript +import { useState, useEffect } from 'react'; +import { invoke } from '@tauri-apps/api/tauri'; + +interface EngineCapabilities { + max_text_length: number; + recommended_text_length: number; + supports_streaming: boolean; + min_sample_duration: number; + max_sample_duration: number; +} + +export function AudioGenerationForm() { + const [text, setText] = useState(''); + const [capabilities, setCapabilities] = useState(null); + const [isGenerating, setIsGenerating] = useState(false); + const [selectedProfile, setSelectedProfile] = useState(null); + + // Load capabilities when profile changes + useEffect(() => { + if (selectedProfile) { + loadCapabilities(selectedProfile); + } + }, [selectedProfile]); + + async function loadCapabilities(profileId: string) { + try { + const caps = await invoke('get_engine_capabilities', { + profileId + }); + setCapabilities(caps); + } catch (error) { + console.error('Failed to load capabilities:', error); + } + } + + async function handleGenerate() { + if (!capabilities || !selectedProfile) return; + + const textLength = text.length; + + // Hard limit check (should be prevented by disabled button, but double-check) + if (textLength > capabilities.max_text_length) { + alert(`Text exceeds maximum limit of ${capabilities.max_text_length} characters.`); + return; + } + + // Soft limit warning + if (textLength > capabilities.recommended_text_length) { + const confirmed = confirm( + `Your text (${textLength} characters) exceeds the recommended limit of ${capabilities.recommended_text_length} characters. Quality may be degraded. Continue?` + ); + if (!confirmed) return; + } + + // Generate audio + setIsGenerating(true); + try { + const result = await invoke('generate_audio', { + text, + profileId: selectedProfile, + }); + console.log('Generation successful:', result); + } catch (error) { + // Backend validation error + if (error.includes('exceeds maximum limit')) { + alert('Text is too long. Please shorten your text and try again.'); + } else { + alert(`Generation failed: ${error}`); + } + } finally { + setIsGenerating(false); + } + } + + if (!capabilities) { + return
Loading...
; + } + + const textLength = text.length; + const isWithinRecommended = textLength <= capabilities.recommended_text_length; + const isWithinMax = textLength <= capabilities.max_text_length; + + return ( +
+

Generate Audio

+ + {/* Text Input */} +
+ +