diff --git a/docs/TSV_IMPORT_PROGRESS.md b/docs/TSV_IMPORT_PROGRESS.md new file mode 100644 index 0000000..a092474 --- /dev/null +++ b/docs/TSV_IMPORT_PROGRESS.md @@ -0,0 +1,312 @@ +# TSV Import Implementation Progress + +**Status**: Phase 1 Complete - Foundation Implemented +**Last Updated**: 2025-11-15 +**Session Branch**: `claude/review-repo-status-01Nr8A7Yeb4sHzxSzBJ8FFsu` + +## Overview + +Implementing TSV (tab-separated values) import functionality for bulk sample loading in Squiggy. Users can define samples in a spreadsheet and import 2-24 samples at once. + +**Target Use Cases**: +- Bulk loading (load 10+ samples at once) +- Workflow organization (track file associations in spreadsheet) +- Easy metadata association +- Reproducibility (share manifests with collaborators) + +**Strategy**: Lazy loading by default (register samples, load to kernel only when plotting) + +--- + +## Phase 1: Parsing & Validation ✅ COMPLETE + +### 1.1 TSV Parser ✅ COMPLETE +**File**: `src/services/tsv-parser.ts` + +- [x] Interface definitions (`TSVSampleSpec`, `TSVParseResult`) +- [x] Full implementation of `TSVParser.parse()` +- [x] Auto-detect delimiter (tab vs comma) +- [x] Header validation (require `sample_name` and `pod5` columns) +- [x] Detect duplicate sample names +- [x] Handle optional columns (bam, fasta with `-` or empty = missing) +- [x] Skip empty lines and comments (`#`) +- [x] Comprehensive error and warning reporting +- [x] **Tests**: 28 test cases covering valid input, errors, warnings, edge cases + - Test file: `src/services/__tests__/tsv-parser.test.ts` + +**Features**: +- ✅ Tab and comma delimiter support +- ✅ Case-insensitive column names +- ✅ Line number tracking for errors +- ✅ Windows (CRLF) line ending support +- ✅ Handles paths with spaces +- ✅ Warnings for missing optional files + +### 1.2 Path Resolver ✅ COMPLETE +**File**: `src/services/tsv-path-resolver.ts` + +- [x] Enum definitions (`PathResolutionStrategy`) +- [x] Full implementation of `TSVPathResolver` class +- [x] Absolute path resolution +- [x] TSV-relative resolution (relative to TSV file directory) +- [x] Workspace-relative resolution (relative to workspace root) +- [x] Auto strategy (tries multiple approaches) +- [x] File existence checking +- [ ] **Tests**: Need to add unit tests + +**Strategies**: +1. **Absolute**: Use path as-is (`/data/sample.pod5`) +2. **TSV-relative**: Relative to TSV file location (`data/sample.pod5` → `/data/sample.pod5`) +3. **Workspace-relative**: Relative to workspace root +4. **Auto**: Try absolute → TSV-relative → workspace-relative (first success wins) + +### 1.3 TSV Validator ✅ COMPLETE +**File**: `src/services/tsv-validator.ts` + +- [x] Interface definitions (`ValidationResult`) +- [x] Full implementation of `TSVValidator` class +- [x] File existence validation (POD5 required, BAM/FASTA optional) +- [x] Sample name conflict detection +- [x] Batch validation support +- [x] Summary statistics helper +- [ ] **Tests**: Need to add unit tests +- [ ] **Future**: POD5/BAM overlap validation (deferred to load time) + +**Validation Rules**: +- ✅ POD5 missing → BLOCK (error) +- ✅ BAM missing → WARN (optional) +- ✅ FASTA missing → WARN (optional) +- ✅ Sample name conflict → BLOCK (error) +- 🔜 POD5/BAM overlap check (expensive, deferred to load time) + +--- + +## Phase 2: UI & Commands ✅ COMPLETE + +### 2.1 TSV Import Command ✅ COMPLETE +**File**: `src/commands/tsv-commands.ts` + +- [x] Command registration (`squiggy.importSamplesFromTSV`) +- [x] File picker UI (prioritize .tsv file selection) +- [x] TSV parsing integration +- [x] Validation workflow +- [x] Validation results preview (QuickPick UI) +- [x] Import preview with confirmation dialog +- [x] Smart loading strategy (eager ≤10 samples, lazy ≥20 samples) +- [x] Sample registration in extension state +- [x] Unified state integration +- [ ] **Actual kernel loading**: Currently stubbed (registers metadata only) + +**UI Flow**: +1. File picker → Select TSV +2. Parse TSV → Show errors if any +3. Validate samples → Show errors if any +4. Preview import → Show sample count, files, warnings +5. Confirm → Import samples +6. Progress notification → Load samples (TODO: actual kernel loading) +7. Success message → Refresh Samples panel + +### 2.2 Command Registration ✅ COMPLETE +- [x] Added to `package.json` commands list +- [x] Imported in `extension.ts` +- [x] Registered in activation function +- [x] Icon: `$(table)` (table icon) +- [x] Enablement: `squiggy.packageInstalled` + +**Command Palette**: +- Title: "Squiggy: Import Samples from TSV" +- Category: Squiggy +- Available when package installed + +--- + +## Phase 3: Loading Integration ⚠️ PARTIAL + +### 3.1 Sample Loading ⚠️ STUBBED +**File**: `src/commands/tsv-commands.ts` (function `loadSamplesFromTSV`) + +- [x] Progress notification +- [x] Iterate through validated samples +- [x] Create `SampleInfo` objects +- [x] Add to extension state (`state.addSample()`) +- [x] Add to unified state (`state.addLoadedItem()`) +- [x] TSV metadata tracking (`sourceType: 'tsv'`, `tsvGroup: 'tsv_'`) +- [ ] **TODO**: Actual kernel loading via `FileLoadingService.loadSampleIntoRegistry()` +- [ ] **TODO**: Eager vs lazy loading implementation +- [ ] **TODO**: Error handling for kernel load failures + +**Current Behavior**: Samples are registered in TypeScript state but NOT loaded to Python kernel. + +**Next Steps**: +1. Implement eager loading: Call `service.loadSampleIntoRegistry()` for each sample +2. Implement lazy loading: Defer kernel load until plotting +3. Add lazy load trigger in plot commands (check `sample.isLoaded`, load if false) + +### 3.2 Lazy Loading Trigger ⬜ NOT STARTED +**File**: `src/commands/plot-commands.ts` + +- [ ] Add `ensureSamplesLoaded()` function +- [ ] Call before plotting to load TSV samples on-demand +- [ ] Update `sample.isLoaded` flag after loading + +--- + +## Phase 4: UI Integration ⬜ NOT STARTED + +### 4.1 Samples Panel Grouping ⬜ NOT STARTED +**File**: `src/views/squiggy-samples-panel.ts` + +- [ ] Group samples by `sourceType` (TSV vs manual) +- [ ] Show TSV batch ID (`tsvGroup`) +- [ ] Visual indicator for lazy-loaded samples (⚠️ not loaded yet) +- [ ] Batch operations (delete all from TSV group) + +### 4.2 Session Persistence ⬜ NOT STARTED +**File**: `src/types/squiggy-session-types.ts` + +- [ ] Add `tsvMetadata` to `SessionState` interface +- [ ] Track TSV batch import metadata +- [ ] Save/restore TSV-imported samples +- [ ] Preserve `sourceType` and `tsvGroup` in session + +### 4.3 Read Explorer Integration ⬜ NOT STARTED +**File**: `src/views/squiggy-reads-view-pane.ts` + +- [ ] Trigger lazy load when selecting TSV sample +- [ ] Transparent UX (no user intervention needed) + +--- + +## Testing Status + +### Unit Tests +- ✅ **TSVParser**: 28 tests, all passing +- ⬜ **TSVPathResolver**: Not yet implemented +- ⬜ **TSVValidator**: Not yet implemented +- ⬜ **TSV Commands**: Not yet implemented + +### Integration Tests +- ⬜ Full import workflow +- ⬜ Eager vs lazy loading +- ⬜ Session persistence +- ⬜ UI integration + +### Manual Testing +- ⬜ Import small TSV (5 samples, eager loading) +- ⬜ Import large TSV (20 samples, lazy loading) +- ⬜ Path resolution (absolute, relative, TSV-relative) +- ⬜ Validation errors (missing files, duplicates) +- ⬜ Plotting with TSV samples +- ⬜ Session save/restore + +--- + +## Files Created/Modified + +### New Files (Phase 1 & 2) +- `src/services/tsv-parser.ts` - Parser implementation (153 lines) +- `src/services/tsv-path-resolver.ts` - Path resolver (172 lines) +- `src/services/tsv-validator.ts` - Validator (144 lines) +- `src/commands/tsv-commands.ts` - Import commands (367 lines) +- `src/services/__tests__/tsv-parser.test.ts` - Parser tests (344 lines) +- `docs/TSV_IMPORT_PROGRESS.md` - This file + +### Modified Files +- `package.json` - Added `squiggy.importSamplesFromTSV` command +- `src/extension.ts` - Imported and registered TSV commands + +**Total new code**: ~1180 lines (including tests and docs) + +--- + +## Next Session Tasks + +### Priority 1: Complete Phase 3 (Loading Integration) +**Estimated time**: 45-60 minutes + +**Tasks**: +1. Implement actual kernel loading in `loadSamplesFromTSV()` + - Call `service.loadSampleIntoRegistry()` for eager mode + - Skip kernel load for lazy mode + - Handle errors gracefully +2. Add lazy loading trigger in plot commands + - Create `ensureSamplesLoaded()` helper + - Check `sample.isLoaded` before plotting + - Load on-demand if needed +3. Test with real data + - Create test TSV files (5 samples, 20 samples) + - Verify eager loading works + - Verify lazy loading triggers on plot + +### Priority 2: Add Tests +**Estimated time**: 30-45 minutes + +**Tasks**: +1. Path resolver tests (`tsv-path-resolver.test.ts`) + - Test absolute paths + - Test relative paths + - Test auto strategy + - Test file existence checks +2. Validator tests (`tsv-validator.test.ts`) + - Test validation success/failure + - Test sample name conflicts + - Test missing file handling +3. Integration test + - Test full import workflow + - Mock file system and kernel API + +### Priority 3: UI Integration +**Estimated time**: 60-90 minutes + +**Tasks**: +1. Samples panel grouping +2. Session persistence +3. Read Explorer integration + +--- + +## Design Decisions + +### Loading Strategy +- **≤5 samples**: Always eager (fast, small overhead) +- **6-10 samples**: Eager (default threshold) +- **11-19 samples**: Lazy (avoid kernel overload) +- **≥20 samples**: Always lazy + +**Rationale**: Eager loading provides better UX for small batches (immediate availability), while lazy loading scales to large datasets without overwhelming the kernel. + +### Path Resolution +- **Auto strategy by default**: Try multiple approaches, use first success +- **Priority**: Absolute → TSV-relative → workspace-relative +- **Error reporting**: Show which strategy succeeded for transparency + +### Validation Strictness +- **POD5 missing**: Block import (required file) +- **BAM/FASTA missing**: Warn but allow (optional files) +- **Duplicate names**: Block import (uniqueness required) +- **Sample name conflicts**: Block import (avoid overwriting existing samples) + +### State Management +- **TSV metadata**: Track import source (`sourceType: 'tsv'`) +- **Batch grouping**: All samples from same TSV share `tsvGroup` ID +- **Lazy load flag**: `isLoaded: boolean` tracks kernel state +- **Unified state**: Sync with existing `LoadedItem` system for cross-panel integration + +--- + +## Known Limitations + +1. **Single POD5 per sample**: Currently supports one POD5 file per sample. Future: Support comma-separated lists for technical replicates. +2. **No TSV editing UI**: Import only. Future: Allow editing sample associations before loading. +3. **No export to TSV**: Cannot export current samples to TSV format. Future: Reverse operation. +4. **No POD5/BAM overlap validation**: Deferred to load time (expensive operation). Future: Optional quick validation. +5. **Kernel loading stubbed**: Actual kernel integration pending (Priority 1 for next session). + +--- + +## References + +- Original design doc: `docs/TSV_IMPORT_FUTURE_DESIGN.md` +- Existing sample loading: `src/services/file-loading-service.ts` (method `loadSampleIntoRegistry()`) +- Existing file commands: `src/commands/file-commands.ts` (see `loadSamplesFromDropped()` for similar pattern) +- Extension state: `src/state/extension-state.ts` (interface `SampleInfo`) diff --git a/package.json b/package.json index 4786278..629f2c8 100644 --- a/package.json +++ b/package.json @@ -125,6 +125,13 @@ "icon": "$(folder-opened)", "enablement": "squiggy.packageInstalled" }, + { + "command": "squiggy.importSamplesFromTSV", + "title": "Import Samples from TSV", + "category": "Squiggy", + "icon": "$(table)", + "enablement": "squiggy.packageInstalled" + }, { "command": "squiggy.loadTestMultiReadDataset", "title": "Load Test Multi-Read Dataset", diff --git a/src/commands/tsv-commands.ts b/src/commands/tsv-commands.ts new file mode 100644 index 0000000..16e12a2 --- /dev/null +++ b/src/commands/tsv-commands.ts @@ -0,0 +1,318 @@ +/** + * TSV Import Commands + * + * Handles importing samples from TSV manifest files. + * Provides: + * - File picker for TSV selection + * - TSV parsing and validation + * - Preview UI with validation results + * - Smart loading (eager vs lazy based on sample count) + */ + +import * as vscode from 'vscode'; +import { promises as fs } from 'fs'; +import { ExtensionState } from '../state/extension-state'; +import { TSVParser, TSVSampleSpec, TSVParseResult } from '../services/tsv-parser'; +import { TSVPathResolver, PathResolutionStrategy } from '../services/tsv-path-resolver'; +import { TSVValidator, ValidationResult } from '../services/tsv-validator'; +import { FileLoadingService } from '../services/file-loading-service'; +import { SampleInfo } from '../state/extension-state'; +import { LoadedItem } from '../types/loaded-item'; +import { logger } from '../utils/logger'; + +/** + * Register TSV import commands + */ +export function registerTSVCommands( + context: vscode.ExtensionContext, + state: ExtensionState +): void { + context.subscriptions.push( + vscode.commands.registerCommand('squiggy.importSamplesFromTSV', async () => { + await importSamplesFromTSV(state); + }) + ); +} + +/** + * Import samples from TSV file + * + * TODO: Implement full import workflow + * - File picker + * - Parse TSV + * - Validate all samples + * - Show preview UI + * - Load samples (eager or lazy based on count) + */ +async function importSamplesFromTSV(state: ExtensionState): Promise { + // Step 1: File picker + const fileUri = await vscode.window.showOpenDialog({ + canSelectMany: false, + filters: { 'TSV Files': ['tsv', 'txt'], 'All Files': ['*'] }, + title: 'Import Samples from TSV', + }); + + if (!fileUri || !fileUri[0]) { + return; + } + + const tsvPath = fileUri[0].fsPath; + logger.info(`[TSV Import] Selected file: ${tsvPath}`); + + // Step 2: Read and parse TSV + let content: string; + try { + content = await fs.readFile(tsvPath, 'utf-8'); + } catch (error) { + vscode.window.showErrorMessage(`Failed to read TSV file: ${error}`); + return; + } + + const parseResult = TSVParser.parse(content); + + if (!parseResult.success) { + vscode.window.showErrorMessage( + `TSV parsing failed:\n\n${parseResult.errors.join('\n')}` + ); + return; + } + + logger.info(`[TSV Import] Parsed ${parseResult.samples.length} samples`); + if (parseResult.warnings.length > 0) { + logger.info(`[TSV Import] Warnings: ${parseResult.warnings.join('; ')}`); + } + + // Step 3: Validate all samples + const workspaceRoot = vscode.workspace.workspaceFolders?.[0]?.uri.fsPath || ''; + const pathResolver = new TSVPathResolver(tsvPath, workspaceRoot); + const validator = new TSVValidator(pathResolver, new Set(state.getAllSampleNames())); + + let validationResults: ValidationResult[]; + try { + validationResults = await vscode.window.withProgress( + { + location: vscode.ProgressLocation.Notification, + title: 'Validating TSV samples...', + cancellable: false, + }, + async () => { + return await validator.validateBatch(parseResult.samples); + } + ); + } catch (error) { + vscode.window.showErrorMessage(`Validation failed: ${error}`); + return; + } + + logger.info( + `[TSV Import] Validation complete: ${validationResults.filter((r) => r.valid).length}/${validationResults.length} valid` + ); + + // Step 4: Check for blocking errors + const hasErrors = validationResults.some((r) => !r.valid); + if (hasErrors) { + await showValidationResults(validationResults, 'Validation Failed - Fix Issues'); + return; // Block import + } + + // Step 5: Show preview & confirm + const confirmed = await showImportPreview(validationResults, parseResult.samples.length); + if (!confirmed) { + logger.info('[TSV Import] User cancelled import'); + return; + } + + // Step 6: Determine loading strategy + const shouldLoadEagerly = determineLoadingStrategy(parseResult.samples.length); + logger.info( + `[TSV Import] Loading strategy: ${shouldLoadEagerly ? 'EAGER' : 'LAZY'} (${parseResult.samples.length} samples)` + ); + + // Step 7: Load samples + await loadSamplesFromTSV( + parseResult.samples, + validationResults, + state, + shouldLoadEagerly + ); +} + +/** + * Show validation results in QuickPick UI + */ +async function showValidationResults( + results: ValidationResult[], + title: string +): Promise { + const items = results.map((r) => ({ + label: r.sampleName, + description: r.valid ? '✓ Valid' : '✗ Invalid', + detail: r.errors.length > 0 ? r.errors.join(', ') : r.warnings.join(', '), + iconPath: r.valid + ? new vscode.ThemeIcon('pass', new vscode.ThemeColor('testing.iconPassed')) + : new vscode.ThemeIcon('error', new vscode.ThemeColor('testing.iconFailed')), + })); + + await vscode.window.showQuickPick(items, { + title, + canPickMany: false, + placeHolder: + results.filter((r) => !r.valid).length > 0 + ? 'Fix errors and try again' + : 'All samples valid', + }); +} + +/** + * Show import preview and get confirmation + */ +async function showImportPreview( + results: ValidationResult[], + sampleCount: number +): Promise { + const summary = TSVValidator.getSummary(results); + + const message = [ + `Import ${sampleCount} samples?`, + ``, + `• ${summary.valid} valid samples`, + `• ${summary.withBam} with BAM files`, + `• ${summary.withFasta} with FASTA files`, + summary.totalWarnings > 0 ? `• ${summary.totalWarnings} warnings` : '', + ] + .filter((line) => line.length > 0) + .join('\n'); + + const choice = await vscode.window.showInformationMessage( + message, + { modal: true }, + 'Import', + 'Preview Details', + 'Cancel' + ); + + if (choice === 'Preview Details') { + await showValidationResults(results, 'Sample Preview'); + // Ask again after preview + return showImportPreview(results, sampleCount); + } + + return choice === 'Import'; +} + +/** + * Determine loading strategy based on sample count + * + * Heuristic: + * - ≤5 samples: Eager (fast to load) + * - ≥20 samples: Lazy (avoid overwhelming kernel) + * - 6-19 samples: Eager if small, lazy if large files + */ +function determineLoadingStrategy(sampleCount: number): boolean { + if (sampleCount <= 5) return true; // Always eager for small batches + if (sampleCount >= 20) return false; // Always lazy for large batches + + // Mid-range: default to 10 sample threshold + return sampleCount <= 10; +} + +/** + * Load samples from validated TSV specs + * + * TODO: Implement full loading logic + * - Create SampleInfo objects + * - Load to kernel (eager) or defer (lazy) + * - Update extension state + * - Refresh UI panels + */ +async function loadSamplesFromTSV( + specs: TSVSampleSpec[], + validationResults: ValidationResult[], + state: ExtensionState, + eager: boolean +): Promise { + const service = new FileLoadingService(state); + const tsvGroupId = `tsv_${Date.now()}`; // Batch ID for grouping + + logger.info( + `[TSV Import] Starting load for ${specs.length} samples (eager=${eager}, groupId=${tsvGroupId})` + ); + + // Show progress + await vscode.window.withProgress( + { + location: vscode.ProgressLocation.Notification, + title: `Importing ${specs.length} samples...`, + cancellable: false, + }, + async (progress) => { + for (let i = 0; i < specs.length; i++) { + const spec = specs[i]; + const validation = validationResults[i]; + + progress.report({ + message: `${spec.sampleName} (${i + 1}/${specs.length})`, + increment: (100 / specs.length), + }); + + try { + // TODO: Implement eager vs lazy loading + // For now, just create metadata (lazy mode) + + const sampleInfo: SampleInfo = { + sampleId: `sample:${spec.sampleName}`, + displayName: spec.sampleName, + pod5Path: validation.resolvedPod5!, + bamPath: validation.resolvedBam, + fastaPath: validation.resolvedFasta, + readCount: 0, // Will be populated on load + hasBam: !!validation.resolvedBam, + hasFasta: !!validation.resolvedFasta, + isLoaded: false, // TODO: Set to `eager` when implementing kernel loading + metadata: { + sourceType: 'tsv', + tsvGroup: tsvGroupId, + autoDetected: false, + }, + }; + + state.addSample(sampleInfo); + + // Also add to unified state for cross-panel sync + const loadedItem: LoadedItem = { + id: sampleInfo.sampleId, + type: 'sample', + sampleName: spec.sampleName, + pod5Path: validation.resolvedPod5!, + bamPath: validation.resolvedBam, + fastaPath: validation.resolvedFasta, + readCount: 0, + fileSize: 0, + fileSizeFormatted: 'Unknown', + hasAlignments: !!validation.resolvedBam, + hasReference: !!validation.resolvedFasta, + hasMods: false, + hasEvents: false, + }; + state.addLoadedItem(loadedItem); + + logger.info( + `[TSV Import] Registered sample: ${spec.sampleName} (loaded=${sampleInfo.isLoaded})` + ); + } catch (error) { + logger.error(`[TSV Import] Failed to load sample ${spec.sampleName}:`, error); + vscode.window.showErrorMessage(`Failed to load ${spec.sampleName}: ${error}`); + } + } + } + ); + + // Refresh Samples panel + state.samplesProvider?.refresh(); + + vscode.window.showInformationMessage( + `Imported ${specs.length} samples from TSV ${eager ? '(loaded to kernel)' : '(lazy load - will load on plot)'}` + ); + + logger.info(`[TSV Import] Import complete: ${specs.length} samples registered`); +} diff --git a/src/extension.ts b/src/extension.ts index cd83d30..c9f10bb 100644 --- a/src/extension.ts +++ b/src/extension.ts @@ -18,6 +18,7 @@ import { registerFileCommands } from './commands/file-commands'; import { registerPlotCommands } from './commands/plot-commands'; import { registerStateCommands } from './commands/state-commands'; import { registerSessionCommands } from './commands/session-commands'; +import { registerTSVCommands } from './commands/tsv-commands'; import { registerKernelListeners } from './listeners/kernel-listeners'; import { logger } from './utils/logger'; import { SquiggyKernelState } from './backend/squiggy-kernel-manager'; @@ -536,6 +537,7 @@ async function registerAllPanelsAndCommands(context: vscode.ExtensionContext): P registerPlotCommands(context, state); registerStateCommands(context, state); registerSessionCommands(context, state); + registerTSVCommands(context, state); // Register command to show logs context.subscriptions.push( diff --git a/src/services/__tests__/tsv-parser.test.ts b/src/services/__tests__/tsv-parser.test.ts new file mode 100644 index 0000000..9d8959b --- /dev/null +++ b/src/services/__tests__/tsv-parser.test.ts @@ -0,0 +1,351 @@ +/** + * Tests for TSVParser + */ + +import { TSVParser } from '../tsv-parser'; + +describe('TSVParser', () => { + describe('parse() - valid input', () => { + test('parses valid TSV with all columns', () => { + const content = `sample_name\tpod5\tbam\tfasta +sample_A\tdata/A.pod5\tdata/A.bam\tref.fa +sample_B\tdata/B.pod5\tdata/B.bam\tref.fa`; + + const result = TSVParser.parse(content); + + expect(result.success).toBe(true); + expect(result.errors).toHaveLength(0); + expect(result.samples).toHaveLength(2); + + expect(result.samples[0]).toEqual({ + sampleName: 'sample_A', + pod5Path: 'data/A.pod5', + bamPath: 'data/A.bam', + fastaPath: 'ref.fa', + lineNumber: 2, + }); + + expect(result.samples[1]).toEqual({ + sampleName: 'sample_B', + pod5Path: 'data/B.pod5', + bamPath: 'data/B.bam', + fastaPath: 'ref.fa', + lineNumber: 3, + }); + }); + + test('parses TSV with optional columns missing (no BAM)', () => { + const content = `sample_name\tpod5\tbam\tfasta +sample_A\tdata/A.pod5\t-\tref.fa +sample_B\tdata/B.pod5\t\tref.fa`; + + const result = TSVParser.parse(content); + + expect(result.success).toBe(true); + expect(result.samples).toHaveLength(2); + expect(result.samples[0].bamPath).toBeUndefined(); + expect(result.samples[1].bamPath).toBeUndefined(); + expect(result.warnings.length).toBeGreaterThan(0); + expect(result.warnings[0]).toContain('no BAM file'); + }); + + test('parses TSV with optional columns missing (no FASTA)', () => { + const content = `sample_name\tpod5\tbam\tfasta +sample_A\tdata/A.pod5\tdata/A.bam\t- +sample_B\tdata/B.pod5\tdata/B.bam\t`; + + const result = TSVParser.parse(content); + + expect(result.success).toBe(true); + expect(result.samples).toHaveLength(2); + expect(result.samples[0].fastaPath).toBeUndefined(); + expect(result.samples[1].fastaPath).toBeUndefined(); + expect(result.warnings.length).toBeGreaterThan(0); + expect(result.warnings[0]).toContain('no FASTA file'); + }); + + test('parses TSV with only required columns', () => { + const content = `sample_name\tpod5 +sample_A\tdata/A.pod5 +sample_B\tdata/B.pod5`; + + const result = TSVParser.parse(content); + + expect(result.success).toBe(true); + expect(result.samples).toHaveLength(2); + expect(result.samples[0].bamPath).toBeUndefined(); + expect(result.samples[0].fastaPath).toBeUndefined(); + }); + + test('handles Windows line endings (CRLF)', () => { + const content = `sample_name\tpod5\r\nsample_A\tdata/A.pod5\r\nsample_B\tdata/B.pod5`; + + const result = TSVParser.parse(content); + + expect(result.success).toBe(true); + expect(result.samples).toHaveLength(2); + }); + + test('skips empty lines', () => { + const content = `sample_name\tpod5 + +sample_A\tdata/A.pod5 + +sample_B\tdata/B.pod5 + +`; + + const result = TSVParser.parse(content); + + expect(result.success).toBe(true); + expect(result.samples).toHaveLength(2); + }); + + test('skips comment lines starting with #', () => { + const content = `# This is a comment +sample_name\tpod5 +# Another comment +sample_A\tdata/A.pod5 +sample_B\tdata/B.pod5`; + + const result = TSVParser.parse(content); + + expect(result.success).toBe(true); + expect(result.samples).toHaveLength(2); + }); + + test('auto-detects comma delimiter', () => { + const content = `sample_name,pod5,bam +sample_A,data/A.pod5,data/A.bam +sample_B,data/B.pod5,data/B.bam`; + + const result = TSVParser.parse(content); + + expect(result.success).toBe(true); + expect(result.samples).toHaveLength(2); + expect(result.samples[0].pod5Path).toBe('data/A.pod5'); + expect(result.samples[0].bamPath).toBe('data/A.bam'); + }); + + test('handles mixed case column names', () => { + const content = `Sample_Name\tPOD5\tBAM +sample_A\tdata/A.pod5\tdata/A.bam`; + + const result = TSVParser.parse(content); + + expect(result.success).toBe(true); + expect(result.samples).toHaveLength(1); + }); + + test('trims whitespace from cells', () => { + const content = `sample_name\tpod5\tbam + sample_A \t data/A.pod5 \t data/A.bam `; + + const result = TSVParser.parse(content); + + expect(result.success).toBe(true); + expect(result.samples[0].sampleName).toBe('sample_A'); + expect(result.samples[0].pod5Path).toBe('data/A.pod5'); + expect(result.samples[0].bamPath).toBe('data/A.bam'); + }); + }); + + describe('parse() - error cases', () => { + test('rejects empty file', () => { + const content = ''; + + const result = TSVParser.parse(content); + + expect(result.success).toBe(false); + expect(result.errors).toContain('TSV file is empty'); + }); + + test('rejects file with only whitespace', () => { + const content = ' \n \n '; + + const result = TSVParser.parse(content); + + expect(result.success).toBe(false); + expect(result.errors).toContain('TSV file is empty'); + }); + + test('rejects TSV missing sample_name column', () => { + const content = `pod5\tbam +data/A.pod5\tdata/A.bam`; + + const result = TSVParser.parse(content); + + expect(result.success).toBe(false); + expect(result.errors).toContain('Missing required column: sample_name'); + }); + + test('rejects TSV missing pod5 column', () => { + const content = `sample_name\tbam +sample_A\tdata/A.bam`; + + const result = TSVParser.parse(content); + + expect(result.success).toBe(false); + expect(result.errors).toContain('Missing required column: pod5'); + }); + + test('rejects rows with missing sample_name', () => { + const content = `sample_name\tpod5 +\tdata/A.pod5 +sample_B\tdata/B.pod5`; + + const result = TSVParser.parse(content); + + expect(result.success).toBe(false); + expect(result.errors).toContain('Line 2: Missing sample_name'); + }); + + test('rejects rows with missing pod5 path', () => { + const content = `sample_name\tpod5 +sample_A\t +sample_B\tdata/B.pod5`; + + const result = TSVParser.parse(content); + + expect(result.success).toBe(false); + expect(result.errors).toContain( + "Line 2: Missing pod5 path for sample 'sample_A'" + ); + }); + + test('rejects rows with pod5 path as dash', () => { + const content = `sample_name\tpod5 +sample_A\t- +sample_B\tdata/B.pod5`; + + const result = TSVParser.parse(content); + + expect(result.success).toBe(false); + expect(result.errors).toContain( + "Line 2: Missing pod5 path for sample 'sample_A'" + ); + }); + + test('detects duplicate sample names', () => { + const content = `sample_name\tpod5 +sample_A\tdata/A.pod5 +sample_A\tdata/A2.pod5`; + + const result = TSVParser.parse(content); + + expect(result.success).toBe(false); + expect(result.errors).toContain("Line 3: Duplicate sample name 'sample_A'"); + }); + + test('reports multiple errors', () => { + const content = `sample_name\tpod5 +\tdata/A.pod5 +sample_B\t +sample_C\tdata/C.pod5 +sample_C\tdata/C2.pod5`; + + const result = TSVParser.parse(content); + + expect(result.success).toBe(false); + expect(result.errors.length).toBeGreaterThanOrEqual(3); + }); + }); + + describe('parse() - warnings', () => { + test('warns when BAM is missing', () => { + const content = `sample_name\tpod5\tbam +sample_A\tdata/A.pod5\t-`; + + const result = TSVParser.parse(content); + + expect(result.success).toBe(true); + expect(result.warnings.length).toBeGreaterThan(0); + expect(result.warnings[0]).toContain('no BAM file'); + expect(result.warnings[0]).toContain('alignment features unavailable'); + }); + + test('warns when FASTA is missing', () => { + const content = `sample_name\tpod5\tfasta +sample_A\tdata/A.pod5\t-`; + + const result = TSVParser.parse(content); + + expect(result.success).toBe(true); + expect(result.warnings.length).toBeGreaterThan(0); + expect(result.warnings[0]).toContain('no FASTA file'); + expect(result.warnings[0]).toContain('reference sequence unavailable'); + }); + + test('warns for multiple missing optional files', () => { + const content = `sample_name\tpod5\tbam\tfasta +sample_A\tdata/A.pod5\t-\t-`; + + const result = TSVParser.parse(content); + + expect(result.success).toBe(true); + expect(result.warnings.length).toBe(2); // One for BAM, one for FASTA + }); + }); + + describe('parse() - edge cases', () => { + test('handles single sample', () => { + const content = `sample_name\tpod5 +sample_A\tdata/A.pod5`; + + const result = TSVParser.parse(content); + + expect(result.success).toBe(true); + expect(result.samples).toHaveLength(1); + }); + + test('handles many samples (24)', () => { + const rows = ['sample_name\tpod5']; + for (let i = 1; i <= 24; i++) { + rows.push(`sample_${i}\tdata/sample_${i}.pod5`); + } + const content = rows.join('\n'); + + const result = TSVParser.parse(content); + + expect(result.success).toBe(true); + expect(result.samples).toHaveLength(24); + }); + + test('preserves line numbers correctly', () => { + const content = `# Comment line +sample_name\tpod5 + +sample_A\tdata/A.pod5 + +sample_B\tdata/B.pod5`; + + const result = TSVParser.parse(content); + + expect(result.success).toBe(true); + // Line numbers should account for empty lines and comments + expect(result.samples[0].lineNumber).toBeDefined(); + expect(result.samples[1].lineNumber).toBeDefined(); + }); + + test('handles file paths with spaces', () => { + const content = `sample_name\tpod5\tbam +sample_A\tdata/my file.pod5\tdata/my file.bam`; + + const result = TSVParser.parse(content); + + expect(result.success).toBe(true); + expect(result.samples[0].pod5Path).toBe('data/my file.pod5'); + expect(result.samples[0].bamPath).toBe('data/my file.bam'); + }); + + test('handles absolute paths', () => { + const content = `sample_name\tpod5 +sample_A\t/absolute/path/to/A.pod5`; + + const result = TSVParser.parse(content); + + expect(result.success).toBe(true); + expect(result.samples[0].pod5Path).toBe('/absolute/path/to/A.pod5'); + }); + }); +}); diff --git a/src/services/tsv-parser.ts b/src/services/tsv-parser.ts new file mode 100644 index 0000000..7d994c3 --- /dev/null +++ b/src/services/tsv-parser.ts @@ -0,0 +1,181 @@ +/** + * TSV Parser - Parse sample manifest files + * + * Parses tab-separated value files containing sample metadata and file paths. + * Expected format: + * sample_name\tpod5\tbam\tfasta + * sample_A\tdata/A.pod5\tdata/A.bam\tref.fa + * + * Required columns: sample_name, pod5 + * Optional columns: bam, fasta + */ + +export interface TSVSampleSpec { + /** User-facing sample name (must be unique) */ + sampleName: string; + + /** Path to POD5 file (required) */ + pod5Path: string; + + /** Path to BAM file (optional) */ + bamPath?: string; + + /** Path to FASTA reference file (optional) */ + fastaPath?: string; + + /** Line number in TSV (for error reporting) */ + lineNumber: number; +} + +export interface TSVParseResult { + /** Whether parsing succeeded */ + success: boolean; + + /** Parsed sample specifications */ + samples: TSVSampleSpec[]; + + /** Critical errors that prevent parsing */ + errors: string[]; + + /** Non-blocking warnings */ + warnings: string[]; +} + +export class TSVParser { + /** + * Parse TSV content into sample specifications + * + * @param content - Raw TSV file content or clipboard paste + * @returns Parse result with samples or errors + */ + static parse(content: string): TSVParseResult { + const errors: string[] = []; + const warnings: string[] = []; + const samples: TSVSampleSpec[] = []; + + // Split into lines and filter empty lines + const lines = content + .split(/\r?\n/) + .map((line) => line.trim()) + .filter((line) => line.length > 0 && !line.startsWith('#')); // Skip comments + + if (lines.length === 0) { + errors.push('TSV file is empty'); + return { success: false, samples: [], errors, warnings }; + } + + // Auto-detect delimiter (tab or comma) + const delimiter = this.detectDelimiter(lines[0]); + + // Parse header row + const headerCells = lines[0].split(delimiter).map((cell) => cell.trim().toLowerCase()); + const columnMap = this.buildColumnMap(headerCells); + + // Validate required columns + if (!columnMap.has('sample_name')) { + errors.push('Missing required column: sample_name'); + } + if (!columnMap.has('pod5')) { + errors.push('Missing required column: pod5'); + } + + if (errors.length > 0) { + return { success: false, samples: [], errors, warnings }; + } + + // Track duplicate sample names + const seenNames = new Set(); + + // Parse data rows + for (let i = 1; i < lines.length; i++) { + const lineNumber = i + 1; // 1-indexed for user-friendly error messages + const line = lines[i]; + const cells = line.split(delimiter).map((cell) => cell.trim()); + + // Extract values using column map + const sampleName = cells[columnMap.get('sample_name')!] || ''; + const pod5Path = cells[columnMap.get('pod5')!] || ''; + const bamPath = columnMap.has('bam') ? cells[columnMap.get('bam')!] : undefined; + const fastaPath = columnMap.has('fasta') + ? cells[columnMap.get('fasta')!] + : undefined; + + // Validate required fields + if (!sampleName) { + errors.push(`Line ${lineNumber}: Missing sample_name`); + continue; + } + + if (!pod5Path || pod5Path === '-') { + errors.push(`Line ${lineNumber}: Missing pod5 path for sample '${sampleName}'`); + continue; + } + + // Check for duplicates + if (seenNames.has(sampleName)) { + errors.push(`Line ${lineNumber}: Duplicate sample name '${sampleName}'`); + continue; + } + seenNames.add(sampleName); + + // Handle optional fields (treat '-' as missing) + const resolvedBamPath = + bamPath && bamPath !== '-' && bamPath !== '' ? bamPath : undefined; + const resolvedFastaPath = + fastaPath && fastaPath !== '-' && fastaPath !== '' ? fastaPath : undefined; + + // Add warnings for missing optional files + if (!resolvedBamPath) { + warnings.push( + `Line ${lineNumber}: Sample '${sampleName}' has no BAM file (alignment features unavailable)` + ); + } + if (!resolvedFastaPath) { + warnings.push( + `Line ${lineNumber}: Sample '${sampleName}' has no FASTA file (reference sequence unavailable)` + ); + } + + // Create sample spec + samples.push({ + sampleName, + pod5Path, + bamPath: resolvedBamPath, + fastaPath: resolvedFastaPath, + lineNumber, + }); + } + + return { + success: errors.length === 0, + samples, + errors, + warnings, + }; + } + + /** + * Auto-detect delimiter (tab or comma) + * Prefer tab, fall back to comma + */ + private static detectDelimiter(headerLine: string): string { + if (headerLine.includes('\t')) { + return '\t'; + } else if (headerLine.includes(',')) { + return ','; + } + return '\t'; // Default to tab + } + + /** + * Build column index map from header row + * Maps column name → column index + */ + private static buildColumnMap(headerCells: string[]): Map { + const map = new Map(); + for (let i = 0; i < headerCells.length; i++) { + map.set(headerCells[i], i); + } + return map; + } +} diff --git a/src/services/tsv-path-resolver.ts b/src/services/tsv-path-resolver.ts new file mode 100644 index 0000000..0cade85 --- /dev/null +++ b/src/services/tsv-path-resolver.ts @@ -0,0 +1,180 @@ +/** + * TSV Path Resolver - Resolve relative paths from TSV files + * + * Handles multiple path resolution strategies for TSV-specified file paths: + * - Absolute paths: Use as-is + * - TSV-relative: Relative to TSV file location + * - Workspace-relative: Relative to workspace root + * - Auto: Try multiple strategies + */ + +import * as path from 'path'; +import { promises as fs } from 'fs'; + +export enum PathResolutionStrategy { + /** Relative to TSV file directory */ + TsvRelative = 'tsv-relative', + + /** Relative to workspace root directory */ + WorkspaceRelative = 'workspace', + + /** Use path as-is (absolute paths) */ + Absolute = 'absolute', + + /** Try multiple strategies (absolute → tsv-relative → workspace-relative) */ + Auto = 'auto', +} + +export interface PathResolutionResult { + /** Successfully resolved path, or null if not found */ + resolvedPath: string | null; + + /** Strategy that succeeded */ + strategy: PathResolutionStrategy | null; + + /** Error message if resolution failed */ + error?: string; +} + +export class TSVPathResolver { + /** + * @param tsvFilePath - Path to the TSV file (null if pasted from clipboard) + * @param workspaceRoot - Workspace root directory + */ + constructor( + private tsvFilePath: string | null, + private workspaceRoot: string + ) {} + + /** + * Resolve a file path using the specified strategy + * + * @param rawPath - Path from TSV file (may be relative or absolute) + * @param strategy - Resolution strategy to use + * @returns Resolved absolute path or null if file not found + */ + async resolve( + rawPath: string, + strategy: PathResolutionStrategy = PathResolutionStrategy.Auto + ): Promise { + // TODO: Implement path resolution logic + // For now, return a stub implementation + + // Handle absolute paths + if (path.isAbsolute(rawPath)) { + const exists = await this.fileExists(rawPath); + if (exists) { + return { + resolvedPath: rawPath, + strategy: PathResolutionStrategy.Absolute, + }; + } else { + return { + resolvedPath: null, + strategy: null, + error: `File not found: ${rawPath}`, + }; + } + } + + // Auto strategy: try multiple approaches + if (strategy === PathResolutionStrategy.Auto) { + // Try TSV-relative first (if TSV path available) + if (this.tsvFilePath) { + const tsvRelative = await this.resolveTsvRelative(rawPath); + if (tsvRelative.resolvedPath) { + return tsvRelative; + } + } + + // Try workspace-relative + const workspaceRelative = await this.resolveWorkspaceRelative(rawPath); + if (workspaceRelative.resolvedPath) { + return workspaceRelative; + } + + return { + resolvedPath: null, + strategy: null, + error: `Could not resolve path: ${rawPath} (tried TSV-relative and workspace-relative)`, + }; + } + + // Single strategy + switch (strategy) { + case PathResolutionStrategy.TsvRelative: + return this.resolveTsvRelative(rawPath); + case PathResolutionStrategy.WorkspaceRelative: + return this.resolveWorkspaceRelative(rawPath); + default: + return { + resolvedPath: null, + strategy: null, + error: `Unknown strategy: ${strategy}`, + }; + } + } + + /** + * Resolve path relative to TSV file directory + */ + private async resolveTsvRelative(rawPath: string): Promise { + if (!this.tsvFilePath) { + return { + resolvedPath: null, + strategy: null, + error: 'Cannot use TSV-relative resolution without TSV file path', + }; + } + + const tsvDir = path.dirname(this.tsvFilePath); + const resolvedPath = path.resolve(tsvDir, rawPath); + + const exists = await this.fileExists(resolvedPath); + if (exists) { + return { + resolvedPath, + strategy: PathResolutionStrategy.TsvRelative, + }; + } else { + return { + resolvedPath: null, + strategy: null, + error: `File not found (TSV-relative): ${resolvedPath}`, + }; + } + } + + /** + * Resolve path relative to workspace root + */ + private async resolveWorkspaceRelative(rawPath: string): Promise { + const resolvedPath = path.resolve(this.workspaceRoot, rawPath); + + const exists = await this.fileExists(resolvedPath); + if (exists) { + return { + resolvedPath, + strategy: PathResolutionStrategy.WorkspaceRelative, + }; + } else { + return { + resolvedPath: null, + strategy: null, + error: `File not found (workspace-relative): ${resolvedPath}`, + }; + } + } + + /** + * Check if file exists + */ + private async fileExists(filePath: string): Promise { + try { + await fs.access(filePath); + return true; + } catch { + return false; + } + } +} diff --git a/src/services/tsv-validator.ts b/src/services/tsv-validator.ts new file mode 100644 index 0000000..532728d --- /dev/null +++ b/src/services/tsv-validator.ts @@ -0,0 +1,173 @@ +/** + * TSV Validator - Validate sample specifications from TSV + * + * Validates: + * - File existence (POD5 required, BAM/FASTA optional) + * - POD5/BAM read ID overlap (reuses existing validation logic) + * - BAM/FASTA reference name overlap (reuses existing validation logic) + * - Sample name conflicts with already-loaded samples + */ + +import { TSVSampleSpec } from './tsv-parser'; +import { TSVPathResolver, PathResolutionResult } from './tsv-path-resolver'; + +export interface ValidationResult { + /** Sample name being validated */ + sampleName: string; + + /** Whether validation passed (no blocking errors) */ + valid: boolean; + + /** Blocking errors that prevent loading */ + errors: string[]; + + /** Non-blocking warnings */ + warnings: string[]; + + /** Resolved POD5 path (if found) */ + resolvedPod5?: string; + + /** Resolved BAM path (if found) */ + resolvedBam?: string; + + /** Resolved FASTA path (if found) */ + resolvedFasta?: string; + + /** Path resolution strategies used */ + resolutionStrategies?: { + pod5?: string; + bam?: string; + fasta?: string; + }; +} + +export class TSVValidator { + /** + * @param pathResolver - Path resolver for file lookups + * @param existingLoadedSamples - Set of already-loaded sample names (to check conflicts) + */ + constructor( + private pathResolver: TSVPathResolver, + private existingLoadedSamples: Set + ) {} + + /** + * Validate a single sample specification + * + * TODO: Implement full validation logic + * - Resolve all paths + * - Check POD5 exists (BLOCK if missing) + * - Check BAM exists (WARN if missing) + * - Check FASTA exists (WARN if missing) + * - Validate POD5/BAM overlap (reuse existing logic from file-commands.ts) + * - Validate BAM/FASTA overlap (reuse existing logic) + * - Check sample name conflicts + * + * @param spec - TSV sample specification to validate + * @returns Validation result with errors/warnings and resolved paths + */ + async validateSample(spec: TSVSampleSpec): Promise { + const errors: string[] = []; + const warnings: string[] = []; + let resolvedPod5: string | undefined; + let resolvedBam: string | undefined; + let resolvedFasta: string | undefined; + const resolutionStrategies: ValidationResult['resolutionStrategies'] = {}; + + // Check for sample name conflicts + if (this.existingLoadedSamples.has(spec.sampleName)) { + errors.push( + `Sample name '${spec.sampleName}' conflicts with already-loaded sample` + ); + } + + // Validate POD5 (required) + const pod5Result = await this.pathResolver.resolve(spec.pod5Path); + if (pod5Result.resolvedPath) { + resolvedPod5 = pod5Result.resolvedPath; + resolutionStrategies.pod5 = pod5Result.strategy || 'unknown'; + } else { + errors.push( + `POD5 file not found: ${spec.pod5Path} (${pod5Result.error || 'unknown error'})` + ); + } + + // Validate BAM (optional) + if (spec.bamPath) { + const bamResult = await this.pathResolver.resolve(spec.bamPath); + if (bamResult.resolvedPath) { + resolvedBam = bamResult.resolvedPath; + resolutionStrategies.bam = bamResult.strategy || 'unknown'; + } else { + warnings.push( + `BAM file not found: ${spec.bamPath} (alignment features will be unavailable)` + ); + } + } + + // Validate FASTA (optional) + if (spec.fastaPath) { + const fastaResult = await this.pathResolver.resolve(spec.fastaPath); + if (fastaResult.resolvedPath) { + resolvedFasta = fastaResult.resolvedPath; + resolutionStrategies.fasta = fastaResult.strategy || 'unknown'; + } else { + warnings.push( + `FASTA file not found: ${spec.fastaPath} (reference sequence will be unavailable)` + ); + } + } + + // TODO: Validate POD5/BAM overlap (requires loading files - expensive) + // For now, defer to loading time + // Future: Add optional quick validation using FileLoadingService + + return { + sampleName: spec.sampleName, + valid: errors.length === 0, + errors, + warnings, + resolvedPod5, + resolvedBam, + resolvedFasta, + resolutionStrategies, + }; + } + + /** + * Validate all samples in batch + * + * @param specs - Array of TSV sample specifications + * @returns Array of validation results (parallel execution) + */ + async validateBatch(specs: TSVSampleSpec[]): Promise { + // Validate all samples in parallel for speed + return Promise.all(specs.map((spec) => this.validateSample(spec))); + } + + /** + * Get summary of validation results + * + * @param results - Array of validation results + * @returns Summary statistics + */ + static getSummary(results: ValidationResult[]): { + total: number; + valid: number; + invalid: number; + withBam: number; + withFasta: number; + totalErrors: number; + totalWarnings: number; + } { + return { + total: results.length, + valid: results.filter((r) => r.valid).length, + invalid: results.filter((r) => !r.valid).length, + withBam: results.filter((r) => r.resolvedBam).length, + withFasta: results.filter((r) => r.resolvedFasta).length, + totalErrors: results.reduce((sum, r) => sum + r.errors.length, 0), + totalWarnings: results.reduce((sum, r) => sum + r.warnings.length, 0), + }; + } +}