Skip to content

Commit 9da6a71

Browse files
committed
fix database mismatch for word-level timestamps for multi-track audio
1 parent eebdc83 commit 9da6a71

File tree

7 files changed

+50
-100
lines changed

7 files changed

+50
-100
lines changed

internal/transcription/adapters/canary_adapter.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -574,7 +574,7 @@ func (c *CanaryAdapter) Transcribe(ctx context.Context, input interfaces.AudioIn
574574

575575
logger.Info("Canary transcription completed",
576576
"segments", len(result.Segments),
577-
"words", len(result.Words),
577+
"words", len(result.WordSegments),
578578
"processing_time", result.ProcessingTime,
579579
"task", c.GetStringParameter(params, "task"))
580580

@@ -665,7 +665,7 @@ func (c *CanaryAdapter) parseResult(tempDir string, input interfaces.AudioInput,
665665
Text: canaryResult.Transcription,
666666
Language: resultLanguage,
667667
Segments: make([]interfaces.TranscriptSegment, len(canaryResult.SegmentTimestamps)),
668-
Words: make([]interfaces.TranscriptWord, len(canaryResult.WordTimestamps)),
668+
WordSegments: make([]interfaces.TranscriptWord, len(canaryResult.WordTimestamps)),
669669
Confidence: 0.0, // Default confidence
670670
}
671671

@@ -681,7 +681,7 @@ func (c *CanaryAdapter) parseResult(tempDir string, input interfaces.AudioInput,
681681

682682
// Convert words
683683
for i, word := range canaryResult.WordTimestamps {
684-
result.Words[i] = interfaces.TranscriptWord{
684+
result.WordSegments[i] = interfaces.TranscriptWord{
685685
Start: word.Start,
686686
End: word.End,
687687
Word: word.Word,

internal/transcription/adapters/parakeet_adapter.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -512,7 +512,7 @@ func (p *ParakeetAdapter) Transcribe(ctx context.Context, input interfaces.Audio
512512

513513
logger.Info("Parakeet transcription completed",
514514
"segments", len(result.Segments),
515-
"words", len(result.Words),
515+
"words", len(result.WordSegments),
516516
"processing_time", result.ProcessingTime)
517517

518518
return result, nil
@@ -581,7 +581,7 @@ func (p *ParakeetAdapter) parseResult(tempDir string, input interfaces.AudioInpu
581581
Text: parakeetResult.Transcription,
582582
Language: parakeetResult.Language,
583583
Segments: make([]interfaces.TranscriptSegment, len(parakeetResult.SegmentTimestamps)),
584-
Words: make([]interfaces.TranscriptWord, len(parakeetResult.WordTimestamps)),
584+
WordSegments: make([]interfaces.TranscriptWord, len(parakeetResult.WordTimestamps)),
585585
Confidence: 0.0, // Default confidence
586586
}
587587

@@ -596,7 +596,7 @@ func (p *ParakeetAdapter) parseResult(tempDir string, input interfaces.AudioInpu
596596

597597
// Convert words
598598
for i, word := range parakeetResult.WordTimestamps {
599-
result.Words[i] = interfaces.TranscriptWord{
599+
result.WordSegments[i] = interfaces.TranscriptWord{
600600
Start: word.Start,
601601
End: word.End,
602602
Word: word.Word,

internal/transcription/adapters/whisperx_adapter.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -428,7 +428,7 @@ func (w *WhisperXAdapter) Transcribe(ctx context.Context, input interfaces.Audio
428428

429429
logger.Info("WhisperX transcription completed",
430430
"segments", len(result.Segments),
431-
"words", len(result.Words),
431+
"words", len(result.WordSegments),
432432
"processing_time", result.ProcessingTime)
433433

434434
return result, nil
@@ -552,7 +552,7 @@ func (w *WhisperXAdapter) parseResult(outputDir string, input interfaces.AudioIn
552552
result := &interfaces.TranscriptResult{
553553
Language: whisperxResult.Language,
554554
Segments: make([]interfaces.TranscriptSegment, len(whisperxResult.Segments)),
555-
Words: make([]interfaces.TranscriptWord, len(whisperxResult.Word)),
555+
WordSegments: make([]interfaces.TranscriptWord, len(whisperxResult.Word)),
556556
Confidence: 0.0, // WhisperX doesn't provide overall confidence
557557
}
558558

@@ -570,7 +570,7 @@ func (w *WhisperXAdapter) parseResult(outputDir string, input interfaces.AudioIn
570570

571571
// Convert words
572572
for i, word := range whisperxResult.Word {
573-
result.Words[i] = interfaces.TranscriptWord{
573+
result.WordSegments[i] = interfaces.TranscriptWord{
574574
Start: word.Start,
575575
End: word.End,
576576
Word: word.Word,

internal/transcription/interfaces/interfaces.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ type TranscriptResult struct {
7878
Text string `json:"text"`
7979
Language string `json:"language"`
8080
Segments []TranscriptSegment `json:"segments"`
81-
Words []TranscriptWord `json:"words,omitempty"`
81+
WordSegments []TranscriptWord `json:"word_segments,omitempty"`
8282
Confidence float64 `json:"confidence"`
8383
ProcessingTime time.Duration `json:"processing_time"`
8484
ModelUsed string `json:"model_used"`

internal/transcription/multitrack_transcriber.go

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -331,7 +331,7 @@ func (mt *MultiTrackTranscriber) transcribeIndividualTrack(ctx context.Context,
331331
logger.Info("Successfully transcribed track",
332332
"track_name", trackFile.FileName,
333333
"model_family", trackParams.ModelFamily,
334-
"word_count", len(result.Words),
334+
"word_count", len(result.WordSegments),
335335
"segment_count", len(result.Segments))
336336

337337
return result, nil
@@ -434,10 +434,10 @@ func (mt *MultiTrackTranscriber) mergeTrackTranscripts(trackTranscripts []TrackT
434434
logger.Info("Collecting words from track",
435435
"speaker", speaker,
436436
"offset", offset,
437-
"word_count", len(trackTranscript.Result.Words))
437+
"word_count", len(trackTranscript.Result.WordSegments))
438438

439439
// Collect words with offset adjustment and speaker assignment
440-
for _, word := range trackTranscript.Result.Words {
440+
for _, word := range trackTranscript.Result.WordSegments {
441441
adjustedWord := interfaces.Word{
442442
Start: word.Start + offset,
443443
End: word.End + offset,
@@ -502,10 +502,10 @@ func (mt *MultiTrackTranscriber) mergeTrackTranscripts(trackTranscripts []TrackT
502502
}
503503

504504
mergedResult := &interfaces.TranscriptResult{
505-
Segments: speakerTurns,
506-
Words: allWords,
507-
Language: language,
508-
Text: mergedText.String(),
505+
Segments: speakerTurns,
506+
WordSegments: allWords,
507+
Language: language,
508+
Text: mergedText.String(),
509509
}
510510

511511
logger.Info("Sort-and-group merging completed successfully",
@@ -655,7 +655,7 @@ func (mt *MultiTrackTranscriber) logIndividualTranscript(fileName string, result
655655
"offset", offset,
656656
"language", result.Language,
657657
"total_segments", len(result.Segments),
658-
"total_words", len(result.Words))
658+
"total_words", len(result.WordSegments))
659659

660660
// Log segment-level data
661661
logger.Info("--- SEGMENTS (Original Timestamps) ---", "file", fileName)
@@ -687,7 +687,7 @@ func (mt *MultiTrackTranscriber) logIndividualTranscript(fileName string, result
687687

688688
// Log word-level data (original timestamps)
689689
logger.Info("--- WORDS (Original Timestamps) ---", "file", fileName)
690-
for i, word := range result.Words {
690+
for i, word := range result.WordSegments {
691691
logger.Debug("Word",
692692
"file", fileName,
693693
"index", i+1,
@@ -700,7 +700,7 @@ func (mt *MultiTrackTranscriber) logIndividualTranscript(fileName string, result
700700

701701
// Log word-level data with offset applied
702702
logger.Info("--- WORDS (With Offset Applied) ---", "file", fileName, "offset", offset)
703-
for i, word := range result.Words {
703+
for i, word := range result.WordSegments {
704704
adjustedStart := word.Start + offset
705705
adjustedEnd := word.End + offset
706706
logger.Info("Adjusted Word",

internal/transcription/unified_service.go

Lines changed: 9 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -702,12 +702,12 @@ func (u *UnifiedTranscriptionService) mergeDiarizationWithTranscription(transcri
702702
}
703703

704704
// Also assign speakers to words if available
705-
if len(transcript.Words) > 0 {
706-
mergedTranscript.Words = make([]interfaces.TranscriptWord, len(transcript.Words))
707-
copy(mergedTranscript.Words, transcript.Words)
708-
709-
for i := range mergedTranscript.Words {
710-
word := &mergedTranscript.Words[i]
705+
if len(transcript.WordSegments) > 0 {
706+
mergedTranscript.WordSegments = make([]interfaces.TranscriptWord, len(transcript.WordSegments))
707+
copy(mergedTranscript.WordSegments, transcript.WordSegments)
708+
709+
for i := range mergedTranscript.WordSegments {
710+
word := &mergedTranscript.WordSegments[i]
711711
bestSpeaker := u.findBestSpeakerForSegment(word.Start, word.End, diarization.Segments)
712712
if bestSpeaker != "" {
713713
word.Speaker = &bestSpeaker
@@ -757,81 +757,10 @@ func (u *UnifiedTranscriptionService) saveTranscriptionResults(jobID string, res
757757
return nil
758758
}
759759

760-
// convertTranscriptResultToJSON converts the interface result to the expected JSON format
760+
// convertTranscriptResultToJSON converts the interface result to JSON format
761761
func (u *UnifiedTranscriptionService) convertTranscriptResultToJSON(result *interfaces.TranscriptResult) (string, error) {
762-
// Convert to the format expected by the existing database schema
763-
legacyFormat := struct {
764-
Segments []struct {
765-
Start float64 `json:"start"`
766-
End float64 `json:"end"`
767-
Text string `json:"text"`
768-
Speaker *string `json:"speaker,omitempty"`
769-
} `json:"segments"`
770-
Word []struct {
771-
Start float64 `json:"start"`
772-
End float64 `json:"end"`
773-
Word string `json:"word"`
774-
Score float64 `json:"score"`
775-
Speaker *string `json:"speaker,omitempty"`
776-
} `json:"word_segments,omitempty"`
777-
Language string `json:"language"`
778-
Text string `json:"text"`
779-
}{
780-
Language: result.Language,
781-
Text: result.Text,
782-
}
783-
784-
// Convert segments
785-
legacyFormat.Segments = make([]struct {
786-
Start float64 `json:"start"`
787-
End float64 `json:"end"`
788-
Text string `json:"text"`
789-
Speaker *string `json:"speaker,omitempty"`
790-
}, len(result.Segments))
791-
792-
for i, seg := range result.Segments {
793-
legacyFormat.Segments[i] = struct {
794-
Start float64 `json:"start"`
795-
End float64 `json:"end"`
796-
Text string `json:"text"`
797-
Speaker *string `json:"speaker,omitempty"`
798-
}{
799-
Start: seg.Start,
800-
End: seg.End,
801-
Text: seg.Text,
802-
Speaker: seg.Speaker,
803-
}
804-
}
805-
806-
// Convert words
807-
if len(result.Words) > 0 {
808-
legacyFormat.Word = make([]struct {
809-
Start float64 `json:"start"`
810-
End float64 `json:"end"`
811-
Word string `json:"word"`
812-
Score float64 `json:"score"`
813-
Speaker *string `json:"speaker,omitempty"`
814-
}, len(result.Words))
815-
816-
for i, word := range result.Words {
817-
legacyFormat.Word[i] = struct {
818-
Start float64 `json:"start"`
819-
End float64 `json:"end"`
820-
Word string `json:"word"`
821-
Score float64 `json:"score"`
822-
Speaker *string `json:"speaker,omitempty"`
823-
}{
824-
Start: word.Start,
825-
End: word.End,
826-
Word: word.Word,
827-
Score: word.Score,
828-
Speaker: word.Speaker,
829-
}
830-
}
831-
}
832-
833-
// Convert to JSON string
834-
jsonBytes, err := json.Marshal(legacyFormat)
762+
// Now that the struct fields match the JSON field names, we can directly marshal
763+
jsonBytes, err := json.Marshal(result)
835764
if err != nil {
836765
return "", err
837766
}

web/landing/public/api/undocumented.json

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,27 @@
7070
"description": "Upload a video file, extract audio from it using ffmpeg, and create a transcription job",
7171
"tag": "transcription"
7272
},
73+
{
74+
"method": "POST",
75+
"path": "/api/v1/transcription/upload-multitrack",
76+
"summary": "Upload multi-track audio files",
77+
"description": "Upload multiple audio files with an .aup file for multi-track transcription",
78+
"tag": "transcription"
79+
},
80+
{
81+
"method": "GET",
82+
"path": "/api/v1/transcription/{id}/merge-status",
83+
"summary": "Get multi-track merge status",
84+
"description": "Get the current merge status for a multi-track job",
85+
"tag": "transcription"
86+
},
87+
{
88+
"method": "GET",
89+
"path": "/api/v1/transcription/{id}/track-progress",
90+
"summary": "Get multi-track job progress",
91+
"description": "Get real-time progress information for individual tracks in a multi-track job",
92+
"tag": "transcription"
93+
},
7394
{
7495
"method": "POST",
7596
"path": "/api/v1/transcription/submit",

0 commit comments

Comments
 (0)