Skip to content

Commit 6abfcff

Browse files
authored
Djt/0119/realtime (#89)
* Added segmentation and realtime support
* Added some debugging
* Fixed realtime for transcribe
* Updated text streaming
* Added Vulkan drivers
* Updates after PR reviews
1 parent 14d9328 commit 6abfcff

34 files changed

+603
-460
lines changed

Makefile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,11 @@ BUILD_LD_FLAGS += -X $(BUILD_MODULE)/pkg/version.GitBranch=$(shell git name-rev
2323
BUILD_LD_FLAGS += -X $(BUILD_MODULE)/pkg/version.GitHash=$(shell git rev-parse HEAD)
2424
BUILD_LD_FLAGS += -X $(BUILD_MODULE)/pkg/version.GoBuildTime=$(shell date -u '+%Y-%m-%dT%H:%M:%SZ')
2525
BUILD_FLAGS = -ldflags "-s -w $(BUILD_LD_FLAGS)"
26-
TEST_FLAGS = -v
2726
CMAKE_FLAGS = -DBUILD_SHARED_LIBS=OFF
2827

28+
# Test flags
29+
TEST_FLAGS ?=
30+
2931
# Default docker file is non-cuda
3032
DOCKER_FILE := etc/Dockerfile.vulkan
3133
DOCKER_SUFFIX := ""

cmd/gowhisper/server.go

Lines changed: 26 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,12 @@ import (
1212
"os"
1313
"path/filepath"
1414
"sync"
15+
"time"
1516

1617
// Packages
1718
client "github.com/mutablelogic/go-client"
1819
otel "github.com/mutablelogic/go-client/pkg/otel"
20+
"github.com/mutablelogic/go-media/pkg/segmenter"
1921
httpserver "github.com/mutablelogic/go-server/pkg/httpserver"
2022
pkg "github.com/mutablelogic/go-whisper/pkg"
2123
httphandler "github.com/mutablelogic/go-whisper/pkg/httphandler"
@@ -49,6 +51,12 @@ type RunServer struct {
4951
MaxContexts uint `name:"max-contexts" help:"Maximum number of concurrent contexts" default:"0"`
5052
GPU bool `name:"gpu" help:"Use GPU if available" default:"true"`
5153
} `embed:"" prefix:"whisper."`
54+
55+
// Segmenter options
56+
Segmenter struct {
57+
MinSilenceSize time.Duration `name:"min-silence-size" help:"Minimum silence segment size"`
58+
MaxSegmentSize time.Duration `name:"max-segment-size" help:"Maximum segment size"`
59+
} `embed:"" prefix:"segmenter."`
5260
}
5361

5462
///////////////////////////////////////////////////////////////////////////////
@@ -74,44 +82,48 @@ func (cmd *RunServer) Run(ctx *Globals) error {
7482
// Report models path
7583
ctx.logger.With("models", modelsPath).Print(ctx.ctx, "using models directory")
7684

77-
// Build whisper options
78-
whisperOpts := []whisper.Opt{}
85+
// Build options
86+
managerOpts := []pkg.Opt{}
7987
if cmd.Whisper.MaxContexts > 0 {
80-
whisperOpts = append(whisperOpts, whisper.OptMaxConcurrent(int(cmd.Whisper.MaxContexts)))
88+
managerOpts = append(managerOpts, pkg.WithWhisperOpt(whisper.OptMaxConcurrent(int(cmd.Whisper.MaxContexts))))
8189
}
8290
if !cmd.Whisper.GPU {
83-
whisperOpts = append(whisperOpts, whisper.OptNoGPU())
91+
managerOpts = append(managerOpts, pkg.WithWhisperOpt(whisper.OptNoGPU()))
8492
}
8593
if ctx.Debug {
86-
whisperOpts = append(whisperOpts, whisper.OptDebug())
94+
managerOpts = append(managerOpts, pkg.WithWhisperOpt(whisper.OptDebug()))
95+
8796
// Provide a log function so debug output is actually shown
88-
whisperOpts = append(whisperOpts, whisper.OptLog(func(s string) {
97+
managerOpts = append(managerOpts, pkg.WithWhisperOpt(whisper.OptLog(func(s string) {
8998
ctx.logger.Print(ctx.ctx, s)
90-
}))
99+
})))
91100
}
92-
93-
// Build manager options
94-
managerOpts := []pkg.Opt{}
95101
if ctx.tracer != nil {
96-
managerOpts = append(managerOpts, pkg.OptTracer(ctx.tracer))
102+
managerOpts = append(managerOpts, pkg.WithTracer(ctx.tracer))
97103
}
98104
if ctx.Debug {
99105
// Enable HTTP tracing for OpenAI and ElevenLabs clients
100-
managerOpts = append(managerOpts, pkg.OptClientOpts(client.OptTrace(os.Stderr, false)))
106+
managerOpts = append(managerOpts, pkg.WithClientOpts(client.OptTrace(os.Stderr, false)))
101107
}
102108
if ctx.HTTP.Timeout > 0 {
103109
// Set HTTP client timeout for OpenAI and ElevenLabs clients
104-
managerOpts = append(managerOpts, pkg.OptClientOpts(client.OptTimeout(ctx.HTTP.Timeout)))
110+
managerOpts = append(managerOpts, pkg.WithClientOpts(client.OptTimeout(ctx.HTTP.Timeout)))
105111
}
106112
if cmd.OpenAIKey != "" {
107113
managerOpts = append(managerOpts, pkg.OptOpenAIKey(cmd.OpenAIKey))
108114
}
109115
if cmd.ElevenLabsKey != "" {
110116
managerOpts = append(managerOpts, pkg.OptElevenLabsKey(cmd.ElevenLabsKey))
111117
}
118+
if cmd.Segmenter.MinSilenceSize > 0 {
119+
managerOpts = append(managerOpts, pkg.WithSegmenterOpt(segmenter.WithSilenceSize(cmd.Segmenter.MinSilenceSize)))
120+
}
121+
if cmd.Segmenter.MaxSegmentSize > 0 {
122+
managerOpts = append(managerOpts, pkg.WithSegmenterOpt(segmenter.WithSegmentSize(cmd.Segmenter.MaxSegmentSize)))
123+
}
112124

113125
// Create the whisper manager
114-
manager, err := pkg.New(modelsPath, whisperOpts, managerOpts...)
126+
manager, err := pkg.New(modelsPath, managerOpts...)
115127
if err != nil {
116128
return err
117129
}

cmd/gowhisper/transcribe.go

Lines changed: 56 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@ package main
22

33
import (
44
"fmt"
5+
"io"
56
"os"
67

78
// Packages
89
otel "github.com/mutablelogic/go-client/pkg/otel"
10+
httpresponse "github.com/mutablelogic/go-server/pkg/httpresponse"
911
httpclient "github.com/mutablelogic/go-whisper/pkg/httpclient"
1012
schema "github.com/mutablelogic/go-whisper/pkg/schema"
1113
)
@@ -18,13 +20,9 @@ type TranscribeCommands struct {
1820
}
1921

2022
type TranscribeCommand struct {
21-
Model string `arg:"" name:"model" help:"Model ID to use for transcription"`
22-
File string `arg:"" name:"file" help:"Audio file to transcribe"`
23-
Language *string `name:"language" help:"Language code (e.g., 'en', 'es', 'fr')"`
24-
Prompt *string `name:"prompt" help:"Initial prompt to guide transcription"`
25-
Temperature *float64 `name:"temperature" help:"Temperature (0.0-1.0)"`
26-
Diarize *bool `name:"diarize" help:"Enable speaker diarization"`
27-
Format string `name:"format" help:"Output format: json, text, vtt, srt" default:"json"`
23+
TranslateCommand
24+
Diarize *bool `name:"diarize" help:"Enable speaker diarization"`
25+
Language *string `name:"language" help:"Language code (e.g., 'en', 'es', 'fr')"`
2826
}
2927

3028
///////////////////////////////////////////////////////////////////////////////
@@ -63,20 +61,16 @@ func (cmd *TranscribeCommand) Run(ctx *Globals) (err error) {
6361
}
6462

6563
// Set format
66-
var format httpclient.FormatType
67-
switch cmd.Format {
68-
case "text", string(httpclient.FormatText):
69-
format = httpclient.FormatText
70-
case "vtt", string(httpclient.FormatVTT):
71-
format = httpclient.FormatVTT
72-
case "srt", string(httpclient.FormatSRT):
73-
format = httpclient.FormatSRT
74-
case "json", string(httpclient.FormatJSON):
75-
format = httpclient.FormatJSON
76-
default:
77-
return fmt.Errorf("unsupported format: %s", cmd.Format)
64+
format, err := formatFromString(cmd.Format)
65+
if err != nil {
66+
return err
7867
}
79-
opts = append(opts, httpclient.WithFormat(format))
68+
69+
// Add real-time segment printing callback
70+
opts = append(opts, httpclient.WithSegmentCallback(func(seg *schema.Segment) error {
71+
writeSegment(os.Stdout, seg, format)
72+
return nil
73+
}))
8074

8175
// Transcribe
8276
var result *schema.Transcription
@@ -85,38 +79,51 @@ func (cmd *TranscribeCommand) Run(ctx *Globals) (err error) {
8579
return err
8680
}
8781

88-
// Print result based on format
82+
// If segments were not printed via streaming, print from result
83+
for _, seg := range result.Segments {
84+
writeSegment(os.Stdout, seg, format)
85+
}
86+
writeTrailer(os.Stdout, format)
87+
88+
// Return success
89+
return nil
90+
}
91+
92+
// Method to write segment in specified format
93+
func writeSegment(w io.Writer, seg *schema.Segment, format httpclient.FormatType) {
8994
switch format {
90-
case httpclient.FormatJSON:
91-
fmt.Println(result)
9295
case httpclient.FormatVTT:
93-
if len(result.Segments) > 0 {
94-
// Client-side formatting from segments
95-
fmt.Print("WEBVTT\n\n")
96-
for _, seg := range result.Segments {
97-
if seg != nil {
98-
seg.WriteVTT(os.Stdout, 0)
99-
}
100-
}
101-
} else {
102-
// Server already formatted it
103-
fmt.Print(result.Text)
104-
}
96+
seg.WriteVTT(w, 0)
10597
case httpclient.FormatSRT:
106-
if len(result.Segments) > 0 {
107-
// Client-side formatting from segments
108-
for _, seg := range result.Segments {
109-
if seg != nil {
110-
seg.WriteSRT(os.Stdout, 0)
111-
}
112-
}
113-
} else {
114-
// Server already formatted it
115-
fmt.Print(result.Text)
116-
}
98+
seg.WriteSRT(w, 0)
99+
case httpclient.FormatJSON:
100+
seg.WriteJSON(w)
117101
default:
118-
// For text and other formats, print the formatted text from server
119-
fmt.Print(result.Text)
102+
seg.WriteText(w)
103+
}
104+
}
105+
106+
// Method to write a trailer in specified format
107+
func writeTrailer(w io.Writer, format httpclient.FormatType) {
108+
switch format {
109+
case httpclient.FormatJSON:
110+
schema.WriteJSONTrailer(w)
111+
default:
112+
schema.WriteTextTrailer(w)
113+
}
114+
}
115+
116+
func formatFromString(format string) (httpclient.FormatType, error) {
117+
switch format {
118+
case "text", string(httpclient.FormatText):
119+
return httpclient.FormatText, nil
120+
case "vtt", string(httpclient.FormatVTT):
121+
return httpclient.FormatVTT, nil
122+
case "srt", string(httpclient.FormatSRT):
123+
return httpclient.FormatSRT, nil
124+
case "json", string(httpclient.FormatJSON):
125+
return httpclient.FormatJSON, nil
126+
default:
127+
return "", httpresponse.ErrBadRequest.Withf("unsupported format: %q", format)
120128
}
121-
return nil
122129
}

cmd/gowhisper/translate.go

Lines changed: 15 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ type TranslateCommand struct {
2222
File string `arg:"" name:"file" help:"Audio file to translate"`
2323
Prompt *string `name:"prompt" help:"Initial prompt to guide translation"`
2424
Temperature *float64 `name:"temperature" help:"Temperature (0.0-1.0)"`
25-
Diarize *bool `name:"diarize" help:"Enable speaker diarization"`
2625
Format string `name:"format" help:"Output format: json, text, vtt, srt" default:"json"`
2726
}
2827

@@ -54,25 +53,18 @@ func (cmd *TranslateCommand) Run(ctx *Globals) (err error) {
5453
if cmd.Temperature != nil {
5554
opts = append(opts, httpclient.WithTemperature(*cmd.Temperature))
5655
}
57-
if cmd.Diarize != nil {
58-
opts = append(opts, httpclient.WithDiarize(*cmd.Diarize))
59-
}
6056

6157
// Set format
62-
var format httpclient.FormatType
63-
switch cmd.Format {
64-
case "text", string(httpclient.FormatText):
65-
format = httpclient.FormatText
66-
case "vtt", string(httpclient.FormatVTT):
67-
format = httpclient.FormatVTT
68-
case "srt", string(httpclient.FormatSRT):
69-
format = httpclient.FormatSRT
70-
case "json", string(httpclient.FormatJSON):
71-
format = httpclient.FormatJSON
72-
default:
73-
return fmt.Errorf("unsupported format: %s", cmd.Format)
58+
format, err := formatFromString(cmd.Format)
59+
if err != nil {
60+
return err
7461
}
75-
opts = append(opts, httpclient.WithFormat(format))
62+
63+
// Add real-time segment printing callback
64+
opts = append(opts, httpclient.WithSegmentCallback(func(seg *schema.Segment) error {
65+
writeSegment(os.Stdout, seg, format)
66+
return nil
67+
}))
7668

7769
// Translate
7870
var result *schema.Transcription
@@ -81,38 +73,12 @@ func (cmd *TranslateCommand) Run(ctx *Globals) (err error) {
8173
return err
8274
}
8375

84-
// Print result based on format
85-
switch format {
86-
case httpclient.FormatJSON:
87-
fmt.Println(result)
88-
case httpclient.FormatVTT:
89-
if len(result.Segments) > 0 {
90-
// Client-side formatting from segments
91-
fmt.Print("WEBVTT\n\n")
92-
for _, seg := range result.Segments {
93-
if seg != nil {
94-
seg.WriteVTT(os.Stdout, 0)
95-
}
96-
}
97-
} else {
98-
// Server already formatted it
99-
fmt.Print(result.Text)
100-
}
101-
case httpclient.FormatSRT:
102-
if len(result.Segments) > 0 {
103-
// Client-side formatting from segments
104-
for _, seg := range result.Segments {
105-
if seg != nil {
106-
seg.WriteSRT(os.Stdout, 0)
107-
}
108-
}
109-
} else {
110-
// Server already formatted it
111-
fmt.Print(result.Text)
112-
}
113-
default:
114-
// For text and other formats, print the formatted text from server
115-
fmt.Print(result.Text)
76+
// If segments were not printed via streaming, print from result
77+
for _, seg := range result.Segments {
78+
writeSegment(os.Stdout, seg, format)
11679
}
80+
writeTrailer(os.Stdout, format)
81+
82+
// Return success
11783
return nil
11884
}

etc/Dockerfile.vulkan

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ FROM ${BASE_RUN_CONTAINER} AS runtime
3333
RUN apt -y update \
3434
&& apt install -y ca-certificates libgomp1 \
3535
&& apt install -y libfreetype6 libmp3lame0 libopus0 libvorbis0a libvorbisenc2 libvpx9 libx264-164 libx265-215 libdav1d7 libnuma1 \
36-
&& apt install -y libvulkan1 libshaderc1 libplacebo349 vulkan-tools
36+
&& apt install -y libvulkan1 libshaderc1 libplacebo349 vulkan-tools mesa-vulkan-drivers
3737

3838
# Copy built gowhisper binary
3939
COPY --from=build --chmod=755 /app/build/gowhisper /usr/local/bin/gowhisper

pkg/elevenlabs/schema.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -96,9 +96,11 @@ func (s TranscribeWord) String() string {
9696
// Return segments of a transcription response
9797
func (r *TranscribeResponse) Segments() *schema.Transcription {
9898
t := &schema.Transcription{
99-
Task: "transcribe",
100-
Language: r.Language,
101-
Text: r.Text,
99+
TranscriptionSummary: schema.TranscriptionSummary{
100+
Task: "transcribe",
101+
Language: r.Language,
102+
},
103+
Text: r.Text,
102104
}
103105

104106
// Current segment

pkg/httpclient/format.go

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
package httpclient
22

3+
import (
4+
"github.com/mutablelogic/go-server/pkg/types"
5+
"github.com/mutablelogic/go-whisper/pkg/schema"
6+
)
7+
38
///////////////////////////////////////////////////////////////////////////////
49
// TYPES
510

@@ -10,15 +15,8 @@ type FormatType string
1015
// GLOBALS
1116

1217
const (
13-
// FormatJSON requests JSON response (default)
14-
FormatJSON FormatType = "application/json"
15-
16-
// FormatText requests plain text response
17-
FormatText FormatType = "text/plain"
18-
19-
// FormatVTT requests WebVTT subtitle format
20-
FormatVTT FormatType = "text/vtt"
21-
22-
// FormatSRT requests SubRip subtitle format
23-
FormatSRT FormatType = "application/x-subrip"
18+
FormatJSON FormatType = types.ContentTypeJSON
19+
FormatText FormatType = types.ContentTypeTextPlain
20+
FormatVTT FormatType = schema.ContentTypeVTT
21+
FormatSRT FormatType = schema.ContentTypeSRT
2422
)

0 commit comments

Comments
 (0)