Skip to content

Commit 98d7500

Browse files
author
AI-Spawn
committed
Added vosk transcriber, version bump (5.0.4->5.1.0)
1 parent 1e8b5a9 commit 98d7500

File tree

5 files changed

+124
-8
lines changed

5 files changed

+124
-8
lines changed

VoskServer.go

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
package main
2+
3+
import (
4+
"bufio"
5+
"context"
6+
"fmt"
7+
"log"
8+
"net/http"
9+
"os"
10+
"os/exec"
11+
"runtime"
12+
)
13+
14+
func transcriberResultHandler(w http.ResponseWriter, r *http.Request) {
15+
w.Header().Set("Access-Control-Allow-Origin", "*")
16+
17+
r.ParseForm()
18+
voskTranscription = r.Form["text"][0]
19+
srv.Shutdown(context.TODO())
20+
}
21+
22+
func getAudioHandler(w http.ResponseWriter, r *http.Request) {
23+
w.Header().Set("Access-Control-Allow-Origin", "*")
24+
f, err := RetrieveROM(voskAudioPath)
25+
if err != nil {
26+
panic(err)
27+
}
28+
w.Write(f)
29+
30+
}
31+
32+
// State shared between getVoskTranscription, handleRequests and the HTTP
// handlers in this file.
var (
	// srv is the HTTP server created by handleRequests and shut down by
	// transcriberResultHandler.
	srv *http.Server

	// voskTranscription holds the text received by transcriberResultHandler.
	voskTranscription string

	// voskAudioPath is the path of the file served by getAudioHandler.
	voskAudioPath string
)
35+
36+
func handleRequests() {
37+
srv = &http.Server{Addr: ":8000"}
38+
39+
http.HandleFunc("/transcriber_result", transcriberResultHandler)
40+
http.HandleFunc("/audio_file", getAudioHandler)
41+
// always returns error. ErrServerClosed on graceful close
42+
if err := srv.ListenAndServe(); err != http.ErrServerClosed {
43+
// unexpected error. port in use?
44+
log.Fatalf("ListenAndServe(): %v", err)
45+
}
46+
47+
}
48+
// openbrowser asks the operating system to open url with its default handler.
//
// It uses xdg-open on Linux, rundll32 on Windows and open on macOS. On any
// other platform, or when the launcher process cannot be started, the program
// exits through log.Fatal.
func openbrowser(url string) {
	var launcher *exec.Cmd

	switch runtime.GOOS {
	case "darwin":
		launcher = exec.Command("open", url)
	case "windows":
		launcher = exec.Command("rundll32", "url.dll,FileProtocolHandler", url)
	case "linux":
		launcher = exec.Command("xdg-open", url)
	default:
		log.Fatal(fmt.Errorf("unsupported platform"))
	}

	// Start does not wait for the launched process to finish.
	if startErr := launcher.Start(); startErr != nil {
		log.Fatal(startErr)
	}
}
66+
// RetrieveROM reads the whole file at filename into memory.
//
// It returns the file's contents, or a nil slice together with the underlying
// error when the file cannot be opened, inspected, or read in full. The amount
// read is the size reported by Stat at the time of the call.
func RetrieveROM(filename string) ([]byte, error) {
	file, err := os.Open(filename)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	stats, err := file.Stat()
	if err != nil {
		return nil, err
	}

	data := make([]byte, stats.Size())
	reader := bufio.NewReader(file)

	// A single Read may legitimately return fewer bytes than requested, which
	// would leave the tail of data zero-filled. Keep reading until the buffer
	// is full or the reader reports an error.
	for filled := 0; filled < len(data); {
		n, readErr := reader.Read(data[filled:])
		filled += n
		if readErr != nil && filled < len(data) {
			return nil, readErr
		}
	}

	return data, nil
}
87+
func getVoskTranscription(audioPath string, transcriberUrl string) string {
88+
voskAudioPath = audioPath
89+
openbrowser(transcriberUrl)
90+
handleRequests()
91+
return voskTranscription
92+
}

argparser.go

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,15 @@ type Args struct {
2424
Timestamps string `json:"timestamps"`
2525
Verbose int8 `json:"verbose"`
2626
OutputPath string `json:"outputPath"`
27-
TranscriberUrl string `json:"transcriberUrl"`
27+
WhisperUrl string `json:"whisperUrl"`
28+
VoskUrl string `json:"voskUrl"`
2829
TranscriberApiKey string `json:"transcriberApiKey"`
2930
AlignerUrl string `json:"alignerUrl"`
3031
PhonemesPath string `json:"phonemesPath"`
3132
SkipTranscriber bool `json:"skipTranscriber"`
3233
CheckForUpdates bool `json:"checkForUpdates"`
3334
RunProfiler bool `json:"runProfiler"`
35+
Transcriber string `json:"transcriber"`
3436
}
3537

3638
func loadDefaults() Args {
@@ -87,13 +89,16 @@ func parseArgs() Args {
8789
verbose := flag.Int("v", int(defArgs.Verbose), "Verbose level")
8890
output := flag.String("o", defArgs.OutputPath, "output file path")
8991
transcriber_key := flag.String("k", defArgs.TranscriberApiKey, "OpenAI API Key")
90-
transcribe_url := flag.String("api_url", defArgs.TranscriberUrl, "Can be subsituted for the LocalAI url")
92+
whisper_url := flag.String("whisper_url", defArgs.WhisperUrl, "Can be subsituted for the LocalAI url")
93+
vosk_url := flag.String("vosk_url", defArgs.VoskUrl, "If using vosk and a different transcriber website")
94+
9195
aligner_url := flag.String("aligner_url", defArgs.AlignerUrl, "Gentle server url")
9296
phonemes_path := flag.String("phonemes", defArgs.PhonemesPath, "Custom phonemes JSON path")
9397
run_profiler := flag.Bool("run_profiler", defArgs.RunProfiler, "Run pprof server")
9498

9599
//dev settings
96100
skipTranscriber := flag.Bool("skipTranscriber", defArgs.SkipTranscriber, "Skips transcription and replaces with pangram text")
101+
transcriber := flag.String("transcriber", defArgs.Transcriber, "Transcriber to use (Whisper | LocalAI | Vosk)")
97102

98103
flag.Parse()
99104

@@ -117,8 +122,10 @@ func parseArgs() Args {
117122
Timestamps: *timestamps,
118123
Verbose: int8(*verbose),
119124
OutputPath: *output,
125+
Transcriber: *transcriber,
120126
TranscriberApiKey: *transcriber_key,
121-
TranscriberUrl: *transcribe_url,
127+
WhisperUrl: *whisper_url,
128+
VoskUrl: *vosk_url,
122129
AlignerUrl: *aligner_url,
123130
PhonemesPath: *phonemes_path,
124131
SkipTranscriber: *skipTranscriber,
@@ -129,9 +136,18 @@ func parseArgs() Args {
129136
if args.AudioPath == "" {
130137
log.Fatal("Audio path is required")
131138
}
139+
132140
openAiUrl := "https://api.openai.com/v1/"
133-
if args.TranscriberUrl == openAiUrl && args.TranscriberApiKey == "" {
134-
log.Fatal("Please provide an OpenAI API key or use LocalAI")
141+
if args.Transcriber == "Whisper" && args.WhisperUrl == "" {
142+
args.WhisperUrl = openAiUrl
143+
} else if args.Transcriber == "LocalAI" && args.WhisperUrl == "" {
144+
args.WhisperUrl = "http://localhost:8080/v1/"
145+
} else if args.Transcriber == "Vosk" && args.VoskUrl == "" {
146+
args.VoskUrl = "https://matamata.org/web-vosk-transcriber/"
147+
}
148+
149+
if args.WhisperUrl == openAiUrl && args.TranscriberApiKey == "" {
150+
log.Fatal("Please provide an OpenAI API key or use a different transcriber")
135151
}
136152
return args
137153
}

checkForUpdates.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ type Release struct {
1616
}
1717

1818
func checkForUpdates() {
19-
currentSemVer := "v5.0.5"
19+
currentSemVer := "v5.1.0"
2020

2121
logM(1, "Checking for Updates...")
2222
endpointUrl := "https://api.github.com/repos/Matamata-Animator/Matamata/releases?per_page=1"

main.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,13 @@ func main() {
3636
if !args.SkipTranscriber {
3737
logM(1, "Transcribing Audio...")
3838

39-
text = transcribe(args.AudioPath, args.TranscriberUrl, args.TranscriberApiKey)
39+
if args.Transcriber == "Vosk" {
40+
text = getVoskTranscription(args.AudioPath, args.VoskUrl)
41+
} else if args.Transcriber == "Whisper" || args.Transcriber == "LocalAI" {
42+
text = transcribe(args.AudioPath, args.WhisperUrl, args.TranscriberApiKey)
43+
} else {
44+
log.Fatal("Invalid Transcriber:", args.Transcriber)
45+
}
4046
} else {
4147
logM(1, "Using Stored Transcription...")
4248
//cache transcription to save time during development

readme.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,9 @@ You can also remove a placeable part by using the name `None`
183183
| -o | Path of the output file | output.mov |
184184
| -t | Path to the timestamps file | "defaults/characters.json" |
185185
| -v | How verbose to be (0 to 3). | 1 |
186-
| -api_url | URL for the transcription API. By default, this points to OpenAI's Whisper. You can also set it to point to a [LocalAI](https://localai.io/basics/getting_started/) instance. | https://api.openai.com/v1/ |
186+
| -transcriber | Which transcriber to use (Whisper/LocalAI/Vosk). Vosk doesn't require any setup; however, it is the slowest. | Whisper |
187+
| -whisper_url | URL for the transcription API. By default, this points to OpenAI's Whisper. You can also set it to point to a [LocalAI](https://localai.io/basics/getting_started/) instance. | https://api.openai.com/v1/ |
188+
| -vosk_url | URL of the transcriber website to open when using the Vosk transcriber | https://matamata.org/web-vosk-transcriber/ |
187189
| -aligner_url | URL for Gentle Aligner server. | http://localhost:8765/transcriptions?async=false |
188190
| -phonemes | Custom phonemes JSON path | By default, the sample phonemes is used |
189191

0 commit comments

Comments
 (0)