Skip to content

Commit 4a75b27

Browse files
authored
Merge branch 'ggerganov:master' into feat/detectedLanguage
2 parents 36747d4 + 206459a commit 4a75b27

File tree

19 files changed

+669
-262
lines changed

19 files changed

+669
-262
lines changed

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -427,7 +427,8 @@ For detailed instructions on how to use Conan, please refer to the [Conan docume
427427
428428
This is a naive example of performing real-time inference on audio from your microphone.
429429
The [stream](examples/stream) tool samples the audio every half a second and runs the transcription continuously.
430-
More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
430+
More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
431+
You will need to have [sdl2](https://wiki.libsdl.org/SDL2/Installation) installed for it to work properly.
431432
432433
```bash
433434
cmake -B build -DWHISPER_SDL2=ON

bindings/go/Makefile

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@ UNAME_M := $(shell uname -m)
1111
endif
1212

1313
GGML_METAL_PATH_RESOURCES := $(abspath ../..)
14-
BUILD_DIR := build
14+
BUILD_DIR := build_go
1515
MODELS_DIR := models
1616
EXAMPLES_DIR := $(wildcard examples/*)
1717
INCLUDE_PATH := $(abspath ../../include):$(abspath ../../ggml/include)
18-
LIBRARY_PATH := $(abspath ../..)
18+
# The whisper and ggml static libraries are emitted into separate directories of the CMake build tree;
# resolve each directory on its own and join them with ':' (one abspath call per path).
LIBRARY_PATH := $(abspath ../../${BUILD_DIR}/src):$(abspath ../../${BUILD_DIR}/ggml/src)
1919

2020
ifeq ($(GGML_CUDA),1)
2121
LIBRARY_PATH := $(LIBRARY_PATH):$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib/
@@ -29,8 +29,10 @@ endif
2929
all: clean whisper examples
3030

3131
whisper: mkdir
32-
@echo Build whisper
33-
@${MAKE} -C ../.. libwhisper.a
32+
cmake -S ../.. -B ../../${BUILD_DIR} \
33+
-DCMAKE_BUILD_TYPE=Release \
34+
-DBUILD_SHARED_LIBS=OFF
35+
cmake --build ../../${BUILD_DIR} --target whisper
3436

3537
test: model-small whisper modtidy
3638
ifeq ($(UNAME_S),Darwin)

bindings/go/whisper.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ import (
99
// CGO
1010

1111
/*
12-
#cgo LDFLAGS: -lwhisper -lm -lstdc++ -fopenmp
12+
#cgo LDFLAGS: -lwhisper -lggml -lggml-base -lggml-cpu -lm -lstdc++ -fopenmp
1313
#cgo darwin LDFLAGS: -framework Accelerate -framework Metal -framework Foundation -framework CoreGraphics
1414
#include <whisper.h>
1515
#include <stdlib.h>

bindings/java/build.gradle

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,13 @@ sourceSets {
2525
}
2626

2727
tasks.register('copyLibwhisperDynlib', Copy) {
28-
from '../../build'
29-
include 'libwhisper.dynlib'
28+
from '../../build/src'
29+
include 'libwhisper.dylib'
3030
into 'build/generated/resources/main/darwin'
3131
}
3232

3333
tasks.register('copyLibwhisperSo', Copy) {
34-
from '../../build'
34+
from '../../build/src'
3535
include 'libwhisper.so'
3636
into 'build/generated/resources/main/linux-x86-64'
3737
}
@@ -55,7 +55,12 @@ java {
5555
withJavadocJar()
5656
}
5757

58+
sourcesJar() {
59+
dependsOn copyLibs
60+
}
61+
5862
jar {
63+
dependsOn copyLibs
5964
exclude '**/whisper_java.exp', '**/whisper_java.lib'
6065
}
6166

@@ -67,6 +72,9 @@ tasks.withType(Test) {
6772
useJUnitPlatform()
6873
}
6974

75+
test.dependsOn copyLibs
76+
processResources.dependsOn copyLibs
77+
7078
dependencies {
7179
implementation "net.java.dev.jna:jna:5.13.0"
7280
testImplementation "org.junit.jupiter:junit-jupiter:5.9.2"

bindings/java/gradlew

100644100755
File mode changed.
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
package io.github.ggerganov.whispercpp;
2+
3+
/**
4+
* Integer constants naming the alignment-heads presets used for DTW token timestamps.
* NOTE(review): these presumably mirror the native enum whisper_alignment_heads_preset in whisper.h;
* verify that the numeric values follow the native ordering (in particular CUSTOM and N_TOP_MOST).
5+
*/
6+
public class WhisperConstants {
7+
// Alignment heads presets, one constant per preset, numbered consecutively from 0
8+
public static final int WHISPER_AHEADS_NONE = 0;
9+
public static final int WHISPER_AHEADS_TINY_EN = 1;
10+
public static final int WHISPER_AHEADS_TINY = 2;
11+
public static final int WHISPER_AHEADS_BASE_EN = 3;
12+
public static final int WHISPER_AHEADS_BASE = 4;
13+
public static final int WHISPER_AHEADS_SMALL_EN = 5;
14+
public static final int WHISPER_AHEADS_SMALL = 6;
15+
public static final int WHISPER_AHEADS_MEDIUM_EN = 7;
16+
public static final int WHISPER_AHEADS_MEDIUM = 8;
17+
public static final int WHISPER_AHEADS_LARGE_V1 = 9;
18+
public static final int WHISPER_AHEADS_LARGE_V2 = 10;
19+
public static final int WHISPER_AHEADS_LARGE_V3 = 11;
20+
public static final int WHISPER_AHEADS_LARGE_V3_TURBO = 12;
21+
public static final int WHISPER_AHEADS_CUSTOM = 13;
22+
public static final int WHISPER_AHEADS_N_TOP_MOST = 14;
23+
// presumably the number of presets above rather than a preset itself - TODO confirm
public static final int WHISPER_AHEADS_COUNT = 15;
24+
}
Lines changed: 18 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,36 @@
11
package io.github.ggerganov.whispercpp;
22

3+
import com.sun.jna.NativeLong;
34
import com.sun.jna.Structure;
45
import com.sun.jna.ptr.PointerByReference;
6+
import com.sun.jna.Pointer;
57
import io.github.ggerganov.whispercpp.ggml.GgmlType;
68
import io.github.ggerganov.whispercpp.WhisperModel;
79
import io.github.ggerganov.whispercpp.params.WhisperContextParams;
810

911
import java.util.List;
1012

1113
public class WhisperContext extends Structure {
12-
int t_load_us = 0;
13-
int t_start_us = 0;
14+
public NativeLong t_load_us;
15+
public NativeLong t_start_us;
1416

1517
/** weight type (FP32 / FP16 / QX) */
16-
GgmlType wtype = GgmlType.GGML_TYPE_F16;
18+
public GgmlType wtype = GgmlType.GGML_TYPE_F16;
1719
/** intermediate type (FP32 or FP16) */
18-
GgmlType itype = GgmlType.GGML_TYPE_F16;
20+
public GgmlType itype = GgmlType.GGML_TYPE_F16;
1921

20-
// WhisperModel model;
21-
public PointerByReference model;
22-
// whisper_vocab vocab;
23-
// whisper_state * state = nullptr;
24-
public PointerByReference vocab;
25-
public PointerByReference state;
22+
public WhisperContextParams.ByValue params;
23+
24+
public Pointer model;
25+
public Pointer vocab;
26+
public Pointer state;
2627

2728
/** populated by whisper_init_from_file_with_params() */
28-
String path_model;
29-
WhisperContextParams params;
30-
31-
// public static class ByReference extends WhisperContext implements Structure.ByReference {
32-
// }
33-
//
34-
// public static class ByValue extends WhisperContext implements Structure.ByValue {
35-
// }
36-
//
37-
// @Override
38-
// protected List<String> getFieldOrder() {
39-
// return List.of("t_load_us", "t_start_us", "wtype", "itype", "model", "vocab", "state", "path_model");
40-
// }
29+
public Pointer path_model;
30+
31+
@Override
32+
protected List<String> getFieldOrder() {
33+
return List.of("t_load_us", "t_start_us", "wtype", "itype",
34+
"params", "model", "vocab", "state", "path_model");
35+
}
4136
}

bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCpp.java

Lines changed: 24 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,11 @@ public void initContext(String modelPath) throws FileNotFoundException {
4343
* @param modelPath - absolute path, or just the name (eg: "base", "base-en" or "base.en")
4444
* @param params - params to use when initialising the context
4545
*/
46-
public void initContext(String modelPath, WhisperContextParams params) throws FileNotFoundException {
46+
public void initContext(String modelPath, WhisperContextParams.ByValue params) throws FileNotFoundException {
4747
initContextImpl(modelPath, params);
4848
}
4949

50-
private void initContextImpl(String modelPath, WhisperContextParams params) throws FileNotFoundException {
50+
private void initContextImpl(String modelPath, WhisperContextParams.ByValue params) throws FileNotFoundException {
5151
if (ctx != null) {
5252
lib.whisper_free(ctx);
5353
}
@@ -69,15 +69,13 @@ private void initContextImpl(String modelPath, WhisperContextParams params) thro
6969

7070
/**
7171
* Provides default params which can be used with `whisper_init_from_file_with_params()` etc.
72-
* Because this function allocates memory for the params, the caller must call either:
73-
* - call `whisper_free_context_params()`
74-
* - `Native.free(Pointer.nativeValue(pointer));`
72+
* Returns a ByValue instance to ensure proper parameter passing to native code.
7573
*/
76-
public WhisperContextParams getContextDefaultParams() {
77-
paramsPointer = lib.whisper_context_default_params_by_ref();
78-
WhisperContextParams params = new WhisperContextParams(paramsPointer);
79-
params.read();
80-
return params;
74+
public WhisperContextParams.ByValue getContextDefaultParams() {
75+
WhisperContextParams.ByValue valueParams = new WhisperContextParams.ByValue(
76+
lib.whisper_context_default_params_by_ref());
77+
valueParams.read();
78+
return valueParams;
8179
}
8280

8381
/**
@@ -88,7 +86,7 @@ public WhisperContextParams getContextDefaultParams() {
8886
*
8987
* @param strategy - GREEDY
9088
*/
91-
public WhisperFullParams getFullDefaultParams(WhisperSamplingStrategy strategy) {
89+
public WhisperFullParams.ByValue getFullDefaultParams(WhisperSamplingStrategy strategy) {
9290
Pointer pointer;
9391

9492
// whisper_full_default_params_by_ref allocates memory which we need to delete, so only create max 1 pointer for each strategy.
@@ -104,7 +102,7 @@ public WhisperFullParams getFullDefaultParams(WhisperSamplingStrategy strategy)
104102
pointer = beamParamsPointer;
105103
}
106104

107-
WhisperFullParams params = new WhisperFullParams(pointer);
105+
WhisperFullParams.ByValue params = new WhisperFullParams.ByValue(pointer);
108106
params.read();
109107
return params;
110108
}
@@ -138,15 +136,21 @@ private void freeParams() {
138136
}
139137

140138
/**
141-
* Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text.
139+
* Run the entire model: PCM -&gt; log mel spectrogram -&gt; encoder -&gt; decoder -&gt; text.
142140
* Not thread safe for same context
143141
* Uses the specified decoding strategy to obtain the text.
144142
*/
145-
public String fullTranscribe(WhisperFullParams whisperParams, float[] audioData) throws IOException {
143+
public String fullTranscribe(WhisperFullParams.ByValue whisperParams, float[] audioData) throws IOException {
146144
if (ctx == null) {
147145
throw new IllegalStateException("Model not initialised");
148146
}
149147

148+
/*
149+
WhisperFullParams.ByValue valueParams = new WhisperFullParams.ByValue(
150+
lib.whisper_full_default_params_by_ref(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH.ordinal()));
151+
valueParams.read();
152+
*/
153+
150154
if (lib.whisper_full(ctx, whisperParams, audioData, audioData.length) != 0) {
151155
throw new IOException("Failed to process audio");
152156
}
@@ -163,12 +167,17 @@ public String fullTranscribe(WhisperFullParams whisperParams, float[] audioData)
163167

164168
return str.toString().trim();
165169
}
170+
166171
public List<WhisperSegment> fullTranscribeWithTime(WhisperFullParams whisperParams, float[] audioData) throws IOException {
167172
if (ctx == null) {
168173
throw new IllegalStateException("Model not initialised");
169174
}
170175

171-
if (lib.whisper_full(ctx, whisperParams, audioData, audioData.length) != 0) {
176+
WhisperFullParams.ByValue valueParams = new WhisperFullParams.ByValue(
177+
lib.whisper_full_default_params_by_ref(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH.ordinal()));
178+
valueParams.read();
179+
180+
if (lib.whisper_full(ctx, valueParams, audioData, audioData.length) != 0) {
172181
throw new IOException("Failed to process audio");
173182
}
174183

bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ public interface WhisperCppJnaLibrary extends Library {
3838
* @param params whisper_context_params structure, passed by value
3939
* @return Whisper context on success, null on failure
4040
*/
41-
Pointer whisper_init_from_file_with_params(String path_model, WhisperContextParams params);
41+
Pointer whisper_init_from_file_with_params(String path_model, WhisperContextParams.ByValue params);
4242

4343
/**
4444
* Allocate (almost) all memory needed for the model by loading from a buffer.
@@ -180,12 +180,12 @@ public interface WhisperCppJnaLibrary extends Library {
180180
/**
181181
* @return the id of the specified language, returns -1 if not found.
182182
* Examples:
183-
* "de" -> 2
184-
* "german" -> 2
183+
* "de" -&gt; 2
184+
* "german" -&gt; 2
185185
*/
186186
int whisper_lang_id(String lang);
187187

188-
/** @return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found */
188+
/** @return the short string of the specified language id (e.g. 2 -&gt; "de"), returns nullptr if not found */
189189
String whisper_lang_str(int id);
190190

191191
/**
@@ -268,20 +268,21 @@ public interface WhisperCppJnaLibrary extends Library {
268268
void whisper_free_params(Pointer params);
269269

270270
/**
271-
* Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
271+
* Run the entire model: PCM -&gt; log mel spectrogram -&gt; encoder -&gt; decoder -&gt; text
272272
* Not thread safe for same context
273273
* Uses the specified decoding strategy to obtain the text.
274274
*/
275-
int whisper_full(Pointer ctx, WhisperFullParams params, final float[] samples, int n_samples);
275+
int whisper_full(Pointer ctx, WhisperFullParams.ByValue params, final float[] samples, int n_samples);
276276

277-
int whisper_full_with_state(Pointer ctx, Pointer state, WhisperFullParams params, final float[] samples, int n_samples);
277+
public int whisper_full_with_state(Pointer ctx, Pointer state, WhisperFullParams.ByValue params, float[] samples, int n_samples);
278+
//int whisper_full_with_state(Pointer ctx, Pointer state, WhisperFullParams params, final float[] samples, int n_samples);
278279

279280
// Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
280281
// Result is stored in the default state of the context
281282
// Not thread safe if executed in parallel on the same context.
282283
// It seems this approach can offer some speedup in some cases.
283284
// However, the transcription accuracy can be worse at the beginning and end of each chunk.
284-
int whisper_full_parallel(Pointer ctx, WhisperFullParams params, final float[] samples, int n_samples, int n_processors);
285+
int whisper_full_parallel(Pointer ctx, WhisperFullParams.ByValue params, final float[] samples, int n_samples, int n_processors);
285286

286287
/**
287288
* Number of generated text segments.
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
package io.github.ggerganov.whispercpp.callbacks;
2+
3+
import com.sun.jna.Callback;
4+
5+
/**
6+
* JNA callback interface for aborting a GGML computation.
7+
* Maps to the C typedef: bool (*ggml_abort_callback)(void * data)
8+
*/
9+
public interface GgmlAbortCallback extends Callback {
10+
/**
11+
* Decides whether the computation should stop: return true to abort, false to continue.
12+
*
13+
* @param data opaque user-data pointer passed to the callback (the {@code void * data} of the C typedef)
14+
* @return true to abort, false to continue
15+
*/
16+
boolean invoke(com.sun.jna.Pointer data);
17+
}

0 commit comments

Comments
 (0)