Skip to content

Commit c64f3e8

Browse files
authored
common : separate whisper sources (#2846)
* common : separate whisper sources * examples : add chrono * examples : add more headers
1 parent 9f83f67 commit c64f3e8

File tree

13 files changed

+228
-276
lines changed

13 files changed

+228
-276
lines changed

examples/CMakeLists.txt

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,6 @@ if (WHISPER_SDL2)
1414
message(STATUS "SDL2_LIBRARIES = ${SDL2_LIBRARIES}")
1515
endif()
1616

17-
if (WHISPER_CLBLAST)
18-
find_package(CLBlast REQUIRED)
19-
endif()
20-
2117
# common
2218

2319
set(TARGET common)
@@ -56,6 +52,8 @@ add_library(${TARGET} STATIC
5652
common.cpp
5753
common-ggml.h
5854
common-ggml.cpp
55+
common-whisper.h
56+
common-whisper.cpp
5957
grammar-parser.h
6058
grammar-parser.cpp
6159
${COMMON_SOURCES_FFMPEG}

examples/cli/cli.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
#include "common.h"
2+
#include "common-whisper.h"
23

34
#include "whisper.h"
45
#include "grammar-parser.h"
56

67
#include <cmath>
78
#include <fstream>
89
#include <cstdio>
9-
#include <regex>
1010
#include <string>
1111
#include <thread>
1212
#include <vector>

examples/command/command.cpp

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,16 +11,15 @@
1111
#include "whisper.h"
1212
#include "grammar-parser.h"
1313

14-
#include <sstream>
15-
#include <cassert>
14+
#include <algorithm>
15+
#include <chrono>
1616
#include <cstdio>
1717
#include <fstream>
18-
#include <mutex>
19-
#include <regex>
18+
#include <map>
19+
#include <sstream>
2020
#include <string>
2121
#include <thread>
2222
#include <vector>
23-
#include <map>
2423

2524
// command-line parameters
2625
struct whisper_params {

examples/common-whisper.cpp

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
#define _USE_MATH_DEFINES // for M_PI
2+
3+
#include "common-whisper.h"
4+
5+
#include "common.h"
6+
7+
#include "whisper.h"
8+
9+
// third-party utilities
10+
// use your favorite implementations
11+
#define STB_VORBIS_HEADER_ONLY
12+
#include "stb_vorbis.c" /* Enables Vorbis decoding. */
13+
14+
#ifdef _WIN32
15+
#ifndef NOMINMAX
16+
#define NOMINMAX
17+
#endif
18+
#endif
19+
20+
#define MA_NO_DEVICE_IO
21+
#define MA_NO_THREADING
22+
#define MA_NO_ENCODING
23+
#define MA_NO_GENERATION
24+
#define MA_NO_RESOURCE_MANAGER
25+
#define MA_NO_NODE_GRAPH
26+
#define MINIAUDIO_IMPLEMENTATION
27+
#include "miniaudio.h"
28+
29+
#if defined(_MSC_VER)
30+
#pragma warning(disable: 4244 4267) // possible loss of data
31+
#endif
32+
33+
#ifdef _WIN32
34+
#include <fcntl.h>
35+
#include <io.h>
36+
#endif
37+
38+
#include <cstring>
39+
#include <fstream>
40+
41+
#ifdef WHISPER_FFMPEG
42+
// implemented in ffmpeg-transcode.cpp, which is only embedded in the common lib if whisper is built with ffmpeg support
43+
extern bool ffmpeg_decode_audio(const std::string & ifname, std::vector<uint8_t> & wav_data);
44+
#endif
45+
46+
bool read_audio_data(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
47+
std::vector<uint8_t> audio_data; // used for pipe input from stdin or ffmpeg decoding output
48+
49+
ma_result result;
50+
ma_decoder_config decoder_config;
51+
ma_decoder decoder;
52+
53+
decoder_config = ma_decoder_config_init(ma_format_f32, stereo ? 2 : 1, WHISPER_SAMPLE_RATE);
54+
55+
if (fname == "-") {
56+
#ifdef _WIN32
57+
_setmode(_fileno(stdin), _O_BINARY);
58+
#endif
59+
60+
uint8_t buf[1024];
61+
while (true)
62+
{
63+
const size_t n = fread(buf, 1, sizeof(buf), stdin);
64+
if (n == 0) {
65+
break;
66+
}
67+
audio_data.insert(audio_data.end(), buf, buf + n);
68+
}
69+
70+
if ((result = ma_decoder_init_memory(audio_data.data(), audio_data.size(), &decoder_config, &decoder)) != MA_SUCCESS) {
71+
72+
fprintf(stderr, "Error: failed to open audio data from stdin (%s)\n", ma_result_description(result));
73+
74+
return false;
75+
}
76+
77+
fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, audio_data.size());
78+
}
79+
else if (is_wav_buffer(fname)) {
80+
if ((result = ma_decoder_init_memory(audio_data.data(), audio_data.size(), &decoder_config, &decoder)) != MA_SUCCESS) {
81+
fprintf(stderr, "Error: failed to open audio data from fname buffer (%s)\n", ma_result_description(result));
82+
83+
return false;
84+
}
85+
}
86+
else if ((result = ma_decoder_init_file(fname.c_str(), &decoder_config, &decoder)) != MA_SUCCESS) {
87+
#if defined(WHISPER_FFMPEG)
88+
if (ffmpeg_decode_audio(fname, audio_data) != 0) {
89+
fprintf(stderr, "error: failed to ffmpeg decode '%s'\n", fname.c_str());
90+
91+
return false;
92+
}
93+
94+
if ((result = ma_decoder_init_memory(audio_data.data(), audio_data.size(), &decoder_config, &decoder)) != MA_SUCCESS) {
95+
fprintf(stderr, "error: failed to read audio data as wav (%s)\n", ma_result_description(result));
96+
97+
return false;
98+
}
99+
#else
100+
fprintf(stderr, "error: failed to open '%s' file (%s)\n", fname.c_str(), ma_result_description(result));
101+
102+
return false;
103+
#endif
104+
}
105+
106+
ma_uint64 frame_count;
107+
ma_uint64 frames_read;
108+
109+
if ((result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count)) != MA_SUCCESS) {
110+
fprintf(stderr, "error: failed to retrieve the length of the audio data (%s)\n", ma_result_description(result));
111+
112+
return false;
113+
}
114+
115+
pcmf32.resize(stereo ? frame_count*2 : frame_count);
116+
117+
if ((result = ma_decoder_read_pcm_frames(&decoder, pcmf32.data(), frame_count, &frames_read)) != MA_SUCCESS) {
118+
fprintf(stderr, "error: failed to read the frames of the audio data (%s)\n", ma_result_description(result));
119+
120+
return false;
121+
}
122+
123+
if (stereo) {
124+
pcmf32s.resize(2);
125+
pcmf32s[0].resize(frame_count);
126+
pcmf32s[1].resize(frame_count);
127+
for (uint64_t i = 0; i < frame_count; i++) {
128+
pcmf32s[0][i] = pcmf32[2*i];
129+
pcmf32s[1][i] = pcmf32[2*i + 1];
130+
}
131+
}
132+
133+
ma_decoder_uninit(&decoder);
134+
135+
return true;
136+
}
137+
138+
// Format a timestamp given in units of 10 ms as "HH:MM:SS.mmm".
// When comma is true the millisecond separator is ',' instead of '.'.
//
//    500 -> 00:00:05.000
//   6000 -> 00:01:00.000
std::string to_timestamp(int64_t t, bool comma) {
    const int64_t msec_per_sec  = 1000;
    const int64_t msec_per_min  = 60 * msec_per_sec;
    const int64_t msec_per_hour = 60 * msec_per_min;

    const int64_t total_msec = t * 10;

    // peel off each field with division/remainder instead of repeated subtraction
    const int64_t hours   =  total_msec / msec_per_hour;
    const int64_t minutes = (total_msec % msec_per_hour) / msec_per_min;
    const int64_t seconds = (total_msec % msec_per_min)  / msec_per_sec;
    const int64_t millis  =  total_msec % msec_per_sec;

    const char * separator = comma ? "," : ".";

    char formatted[32];
    snprintf(formatted, sizeof(formatted), "%02d:%02d:%02d%s%03d",
             (int) hours, (int) minutes, (int) seconds, separator, (int) millis);

    return std::string(formatted);
}
154+
155+
// Convert a timestamp given in units of 10 ms into a sample index for audio sampled
// at whisper_sample_rate Hz. The result is clamped to [0, n_samples - 1]; when
// n_samples <= 0 the result is 0.
int timestamp_to_sample(int64_t t, int n_samples, int whisper_sample_rate) {
    const int64_t last_sample = static_cast<int64_t>(n_samples) - 1;
    const int64_t sample      = (t*whisper_sample_rate)/100;

    // clamp while still in 64 bits: narrowing to int first would wrap for large
    // timestamps (roughly 37 hours at 16 kHz) and could turn them into index 0
    const int64_t clamped = std::max<int64_t>(0, std::min<int64_t>(last_sample, sample));

    return static_cast<int>(clamped);
}
158+
159+
// Write text to the file at path, then run "<command> <voice_id> <path>" via system().
//
// Returns true only if the file could be opened and written and system() returned 0.
// Every failure is reported on stderr and yields false.
//
// NOTE(review): command and path are concatenated into the command line unquoted, so
// values containing spaces or shell metacharacters are not handled - confirm callers
// only pass trusted values.
bool speak_with_file(const std::string & command, const std::string & text, const std::string & path, int voice_id) {
    std::ofstream speak_file(path.c_str());
    if (speak_file.fail()) {
        fprintf(stderr, "%s: failed to open speak_file\n", __func__);
        return false;
    }

    speak_file.write(text.c_str(), text.size());
    speak_file.close();

    // close() flushes the buffered text, so a failed write may only show up here;
    // do not run the command on a missing or truncated file
    if (speak_file.fail()) {
        fprintf(stderr, "%s: failed to write speak_file\n", __func__);
        return false;
    }

    const std::string cmdline = command + " " + std::to_string(voice_id) + " " + path;

    const int ret = system(cmdline.c_str());
    if (ret != 0) {
        fprintf(stderr, "%s: failed to speak\n", __func__);
        return false;
    }

    return true;
}
175+
176+
#undef STB_VORBIS_HEADER_ONLY
177+
#include "stb_vorbis.c"

examples/common-whisper.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#pragma once
2+
3+
#include <string>
4+
#include <vector>
5+
#include <cstdint>
6+
7+
// Read WAV audio file and store the PCM data into pcmf32
8+
// fname can be a buffer of WAV data instead of a filename
9+
// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
10+
// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
11+
bool read_audio_data(
12+
const std::string & fname,
13+
std::vector<float> & pcmf32,
14+
std::vector<std::vector<float>> & pcmf32s,
15+
bool stereo);
16+
17+
// convert timestamp (in units of 10 ms) to string, 6000 -> 00:01:00.000
18+
std::string to_timestamp(int64_t t, bool comma = false);
19+
20+
// given a timestamp get the sample
21+
int timestamp_to_sample(int64_t t, int n_samples, int whisper_sample_rate);
22+
23+
// write text to file, and call system("command voice_id file")
24+
bool speak_with_file(const std::string & command, const std::string & text, const std::string & path, int voice_id);

0 commit comments

Comments
 (0)