Skip to content

Commit ec68ab3

Browse files
authored
fix: convert m4a chunks to wav (#128)
1 parent e02c517 commit ec68ab3

File tree

1 file changed

+196
-20
lines changed

1 file changed

+196
-20
lines changed

src/services/TranscriptionService.ts

Lines changed: 196 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,9 @@ export class TranscriptionService {
5353
private client: OpenAI | null = null;
5454
private cachedApiKey: string | null = null;
5555
private MAX_RETRIES = 3;
56+
private readonly CHUNK_SIZE_BYTES = 20 * 1024 * 1024;
57+
private readonly WAV_HEADER_SIZE = 44;
58+
private readonly PCM_BYTES_PER_SAMPLE = 2;
5659
private isTranscribing = false;
5760

5861
constructor(plugin: PodNotes) {
@@ -112,13 +115,13 @@ export class TranscriptionService {
112115
const fileExtension = podcastFile.extension;
113116
const mimeType = this.getMimeType(fileExtension);
114117

115-
const chunks = this.chunkFile(fileBuffer);
116-
const files = this.createChunkFiles(
117-
chunks,
118-
podcastFile.basename,
119-
fileExtension,
118+
notice.update("Creating audio chunks...");
119+
const files = await this.createChunkFiles({
120+
buffer: fileBuffer,
121+
basename: podcastFile.basename,
122+
extension: fileExtension,
120123
mimeType,
121-
);
124+
});
122125

123126
notice.update("Starting transcription...");
124127
const transcription = await this.transcribeChunks(files, notice.update);
@@ -138,28 +141,201 @@ export class TranscriptionService {
138141
}
139142
}
140143

141-
private chunkFile(fileBuffer: ArrayBuffer): ArrayBuffer[] {
142-
const CHUNK_SIZE_MB = 20;
143-
const chunkSizeBytes = CHUNK_SIZE_MB * 1024 * 1024; // Convert MB to bytes
144-
const chunks: ArrayBuffer[] = [];
145-
for (let i = 0; i < fileBuffer.byteLength; i += chunkSizeBytes) {
146-
chunks.push(fileBuffer.slice(i, i + chunkSizeBytes));
144+
/**
 * Builds the File chunks handed to the transcription API.
 *
 * M4a / audio-mp4 input is first decoded and re-encoded as WAV chunks
 * (raw byte slices of an m4a container are presumably not independently
 * decodable — hence the conversion; see commit "convert m4a chunks to wav").
 * Any other format — or a failed conversion — falls back to plain binary
 * slicing of the original buffer.
 */
private async createChunkFiles({
	buffer,
	basename,
	extension,
	mimeType,
}: {
	buffer: ArrayBuffer;
	basename: string;
	extension: string;
	mimeType: string;
}): Promise<File[]> {
	const wantsWav = this.shouldConvertToWav(extension, mimeType);
	if (wantsWav) {
		const converted = await this.convertToWavChunks(buffer, basename);
		if (converted.length > 0) return converted;
	}

	// Fallback: raw byte-range chunks in the original container format.
	return this.createBinaryChunkFiles(buffer, basename, extension, mimeType);
}
150164

151-
private createChunkFiles(
152-
chunks: ArrayBuffer[],
153-
fileName: string,
154-
fileExtension: string,
165+
/**
 * True when the audio should be decoded and re-encoded as WAV before
 * chunking — i.e. an m4a file (any case) or an audio/mp4 MIME type.
 */
private shouldConvertToWav(extension: string, mimeType: string): boolean {
	if (mimeType === "audio/mp4") return true;
	return extension.toLowerCase() === "m4a";
}
169+
170+
/**
 * Slices the raw buffer into File objects of at most CHUNK_SIZE_BYTES.
 *
 * A buffer that already fits in one chunk keeps its original name
 * (`basename.ext`); anything larger is split into `basename.partN.ext`
 * pieces, N starting at 0.
 */
private createBinaryChunkFiles(
	buffer: ArrayBuffer,
	basename: string,
	extension: string,
	mimeType: string,
): File[] {
	const total = buffer.byteLength;

	// Small enough: ship the whole buffer as a single, un-suffixed file.
	if (total <= this.CHUNK_SIZE_BYTES) {
		return [
			new File([buffer], `${basename}.${extension}`, { type: mimeType }),
		];
	}

	const chunkCount = Math.ceil(total / this.CHUNK_SIZE_BYTES);
	return Array.from({ length: chunkCount }, (_, index) => {
		const start = index * this.CHUNK_SIZE_BYTES;
		// slice() clamps past-the-end, so the final chunk is simply shorter.
		const piece = buffer.slice(start, start + this.CHUNK_SIZE_BYTES);
		return new File([piece], `${basename}.part${index}.${extension}`, {
			type: mimeType,
		});
	});
}
200+
201+
/**
 * Decodes the buffer via the Web Audio API and re-encodes it as one or
 * more WAV files.
 *
 * Returns [] when no AudioContext is available or decoding fails, so the
 * caller can fall back to binary chunking instead of aborting. The
 * context is always closed, even on failure.
 */
private async convertToWavChunks(
	buffer: ArrayBuffer,
	basename: string,
): Promise<File[]> {
	const ctx = this.createAudioContext();
	if (!ctx) return [];

	try {
		// decodeAudioData detaches its input buffer, so hand it a copy.
		const decoded = await ctx.decodeAudioData(buffer.slice(0));
		return this.renderWavChunks(decoded, basename);
	} catch (error) {
		console.warn("Failed to convert audio buffer for transcription", error);
		return [];
	} finally {
		try {
			await ctx.close();
		} catch (error) {
			console.warn("Failed to close audio context", error);
		}
	}
}
222+
223+
/**
 * Instantiates an AudioContext, falling back to the legacy prefixed
 * webkitAudioContext. Returns null outside a browser window or when
 * neither constructor exists, signalling the caller to skip conversion.
 */
private createAudioContext(): AudioContext | null {
	if (typeof window === "undefined") return null;

	const win = window as typeof window & {
		webkitAudioContext?: typeof AudioContext;
	};
	const Ctor = win.AudioContext || win.webkitAudioContext;
	return Ctor ? new Ctor() : null;
}
238+
239+
/**
 * Splits decoded PCM audio into WAV files no larger than CHUNK_SIZE_BYTES
 * (44-byte header included). Each chunk covers a contiguous run of sample
 * frames across all channels; frames are never split between chunks.
 */
private renderWavChunks(audioBuffer: AudioBuffer, basename: string): File[] {
	const channels = audioBuffer.numberOfChannels;
	const frameBytes = channels * this.PCM_BYTES_PER_SAMPLE;
	// Frames that fit in a chunk after reserving room for the WAV header.
	// Math.max(1, …) guards against a degenerate frame size exceeding the budget.
	const framesPerChunk = Math.max(
		1,
		Math.floor((this.CHUNK_SIZE_BYTES - this.WAV_HEADER_SIZE) / frameBytes),
	);

	const channelData: Float32Array[] = [];
	for (let ch = 0; ch < channels; ch++) {
		channelData.push(audioBuffer.getChannelData(ch));
	}

	const files: File[] = [];
	for (
		let start = 0, part = 0;
		start < audioBuffer.length;
		start += framesPerChunk, part++
	) {
		const end = Math.min(audioBuffer.length, start + framesPerChunk);
		const wav = this.renderWavBuffer(
			channelData,
			audioBuffer.sampleRate,
			start,
			end,
		);
		files.push(
			new File([wav], `${basename}.part${part}.wav`, { type: "audio/wav" }),
		);
	}

	return files;
}
278+
279+
/**
 * Encodes frames [startSample, endSample) of the given per-channel float
 * data as a complete WAV file: 44-byte header followed by interleaved
 * 16-bit little-endian PCM.
 */
private renderWavBuffer(
	channelData: Float32Array[],
	sampleRate: number,
	startSample: number,
	endSample: number,
): ArrayBuffer {
	const numChannels = channelData.length;
	const frames = Math.max(0, endSample - startSample);
	const blockAlign = numChannels * this.PCM_BYTES_PER_SAMPLE;

	const out = new ArrayBuffer(this.WAV_HEADER_SIZE + frames * blockAlign);
	const view = new DataView(out);
	this.writeWavHeader(view, sampleRate, numChannels, frames);

	let cursor = this.WAV_HEADER_SIZE;
	for (let frame = 0; frame < frames; frame++) {
		for (let ch = 0; ch < numChannels; ch++) {
			// Clamp to [-1, 1], then scale asymmetrically so -1 maps to
			// -32768 and +1 maps to 32767.
			const raw = channelData[ch][startSample + frame] ?? 0;
			const clamped = Math.max(-1, Math.min(1, raw));
			const scaled = clamped < 0 ? clamped * 0x8000 : clamped * 0x7fff;
			view.setInt16(cursor, Math.round(scaled), true);
			cursor += this.PCM_BYTES_PER_SAMPLE;
		}
	}

	return out;
}
310+
311+
/**
 * Writes the 44-byte RIFF/WAVE header for 16-bit linear PCM at the start
 * of the view. All multi-byte fields are little-endian per the RIFF spec.
 */
private writeWavHeader(
	view: DataView,
	sampleRate: number,
	numChannels: number,
	sampleCount: number,
): void {
	const blockAlign = numChannels * this.PCM_BYTES_PER_SAMPLE;
	const byteRate = sampleRate * blockAlign;
	const dataSize = sampleCount * blockAlign;
	const bitsPerSample = this.PCM_BYTES_PER_SAMPLE * 8;

	// RIFF chunk descriptor.
	this.writeString(view, 0, "RIFF");
	view.setUint32(4, 36 + dataSize, true); // file size minus the 8-byte RIFF prefix
	this.writeString(view, 8, "WAVE");

	// "fmt " sub-chunk: PCM format description.
	this.writeString(view, 12, "fmt ");
	view.setUint32(16, 16, true); // fmt chunk body length for PCM
	view.setUint16(20, 1, true); // audio format 1 = linear PCM
	view.setUint16(22, numChannels, true);
	view.setUint32(24, sampleRate, true);
	view.setUint32(28, byteRate, true);
	view.setUint16(32, blockAlign, true);
	view.setUint16(34, bitsPerSample, true);

	// "data" sub-chunk header; the PCM payload follows immediately.
	this.writeString(view, 36, "data");
	view.setUint32(40, dataSize, true);
}
334+
335+
/**
 * Writes an ASCII tag (e.g. "RIFF", "data") into the view, one byte per
 * UTF-16 code unit, starting at the given offset.
 */
private writeString(view: DataView, offset: number, str: string): void {
	str.split("").forEach((ch, i) => {
		view.setUint8(offset + i, ch.charCodeAt(0));
	});
}
164340

165341
private getMimeType(fileExtension: string): string {

0 commit comments

Comments
 (0)