@@ -53,6 +53,9 @@ export class TranscriptionService {
5353 private client : OpenAI | null = null ;
5454 private cachedApiKey : string | null = null ;
5555 private MAX_RETRIES = 3 ;
56+ private readonly CHUNK_SIZE_BYTES = 20 * 1024 * 1024 ;
57+ private readonly WAV_HEADER_SIZE = 44 ;
58+ private readonly PCM_BYTES_PER_SAMPLE = 2 ;
5659 private isTranscribing = false ;
5760
5861 constructor ( plugin : PodNotes ) {
@@ -112,13 +115,13 @@ export class TranscriptionService {
112115 const fileExtension = podcastFile . extension ;
113116 const mimeType = this . getMimeType ( fileExtension ) ;
114117
115- const chunks = this . chunkFile ( fileBuffer ) ;
116- const files = this . createChunkFiles (
117- chunks ,
118- podcastFile . basename ,
119- fileExtension ,
118+ notice . update ( "Creating audio chunks..." ) ;
119+ const files = await this . createChunkFiles ( {
120+ buffer : fileBuffer ,
121+ basename : podcastFile . basename ,
122+ extension : fileExtension ,
120123 mimeType,
121- ) ;
124+ } ) ;
122125
123126 notice . update ( "Starting transcription..." ) ;
124127 const transcription = await this . transcribeChunks ( files , notice . update ) ;
@@ -138,28 +141,201 @@ export class TranscriptionService {
138141 }
139142 }
140143
141- private chunkFile ( fileBuffer : ArrayBuffer ) : ArrayBuffer [ ] {
142- const CHUNK_SIZE_MB = 20 ;
143- const chunkSizeBytes = CHUNK_SIZE_MB * 1024 * 1024 ; // Convert MB to bytes
144- const chunks : ArrayBuffer [ ] = [ ] ;
145- for ( let i = 0 ; i < fileBuffer . byteLength ; i += chunkSizeBytes ) {
146- chunks . push ( fileBuffer . slice ( i , i + chunkSizeBytes ) ) ;
/**
 * Produces the list of files that will be uploaded for transcription.
 *
 * When `shouldConvertToWav` accepts the extension/MIME type, the audio is
 * first passed to `convertToWavChunks`. If that yields no files, or the
 * input is not eligible for conversion, the raw bytes are split by
 * `createBinaryChunkFiles` instead.
 *
 * @param options.buffer Raw bytes of the audio file.
 * @param options.basename File name without extension, used to name chunks.
 * @param options.extension File extension of the source audio.
 * @param options.mimeType MIME type of the source audio.
 * @returns The files to upload (WAV chunks or raw binary chunks).
 */
private async createChunkFiles(options: {
	buffer: ArrayBuffer;
	basename: string;
	extension: string;
	mimeType: string;
}): Promise<File[]> {
	const { buffer, basename, extension, mimeType } = options;

	// Inputs that are not eligible for conversion go straight to byte slicing.
	if (!this.shouldConvertToWav(extension, mimeType)) {
		return this.createBinaryChunkFiles(buffer, basename, extension, mimeType);
	}

	// An empty result means the conversion was unavailable or failed.
	const wavFiles = await this.convertToWavChunks(buffer, basename);
	return wavFiles.length > 0
		? wavFiles
		: this.createBinaryChunkFiles(buffer, basename, extension, mimeType);
}
150164
151- private createChunkFiles (
152- chunks : ArrayBuffer [ ] ,
153- fileName : string ,
154- fileExtension : string ,
/**
 * Reports whether the audio should be re-encoded as WAV before chunking.
 *
 * @param extension File extension; compared case-insensitively to `m4a`.
 * @param mimeType MIME type; compared exactly to `audio/mp4`.
 * @returns `true` when either comparison matches.
 */
private shouldConvertToWav(extension: string, mimeType: string): boolean {
	const isM4aExtension = extension.toLowerCase() === "m4a";
	const isMp4Mime = mimeType === "audio/mp4";
	return isM4aExtension || isMp4Mime;
}
169+
/**
 * Splits the raw file bytes into `File` objects of at most
 * `CHUNK_SIZE_BYTES` each, without decoding the audio.
 *
 * A buffer that already fits in one chunk is returned as a single file
 * named `<basename>.<extension>`; larger buffers produce
 * `<basename>.part<N>.<extension>` files, numbered from 0.
 *
 * @param buffer Raw bytes of the audio file.
 * @param basename File name without extension.
 * @param extension Extension appended to every produced file name.
 * @param mimeType MIME type assigned to every produced file.
 * @returns The chunk files in byte order.
 */
private createBinaryChunkFiles(
	buffer: ArrayBuffer,
	basename: string,
	extension: string,
	mimeType: string,
): File[] {
	const chunkSize = this.CHUNK_SIZE_BYTES;
	const totalBytes = buffer.byteLength;

	if (totalBytes <= chunkSize) {
		const wholeFile = new File([buffer], `${basename}.${extension}`, {
			type: mimeType,
		});
		return [wholeFile];
	}

	// Each part covers [part * chunkSize, (part + 1) * chunkSize); `slice`
	// clamps the end of the final part to the buffer length.
	const partCount = Math.ceil(totalBytes / chunkSize);
	return Array.from({ length: partCount }, (_, part) => {
		const begin = part * chunkSize;
		const bytes = buffer.slice(begin, begin + chunkSize);
		return new File([bytes], `${basename}.part${part}.${extension}`, {
			type: mimeType,
		});
	});
}
200+
/**
 * Decodes the audio with an `AudioContext` and re-encodes it as WAV chunks.
 *
 * This is best-effort: when no audio context can be obtained, or decoding
 * or rendering throws, the failure is logged and an empty array is
 * returned. The context is closed before returning in every case.
 *
 * @param buffer Encoded audio bytes; a copy is handed to the decoder.
 * @param basename File name without extension, used to name the chunks.
 * @returns The WAV chunk files, or `[]` when conversion was not possible.
 */
private async convertToWavChunks(
	buffer: ArrayBuffer,
	basename: string,
): Promise<File[]> {
	const context = this.createAudioContext();
	if (!context) {
		return [];
	}

	let wavFiles: File[] = [];
	try {
		// Decode a copy so the caller's buffer is left untouched.
		const decoded = await context.decodeAudioData(buffer.slice(0));
		wavFiles = this.renderWavChunks(decoded, basename);
	} catch (error) {
		console.warn("Failed to convert audio buffer for transcription", error);
	} finally {
		try {
			await context.close();
		} catch (closeError) {
			console.warn("Failed to close audio context", closeError);
		}
	}

	return wavFiles;
}
222+
/**
 * Creates an `AudioContext`, falling back to the prefixed
 * `webkitAudioContext` constructor when the standard one is missing.
 *
 * @returns A new audio context, or `null` when there is no `window`, no
 * constructor is exposed, or constructing the context throws.
 */
private createAudioContext(): AudioContext | null {
	if (typeof window === "undefined") {
		return null;
	}

	const contextCtor =
		window.AudioContext ||
		(window as typeof window & { webkitAudioContext?: typeof AudioContext })
			.webkitAudioContext;
	if (!contextCtor) {
		return null;
	}

	// The constructor itself may throw (the Web Audio specification permits
	// it, e.g. InvalidStateError). Callers treat `null` as "conversion
	// unavailable", so report the failure that way instead of letting the
	// exception abort the whole transcription.
	try {
		return new contextCtor();
	} catch (error) {
		console.warn("Failed to create audio context", error);
		return null;
	}
}
238+
/**
 * Splits decoded audio into WAV files whose size (header included) stays
 * within `CHUNK_SIZE_BYTES`, except that every chunk carries at least one
 * frame.
 *
 * Files are named `<basename>.part<N>.wav`, numbered from 0, and typed
 * `audio/wav`.
 *
 * @param audioBuffer Decoded audio to encode.
 * @param basename File name without extension, used to name the chunks.
 * @returns The WAV files in playback order; empty when the buffer has no frames.
 */
private renderWavChunks(audioBuffer: AudioBuffer, basename: string): File[] {
	const channelCount = audioBuffer.numberOfChannels;
	const frameBytes = channelCount * this.PCM_BYTES_PER_SAMPLE;
	const payloadBudget = this.CHUNK_SIZE_BYTES - this.WAV_HEADER_SIZE;
	// Number of frames (one sample per channel) that fit in a chunk's payload.
	const framesPerChunk = Math.max(1, Math.floor(payloadBudget / frameBytes));

	const channels: Float32Array[] = [];
	for (let channel = 0; channel < channelCount; channel++) {
		channels.push(audioBuffer.getChannelData(channel));
	}

	const totalFrames = audioBuffer.length;
	const result: File[] = [];
	let firstFrame = 0;
	while (firstFrame < totalFrames) {
		const lastFrame = Math.min(totalFrames, firstFrame + framesPerChunk);
		const wav = this.renderWavBuffer(
			channels,
			audioBuffer.sampleRate,
			firstFrame,
			lastFrame,
		);
		// `result.length` is the zero-based index of the chunk being added.
		const partName = `${basename}.part${result.length}.wav`;
		result.push(new File([wav], partName, { type: "audio/wav" }));
		firstFrame += framesPerChunk;
	}

	return result;
}
278+
/**
 * Encodes the frames in `[startSample, endSample)` as a WAV file image:
 * a header followed by interleaved little-endian signed PCM samples of
 * `PCM_BYTES_PER_SAMPLE` bytes each.
 *
 * @param channelData One sample array per channel.
 * @param sampleRate Sample rate recorded in the header.
 * @param startSample Index of the first frame to encode (inclusive).
 * @param endSample Index one past the last frame to encode.
 * @returns A buffer containing the header and the encoded samples.
 */
private renderWavBuffer(
	channelData: Float32Array[],
	sampleRate: number,
	startSample: number,
	endSample: number,
): ArrayBuffer {
	const channelCount = channelData.length;
	const frameCount = Math.max(0, endSample - startSample);
	const frameBytes = channelCount * this.PCM_BYTES_PER_SAMPLE;
	const wav = new ArrayBuffer(this.WAV_HEADER_SIZE + frameCount * frameBytes);
	const view = new DataView(wav);
	this.writeWavHeader(view, sampleRate, channelCount, frameCount);

	let cursor = this.WAV_HEADER_SIZE;
	const stopFrame = startSample + frameCount;
	for (let frame = startSample; frame < stopFrame; frame++) {
		// Interleave: one sample from each channel per frame.
		for (const samples of channelData) {
			const value = samples[frame] ?? 0;
			const bounded = Math.min(1, Math.max(-1, value));
			// Negative values scale to -32768, non-negative ones to 32767.
			const scale = bounded < 0 ? 0x8000 : 0x7fff;
			view.setInt16(cursor, Math.round(bounded * scale), true);
			cursor += this.PCM_BYTES_PER_SAMPLE;
		}
	}

	return wav;
}
310+
/**
 * Writes a 44-byte RIFF/WAVE header for uncompressed PCM audio at the
 * start of `view`. All multi-byte fields are little-endian.
 *
 * @param view View over the destination buffer.
 * @param sampleRate Sample rate in Hz.
 * @param numChannels Number of interleaved channels.
 * @param sampleCount Number of frames that follow the header.
 */
private writeWavHeader(
	view: DataView,
	sampleRate: number,
	numChannels: number,
	sampleCount: number,
): void {
	const littleEndian = true;
	const frameBytes = numChannels * this.PCM_BYTES_PER_SAMPLE;
	const payloadBytes = sampleCount * frameBytes;
	const bytesPerSecond = sampleRate * frameBytes;
	const bitsPerSample = this.PCM_BYTES_PER_SAMPLE * 8;

	// RIFF container: the size field counts everything after itself
	// (36 remaining header bytes plus the sample data).
	this.writeString(view, 0, "RIFF");
	view.setUint32(4, 36 + payloadBytes, littleEndian);
	this.writeString(view, 8, "WAVE");

	// "fmt " sub-chunk: 16-byte body, format tag 1 (PCM).
	this.writeString(view, 12, "fmt ");
	view.setUint32(16, 16, littleEndian);
	view.setUint16(20, 1, littleEndian);
	view.setUint16(22, numChannels, littleEndian);
	view.setUint32(24, sampleRate, littleEndian);
	view.setUint32(28, bytesPerSecond, littleEndian);
	view.setUint16(32, frameBytes, littleEndian);
	view.setUint16(34, bitsPerSample, littleEndian);

	// "data" sub-chunk header; the samples are written by the caller.
	this.writeString(view, 36, "data");
	view.setUint32(40, payloadBytes, littleEndian);
}
334+
/**
 * Writes each UTF-16 code unit of `str` as a single byte, starting at
 * `offset` in `view`.
 *
 * @param view Destination view.
 * @param offset Byte offset of the first character.
 * @param str Text to write.
 */
private writeString(view: DataView, offset: number, str: string): void {
	// split("") yields one entry per UTF-16 code unit, matching charCodeAt(i).
	str.split("").forEach((unit, position) => {
		view.setUint8(offset + position, unit.charCodeAt(0));
	});
}
164340
165341 private getMimeType ( fileExtension : string ) : string {
0 commit comments