@@ -11,6 +11,8 @@ import 'package:speech_to_text/speech_to_text.dart' as stt;
1111import 'package:flutter_sound/flutter_sound.dart' ;
1212import 'package:web_socket_channel/web_socket_channel.dart' ;
1313
14+ import 'package:http/http.dart' as http;
15+
1416import 'package:agixt/services/secure_storage_service.dart' ;
1517import 'package:agixt/utils/url_security.dart' ;
1618import 'package:agixt/models/agixt/auth/auth.dart' ;
@@ -28,6 +30,18 @@ abstract class WhisperService {
2830
2931 Future <String > transcribe (Uint8List voiceData);
3032
/// Transcribes [voiceData], attributing text to speakers where supported.
///
/// The resulting map exposes three keys:
///  * `'text'` – the (speaker-attributed) transcript,
///  * `'segments'` – a list of segment maps with `'speaker'`, `'text'`,
///    `'start'` and `'end'` entries,
///  * `'language'` – the detected language.
///
/// This base version performs no diarization: it delegates to [transcribe],
/// ignores [numSpeakers], and reports an empty segment list and a `null`
/// language.
Future<Map<String, dynamic>> transcribeWithDiarization(
  Uint8List voiceData, {
  int? numSpeakers,
}) async {
  // Plain transcription stands in for the diarized result.
  return <String, dynamic>{
    'text': await transcribe(voiceData),
    'segments': <dynamic>[],
    'language': null,
  };
}
44+
3145 // Method for AGiXT AI integration that returns a simulated transcription
3246 Future <String ?> getTranscription () async {
3347 try {
@@ -297,6 +311,110 @@ class WhisperRemoteService implements WhisperService {
297311 }
298312 }
299313
/// Wraps raw PCM [voiceData] in a WAV container and writes it to disk.
///
/// The 44-byte header declares uncompressed PCM (format tag 1) at 16 kHz,
/// mono, 16 bits per sample. The samples themselves are copied verbatim and
/// are not inspected or converted, so [voiceData] is presumably already in
/// that format — confirm against the recorder configuration.
///
/// The file is created in the application documents directory under a random
/// UUID name. Returns its path; this method never deletes the file, so the
/// caller is responsible for removing it.
Future<String> _buildWavFile(Uint8List voiceData) async {
  const headerSize = 44;
  const sampleRate = 16000;
  const numChannels = 1;
  const bitsPerSample = 16;
  const blockAlign = numChannels * bitsPerSample ~/ 8;
  const byteRate = sampleRate * blockAlign;

  final dataSize = voiceData.length;

  // Allocate the final buffer once and fill it in place; this avoids boxing
  // every audio byte into a growable List<int> and copying it several times.
  final wav = Uint8List(headerSize + dataSize);
  final fields = ByteData.view(wav.buffer, 0, headerSize);

  // RIFF chunk descriptor. All multi-byte WAV fields are little-endian.
  wav.setRange(0, 4, ascii.encode('RIFF'));
  fields.setUint32(4, 36 + dataSize, Endian.little); // bytes after this field
  wav.setRange(8, 12, ascii.encode('WAVE'));

  // "fmt " sub-chunk.
  wav.setRange(12, 16, ascii.encode('fmt '));
  fields.setUint32(16, 16, Endian.little); // size of the fmt payload
  fields.setUint16(20, 1, Endian.little); // audio format 1 = PCM
  fields.setUint16(22, numChannels, Endian.little);
  fields.setUint32(24, sampleRate, Endian.little);
  fields.setUint32(28, byteRate, Endian.little);
  fields.setUint16(32, blockAlign, Endian.little);
  fields.setUint16(34, bitsPerSample, Endian.little);

  // "data" sub-chunk followed by the samples.
  wav.setRange(36, 40, ascii.encode('data'));
  fields.setUint32(40, dataSize, Endian.little);
  wav.setRange(headerSize, wav.length, voiceData);

  final documentDirectory = await getApplicationDocumentsDirectory();
  final wavPath = '${documentDirectory.path}/${Uuid().v4()}.wav';
  await File(wavPath).writeAsBytes(wav);
  return wavPath;
}
353+
/// Transcribes [voiceData] with speaker diarization via the remote endpoint.
///
/// The PCM data is wrapped in a temporary WAV file and POSTed as a multipart
/// form to `<base URL>/v1/audio/transcriptions` with
/// `enable_diarization=true` and `response_format=verbose_json`. When
/// [numSpeakers] is given it is forwarded as `num_speakers`. On success the
/// decoded JSON object is returned unchanged.
///
/// Any failure of the diarization request (missing base URL, timeout,
/// non-200 status, body that is not a JSON object) is logged and answered by
/// falling back to [transcribe], yielding
/// `{'text': ..., 'segments': [], 'language': null}`. Errors thrown by that
/// fallback propagate to the caller.
///
/// The temporary WAV file is removed on every path; a failed removal is
/// logged and never affects the returned result.
@override
Future<Map<String, dynamic>> transcribeWithDiarization(
  Uint8List voiceData, {
  int? numSpeakers,
}) async {
  debugPrint('Transcribing with diarization: ${voiceData.length} bytes');
  await init();

  const requestTimeout = Duration(seconds: 120);
  final wavPath = await _buildWavFile(voiceData);

  Map<String, dynamic>? diarized;
  try {
    final url = await getBaseURL();
    if (url == null) {
      throw Exception('Whisper base URL is not configured');
    }
    final sanitizedUrl = UrlSecurity.sanitizeBaseUrl(
      url,
      allowHttpOnLocalhost: true,
    );
    final apiKey = await getApiKey();
    final model = await getModel() ?? 'whisper-1';

    // Use multipart request to pass enable_diarization param
    final uri = Uri.parse('$sanitizedUrl/v1/audio/transcriptions');
    final request = http.MultipartRequest('POST', uri);
    request.headers['Authorization'] = 'Bearer ${apiKey ?? ""}';
    request.files.add(await http.MultipartFile.fromPath(
      'file',
      wavPath,
      filename: 'audio.wav',
    ));
    request.fields['model'] = model;
    request.fields['enable_diarization'] = 'true';
    request.fields['response_format'] = 'verbose_json';
    if (numSpeakers != null) {
      request.fields['num_speakers'] = numSpeakers.toString();
    }

    final streamedResponse = await request.send().timeout(requestTimeout);
    // Bound the body read as well: a server that sends headers and then
    // stalls would otherwise keep this call pending indefinitely.
    final responseBody = await streamedResponse.stream
        .bytesToString()
        .timeout(requestTimeout);

    if (streamedResponse.statusCode != 200) {
      throw Exception(
          'Diarization request failed (${streamedResponse.statusCode}): $responseBody');
    }

    final decoded = jsonDecode(responseBody) as Map<String, dynamic>;

    // Log at most the first 100 characters of the transcript.
    final fullText = decoded['text']?.toString();
    final preview = (fullText != null && fullText.length > 100)
        ? fullText.substring(0, 100)
        : fullText;
    debugPrint('Diarization result: $preview...');

    diarized = decoded;
  } catch (e) {
    // Deliberate best-effort: every diarization failure degrades to plain
    // transcription below instead of surfacing to the caller.
    debugPrint('Diarization transcription error: $e');
  } finally {
    // Cleanup lives here so a failed delete can neither discard a successful
    // result nor trigger a redundant fallback request.
    try {
      await File(wavPath).delete();
    } catch (e) {
      debugPrint('Failed to delete temporary WAV file $wavPath: $e');
    }
  }

  if (diarized != null) {
    return diarized;
  }

  // Fall back to plain transcription
  final text = await transcribe(voiceData);
  return {'text': text, 'segments': [], 'language': null};
}
417+
300418 @override
301419 Future <String ?> getTranscription () async {
302420 // Call the implementation from the abstract class
0 commit comments