improve right button handling

Josh-XT · Josh-XT · commit 1d5e77fc3a79 · 2026-03-05T14:36:50.000-05:00
diff --git a/lib/services/ai_service.dart b/lib/services/ai_service.dart
@@ -530,35 +530,106 @@ class AIService {
     try {
       await _showProcessingMessage();
 
-      // Transcribe the audio
-      debugPrint('AIService: Transcribing conversation audio...');
-      String? transcription;
-      if (_whisperService != null) {
-        transcription = await _whisperService!.transcribe(audioData);
-      } else {
+      // Ensure whisper service is available
+      if (_whisperService == null) {
         debugPrint('AIService: WhisperService is null, initializing...');
         await _initWhisperService();
-        if (_whisperService != null) {
-          transcription = await _whisperService!.transcribe(audioData);
-        }
       }
 
+      if (_whisperService == null) {
+        await _showErrorMessage('Could not initialize transcription service');
+        return;
+      }
+
+      // Use diarization to identify speakers in the conversation
+      debugPrint('AIService: Transcribing with speaker diarization...');
+      Map<String, dynamic> result;
+      try {
+        result =
+            await _whisperService!.transcribeWithDiarization(audioData);
+      } catch (e) {
+        debugPrint('AIService: Diarization failed, falling back to plain transcription: $e');
+        final plainText = await _whisperService!.transcribe(audioData);
+        result = {'text': plainText, 'segments': [], 'language': null};
+      }
+
+      final transcription = result['text'] as String?;
       if (transcription == null || transcription.isEmpty) {
         debugPrint('AIService: Conversation transcription failed or empty');
         await _showErrorMessage('Could not transcribe conversation');
         return;
       }
 
-      debugPrint('AIService: Conversation transcription: $transcription');
+      // Build speaker-attributed transcription from segments if available
+      final segments = result['segments'] as List<dynamic>? ?? [];
+      String formattedTranscription;
+      if (segments.isNotEmpty &&
+          segments.any((s) => s is Map && s.containsKey('speaker'))) {
+        final buffer = StringBuffer();
+        String? currentSpeaker;
+        for (final seg in segments) {
+          if (seg is Map) {
+            final speaker = seg['speaker'] as String? ?? 'SPEAKER_00';
+            final text = (seg['text'] as String? ?? '').trim();
+            if (text.isEmpty) continue;
+            if (speaker != currentSpeaker) {
+              if (buffer.isNotEmpty) buffer.writeln();
+              buffer.write('[$speaker]: ');
+              currentSpeaker = speaker;
+            }
+            buffer.write('$text ');
+          }
+        }
+        formattedTranscription = buffer.toString().trim();
+      } else {
+        formattedTranscription = transcription;
+      }
+
+      debugPrint(
+          'AIService: Diarized transcription: ${formattedTranscription.substring(0, formattedTranscription.length.clamp(0, 200))}');
+
+      // Build speaker-aware summary prompt
+      final hasSpeakers = segments.isNotEmpty &&
+          segments.any((s) => s is Map && s.containsKey('speaker'));
+      final speakerSet = <String>{};
+      if (hasSpeakers) {
+        for (final seg in segments) {
+          if (seg is Map && seg['speaker'] != null) {
+            speakerSet.add(seg['speaker'] as String);
+          }
+        }
+      }
 
-      // Wrap transcription in a conversation summary prompt
-      final prompt = 'The following is a transcription of a recorded '
-          'conversation. Please:\n'
-          '1. Summarize the conversation\n'
-          '2. Extract potentially important notes and highlights\n'
-          '3. Identify specific goals if mentioned\n'
-          '4. List any action items\n\n'
-          'Transcription:\n$transcription';
+      String prompt;
+      if (hasSpeakers && speakerSet.length > 1) {
+        prompt = 'The following is a speaker-diarized transcription of a '
+            'recorded conversation with ${speakerSet.length} speakers '
+            '(${speakerSet.join(", ")}). Please provide:\n\n'
+            '## Summary\n'
+            'A concise summary of the conversation.\n\n'
+            '## Key Points\n'
+            'Important information, decisions, or highlights.\n\n'
+            '## Action Items\n'
+            'List action items grouped by speaker. For each item, note:\n'
+            '- Who is responsible (which speaker)\n'
+            '- What they need to do\n'
+            '- Any deadlines or priorities mentioned\n\n'
+            '## Questions & Follow-ups\n'
+            'Any unresolved questions or topics that need follow-up.\n\n'
+            'Transcription:\n$formattedTranscription';
+      } else {
+        prompt = 'The following is a transcription of a recorded '
+            'conversation. Please provide:\n\n'
+            '## Summary\n'
+            'A concise summary of what was discussed.\n\n'
+            '## Key Points\n'
+            'Important information, decisions, or highlights.\n\n'
+            '## Action Items\n'
+            'Any action items or tasks mentioned.\n\n'
+            '## Questions & Follow-ups\n'
+            'Any unresolved questions or topics that need follow-up.\n\n'
+            'Transcription:\n$formattedTranscription';
+      }
 
       // Send to AGiXT
       await _sendMessageToAGiXT(prompt);
diff --git a/lib/services/whisper.dart b/lib/services/whisper.dart
@@ -11,6 +11,8 @@ import 'package:speech_to_text/speech_to_text.dart' as stt;
 import 'package:flutter_sound/flutter_sound.dart';
 import 'package:web_socket_channel/web_socket_channel.dart';
 
+import 'package:http/http.dart' as http;
+
 import 'package:agixt/services/secure_storage_service.dart';
 import 'package:agixt/utils/url_security.dart';
 import 'package:agixt/models/agixt/auth/auth.dart';
@@ -28,6 +30,18 @@ abstract class WhisperService {
 
   Future<String> transcribe(Uint8List voiceData);
 
+  /// Transcribes [voiceData], attributing text to speakers where supported.
+  ///
+  /// Resolves to a map holding 'text', 'segments' (maps carrying 'speaker',
+  /// 'text', 'start' and 'end') and 'language'. This base version does no
+  /// diarization: it returns the plain transcript, an empty segment list
+  /// and a null language, and it ignores [numSpeakers].
+  Future<Map<String, dynamic>> transcribeWithDiarization(
+    Uint8List voiceData, {
+    int? numSpeakers,
+  }) async =>
+      {'text': await transcribe(voiceData), 'segments': [], 'language': null};
+
   // Method for AGiXT AI integration that returns a simulated transcription
   Future<String?> getTranscription() async {
     try {
@@ -297,6 +311,110 @@ class WhisperRemoteService implements WhisperService {
     }
   }
 
+  /// Wraps raw PCM samples in a WAV container and saves it to a new file.
+  ///
+  /// The header declares 16 kHz mono 16-bit PCM; the file gets a UUID name
+  /// in the app documents directory. Returns the path of the written file.
+  Future<String> _buildWavFile(Uint8List voiceData) async {
+    final docsDir = await getApplicationDocumentsDirectory();
+    final wavPath = '${docsDir.path}/${Uuid().v4()}.wav';
+
+    const sampleRate = 16000;
+    const channelCount = 1;
+    const bitsPerSample = 16;
+    const bytesPerFrame = channelCount * 2;
+    const bytesPerSecond = sampleRate * bytesPerFrame;
+    const headerLength = 44;
+
+    // Little-endian byte encoders for the numeric header fields.
+    List<int> u16(int v) => [v & 0xff, (v >> 8) & 0xff];
+    List<int> u32(int v) => [...u16(v), ...u16(v >> 16)];
+
+    final wavBytes = Uint8List(headerLength + voiceData.length)
+      ..setAll(0, [
+        ...ascii.encode('RIFF'),
+        ...u32(headerLength - 8 + voiceData.length), // RIFF chunk size
+        ...ascii.encode('WAVE'),
+        ...ascii.encode('fmt '),
+        ...u32(16), // size of the fmt sub-chunk
+        ...u16(1), // audio format 1 = PCM
+        ...u16(channelCount),
+        ...u32(sampleRate),
+        ...u32(bytesPerSecond),
+        ...u16(bytesPerFrame),
+        ...u16(bitsPerSample),
+        ...ascii.encode('data'),
+        ...u32(voiceData.length),
+      ])
+      ..setAll(headerLength, voiceData);
+    await File(wavPath).writeAsBytes(wavBytes);
+    return wavPath;
+  }
+
+  /// Transcribes [voiceData] remotely with speaker diarization enabled.
+  ///
+  /// Uploads the audio as WAV to `/v1/audio/transcriptions` and returns the
+  /// decoded `verbose_json` body; [numSpeakers] is sent as `num_speakers`.
+  /// On any failure, falls back to [transcribe] with no segments.
+  @override
+  Future<Map<String, dynamic>> transcribeWithDiarization(
+    Uint8List voiceData, {
+    int? numSpeakers,
+  }) async {
+    debugPrint('Transcribing with diarization: ${voiceData.length} bytes');
+    await init();
+    final wavPath = await _buildWavFile(voiceData);
+
+    try {
+      final url = await getBaseURL();
+      if (url == null) {
+        throw Exception('Transcription base URL is not configured');
+      }
+      final sanitizedUrl =
+          UrlSecurity.sanitizeBaseUrl(url, allowHttpOnLocalhost: true);
+      final apiKey = await getApiKey();
+      final model = await getModel() ?? 'whisper-1';
+
+      // Use multipart request to pass enable_diarization param
+      final uri = Uri.parse('$sanitizedUrl/v1/audio/transcriptions');
+      final request = http.MultipartRequest('POST', uri);
+      request.headers['Authorization'] = 'Bearer ${apiKey ?? ""}';
+      request.files.add(await http.MultipartFile.fromPath('file', wavPath,
+          filename: 'audio.wav'));
+      request.fields['model'] = model;
+      request.fields['enable_diarization'] = 'true';
+      request.fields['response_format'] = 'verbose_json';
+      if (numSpeakers != null) {
+        request.fields['num_speakers'] = numSpeakers.toString();
+      }
+
+      final streamedResponse =
+          await request.send().timeout(const Duration(seconds: 120));
+      final responseBody = await streamedResponse.stream.bytesToString();
+      if (streamedResponse.statusCode != 200) {
+        throw Exception(
+            'Diarization request failed (${streamedResponse.statusCode}): $responseBody');
+      }
+
+      final result = jsonDecode(responseBody) as Map<String, dynamic>;
+      final preview = '${result['text']}';
+      debugPrint('Diarization result: '
+          '${preview.length > 100 ? preview.substring(0, 100) : preview}...');
+      return result;
+    } catch (e) {
+      debugPrint('Diarization transcription error: $e');
+    } finally {
+      // Best-effort cleanup: a failed delete must not discard a good result.
+      try {
+        await File(wavPath).delete();
+      } catch (_) {}
+    }
+
+    // Fall back to plain transcription
+    final text = await transcribe(voiceData);
+    return {'text': text, 'segments': [], 'language': null};
+  }
+
   @override
   Future<String?> getTranscription() async {
     // Call the implementation from the abstract class