
Commit d94c6fa

renefloor and Brazol authored
feat(llc): speech recognition while muted (#999)
* wip speech recognition
* Refactor feature out of callState
* Add documentation and test
* Add changelog
* formatting
* Apply suggestions from code review
* Improve error handling

Co-authored-by: Maciej Brażewicz <[email protected]>
1 parent 804b916 commit d94c6fa

9 files changed: +558 -1 lines changed

dogfooding/lib/screens/call_screen.dart

Lines changed: 20 additions & 0 deletions
@@ -43,19 +43,39 @@ class CallScreen extends StatefulWidget {
 class _CallScreenState extends State<CallScreen> {
   late final _userChatRepo = locator.get<UserChatRepository>();
   late final _videoEffectsManager = StreamVideoEffectsManager(widget.call);
+  late StreamSubscription<SpeakingWhileMutedState> _speechSubscription;

   Channel? _channel;
   ParticipantLayoutMode _currentLayoutMode = ParticipantLayoutMode.grid;
   bool _moreMenuVisible = false;
+  late SpeakingWhileMutedRecognition _speakingWhileMutedRecognition;

   @override
   void initState() {
     super.initState();
     _connectChatChannel();
+    _speakingWhileMutedRecognition =
+        SpeakingWhileMutedRecognition(call: widget.call);
+    _speechSubscription = _speakingWhileMutedRecognition.stream.listen((state) {
+      final context = this.context;
+      if (state.isSpeakingWhileMuted && context.mounted) {
+        final colorTheme = StreamVideoTheme.of(context).colorTheme;
+
+        ScaffoldMessenger.maybeOf(context)?.showSnackBar(
+          SnackBar(
+            content: const Text('You are speaking while muted'),
+            behavior: SnackBarBehavior.floating,
+            backgroundColor: colorTheme.accentPrimary,
+          ),
+        );
+      }
+    });
   }

   @override
   void dispose() {
+    _speechSubscription.cancel();
+    _speakingWhileMutedRecognition.dispose();
     widget.call.leave();
     _userChatRepo.disconnectUser();
     super.dispose();
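The hunk above shows the new `SpeakingWhileMutedRecognition` API end to end: construct it with the active call, listen to its stream, and dispose it together with the subscription. As a rough sketch of an alternative consumer (not part of this commit; the `SpeakingWhileMutedIndicator` class name, the import path, and the `ValueNotifier` wiring are illustrative assumptions), the same stream could drive a persistent UI flag instead of a SnackBar:

import 'dart:async';

import 'package:flutter/foundation.dart';
// Assumed import path; per the CHANGELOG entry below the feature ships in the
// stream_video package.
import 'package:stream_video/stream_video.dart';

/// Hypothetical helper: exposes the "speaking while muted" flag as a
/// ValueNotifier so any widget can bind to it with a ValueListenableBuilder.
class SpeakingWhileMutedIndicator {
  SpeakingWhileMutedIndicator(Call call)
      : _recognition = SpeakingWhileMutedRecognition(call: call) {
    _subscription = _recognition.stream.listen((state) {
      isSpeakingWhileMuted.value = state.isSpeakingWhileMuted;
    });
  }

  final SpeakingWhileMutedRecognition _recognition;
  late final StreamSubscription<SpeakingWhileMutedState> _subscription;

  /// True while audio input is detected on a muted microphone.
  final ValueNotifier<bool> isSpeakingWhileMuted = ValueNotifier(false);

  void dispose() {
    // Mirror the cleanup order used in call_screen.dart above.
    _subscription.cancel();
    _recognition.dispose();
    isSpeakingWhileMuted.dispose();
  }
}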

packages/stream_video/CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -2,6 +2,7 @@

 ✅ Added
 * Introduced the `reconnectTimeout` option in `CallPreferences`, allowing you to set the maximum duration the SDK will attempt to reconnect to a call before giving up.
+* `SpeakingWhileMutedRecognition` to notify users when there is audio input, but the microphone is muted.
 * Added tracking for failed SFU join attempts. If a client fails to connect to the same SFU twice, it will now request a new SFU from the Coordinator. Also added max join retries counter (default to 3) to improve call connection reliability - it can be configured by providing `maxJoinRetries` parameter in `join()` method.

 🔄 Changed
@@ -31,6 +32,7 @@
 ✅ Added
 * Added `setMirrorVideo` method to `Call` class to control video mirroring for participants.
 * Added `call.partialState` for more specific and efficient state updates.
+
 * Added `maxParticipantsExcludeOwner` and `maxParticipantsExcludeRoles` to Call limits settings, providing finer control over participant limits by allowing exclusion of call owners and specific roles from the maximum count.

 🐞 Fixed
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+import 'dart:async';
+
+abstract interface class AudioRecognition {
+  Future<void> start({
+    required SoundStateChangedCallback onSoundStateChanged,
+  });
+
+  Future<void> stop();
+
+  Future<void> dispose();
+}
+
+typedef SoundStateChangedCallback = void Function(SoundState state);
+
+class SoundState {
+  const SoundState({
+    required this.isSpeaking,
+    required this.audioLevel,
+  });
+
+  final bool isSpeaking;
+  final double audioLevel;
+}
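Since the commit message mentions added tests, it is worth noting that the `AudioRecognition` interface above is a convenient seam for stubbing the recognition logic. A minimal sketch of a test double (the `FakeAudioRecognition` name and its `emitSound` helper are illustrative, not part of this commit):

import 'dart:async';

/// Hypothetical test double for the AudioRecognition interface above;
/// it lets a test emit SoundState values on demand instead of using WebRTC.
class FakeAudioRecognition implements AudioRecognition {
  SoundStateChangedCallback? _onSoundStateChanged;

  @override
  Future<void> start({
    required SoundStateChangedCallback onSoundStateChanged,
  }) async {
    _onSoundStateChanged = onSoundStateChanged;
  }

  @override
  Future<void> stop() async {
    _onSoundStateChanged = null;
  }

  @override
  Future<void> dispose() async {
    await stop();
  }

  /// Push a fake reading, e.g. emitSound(isSpeaking: true, audioLevel: 0.8),
  /// to drive whatever listens to the recognition callbacks.
  void emitSound({required bool isSpeaking, required double audioLevel}) {
    _onSoundStateChanged?.call(
      SoundState(isSpeaking: isSpeaking, audioLevel: audioLevel),
    );
  }
}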
Lines changed: 180 additions & 0 deletions
@@ -0,0 +1,180 @@
+import 'dart:async';
+import 'dart:math' as math;
+
+import 'package:collection/collection.dart';
+import 'package:flutter/foundation.dart';
+import 'package:stream_webrtc_flutter/stream_webrtc_flutter.dart' as rtc;
+import 'package:stream_webrtc_flutter/stream_webrtc_flutter.dart';
+
+import '../../stream_video.dart';
+import '../webrtc/model/stats/rtc_audio_source.dart';
+import '../webrtc/model/stats/rtc_stats_mapper.dart';
+
+class AudioRecognitionWebRTC implements AudioRecognition {
+  AudioRecognitionWebRTC({this.config = const AudioRecognitionConfig()}) {
+    _init();
+  }
+
+  Completer<void>? _initCompleter;
+  AudioRecognitionConfig config;
+
+  late RTCPeerConnection _pc1;
+  late RTCPeerConnection _pc2;
+  MediaStream? _audioStream;
+
+  VoidCallback? _disposeTimers;
+
+  @override
+  Future<void> start({
+    required SoundStateChangedCallback onSoundStateChanged,
+  }) async {
+    if (_initCompleter case final completer?) await completer.future;
+
+    _disposeTimers = _startListening(onSoundStateChanged);
+  }
+
+  @override
+  Future<void> stop() async {
+    _disposeTimers?.call();
+    _disposeTimers = null;
+  }
+
+  @override
+  Future<void> dispose() async {
+    await stop();
+    await Future.wait([_pc1.close(), _pc2.close()]);
+    await _cleanupAudioStream();
+  }
+
+  Future<void> _init() async {
+    _initCompleter = Completer<void>();
+    try {
+      _pc1 = await rtc.createPeerConnection(const RTCConfiguration().toMap());
+      _pc2 = await rtc.createPeerConnection(const RTCConfiguration().toMap());
+
+      final audioStream = await rtc.navigator.mediaDevices.getUserMedia(
+        const AudioConstraints().toMap(),
+      );
+      _audioStream = audioStream;
+
+      _pc1.onIceCandidate = _pc2.addCandidate;
+      _pc2.onIceCandidate = _pc1.addCandidate;
+
+      audioStream.getAudioTracks().forEach((track) {
+        _pc1.addTrack(track, audioStream);
+      });
+
+      final offer = await _pc1.createOffer();
+      await _pc2.setRemoteDescription(offer);
+      await _pc1.setLocalDescription(offer);
+
+      final answer = await _pc2.createAnswer();
+      await _pc1.setRemoteDescription(answer);
+      await _pc2.setLocalDescription(answer);
+      _initCompleter?.complete();
+      _initCompleter = null;
+    } catch (e, trace) {
+      _initCompleter?.completeError(e, trace);
+    }
+  }
+
+  VoidCallback _startListening(SoundStateChangedCallback onSoundStateChanged) {
+    var baselineNoiseLevel = config.initialBaselineNoiseLevel;
+    var speechDetected = false;
+    Timer? speechTimer;
+    Timer? silenceTimer;
+    final audioLevelHistory =
+        <double>[]; // Store recent audio levels for smoother detection
+
+    Future<void> checkAudioLevel(Timer timer) async {
+      final stats = await _pc1.getStats();
+      final audioMediaSourceStats = stats
+          .map((stat) => stat.toRtcStats())
+          .whereType<RtcAudioSource>()
+          .firstOrNull;
+
+      final audioLevel = audioMediaSourceStats?.audioLevel;
+      if (audioLevel == null) return;
+
+      // Update audio level history (with max historyLength sized list)
+      audioLevelHistory.add(audioLevel);
+      if (audioLevelHistory.length > config.historyLength) {
+        audioLevelHistory.removeAt(0);
+      }
+
+      if (audioLevelHistory.length < 5) return;
+
+      // Calculate average audio level
+      final averageAudioLevel =
+          audioLevelHistory.reduce((a, b) => a + b) / audioLevelHistory.length;
+
+      // Update baseline (if necessary) based on silence detection
+      if (averageAudioLevel < baselineNoiseLevel * config.silenceThreshold) {
+        silenceTimer ??= Timer(config.silenceTimeout, () {
+          baselineNoiseLevel = math.min(
+            averageAudioLevel * config.resetThreshold,
+            baselineNoiseLevel,
+          );
+        });
+      } else {
+        silenceTimer?.cancel();
+        silenceTimer = null;
+      }
+
+      // Check for speech detection
+      if (averageAudioLevel > baselineNoiseLevel * config.speechThreshold) {
+        if (!speechDetected) {
+          speechDetected = true;
+          onSoundStateChanged(
+              SoundState(isSpeaking: true, audioLevel: averageAudioLevel));
+        }
+
+        speechTimer?.cancel();
+        speechTimer = Timer(config.speechTimeout, () {
+          speechDetected = false;
+          onSoundStateChanged(
+            SoundState(isSpeaking: false, audioLevel: averageAudioLevel),
+          );
+          speechTimer = null;
+        });
+      }
+    }
+
+    final interval =
+        Timer.periodic(const Duration(milliseconds: 100), checkAudioLevel);
+
+    return () {
+      speechTimer?.cancel();
+      silenceTimer?.cancel();
+      interval.cancel();
+    };
+  }
+
+  Future<void> _cleanupAudioStream() async {
+    _audioStream?.getAudioTracks().forEach((track) {
+      track.stop();
+    });
+    await _audioStream?.dispose();
+    _audioStream = null;
+  }
+}
+
+class AudioRecognitionConfig {
+  const AudioRecognitionConfig({
+    this.initialBaselineNoiseLevel = 0.13,
+    this.historyLength = 10,
+    this.silenceThreshold = 1.1,
+    this.speechThreshold = 5,
+    this.resetThreshold = 0.9,
+    this.speechTimeout = const Duration(milliseconds: 500),
+    this.silenceTimeout = const Duration(seconds: 5),
+  });
+
+  final double initialBaselineNoiseLevel;
+  final int historyLength;
+  final double silenceThreshold;
+  final double speechThreshold;
+  final double resetThreshold;
+  final Duration speechTimeout;
+  final Duration silenceTimeout;
+}
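Reading `_startListening` together with `AudioRecognitionConfig`: every 100 ms the detector averages the most recent samples (up to `historyLength`, at least 5), compares that average against a rolling noise baseline, reports speech when it exceeds `speechThreshold` times the baseline, and clears the flag `speechTimeout` after the last loud window. A rough usage sketch, under the assumption that `AudioRecognitionWebRTC` and `AudioRecognitionConfig` are exported for direct use (the function name and the tuned values are illustrative):

// Assumes AudioRecognitionWebRTC and AudioRecognitionConfig are importable
// from the stream_video package; the exact export path is not shown in this diff.
Future<void> monitorMicrophoneInput() async {
  // Illustrative config: require a 4x jump over the rolling baseline and
  // keep "speaking" active for 1 s after the last loud window.
  final recognition = AudioRecognitionWebRTC(
    config: const AudioRecognitionConfig(
      speechThreshold: 4,
      speechTimeout: Duration(seconds: 1),
    ),
  );

  await recognition.start(
    onSoundStateChanged: (state) {
      if (state.isSpeaking) {
        print('Audio input detected (level: ${state.audioLevel})');
      } else {
        print('Audio input stopped');
      }
    },
  );

  // ...later, when monitoring is no longer needed:
  await recognition.stop();
  await recognition.dispose();
}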
