feat: Add field indicating whether audio frames are from the loudest speaker

tyler-albert · copybara-github · commit 77f13786e99d · 2025-02-02T11:25:14.000-08:00
PiperOrigin-RevId: 722391150
diff --git a/native_with_state/api/media_api_client_interface.h b/native_with_state/api/media_api_client_interface.h
@@ -121,6 +121,7 @@ struct AudioFrame {
   int sample_rate;
   size_t number_of_channels;
   size_t number_of_frames;
+  bool is_from_loudest_speaker;
   /// Contributing source (CSRC) of the current audio frame. This ID is used to
   /// identify which participant in the conference generated the frame.
   /// Integrators can cross reference this value with values pushed from Meet
diff --git a/native_with_state/internal/conference_media_tracks.cc b/native_with_state/internal/conference_media_tracks.cc
@@ -40,19 +40,32 @@ void ConferenceAudioTrack::OnData(
                << ". Expected 16.";
     return;
   }
+
   // Audio data is expected to be in PCM format, where each sample is 16 bits.
   const auto* pcm_data = reinterpret_cast<const int16_t*>(audio_data);
 
+  bool is_from_loudest_speaker = false;
   std::optional<uint32_t> csrc;
   std::optional<uint32_t> ssrc;
   // Audio csrcs and ssrcs are not included in the audio data. Therefore,
   // extract them from the RtpReceiver.
   for (const auto& rtp_source : receiver_->GetSources()) {
-    // It is expected that there will be only one CSRC and SSRC per audio frame.
+    // Each audio frame is expected to carry 1 or 2 contributing sources. The
+    // contributing source corresponding to the participant's audio stream will
+    // always be present. Meet may also send a contributing source with value
+    // `kLoudestSpeakerCsrc` to indicate that this audio stream is from the
+    // loudest speaker.
+    //
+    // Knowing the loudest speaker can be useful, as it can be used to determine
+    // which participant to prioritize when rendering audio or video (although
+    // other methods may be used as well).
     if (rtp_source.source_type() == webrtc::RtpSourceType::CSRC) {
-      csrc = rtp_source.source_id();
-    }
-    if (rtp_source.source_type() == webrtc::RtpSourceType::SSRC) {
+      if (rtp_source.source_id() == kLoudestSpeakerCsrc) {
+        is_from_loudest_speaker = true;
+      } else {
+        csrc = rtp_source.source_id();
+      }
+    } else if (rtp_source.source_type() == webrtc::RtpSourceType::SSRC) {
       ssrc = rtp_source.source_id();
     }
   }
@@ -67,11 +80,6 @@ void ConferenceAudioTrack::OnData(
     return;
   }
 
-  if (*csrc == kLoudestSpeakerCsrc) {
-    LOG(INFO) << "Ignoring loudest speaker indicator for mid: " << mid_;
-    return;
-  }
-
   // Audio data in PCM format is expected to be stored in a contiguous buffer,
   // where there are `number_of_channels * number_of_frames` audio frames.
   absl::Span<const int16_t> pcm_data_span =
@@ -81,6 +89,7 @@ void ConferenceAudioTrack::OnData(
                        .sample_rate = sample_rate,
                        .number_of_channels = number_of_channels,
                        .number_of_frames = number_of_frames,
+                       .is_from_loudest_speaker = is_from_loudest_speaker,
                        .contributing_source = csrc.value(),
                        .synchronization_source = ssrc.value()});
 };
diff --git a/native_with_state/internal/conference_media_tracks_test.cc b/native_with_state/internal/conference_media_tracks_test.cc
@@ -27,7 +27,6 @@
 #include "testing/base/public/mock-log.h"
 #include "absl/base/log_severity.h"
 #include "native_with_state/api/media_api_client_interface.h"
-#include "webrtc/api/rtp_headers.h"
 #include "webrtc/api/rtp_packet_info.h"
 #include "webrtc/api/rtp_packet_infos.h"
 #include "webrtc/api/scoped_refptr.h"
@@ -41,7 +40,6 @@ namespace meet {
 namespace {
 
 using ::base_logging::ERROR;
-using ::base_logging::INFO;
 using ::testing::_;
 using ::testing::kDoNotCaptureLogsYet;
 using ::testing::MockFunction;
@@ -50,23 +48,71 @@ using ::testing::ScopedMockLog;
 using ::testing::SizeIs;
 using ::testing::UnorderedElementsAre;
 
-TEST(ConferenceAudioTrackTest, CallsObserverWithAudioFrame) {
-  auto mock_receiver = rtc::scoped_refptr<webrtc::MockRtpReceiver>(
+TEST(ConferenceAudioTrackTest, CallsObserverWithAudioFrameFromLoudestSpeaker) {
+  rtc::scoped_refptr<webrtc::MockRtpReceiver> mock_receiver(
       new webrtc::MockRtpReceiver());
   webrtc::RtpSource csrc_rtp_source(
       webrtc::Timestamp::Micros(1234567890),
       /*source_id=*/123, webrtc::RtpSourceType::CSRC,
       /*rtp_timestamp=*/1111111,
-      {.audio_level = 100,
-       .absolute_capture_time =
-           webrtc::AbsoluteCaptureTime(1234567890, 1000000000)});
+      {.audio_level = 100, .absolute_capture_time = std::nullopt});
+  webrtc::RtpSource loudest_speaker_csrc_rtp_source(
+      webrtc::Timestamp::Micros(1234567890),
+      /*source_id=*/kLoudestSpeakerCsrc, webrtc::RtpSourceType::CSRC,
+      /*rtp_timestamp=*/1111111,
+      {.audio_level = 100, .absolute_capture_time = std::nullopt});
   webrtc::RtpSource ssrc_rtp_source(
       webrtc::Timestamp::Micros(1234567890),
       /*source_id=*/456, webrtc::RtpSourceType::SSRC,
       /*rtp_timestamp=*/2222222,
-      {.audio_level = 100,
-       .absolute_capture_time =
-           webrtc::AbsoluteCaptureTime(1234567890, 1000000000)});
+      {.audio_level = 100, .absolute_capture_time = std::nullopt});
+  EXPECT_CALL(*mock_receiver, GetSources)
+      .WillOnce(Return(std::vector<webrtc::RtpSource>{
+          std::move(csrc_rtp_source),
+          std::move(loudest_speaker_csrc_rtp_source),
+          std::move(ssrc_rtp_source)}));
+  MockFunction<void(AudioFrame)> mock_function;
+  std::optional<AudioFrame> received_frame;
+  EXPECT_CALL(mock_function, Call)
+      .WillOnce([&received_frame](AudioFrame frame) {
+        received_frame = std::move(frame);
+      });
+  ConferenceAudioTrack audio_track("mid", mock_receiver,
+                                   mock_function.AsStdFunction());
+  int16_t pcm_data[2 * 100];
+
+  audio_track.OnData(pcm_data,
+                     /*bits_per_sample=*/16,
+                     /*sample_rate=*/48000,
+                     /*number_of_channels=*/2,
+                     /*number_of_frames=*/100,
+                     /*absolute_capture_timestamp_ms=*/std::nullopt);
+
+  ASSERT_TRUE(received_frame.has_value());
+  EXPECT_THAT(received_frame->pcm16, SizeIs(100 * 2));
+  EXPECT_EQ(received_frame->bits_per_sample, 16);
+  EXPECT_EQ(received_frame->sample_rate, 48000);
+  EXPECT_EQ(received_frame->number_of_channels, 2);
+  EXPECT_EQ(received_frame->number_of_frames, 100);
+  EXPECT_TRUE(received_frame->is_from_loudest_speaker);
+  EXPECT_EQ(received_frame->contributing_source, 123);
+  EXPECT_EQ(received_frame->synchronization_source, 456);
+}
+
+TEST(ConferenceAudioTrackTest,
+     CallsObserverWithAudioFrameFromNonLoudestSpeaker) {
+  rtc::scoped_refptr<webrtc::MockRtpReceiver> mock_receiver(
+      new webrtc::MockRtpReceiver());
+  webrtc::RtpSource csrc_rtp_source(
+      webrtc::Timestamp::Micros(1234567890),
+      /*source_id=*/123, webrtc::RtpSourceType::CSRC,
+      /*rtp_timestamp=*/1111111,
+      {.audio_level = 100, .absolute_capture_time = std::nullopt});
+  webrtc::RtpSource ssrc_rtp_source(
+      webrtc::Timestamp::Micros(1234567890),
+      /*source_id=*/456, webrtc::RtpSourceType::SSRC,
+      /*rtp_timestamp=*/2222222,
+      {.audio_level = 100, .absolute_capture_time = std::nullopt});
   EXPECT_CALL(*mock_receiver, GetSources)
       .WillOnce(Return(std::vector<webrtc::RtpSource>{
           std::move(csrc_rtp_source), std::move(ssrc_rtp_source)}));
@@ -93,6 +139,7 @@ TEST(ConferenceAudioTrackTest, CallsObserverWithAudioFrame) {
   EXPECT_EQ(received_frame->sample_rate, 48000);
   EXPECT_EQ(received_frame->number_of_channels, 2);
   EXPECT_EQ(received_frame->number_of_frames, 100);
+  EXPECT_FALSE(received_frame->is_from_loudest_speaker);
   EXPECT_EQ(received_frame->contributing_source, 123);
   EXPECT_EQ(received_frame->synchronization_source, 456);
 }
@@ -119,15 +166,13 @@ TEST(ConferenceAudioTrackTest, LogsErrorWithUnsupportedBitsPerSample) {
 }
 
 TEST(ConferenceAudioTrackTest, LogsErrorWithMissingCsrc) {
-  auto mock_receiver = rtc::scoped_refptr<webrtc::MockRtpReceiver>(
+  rtc::scoped_refptr<webrtc::MockRtpReceiver> mock_receiver(
       new webrtc::MockRtpReceiver());
   webrtc::RtpSource ssrc_rtp_source(
       webrtc::Timestamp::Micros(1234567890),
       /*source_id=*/456, webrtc::RtpSourceType::SSRC,
       /*rtp_timestamp=*/2222222,
-      {.audio_level = 100,
-       .absolute_capture_time =
-           webrtc::AbsoluteCaptureTime(1234567890, 1000000000)});
+      {.audio_level = 0, .absolute_capture_time = std::nullopt});
   EXPECT_CALL(*mock_receiver, GetSources)
       .WillOnce(
           Return(std::vector<webrtc::RtpSource>{std::move(ssrc_rtp_source)}));
@@ -153,15 +198,13 @@ TEST(ConferenceAudioTrackTest, LogsErrorWithMissingCsrc) {
 }
 
 TEST(ConferenceAudioTrackTest, LogsErrorWithMissingSsrc) {
-  auto mock_receiver = rtc::scoped_refptr<webrtc::MockRtpReceiver>(
+  rtc::scoped_refptr<webrtc::MockRtpReceiver> mock_receiver(
       new webrtc::MockRtpReceiver());
   webrtc::RtpSource csrc_rtp_source(
       webrtc::Timestamp::Micros(1234567890),
       /*source_id=*/123, webrtc::RtpSourceType::CSRC,
       /*rtp_timestamp=*/1111111,
-      {.audio_level = 100,
-       .absolute_capture_time =
-           webrtc::AbsoluteCaptureTime(1234567890, 1000000000)});
+      {.audio_level = 100, .absolute_capture_time = std::nullopt});
   EXPECT_CALL(*mock_receiver, GetSources)
       .WillOnce(
           Return(std::vector<webrtc::RtpSource>{std::move(csrc_rtp_source)}));
@@ -186,7 +229,7 @@ TEST(ConferenceAudioTrackTest, LogsErrorWithMissingSsrc) {
   EXPECT_EQ(message, "AudioFrame is missing SSRC for mid: mid");
 }
 TEST(ConferenceAudioTrackTest, LogsErrorWithMissingCsrcAndSsrc) {
-  auto mock_receiver = rtc::scoped_refptr<webrtc::MockRtpReceiver>(
+  rtc::scoped_refptr<webrtc::MockRtpReceiver> mock_receiver(
       new webrtc::MockRtpReceiver());
   EXPECT_CALL(*mock_receiver, GetSources)
       .WillOnce(Return(std::vector<webrtc::RtpSource>()));
@@ -215,31 +258,27 @@ TEST(ConferenceAudioTrackTest, LogsErrorWithMissingCsrcAndSsrc) {
                                    "AudioFrame is missing SSRC for mid: mid"));
 }
 
-TEST(ConferenceAudioTrackTest, LogsIgnoringLoudestParticipantIndicator) {
-  auto mock_receiver = rtc::scoped_refptr<webrtc::MockRtpReceiver>(
+TEST(ConferenceAudioTrackTest, LogsErrorWithOnlyLoudestSpeakerCsrc) {
+  rtc::scoped_refptr<webrtc::MockRtpReceiver> mock_receiver(
       new webrtc::MockRtpReceiver());
   webrtc::RtpSource csrc_rtp_source(
       webrtc::Timestamp::Micros(1234567890),
       /*source_id=*/kLoudestSpeakerCsrc, webrtc::RtpSourceType::CSRC,
       /*rtp_timestamp=*/1111111,
-      {.audio_level = 100,
-       .absolute_capture_time =
-           webrtc::AbsoluteCaptureTime(1234567890, 1000000000)});
+      {.audio_level = 100, .absolute_capture_time = std::nullopt});
   webrtc::RtpSource ssrc_rtp_source(
       webrtc::Timestamp::Micros(1234567890),
       /*source_id=*/456, webrtc::RtpSourceType::SSRC,
       /*rtp_timestamp=*/2222222,
-      {.audio_level = 100,
-       .absolute_capture_time =
-           webrtc::AbsoluteCaptureTime(1234567890, 1000000000)});
+      {.audio_level = 100, .absolute_capture_time = std::nullopt});
   EXPECT_CALL(*mock_receiver, GetSources)
       .WillOnce(Return(std::vector<webrtc::RtpSource>{
           std::move(csrc_rtp_source), std::move(ssrc_rtp_source)}));
   ConferenceAudioTrack audio_track("mid", mock_receiver,
                                    [](AudioFrame /*frame*/) {});
   ScopedMockLog log(kDoNotCaptureLogsYet);
   std::string message;
-  EXPECT_CALL(log, Log(INFO, _, _))
+  EXPECT_CALL(log, Log(ERROR, _, _))
       .WillOnce([&message](int, const std::string &, const std::string &msg) {
         message = msg;
       });
@@ -253,7 +292,7 @@ TEST(ConferenceAudioTrackTest, LogsIgnoringLoudestParticipantIndicator) {
                      /*number_of_frames=*/100,
                      /*absolute_capture_timestamp_ms=*/std::nullopt);
 
-  EXPECT_EQ(message, "Ignoring loudest speaker indicator for mid: mid");
+  EXPECT_EQ(message, "AudioFrame is missing CSRC for mid: mid");
 }
 
 TEST(ConferenceVideoTrackTest, CallsObserverWithVideoFrame) {