Support appending to PcmData via a new append() method

tbarbugli · tbarbugli · commit f4d1a50bc055 · 2025-10-26T12:57:28.000+01:00
diff --git a/getstream/video/rtc/track_util.py b/getstream/video/rtc/track_util.py
@@ -38,7 +38,7 @@ class PcmData(NamedTuple):
 
     format: str
     sample_rate: int
-    samples: NDArray
+    samples: NDArray = np.array([], dtype=np.int16)
     pts: Optional[int] = None  # Presentation timestamp
     dts: Optional[int] = None  # Decode timestamp
     time_base: Optional[float] = None  # Time base for converting timestamps to seconds
@@ -521,7 +521,8 @@ def to_float32(self) -> "PcmData":
                 ).samples
 
         # Convert to float32 and scale if needed
-        if self.format == "s16" or (
+        fmt = (self.format or "").lower()
+        if fmt in ("s16", "int16") or (
             isinstance(arr, np.ndarray) and arr.dtype == np.int16
         ):
             arr_f32 = arr.astype(np.float32) / 32768.0
@@ -539,6 +540,174 @@ def to_float32(self) -> "PcmData":
             channels=self.channels,
         )
 
+    def append(self, other: "PcmData") -> "PcmData":
+        """Append another PcmData to this one and return a new instance.
+
+        The input chunk is adjusted to match this instance's sample rate,
+        channel count, and sample format before concatenation.
+
+        Args:
+            other: Chunk whose samples are placed after this instance's samples.
+
+        Returns:
+            A PcmData holding the combined samples. If the adjusted ``other``
+            has no samples, ``self`` itself is returned.
+
+        Notes:
+        - Preserves shape semantics: mono as 1D, multi-channel as 2D [channels, samples].
+        - Keeps metadata (sample_rate, format, channels, pts/dts/time_base) from self.
+        - Format names are compared case-insensitively; "float32" and "int16" are
+          treated as aliases of "f32" and "s16".
+        - For any other format, ``other`` is re-read through a bytes round-trip
+          (``to_bytes`` then ``from_bytes``) using self's format.
+        - If self has no samples, the adjusted chunk becomes the payload: it is
+          flattened to 1D for mono and cast to float32 for float formats, int16
+          otherwise.
+        - A 1D chunk appended to a multi-channel instance is replicated across
+          every channel.
+        - Does not modify self.
+
+        Example:
+            >>> a = PcmData(samples=np.array([1, 2], dtype=np.int16),
+            ...             sample_rate=16000, format="s16", channels=1)
+            >>> b = PcmData(samples=np.array([3], dtype=np.int16),
+            ...             sample_rate=16000, format="s16", channels=1)
+            >>> a.append(b).samples.tolist()
+            [1, 2, 3]
+        """
+        float_formats = ("f32", "float32")
+        int16_formats = ("s16", "int16")
+        fmt = (self.format or "").lower()
+
+        def _is_empty(arr: Any) -> bool:
+            """Return True for a numpy array that holds no samples."""
+            return isinstance(arr, np.ndarray) and arr.size == 0
+
+        def _ensure_ndarray(pcm: "PcmData") -> np.ndarray:
+            """Return pcm.samples as an ndarray (bytes round-trip if it is not one)."""
+            if isinstance(pcm.samples, np.ndarray):
+                return pcm.samples
+            return PcmData.from_bytes(
+                pcm.to_bytes(),
+                sample_rate=pcm.sample_rate,
+                format=pcm.format,
+                channels=pcm.channels,
+            ).samples
+
+        def _enforce_dtype(arr: np.ndarray) -> np.ndarray:
+            """Cast arr to the dtype self's format implies (unknown: unchanged)."""
+            if fmt in float_formats and arr.dtype != np.float32:
+                return arr.astype(np.float32)
+            if fmt in int16_formats and arr.dtype != np.int16:
+                return arr.astype(np.int16)
+            return arr
+
+        def _with_self_meta(samples: np.ndarray) -> "PcmData":
+            """Wrap samples in a new PcmData that carries self's metadata."""
+            return PcmData(
+                samples=samples,
+                sample_rate=self.sample_rate,
+                format=self.format,
+                pts=self.pts,
+                dts=self.dts,
+                time_base=self.time_base,
+                channels=self.channels,
+            )
+
+        def _to_cmaj(arr: np.ndarray, channels: int) -> np.ndarray:
+            """Normalize arr to channel-major layout: (channels, samples)."""
+            if arr.ndim == 2:
+                if arr.shape[0] == channels:
+                    return arr
+                if arr.shape[1] == channels:
+                    return arr.T
+                # Ambiguous; assume time-major and transpose
+                return arr.T
+            # 1D input: replicate across channels
+            return np.tile(arr.reshape(1, -1), (channels, 1))
+
+        # Adjust other to match sample rate and channels first
+        other_adj = other
+        if (
+            other_adj.sample_rate != self.sample_rate
+            or other_adj.channels != self.channels
+        ):
+            other_adj = other_adj.resample(
+                self.sample_rate, target_channels=self.channels
+            )
+
+        # Then adjust format to match
+        if fmt in float_formats:
+            other_adj = other_adj.to_float32()
+        elif fmt in int16_formats:
+            # Ensure int16 dtype and mark as s16
+            arr = _ensure_ndarray(other_adj)
+            if arr.dtype != np.int16:
+                # Float PCM must be scaled up to the int16 range. The "float32"
+                # alias has to match here too: a plain cast would truncate
+                # samples in (-1.0, 1.0) to zero.
+                if (other_adj.format or "").lower() in float_formats:
+                    arr = (np.clip(arr.astype(np.float32), -1.0, 1.0) * 32767.0).astype(
+                        np.int16
+                    )
+                else:
+                    # Any other dtype is cast directly, without rescaling
+                    arr = arr.astype(np.int16)
+            other_adj = PcmData(
+                samples=arr,
+                sample_rate=other_adj.sample_rate,
+                format="s16",
+                pts=other_adj.pts,
+                dts=other_adj.dts,
+                time_base=other_adj.time_base,
+                channels=other_adj.channels,
+            )
+        else:
+            # For unknown formats, fallback to bytes round-trip in self's format
+            other_adj = PcmData.from_bytes(
+                other_adj.to_bytes(),
+                sample_rate=self.sample_rate,
+                format=self.format,
+                channels=self.channels,
+            )
+
+        # Ensure ndarrays for concatenation
+        self_arr = _ensure_ndarray(self)
+        other_arr = _ensure_ndarray(other_adj)
+
+        # If either is empty, return the other while preserving self's metadata
+        if _is_empty(self_arr):
+            # Conform shape to target channels semantics and dtype
+            if isinstance(other_arr, np.ndarray):
+                if (self.channels or 1) == 1 and other_arr.ndim > 1:
+                    other_arr = other_arr.reshape(-1)
+                # Anything that is not a float format is stored as int16
+                target_dtype = np.float32 if fmt in float_formats else np.int16
+                other_arr = other_arr.astype(target_dtype, copy=False)
+            return _with_self_meta(other_arr)
+        if _is_empty(other_arr):
+            # Nothing to add: hand back self as-is
+            return self
+
+        ch = max(1, int(self.channels or 1))
+
+        # Concatenate respecting shape conventions
+        if ch == 1:
+            # Mono: keep 1D shape
+            if self_arr.ndim > 1:
+                self_arr = self_arr.reshape(-1)
+            if other_arr.ndim > 1:
+                other_arr = other_arr.reshape(-1)
+            out = np.concatenate([self_arr, other_arr])
+        else:
+            # Multi-channel: concatenate along the sample axis
+            self_cmaj = _to_cmaj(self_arr, ch)
+            other_cmaj = _to_cmaj(other_arr, ch)
+            out = np.concatenate([self_cmaj, other_cmaj], axis=1)
+
+        # Enforce dtype based on format
+        return _with_self_meta(_enforce_dtype(out))
+
     @classmethod
     def from_response(
         cls,
diff --git a/tests/rtc/test_pcm_data.py b/tests/rtc/test_pcm_data.py
@@ -293,3 +293,86 @@ def test_to_float32_converts_int16_and_preserves_metadata():
     f32_2 = f32.to_float32()
     assert f32_2.samples.dtype == np.float32
     assert np.allclose(f32_2.samples, f32.samples, atol=1e-7)
+
+
+def test_append_mono_s16_concatenates_and_preserves_format():
+    """Mono s16 chunks at one sample rate are joined end to end."""
+    rate = 16000
+    head = np.array([1, 2, 3, 4], dtype=np.int16)
+    tail = np.array([5, 6], dtype=np.int16)
+    pcm_head = PcmData(samples=head, sample_rate=rate, format="s16", channels=1)
+    pcm_tail = PcmData(samples=tail, sample_rate=rate, format="s16", channels=1)
+
+    joined = pcm_head.append(pcm_tail)
+
+    # Metadata comes from the chunk that was appended to
+    assert (joined.format, joined.channels, joined.sample_rate) == ("s16", 1, rate)
+    # Samples stay a 1D int16 array holding head followed by tail
+    assert isinstance(joined.samples, np.ndarray)
+    assert joined.samples.dtype == np.int16
+    assert joined.samples.ndim == 1
+    assert np.array_equal(joined.samples, np.concatenate([head, tail]))
+
+
+def test_append_resamples_and_converts_to_match_target_format():
+    """A mono s16 16kHz chunk is conformed to a stereo f32 48kHz target."""
+    rate = 48000
+
+    # Receiving chunk: float32 stereo at 48kHz
+    base = np.array([[0.0, 0.1, -0.1], [0.0, 0.1, -0.1]], dtype=np.float32)
+    base_len = base.shape[1]
+    pcm_target = PcmData(samples=base, sample_rate=rate, format="f32", channels=2)
+
+    # Appended chunk: int16 mono at 16kHz
+    other_raw = np.array([1000, -1000, 1000, -1000, 1000, -1000], dtype=np.int16)
+    pcm_other = PcmData(samples=other_raw, sample_rate=16000, format="s16", channels=1)
+
+    # Run the same resample/convert pipeline to learn how many samples get added
+    converted = pcm_other.resample(rate, target_channels=2).to_float32().samples
+    expected_added = converted.shape[1] if converted.ndim == 2 else converted.shape[0]
+
+    out = pcm_target.append(pcm_other)
+
+    # Format, channel count and dtype follow the target; first axis is channels
+    assert (out.format, out.channels) == ("f32", 2)
+    assert isinstance(out.samples, np.ndarray) and out.samples.dtype == np.float32
+    assert out.samples.shape[0] == 2
+
+    # Leading samples must equal the original base (append leaves them untouched)
+    assert np.allclose(out.samples[:, :base_len], base)
+
+    # Total length should be base + resampled other
+    assert out.samples.shape[1] == base_len + expected_added
+
+
+def test_append_empty_buffer_float32_adjusts_other_and_keeps_meta():
+    """An empty buffer adopts the adjusted chunk but keeps its own metadata."""
+    # Empty buffer (samples left at their empty default) using the "float32" alias
+    buffer = PcmData(format="float32", sample_rate=16000, channels=1)
+
+    # Other is int16 stereo at 48kHz, small ramp
+    other = np.array(
+        [[1000, -1000, 500, -500], [-1000, 1000, -500, 500]], dtype=np.int16
+    )
+    pcm_other = PcmData(samples=other, sample_rate=48000, format="s16", channels=2)
+
+    # Expected result if we first resample/downmix then convert to float32,
+    # flattened to 1D because the buffer is mono
+    expected_pcm = pcm_other.resample(16000, target_channels=1).to_float32()
+    expected_samples = expected_pcm.samples.reshape(-1)
+
+    out = buffer.append(pcm_other)
+
+    # Metadata must be exactly the buffer's, including the format spelling
+    assert out.format == buffer.format
+    assert out.sample_rate == 16000
+    assert out.channels == 1
+
+    # Data should match expected (mono float32)
+    assert isinstance(out.samples, np.ndarray)
+    assert out.samples.dtype == np.float32
+    assert out.samples.ndim == 1
+    # The buffer was empty, so the output is the adjusted chunk and nothing else;
+    # comparing shapes first keeps stray leading samples from going unnoticed
+    assert out.samples.shape == expected_samples.shape
+    assert np.allclose(out.samples, expected_samples)