♻️ add Python interface

jvdd · jvdd · commit 08caffdab306 · 2022-11-24T22:39:16.000+01:00
diff --git a/downsample_rs/src/lttb/scalar.rs b/downsample_rs/src/lttb/scalar.rs
@@ -104,18 +104,22 @@ pub fn lttb_without_x<Ty: Num>(y: ArrayView1<Ty>, n_out: usize) -> Array1<usize>
         // Slicing seems to be a lot slower
         // let avg_x: Tx = x.slice(s![avg_range_start..avg_range_end]).sum();
         let avg_y: f64 = avg_y.to_f64() / (avg_range_end - avg_range_start) as f64;
+        let avg_x: f64 = (avg_range_start + avg_range_end - 1) as f64 / 2.0;
 
         // Get the range for this bucket
         let range_offs = (every * i as f64) as usize + 1;
         let range_to = (every * (i + 1) as f64) as usize + 1;
 
         // Point a
         let point_ay = y[a].to_f64();
+        let point_ax = a as f64;
 
         let mut max_area = -1.0;
         for i in range_offs..range_to {
             // Calculate triangle area over three buckets
-            let area = ((y[i].to_f64() - point_ay) - (avg_y - point_ay)).abs();
+            let area = ((point_ax - avg_x) * (y[i].to_f64() - point_ay)
+                - (point_ax - i as f64) * (avg_y - point_ay))
+                .abs();
             if area > max_area {
                 max_area = area;
                 a = i;
@@ -167,13 +171,9 @@ mod tests {
             let n = 5_000;
             let x: Array1<i32> = Array1::from((0..n).map(|i| i as i32).collect::<Vec<i32>>());
             let y = utils::get_random_array(n, f32::MIN, f32::MAX);
-            let sampled_indices = lttb(x.view(), y.view(), 200);
+            let sampled_indices1 = lttb(x.view(), y.view(), 200);
             let sampled_indices2 = lttb_without_x(y.view(), 200);
-            // TODO: for some reason the second last point is off..
-            assert_eq!(
-                sampled_indices.slice(s![0..198]),
-                sampled_indices2.slice(s![0..198])
-            );
+            assert_eq!(sampled_indices1, sampled_indices2);
         }
     }
 }
diff --git a/src/lib.rs b/src/lib.rs
@@ -122,7 +122,7 @@ use downsample_rs::minmax as minmax_mod;
 
 // Create a sub module for the minmax algorithm
 #[pymodule]
-fn min_max(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
+fn minmax(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
     // ----------------- SCALAR
 
     let scalar_mod = PyModule::new(_py, "scalar")?;
@@ -320,15 +320,15 @@ fn minmaxlttb(_py: Python, m: &PyModule) -> PyResult<()> {
 
 #[pymodule] // The super module
 fn tsdownsample_rs(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
-    m.add_wrapped(wrap_pymodule!(min_max))?;
+    m.add_wrapped(wrap_pymodule!(minmax))?;
     m.add_wrapped(wrap_pymodule!(m4))?;
     m.add_wrapped(wrap_pymodule!(lttb))?;
     m.add_wrapped(wrap_pymodule!(minmaxlttb))?;
 
     _py.run(
         "\
 import sys
-sys.modules['tsdownsample_rs.min_max'] = min_max
+sys.modules['tsdownsample_rs.minmax'] = minmax
 sys.modules['tsdownsample_rs.m4'] = m4
 sys.modules['tsdownsample_rs.lttb'] = lttb
 sys.modules['tsdownsample_rs.minmaxlttb'] = minmaxlttb
diff --git a/tsdownsample/__init__.py b/tsdownsample/__init__.py
@@ -1,4 +1,4 @@
-MinMaxAggregator = RustDownsamplingInterface(resampling_rs.minmax)
-M4Aggregator = RustDownsamplingInterface(resampling_rs.m4)
-LTTBAggregator = RustDownsamplingInterface(resampling_rs.lttb)
-MinMaxLTTBAggregator = RustDownsamplingInterface(resampling_rs.minmax_lttb)
+__version__ = "0.1.0a1"
+__author__ = "Jeroen Van Der Donckt"
+
+from .downsamplers import *
diff --git a/tsdownsample/downsamplers.py b/tsdownsample/downsamplers.py
@@ -1,13 +1,30 @@
+# ------------------ Rust Downsamplers ------------------
 import tsdownsample._rust.tsdownsample_rs as tsdownsample_rs
 from .downsampling_interface import RustDownsamplingInterface
 
-# ------------------ Rust Downsamplers ------------------
+MinMaxDownsampler = RustDownsamplingInterface("MinMax", tsdownsample_rs.minmax)
+M4Downsampler = RustDownsamplingInterface("M4", tsdownsample_rs.m4)
+LTTBDownsampler = RustDownsamplingInterface("LTTB", tsdownsample_rs.lttb)
+MinMaxLTTBDownsampler = RustDownsamplingInterface("MinMaxLTTB", tsdownsample_rs.minmaxlttb)
+
+# ------------------ Function Downsamplers ------------------
+import numpy as np
+from .downsampling_interface import FuncDownsamplingInterface
+
+MeanDownsampler = FuncDownsamplingInterface("Mean", np.mean)
+MedianDownsampler = FuncDownsamplingInterface("Median", np.median)
+
+# ------------------ EveryNth Downsampler ------------------
+import math
+import pandas as pd
+from .downsampling_interface import DownsampleInterface
 
-MinMaxDownsampler = RustDownsamplingInterface(tsdownsample_rs.minmax)
-M4Downsampler = RustDownsamplingInterface(tsdownsample_rs.m4)
-LTTBDownsampler = RustDownsamplingInterface(tsdownsample_rs.lttb)
-MinMaxLTTBDownsampler = RustDownsamplingInterface(tsdownsample_rs.minmax_lttb)
+class _EveryNthDownsampler(DownsampleInterface):
 
-# ------------------ Python Downsamplers ------------------
+    def __init__(self) -> None:
+        super().__init__("EveryNth")
+
+    def downsample(self, s: pd.Series, n_out: int, parallel: bool = False) -> pd.Series:
+        return s[:: max(1, math.ceil(len(s) / n_out))]  # keep every step-th row
 
-MeanDownsampler = PythonDownsamplingInterface(np.mean)
+EveryNthDownsampler = _EveryNthDownsampler()
diff --git a/tsdownsample/downsampling_interface.py b/tsdownsample/downsampling_interface.py
@@ -9,8 +9,8 @@
 
 class DownsampleInterface(ABC):
 
-    def __init__(self) -> None:
-        super().__init__()
+    def __init__(self, name: str) -> None:
+        self.name = name
 
     @staticmethod
     def _construct_output_series(s: pd.Series, idxs: np.ndarray) -> pd.Series:
@@ -29,7 +29,7 @@ def _supports_dtype(self, s: pd.Series):
             f"{s.dtype} doesn't match with any regex in {self.dtype_regex_list}"
         )
 
-    def downsample(self, s: pd.Series, n_out: int, parallel: bool = False) -> pd.Series
+    def downsample(self, s: pd.Series, n_out: int, parallel: bool = False) -> pd.Series:
         """Downsample a pandas series to n_out samples.
 
         Parameters
@@ -47,6 +47,9 @@ def downsample(self, s: pd.Series, n_out: int, parallel: bool = False) -> pd.Ser
             The downsampled series.
         """
         raise NotImplementedError
+    
+    def __repr__(self) -> str:
+        return f"{self.name}"
 
 # ------------------- Rust Downsample Interface -------------------
 
@@ -59,14 +62,14 @@ def _switch_mod_with_y(y_dtype: np.dtype, mod: ModuleType, downsample_func: str
     ----------
     y_dtype : np.dtype
         The dtype of the y-data
-    mod : Module
+    mod : ModuleType
         The module to select the appropriate function from
     downsample_func : str, optional
         The name of the function to use, by default DOWNSAMPLE_FUNC.
     """
     # FLOATS
     if np.issubdtype(y_dtype, np.floating):
-        if y.dtype == np.float16:
+        if y_dtype == np.float16:
             return getattr(mod, downsample_func + '_f16')
         elif y_dtype == np.float32:
             return getattr(mod, downsample_func + '_f32')
@@ -105,33 +108,33 @@ def _switch_mod_with_x_and_y(x_dtype: np.dtype, y_dtype: np.dtype, mod: ModuleTy
         The dtype of the x-data
     y_dtype : np.dtype
         The dtype of the y-data
-    mod : Module
+    mod : ModuleType
         The module to select the appropriate function from
     """
     # FLOATS
     if np.issubdtype(x_dtype, np.floating):
         if x_dtype == np.float16:
-            return switch_mod_with_y(y_dtype, mod, f'{DOWNSAMPLE_F}_f16')
+            return _switch_mod_with_y(y_dtype, mod, f'{DOWNSAMPLE_F}_f16')
         elif x_dtype == np.float32:
-            return switch_mod_with_y(y_dtype, mod, f'{DOWNSAMPLE_F}_f32')
+            return _switch_mod_with_y(y_dtype, mod, f'{DOWNSAMPLE_F}_f32')
         elif x_dtype == np.float64:
-            return switch_mod_with_y(y_dtype, mod, f'{DOWNSAMPLE_F}_f64')
+            return _switch_mod_with_y(y_dtype, mod, f'{DOWNSAMPLE_F}_f64')
     # INTS
-    elif np.issubdtype(x_dtype, np.integer):
+    elif np.issubdtype(x_dtype, np.signedinteger):  # np.integer also matches uints
         if x_dtype == np.int16:
-            return switch_mod_with_y(y_dtype, mod, f'{DOWNSAMPLE_F}_i16')
+            return _switch_mod_with_y(y_dtype, mod, f'{DOWNSAMPLE_F}_i16')
         elif x_dtype == np.int32:
-            return switch_mod_with_y(y_dtype, mod, f'{DOWNSAMPLE_F}_i32')
+            return _switch_mod_with_y(y_dtype, mod, f'{DOWNSAMPLE_F}_i32')
         elif x_dtype == np.int64:
-            return switch_mod_with_y(y_dtype, mod, f'{DOWNSAMPLE_F}_i64')
+            return _switch_mod_with_y(y_dtype, mod, f'{DOWNSAMPLE_F}_i64')
     # UINTS
     elif np.issubdtype(x_dtype, np.unsignedinteger):
         if x_dtype == np.uint16:
-            return switch_mod_with_y(y_dtype, mod, f'{DOWNSAMPLE_F}_u16')
+            return _switch_mod_with_y(y_dtype, mod, f'{DOWNSAMPLE_F}_u16')
         elif x_dtype == np.uint32:
-            return switch_mod_with_y(y_dtype, mod, f'{DOWNSAMPLE_F}_u32')
+            return _switch_mod_with_y(y_dtype, mod, f'{DOWNSAMPLE_F}_u32')
         elif x_dtype == np.uint64:
-            return switch_mod_with_y(y_dtype, mod, f'{DOWNSAMPLE_F}_u64')
+            return _switch_mod_with_y(y_dtype, mod, f'{DOWNSAMPLE_F}_u64')
     # BOOLS
     # TODO: support bools
     # elif data_dtype == np.bool:
@@ -140,14 +143,25 @@ def _switch_mod_with_x_and_y(x_dtype: np.dtype, y_dtype: np.dtype, mod: ModuleTy
 
 class RustDownsamplingInterface(DownsampleInterface):
 
-    def __init__(self, resampling_mod: Module) -> None:
-        self._mod = resampling_mod
-        if hasattr(self.mod, 'simd'):
-            self.mod_single_core = self._mod.simd
-            self.mod_multi_core = self._mod.simd_parallel
-        else:
-            self.mod_single_core = self._mod.scalar
-            self.mod_multi_core = self._mod.scalar_parallel
+    def __init__(self, name: str, resampling_mod: ModuleType) -> None:
+        """Bind the Rust sub-modules of `resampling_mod` used for downsampling.
+
+        The display name is `name` suffixed with " [tsdownsample_rs]".
+        """
+        super().__init__(name + " [tsdownsample_rs]")
+        self.rust_mod = resampling_mod
+
+        # Single-core sub module: SIMD when present, otherwise scalar.
+        # `scalar` is looked up unconditionally, so it must always exist.
+        scalar_mod = self.rust_mod.scalar
+        self.mod_single_core = getattr(self.rust_mod, "simd", scalar_mod)
+
+        # Multi-core sub module: SIMD first, then scalar, else None (absent).
+        self.mod_multi_core = getattr(
+            self.rust_mod,
+            "simd_parallel",
+            getattr(self.rust_mod, "scalar_parallel", None),
+        )
         
     def _downsample_without_x(self, s: pd.Series, n_out: int) -> pd.Series:
         downsample_method = _switch_mod_with_y(s.dtype, self.mod_single_core)
@@ -170,7 +184,10 @@ def _downsample_with_x_parallel(self, s: pd.Series, n_out: int) -> pd.Series:
         return self._construct_output_series(s, idxs)
 
     def downsample(self, s: pd.Series, n_out: int, parallel: bool = False) -> pd.Series:
-        if s.index.freq is None:  # TODO: or the other way around??
+        # Evenly spaced index (RangeIndex or a set freq) -> downsample without x.
+        # getattr: index types without a `freq` attribute must not raise here.
+        index_freq = getattr(s.index, "freq", None)
+        if isinstance(s.index, pd.RangeIndex) or index_freq is not None:
             if parallel:
                 return self._downsample_without_x_parallel(s, n_out)
             else:
@@ -183,10 +200,39 @@ def downsample(self, s: pd.Series, n_out: int, parallel: bool = False) -> pd.Ser
 
 # ------------------ Numpy Downsample Interface ------------------ 
 
-class NumpyDownsamplingInterface():
+class FuncDownsamplingInterface(DownsampleInterface):
 
-    def __init__(self, resampling_func: Callable) -> None:
-        self._func = resampling_func
+    def __init__(self, name: str, downsample_func: Callable) -> None:
+        super().__init__("[Func]_" + name)
+        self.downsample_func = downsample_func
 
     def downsample(self, s: pd.Series, n_out: int, parallel: bool = False) -> pd.Series:
-        
+        """Aggregate consecutive windows of `s` with `self.downsample_func`.
+
+        A DatetimeIndex is resampled with a bin width of (t_end - t_start) / n_out;
+        any other index is cut into runs of ceil(len(s) / n_out) rows.
+        The `parallel` argument is not read by this implementation.
+        """
+        if isinstance(s.index, pd.DatetimeIndex):
+            t_start, t_end = s.index[0], s.index[-1]
+            rate = (t_end - t_start) / n_out
+            return s.resample(rate).apply(self.downsample_func).dropna()
+
+        # no time index -> use the every nth heuristic
+        # int(): np.ceil yields a float, which np.repeat rejects as repeat count
+        group_size = max(1, int(np.ceil(len(s) / n_out)))
+        # group labels [0, 0, ..., 1, 1, ...]: each label spans group_size rows
+        labels = np.repeat(np.arange(n_out), group_size)[: len(s)]
+        s_out = s.groupby(by=labels).agg(self.downsample_func).dropna()
+
+        # Estimate an index position per window: the row right after the window
+        # (window number + 1, times the group size); the last one is clamped to
+        # the final row. TODO: add option to select start / middle / end as index
+        idx_locs = (np.arange(len(s_out)) + 1) * group_size
+        idx_locs[-1] = len(s) - 1
+        return pd.Series(
+            index=s.index[idx_locs],  # positional lookup, works for any index dtype
+            data=s_out.values,
+            name=str(s.name),
+            copy=False,
+        )