Skip to content

Commit c8de84b

Browse files
authored
Merge pull request #103 from predict-idlab/lttbv2_cp
Lttbv2 🍒 ⛏️ branch
2 parents 1e4e069 + 06c4a33 commit c8de84b

File tree

7 files changed

+732
-473
lines changed

7 files changed

+732
-473
lines changed

build.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
import os
2+
import shutil
3+
import sys
4+
5+
from distutils.command.build_ext import build_ext
6+
from distutils.core import Distribution
7+
from distutils.core import Extension
8+
from distutils.errors import CCompilerError
9+
from distutils.errors import DistutilsExecError
10+
from distutils.errors import DistutilsPlatformError
11+
12+
import numpy as np
13+
14+
# C Extensions: master switch for compiling the LTTB C module.
with_extensions = True
17+
18+
def get_script_path():
    """Return the absolute directory that contains the executed script."""
    script_location = os.path.realpath(sys.argv[0])
    return os.path.dirname(script_location)
20+
21+
# List of C extensions to compile; empty when extensions are disabled.
extensions = (
    [
        Extension(
            name="plotly_resampler.aggregation.algorithms.lttbcv2",
            sources=["plotly_resampler/aggregation/algorithms/lttbcv2.c"],
            define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")],
            include_dirs=[np.get_include(), get_script_path()],
        )
    ]
    if with_extensions
    else []
)
31+
32+
33+
class BuildFailed(Exception):
    """Signals that compiling the C extensions did not succeed."""
36+
37+
38+
class ExtBuilder(build_ext):
    """A ``build_ext`` variant that reports C-extension build failures loudly.

    The caught distutils errors are re-raised so the caller (``build``) can
    decide whether a failed native build is fatal or just falls back to the
    pure-Python implementation.
    """

    # Kept for interface compatibility; callers may inspect this attribute.
    built_extensions = []

    def run(self):
        try:
            super().run()
        except (DistutilsPlatformError, FileNotFoundError) as exc:
            print(" Unable to build the C extensions.")
            raise exc

    def build_extension(self, ext):
        try:
            super().build_extension(ext)
        except (CCompilerError, DistutilsExecError, DistutilsPlatformError, ValueError) as exc:
            print(' Unable to build the "{}" C extension, '.format(ext.name))
            raise exc
56+
57+
58+
def build(setup_kwargs):
    """
    This function is mandatory in order to build the extensions.

    Builds the configured C extensions in a temporary build dir, copies the
    produced artifacts back into the source tree, and returns ``setup_kwargs``
    unchanged (poetry build-script convention).
    """
    dist = Distribution({"name": "plotly_resampler", "ext_modules": extensions})
    dist.package_dir = "plotly_resampler"

    builder = ExtBuilder(dist)
    builder.ensure_finalized()
    builder.run()

    # Copy each built artifact from the build dir back into the project tree.
    for built_file in builder.get_outputs():
        relative_extension = os.path.relpath(built_file, builder.build_lib)
        if not os.path.exists(built_file):
            continue

        shutil.copyfile(built_file, relative_extension)
        # Mirror the read bits onto the execute bits (0o444 >> 2 == 0o111)
        # so the copied shared object stays loadable for every reader.
        mode = os.stat(relative_extension).st_mode
        mode |= (mode & 0o444) >> 2
        os.chmod(relative_extension, mode)

    return setup_kwargs
81+
82+
83+
if __name__ == "__main__":
84+
build({})

plotly_resampler/aggregation/aggregation_interface.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ def _insert_gap_none(self, s: pd.Series) -> pd.Series:
9595
df_gap_idx = s.index.values[s_idx_diff > 3 * med_diff]
9696
if len(df_gap_idx):
9797
df_res_gap = pd.Series(
98-
index=df_gap_idx, data=None, name=s.name, copy=False
98+
index=df_gap_idx, data=None, name=s.name, copy=False, dtype=s.dtype
9999
)
100100

101101
if isinstance(df_res_gap.index, pd.DatetimeIndex):

plotly_resampler/aggregation/aggregators.py

Lines changed: 19 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@
1111

1212
import math
1313

14-
import lttbc
1514
import numpy as np
1615
import pandas as pd
1716

1817
from ..aggregation.aggregation_interface import AbstractSeriesAggregator
18+
from .algorithms import lttbcv2
1919

2020

2121
class LTTB(AbstractSeriesAggregator):
@@ -73,42 +73,19 @@ def __init__(self, interleave_gaps: bool = True, nan_position="end"):
7373
)
7474

7575
def _aggregate(self, s: pd.Series, n_out: int) -> pd.Series:
76-
# if we have categorical data, LTTB will convert the categorical values into
77-
# their numeric codes, i.e., the index position of the category array
7876
s_v = s.cat.codes.values if str(s.dtype) == "category" else s.values
79-
s_i = s.index.values
80-
81-
if s_i.dtype.type == np.datetime64:
82-
# lttbc does not support this datatype -> convert to int
83-
# (where the time is represented in ns)
84-
# REMARK:
85-
# -> additional logic is needed to mitigate rounding errors
86-
# First, the start offset is subtracted, after which the input series
87-
# is set in the already requested format, i.e. np.float64
88-
89-
# NOTE -> Rounding errors can still persist, but this approach is already
90-
# significantly less prone to it than the previos implementation.
91-
s_i0 = s_i[0].astype(np.int64)
92-
idx, data = lttbc.downsample(
93-
(s_i.astype(np.int64) - s_i0).astype(np.float64), s_v, n_out
94-
)
9577

96-
# add the start-offset and convert back to datetime
97-
idx = pd.to_datetime(
98-
idx.astype(np.int64) + s_i0, unit="ns", utc=True
99-
).tz_convert(s.index.tz)
100-
else:
101-
idx, data = lttbc.downsample(s_i, s_v, n_out)
102-
idx = idx.astype(s_i.dtype)
78+
s_i = s.index.values
79+
s_i = s_i.astype(np.int64) if s_i.dtype.type == np.datetime64 else s_i
10380

104-
if str(s.dtype) == "category":
105-
# reconvert the downsampled numeric codes to the category array
106-
data = np.vectorize(s.dtype.categories.values.item)(data.astype(s_v.dtype))
107-
else:
108-
# default case, use the series it's dtype as return type
109-
data = data.astype(s.dtype)
81+
index = lttbcv2.downsample_return_index(s_i, s_v, n_out)
11082

111-
return pd.Series(index=idx, data=data, name=str(s.name), copy=False)
83+
return pd.Series(
84+
index=s.index[index],
85+
data=s.values[index],
86+
name=str(s.name),
87+
copy=False,
88+
)
11289

11390

11491
class MinMaxOverlapAggregator(AbstractSeriesAggregator):
@@ -166,14 +143,14 @@ def _aggregate(self, s: pd.Series, n_out: int) -> pd.Series:
166143
# Calculate the argmin & argmax on the reshaped view of `s` &
167144
# add the corresponding offset
168145
argmin = (
169-
s.iloc[: block_size * offset.shape[0]]
170-
.values.reshape(-1, block_size)
146+
s.values[: block_size * offset.shape[0]]
147+
.reshape(-1, block_size)
171148
.argmin(axis=1)
172149
+ offset
173150
)
174151
argmax = (
175-
s.iloc[argmax_offset : block_size * offset.shape[0] + argmax_offset]
176-
.values.reshape(-1, block_size)
152+
s.values[argmax_offset : block_size * offset.shape[0] + argmax_offset]
153+
.reshape(-1, block_size)
177154
.argmax(axis=1)
178155
+ offset
179156
+ argmax_offset
@@ -231,14 +208,14 @@ def _aggregate(self, s: pd.Series, n_out: int) -> pd.Series:
231208
# Calculate the argmin & argmax on the reshaped view of `s` &
232209
# add the corresponding offset
233210
argmin = (
234-
s.iloc[: block_size * offset.shape[0]]
235-
.values.reshape(-1, block_size)
211+
s.values[: block_size * offset.shape[0]]
212+
.reshape(-1, block_size)
236213
.argmin(axis=1)
237214
+ offset
238215
)
239216
argmax = (
240-
s.iloc[: block_size * offset.shape[0]]
241-
.values.reshape(-1, block_size)
217+
s.values[: block_size * offset.shape[0]]
218+
.reshape(-1, block_size)
242219
.argmax(axis=1)
243220
+ offset
244221
)
@@ -297,7 +274,7 @@ def __init__(self, interleave_gaps: bool = True, nan_position="end"):
297274
)
298275

299276
def _aggregate(self, s: pd.Series, n_out: int) -> pd.Series:
300-
if s.shape[0] > n_out * 1_000:
277+
if s.shape[0] > n_out * 2_000:
301278
s = self.minmax._aggregate(s, n_out * 50)
302279
return self.lttb._aggregate(s, n_out)
303280

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
#define PY_SSIZE_T_CLEAN
2+
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
3+
#include <Python.h> // pull the python API into the current namespace
4+
#include <numpy/arrayobject.h>
5+
#include <numpy/npy_math.h>
6+
#include <math.h>
7+
8+
// This code is adapted from https://github.com/dgoeries/lttbc
9+
// Most credits are due to https://github.com/dgoeries
10+
11+
// Method below assumes the x-delta's are equidistant.
//
// Returns an int64 ndarray holding the *indices* of the points selected by
// the LTTB algorithm, so the caller can index the original series (which may
// be datetime- or category-typed) without any dtype round-tripping.
static PyObject* downsample_return_index(PyObject *self, PyObject *args) {
    int threshold;
    PyObject *x_obj = NULL, *y_obj = NULL;
    PyArrayObject *x_array = NULL, *y_array = NULL;

    if (!PyArg_ParseTuple(args, "OOi", &x_obj, &y_obj, &threshold))
        return NULL;

    if ((!PyArray_Check(x_obj) && !PyList_Check(x_obj)) || (!PyArray_Check(y_obj) && !PyList_Check(y_obj))) {
        PyErr_SetString(PyExc_TypeError, "Function requires x and y input to be of type list or ndarray ...");
        goto fail;
    }

    // Interpret the input objects as numpy arrays, with reqs (contiguous, aligned)
    x_array = (PyArrayObject *)PyArray_FROM_OTF(x_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
    y_array = (PyArrayObject *)PyArray_FROM_OTF(y_obj, NPY_DOUBLE, NPY_ARRAY_IN_ARRAY);
    if (x_array == NULL || y_array == NULL) {
        goto fail;
    }

    if (PyArray_NDIM(x_array) != 1 || PyArray_NDIM(y_array) != 1) {  // BUGFIX: removed stray ';' after '{'
        PyErr_SetString(PyExc_ValueError, "Both x and y must have a single dimension ...");
        goto fail;
    }

    if (!PyArray_SAMESHAPE(x_array, y_array)) {
        PyErr_SetString(PyExc_ValueError, "Input x and y must have the same shape ...");
        goto fail;
    }

    // Declare data length and check if we actually have to downsample!
    const Py_ssize_t data_length = (Py_ssize_t)PyArray_DIM(y_array, 0);
    if (threshold >= data_length || threshold <= 0) {
        // Nothing to do: return x unchanged.
        PyObject *value = Py_BuildValue("O", x_array);
        Py_DECREF(x_array);
        Py_DECREF(y_array);
        return value;
    }

    // Access the data in the NDArray!
    double *y = (double*)PyArray_DATA(y_array);

    // Create an empty int64 output array of length `threshold`.
    npy_intp dims[1];
    dims[0] = threshold;
    PyArrayObject *sampled_x = (PyArrayObject *)PyArray_Empty(
        1, dims, PyArray_DescrFromType(NPY_INT64), 0);
    if (sampled_x == NULL) {  // BUGFIX: allocation can fail
        goto fail;
    }
    long long *sampled_x_data = (long long*)PyArray_DATA(sampled_x);

    Py_ssize_t sampled_index = 0;

    // Always add the first point!
    sampled_x_data[sampled_index++] = 0;

    // BUGFIX: the original ran the main loop and unconditionally wrote the
    // last point, so threshold == 1 wrote index 1 of a 1-element array
    // (heap overflow).  With threshold < 3 there are no interior buckets.
    if (threshold >= 3) {
        const double every = (double)(data_length - 2) / (threshold - 2);

        Py_ssize_t a = 0;       // index of the previously selected point
        Py_ssize_t next_a = 0;
        Py_ssize_t i;
        for (i = 0; i < threshold - 2; ++i) {
            // Calculate point average for next bucket (containing c)
            double avg_x = 0;   // dead initializer in the original removed
            double avg_y = 0;
            Py_ssize_t avg_range_start = (Py_ssize_t)(floor((i + 1) * every) + 1);
            Py_ssize_t avg_range_end = (Py_ssize_t)(floor((i + 2) * every) + 1);
            if (avg_range_end >= data_length) {
                avg_range_end = data_length;
            }
            Py_ssize_t avg_range_length = avg_range_end - avg_range_start;

            for (; avg_range_start < avg_range_end; avg_range_start++) {
                avg_y += y[avg_range_start];
            }
            avg_y /= avg_range_length;
            // x's are assumed equidistant, so the bucket's mean x is derived
            // from the (already advanced) range start.
            avg_x = avg_range_start + every / 2;

            // Get the range for this bucket
            Py_ssize_t range_offs = (Py_ssize_t)(floor((i + 0) * every) + 1);
            Py_ssize_t range_to = (Py_ssize_t)(floor((i + 1) * every) + 1);

            // Point a
            double point_a_y = y[a];

            double max_area = -1.0;
            for (; range_offs < range_to; range_offs++) {
                // Calculate triangle area over three buckets
                double area = fabs((a - avg_x) * (y[range_offs] - point_a_y) - (a - range_offs) * (avg_y - point_a_y)) * 0.5;
                if (area > max_area) {
                    max_area = area;
                    next_a = range_offs; // Next a is this b
                }
            }
            // Pick this point from the bucket
            sampled_x_data[sampled_index++] = next_a;

            // Current a becomes the next_a (chosen b)
            a = next_a;
        }
    }

    // Always add the last point (guard keeps threshold == 1 in bounds).
    if (sampled_index < threshold) {
        sampled_x_data[sampled_index] = data_length - 1;
    }

    // Provide our return value
    PyObject *value = Py_BuildValue("O", sampled_x);

    // And remove the references!
    Py_DECREF(x_array);
    Py_DECREF(y_array);
    Py_DECREF(sampled_x);

    return value;

fail:
    // BUGFIX: x_array may still be NULL here (the type check above jumps to
    // fail before any conversion) and Py_DECREF(NULL) crashes -> Py_XDECREF.
    Py_XDECREF(x_array);
    Py_XDECREF(y_array);
    return NULL;
}
138+
139+
140+
// Method definition object
141+
static PyMethodDef lttbcv2Methods[] = {
142+
{
143+
"downsample_return_index", // The name of the method
144+
downsample_return_index, // Function pointer to the method implementation
145+
METH_VARARGS,
146+
"Compute the largest triangle three buckets (LTTB) algorithm in a C extension."
147+
},
148+
{NULL, NULL, 0, NULL} /* Sentinel */
149+
};
150+
151+
// Module definition object.
static struct PyModuleDef lttbc_module_definition = {
    PyModuleDef_HEAD_INIT,
    "lttbcv2",  /* name of module */
    "A Python module that computes the largest triangle three buckets algorithm (LTTB) using C code.",
    -1,  /* module keeps its state in global variables */
    lttbcv2Methods,
};
159+
160+
// Module initialization: called by the interpreter on `import lttbcv2`.
PyMODINIT_FUNC PyInit_lttbcv2(void) {
    // BUGFIX: removed the Py_Initialize() call -- the interpreter is already
    // running when an extension module's init function is invoked, and
    // initializing it from here is incorrect per the CPython extending docs.
    import_array();  // initialize the numpy C API; returns NULL on failure
    return PyModule_Create(&lttbc_module_definition);
}

0 commit comments

Comments
 (0)