Skip to content

Commit e4a52e7

Browse files
committed
0.0.182
1 parent 3923b5d commit e4a52e7

File tree

9 files changed

+193
-40
lines changed

9 files changed

+193
-40
lines changed

Makefile

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,17 @@
11
lint:
2-
python -m pip install --quiet --upgrade pycln isort ruff yamllint
2+
python -m pip install --upgrade pip uv
3+
python -m uv pip install --quiet --upgrade pycln isort ruff yamllint cython-lint
34
# python -m yamllint .
5+
# cython-lint orso/compute/*.pyx
46
python -m ruff check --fix --exit-zero
57
python -m pycln .
68
python -m isort .
79
python -m ruff format orso
810

911
update:
10-
python -m pip install --quiet --upgrade -r requirements.txt
11-
python -m pip install --quiet --upgrade -r tests/requirements.txt
12+
python -m pip install --upgrade pip uv
13+
python -m uv pip install --upgrade -r tests/requirements.txt
14+
python -m uv pip install --upgrade -r requirements.txt
1215

1316
test:
1417
python -m pip install --quiet --upgrade pytest coverage

orso/compute/compiled.pyx

Lines changed: 67 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,11 @@ from cython cimport int
2525
from datetime import datetime
2626
from ormsgpack import unpackb
2727
from orso.exceptions import DataError
28-
from typing import Dict, Any, Tuple
29-
from libc.stdlib cimport malloc, free
3028
import numpy as np
31-
cimport cython
3229
cimport numpy as cnp
3330
from numpy cimport ndarray
34-
from libc.stdint cimport int32_t
31+
from libc.stdint cimport int32_t, int64_t
32+
from cpython.dict cimport PyDict_GetItem
3533

3634
cnp.import_array()
3735

@@ -47,12 +45,12 @@ cpdef from_bytes_cython(bytes data):
4745
# Validate header and size, now using pointer arithmetic
4846
if length < HEADER_SIZE or (data_ptr[0] & 0xF0 != 0x10):
4947
raise DataError("Data malformed")
50-
48+
5149
# Deserialize record bytes
5250
cdef Py_ssize_t record_size = (
5351
(<unsigned char>data_ptr[2]) << 24 |
5452
(<unsigned char>data_ptr[3]) << 16 |
55-
(<unsigned char>data_ptr[4]) << 8 |
53+
(<unsigned char>data_ptr[4]) << 8 |
5654
(<unsigned char>data_ptr[5])
5755
)
5856

@@ -64,7 +62,6 @@ cpdef from_bytes_cython(bytes data):
6462
cdef list processed_list = []
6563
cdef object item
6664

67-
6865
for item in raw_tuple:
6966
if isinstance(item, list) and len(item) == 2 and item[0] == "__datetime__":
7067
processed_list.append(datetime.fromtimestamp(item[1]))
@@ -73,47 +70,74 @@ cpdef from_bytes_cython(bytes data):
7370

7471
return tuple(processed_list)
7572

76-
7773
cpdef tuple extract_dict_columns(dict data, tuple fields):
78-
cdef int i
79-
cdef str field
80-
cdef list field_data = [None] * len(fields) # Preallocate list size
74+
"""
75+
Extracts the given fields from a dictionary and returns them as a tuple.
76+
77+
Parameters:
78+
data: dict
79+
The dictionary to extract fields from.
80+
fields: tuple
81+
The field names to extract.
82+
83+
Returns:
84+
A tuple containing values from the dictionary for the requested fields.
85+
Missing fields will have None.
86+
"""
87+
cdef int64_t i, num_fields = len(fields)
88+
cdef void* value_ptr
89+
cdef list field_data = [None] * num_fields
90+
91+
for i in range(num_fields):
92+
value_ptr = PyDict_GetItem(data, fields[i])
93+
if value_ptr != NULL:
94+
field_data[i] = <object>value_ptr
95+
else:
96+
field_data[i] = None
8197

82-
for i, field in enumerate(fields):
83-
if field in data:
84-
field_data[i] = data[field]
8598
return tuple(field_data) # Convert list to tuple
8699

87100

88-
cpdef cnp.ndarray collect_cython(list rows, cnp.ndarray[cnp.int32_t, ndim=1] columns, int limit=-1):
101+
cpdef cnp.ndarray collect_cython(list rows, int32_t[:] columns, int limit=-1):
89102
"""
90103
Collects columns from a list of tuples (rows).
91104
"""
92-
cdef int32_t i, j, col_idx
93-
cdef int32_t num_rows = len(rows)
94-
cdef int32_t num_cols = columns.shape[0]
95-
cdef cnp.ndarray row
105+
cdef int64_t i, j, col_idx
106+
cdef int64_t num_rows = len(rows)
107+
cdef int64_t num_cols = columns.shape[0]
108+
cdef int64_t row_width = len(rows[0]) if num_rows > 0 else 0
96109

110+
# Check if limit is set and within bounds
97111
if limit >= 0 and limit < num_rows:
98112
num_rows = limit
99113

114+
# Check if there are any rows or columns and exit early
115+
if num_rows == 0 or num_cols == 0:
116+
return np.empty((num_cols, num_rows), dtype=object)
117+
118+
# Check if columns are within bounds
119+
for j in range(num_cols):
120+
col_idx = columns[j]
121+
if col_idx < 0 or col_idx > row_width:
122+
raise IndexError(f"Column index out of bounds (0 < {col_idx} < {row_width})")
123+
100124
# Initialize result memory view with pre-allocated numpy arrays for each column
101-
cdef cnp.ndarray result = np.empty((num_cols, num_rows), dtype=object)
125+
cdef object[:, :] result = np.empty((num_cols, num_rows), dtype=object)
102126

103127
# Populate each column one at a time
104128
for j in range(num_cols):
105129
col_idx = columns[j]
106130
for i in range(num_rows):
107131
result[j, i] = rows[i][col_idx]
108-
132+
109133
# Convert each column back to a list and return the list of lists
110-
return result
134+
return np.asarray(result)
111135

112136

113137
cpdef int calculate_data_width(cnp.ndarray column_values):
114138
cdef int width, max_width
115139
cdef object value
116-
140+
117141
max_width = 4 # Default width
118142
for value in column_values:
119143
if value is not None:
@@ -124,11 +148,29 @@ cpdef int calculate_data_width(cnp.ndarray column_values):
124148
return max_width
125149

126150

151+
from cpython.list cimport PyList_New, PyList_SET_ITEM
152+
127153
def process_table(table, row_factory, int max_chunksize) -> list:
128-
cdef list rows = []
154+
"""
155+
Processes a PyArrow table and applies a row factory function to each row.
156+
157+
Parameters:
158+
table: PyArrow Table
159+
The input table to process.
160+
row_factory: function
161+
A function applied to each row.
162+
max_chunksize: int
163+
The batch size to process at a time.
164+
165+
Returns:
166+
A list of transformed rows.
167+
"""
168+
cdef list rows = [None] * table.num_rows
169+
cdef int64_t i = 0
129170

130171
for batch in table.to_batches(max_chunksize):
131172
df = batch.to_pandas().replace({np.nan: None})
132173
for row in df.itertuples(index=False, name=None):
133-
rows.append(row_factory(row))
174+
rows[i] = row_factory(row)
175+
i += 1
134176
return rows

orso/compute/varchar_array.pyx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,4 +117,4 @@ def unpack_byte_array(const unsigned char[::1] raw_bytes, Py_ssize_t n, const ch
117117
bytecount -= 4 + itemlen
118118
i += 1
119119

120-
return out
120+
return out

orso/dataframe.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ def polars(self, size=None):
125125

126126
return to_polars(self, size)
127127

128-
def nbytes(self):
128+
def nbytes(self) -> int:
129129
"""Approximate the number of bytes used by the DataFrame"""
130130
self.materialize()
131131
if self._nbytes is None:
@@ -140,10 +140,10 @@ def append(self, entry):
140140
self._nbytes += new_row.nbytes()
141141
self._cursor = None
142142

143-
def head(self, size: int = 5):
143+
def head(self, size: int = 5) -> "DataFrame":
144144
return self.slice(0, size)
145145

146-
def tail(self, size: int = 5):
146+
def tail(self, size: int = 5) -> "DataFrame":
147147
return self.slice(offset=0 - size, length=size)
148148

149149
def query(self, predicate) -> "DataFrame":

orso/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,5 @@
1010
# See the License for the specific language governing permissions and
1111
# limitations under the License.
1212

13-
__version__: str = "0.0.181"
13+
__version__: str = "0.0.182"
1414
__author__: str = "@joocer"

tests/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def run_tests():
2323
for index, method in enumerate(test_methods):
2424
start_time = time.monotonic_ns()
2525
test_name = f"\033[38;2;255;184;108m{(index + 1):04}\033[0m \033[38;2;189;147;249m{str(method.__name__)}\033[0m"
26-
print(test_name.ljust(display_width - 20), end="")
26+
print(test_name.ljust(display_width - 20), end="", flush=True)
2727
error = None
2828
output = ""
2929
try:

tests/test_compiled.py

Lines changed: 58 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,74 @@
11
import os
22
import sys
3+
import numpy
34

45
sys.path.insert(1, os.path.join(sys.path[0], ".."))
56

67
from orso.compute.compiled import collect_cython
7-
import numpy
88

99
def test_collector():
10-
1110
columns = collect_cython([(1, 2), (2, 1), (7, 8)], numpy.array([1, 0], dtype=numpy.int32))
1211
assert len(columns) == 2
1312
assert len(columns[0]) == 3
1413
assert sum(columns[0]) == 11
1514
assert sum(columns[1]) == 10
1615

17-
if __name__ == "__main__": # prgama: nocover
16+
def test_collector_empty_input():
17+
columns = collect_cython([], numpy.array([], dtype=numpy.int32))
18+
assert len(columns) == 0, len(columns)
19+
20+
def test_collector_single_tuple():
21+
columns = collect_cython([(5, 10)], numpy.array([1], dtype=numpy.int32))
22+
assert len(columns) == 1, len(columns)
23+
assert columns[0] == [10]
24+
25+
def test_collector_large_data():
26+
data = [(i, i * 2) for i in range(10000)]
27+
index = numpy.array([1, 0], dtype=numpy.int32)
28+
columns = collect_cython(data, index)
29+
assert len(columns) == 2
30+
assert len(columns[0]) == 10000
31+
assert sum(columns[0]) == sum(i * 2 for i in range(10000))
32+
assert sum(columns[1]) == sum(range(10000))
33+
34+
35+
def test_collector_non_integer_index():
36+
data = [(1, 2), (3, 4)]
37+
index = numpy.array([0.5, 1.5], dtype=numpy.float64)
38+
try:
39+
collect_cython(data, index)
40+
assert False, "Expected a ValueError"
41+
except ValueError:
42+
pass
43+
44+
def test_collector_negative_index():
45+
data = [(1, 2), (3, 4)]
46+
index = numpy.array([-1, 0], dtype=numpy.int32)
47+
try:
48+
collect_cython(data, index)
49+
assert False, "Expected an IndexError"
50+
except IndexError:
51+
pass
52+
53+
def test_collector_large_index_values():
54+
data = [(1, 2), (3, 4)]
55+
index = numpy.array([100, 200], dtype=numpy.int32)
56+
try:
57+
collect_cython(data, index)
58+
assert False, "Expected an IndexError"
59+
except IndexError:
60+
pass
61+
62+
def test_collector_duplicate_indices():
63+
data = [(1, 2), (3, 4), (5, 6)]
64+
index = numpy.array([1, 1, 0], dtype=numpy.int32)
65+
columns = collect_cython(data, index)
66+
assert len(columns) == 3
67+
assert sum(columns[0]) == 12, sum(columns[0])
68+
assert sum(columns[1]) == 12, sum(columns[1])
69+
assert sum(columns[2]) == 9, sum(columns[2])
70+
71+
if __name__ == "__main__": # pragma: nocover
1872
from tests import run_tests
19-
test_collector()
73+
2074
run_tests()

tests/test_display.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def find_all_substrings(s: str, sub: str) -> List[int]:
3939

4040
def test_display_ascii_lazy():
4141

42-
for i in range(10):
42+
for i in range(10):
4343
df = DataFrame(cities.values).head(i)
4444
df._rows = (r for r in df._rows)
4545

tests/test_field_extractor.py

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ def test_extract_dict_columns_basic():
1010
data = {'a': 1, 'b': 2, 'c': 3}
1111
fields = ('a', 'b', 'c')
1212
result = extract_dict_columns(data, fields)
13-
assert result == (1, 2, 3)
13+
assert result == (1, 2, 3), result
1414

1515
def test_extract_dict_columns_missing_fields():
1616
data = {'a': 1, 'b': 2, 'c': 3}
@@ -160,6 +160,60 @@ def test_extract_dict_columns_with_same_field_order():
160160
assert result1 == (1, 2, 3, 4)
161161
assert result2 == (4, 3, 2, 1)
162162

163+
def test_extract_dict_columns_sparse_dict():
164+
data = {'a': 1}
165+
fields = ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i')
166+
result = extract_dict_columns(data, fields)
167+
assert result == (1, None, None, None, None, None, None, None, None)
168+
169+
def test_extract_dict_columns_immutability():
170+
data = {'a': 1, 'b': 2, 'c': 3}
171+
original_data = data.copy()
172+
fields = ('a', 'b', 'c')
173+
extract_dict_columns(data, fields)
174+
assert data == original_data, "Function modified input data!"
175+
176+
def test_extract_dict_columns_stress_test():
177+
data = {str(i): i for i in range(10000)}
178+
fields = tuple(str(i) for i in range(20000))
179+
result = extract_dict_columns(data, fields)
180+
assert result[:10000] == tuple(range(10000)) and result[10000:] == (None,) * 10000
181+
182+
def test_extract_dict_columns_duplicate_missing_fields():
183+
data = {'a': 1, 'b': 2}
184+
fields = ('a', 'x', 'x', 'b')
185+
result = extract_dict_columns(data, fields)
186+
assert result == (1, None, None, 2)
187+
188+
def test_extract_dict_columns_case_sensitivity():
189+
data = {'A': 1, 'b': 2}
190+
fields = ('a', 'b')
191+
result = extract_dict_columns(data, fields)
192+
assert result == (None, 2)
193+
194+
def test_extract_dict_columns_whitespace_keys():
195+
data = {' a': 1, 'b ': 2}
196+
fields = ('a', 'b ')
197+
result = extract_dict_columns(data, fields)
198+
assert result == (None, 2)
199+
200+
def test_extract_dict_columns_non_string_keys():
201+
data = {None: 1, 42: "forty-two"}
202+
fields = (None, 42, "missing")
203+
result = extract_dict_columns(data, fields)
204+
assert result == (1, "forty-two", None)
205+
206+
def test_extract_dict_columns_deeply_nested_keys():
207+
data = {'a': {'b': {'c': 1}}}
208+
fields = ('a.b.c', 'a')
209+
result = extract_dict_columns(data, fields)
210+
assert result == (None, {'b': {'c': 1}})
211+
212+
def test_extract_dict_columns_large_dataset_with_missing():
213+
data = {str(i): i for i in range(10**6)}
214+
fields = ('1000001', '500000', '999999')
215+
result = extract_dict_columns(data, fields)
216+
assert result == (None, 500000, 999999)
163217

164218
if __name__ == "__main__": # pragma: nocover
165219
from tests import run_tests

0 commit comments

Comments
 (0)