@@ -25,13 +25,11 @@ from cython cimport int
2525from datetime import datetime
2626from ormsgpack import unpackb
2727from orso.exceptions import DataError
28- from typing import Dict, Any, Tuple
29- from libc.stdlib cimport malloc, free
3028import numpy as np
31- cimport cython
3229cimport numpy as cnp
3330from numpy cimport ndarray
34- from libc.stdint cimport int32_t
31+ from libc.stdint cimport int32_t, int64_t
32+ from cpython.dict cimport PyDict_GetItem
3533
3634cnp.import_array()
3735
@@ -47,12 +45,12 @@ cpdef from_bytes_cython(bytes data):
4745 # Validate header and size, now using pointer arithmetic
4846 if length < HEADER_SIZE or (data_ptr[0 ] & 0xF0 != 0x10 ):
4947 raise DataError(" Data malformed" )
50-
48+
5149 # Deserialize record bytes
5250 cdef Py_ssize_t record_size = (
5351 (< unsigned char > data_ptr[2 ]) << 24 |
5452 (< unsigned char > data_ptr[3 ]) << 16 |
55- (< unsigned char > data_ptr[4 ]) << 8 |
53+ (< unsigned char > data_ptr[4 ]) << 8 |
5654 (< unsigned char > data_ptr[5 ])
5755 )
5856
@@ -64,7 +62,6 @@ cpdef from_bytes_cython(bytes data):
6462 cdef list processed_list = []
6563 cdef object item
6664
67-
6865 for item in raw_tuple:
6966 if isinstance (item, list ) and len (item) == 2 and item[0 ] == " __datetime__" :
7067 processed_list.append(datetime.fromtimestamp(item[1 ]))
@@ -73,47 +70,74 @@ cpdef from_bytes_cython(bytes data):
7370
7471 return tuple (processed_list)
7572
76-
cpdef tuple extract_dict_columns(dict data, tuple fields):
    """
    Pull the requested keys out of a dictionary, preserving the order of
    `fields`.

    Parameters:
        data: dict
            Source dictionary to read values from.
        fields: tuple
            Keys to look up, in the order the values should be returned.

    Returns:
        A tuple with one entry per requested field; an entry is None when
        the key is not present in `data`.
    """
    cdef void* hit
    cdef object key
    cdef list collected = []

    for key in fields:
        # PyDict_GetItem hands back a borrowed reference, or NULL when the
        # key is absent; casting to <object> takes a proper reference.
        hit = PyDict_GetItem(data, key)
        if hit == NULL:
            collected.append(None)
        else:
            collected.append(<object>hit)

    return tuple(collected)
8699
87100
cpdef cnp.ndarray collect_cython(list rows, int32_t[:] columns, int limit=-1):
    """
    Collects columns from a list of tuples (rows) into a 2D object array.

    Parameters:
        rows: list
            The rows to read from; each row is indexed by column position.
        columns: int32_t[:]
            Zero-based positions of the columns to collect.
        limit: int
            Maximum number of rows to read. A negative value (the default),
            or a value not smaller than len(rows), reads every row.

    Returns:
        A numpy object array of shape (len(columns), rows read): one array
        row per requested column.

    Raises:
        IndexError: if a requested column index is negative or is not
            smaller than the width of the first row.
    """
    cdef int64_t i, j, col_idx
    cdef int64_t num_rows = len(rows)
    cdef int64_t num_cols = columns.shape[0]
    # The first row is taken as representative of every row's width.
    cdef int64_t row_width = len(rows[0]) if num_rows > 0 else 0

    # Apply the limit only when it is set and actually truncates the input
    if limit >= 0 and limit < num_rows:
        num_rows = limit

    # Nothing to copy: return an empty array of the right shape
    if num_rows == 0 or num_cols == 0:
        return np.empty((num_cols, num_rows), dtype=object)

    # Validate every requested column up front. Valid indices are
    # 0 .. row_width - 1, so an index equal to row_width must be rejected.
    for j in range(num_cols):
        col_idx = columns[j]
        if col_idx < 0 or col_idx >= row_width:
            raise IndexError(f"Column index out of bounds (0 <= {col_idx} < {row_width})")

    # Typed memoryview over a pre-allocated object array for fast item writes
    cdef object[:, :] result = np.empty((num_cols, num_rows), dtype=object)

    # Populate each column one at a time
    for j in range(num_cols):
        col_idx = columns[j]
        for i in range(num_rows):
            result[j, i] = rows[i][col_idx]

    # Hand the filled buffer back as a numpy array
    return np.asarray(result)
111135
112136
113137cpdef int calculate_data_width(cnp.ndarray column_values):
114138 cdef int width, max_width
115139 cdef object value
116-
140+
117141 max_width = 4 # Default width
118142 for value in column_values:
119143 if value is not None :
@@ -124,11 +148,29 @@ cpdef int calculate_data_width(cnp.ndarray column_values):
124148 return max_width
125149
126150
151+ from cpython.list cimport PyList_New, PyList_SET_ITEM
152+
def process_table(table, row_factory, int max_chunksize) -> list:
    """
    Convert a table into a list of rows built by `row_factory`.

    The table is walked batch by batch; each batch is converted to a pandas
    DataFrame, NaN values are replaced with None, and every row tuple is
    passed through `row_factory`.

    Parameters:
        table: PyArrow Table
            The table to convert; must expose `num_rows` and `to_batches`.
        row_factory: callable
            Called once per row with a plain tuple of that row's values.
        max_chunksize: int
            Upper bound on the batch size passed to `table.to_batches`.

    Returns:
        A list with one `row_factory` result per table row, in table order.
    """
    # Pre-size the output so rows are written by position, not appended
    cdef list produced = [None] * table.num_rows
    cdef int64_t cursor = 0

    for chunk in table.to_batches(max_chunksize):
        frame = chunk.to_pandas().replace({np.nan: None})
        for record in frame.itertuples(index=False, name=None):
            produced[cursor] = row_factory(record)
            cursor += 1
    return produced
0 commit comments