77
88from logging import getLogger
99from cpython.ref cimport PyObject
10+ from libc.stdint cimport *
11+ from libcpp cimport bool as c_bool
12+ from libcpp.memory cimport shared_ptr
13+ from libcpp.string cimport string as c_string
14+ from libcpp.vector cimport vector
1015
# Module-level logger, named after this module via __name__.
1116logger = getLogger(__name__ )
1217
@@ -26,12 +31,87 @@ cdef extern from "cpp/ArrowIterator/CArrowIterator.hpp" namespace "sf":
2631
# Cython declaration of the C++ class sf::CArrowChunkIterator, a subclass of
# CArrowIterator. The changed line switches the constructor's `batches`
# parameter from a PyObject* to a pointer to a C++
# vector[shared_ptr[CRecordBatch]].
# `except +` asks Cython to translate a C++ exception thrown by the
# constructor into a Python exception.
2732cdef extern from " cpp/ArrowIterator/CArrowChunkIterator.hpp" namespace " sf" :
2833 cdef cppclass CArrowChunkIterator(CArrowIterator):
29- CArrowChunkIterator(PyObject* context, PyObject * batches) except +
34+ CArrowChunkIterator(PyObject* context, vector[shared_ptr[CRecordBatch]] * batches) except +
3035
3136
# Cython declaration of the C++ class sf::CArrowTableIterator, a subclass of
# CArrowIterator. The changed line switches the constructor's `batches`
# parameter from a PyObject* to a pointer to a C++
# vector[shared_ptr[CRecordBatch]].
# `except +` asks Cython to translate a C++ exception thrown by the
# constructor into a Python exception.
3237cdef extern from " cpp/ArrowIterator/CArrowTableIterator.hpp" namespace " sf" :
3338 cdef cppclass CArrowTableIterator(CArrowIterator):
34- CArrowTableIterator(PyObject* context, PyObject* batches) except +
39+ CArrowTableIterator(PyObject* context, vector[shared_ptr[CRecordBatch]]* batches) except +
40+
41+
# Declarations taken from the Arrow C++ header arrow/api.h (namespace arrow).
# The block is marked `nogil`, so Cython treats every member declared here as
# callable without holding the GIL.
# The quoted string after each Cython name is the C++ name Cython emits.
42+ cdef extern from " arrow/api.h" namespace " arrow" nogil:
# arrow::Status: result object returned by the Arrow calls declared in this
# file. Callers are expected to test it with ok() (or one of the Is*()
# predicates) and can obtain text via ToString() / message().
43+ cdef cppclass CStatus " arrow::Status" :
44+ CStatus()
45+
46+ c_string ToString()
47+ c_string message()
48+
49+ c_bool ok()
50+ c_bool IsIOError()
51+ c_bool IsOutOfMemory()
52+ c_bool IsInvalid()
53+ c_bool IsKeyError()
54+ c_bool IsNotImplemented()
55+ c_bool IsTypeError()
56+ c_bool IsCapacityError()
57+ c_bool IsIndexError()
58+ c_bool IsSerializationError()
59+
60+
# arrow::Buffer: only the (data pointer, size) constructor is declared.
61+ cdef cppclass CBuffer" arrow::Buffer" :
62+ CBuffer(const uint8_t* data, int64_t size)
63+
# arrow::RecordBatch: declared without a body, i.e. used only as an opaque
# type behind shared_ptr in this file.
64+ cdef cppclass CRecordBatch" arrow::RecordBatch"
65+
# arrow::RecordBatchReader: ReadNext() writes the next batch into `batch`
# and returns a status.
66+ cdef cppclass CRecordBatchReader" arrow::RecordBatchReader" :
67+ CStatus ReadNext(shared_ptr[CRecordBatch]* batch)
68+
69+
# Declaration of arrow::ipc::RecordBatchStreamReader from arrow/ipc/api.h,
# declared as a subclass of CRecordBatchReader.
# Open() is a static factory: it takes a raw InputStream pointer (InputStream
# is declared in the arrow::io extern block of this file), writes the created
# reader into `out`, and returns a CStatus.
70+ cdef extern from " arrow/ipc/api.h" namespace " arrow::ipc" nogil:
71+ cdef cppclass CRecordBatchStreamReader \
72+ " arrow::ipc::RecordBatchStreamReader" (CRecordBatchReader):
73+ @staticmethod
74+ CStatus Open(const InputStream* stream,
75+ shared_ptr[CRecordBatchReader]* out)
76+
77+
# Declarations of the Arrow C++ I/O interfaces from arrow/io/api.h
# (namespace arrow::io), marked `nogil`.
78+ cdef extern from " arrow/io/api.h" namespace " arrow::io" nogil:
# File open modes; each Cython name is mapped to the quoted C++ enumerator.
79+ enum FileMode" arrow::io::FileMode::type" :
80+ FileMode_READ" arrow::io::FileMode::READ"
81+ FileMode_WRITE" arrow::io::FileMode::WRITE"
82+ FileMode_READWRITE" arrow::io::FileMode::READWRITE"
83+
# Operations common to all file-like objects: close, position, mode, state.
84+ cdef cppclass FileInterface:
85+ CStatus Close()
86+ CStatus Tell(int64_t* position)
87+ FileMode mode()
88+ c_bool closed()
89+
# Read interface. Both members bind to the C++ method `Read`; the
# shared_ptr[CBuffer] overload is exposed to Cython as `ReadBuffer`.
90+ cdef cppclass Readable:
91+ # put overload under a different name to avoid cython bug with multiple
92+ # layers of inheritance
93+ CStatus ReadBuffer" Read" (int64_t nbytes, shared_ptr[CBuffer]* out)
94+ CStatus Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out)
95+
# InputStream combines FileInterface and Readable and adds no members here.
96+ cdef cppclass InputStream(FileInterface, Readable):
97+ pass
98+
# Seek interface: move to an absolute position.
99+ cdef cppclass Seekable:
100+ CStatus Seek(int64_t position)
101+
# Random-access file: an InputStream that is also Seekable, with size query
# and positional reads (into a raw buffer or into a shared_ptr[CBuffer]).
102+ cdef cppclass RandomAccessFile(InputStream, Seekable):
103+ CStatus GetSize(int64_t* size)
104+
105+ CStatus ReadAt(int64_t position, int64_t nbytes,
106+ int64_t* bytes_read, uint8_t* buffer )
107+ CStatus ReadAt(int64_t position, int64_t nbytes,
108+ shared_ptr[CBuffer]* out)
109+ c_bool supports_zero_copy()
110+
111+
# Declaration of arrow::py::PyReadableFile from arrow/python/api.h: a
# RandomAccessFile constructed from a Python object `fo`.
# NOTE(review): presumably `fo` must be a readable Python file-like object;
# confirm against the Arrow Python C++ API.
112+ cdef extern from " arrow/python/api.h" namespace " arrow::py" nogil:
113+ cdef cppclass PyReadableFile(RandomAccessFile):
114+ PyReadableFile(object fo)
35115
36116
37117cdef class EmptyPyArrowIterator:
@@ -53,12 +133,22 @@ cdef class PyArrowIterator(EmptyPyArrowIterator):
# C-level attributes of PyArrowIterator.
# cIterator: pointer to the C++ iterator; set to NULL in __cinit__ and later
# assigned a CArrowChunkIterator or CArrowTableIterator.
53133 cdef CArrowIterator* cIterator
# unit: the iteration unit currently selected (compared against ROW_UNIT /
# TABLE_UNIT elsewhere in this class).
54134 cdef str unit
55135 cdef PyObject* cret
# batches: changed from a Python list to a C++ vector of
# shared_ptr[CRecordBatch]; its address is passed to the C++ iterators.
56- cdef list batches
136+ cdef vector[shared_ptr[CRecordBatch]] batches
137+
def __cinit__(self, object py_inputstream, object arrow_context):
    # Read every record batch of an Arrow IPC stream into self.batches.
    #
    # py_inputstream: Python object wrapped in an Arrow PyReadableFile and
    #     decoded with RecordBatchStreamReader.
    # arrow_context: stored unchanged on self.context.
    #
    # Raises IOError if the stream cannot be opened or a batch cannot be
    # read. Previously both statuses were discarded, so a failure led to a
    # null-pointer dereference or a loop that might never terminate.
    cdef shared_ptr[InputStream] input_stream
    cdef shared_ptr[CRecordBatchReader] reader
    cdef shared_ptr[CRecordBatch] record_batch
    cdef CStatus status
    cdef c_string detail

    input_stream.reset(new PyReadableFile(py_inputstream))

    # Open() reports failure only through its returned status and leaves
    # `reader` empty in that case; it must be checked before reader.get()
    # is dereferenced below.
    status = CRecordBatchStreamReader.Open(input_stream.get(), &reader)
    if not status.ok():
        detail = status.ToString()
        raise IOError('Failed to open arrow stream: '
                      + detail.decode('utf-8', 'replace'))

    while True:
        status = reader.get().ReadNext(&record_batch)
        # A failed read is not guaranteed to reset `record_batch`, so the
        # null check alone cannot be trusted to end the loop.
        if not status.ok():
            detail = status.ToString()
            raise IOError('Failed to read next arrow batch: '
                          + detail.decode('utf-8', 'replace'))

        # A successful read that yields a null batch marks end of stream.
        if record_batch.get() is NULL:
            break

        self.batches.push_back(record_batch)

    self.context = arrow_context
    self.cIterator = NULL
    # NOTE(review): literal reproduced exactly as it appears in the source
    # view; confirm whether an empty string was intended.
    self.unit = ' '
@@ -85,8 +175,8 @@ cdef class PyArrowIterator(EmptyPyArrowIterator):
85175 if iter_unit != ROW_UNIT and iter_unit != TABLE_UNIT:
86176 raise NotImplementedError
87177 elif iter_unit == ROW_UNIT:
88- self .cIterator = new CArrowChunkIterator(< PyObject* > self .context, < PyObject * > self .batches)
178+ self .cIterator = new CArrowChunkIterator(< PyObject* > self .context, & self .batches)
89179 elif iter_unit == TABLE_UNIT:
90- self .cIterator = new CArrowTableIterator(< PyObject* > self .context, < PyObject * > self .batches)
180+ self .cIterator = new CArrowTableIterator(< PyObject* > self .context, & self .batches)
91181 self .unit = iter_unit
92182
0 commit comments