Skip to content

Commit 987746a

Browse files
authored
ARROW-2 Add support for BSON string type (#46)
1 parent 0da6fd6 commit 987746a

File tree

12 files changed

+109
-53
lines changed

12 files changed

+109
-53
lines changed

.github/workflows/release-python.yml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@ on:
44
push:
55
pull_request:
66

7+
concurrency:
8+
group: wheels-${{ github.ref }}
9+
cancel-in-progress: true
10+
711
jobs:
812
build-non-linux-wheels:
913
runs-on: ${{ matrix.os }}
@@ -100,4 +104,4 @@ jobs:
100104
- uses: actions/upload-artifact@v2
101105
with:
102106
name: nix-wheels
103-
path: "*-wheel"
107+
path: "*-wheel"

.github/workflows/test-python.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@ on:
44
push:
55
pull_request:
66

7+
concurrency:
8+
group: tests-${{ github.ref }}
9+
cancel-in-progress: true
10+
711
defaults:
812
run:
913
working-directory: ./bindings/python

bindings/python/docs/source/changelog.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ Changes in Version 0.3.0
55
------------------------
66
- Support for `ObjectId` `bson` type.
77
- Improve error message when schema contains an unsupported type.
8+
- Add support for BSON string type.
89

910
Changes in Version 0.2.0
1011
------------------------

bindings/python/docs/source/supported_types.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ Support for additional types will be added in subsequent releases.
1515

1616
* - BSON Type
1717
- Type Identifiers
18+
* - String
19+
- :class:`py.str`, an instance of :class:`pyarrow.string`
1820
* - ObjectId
1921
- :class:`py.bytes`, :class:`bson.ObjectId`, an instance of :class:`pyarrow.FixedSizeBinaryScalar`
2022
* - 64-bit binary floating point

bindings/python/pymongoarrow/bson.pyi

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,13 +30,16 @@ def process_bson_stream(bson_stream, context):
3030
cdef const char* key
3131
cdef bson_type_t value_t
3232
cdef Py_ssize_t count = 0
33+
cdef const char * bson_str
34+
cdef uint32_t str_len
3335

3436
# Localize types for better performance.
3537
t_int32 = _BsonArrowTypes.int32
3638
t_int64 = _BsonArrowTypes.int64
3739
t_double = _BsonArrowTypes.double
3840
t_datetime = _BsonArrowTypes.datetime
3941
t_oid = _BsonArrowTypes.objectid
42+
t_string = _BsonArrowTypes.string
4043
builder_map = context.builder_map
4144

4245
# initialize count to current length of builders
@@ -75,6 +78,12 @@ def process_bson_stream(bson_stream, context):
7578
builder.append(<bytes>(<uint8_t*>bson_iter_oid(&doc_iter))[:12])
7679
else:
7780
builder.append_null()
81+
elif ftype == t_string:
82+
if value_t == BSON_TYPE_UTF8:
83+
bson_str = bson_iter_utf8 (&doc_iter, &str_len)
84+
builder.append(<bytes>(bson_str)[:str_len])
85+
else:
86+
builder.append_null()
7887
elif ftype == t_double:
7988
if (value_t == BSON_TYPE_DOUBLE or
8089
value_t == BSON_TYPE_BOOL or

bindings/python/pymongoarrow/builders.pyi

Lines changed: 37 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,38 @@
1616
cdef class _ArrayBuilderBase:
1717
def append_values(self, values):
1818
for value in values:
19-
self.append(value)
19+
if value is None or value is np.nan:
20+
self.append_null()
21+
else:
22+
self.append(value)
23+
24+
25+
cdef class StringBuilder(_ArrayBuilderBase):
26+
type_marker = _BsonArrowTypes.string
27+
cdef:
28+
shared_ptr[CStringBuilder] builder
29+
30+
def __cinit__(self, MemoryPool memory_pool=None):
31+
cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
32+
self.builder.reset(new CStringBuilder(pool))
33+
34+
cpdef append_null(self):
35+
self.builder.get().AppendNull()
36+
37+
def __len__(self):
38+
return self.builder.get().length()
39+
40+
cpdef append(self, value):
41+
self.builder.get().Append(tobytes(value))
42+
43+
cpdef finish(self):
44+
cdef shared_ptr[CArray] out
45+
with nogil:
46+
self.builder.get().Finish(&out)
47+
return pyarrow_wrap_array(out)
48+
49+
cdef shared_ptr[CStringBuilder] unwrap(self):
50+
return self.builder
2051

2152

2253
cdef class ObjectIdBuilder(_ArrayBuilderBase):
@@ -36,12 +67,7 @@ cdef class ObjectIdBuilder(_ArrayBuilderBase):
3667
return self.builder.get().length()
3768

3869
cpdef append(self, value):
39-
if value is None or value is np.nan:
40-
self.builder.get().AppendNull()
41-
elif isinstance(value, bytes):
42-
self.builder.get().Append(value)
43-
else:
44-
raise TypeError('ObjectIdBuilder only accepts bytes objects')
70+
self.builder.get().Append(value)
4571

4672
cpdef finish(self):
4773
cdef shared_ptr[CArray] out
@@ -69,12 +95,7 @@ cdef class Int32Builder(_ArrayBuilderBase):
6995
return self.builder.get().length()
7096

7197
cpdef append(self, value):
72-
if value is None or value is np.nan:
73-
self.builder.get().AppendNull()
74-
elif isinstance(value, int):
75-
self.builder.get().Append(value)
76-
else:
77-
raise TypeError('Int32Builder only accepts integer objects')
98+
self.builder.get().Append(value)
7899

79100
cpdef finish(self):
80101
cdef shared_ptr[CArray] out
@@ -102,12 +123,7 @@ cdef class Int64Builder(_ArrayBuilderBase):
102123
return self.builder.get().length()
103124

104125
cpdef append(self, value):
105-
if value is None or value is np.nan:
106-
self.builder.get().AppendNull()
107-
elif isinstance(value, int):
108-
self.builder.get().Append(value)
109-
else:
110-
raise TypeError('Int64Builder only accepts integer objects')
126+
self.builder.get().Append(value)
111127

112128
cpdef finish(self):
113129
cdef shared_ptr[CArray] out
@@ -135,12 +151,7 @@ cdef class DoubleBuilder(_ArrayBuilderBase):
135151
return self.builder.get().length()
136152

137153
cpdef append(self, value):
138-
if value is None or value is np.nan:
139-
self.builder.get().AppendNull()
140-
elif isinstance(value, (int, float)):
141-
self.builder.get().Append(value)
142-
else:
143-
raise TypeError('DoubleBuilder only accepts floats and ints')
154+
self.builder.get().Append(value)
144155

145156
cpdef finish(self):
146157
cdef shared_ptr[CArray] out
@@ -176,12 +187,7 @@ cdef class DatetimeBuilder(_ArrayBuilderBase):
176187
return self.builder.get().length()
177188

178189
cpdef append(self, value):
179-
if value is None or value is np.nan:
180-
self.builder.get().AppendNull()
181-
elif isinstance(value, int):
182-
self.builder.get().Append(value)
183-
else:
184-
raise TypeError('TimestampBuilder only accepts 64-bit integers')
190+
self.builder.get().Append(value)
185191

186192
cpdef finish(self):
187193
cdef shared_ptr[CArray] out

bindings/python/pymongoarrow/context.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from bson.codec_options import DEFAULT_CODEC_OPTIONS
1515

1616
from pyarrow import timestamp, Table
17-
from pymongoarrow.lib import Int32Builder, Int64Builder, DoubleBuilder, DatetimeBuilder, ObjectIdBuilder
17+
from pymongoarrow.lib import Int32Builder, Int64Builder, DoubleBuilder, DatetimeBuilder, ObjectIdBuilder, StringBuilder
1818
from pymongoarrow.types import _get_internal_typemap, _BsonArrowTypes
1919

2020

@@ -23,7 +23,8 @@
2323
_BsonArrowTypes.int64: Int64Builder,
2424
_BsonArrowTypes.double: DoubleBuilder,
2525
_BsonArrowTypes.datetime: DatetimeBuilder,
26-
_BsonArrowTypes.objectid: ObjectIdBuilder
26+
_BsonArrowTypes.objectid: ObjectIdBuilder,
27+
_BsonArrowTypes.string: StringBuilder,
2728
}
2829

2930

bindings/python/pymongoarrow/lib.pyx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ from libcpp cimport bool as cbool
3434
from libcpp.map cimport map
3535
from libcpp.string cimport string
3636
from pyarrow.lib cimport *
37+
from pyarrow.lib import tobytes
3738
from pymongoarrow.libarrow cimport *
3839
from pymongoarrow.libbson cimport *
3940
from pymongoarrow.types import _BsonArrowTypes

bindings/python/pymongoarrow/libbson.pxd

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,9 @@ cdef extern from "<bson/bson.h>":
122122

123123
int64_t bson_iter_as_int64(const bson_iter_t *iter)
124124

125+
const char * bson_iter_utf8 (const bson_iter_t *iter, uint32_t *length)
126+
127+
125128

126129
# bson_reader_t API
127130
cdef extern from "<bson/bson.h>":

bindings/python/pymongoarrow/types.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
from bson import Int64, ObjectId
1818

19-
from pyarrow import timestamp, float64, int64, int32
19+
from pyarrow import timestamp, float64, int64, int32, string
2020
from pyarrow import DataType as _ArrowDataType
2121
import pyarrow.types as _atypes
2222

@@ -27,14 +27,16 @@ class _BsonArrowTypes(enum.Enum):
2727
int32 = 3
2828
int64 = 4
2929
objectid = 5
30+
string = 6
3031

3132

3233
_TYPE_NORMALIZER_FACTORY = {
3334
Int64: lambda _: int64(),
3435
float: lambda _: float64(),
3536
int: lambda _: int64(),
3637
datetime: lambda _: timestamp('ms'), # TODO: add tzinfo support
37-
ObjectId: lambda _: ObjectId
38+
ObjectId: lambda _: ObjectId,
39+
str: lambda: string(),
3840
}
3941

4042

@@ -47,7 +49,8 @@ def _is_objectid(obj):
4749
_atypes.is_int64: _BsonArrowTypes.int64,
4850
_atypes.is_float64: _BsonArrowTypes.double,
4951
_atypes.is_timestamp: _BsonArrowTypes.datetime,
50-
_is_objectid: _BsonArrowTypes.objectid
52+
_is_objectid: _BsonArrowTypes.objectid,
53+
_atypes.is_string: _BsonArrowTypes.string,
5154
}
5255

5356

0 commit comments

Comments
 (0)