Skip to content

Commit db6ac11

Browse files
authored
GH-48306: [Ruby] Add support for reading binary array (#48307)
### Rationale for this change

We can use this as a base feature for UTF-8/large binary/large UTF-8 arrays.

### What changes are included in this PR?

* Add `ArrowFormat::BinaryType`
* Add `ArrowFormat::BinaryArray`
* Add support for the `Binary` FlatBuffers type

### Are these changes tested?

Yes.

### Are there any user-facing changes?

Yes.

* GitHub Issue: #48306

Authored-by: Sutou Kouhei <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
1 parent 57cb172 commit db6ac11

File tree

4 files changed

+73
-12
lines changed

4 files changed

+73
-12
lines changed

ruby/red-arrow-format/lib/arrow-format/array.rb

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,10 @@ class Array
1919
attr_reader :type
2020
attr_reader :size
2121
alias_method :length, :size
22-
# Base constructor: records the array's type, its element count and its
# validity buffer.
# Diff note: the first `def` line below is the removed four-argument
# signature and the second is its replacement. The `@values_buffer`
# assignment is removed from this class as well; IntArray and BinaryArray
# now store their own values buffer after calling super.
def initialize(type, size, validity_buffer, values_buffer)
22+
def initialize(type, size, validity_buffer)
2323
@type = type
2424
@size = size
2525
@validity_buffer = validity_buffer
26-
@values_buffer = values_buffer
2726
end
2827

2928
def valid?(i)
@@ -55,15 +54,41 @@ def apply_validity(array)
5554
end
5655
end
5756

58-
class Int8Array < Array
57+
# Common parent of the fixed-width integer arrays (Int8Array, UInt8Array).
# It adds a single values buffer on top of what Array stores.
class IntArray < Array
58+
# type, size and validity_buffer are forwarded unchanged to Array#initialize;
# values_buffer is kept in @values_buffer for the subclasses' to_a.
def initialize(type, size, validity_buffer, values_buffer)
59+
super(type, size, validity_buffer)
60+
@values_buffer = values_buffer
61+
end
62+
end
63+
64+
# Integer array whose values are decoded with the :S8 buffer type.
class Int8Array < IntArray
5965
# Reads @size values of type :S8 starting at byte offset 0 of the values
# buffer and passes them through apply_validity.
# NOTE(review): apply_validity is not visible here; presumably it replaces
# null slots with nil, as the reader tests expect -- confirm.
def to_a
6066
apply_validity(@values_buffer.values(:S8, 0, @size))
6167
end
6268
end
6369

64-
class UInt8Array < Array
70+
# Integer array whose values are decoded with the :U8 buffer type.
class UInt8Array < IntArray
6571
# Reads @size values of type :U8 starting at byte offset 0 of the values
# buffer and passes them through apply_validity.
# NOTE(review): apply_validity is not visible here; presumably it replaces
# null slots with nil, as the reader tests expect -- confirm.
def to_a
6672
apply_validity(@values_buffer.values(:U8, 0, @size))
6773
end
6874
end
75+
76+
# Variable-length binary array. Besides the validity buffer handled by
# Array, it keeps an offsets buffer and a values buffer.
class BinaryArray < Array
77+
# offsets_buffer: read by to_a as @size + 1 :s32 entries.
# values_buffer: storage that to_a slices with get_string.
def initialize(type, size, validity_buffer, offsets_buffer, values_buffer)
78+
super(type, size, validity_buffer)
79+
@offsets_buffer = offsets_buffer
80+
@values_buffer = values_buffer
81+
end
82+
83+
# Builds one string per element and passes the list through apply_validity.
# Each consecutive pair of offsets (offset, next_offset) delimits one
# element: it is the next_offset - offset bytes of the values buffer
# starting at offset. @size + 1 offsets are read so that @size pairs exist.
# The first member of each yielded pair is ignored by the block.
# NOTE(review): presumably the buffers are IO::Buffer instances, whose
# each(...) yields [byte_offset, value] and whose :s32 type is little-endian
# (hence the big-endian TODO) -- confirm.
def to_a
84+
values = @offsets_buffer.
85+
each(:s32, 0, @size + 1). # TODO: big endian support
86+
each_cons(2).
87+
collect do |(_, offset), (_, next_offset)|
88+
length = next_offset - offset
89+
@values_buffer.get_string(offset, length)
90+
end
91+
apply_validity(values)
92+
end
93+
end
6994
end

ruby/red-arrow-format/lib/arrow-format/file-reader.rb

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -137,26 +137,34 @@ def read_schema(fb_schema)
137137
type = UInt8Type.singleton
138138
end
139139
end
140+
when Org::Apache::Arrow::Flatbuf::Binary
141+
type = BinaryType.singleton
140142
end
141143
Field.new(fb_field.name, type)
142144
end
143145
Schema.new(fields)
144146
end
145147

146148
# Builds the array for one field out of the record batch body.
#
# Buffer descriptors are consumed from the front of `buffers` with shift, so
# the caller's list is mutated: first the validity descriptor, then the
# type-specific ones (values for Int8/UInt8; offsets then values for Binary).
# Each descriptor's offset/length selects a slice of `body`. A zero-length
# validity descriptor is passed on as nil instead of an empty slice.
# When field.type matches no `when` branch the method returns nil, because
# the case expression has no else.
#
# Diff note: the validity handling now sits above the `case` (lines marked
# `+`); the identical lines marked `-` inside the Int8/UInt8 branch are the
# removed copy.
def read_column(field, n_rows, buffers, body)
149+
validity_buffer = buffers.shift
150+
if validity_buffer.length.zero?
151+
validity = nil
152+
else
153+
validity = body.slice(validity_buffer.offset, validity_buffer.length)
154+
end
155+
147156
case field.type
148157
when Int8Type,
149158
UInt8Type
150-
validity_buffer = buffers.shift
151-
if validity_buffer.length.zero?
152-
validity = nil
153-
else
154-
validity = body.slice(validity_buffer.offset, validity_buffer.length)
155-
end
156-
157159
values_buffer = buffers.shift
158160
values = body.slice(values_buffer.offset, values_buffer.length)
159161
field.type.build_array(n_rows, validity, values)
162+
when BinaryType
163+
offsets_buffer = buffers.shift
164+
values_buffer = buffers.shift
165+
offsets = body.slice(offsets_buffer.offset, offsets_buffer.length)
166+
values = body.slice(values_buffer.offset, values_buffer.length)
167+
field.type.build_array(n_rows, validity, offsets, values)
160168
end
161169
end
162170
end

ruby/red-arrow-format/lib/arrow-format/type.rb

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,4 +63,21 @@ def build_array(size, validity_buffer, values_buffer)
6363
UInt8Array.new(self, size, validity_buffer, values_buffer)
6464
end
6565
end
66+
67+
# Type object for binary columns; it builds BinaryArray instances.
class BinaryType < Type
68+
class << self
69+
# Returns the shared instance, created on first use and memoized in a
# class-level instance variable.
def singleton
70+
@singleton ||= new
71+
end
72+
end
73+
74+
# NOTE(review): Type is not visible here; if it already exposes `name`,
# this reader is redundant -- confirm.
attr_reader :name
75+
# Passes the fixed name "Binary" to Type#initialize.
def initialize
76+
super("Binary")
77+
end
78+
79+
# Creates a BinaryArray of this type; the arguments are forwarded
# unchanged, in the same order, after `self`.
def build_array(size, validity_buffer, offsets_buffer, values_buffer)
80+
BinaryArray.new(self, size, validity_buffer, offsets_buffer, values_buffer)
81+
end
82+
end
6683
end

ruby/red-arrow-format/test/test-file-reader.rb

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,20 @@ def build_array
5656
Arrow::UInt8Array.new([0, nil, 255])
5757
end
5858

59-
def test_uint8
59+
# Expects `read` to return a single record whose "value" column is
# [0, nil, 255], the same elements given to Arrow::UInt8Array.new in
# build_array.
# Diff note: this method was renamed from test_uint8.
def test_read
6060
assert_equal([{"value" => [0, nil, 255]}],
6161
read)
6262
end
6363
end
64+
65+
# Round-trip case for binary columns, including a null element.
sub_test_case("Binary") do
66+
# Input array: two binary-encoded strings (String#b) around a nil.
def build_array
67+
Arrow::BinaryArray.new(["Hello".b, nil, "World".b])
68+
end
69+
70+
# Expects `read` to return a single record whose "value" column equals
# the elements given to build_array, with nil preserved.
def test_read
71+
assert_equal([{"value" => ["Hello".b, nil, "World".b]}],
72+
read)
73+
end
74+
end
6475
end

0 commit comments

Comments
 (0)