Skip to content

Commit 1515197

Browse files
authored
GH-48410: [Ruby] Add support for reading large list array (#48411)
### Rationale for this change

It's a large variant of list array: it uses 64-bit offsets instead of 32-bit ones.

### What changes are included in this PR?

* Add `ArrowFormat::LargeListType`
* Add `ArrowFormat::LargeListArray`
* Improve large list support in Red Arrow

### Are these changes tested?

Yes.

### Are there any user-facing changes?

Yes.

* GitHub Issue: #48410

Authored-by: Sutou Kouhei <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
1 parent f65ee2c commit 1515197

File tree

14 files changed

+421
-103
lines changed

14 files changed

+421
-103
lines changed

ruby/red-arrow-format/lib/arrow-format/array.rb

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ def initialize(type, size, validity_buffer, offsets_buffer, child)
172172
def to_a
173173
child_values = @child.to_a
174174
values = @offsets_buffer.
175-
each(:s32, 0, @size + 1). # TODO: big endian support
175+
each(offset_type, 0, @size + 1).
176176
each_cons(2).
177177
collect do |(_, offset), (_, next_offset)|
178178
child_values[offset...next_offset]
@@ -182,6 +182,17 @@ def to_a
182182
end
183183

184184
class ListArray < VariableSizeListArray
185+
private
186+
def offset_type
187+
:s32 # TODO: big endian support
188+
end
189+
end
190+
191+
class LargeListArray < VariableSizeListArray
192+
private
193+
def offset_type
194+
:s64 # TODO: big endian support
195+
end
185196
end
186197

187198
class StructArray < Array
@@ -215,5 +226,10 @@ def to_a
215226
end
216227
end
217228
end
229+
230+
private
231+
def offset_type
232+
:s32 # TODO: big endian support
233+
end
218234
end
219235
end

ruby/red-arrow-format/lib/arrow-format/file-reader.rb

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
require_relative "org/apache/arrow/flatbuf/footer"
2929
require_relative "org/apache/arrow/flatbuf/int"
3030
require_relative "org/apache/arrow/flatbuf/large_binary"
31+
require_relative "org/apache/arrow/flatbuf/large_list"
3132
require_relative "org/apache/arrow/flatbuf/list"
3233
require_relative "org/apache/arrow/flatbuf/map"
3334
require_relative "org/apache/arrow/flatbuf/message"
@@ -161,6 +162,8 @@ def read_field(fb_field)
161162
end
162163
when Org::Apache::Arrow::Flatbuf::List
163164
type = ListType.new(read_field(fb_field.children[0]))
165+
when Org::Apache::Arrow::Flatbuf::LargeList
166+
type = LargeListType.new(read_field(fb_field.children[0]))
164167
when Org::Apache::Arrow::Flatbuf::Struct
165168
children = fb_field.children.collect {|child| read_field(child)}
166169
type = StructType.new(children)

ruby/red-arrow-format/lib/arrow-format/type.rb

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,16 @@ def build_array(size, validity_buffer, offsets_buffer, child)
214214
end
215215
end
216216

217+
class LargeListType < VariableSizeListType
218+
def initialize(child)
219+
super("LargeList", child)
220+
end
221+
222+
def build_array(size, validity_buffer, offsets_buffer, child)
223+
LargeListArray.new(self, size, validity_buffer, offsets_buffer, child)
224+
end
225+
end
226+
217227
class StructType < Type
218228
attr_reader :children
219229
def initialize(children)

ruby/red-arrow-format/test/test-file-reader.rb

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,18 @@ def test_read
140140
end
141141
end
142142

143+
sub_test_case("LargeList") do
144+
def build_array
145+
data_type = Arrow::LargeListDataType.new(name: "count", type: :int8)
146+
Arrow::LargeListArray.new(data_type, [[-128, 127], nil, [-1, 0, 1]])
147+
end
148+
149+
def test_read
150+
assert_equal([{"value" => [[-128, 127], nil, [-1, 0, 1]]}],
151+
read)
152+
end
153+
end
154+
143155
sub_test_case("Struct") do
144156
def build_array
145157
data_type = Arrow::StructDataType.new(count: :int8,

ruby/red-arrow/ext/arrow/converters.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,11 @@ namespace red_arrow {
2525
return list_array_value_converter_->convert(array, i);
2626
}
2727

28+
VALUE ArrayValueConverter::convert(const arrow::LargeListArray& array,
29+
const int64_t i) {
30+
return large_list_array_value_converter_->convert(array, i);
31+
}
32+
2833
VALUE ArrayValueConverter::convert(const arrow::StructArray& array,
2934
const int64_t i) {
3035
return struct_array_value_converter_->convert(array, i);

ruby/red-arrow/ext/arrow/converters.hpp

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828

2929
namespace red_arrow {
3030
class ListArrayValueConverter;
31+
class LargeListArrayValueConverter;
3132
class StructArrayValueConverter;
3233
class MapArrayValueConverter;
3334
class UnionArrayValueConverter;
@@ -38,18 +39,21 @@ namespace red_arrow {
3839
ArrayValueConverter()
3940
: decimal_buffer_(),
4041
list_array_value_converter_(nullptr),
42+
large_list_array_value_converter_(nullptr),
4143
struct_array_value_converter_(nullptr),
4244
map_array_value_converter_(nullptr),
4345
union_array_value_converter_(nullptr),
4446
dictionary_array_value_converter_(nullptr) {
4547
}
4648

4749
inline void set_sub_value_converters(ListArrayValueConverter* list_array_value_converter,
50+
LargeListArrayValueConverter* large_list_array_value_converter,
4851
StructArrayValueConverter* struct_array_value_converter,
4952
MapArrayValueConverter* map_array_value_converter,
5053
UnionArrayValueConverter* union_array_value_converter,
5154
DictionaryArrayValueConverter* dictionary_array_value_converter) {
5255
list_array_value_converter_ = list_array_value_converter;
56+
large_list_array_value_converter_ = large_list_array_value_converter;
5357
struct_array_value_converter_ = struct_array_value_converter;
5458
map_array_value_converter_ = map_array_value_converter;
5559
union_array_value_converter_ = union_array_value_converter;
@@ -263,6 +267,9 @@ namespace red_arrow {
263267
VALUE convert(const arrow::ListArray& array,
264268
const int64_t i);
265269

270+
VALUE convert(const arrow::LargeListArray& array,
271+
const int64_t i);
272+
266273
VALUE convert(const arrow::StructArray& array,
267274
const int64_t i);
268275

@@ -298,6 +305,7 @@ namespace red_arrow {
298305

299306
std::string decimal_buffer_;
300307
ListArrayValueConverter* list_array_value_converter_;
308+
LargeListArrayValueConverter* large_list_array_value_converter_;
301309
StructArrayValueConverter* struct_array_value_converter_;
302310
MapArrayValueConverter* map_array_value_converter_;
303311
UnionArrayValueConverter* union_array_value_converter_;
@@ -359,6 +367,106 @@ namespace red_arrow {
359367
VISIT(DayTimeInterval)
360368
VISIT(MonthDayNanoInterval)
361369
VISIT(List)
370+
VISIT(LargeList)
371+
VISIT(Struct)
372+
VISIT(Map)
373+
VISIT(SparseUnion)
374+
VISIT(DenseUnion)
375+
VISIT(Dictionary)
376+
VISIT(Decimal128)
377+
VISIT(Decimal256)
378+
// TODO
379+
// VISIT(Extension)
380+
381+
#undef VISIT
382+
383+
private:
384+
template <typename ArrayType>
385+
inline VALUE convert_value(const ArrayType& array,
386+
const int64_t i) {
387+
return array_value_converter_->convert(array, i);
388+
}
389+
390+
template <typename ArrayType>
391+
arrow::Status visit_value(const ArrayType& array) {
392+
if (array.null_count() > 0) {
393+
for (int64_t i = 0; i < length_; ++i) {
394+
auto value = Qnil;
395+
if (!array.IsNull(i + offset_)) {
396+
value = convert_value(array, i + offset_);
397+
}
398+
rb_ary_push(result_, value);
399+
}
400+
} else {
401+
for (int64_t i = 0; i < length_; ++i) {
402+
rb_ary_push(result_, convert_value(array, i + offset_));
403+
}
404+
}
405+
return arrow::Status::OK();
406+
}
407+
408+
ArrayValueConverter* array_value_converter_;
409+
int32_t offset_;
410+
int32_t length_;
411+
VALUE result_;
412+
};
413+
414+
class LargeListArrayValueConverter : public arrow::ArrayVisitor {
415+
public:
416+
explicit LargeListArrayValueConverter(ArrayValueConverter* converter)
417+
: array_value_converter_(converter),
418+
offset_(0),
419+
length_(0),
420+
result_(Qnil) {}
421+
422+
VALUE convert(const arrow::LargeListArray& array, const int64_t index) {
423+
auto values = array.values().get();
424+
auto offset_keep = offset_;
425+
auto length_keep = length_;
426+
offset_ = array.value_offset(index);
427+
length_ = array.value_length(index);
428+
auto result_keep = result_;
429+
result_ = rb_ary_new_capa(length_);
430+
check_status(values->Accept(this),
431+
"[raw-records][large-list-array]");
432+
offset_ = offset_keep;
433+
length_ = length_keep;
434+
auto result_return = result_;
435+
result_ = result_keep;
436+
return result_return;
437+
}
438+
439+
#define VISIT(TYPE) \
440+
arrow::Status Visit(const arrow::TYPE ## Array& array) override { \
441+
return visit_value(array); \
442+
}
443+
444+
VISIT(Null)
445+
VISIT(Boolean)
446+
VISIT(Int8)
447+
VISIT(Int16)
448+
VISIT(Int32)
449+
VISIT(Int64)
450+
VISIT(UInt8)
451+
VISIT(UInt16)
452+
VISIT(UInt32)
453+
VISIT(UInt64)
454+
VISIT(HalfFloat)
455+
VISIT(Float)
456+
VISIT(Double)
457+
VISIT(Binary)
458+
VISIT(String)
459+
VISIT(FixedSizeBinary)
460+
VISIT(Date32)
461+
VISIT(Date64)
462+
VISIT(Time32)
463+
VISIT(Time64)
464+
VISIT(Timestamp)
465+
VISIT(MonthInterval)
466+
VISIT(DayTimeInterval)
467+
VISIT(MonthDayNanoInterval)
468+
VISIT(List)
469+
VISIT(LargeList)
362470
VISIT(Struct)
363471
VISIT(Map)
364472
VISIT(SparseUnion)
@@ -465,6 +573,7 @@ namespace red_arrow {
465573
VISIT(DayTimeInterval)
466574
VISIT(MonthDayNanoInterval)
467575
VISIT(List)
576+
VISIT(LargeList)
468577
VISIT(Struct)
469578
VISIT(Map)
470579
VISIT(SparseUnion)
@@ -567,6 +676,7 @@ namespace red_arrow {
567676
VISIT(DayTimeInterval)
568677
VISIT(MonthDayNanoInterval)
569678
VISIT(List)
679+
VISIT(LargeList)
570680
VISIT(Struct)
571681
VISIT(Map)
572682
VISIT(SparseUnion)
@@ -670,6 +780,7 @@ namespace red_arrow {
670780
VISIT(DayTimeInterval)
671781
VISIT(MonthDayNanoInterval)
672782
VISIT(List)
783+
VISIT(LargeList)
673784
VISIT(Struct)
674785
VISIT(Map)
675786
VISIT(SparseUnion)
@@ -781,6 +892,7 @@ namespace red_arrow {
781892
VISIT(DayTimeInterval)
782893
VISIT(MonthDayNanoInterval)
783894
VISIT(List)
895+
VISIT(LargeList)
784896
VISIT(Struct)
785897
VISIT(Map)
786898
VISIT(SparseUnion)
@@ -810,12 +922,14 @@ namespace red_arrow {
810922
explicit Converter()
811923
: array_value_converter_(),
812924
list_array_value_converter_(&array_value_converter_),
925+
large_list_array_value_converter_(&array_value_converter_),
813926
struct_array_value_converter_(&array_value_converter_),
814927
map_array_value_converter_(&array_value_converter_),
815928
union_array_value_converter_(&array_value_converter_),
816929
dictionary_array_value_converter_(&array_value_converter_) {
817930
array_value_converter_.
818931
set_sub_value_converters(&list_array_value_converter_,
932+
&large_list_array_value_converter_,
819933
&struct_array_value_converter_,
820934
&map_array_value_converter_,
821935
&union_array_value_converter_,
@@ -830,6 +944,7 @@ namespace red_arrow {
830944

831945
ArrayValueConverter array_value_converter_;
832946
ListArrayValueConverter list_array_value_converter_;
947+
LargeListArrayValueConverter large_list_array_value_converter_;
833948
StructArrayValueConverter struct_array_value_converter_;
834949
MapArrayValueConverter map_array_value_converter_;
835950
UnionArrayValueConverter union_array_value_converter_;

ruby/red-arrow/ext/arrow/values.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ namespace red_arrow {
8080
VISIT(DayTimeInterval)
8181
VISIT(MonthDayNanoInterval)
8282
VISIT(List)
83+
VISIT(LargeList)
8384
VISIT(Struct)
8485
VISIT(Map)
8586
VISIT(SparseUnion)
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
module Arrow
19+
class LargeListArrayBuilder
20+
class << self
21+
def build(data_type, values)
22+
builder = new(data_type)
23+
builder.build(values)
24+
end
25+
end
26+
27+
prepend ListValuesAppendable
28+
end
29+
end

0 commit comments

Comments
 (0)