Skip to content

Commit 7efd1d1

Browse files
authored
GH-48328: [Ruby] Add support for reading UTF-8 array (#48329)
### Rationale for this change

It's similar to the already implemented binary array.

### What changes are included in this PR?

* Add `ArrowFormat::UTF8Type`
* Add `ArrowFormat::UTF8Array`

### Are these changes tested?

Yes.

### Are there any user-facing changes?

Yes.

* GitHub Issue: #48328

Authored-by: Sutou Kouhei <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
1 parent 3868ba9 commit 7efd1d1

File tree

4 files changed

+48
-3
lines changed

4 files changed

+48
-3
lines changed

ruby/red-arrow-format/lib/arrow-format/array.rb

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ def to_a
7373
end
7474
end
7575

76-
class BinaryArray < Array
76+
class VariableSizeBinaryLayoutArray < Array
7777
def initialize(type, size, validity_buffer, offsets_buffer, values_buffer)
7878
super(type, size, validity_buffer)
7979
@offsets_buffer = offsets_buffer
@@ -86,9 +86,23 @@ def to_a
8686
each_cons(2).
8787
collect do |(_, offset), (_, next_offset)|
8888
length = next_offset - offset
89-
@values_buffer.get_string(offset, length)
89+
@values_buffer.get_string(offset, length, encoding)
9090
end
9191
apply_validity(values)
9292
end
9393
end
94+
95+
class BinaryArray < VariableSizeBinaryLayoutArray
96+
private
97+
def encoding
98+
Encoding::ASCII_8BIT
99+
end
100+
end
101+
102+
class UTF8Array < VariableSizeBinaryLayoutArray
103+
private
104+
def encoding
105+
Encoding::UTF_8
106+
end
107+
end
94108
end

ruby/red-arrow-format/lib/arrow-format/file-reader.rb

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,8 @@ def read_schema(fb_schema)
139139
end
140140
when Org::Apache::Arrow::Flatbuf::Binary
141141
type = BinaryType.singleton
142+
when Org::Apache::Arrow::Flatbuf::Utf8
143+
type = UTF8Type.singleton
142144
end
143145
Field.new(fb_field.name, type)
144146
end
@@ -159,7 +161,8 @@ def read_column(field, n_rows, buffers, body)
159161
values_buffer = buffers.shift
160162
values = body.slice(values_buffer.offset, values_buffer.length)
161163
field.type.build_array(n_rows, validity, values)
162-
when BinaryType
164+
when BinaryType,
165+
UTF8Type
163166
offsets_buffer = buffers.shift
164167
values_buffer = buffers.shift
165168
offsets = body.slice(offsets_buffer.offset, offsets_buffer.length)

ruby/red-arrow-format/lib/arrow-format/type.rb

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,4 +80,21 @@ def build_array(size, validity_buffer, offsets_buffer, values_buffer)
8080
BinaryArray.new(self, size, validity_buffer, offsets_buffer, values_buffer)
8181
end
8282
end
83+
84+
class UTF8Type < Type
85+
class << self
86+
def singleton
87+
@singleton ||= new
88+
end
89+
end
90+
91+
attr_reader :name
92+
def initialize
93+
super("UTF8")
94+
end
95+
96+
def build_array(size, validity_buffer, offsets_buffer, values_buffer)
97+
UTF8Array.new(self, size, validity_buffer, offsets_buffer, values_buffer)
98+
end
99+
end
83100
end

ruby/red-arrow-format/test/test-file-reader.rb

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,4 +72,15 @@ def test_read
7272
read)
7373
end
7474
end
75+
76+
sub_test_case("UTF8") do
77+
def build_array
78+
Arrow::StringArray.new(["Hello", nil, "World"])
79+
end
80+
81+
def test_read
82+
assert_equal([{"value" => ["Hello", nil, "World"]}],
83+
read)
84+
end
85+
end
7586
end

0 commit comments

Comments
 (0)