Skip to content

Commit bbfd1e6

Browse files
add test and address review comments
1 parent 49dfcb1 commit bbfd1e6

File tree

2 files changed

+58
-15
lines changed

2 files changed

+58
-15
lines changed

cpp/src/arrow/compute/kernels/scalar_cast_string.cc

Lines changed: 26 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "arrow/result.h"
3030
#include "arrow/type.h"
3131
#include "arrow/type_traits.h"
32+
#include "arrow/util/bit_util.h"
3233
#include "arrow/util/formatting.h"
3334
#include "arrow/util/int_util.h"
3435
#include "arrow/util/logging_internal.h"
@@ -305,23 +306,33 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou
305306
}
306307
}
307308

308-
std::shared_ptr<ArrayData> input_arr = input.ToArrayData();
309-
ArrayData* output = out->array_data().get();
310-
output->length = input_arr->length;
311-
output->SetNullCount(input_arr->null_count);
312-
output->buffers = std::move(input_arr->buffers);
313-
output->child_data = std::move(input_arr->child_data);
314-
315-
if (output->buffers[0]) {
316-
// If reusing the null bitmap, ensure offset into the first byte is the same as input.
317-
output->offset = input_arr->offset % 8;
318-
output->buffers[0] = SliceBuffer(output->buffers[0], input_arr->offset / 8);
309+
if constexpr (sizeof(typename I::offset_type) != sizeof(typename O::offset_type)) {
310+
std::shared_ptr<ArrayData> input_arr = input.ToArrayData();
311+
ArrayData* output = out->array_data().get();
312+
output->length = input_arr->length;
313+
// output->offset is set below
314+
output->SetNullCount(input_arr->null_count);
315+
output->buffers = std::move(input_arr->buffers);
316+
317+
// Slice buffers to reduce allocation when casting the offsets buffer
318+
int64_t offset = input_arr->offset;
319+
size_t input_offset_type_size = sizeof(typename I::offset_type);
320+
if (output->null_count != 0 && output->buffers[0]) {
321+
// Avoid reallocation of the validity buffer by allowing some padding bits
322+
output->offset = input_arr->offset % 8;
323+
} else {
324+
output->offset = 0;
325+
}
326+
if (output->buffers[0]) {
327+
output->buffers[0] = SliceBuffer(output->buffers[0], offset / 8);
328+
}
329+
output->buffers[1] = SliceBuffer(output->buffers[1], offset * input_offset_type_size);
330+
331+
return CastBinaryToBinaryOffsets<typename I::offset_type, typename O::offset_type>(
332+
ctx, input, out->array_data().get());
319333
} else {
320-
output->offset = 0;
334+
return ZeroCopyCastExec(ctx, batch, out);
321335
}
322-
323-
return CastBinaryToBinaryOffsets<typename I::offset_type, typename O::offset_type>(
324-
ctx, input, out->array_data().get());
325336
}
326337

327338
// String View -> Offset String

cpp/src/arrow/compute/kernels/scalar_cast_test.cc

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3400,6 +3400,38 @@ TEST(Cast, StringToString) {
34003400
}
34013401
}
34023402

3403+
// GH-43660: casting between utf8 and large_utf8 must give correct results when
// the input array has a nonzero offset (i.e. it is a slice of a larger array).
//
// Every (from_type, to_type) pair is exercised, including the same-type pairs,
// once with an input that contains nulls and once with an input that has none.
TEST(Cast, StringToStringWithOffset) {
  // Both fixtures hold 12 values, so the offsets used below cover a slice
  // starting inside the first validity byte (3), on a byte boundary (8),
  // inside the second validity byte (10), and an empty slice (12).
  const auto input_with_nulls = R"([
    "foo", null, "bar", null, "quu", "foo", "baz", "bar",
    null, "bar", "baz", null
  ])";
  const auto input_no_nulls = R"([
    "foo", "aa", "bar", "bb", "quu", "foo", "baz", "bar",
    "cc", "bar", "baz", "foo"
  ])";

  for (const auto& from_type : {utf8(), large_utf8()}) {
    for (const auto& to_type : {utf8(), large_utf8()}) {
      // The unsliced arrays do not depend on the offset, so build them once
      // per type pair instead of once per offset.
      auto input_arr_with_nulls = ArrayFromJSON(from_type, input_with_nulls);
      auto output_arr_with_nulls = ArrayFromJSON(to_type, input_with_nulls);
      auto input_arr_no_nulls = ArrayFromJSON(from_type, input_no_nulls);
      auto output_arr_no_nulls = ArrayFromJSON(to_type, input_no_nulls);

      for (int64_t offset : {3, 8, 10, 12}) {
        CheckCast(input_arr_with_nulls->Slice(offset),
                  output_arr_with_nulls->Slice(offset));
        CheckCast(input_arr_no_nulls->Slice(offset), output_arr_no_nulls->Slice(offset));
      }
    }
  }
}
3434+
34033435
TEST(Cast, BinaryOrStringToFixedSizeBinary) {
34043436
for (auto in_type :
34053437
{utf8(), large_utf8(), utf8_view(), binary(), binary_view(), large_binary()}) {

0 commit comments

Comments
 (0)