Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 29 additions & 3 deletions cpp/src/arrow/compute/kernels/scalar_cast_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,14 @@

#include "arrow/array/array_base.h"
#include "arrow/array/builder_binary.h"
#include "arrow/buffer.h"
#include "arrow/compute/kernels/base_arithmetic_internal.h"
#include "arrow/compute/kernels/codegen_internal.h"
#include "arrow/compute/kernels/common_internal.h"
#include "arrow/compute/kernels/scalar_cast_internal.h"
#include "arrow/compute/kernels/temporal_internal.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/formatting.h"
Expand Down Expand Up @@ -304,10 +306,34 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou
}
}

// Start with a zero-copy cast, but change indices to expected size
// Start with a zero-copy cast, but change indices to the correct size and set validity
// bitmap and offset if needed.
RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out));
return CastBinaryToBinaryOffsets<typename I::offset_type, typename O::offset_type>(
ctx, input, out->array_data().get());
if constexpr (sizeof(typename I::offset_type) != sizeof(typename O::offset_type)) {
std::shared_ptr<ArrayData> input_arr = input.ToArrayData();
ArrayData* output = out->array_data().get();

// Slice buffers to minimize the output's offset. We need a small offset because
// CastBinaryToBinaryOffsets() will reallocate the offsets buffer with size
// (out_length + out_offset + 1) * sizeof(offset_type).
int64_t input_offset = input_arr->offset;
size_t input_offset_type_size = sizeof(typename I::offset_type);
if (output->null_count != 0 && output->buffers[0]) {
// Avoid reallocation of the validity buffer by allowing some padding bits
output->offset = input_offset % 8;
} else {
output->offset = 0;
}
if (output->buffers[0]) {
output->buffers[0] = SliceBuffer(output->buffers[0], input_offset / 8);
}
output->buffers[1] = SliceBuffer(
output->buffers[1], (input_offset - output->offset) * input_offset_type_size);

return CastBinaryToBinaryOffsets<typename I::offset_type, typename O::offset_type>(
ctx, input, out->array_data().get());
}
return Status::OK();
}

// String View -> Offset String
Expand Down
40 changes: 40 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_cast_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3400,6 +3400,46 @@ TEST(Cast, StringToString) {
}
}

// Regression test for GH-43660: casting between utf8 and large_utf8 must produce
// the expected array when the input is a slice with a nonzero offset.
//
// Every (from_type, to_type) pair drawn from {utf8, large_utf8} is exercised on a
// 12-element fixture, once with nulls and once without. For each slice case both
// the open-ended slice (offset only) and the bounded slice (offset + length) are
// cast and compared against the identically sliced expected array.
TEST(Cast, StringToStringWithOffset) {
  // One slice to take from the fixtures. Keeping offset and length together
  // prevents the two values from drifting apart as cases are added.
  struct SliceCase {
    int64_t offset;
    int64_t length;
  };
  // Offsets below, exactly at, and above 8, plus an offset equal to the fixture
  // length (which yields an empty slice). offset + length never exceeds 12.
  const std::vector<SliceCase> slice_cases = {{3, 5}, {8, 2}, {10, 1}, {12, 0}};

  // 12 values, 4 of which are null.
  const auto json_with_nulls = R"([
"foo", null, "bar", null, "quu", "foo", "baz", "bar",
null, "bar", "baz", null
])";

  // Same layout as above with every null replaced by a short string.
  const auto json_no_nulls = R"([
"foo", "aa", "bar", "bb", "quu", "foo", "baz", "bar",
"cc", "bar", "baz", "foo"
])";

  for (const auto& from_type : {utf8(), large_utf8()}) {
    for (const auto& to_type : {utf8(), large_utf8()}) {
      // The fixtures depend only on the type pair, so build them once here
      // instead of once per slice case.
      const auto input_with_nulls = ArrayFromJSON(from_type, json_with_nulls);
      const auto expected_with_nulls = ArrayFromJSON(to_type, json_with_nulls);
      const auto input_no_nulls = ArrayFromJSON(from_type, json_no_nulls);
      const auto expected_no_nulls = ArrayFromJSON(to_type, json_no_nulls);

      for (const auto& slice : slice_cases) {
        // Input containing nulls: slice to the end, then slice with length.
        CheckCast(input_with_nulls->Slice(slice.offset),
                  expected_with_nulls->Slice(slice.offset));
        CheckCast(input_with_nulls->Slice(slice.offset, slice.length),
                  expected_with_nulls->Slice(slice.offset, slice.length));

        // Input without nulls: slice to the end, then slice with length.
        CheckCast(input_no_nulls->Slice(slice.offset),
                  expected_no_nulls->Slice(slice.offset));
        CheckCast(input_no_nulls->Slice(slice.offset, slice.length),
                  expected_no_nulls->Slice(slice.offset, slice.length));
      }
    }
  }
}

TEST(Cast, BinaryOrStringToFixedSizeBinary) {
for (auto in_type :
{utf8(), large_utf8(), utf8_view(), binary(), binary_view(), large_binary()}) {
Expand Down
Loading