Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 28 additions & 4 deletions cpp/src/arrow/compute/kernels/scalar_cast_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

#include "arrow/array/array_base.h"
#include "arrow/array/builder_binary.h"
#include "arrow/buffer.h"
#include "arrow/compute/kernels/base_arithmetic_internal.h"
#include "arrow/compute/kernels/codegen_internal.h"
#include "arrow/compute/kernels/common_internal.h"
Expand Down Expand Up @@ -304,10 +305,33 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou
}
}

// Start with a zero-copy cast, but change indices to expected size
RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out));
return CastBinaryToBinaryOffsets<typename I::offset_type, typename O::offset_type>(
ctx, input, out->array_data().get());
if constexpr (sizeof(typename I::offset_type) != sizeof(typename O::offset_type)) {
std::shared_ptr<ArrayData> input_arr = input.ToArrayData();
ArrayData* output = out->array_data().get();
output->length = input_arr->length;
// output->offset is set below
output->SetNullCount(input_arr->null_count);
output->buffers = std::move(input_arr->buffers);

// Slice buffers to reduce allocation when casting the offsets buffer
int64_t offset = input_arr->offset;
size_t input_offset_type_size = sizeof(typename I::offset_type);
if (output->null_count != 0 && output->buffers[0]) {
// Avoid reallocation of the validity buffer by allowing some padding bits
output->offset = input_arr->offset % 8;
} else {
output->offset = 0;
}
if (output->buffers[0]) {
output->buffers[0] = SliceBuffer(output->buffers[0], offset / 8);
}
output->buffers[1] = SliceBuffer(output->buffers[1], offset * input_offset_type_size);
Copy link
Contributor Author

@scott-routledge2 scott-routledge2 Dec 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe in order for this slicing logic to be fully correct it needs to take into account output->offset when doing the slicing as well as the minimum size requirements. So it would look something like

int64_t offset_buffer_offset = (input_offset - output_offset) * offset_type_size
int64_t offset_buffer_length = (length + output_offset + 1) * offset_type_size
if (offset_buffer_length < minimum_size) {
  // update the output->offset so that it's now == minimum size
}

?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The current code seems correct to me. I'm not sure what the concern is here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When I was testing the buffer slicing logic here (for example, casting a slice with length 3 and offset 1) I was seeing an error that looked like:

'datum.make_array()->ValidateFull()' failed with Invalid: Offsets buffer size (bytes): 16 isn't large enough for length: 3 and offset: 1

Since I think technically the offsets buffer needs to be big enough to hold offset + length + 1 elements whereas the way this is written it will only hold length + 1 many elements. However, because of the code structure here, this path isn't actually reachable since we always reallocate the offsets buffer with the correct size in CastBinaryToBinaryOffsets anyways.


return CastBinaryToBinaryOffsets<typename I::offset_type, typename O::offset_type>(
ctx, input, out->array_data().get());
} else {
return ZeroCopyCastExec(ctx, batch, out);
}
}

// String View -> Offset String
Expand Down
32 changes: 32 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_cast_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3400,6 +3400,38 @@ TEST(Cast, StringToString) {
}
}

TEST(Cast, StringToStringWithOffset) {
  // GH-43660: casting sliced (nonzero-offset) string arrays between offset
  // widths (utf8 <-> large_utf8) must produce results equal to casting the
  // equivalent unsliced data, for both nullable and all-valid inputs.
  for (auto from_type : {utf8(), large_utf8()}) {
    for (auto to_type : {utf8(), large_utf8()}) {
      // Offsets straddle byte boundaries (3, 10, 12) and land exactly on one
      // (8) to exercise the validity-bitmap slicing/padding logic.
      for (int64_t offset : {3, 8, 10, 12}) {
        // Input with nulls: exercises the path that keeps a sliced validity
        // buffer (with padding bits) rather than reallocating it.
        auto input_with_nulls = R"([
"foo", null, "bar", null, "quu", "foo", "baz", "bar",
null, "bar", "baz", null
])";

        auto input_arr_with_nulls = ArrayFromJSON(from_type, input_with_nulls);
        auto output_arr_with_nulls = ArrayFromJSON(to_type, input_with_nulls);
        CheckCast(input_arr_with_nulls->Slice(offset),
                  output_arr_with_nulls->Slice(offset));

        // Input without nulls: exercises the path with no validity buffer,
        // where the output offset can be reset to zero.
        auto input_no_nulls = R"([
"foo", "aa", "bar", "bb", "quu", "foo", "baz", "bar",
"cc", "bar", "baz", "foo"
])";

        auto input_arr_no_nulls = ArrayFromJSON(from_type, input_no_nulls);
        auto output_arr_no_nulls = ArrayFromJSON(to_type, input_no_nulls);
        CheckCast(input_arr_no_nulls->Slice(offset), output_arr_no_nulls->Slice(offset));
      }
    }
  }
}

TEST(Cast, BinaryOrStringToFixedSizeBinary) {
for (auto in_type :
{utf8(), large_utf8(), utf8_view(), binary(), binary_view(), large_binary()}) {
Expand Down
Loading