Skip to content

Commit 86166d5

Browse files
GH-43660: [C++][Compute] Avoid ZeroCopyCastExec when casting Binary offset -> Binary offset types (#48171)
### Rationale for this change Casting Binary offset -> Binary offset types relies on ZeroCopyCastExec, which propagates the offset of the input to the output. This can lead to larger allocations than necessary when casting arrays with offsets. See #43660 and #43661 for more context. ### What changes are included in this PR? Ensure output array has a small offset (it can still be non-zero since reusing the null bitmap requires in_offset % 8 == out_offset % 8) ### Are these changes tested? Ran unit tests and benchmarked locally. ### Are there any user-facing changes? No * GitHub Issue: #43660 Authored-by: Scott Routledge <[email protected]> Signed-off-by: Rossi Sun <[email protected]>
1 parent c58bfe5 commit 86166d5

File tree

2 files changed

+69
-3
lines changed

2 files changed

+69
-3
lines changed

cpp/src/arrow/compute/kernels/scalar_cast_string.cc

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,14 @@
2020

2121
#include "arrow/array/array_base.h"
2222
#include "arrow/array/builder_binary.h"
23+
#include "arrow/buffer.h"
2324
#include "arrow/compute/kernels/base_arithmetic_internal.h"
2425
#include "arrow/compute/kernels/codegen_internal.h"
2526
#include "arrow/compute/kernels/common_internal.h"
2627
#include "arrow/compute/kernels/scalar_cast_internal.h"
2728
#include "arrow/compute/kernels/temporal_internal.h"
2829
#include "arrow/result.h"
30+
#include "arrow/status.h"
2931
#include "arrow/type.h"
3032
#include "arrow/type_traits.h"
3133
#include "arrow/util/formatting.h"
@@ -304,10 +306,34 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou
304306
}
305307
}
306308

307-
// Start with a zero-copy cast, but change indices to expected size
309+
// Start with a zero-copy cast, but change indices to the correct size and set validity
310+
// bitmap and offset if needed.
308311
RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out));
309-
return CastBinaryToBinaryOffsets<typename I::offset_type, typename O::offset_type>(
310-
ctx, input, out->array_data().get());
312+
if constexpr (sizeof(typename I::offset_type) != sizeof(typename O::offset_type)) {
313+
std::shared_ptr<ArrayData> input_arr = input.ToArrayData();
314+
ArrayData* output = out->array_data().get();
315+
316+
// Slice buffers to minimize the output's offset. We need a small offset because
317+
// CastBinaryToBinaryOffsets() will reallocate the offsets buffer with size
318+
// (out_length + out_offset + 1) * sizeof(offset_type).
319+
int64_t input_offset = input_arr->offset;
320+
size_t input_offset_type_size = sizeof(typename I::offset_type);
321+
if (output->null_count != 0 && output->buffers[0]) {
322+
// Avoid reallocation of the validity buffer by allowing some padding bits
323+
output->offset = input_offset % 8;
324+
} else {
325+
output->offset = 0;
326+
}
327+
if (output->buffers[0]) {
328+
output->buffers[0] = SliceBuffer(output->buffers[0], input_offset / 8);
329+
}
330+
output->buffers[1] = SliceBuffer(
331+
output->buffers[1], (input_offset - output->offset) * input_offset_type_size);
332+
333+
return CastBinaryToBinaryOffsets<typename I::offset_type, typename O::offset_type>(
334+
ctx, input, out->array_data().get());
335+
}
336+
return Status::OK();
311337
}
312338

313339
// String View -> Offset String

cpp/src/arrow/compute/kernels/scalar_cast_test.cc

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3400,6 +3400,46 @@ TEST(Cast, StringToString) {
34003400
}
34013401
}
34023402

3403+
TEST(Cast, StringToStringWithOffset) {
3404+
// GH-43660: Check casting String Arrays with nonzero offset
3405+
std::vector<int64_t> offsets = {3, 8, 10, 12};
3406+
std::vector<int64_t> lengths = {5, 2, 1, 0};
3407+
3408+
for (auto from_type : {utf8(), large_utf8()}) {
3409+
for (auto to_type : {utf8(), large_utf8()}) {
3410+
for (size_t i = 0; i < offsets.size(); ++i) {
3411+
auto offset = offsets[i];
3412+
auto length = lengths[i];
3413+
3414+
auto input_with_nulls = R"([
3415+
"foo", null, "bar", null, "quu", "foo", "baz", "bar",
3416+
null, "bar", "baz", null
3417+
])";
3418+
3419+
auto input_arr_with_nulls = ArrayFromJSON(from_type, input_with_nulls);
3420+
auto output_arr_with_nulls = ArrayFromJSON(to_type, input_with_nulls);
3421+
CheckCast(input_arr_with_nulls->Slice(offset),
3422+
output_arr_with_nulls->Slice(offset));
3423+
// Slice with length
3424+
CheckCast(input_arr_with_nulls->Slice(offset, length),
3425+
output_arr_with_nulls->Slice(offset, length));
3426+
3427+
auto input_no_nulls = R"([
3428+
"foo", "aa", "bar", "bb", "quu", "foo", "baz", "bar",
3429+
"cc", "bar", "baz", "foo"
3430+
])";
3431+
3432+
auto input_arr_no_nulls = ArrayFromJSON(from_type, input_no_nulls);
3433+
auto output_arr_no_nulls = ArrayFromJSON(to_type, input_no_nulls);
3434+
CheckCast(input_arr_no_nulls->Slice(offset), output_arr_no_nulls->Slice(offset));
3435+
// Slice with length
3436+
CheckCast(input_arr_no_nulls->Slice(offset, length),
3437+
output_arr_no_nulls->Slice(offset, length));
3438+
}
3439+
}
3440+
}
3441+
}
3442+
34033443
TEST(Cast, BinaryOrStringToFixedSizeBinary) {
34043444
for (auto in_type :
34053445
{utf8(), large_utf8(), utf8_view(), binary(), binary_view(), large_binary()}) {

0 commit comments

Comments
 (0)