Skip to content

Commit 5f12de2

Browse files
Anakin100100pitrou
andauthored
apacheGH-49310: [C++][Compute] Fix segmentation fault in pyarrow.compute.if_else (apache#49375)
### Rationale for this change Fixing apache#49310 ### What changes are included in this PR? I added a check for the if else compute kernels that verify that the result will fit in the offset of the result. The check asserts that no matter which values are chosen from either the left or right arrays/scalars there will be no overflow in order to use the UnsafeAppend which skips almost all safety checks. ### Are these changes tested? Yes, There are tests for each of the kernels that include an array that the values that would overflow the 32 bit index trigger the check. These tests don't actually allocate the data to save on time and compute. There are also two tests that verify an overflow for a 32 bit offset type and trigger an error and the second one that ensures that for a data type with a 64 bit offset there is no overflow error. ### Are there any user-facing changes? Yes, if the result may not fit in a type that relies on 32 bit offsets the user will receive a helpful error message instead of a segfault. * GitHub Issue: apache#49310 Lead-authored-by: Paweł Biegun <biegunpawel900@gmail.com> Co-authored-by: Antoine Pitrou <antoine@python.org> Signed-off-by: Antoine Pitrou <antoine@python.org>
1 parent b69ce86 commit 5f12de2

File tree

2 files changed

+107
-2
lines changed

2 files changed

+107
-2
lines changed

cpp/src/arrow/compute/kernels/scalar_if_else.cc

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
// under the License.
1717

1818
#include <cstring>
19+
#include <limits>
1920
#include "arrow/array/builder_nested.h"
2021
#include "arrow/array/builder_primitive.h"
2122
#include "arrow/array/builder_time.h"
@@ -688,6 +689,17 @@ struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
688689
using ArrayType = typename TypeTraits<Type>::ArrayType;
689690
using BuilderType = typename TypeTraits<Type>::BuilderType;
690691

692+
static Status ValidateCapacityForOffsetType(int64_t data_bytes) {
693+
int64_t max_offset = static_cast<int64_t>(std::numeric_limits<OffsetType>::max());
694+
if (data_bytes > max_offset) {
695+
return Status::CapacityError("Result may exceed offset capacity for this type: ",
696+
data_bytes, " > ", max_offset,
697+
". Convert inputs to a type that uses an int64 based "
698+
"offset such as a large_string");
699+
}
700+
return Status::OK();
701+
}
702+
691703
// A - Array, S - Scalar, X = Array/Scalar
692704

693705
// SXX
@@ -712,9 +724,13 @@ struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
712724
const uint8_t* right_data = right.buffers[2].data;
713725

714726
// allocate data buffer conservatively
715-
int64_t data_buff_alloc = left_offsets[left.length] - left_offsets[0] +
716-
right_offsets[right.length] - right_offsets[0];
727+
int64_t data_buff_alloc = (static_cast<int64_t>(left_offsets[left.length]) -
728+
static_cast<int64_t>(left_offsets[0])) +
729+
(static_cast<int64_t>(right_offsets[right.length]) -
730+
static_cast<int64_t>(right_offsets[0]));
717731

732+
// output a nicer error message if the heuristic overflows max capacity
733+
ARROW_RETURN_NOT_OK(ValidateCapacityForOffsetType(data_buff_alloc));
718734
BuilderType builder(ctx->memory_pool());
719735
ARROW_RETURN_NOT_OK(builder.Reserve(cond.length + 1));
720736
ARROW_RETURN_NOT_OK(builder.ReserveData(data_buff_alloc));
@@ -758,6 +774,8 @@ struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
758774
int64_t data_buff_alloc =
759775
left_size * cond.length + right_offsets[right.length] - right_offsets[0];
760776

777+
// output a nicer error message if the heuristic overflows max capacity
778+
ARROW_RETURN_NOT_OK(ValidateCapacityForOffsetType(data_buff_alloc));
761779
BuilderType builder(ctx->memory_pool());
762780
ARROW_RETURN_NOT_OK(builder.Reserve(cond.length + 1));
763781
ARROW_RETURN_NOT_OK(builder.ReserveData(data_buff_alloc));
@@ -798,6 +816,8 @@ struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
798816
int64_t data_buff_alloc =
799817
right_size * cond.length + left_offsets[left.length] - left_offsets[0];
800818

819+
// output a nicer error message if the heuristic overflows max capacity
820+
ARROW_RETURN_NOT_OK(ValidateCapacityForOffsetType(data_buff_alloc));
801821
BuilderType builder(ctx->memory_pool());
802822
ARROW_RETURN_NOT_OK(builder.Reserve(cond.length + 1));
803823
ARROW_RETURN_NOT_OK(builder.ReserveData(data_buff_alloc));

cpp/src/arrow/compute/kernels/scalar_if_else_test.cc

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
#include <limits>
1819
#include <numeric>
1920

2021
#include <gtest/gtest.h>
@@ -608,6 +609,90 @@ TYPED_TEST(TestIfElseBaseBinary, IfElseBaseBinaryRand) {
608609
CheckIfElseOutput(cond, left, right, expected_data);
609610
}
610611

612+
Result<std::shared_ptr<Array>> MakeBinaryArrayWithData(
613+
const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data_buffer) {
614+
// Make a (large-)binary array with a single item backed by the given data
615+
const auto data_length = data_buffer->size();
616+
if (type->id() == Type::STRING || type->id() == Type::BINARY) {
617+
if (data_length > std::numeric_limits<int32_t>::max()) {
618+
return Status::Invalid("data_length exceeds int32 offset range");
619+
}
620+
auto offsets =
621+
Buffer::FromVector(std::vector<int32_t>{0, static_cast<int32_t>(data_length)});
622+
return MakeArray(
623+
ArrayData::Make(type, 1, {nullptr, std::move(offsets), std::move(data_buffer)}));
624+
}
625+
if (type->id() != Type::LARGE_STRING && type->id() != Type::LARGE_BINARY) {
626+
return Status::TypeError("unsupported var-binary type for helper: ", *type);
627+
}
628+
auto offsets = Buffer::FromVector(std::vector<int64_t>{0, data_length});
629+
return MakeArray(
630+
ArrayData::Make(type, 1, {nullptr, std::move(offsets), std::move(data_buffer)}));
631+
}
632+
633+
void CheckIfElseCapacityBehavior(const Datum& cond, const Datum& left, const Datum& right,
634+
bool expect_capacity_error) {
635+
auto maybe_out = CallFunction("if_else", {cond, left, right});
636+
if (expect_capacity_error) {
637+
EXPECT_RAISES_WITH_MESSAGE_THAT(
638+
CapacityError,
639+
::testing::HasSubstr("Result may exceed offset capacity for this type"),
640+
maybe_out);
641+
} else {
642+
ASSERT_OK(maybe_out);
643+
}
644+
}
645+
646+
TEST_F(TestIfElseKernel, LARGE_MEMORY_TEST(IfElseBinaryAAANear2GB)) {
647+
// See GH-49310.
648+
// `kPerSide` is below the capacity limit for a single binary array,
649+
// but trying to allocate twice `kPerSide` would overflow the capacity limit.
650+
constexpr int64_t kPerSide =
651+
static_cast<int64_t>(std::numeric_limits<int32_t>::max() / 2) + 4096;
652+
auto cond = ArrayFromJSON(boolean(), "[true]");
653+
654+
ASSERT_OK_AND_ASSIGN(std::shared_ptr<Buffer> large_buffer, AllocateBuffer(kPerSide));
655+
656+
ASSERT_OK_AND_ASSIGN(auto binary_left, MakeBinaryArrayWithData(binary(), large_buffer));
657+
ASSERT_OK_AND_ASSIGN(auto binary_right,
658+
MakeBinaryArrayWithData(binary(), large_buffer));
659+
// The if_else heuristic for preallocation is too crude and fails with
660+
// a capacity error as it would like to pre-allocate more than 2GiB
661+
// of binary data.
662+
CheckIfElseCapacityBehavior(cond, binary_left, binary_right,
663+
/*expect_capacity_error=*/true);
664+
ASSERT_OK_AND_ASSIGN(auto large_binary_left,
665+
MakeBinaryArrayWithData(large_binary(), large_buffer));
666+
ASSERT_OK_AND_ASSIGN(auto large_binary_right,
667+
MakeBinaryArrayWithData(large_binary(), large_buffer));
668+
CheckIfElseCapacityBehavior(cond, large_binary_left, large_binary_right,
669+
/*expect_capacity_error=*/false);
670+
}
671+
672+
TEST_F(TestIfElseKernel, LARGE_MEMORY_TEST(IfElseBinaryASANear2GB)) {
673+
constexpr int64_t kPerSide =
674+
static_cast<int64_t>(std::numeric_limits<int32_t>::max() / 2) + 4096;
675+
auto cond = ArrayFromJSON(boolean(), "[true]");
676+
677+
ASSERT_OK_AND_ASSIGN(std::shared_ptr<Buffer> large_buffer, AllocateBuffer(kPerSide));
678+
ASSERT_OK_AND_ASSIGN(auto binary_array,
679+
MakeBinaryArrayWithData(binary(), large_buffer));
680+
ASSERT_OK_AND_ASSIGN(auto binary_scalar, MakeScalar(binary(), large_buffer));
681+
CheckIfElseCapacityBehavior(cond, binary_scalar, binary_array,
682+
/*expect_capacity_error=*/true);
683+
CheckIfElseCapacityBehavior(cond, binary_array, binary_scalar,
684+
/*expect_capacity_error=*/true);
685+
686+
ASSERT_OK_AND_ASSIGN(auto large_binary_array,
687+
MakeBinaryArrayWithData(large_binary(), large_buffer));
688+
ASSERT_OK_AND_ASSIGN(auto large_binary_scalar,
689+
MakeScalar(large_binary(), large_buffer));
690+
CheckIfElseCapacityBehavior(cond, large_binary_scalar, large_binary_array,
691+
/*expect_capacity_error=*/false);
692+
CheckIfElseCapacityBehavior(cond, large_binary_array, large_binary_scalar,
693+
/*expect_capacity_error=*/false);
694+
}
695+
611696
TEST_F(TestIfElseKernel, IfElseFSBinary) {
612697
auto type = fixed_size_binary(4);
613698

0 commit comments

Comments
 (0)