Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions cpp/src/parquet/bloom_filter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ void BlockSplitBloomFilter::Init(const uint8_t* bitset, uint32_t num_bytes) {

num_bytes_ = num_bytes;
PARQUET_ASSIGN_OR_THROW(data_, ::arrow::AllocateBuffer(num_bytes_, pool_));
memcpy(data_->mutable_data(), bitset, num_bytes_);
std::memcpy(data_->mutable_data(), bitset, num_bytes_);

this->hasher_ = std::make_unique<XxHasher>();
}
Expand Down Expand Up @@ -157,10 +157,8 @@ BlockSplitBloomFilter BlockSplitBloomFilter::Deserialize(
auto buffer = AllocateBuffer(properties.memory_pool(), bloom_filter_size);

const auto bloom_filter_bytes_in_header = header_buf->size() - header_size;
if (bloom_filter_bytes_in_header > 0) {
std::memcpy(buffer->mutable_data(), header_buf->data() + header_size,
static_cast<size_t>(bloom_filter_bytes_in_header));
}
SafeMemcpy(buffer->mutable_data(), header_buf->data() + header_size,
static_cast<size_t>(bloom_filter_bytes_in_header));

const auto required_read_size = bloom_filter_size - bloom_filter_bytes_in_header;
PARQUET_ASSIGN_OR_THROW(
Expand Down
8 changes: 3 additions & 5 deletions cpp/src/parquet/column_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -559,11 +559,9 @@ std::shared_ptr<Buffer> SerializedPageReader::DecompressIfNeeded(
PARQUET_THROW_NOT_OK(
decompression_buffer_->Resize(uncompressed_len, /*shrink_to_fit=*/false));

if (levels_byte_len > 0) {
// First copy the levels as-is
uint8_t* decompressed = decompression_buffer_->mutable_data();
memcpy(decompressed, page_buffer->data(), levels_byte_len);
}
// First copy the levels as-is
uint8_t* decompressed = decompression_buffer_->mutable_data();
SafeMemcpy(decompressed, page_buffer->data(), levels_byte_len);

// GH-31992: DataPageV2 may store only levels and no values when all
// values are null. In this case, Parquet java is known to produce a
Expand Down
10 changes: 5 additions & 5 deletions cpp/src/parquet/column_writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -919,13 +919,13 @@ class ColumnWriterImpl {
void ConcatenateBuffers(int64_t definition_levels_rle_size,
int64_t repetition_levels_rle_size,
const std::shared_ptr<Buffer>& values, uint8_t* combined) {
memcpy(combined, repetition_levels_rle_->data(),
static_cast<size_t>(repetition_levels_rle_size));
SafeMemcpy(combined, repetition_levels_rle_->data(),
static_cast<size_t>(repetition_levels_rle_size));
combined += repetition_levels_rle_size;
memcpy(combined, definition_levels_rle_->data(),
static_cast<size_t>(definition_levels_rle_size));
SafeMemcpy(combined, definition_levels_rle_->data(),
static_cast<size_t>(definition_levels_rle_size));
combined += definition_levels_rle_size;
memcpy(combined, values->data(), static_cast<size_t>(values->size()));
SafeMemcpy(combined, values->data(), static_cast<size_t>(values->size()));
}
};

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/parquet/column_writer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -536,7 +536,7 @@ void TestPrimitiveWriter<ByteArrayType>::ReadColumnFully(Compression::type compr
uint8_t* data_ptr = data.data();
for (int64_t i = 0; i < values_read_recently; i++) {
const ByteArray& value = this->values_out_ptr_[i + values_read_];
memcpy(data_ptr, value.ptr, value.len);
SafeMemcpy(data_ptr, value.ptr, value.len);
this->values_out_[i + values_read_].ptr = data_ptr;
data_ptr += value.len;
}
Expand Down
15 changes: 6 additions & 9 deletions cpp/src/parquet/decoder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -458,10 +458,7 @@ inline int DecodePlain(const uint8_t* data, int64_t data_size, int num_values,
if (bytes_to_decode > data_size || bytes_to_decode > INT_MAX) {
ParquetException::EofException();
}
// If bytes_to_decode == 0, data could be null
if (bytes_to_decode > 0) {
memcpy(out, data, static_cast<size_t>(bytes_to_decode));
}
SafeMemcpy(out, data, static_cast<size_t>(bytes_to_decode));
return static_cast<int>(bytes_to_decode);
}

Expand Down Expand Up @@ -675,7 +672,7 @@ inline int PlainDecoder<FLBAType>::DecodeArrow(

// 1. Copy directly into the FixedSizeBinary data buffer, packed to the right.
uint8_t* decode_out = builder->GetMutableValue(builder->length() + null_count);
memcpy(decode_out, data_, values_decoded * byte_width);
SafeMemcpy(decode_out, data_, values_decoded * byte_width);

// 2. Expand the values into their final positions.
if (null_count == 0) {
Expand Down Expand Up @@ -1058,7 +1055,7 @@ void DictDecoderImpl<ByteArrayType>::SetDict(TypedDecoder<ByteArrayType>* dictio
uint8_t* bytes_data = byte_array_data_->mutable_data();
int32_t* bytes_offsets = byte_array_offsets_->mutable_data_as<int32_t>();
for (int i = 0; i < dictionary_length_; ++i) {
memcpy(bytes_data + offset, dict_values[i].ptr, dict_values[i].len);
SafeMemcpy(bytes_data + offset, dict_values[i].ptr, dict_values[i].len);
bytes_offsets[i] = offset;
dict_values[i].ptr = bytes_data + offset;
offset += dict_values[i].len;
Expand All @@ -1079,7 +1076,7 @@ inline void DictDecoderImpl<FLBAType>::SetDict(TypedDecoder<FLBAType>* dictionar
/*shrink_to_fit=*/false));
uint8_t* bytes_data = byte_array_data_->mutable_data();
for (int32_t i = 0, offset = 0; i < dictionary_length_; ++i, offset += fixed_len) {
memcpy(bytes_data + offset, dict_values[i].ptr, fixed_len);
std::memcpy(bytes_data + offset, dict_values[i].ptr, fixed_len);
dict_values[i].ptr = bytes_data + offset;
}
}
Expand Down Expand Up @@ -2017,9 +2014,9 @@ class DeltaByteArrayDecoderImpl : public TypedDecoderImpl<DType> {
// Both prefix and suffix are non-empty, so we need to decode the string
// into `data_ptr`.
// 1. Copy the prefix
memcpy(*data_ptr, prefix->data(), prefix_len_ptr[i]);
SafeMemcpy(*data_ptr, prefix->data(), prefix_len_ptr[i]);
// 2. Copy the suffix.
memcpy(*data_ptr + prefix_len_ptr[i], buffer[i].ptr, buffer[i].len);
SafeMemcpy(*data_ptr + prefix_len_ptr[i], buffer[i].ptr, buffer[i].len);
// 3. Point buffer[i] to the decoded string.
buffer[i].ptr = *data_ptr;
buffer[i].len += prefix_len_ptr[i];
Expand Down
10 changes: 5 additions & 5 deletions cpp/src/parquet/encoder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -658,9 +658,9 @@ template <>
void DictEncoderImpl<ByteArrayType>::WriteDict(uint8_t* buffer) const {
memo_table_.VisitValues(0, [&buffer](::std::string_view v) {
uint32_t len = static_cast<uint32_t>(v.length());
memcpy(buffer, &len, sizeof(len));
std::memcpy(buffer, &len, sizeof(len));
buffer += sizeof(len);
memcpy(buffer, v.data(), len);
SafeMemcpy(buffer, v.data(), len);
buffer += len;
});
}
Expand All @@ -669,7 +669,7 @@ template <>
void DictEncoderImpl<FLBAType>::WriteDict(uint8_t* buffer) const {
memo_table_.VisitValues(0, [&](::std::string_view v) {
DCHECK_EQ(v.length(), static_cast<size_t>(type_length_));
memcpy(buffer, v.data(), type_length_);
std::memcpy(buffer, v.data(), type_length_);
buffer += type_length_;
});
}
Expand Down Expand Up @@ -1216,8 +1216,8 @@ std::shared_ptr<Buffer> DeltaBitPackEncoder<DType>::FlushValues() {
// and data was written immediately after. We now write the header data immediately
// before the end of reserved space.
const size_t offset_bytes = kMaxPageHeaderWriterSize - header_writer.bytes_written();
std::memcpy(buffer->mutable_data() + offset_bytes, header_buffer_,
header_writer.bytes_written());
SafeMemcpy(buffer->mutable_data() + offset_bytes, header_buffer_,
header_writer.bytes_written());

// Reset counter of cached values
total_value_count_ = 0;
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/parquet/statistics.cc
Original file line number Diff line number Diff line change
Expand Up @@ -890,7 +890,7 @@ inline void TypedStatisticsImpl<ByteArrayType>::Copy(const ByteArray& src, ByteA
ResizableBuffer* buffer) {
if (dst->ptr == src.ptr) return;
PARQUET_THROW_NOT_OK(buffer->Resize(src.len, false));
std::memcpy(buffer->mutable_data(), src.ptr, src.len);
SafeMemcpy(buffer->mutable_data(), src.ptr, src.len);
*dst = ByteArray(src.len, buffer->data());
}

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/parquet/statistics_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -523,7 +523,7 @@ std::vector<ByteArray> TestStatistics<ByteArrayType>::GetDeepCopy(
for (const ByteArray& ba : values) {
uint8_t* ptr;
PARQUET_THROW_NOT_OK(pool->Allocate(ba.len, &ptr));
memcpy(ptr, ba.ptr, ba.len);
SafeMemcpy(ptr, ba.ptr, ba.len);
copy.emplace_back(ba.len, ptr);
}
return copy;
Expand Down
10 changes: 10 additions & 0 deletions cpp/src/parquet/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -704,6 +704,16 @@ static inline std::string ByteArrayToString(const ByteArray& a) {
return std::string(reinterpret_cast<const char*>(a.ptr), a.len);
}

/// \brief Safe memcpy that avoids undefined behavior when src is nullptr.
///
/// According to the C++ standard, calling std::memcpy with a nullptr is undefined
/// behavior even when size is 0. This function guards against that case.
static inline void SafeMemcpy(void* dest, const void* src, size_t size) {
if (size > 0) {
std::memcpy(dest, src, size);
}
}

static inline void Int96SetNanoSeconds(parquet::Int96& i96, int64_t nanoseconds) {
std::memcpy(&i96.value, &nanoseconds, sizeof(nanoseconds));
}
Expand Down