Skip to content

Commit cfa00ba

Browse files
committed
Obtain compressed size and number of elements from page header
1 parent e56c877 commit cfa00ba

File tree

3 files changed

+23
-66
lines changed

3 files changed

+23
-66
lines changed

cpp/src/arrow/util/alp/alp_wrapper.cc

Lines changed: 17 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -37,32 +37,28 @@ namespace {
3737

3838
/// \brief Header structure for ALP compression blocks
3939
///
40-
/// Contains metadata required to decompress the data.
40+
/// Contains metadata required to decompress the data. Note that compressed_size
41+
/// and num_elements are NOT stored in this block header; the caller reads them from
42+
/// the page header and passes them to Decode() as comp_size and num_elements.
4143
///
4244
/// Serialization format (version 1):
4345
///
4446
/// +---------------------------------------------------+
45-
/// | CompressionBlockHeader (40 bytes) |
47+
/// | CompressionBlockHeader (24 bytes) |
4648
/// +---------------------------------------------------+
4749
/// | Offset | Field | Size |
4850
/// +---------+---------------------+-------------------+
4951
/// | 0 | version | 8 bytes (uint64) |
50-
/// | 8 | compressed_size | 8 bytes (uint64) |
51-
/// | 16 | num_elements | 8 bytes (uint64) |
52-
/// | 24 | vector_size | 8 bytes (uint64) |
53-
/// | 32 | compression_mode | 4 bytes (enum) |
54-
/// | 36 | bit_pack_layout | 4 bytes (enum) |
52+
/// | 8 | vector_size | 8 bytes (uint64) |
53+
/// | 16 | compression_mode | 4 bytes (enum) |
54+
/// | 20 | bit_pack_layout | 4 bytes (enum) |
5555
/// +---------------------------------------------------+
5656
///
5757
/// \note version must remain the first field to allow reading the rest
5858
/// of the header based on version number.
5959
struct CompressionBlockHeader {
6060
/// Version number. Must remain the first field for version-based parsing.
6161
uint64_t version = 0;
62-
/// Size of the compressed data in bytes (includes header).
63-
uint64_t compressed_size = 0;
64-
/// Number of elements in the compressed data.
65-
uint64_t num_elements = 0;
6662
/// Vector size used for compression.
6763
/// Must be AlpConstants::kAlpVectorSize for decompression.
6864
uint64_t vector_size = 0;
@@ -78,8 +74,8 @@ struct CompressionBlockHeader {
7874
static size_t GetSizeForVersion(uint64_t v) {
7975
size_t size;
8076
if (v == 1) {
81-
size = sizeof(version) + sizeof(compressed_size) + sizeof(num_elements) +
82-
sizeof(vector_size) + sizeof(compression_mode) + sizeof(bit_pack_layout);
77+
size = sizeof(version) + sizeof(vector_size) + sizeof(compression_mode) +
78+
sizeof(bit_pack_layout);
8379
} else {
8480
ARROW_CHECK(false) << "unknown_version: " << v;
8581
}
@@ -151,33 +147,24 @@ void AlpWrapper<T>::Encode(const T* decomp, size_t decomp_size, char* comp,
151147

152148
CompressionBlockHeader header{};
153149
header.version = version;
154-
header.compressed_size =
155-
::arrow::util::alp::CompressionBlockHeader::GetSizeForVersion(version) +
156-
compression_progress.num_compressed_bytes_produced;
157-
header.num_elements = decomp_size / sizeof(T);
158150
header.vector_size = AlpConstants::kAlpVectorSize;
159151
header.compression_mode = AlpMode::kAlp;
160152
header.bit_pack_layout = AlpBitPackLayout::kNormal;
161153

162154
std::memcpy(encoded_header, &header,
163155
::arrow::util::alp::CompressionBlockHeader::GetSizeForVersion(version));
164-
*comp_size = header.compressed_size;
156+
*comp_size = ::arrow::util::alp::CompressionBlockHeader::GetSizeForVersion(version) +
157+
compression_progress.num_compressed_bytes_produced;
165158
}
166159

167160
template <typename T>
168161
template <typename TargetType>
169-
void AlpWrapper<T>::Decode(TargetType* decomp, size_t* decomp_size, const char* comp,
162+
void AlpWrapper<T>::Decode(TargetType* decomp, uint64_t num_elements, const char* comp,
170163
size_t comp_size) {
171164
const CompressionBlockHeader header = LoadHeader(comp, comp_size);
172165
ARROW_CHECK(header.vector_size == AlpConstants::kAlpVectorSize)
173166
<< "unsupported_vector_size: " << header.vector_size;
174167

175-
if (header.num_elements * sizeof(TargetType) > *decomp_size) {
176-
*decomp_size = 0;
177-
return;
178-
}
179-
180-
const uint64_t elements_to_decode = header.num_elements;
181168
const char* compression_body =
182169
comp + ::arrow::util::alp::CompressionBlockHeader::GetSizeForVersion(header.version);
183170
const uint64_t compression_body_size =
@@ -186,34 +173,17 @@ void AlpWrapper<T>::Decode(TargetType* decomp, size_t* decomp_size, const char*
186173

187174
ARROW_CHECK(header.compression_mode == AlpMode::kAlp) << "alp_decode_unsupported_mode";
188175

189-
uint64_t elements_decoded =
190-
DecodeAlp(decomp, elements_to_decode, compression_body, compression_body_size,
191-
header.bit_pack_layout)
192-
.num_decompressed_elements_produced;
193-
*decomp_size = elements_decoded * sizeof(TargetType);
176+
DecodeAlp<TargetType>(decomp, num_elements, compression_body, compression_body_size,
177+
header.bit_pack_layout);
194178
}
195179

196-
template void AlpWrapper<float>::Decode(float* decomp, size_t* decomp_size,
180+
template void AlpWrapper<float>::Decode(float* decomp, uint64_t num_elements,
197181
const char* comp, size_t comp_size);
198-
template void AlpWrapper<float>::Decode(double* decomp, size_t* decomp_size,
182+
template void AlpWrapper<float>::Decode(double* decomp, uint64_t num_elements,
199183
const char* comp, size_t comp_size);
200-
template void AlpWrapper<double>::Decode(double* decomp, size_t* decomp_size,
184+
template void AlpWrapper<double>::Decode(double* decomp, uint64_t num_elements,
201185
const char* comp, size_t comp_size);
202186

203-
template <typename T>
204-
template <typename TargetType>
205-
uint64_t AlpWrapper<T>::GetDecompressedSize(const char* comp, uint64_t comp_size) {
206-
const CompressionBlockHeader header = LoadHeader(comp, comp_size);
207-
return header.num_elements * sizeof(TargetType);
208-
}
209-
210-
template uint64_t AlpWrapper<float>::GetDecompressedSize<float>(const char* comp,
211-
uint64_t comp_size);
212-
template uint64_t AlpWrapper<float>::GetDecompressedSize<double>(const char* comp,
213-
uint64_t comp_size);
214-
template uint64_t AlpWrapper<double>::GetDecompressedSize<double>(const char* comp,
215-
uint64_t comp_size);
216-
217187
template <typename T>
218188
uint64_t AlpWrapper<T>::GetMaxCompressedSize(uint64_t decomp_size) {
219189
ARROW_CHECK(decomp_size % sizeof(T) == 0)

cpp/src/arrow/util/alp/alp_wrapper.h

Lines changed: 5 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -63,30 +63,17 @@ class AlpWrapper {
6363
/// \brief Decode floating point values
6464
///
6565
/// \param[out] decomp pointer to the memory region we will decode into.
66-
/// The caller is responsible for ensuring this is big enough.
67-
/// \param[in,out] decomp_size the actual size of decoded data in bytes,
68-
/// expects the decomp size as input.
66+
/// The caller is responsible for ensuring this is big enough
67+
/// to hold num_elements values.
68+
/// \param[in] num_elements number of elements to decode (from page header)
6969
/// \param[in] comp pointer to the input that is to be decoded
70-
/// \param[in] comp_size size of the input in bytes.
70+
/// \param[in] comp_size size of the input in bytes (from page header)
7171
/// \tparam TargetType the type that is used to store the output.
7272
/// Must not be a narrowing conversion from T.
7373
template <typename TargetType>
74-
static void Decode(TargetType* decomp, size_t* decomp_size, const char* comp,
74+
static void Decode(TargetType* decomp, uint64_t num_elements, const char* comp,
7575
size_t comp_size);
7676

77-
/// \brief Get the decompressed size of a compression block
78-
///
79-
/// Get the size of a compression block encoded previously with
80-
/// AlpWrapper::Encode().
81-
///
82-
/// \param[in] comp start of the memory region containing the compression block
83-
/// \param[in] comp_size size of the compression block
84-
/// \return the decompressed size of the block, in bytes
85-
/// \tparam TargetType the type that is used to store the output.
86-
/// May not be a narrowing conversion from T.
87-
template <typename TargetType>
88-
static uint64_t GetDecompressedSize(const char* comp, uint64_t comp_size);
89-
9077
/// \brief Get the maximum compressed size of an uncompressed buffer
9178
///
9279
/// \param[in] decomp_size the size of the uncompressed buffer in bytes

cpp/src/parquet/encoding_alp_benchmark.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
#include <benchmark/benchmark.h>
3232

3333
#include "arrow/buffer.h"
34-
#include "arrow/util/alp/AlpWrapper.h"
34+
#include "arrow/util/alp/alp_wrapper.h"
3535
#include "arrow/util/compression.h"
3636
#include "parquet/encoding.h"
3737
#include "parquet/schema.h"

0 commit comments

Comments
 (0)