Skip to content

Commit 719468b

Browse files
committed
Better pack the compression block header
1 parent a1d11ee commit 719468b

File tree

2 files changed

+45
-41
lines changed

2 files changed

+45
-41
lines changed

cpp/src/arrow/util/alp/alp_constants.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -49,7 +49,7 @@ class AlpConstants {
4949
static constexpr uint64_t kSamplerSampleVectorsPerRowgroup = 8;
5050

5151
/// Version number for the ALP compression format.
52-
static constexpr uint64_t kAlpVersion = 1;
52+
static constexpr uint8_t kAlpVersion = 1;
5353

5454
/// Type used to store exception positions within a compressed vector.
5555
using PositionType = uint16_t;

cpp/src/arrow/util/alp/alp_wrapper.cc

Lines changed: 44 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -44,54 +44,58 @@ namespace {
4444
/// Serialization format (version 1):
4545
///
4646
/// +---------------------------------------------------+
47-
/// | CompressionBlockHeader (24 bytes) |
47+
/// | CompressionBlockHeader (8 bytes) |
4848
/// +---------------------------------------------------+
4949
/// | Offset | Field | Size |
5050
/// +---------+---------------------+-------------------+
51-
/// | 0 | version | 8 bytes (uint64) |
52-
/// | 8 | vector_size | 8 bytes (uint64) |
53-
/// | 16 | compression_mode | 4 bytes (enum) |
54-
/// | 20 | bit_pack_layout | 4 bytes (enum) |
51+
/// | 0 | version | 1 byte (uint8) |
52+
/// | 1 | compression_mode | 1 byte (uint8) |
53+
/// | 2 | bit_pack_layout | 1 byte (uint8) |
54+
/// | 3 | reserved | 1 byte (uint8) |
55+
/// | 4 | vector_size | 4 bytes (uint32) |
5556
/// +---------------------------------------------------+
5657
///
5758
/// \note version must remain the first field to allow reading the rest
5859
/// of the header based on version number.
5960
struct CompressionBlockHeader {
6061
/// Version number. Must remain the first field for version-based parsing.
61-
uint64_t version = 0;
62-
/// Vector size used for compression.
63-
/// Must be AlpConstants::kAlpVectorSize for decompression.
64-
uint64_t vector_size = 0;
62+
uint8_t version = 0;
6563
/// Compression mode (currently only kAlp is supported).
66-
AlpMode compression_mode = AlpMode::kAlp;
64+
uint8_t compression_mode = static_cast<uint8_t>(AlpMode::kAlp);
6765
/// Bit packing layout used for bitpacking.
68-
AlpBitPackLayout bit_pack_layout = AlpBitPackLayout::kNormal;
66+
uint8_t bit_pack_layout = static_cast<uint8_t>(AlpBitPackLayout::kNormal);
67+
/// Reserved for future use (also ensures 4-byte alignment for vector_size).
68+
uint8_t reserved = 0;
69+
/// Vector size used for compression.
70+
/// Must be AlpConstants::kAlpVectorSize for decompression.
71+
uint32_t vector_size = 0;
6972

7073
/// \brief Get the size in bytes of the CompressionBlockHeader for a version
7174
///
7275
/// \param[in] v the version number
7376
/// \return the size in bytes
74-
static size_t GetSizeForVersion(uint64_t v) {
75-
size_t size;
76-
if (v == 1) {
77-
size = sizeof(version) + sizeof(vector_size) + sizeof(compression_mode) +
78-
sizeof(bit_pack_layout);
79-
} else {
80-
ARROW_CHECK(false) << "unknown_version: " << v;
81-
}
82-
return size;
77+
static constexpr size_t GetSizeForVersion(uint8_t v) {
78+
// Version 1 header is 8 bytes
79+
return (v == 1) ? 8 : 0;
8380
}
8481

8582
/// \brief Check whether the given version is valid
8683
///
8784
/// \param[in] v the version to check
8885
/// \return the version if valid, otherwise asserts
89-
static uint64_t IsValidVersion(uint64_t v) {
90-
if (v == 1) {
91-
return v;
92-
}
93-
ARROW_CHECK(false) << "invalid_version: " << v;
94-
return 0; // Unreachable, but silences warning.
86+
static uint8_t IsValidVersion(uint8_t v) {
87+
ARROW_CHECK(v == 1) << "invalid_version: " << static_cast<int>(v);
88+
return v;
89+
}
90+
91+
/// \brief Get the AlpMode enum from the stored uint8_t
92+
AlpMode GetCompressionMode() const {
93+
return static_cast<AlpMode>(compression_mode);
94+
}
95+
96+
/// \brief Get the AlpBitPackLayout enum from the stored uint8_t
97+
AlpBitPackLayout GetBitPackLayout() const {
98+
return static_cast<AlpBitPackLayout>(bit_pack_layout);
9599
}
96100
};
97101

@@ -110,15 +114,14 @@ struct AlpWrapper<T>::CompressionBlockHeader : public ::arrow::util::alp::Compre
110114
template <typename T>
111115
typename AlpWrapper<T>::CompressionBlockHeader AlpWrapper<T>::LoadHeader(
112116
const char* comp, size_t comp_size) {
113-
CompressionBlockHeader header{};
114-
ARROW_CHECK(comp_size > sizeof(header.version))
115-
<< "alp_loadHeader_compSize_too_small_for_header_version";
116-
uint64_t version;
117-
std::memcpy(&version, comp, sizeof(header.version));
117+
ARROW_CHECK(comp_size >= 1) << "alp_loadHeader_compSize_too_small_for_version";
118+
uint8_t version;
119+
std::memcpy(&version, comp, sizeof(version));
118120
CompressionBlockHeader::IsValidVersion(version);
119-
ARROW_CHECK(comp_size >= CompressionBlockHeader::GetSizeForVersion(version))
120-
<< "alp_loadHeader_compSize_too_small";
121-
std::memcpy(&header, comp, CompressionBlockHeader::GetSizeForVersion(version));
121+
const size_t header_size = CompressionBlockHeader::GetSizeForVersion(version);
122+
ARROW_CHECK(comp_size >= header_size) << "alp_loadHeader_compSize_too_small";
123+
CompressionBlockHeader header{};
124+
std::memcpy(&header, comp, header_size);
122125
return header;
123126
}
124127

@@ -127,7 +130,7 @@ void AlpWrapper<T>::Encode(const T* decomp, size_t decomp_size, char* comp,
127130
size_t* comp_size, std::optional<AlpMode> enforce_mode) {
128131
ARROW_CHECK(decomp_size % sizeof(T) == 0) << "alp_encode_input_must_be_multiple_of_T";
129132
const uint64_t element_count = decomp_size / sizeof(T);
130-
const uint64_t version =
133+
const uint8_t version =
131134
CompressionBlockHeader::IsValidVersion(AlpConstants::kAlpVersion);
132135

133136
AlpSampler<T> sampler;
@@ -146,9 +149,9 @@ void AlpWrapper<T>::Encode(const T* decomp, size_t decomp_size, char* comp,
146149

147150
CompressionBlockHeader header{};
148151
header.version = version;
152+
header.compression_mode = static_cast<uint8_t>(AlpMode::kAlp);
153+
header.bit_pack_layout = static_cast<uint8_t>(AlpBitPackLayout::kNormal);
149154
header.vector_size = AlpConstants::kAlpVectorSize;
150-
header.compression_mode = AlpMode::kAlp;
151-
header.bit_pack_layout = AlpBitPackLayout::kNormal;
152155

153156
std::memcpy(encoded_header, &header, header_size);
154157
*comp_size = header_size + compression_progress.num_compressed_bytes_produced;
@@ -166,10 +169,11 @@ void AlpWrapper<T>::Decode(TargetType* decomp, uint64_t num_elements, const char
166169
const char* compression_body = comp + header_size;
167170
const uint64_t compression_body_size = comp_size - header_size;
168171

169-
ARROW_CHECK(header.compression_mode == AlpMode::kAlp) << "alp_decode_unsupported_mode";
172+
ARROW_CHECK(header.GetCompressionMode() == AlpMode::kAlp)
173+
<< "alp_decode_unsupported_mode";
170174

171175
DecodeAlp<TargetType>(decomp, num_elements, compression_body, compression_body_size,
172-
header.bit_pack_layout);
176+
header.GetBitPackLayout());
173177
}
174178

175179
template void AlpWrapper<float>::Decode(float* decomp, uint64_t num_elements,
@@ -184,7 +188,7 @@ uint64_t AlpWrapper<T>::GetMaxCompressedSize(uint64_t decomp_size) {
184188
ARROW_CHECK(decomp_size % sizeof(T) == 0)
185189
<< "alp_decompressed_size_not_multiple_of_T";
186190
const uint64_t element_count = decomp_size / sizeof(T);
187-
const uint64_t version =
191+
const uint8_t version =
188192
CompressionBlockHeader::IsValidVersion(AlpConstants::kAlpVersion);
189193
uint64_t max_alp_size = CompressionBlockHeader::GetSizeForVersion(version);
190194
// Add per-vector header sizes.

0 commit comments

Comments
 (0)