Skip to content

Commit a7eb42b

Browse files
committed
Adapt methods and variable names to arrow style
Also ensure that no line exceeds 90 characters
1 parent c1bfe5d commit a7eb42b

File tree

7 files changed

+833
-734
lines changed

7 files changed

+833
-734
lines changed

cpp/src/arrow/util/alp/Alp.cc

Lines changed: 374 additions & 341 deletions
Large diffs are not rendered by default.

cpp/src/arrow/util/alp/Alp.h

Lines changed: 130 additions & 119 deletions
Large diffs are not rendered by default.

cpp/src/arrow/util/alp/AlpConstants.h

Lines changed: 30 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ namespace alp {
3333
/// \brief Constants used throughout ALP compression
3434
class AlpConstants {
3535
public:
36-
/// Number of elements compressed together as a unit. This value is fixed for compatibility.
36+
/// Number of elements compressed together as a unit. Fixed for compatibility.
3737
static constexpr uint64_t kAlpVectorSize = 1024;
3838

3939
/// Number of elements to use when determining sampling parameters.
@@ -61,15 +61,15 @@ class AlpConstants {
6161
static constexpr uint8_t kMaxCombinations = 5;
6262

6363
/// Loop unroll factor for tight loops in ALP compression/decompression.
64-
/// ALP has multiple tight loops that profit from unrolling. Setting this might affect
65-
/// performance, so benchmarking is recommended. The gains from kLoopUnrolls = 4 are marginal.
64+
/// ALP has multiple tight loops that profit from unrolling. Setting this
65+
/// might affect performance, so benchmarking is recommended.
6666
static constexpr uint64_t kLoopUnrolls = 4;
6767

6868
/// \brief Get power of ten as uint64_t
6969
///
7070
/// \param[in] power the exponent (must be <= 19)
7171
/// \return 10^power as uint64_t
72-
static uint64_t powerOfTenUB8(const uint8_t power) {
72+
static uint64_t PowerOfTenUB8(const uint8_t power) {
7373
ARROW_DCHECK(power <= 19) << "power_out_of_range: " << static_cast<int>(power);
7474
static constexpr uint64_t kTable[20] = {1,
7575
10,
@@ -99,13 +99,15 @@ class AlpConstants {
9999
///
100100
/// \param[in] power the exponent (must be in range [-10, 10])
101101
/// \return 10^power as float
102-
static float powerOfTenFloat(int8_t power) {
103-
ARROW_DCHECK(power >= -10 && power <= 10) << "power_out_of_range: " << static_cast<int>(power);
102+
static float PowerOfTenFloat(int8_t power) {
103+
ARROW_DCHECK(power >= -10 && power <= 10)
104+
<< "power_out_of_range: " << static_cast<int>(power);
104105
static constexpr float kTable[21] = {
105-
0.0000000001F, 0.000000001F, 0.00000001F, 0.0000001F, 0.000001F, 0.00001F,
106-
0.0001F, 0.001F, 0.01F, 0.1F, 1.0F, 10.0F,
107-
100.0F, 1000.0F, 10000.0F, 100000.0F, 1000000.0F, 10000000.0F,
108-
100000000.0F, 1000000000.0F, 10000000000.0F};
106+
0.0000000001F, 0.000000001F, 0.00000001F, 0.0000001F, 0.000001F,
107+
0.00001F, 0.0001F, 0.001F, 0.01F, 0.1F,
108+
1.0F, 10.0F, 100.0F, 1000.0F, 10000.0F,
109+
100000.0F, 1000000.0F, 10000000.0F, 100000000.0F,
110+
1000000000.0F, 10000000000.0F};
109111

110112
return kTable[power + 10];
111113
}
@@ -114,8 +116,9 @@ class AlpConstants {
114116
///
115117
/// \param[in] power the exponent (must be in range [-20, 20])
116118
/// \return 10^power as double
117-
static double powerOfTenDouble(const int8_t power) {
118-
ARROW_DCHECK(power >= -20 && power <= 20) << "power_out_of_range: " << static_cast<int>(power);
119+
static double PowerOfTenDouble(const int8_t power) {
120+
ARROW_DCHECK(power >= -20 && power <= 20)
121+
<< "power_out_of_range: " << static_cast<int>(power);
119122
static constexpr double kTable[41] = {
120123
0.00000000000000000001,
121124
0.0000000000000000001,
@@ -166,7 +169,7 @@ class AlpConstants {
166169
///
167170
/// \param[in] power the exponent
168171
/// \return 10^power as int64_t
169-
static int64_t getFactor(const int8_t power) { return powerOfTenUB8(power); }
172+
static int64_t GetFactor(const int8_t power) { return PowerOfTenUB8(power); }
170173
};
171174

172175
// ----------------------------------------------------------------------
@@ -194,16 +197,19 @@ struct AlpTypedConstants<float> {
194197
///
195198
/// \param[in] power the exponent
196199
/// \return 10^power as float
197-
static float getExponent(const uint8_t power) { return AlpConstants::powerOfTenFloat(power); }
200+
static float GetExponent(const uint8_t power) {
201+
return AlpConstants::PowerOfTenFloat(power);
202+
}
198203

199204
/// \brief Get factor multiplier
200205
///
201206
/// \param[in] power the factor
202207
/// \return 10^(-power) as float
203-
static float getFactor(const uint8_t power) {
204-
// This double cast is necessary since subtraction on int8_t does not necessarily yield an
205-
// int8_t.
206-
return AlpConstants::powerOfTenFloat(static_cast<int8_t>(-static_cast<int8_t>(power)));
208+
static float GetFactor(const uint8_t power) {
209+
// This double cast is necessary since negating an int8_t promotes to int
210+
// and does not yield an int8_t.
211+
return AlpConstants::PowerOfTenFloat(
212+
static_cast<int8_t>(-static_cast<int8_t>(power)));
207213
}
208214

209215
using FloatingToExact = uint32_t;
@@ -228,14 +234,17 @@ class AlpTypedConstants<double> {
228234
///
229235
/// \param[in] power the exponent
230236
/// \return 10^power as double
231-
static double getExponent(const uint8_t power) { return AlpConstants::powerOfTenDouble(power); }
237+
static double GetExponent(const uint8_t power) {
238+
return AlpConstants::PowerOfTenDouble(power);
239+
}
232240

233241
/// \brief Get factor multiplier
234242
///
235243
/// \param[in] power the factor
236244
/// \return 10^(-power) as double
237-
static double getFactor(const uint8_t power) {
238-
return AlpConstants::powerOfTenDouble(static_cast<int8_t>(-static_cast<int8_t>(power)));
245+
static double GetFactor(const uint8_t power) {
246+
return AlpConstants::PowerOfTenDouble(
247+
static_cast<int8_t>(-static_cast<int8_t>(power)));
239248
}
240249

241250
using FloatingToExact = uint64_t;

cpp/src/arrow/util/alp/AlpSampler.cc

Lines changed: 69 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -33,97 +33,107 @@ namespace alp {
3333

3434
template <typename T>
3535
AlpSampler<T>::AlpSampler()
36-
: m_sampleVectorSize(AlpConstants::kSamplerVectorSize),
37-
m_rowgroupSize(AlpConstants::kSamplerRowgroupSize),
38-
m_samplesPerVector(AlpConstants::kSamplerSamplesPerVector),
39-
m_sampleVectorsPerRowgroup(AlpConstants::kSamplerSampleVectorsPerRowgroup),
40-
m_rowgroupSampleJump((m_rowgroupSize / m_sampleVectorsPerRowgroup) / m_sampleVectorSize) {}
36+
: sample_vector_size_(AlpConstants::kSamplerVectorSize),
37+
rowgroup_size_(AlpConstants::kSamplerRowgroupSize),
38+
samples_per_vector_(AlpConstants::kSamplerSamplesPerVector),
39+
sample_vectors_per_rowgroup_(AlpConstants::kSamplerSampleVectorsPerRowgroup),
40+
rowgroup_sample_jump_((rowgroup_size_ / sample_vectors_per_rowgroup_) /
41+
sample_vector_size_) {}
4142

4243
template <typename T>
43-
void AlpSampler<T>::addSample(arrow::util::span<const T> input) {
44-
for (uint64_t i = 0; i < input.size(); i += m_sampleVectorSize) {
45-
const uint64_t elements = std::min(input.size() - i, m_sampleVectorSize);
46-
addSampleVector({input.data() + i, elements});
44+
void AlpSampler<T>::AddSample(arrow::util::span<const T> input) {
45+
for (uint64_t i = 0; i < input.size(); i += sample_vector_size_) {
46+
const uint64_t elements = std::min(input.size() - i, sample_vector_size_);
47+
AddSampleVector({input.data() + i, elements});
4748
}
4849
}
4950

5051
template <typename T>
51-
void AlpSampler<T>::addSampleVector(arrow::util::span<const T> input) {
52-
const bool mustSkipCurrentVector =
53-
mustSkipSamplingFromCurrentVector(m_vectorsCount, m_vectorsSampledCount, input.size());
54-
55-
m_vectorsCount += 1;
56-
m_totalValuesCount += input.size();
57-
if (mustSkipCurrentVector) {
52+
void AlpSampler<T>::AddSampleVector(arrow::util::span<const T> input) {
53+
const bool must_skip_current_vector =
54+
MustSkipSamplingFromCurrentVector(vectors_count_, vectors_sampled_count_,
55+
input.size());
56+
57+
vectors_count_ += 1;
58+
total_values_count_ += input.size();
59+
if (must_skip_current_vector) {
5860
return;
5961
}
6062

61-
const AlpSamplingParameters samplingParams = getAlpSamplingParameters(input.size());
63+
const AlpSamplingParameters sampling_params = GetAlpSamplingParameters(input.size());
6264

63-
// Slice: take first numLookupValue elements.
64-
std::vector<T> currentVectorValues(
65-
input.begin(), input.begin() + std::min<size_t>(samplingParams.numLookupValue, input.size()));
65+
// Slice: take first num_lookup_value elements.
66+
std::vector<T> current_vector_values(
67+
input.begin(),
68+
input.begin() + std::min<size_t>(sampling_params.num_lookup_value, input.size()));
6669

67-
// Stride: take every numSampledIncrements-th element.
68-
std::vector<T> currentVectorSample;
69-
for (size_t i = 0; i < currentVectorValues.size(); i += samplingParams.numSampledIncrements) {
70-
currentVectorSample.push_back(currentVectorValues[i]);
70+
// Stride: take every num_sampled_increments-th element.
71+
std::vector<T> current_vector_sample;
72+
for (size_t i = 0; i < current_vector_values.size();
73+
i += sampling_params.num_sampled_increments) {
74+
current_vector_sample.push_back(current_vector_values[i]);
7175
}
72-
m_sampleStored += currentVectorSample.size();
76+
sample_stored_ += current_vector_sample.size();
7377

74-
m_completeVectorsSampled.push_back(std::move(currentVectorValues));
75-
m_rowgroupSample.push_back(std::move(currentVectorSample));
76-
m_vectorsSampledCount++;
78+
complete_vectors_sampled_.push_back(std::move(current_vector_values));
79+
rowgroup_sample_.push_back(std::move(current_vector_sample));
80+
vectors_sampled_count_++;
7781
}
7882

7983
template <typename T>
80-
typename AlpSampler<T>::AlpSamplerResult AlpSampler<T>::finalize() {
81-
ARROW_LOG(DEBUG) << "AlpSampler finalized: vectorsSampled=" << m_vectorsSampledCount << "/"
82-
<< m_vectorsCount << " total"
83-
<< ", valuesSampled=" << m_sampleStored << "/" << m_totalValuesCount << " total";
84+
typename AlpSampler<T>::AlpSamplerResult AlpSampler<T>::Finalize() {
85+
ARROW_LOG(DEBUG) << "AlpSampler finalized: vectorsSampled=" << vectors_sampled_count_
86+
<< "/" << vectors_count_ << " total"
87+
<< ", valuesSampled=" << sample_stored_ << "/" << total_values_count_
88+
<< " total";
8489

8590
AlpSamplerResult result;
86-
result.alpPreset = AlpCompression<T>::createEncodingPreset(m_rowgroupSample);
91+
result.alp_preset = AlpCompression<T>::CreateEncodingPreset(rowgroup_sample_);
8792

88-
ARROW_LOG(DEBUG) << "AlpSampler preset: " << result.alpPreset.combinations.size()
93+
ARROW_LOG(DEBUG) << "AlpSampler preset: " << result.alp_preset.combinations.size()
8994
<< " exponent/factor combinations"
90-
<< ", estimatedSize=" << result.alpPreset.bestCompressedSize << " bytes";
95+
<< ", estimatedSize=" << result.alp_preset.best_compressed_size
96+
<< " bytes";
9197

9298
return result;
9399
}
94100

95101
template <typename T>
96-
typename AlpSampler<T>::AlpSamplingParameters AlpSampler<T>::getAlpSamplingParameters(
97-
uint64_t numCurrentVectorValues) {
98-
const uint64_t numLookupValues =
99-
std::min(numCurrentVectorValues, static_cast<uint64_t>(AlpConstants::kAlpVectorSize));
100-
// We sample equidistant values within a vector; to do this we jump a fixed number of values.
101-
const uint64_t numSampledIncrements = std::max(
102-
uint64_t{1},
103-
static_cast<uint64_t>(std::ceil(static_cast<double>(numLookupValues) / m_samplesPerVector)));
104-
const uint64_t numSampledValues =
105-
std::ceil(static_cast<double>(numLookupValues) / numSampledIncrements);
106-
107-
ARROW_CHECK(numSampledValues < AlpConstants::kAlpVectorSize) << "alp_sample_too_large";
108-
109-
return AlpSamplingParameters{numLookupValues, numSampledIncrements, numSampledValues};
102+
typename AlpSampler<T>::AlpSamplingParameters AlpSampler<T>::GetAlpSamplingParameters(
103+
uint64_t num_current_vector_values) {
104+
const uint64_t num_lookup_values =
105+
std::min(num_current_vector_values,
106+
static_cast<uint64_t>(AlpConstants::kAlpVectorSize));
107+
// Sample equidistant values within a vector; jump a fixed number of values.
108+
const uint64_t num_sampled_increments =
109+
std::max(uint64_t{1}, static_cast<uint64_t>(std::ceil(
110+
static_cast<double>(num_lookup_values) /
111+
samples_per_vector_)));
112+
const uint64_t num_sampled_values =
113+
std::ceil(static_cast<double>(num_lookup_values) / num_sampled_increments);
114+
115+
ARROW_CHECK(num_sampled_values < AlpConstants::kAlpVectorSize) << "alp_sample_too_large";
116+
117+
return AlpSamplingParameters{num_lookup_values, num_sampled_increments,
118+
num_sampled_values};
110119
}
111120

112121
template <typename T>
113-
bool AlpSampler<T>::mustSkipSamplingFromCurrentVector(const uint64_t vectorsCount,
114-
const uint64_t vectorsSampledCount,
115-
const uint64_t currentVectorNValues) {
116-
// We sample equidistant vectors; to do this we skip a fixed number of vectors.
117-
const bool mustSelectRowgroupSamples = (vectorsCount % m_rowgroupSampleJump) == 0;
118-
119-
// If we are not in the correct jump, we do not take sample from this vector.
120-
if (!mustSelectRowgroupSamples) {
122+
bool AlpSampler<T>::MustSkipSamplingFromCurrentVector(
123+
const uint64_t vectors_count, const uint64_t vectors_sampled_count,
124+
const uint64_t current_vector_n_values) {
125+
// Sample equidistant vectors; skip a fixed number of vectors.
126+
const bool must_select_rowgroup_samples = (vectors_count % rowgroup_sample_jump_) == 0;
127+
128+
// If this vector is not on a jump boundary, do not take a sample from it.
129+
if (!must_select_rowgroup_samples) {
121130
return true;
122131
}
123132

124-
// We do not take samples of non-complete vectors (usually the last one),
133+
// Do not take samples of non-complete vectors (usually the last one),
125134
// except when no vector has been sampled yet (too little data).
126-
if (currentVectorNValues < AlpConstants::kSamplerSamplesPerVector && vectorsSampledCount != 0) {
135+
if (current_vector_n_values < AlpConstants::kSamplerSamplesPerVector &&
136+
vectors_sampled_count != 0) {
127137
return true;
128138
}
129139
return false;

cpp/src/arrow/util/alp/AlpSampler.h

Lines changed: 32 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ namespace alp {
3535
/// \class AlpSampler
3636
/// \brief Collects samples from data to be compressed with ALP
3737
///
38-
/// Usage: Call addSample() or addSampleVector() multiple times to collect samples,
39-
/// then call finalize() to retrieve the resulting preset.
38+
/// Usage: Call AddSample() or AddSampleVector() multiple times to collect
39+
/// samples, then call Finalize() to retrieve the resulting preset.
4040
///
4141
/// \tparam T the floating point type (float or double) to sample
4242
template <typename T>
@@ -47,72 +47,75 @@ class AlpSampler {
4747

4848
/// \brief Helper struct containing the preset for ALP compression
4949
struct AlpSamplerResult {
50-
AlpEncodingPreset alpPreset;
50+
AlpEncodingPreset alp_preset;
5151
};
5252

5353
/// \brief Add a sample of arbitrary size
5454
///
55-
/// The sample is internally separated into vectors on which addSampleVector() is called.
55+
/// The sample is internally separated into vectors on which AddSampleVector()
56+
/// is called.
5657
///
5758
/// \param[in] input the input data to sample from
58-
void addSample(arrow::util::span<const T> input);
59+
void AddSample(arrow::util::span<const T> input);
5960

6061
/// \brief Add a single vector as a sample
6162
///
62-
/// \param[in] input the input vector to add. Size should be <= AlpConstants::kAlpVectorSize.
63-
void addSampleVector(arrow::util::span<const T> input);
63+
/// \param[in] input the input vector to add.
64+
/// Size should be <= AlpConstants::kAlpVectorSize.
65+
void AddSampleVector(arrow::util::span<const T> input);
6466

6567
/// \brief Finalize sampling and generate the encoding preset
6668
///
6769
/// \return an AlpSamplerResult containing the generated encoding preset
68-
AlpSamplerResult finalize();
70+
AlpSamplerResult Finalize();
6971

7072
private:
7173
/// \brief Helper struct to encapsulate settings used for sampling
7274
struct AlpSamplingParameters {
73-
uint64_t numLookupValue;
74-
uint64_t numSampledIncrements;
75-
uint64_t numSampledValues;
75+
uint64_t num_lookup_value;
76+
uint64_t num_sampled_increments;
77+
uint64_t num_sampled_values;
7678
};
7779

7880
/// \brief Calculate sampling parameters for the current vector
7981
///
80-
/// \param[in] numCurrentVectorValues the number of values in the current vector
82+
/// \param[in] num_current_vector_values number of values in current vector
8183
/// \return the sampling parameters to use
82-
AlpSamplingParameters getAlpSamplingParameters(uint64_t numCurrentVectorValues);
84+
AlpSamplingParameters GetAlpSamplingParameters(uint64_t num_current_vector_values);
8385

8486
/// \brief Check if the current vector must be ignored for sampling
8587
///
86-
/// \param[in] vectorsCount the total number of vectors processed so far
87-
/// \param[in] vectorsSampledCount the number of vectors that have been sampled so far
88-
/// \param[in] numCurrentVectorValues the number of values in the current vector
88+
/// \param[in] vectors_count the total number of vectors processed so far
89+
/// \param[in] vectors_sampled_count the number of vectors sampled so far
90+
/// \param[in] num_current_vector_values number of values in current vector
8991
/// \return true if the current vector should be skipped, false otherwise
90-
bool mustSkipSamplingFromCurrentVector(uint64_t vectorsCount, uint64_t vectorsSampledCount,
91-
uint64_t numCurrentVectorValues);
92+
bool MustSkipSamplingFromCurrentVector(uint64_t vectors_count,
93+
uint64_t vectors_sampled_count,
94+
uint64_t num_current_vector_values);
9295

9396
/// Count of vectors that have been sampled
94-
uint64_t m_vectorsSampledCount = 0;
97+
uint64_t vectors_sampled_count_ = 0;
9598
/// Total count of values processed
96-
uint64_t m_totalValuesCount = 0;
99+
uint64_t total_values_count_ = 0;
97100
/// Total count of vectors processed
98-
uint64_t m_vectorsCount = 0;
101+
uint64_t vectors_count_ = 0;
99102
/// Number of samples stored
100-
uint64_t m_sampleStored = 0;
103+
uint64_t sample_stored_ = 0;
101104
/// Samples collected from current rowgroup
102-
std::vector<std::vector<T>> m_rowgroupSample;
105+
std::vector<std::vector<T>> rowgroup_sample_;
103106

104107
/// Complete vectors sampled
105-
std::vector<std::vector<T>> m_completeVectorsSampled;
108+
std::vector<std::vector<T>> complete_vectors_sampled_;
106109
/// Size of each sample vector
107-
const uint64_t m_sampleVectorSize;
110+
const uint64_t sample_vector_size_;
108111
/// Size of each rowgroup
109-
const uint64_t m_rowgroupSize;
112+
const uint64_t rowgroup_size_;
110113
/// Number of samples to take per vector
111-
const uint64_t m_samplesPerVector;
114+
const uint64_t samples_per_vector_;
112115
/// Number of vectors to sample per rowgroup
113-
const uint64_t m_sampleVectorsPerRowgroup;
116+
const uint64_t sample_vectors_per_rowgroup_;
114117
/// Jump interval for rowgroup sampling
115-
const uint64_t m_rowgroupSampleJump;
118+
const uint64_t rowgroup_sample_jump_;
116119
};
117120

118121
} // namespace alp

0 commit comments

Comments
 (0)