apache
diff --git a/cpp/src/arrow/util/alp/Alp.cc b/cpp/src/arrow/util/alp/Alp.cc
Lines changed: 374 additions & 341 deletions
diff --git a/cpp/src/arrow/util/alp/Alp.h b/cpp/src/arrow/util/alp/Alp.h
Lines changed: 130 additions & 119 deletions
diff --git a/cpp/src/arrow/util/alp/AlpConstants.h b/cpp/src/arrow/util/alp/AlpConstants.h
Lines changed: 30 additions & 21 deletions
diff --git a/cpp/src/arrow/util/alp/AlpSampler.cc b/cpp/src/arrow/util/alp/AlpSampler.cc
Lines changed: 69 additions & 59 deletions
diff --git a/cpp/src/arrow/util/alp/AlpSampler.h b/cpp/src/arrow/util/alp/AlpSampler.h
Lines changed: 32 additions & 29 deletions
@@ -33,7 +33,7 @@ namespace alp {
 /// \brief Constants used throughout ALP compression
 class AlpConstants {
  public:
-  /// Number of elements compressed together as a unit. This value is fixed for compatibility.
+  /// Number of elements compressed together as a unit. Fixed for compatibility.
   static constexpr uint64_t kAlpVectorSize = 1024;
 
   /// Number of elements to use when determining sampling parameters.
@@ -61,15 +61,15 @@ class AlpConstants {
   static constexpr uint8_t kMaxCombinations = 5;
 
   /// Loop unroll factor for tight loops in ALP compression/decompression.
-  /// ALP has multiple tight loops that profit from unrolling. Setting this might affect
-  /// performance, so benchmarking is recommended. The gains from kLoopUnrolls = 4 are marginal.
+  /// ALP has multiple tight loops that profit from unrolling. Setting this
+  /// might affect performance, so benchmarking is recommended.
   static constexpr uint64_t kLoopUnrolls = 4;
 
   /// \brief Get power of ten as uint64_t
   ///
   /// \param[in] power the exponent (must be <= 19)
   /// \return 10^power as uint64_t
-  static uint64_t powerOfTenUB8(const uint8_t power) {
+  static uint64_t PowerOfTenUB8(const uint8_t power) {
     ARROW_DCHECK(power <= 19) << "power_out_of_range: " << static_cast<int>(power);
     static constexpr uint64_t kTable[20] = {1,
                                             10,
@@ -99,13 +99,15 @@ class AlpConstants {
   ///
   /// \param[in] power the exponent (must be in range [-10, 10])
   /// \return 10^power as float
-  static float powerOfTenFloat(int8_t power) {
-    ARROW_DCHECK(power >= -10 && power <= 10) << "power_out_of_range: " << static_cast<int>(power);
+  static float PowerOfTenFloat(int8_t power) {
+    ARROW_DCHECK(power >= -10 && power <= 10)
+        << "power_out_of_range: " << static_cast<int>(power);
     static constexpr float kTable[21] = {
-        0.0000000001F, 0.000000001F,  0.00000001F,   0.0000001F, 0.000001F,  0.00001F,
-        0.0001F,       0.001F,        0.01F,         0.1F,       1.0F,       10.0F,
-        100.0F,        1000.0F,       10000.0F,      100000.0F,  1000000.0F, 10000000.0F,
-        100000000.0F,  1000000000.0F, 10000000000.0F};
+        0.0000000001F, 0.000000001F,  0.00000001F,   0.0000001F, 0.000001F,
+        0.00001F,      0.0001F,       0.001F,        0.01F,      0.1F,
+        1.0F,          10.0F,         100.0F,        1000.0F,    10000.0F,
+        100000.0F,     1000000.0F,    10000000.0F,   100000000.0F,
+        1000000000.0F, 10000000000.0F};
 
     return kTable[power + 10];
   }
@@ -114,8 +116,9 @@ class AlpConstants {
   ///
   /// \param[in] power the exponent (must be in range [-20, 20])
   /// \return 10^power as double
-  static double powerOfTenDouble(const int8_t power) {
-    ARROW_DCHECK(power >= -20 && power <= 20) << "power_out_of_range: " << static_cast<int>(power);
+  static double PowerOfTenDouble(const int8_t power) {
+    ARROW_DCHECK(power >= -20 && power <= 20)
+        << "power_out_of_range: " << static_cast<int>(power);
     static constexpr double kTable[41] = {
         0.00000000000000000001,
         0.0000000000000000001,
@@ -166,7 +169,7 @@ class AlpConstants {
   ///
   /// \param[in] power the exponent
   /// \return 10^power as int64_t
-  static int64_t getFactor(const int8_t power) { return powerOfTenUB8(power); }
+  static int64_t GetFactor(const int8_t power) { return PowerOfTenUB8(power); }
 };
 
 // ----------------------------------------------------------------------
@@ -194,16 +197,19 @@ struct AlpTypedConstants<float> {
   ///
   /// \param[in] power the exponent
   /// \return 10^power as float
-  static float getExponent(const uint8_t power) { return AlpConstants::powerOfTenFloat(power); }
+  static float GetExponent(const uint8_t power) {
+    return AlpConstants::PowerOfTenFloat(static_cast<int8_t>(power));
+  }
 
   /// \brief Get factor multiplier
   ///
   /// \param[in] power the factor
   /// \return 10^(-power) as float
-  static float getFactor(const uint8_t power) {
-    // This double cast is necessary since subtraction on int8_t does not necessarily yield an
-    // int8_t.
-    return AlpConstants::powerOfTenFloat(static_cast<int8_t>(-static_cast<int8_t>(power)));
+  static float GetFactor(const uint8_t power) {
+    // Two casts: the inner one makes power signed; the outer one is needed
+    // because unary minus promotes its int8_t operand to int.
+    return AlpConstants::PowerOfTenFloat(
+        static_cast<int8_t>(-static_cast<int8_t>(power)));
   }
 
   using FloatingToExact = uint32_t;
@@ -228,14 +234,17 @@ class AlpTypedConstants<double> {
   ///
   /// \param[in] power the exponent
   /// \return 10^power as double
-  static double getExponent(const uint8_t power) { return AlpConstants::powerOfTenDouble(power); }
+  static double GetExponent(const uint8_t power) {
+    return AlpConstants::PowerOfTenDouble(static_cast<int8_t>(power));
+  }
 
   /// \brief Get factor multiplier
   ///
   /// \param[in] power the factor
   /// \return 10^(-power) as double
-  static double getFactor(const uint8_t power) {
-    return AlpConstants::powerOfTenDouble(static_cast<int8_t>(-static_cast<int8_t>(power)));
+  static double GetFactor(const uint8_t power) {
+    return AlpConstants::PowerOfTenDouble(
+        static_cast<int8_t>(-static_cast<int8_t>(power)));
   }
 
   using FloatingToExact = uint64_t;
 
@@ -33,97 +33,107 @@ namespace alp {
 
 template <typename T>
 AlpSampler<T>::AlpSampler()
-    : m_sampleVectorSize(AlpConstants::kSamplerVectorSize),
-      m_rowgroupSize(AlpConstants::kSamplerRowgroupSize),
-      m_samplesPerVector(AlpConstants::kSamplerSamplesPerVector),
-      m_sampleVectorsPerRowgroup(AlpConstants::kSamplerSampleVectorsPerRowgroup),
-      m_rowgroupSampleJump((m_rowgroupSize / m_sampleVectorsPerRowgroup) / m_sampleVectorSize) {}
+    : sample_vector_size_(AlpConstants::kSamplerVectorSize),
+      rowgroup_size_(AlpConstants::kSamplerRowgroupSize),
+      samples_per_vector_(AlpConstants::kSamplerSamplesPerVector),
+      sample_vectors_per_rowgroup_(AlpConstants::kSamplerSampleVectorsPerRowgroup),
+      rowgroup_sample_jump_((rowgroup_size_ / sample_vectors_per_rowgroup_) /
+                            sample_vector_size_) {}
 
 template <typename T>
-void AlpSampler<T>::addSample(arrow::util::span<const T> input) {
-  for (uint64_t i = 0; i < input.size(); i += m_sampleVectorSize) {
-    const uint64_t elements = std::min(input.size() - i, m_sampleVectorSize);
-    addSampleVector({input.data() + i, elements});
+void AlpSampler<T>::AddSample(arrow::util::span<const T> input) {
+  for (uint64_t i = 0; i < input.size(); i += sample_vector_size_) {
+    const uint64_t elements = std::min(input.size() - i, sample_vector_size_);
+    AddSampleVector({input.data() + i, elements});
   }
 }
 
 template <typename T>
-void AlpSampler<T>::addSampleVector(arrow::util::span<const T> input) {
-  const bool mustSkipCurrentVector =
-      mustSkipSamplingFromCurrentVector(m_vectorsCount, m_vectorsSampledCount, input.size());
-
-  m_vectorsCount += 1;
-  m_totalValuesCount += input.size();
-  if (mustSkipCurrentVector) {
+void AlpSampler<T>::AddSampleVector(arrow::util::span<const T> input) {
+  const bool must_skip_current_vector =
+      MustSkipSamplingFromCurrentVector(vectors_count_, vectors_sampled_count_,
+                                        input.size());
+
+  vectors_count_ += 1;
+  total_values_count_ += input.size();
+  if (must_skip_current_vector) {
     return;
   }
 
-  const AlpSamplingParameters samplingParams = getAlpSamplingParameters(input.size());
+  const AlpSamplingParameters sampling_params = GetAlpSamplingParameters(input.size());
 
-  // Slice: take first numLookupValue elements.
-  std::vector<T> currentVectorValues(
-      input.begin(), input.begin() + std::min<size_t>(samplingParams.numLookupValue, input.size()));
+  // Slice: take first num_lookup_value elements.
+  std::vector<T> current_vector_values(
+      input.begin(),
+      input.begin() + std::min<size_t>(sampling_params.num_lookup_value, input.size()));
 
-  // Stride: take every numSampledIncrements-th element.
-  std::vector<T> currentVectorSample;
-  for (size_t i = 0; i < currentVectorValues.size(); i += samplingParams.numSampledIncrements) {
-    currentVectorSample.push_back(currentVectorValues[i]);
+  // Stride: take every num_sampled_increments-th element.
+  std::vector<T> current_vector_sample;
+  for (size_t i = 0; i < current_vector_values.size();
+       i += sampling_params.num_sampled_increments) {
+    current_vector_sample.push_back(current_vector_values[i]);
   }
-  m_sampleStored += currentVectorSample.size();
+  sample_stored_ += current_vector_sample.size();
 
-  m_completeVectorsSampled.push_back(std::move(currentVectorValues));
-  m_rowgroupSample.push_back(std::move(currentVectorSample));
-  m_vectorsSampledCount++;
+  complete_vectors_sampled_.push_back(std::move(current_vector_values));
+  rowgroup_sample_.push_back(std::move(current_vector_sample));
+  vectors_sampled_count_++;
 }
 
 template <typename T>
-typename AlpSampler<T>::AlpSamplerResult AlpSampler<T>::finalize() {
-  ARROW_LOG(DEBUG) << "AlpSampler finalized: vectorsSampled=" << m_vectorsSampledCount << "/"
-                   << m_vectorsCount << " total"
-                   << ", valuesSampled=" << m_sampleStored << "/" << m_totalValuesCount << " total";
+typename AlpSampler<T>::AlpSamplerResult AlpSampler<T>::Finalize() {
+  ARROW_LOG(DEBUG) << "AlpSampler finalized: vectorsSampled=" << vectors_sampled_count_
+                   << "/" << vectors_count_ << " total"
+                   << ", valuesSampled=" << sample_stored_ << "/" << total_values_count_
+                   << " total";
 
   AlpSamplerResult result;
-  result.alpPreset = AlpCompression<T>::createEncodingPreset(m_rowgroupSample);
+  result.alp_preset = AlpCompression<T>::CreateEncodingPreset(rowgroup_sample_);
 
-  ARROW_LOG(DEBUG) << "AlpSampler preset: " << result.alpPreset.combinations.size()
+  ARROW_LOG(DEBUG) << "AlpSampler preset: " << result.alp_preset.combinations.size()
                    << " exponent/factor combinations"
-                   << ", estimatedSize=" << result.alpPreset.bestCompressedSize << " bytes";
+                   << ", estimatedSize=" << result.alp_preset.best_compressed_size
+                   << " bytes";
 
   return result;
 }
 
 template <typename T>
-typename AlpSampler<T>::AlpSamplingParameters AlpSampler<T>::getAlpSamplingParameters(
-    uint64_t numCurrentVectorValues) {
-  const uint64_t numLookupValues =
-      std::min(numCurrentVectorValues, static_cast<uint64_t>(AlpConstants::kAlpVectorSize));
-  // We sample equidistant values within a vector; to do this we jump a fixed number of values.
-  const uint64_t numSampledIncrements = std::max(
-      uint64_t{1},
-      static_cast<uint64_t>(std::ceil(static_cast<double>(numLookupValues) / m_samplesPerVector)));
-  const uint64_t numSampledValues =
-      std::ceil(static_cast<double>(numLookupValues) / numSampledIncrements);
-
-  ARROW_CHECK(numSampledValues < AlpConstants::kAlpVectorSize) << "alp_sample_too_large";
-
-  return AlpSamplingParameters{numLookupValues, numSampledIncrements, numSampledValues};
+typename AlpSampler<T>::AlpSamplingParameters AlpSampler<T>::GetAlpSamplingParameters(
+    uint64_t num_current_vector_values) {
+  const uint64_t num_lookup_values =
+      std::min(num_current_vector_values,
+               static_cast<uint64_t>(AlpConstants::kAlpVectorSize));
+  // Sample equidistant values within a vector; jump a fixed number of values.
+  // Both quotients are integer ceiling divisions, so no double round trip.
+  const uint64_t num_sampled_increments = std::max(
+      uint64_t{1},
+      (num_lookup_values + samples_per_vector_ - 1) / samples_per_vector_);
+  const uint64_t num_sampled_values =
+      (num_lookup_values + num_sampled_increments - 1) / num_sampled_increments;
+
+  ARROW_CHECK(num_sampled_values < AlpConstants::kAlpVectorSize) << "alp_sample_too_large";
+
+  return AlpSamplingParameters{num_lookup_values, num_sampled_increments,
+                               num_sampled_values};
 }
 
 template <typename T>
-bool AlpSampler<T>::mustSkipSamplingFromCurrentVector(const uint64_t vectorsCount,
-                                                      const uint64_t vectorsSampledCount,
-                                                      const uint64_t currentVectorNValues) {
-  // We sample equidistant vectors; to do this we skip a fixed number of vectors.
-  const bool mustSelectRowgroupSamples = (vectorsCount % m_rowgroupSampleJump) == 0;
-
-  // If we are not in the correct jump, we do not take sample from this vector.
-  if (!mustSelectRowgroupSamples) {
+bool AlpSampler<T>::MustSkipSamplingFromCurrentVector(
+    const uint64_t vectors_count, const uint64_t vectors_sampled_count,
+    const uint64_t current_vector_n_values) {
+  // Sample equidistant vectors; skip a fixed number of vectors.
+  const bool must_select_rowgroup_samples = (vectors_count % rowgroup_sample_jump_) == 0;
+
+  // If we are not in the correct jump, do not take sample from this vector.
+  if (!must_select_rowgroup_samples) {
     return true;
   }
 
-  // We do not take samples of non-complete vectors (usually the last one),
+  // Do not take samples of non-complete vectors (usually the last one),
   // except in the case of too little data.
-  if (currentVectorNValues < AlpConstants::kSamplerSamplesPerVector && vectorsSampledCount != 0) {
+  if (current_vector_n_values < AlpConstants::kSamplerSamplesPerVector &&
+      vectors_sampled_count != 0) {
     return true;
   }
   return false;
 
@@ -35,8 +35,8 @@ namespace alp {
 /// \class AlpSampler
 /// \brief Collects samples from data to be compressed with ALP
 ///
-/// Usage: Call addSample() or addSampleVector() multiple times to collect samples,
-/// then call finalize() to retrieve the resulting preset.
+/// Usage: Call AddSample() or AddSampleVector() multiple times to collect
+/// samples, then call Finalize() to retrieve the resulting preset.
 ///
 /// \tparam T the floating point type (float or double) to sample
 template <typename T>
@@ -47,72 +47,75 @@ class AlpSampler {
 
   /// \brief Helper struct containing the preset for ALP compression
   struct AlpSamplerResult {
-    AlpEncodingPreset alpPreset;
+    AlpEncodingPreset alp_preset;
   };
 
   /// \brief Add a sample of arbitrary size
   ///
-  /// The sample is internally separated into vectors on which addSampleVector() is called.
+  /// The sample is internally separated into vectors on which AddSampleVector()
+  /// is called.
   ///
   /// \param[in] input the input data to sample from
-  void addSample(arrow::util::span<const T> input);
+  void AddSample(arrow::util::span<const T> input);
 
   /// \brief Add a single vector as a sample
   ///
-  /// \param[in] input the input vector to add. Size should be <= AlpConstants::kAlpVectorSize.
-  void addSampleVector(arrow::util::span<const T> input);
+  /// \param[in] input the input vector to add.
+  ///            Size should be <= AlpConstants::kAlpVectorSize.
+  void AddSampleVector(arrow::util::span<const T> input);
 
   /// \brief Finalize sampling and generate the encoding preset
   ///
   /// \return an AlpSamplerResult containing the generated encoding preset
-  AlpSamplerResult finalize();
+  AlpSamplerResult Finalize();
 
  private:
   /// \brief Helper struct to encapsulate settings used for sampling
   struct AlpSamplingParameters {
-    uint64_t numLookupValue;
-    uint64_t numSampledIncrements;
-    uint64_t numSampledValues;
+    uint64_t num_lookup_value;
+    uint64_t num_sampled_increments;
+    uint64_t num_sampled_values;
   };
 
   /// \brief Calculate sampling parameters for the current vector
   ///
-  /// \param[in] numCurrentVectorValues the number of values in the current vector
+  /// \param[in] num_current_vector_values number of values in current vector
   /// \return the sampling parameters to use
-  AlpSamplingParameters getAlpSamplingParameters(uint64_t numCurrentVectorValues);
+  AlpSamplingParameters GetAlpSamplingParameters(uint64_t num_current_vector_values);
 
   /// \brief Check if the current vector must be ignored for sampling
   ///
-  /// \param[in] vectorsCount the total number of vectors processed so far
-  /// \param[in] vectorsSampledCount the number of vectors that have been sampled so far
-  /// \param[in] numCurrentVectorValues the number of values in the current vector
+  /// \param[in] vectors_count the total number of vectors processed so far
+  /// \param[in] vectors_sampled_count the number of vectors sampled so far
+  /// \param[in] num_current_vector_values number of values in current vector
   /// \return true if the current vector should be skipped, false otherwise
-  bool mustSkipSamplingFromCurrentVector(uint64_t vectorsCount, uint64_t vectorsSampledCount,
-                                         uint64_t numCurrentVectorValues);
+  bool MustSkipSamplingFromCurrentVector(uint64_t vectors_count,
+                                         uint64_t vectors_sampled_count,
+                                         uint64_t num_current_vector_values);
 
   /// Count of vectors that have been sampled
-  uint64_t m_vectorsSampledCount = 0;
+  uint64_t vectors_sampled_count_ = 0;
   /// Total count of values processed
-  uint64_t m_totalValuesCount = 0;
+  uint64_t total_values_count_ = 0;
   /// Total count of vectors processed
-  uint64_t m_vectorsCount = 0;
+  uint64_t vectors_count_ = 0;
   /// Number of samples stored
-  uint64_t m_sampleStored = 0;
+  uint64_t sample_stored_ = 0;
   /// Samples collected from current rowgroup
-  std::vector<std::vector<T>> m_rowgroupSample;
+  std::vector<std::vector<T>> rowgroup_sample_;
 
   /// Complete vectors sampled
-  std::vector<std::vector<T>> m_completeVectorsSampled;
+  std::vector<std::vector<T>> complete_vectors_sampled_;
   /// Size of each sample vector
-  const uint64_t m_sampleVectorSize;
+  const uint64_t sample_vector_size_;
   /// Size of each rowgroup
-  const uint64_t m_rowgroupSize;
+  const uint64_t rowgroup_size_;
   /// Number of samples to take per vector
-  const uint64_t m_samplesPerVector;
+  const uint64_t samples_per_vector_;
   /// Number of vectors to sample per rowgroup
-  const uint64_t m_sampleVectorsPerRowgroup;
+  const uint64_t sample_vectors_per_rowgroup_;
   /// Jump interval for rowgroup sampling
-  const uint64_t m_rowgroupSampleJump;
+  const uint64_t rowgroup_sample_jump_;
 };
 
 }  // namespace alp