@@ -33,97 +33,107 @@ namespace alp {
3333
3434template <typename T>
3535AlpSampler<T>::AlpSampler()
36- : m_sampleVectorSize(AlpConstants::kSamplerVectorSize ),
37- m_rowgroupSize (AlpConstants::kSamplerRowgroupSize ),
38- m_samplesPerVector(AlpConstants::kSamplerSamplesPerVector ),
39- m_sampleVectorsPerRowgroup(AlpConstants::kSamplerSampleVectorsPerRowgroup ),
40- m_rowgroupSampleJump((m_rowgroupSize / m_sampleVectorsPerRowgroup) / m_sampleVectorSize) {}
36+ : sample_vector_size_(AlpConstants::kSamplerVectorSize ),
37+ rowgroup_size_ (AlpConstants::kSamplerRowgroupSize ),
38+ samples_per_vector_(AlpConstants::kSamplerSamplesPerVector ),
39+ sample_vectors_per_rowgroup_(AlpConstants::kSamplerSampleVectorsPerRowgroup ),
40+ rowgroup_sample_jump_((rowgroup_size_ / sample_vectors_per_rowgroup_) /
41+ sample_vector_size_) {}
4142
4243template <typename T>
43- void AlpSampler<T>::addSample (arrow::util::span<const T> input) {
44- for (uint64_t i = 0 ; i < input.size (); i += m_sampleVectorSize ) {
45- const uint64_t elements = std::min (input.size () - i, m_sampleVectorSize );
46- addSampleVector ({input.data () + i, elements});
44+ void AlpSampler<T>::AddSample (arrow::util::span<const T> input) {
45+ for (uint64_t i = 0 ; i < input.size (); i += sample_vector_size_ ) {
46+ const uint64_t elements = std::min (input.size () - i, sample_vector_size_ );
47+ AddSampleVector ({input.data () + i, elements});
4748 }
4849}
4950
5051template <typename T>
51- void AlpSampler<T>::addSampleVector(arrow::util::span<const T> input) {
52- const bool mustSkipCurrentVector =
53- mustSkipSamplingFromCurrentVector (m_vectorsCount, m_vectorsSampledCount, input.size ());
54-
55- m_vectorsCount += 1 ;
56- m_totalValuesCount += input.size ();
57- if (mustSkipCurrentVector) {
52+ void AlpSampler<T>::AddSampleVector(arrow::util::span<const T> input) {
53+ const bool must_skip_current_vector =
54+ MustSkipSamplingFromCurrentVector (vectors_count_, vectors_sampled_count_,
55+ input.size ());
56+
57+ vectors_count_ += 1 ;
58+ total_values_count_ += input.size ();
59+ if (must_skip_current_vector) {
5860 return ;
5961 }
6062
61- const AlpSamplingParameters samplingParams = getAlpSamplingParameters (input.size ());
63+ const AlpSamplingParameters sampling_params = GetAlpSamplingParameters (input.size ());
6264
63- // Slice: take first numLookupValue elements.
64- std::vector<T> currentVectorValues (
65- input.begin (), input.begin () + std::min<size_t >(samplingParams.numLookupValue , input.size ()));
65+ // Slice: take first num_lookup_value elements.
66+ std::vector<T> current_vector_values (
67+ input.begin (),
68+ input.begin () + std::min<size_t >(sampling_params.num_lookup_value , input.size ()));
6669
67- // Stride: take every numSampledIncrements-th element.
68- std::vector<T> currentVectorSample;
69- for (size_t i = 0 ; i < currentVectorValues.size (); i += samplingParams.numSampledIncrements ) {
70- currentVectorSample.push_back (currentVectorValues[i]);
70+ // Stride: take every num_sampled_increments-th element.
71+ std::vector<T> current_vector_sample;
72+ for (size_t i = 0 ; i < current_vector_values.size ();
73+ i += sampling_params.num_sampled_increments ) {
74+ current_vector_sample.push_back (current_vector_values[i]);
7175 }
72- m_sampleStored += currentVectorSample .size ();
76+ sample_stored_ += current_vector_sample .size ();
7377
74- m_completeVectorsSampled .push_back (std::move (currentVectorValues ));
75- m_rowgroupSample .push_back (std::move (currentVectorSample ));
76- m_vectorsSampledCount ++;
78+ complete_vectors_sampled_ .push_back (std::move (current_vector_values ));
79+ rowgroup_sample_ .push_back (std::move (current_vector_sample ));
80+ vectors_sampled_count_ ++;
7781}
7882
7983template <typename T>
80- typename AlpSampler<T>::AlpSamplerResult AlpSampler<T>::finalize() {
81- ARROW_LOG (DEBUG) << " AlpSampler finalized: vectorsSampled=" << m_vectorsSampledCount << " /"
82- << m_vectorsCount << " total"
83- << " , valuesSampled=" << m_sampleStored << " /" << m_totalValuesCount << " total" ;
84+ typename AlpSampler<T>::AlpSamplerResult AlpSampler<T>::Finalize() {
85+ ARROW_LOG (DEBUG) << " AlpSampler finalized: vectorsSampled=" << vectors_sampled_count_
86+ << " /" << vectors_count_ << " total"
87+ << " , valuesSampled=" << sample_stored_ << " /" << total_values_count_
88+ << " total" ;
8489
8590 AlpSamplerResult result;
86- result.alpPreset = AlpCompression<T>::createEncodingPreset (m_rowgroupSample );
91+ result.alp_preset = AlpCompression<T>::CreateEncodingPreset (rowgroup_sample_ );
8792
88- ARROW_LOG (DEBUG) << " AlpSampler preset: " << result.alpPreset .combinations .size ()
93+ ARROW_LOG (DEBUG) << " AlpSampler preset: " << result.alp_preset .combinations .size ()
8994 << " exponent/factor combinations"
90- << " , estimatedSize=" << result.alpPreset .bestCompressedSize << " bytes" ;
95+ << " , estimatedSize=" << result.alp_preset .best_compressed_size
96+ << " bytes" ;
9197
9298 return result;
9399}
94100
95101template <typename T>
96- typename AlpSampler<T>::AlpSamplingParameters AlpSampler<T>::getAlpSamplingParameters(
97- uint64_t numCurrentVectorValues) {
98- const uint64_t numLookupValues =
99- std::min (numCurrentVectorValues, static_cast <uint64_t >(AlpConstants::kAlpVectorSize ));
100- // We sample equidistant values within a vector; to do this we jump a fixed number of values.
101- const uint64_t numSampledIncrements = std::max (
102- uint64_t {1 },
103- static_cast <uint64_t >(std::ceil (static_cast <double >(numLookupValues) / m_samplesPerVector)));
104- const uint64_t numSampledValues =
105- std::ceil (static_cast <double >(numLookupValues) / numSampledIncrements);
106-
107- ARROW_CHECK (numSampledValues < AlpConstants::kAlpVectorSize ) << " alp_sample_too_large" ;
108-
109- return AlpSamplingParameters{numLookupValues, numSampledIncrements, numSampledValues};
102+ typename AlpSampler<T>::AlpSamplingParameters AlpSampler<T>::GetAlpSamplingParameters(
103+ uint64_t num_current_vector_values) {
104+ const uint64_t num_lookup_values =
105+ std::min (num_current_vector_values,
106+ static_cast <uint64_t >(AlpConstants::kAlpVectorSize ));
107+ // Sample equidistant values within a vector; jump a fixed number of values.
108+ const uint64_t num_sampled_increments =
109+ std::max (uint64_t {1 }, static_cast <uint64_t >(std::ceil (
110+ static_cast <double >(num_lookup_values) /
111+ samples_per_vector_)));
112+ const uint64_t num_sampled_values =
113+ std::ceil (static_cast <double >(num_lookup_values) / num_sampled_increments);
114+
115+ ARROW_CHECK (num_sampled_values < AlpConstants::kAlpVectorSize ) << " alp_sample_too_large" ;
116+
117+ return AlpSamplingParameters{num_lookup_values, num_sampled_increments,
118+ num_sampled_values};
110119}
111120
112121template <typename T>
113- bool AlpSampler<T>::mustSkipSamplingFromCurrentVector( const uint64_t vectorsCount,
114- const uint64_t vectorsSampledCount ,
115- const uint64_t currentVectorNValues ) {
116- // We sample equidistant vectors; to do this we skip a fixed number of vectors.
117- const bool mustSelectRowgroupSamples = (vectorsCount % m_rowgroupSampleJump ) == 0 ;
118-
119- // If we are not in the correct jump, we do not take sample from this vector.
120- if (!mustSelectRowgroupSamples ) {
122+ bool AlpSampler<T>::MustSkipSamplingFromCurrentVector(
123+ const uint64_t vectors_count, const uint64_t vectors_sampled_count ,
124+ const uint64_t current_vector_n_values ) {
125+ // Sample equidistant vectors; skip a fixed number of vectors.
126+ const bool must_select_rowgroup_samples = (vectors_count % rowgroup_sample_jump_ ) == 0 ;
127+
128+ // If we are not in the correct jump, do not take sample from this vector.
129+ if (!must_select_rowgroup_samples ) {
121130 return true ;
122131 }
123132
124- // We do not take samples of non-complete vectors (usually the last one),
133+ // Do not take samples of non-complete vectors (usually the last one),
125134 // except in the case of too little data.
126- if (currentVectorNValues < AlpConstants::kSamplerSamplesPerVector && vectorsSampledCount != 0 ) {
135+ if (current_vector_n_values < AlpConstants::kSamplerSamplesPerVector &&
136+ vectors_sampled_count != 0 ) {
127137 return true ;
128138 }
129139 return false ;
0 commit comments