Skip to content

Commit 1976266

Browse files
committed
chore: Refactor stats_line into StatsBuilder
1 parent 076c408 commit 1976266

File tree

9 files changed

+230
-113
lines changed

9 files changed

+230
-113
lines changed

include/pisa/dec_time_prediction.hpp

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,17 @@
22

33
#include <algorithm>
44
#include <array>
5+
#include <cmath>
6+
#include <cstdint>
57
#include <sstream>
68
#include <string>
9+
#include <vector>
710

811
#include "boost/preprocessor/seq/enum.hpp"
912
#include "boost/preprocessor/seq/for_each.hpp"
1013
#include "boost/preprocessor/stringize.hpp"
11-
12-
#include "util/util.hpp"
14+
#include "util/broadword.hpp"
15+
#include "util/index_build_utils.hpp"
1316

1417
#define PISA_FEATURE_TYPES (n)(size)(sum_of_logs)(entropy)(nonzeros)(max_b)(pfor_b)(pfor_exceptions)
1518

@@ -53,12 +56,12 @@ namespace pisa { namespace time_prediction {
5356
float& operator[](feature_type f) { return m_features[(size_t)f]; }
5457
float const& operator[](feature_type f) const { return m_features[(size_t)f]; }
5558

56-
stats_line& dump(stats_line& sl) const {
59+
pisa::StatsBuilder& dump(pisa::StatsBuilder& builder) const {
5760
for (size_t i = 0; i < num_features; ++i) {
5861
auto ft = static_cast<feature_type>(i);
59-
sl(feature_name(ft), (*this)[ft]);
62+
builder.add(feature_name(ft), (*this)[ft]);
6063
}
61-
return sl;
64+
return builder;
6265
}
6366

6467
protected:
@@ -95,7 +98,7 @@ namespace pisa { namespace time_prediction {
9598
float m_bias{0.0};
9699
};
97100

98-
inline void values_statistics(std::vector<uint32_t> values, feature_vector& f) {
101+
inline void values_statistics(std::vector<std::uint32_t> values, feature_vector& f) {
99102
std::sort(values.begin(), values.end());
100103
f[feature_type::n] = values.size();
101104
if (values.empty()) {

include/pisa/util/index_build_utils.hpp

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include "block_inverted_index.hpp"
66
#include "freq_index.hpp"
77
#include "mappable/mapper.hpp"
8+
#include "util/stats_builder.hpp"
89
#include "util/util.hpp"
910

1011
namespace pisa {
@@ -34,8 +35,14 @@ void dump_stats(Collection& coll, std::string const& type, uint64_t postings) {
3435
spdlog::info("Documents: {} bytes, {} bits per element", docs_size, bits_per_doc);
3536
spdlog::info("Frequencies: {} bytes, {} bits per element", freqs_size, bits_per_freq);
3637

37-
stats_line()("type", type)("size", docs_size + freqs_size)("docs_size", docs_size)(
38-
"freqs_size", freqs_size)("bits_per_doc", bits_per_doc)("bits_per_freq", bits_per_freq);
38+
std::cout << pisa::stats_builder()
39+
.add("type", type)
40+
.add("size", docs_size + freqs_size)
41+
.add("docs_size", docs_size)
42+
.add("freqs_size", freqs_size)
43+
.add("bits_per_doc", bits_per_doc)
44+
.add("bits_per_freq", bits_per_freq)
45+
.build();
3946
}
4047

4148
inline void dump_stats(SizeStats const& stats, std::size_t postings) {
@@ -44,8 +51,13 @@ inline void dump_stats(SizeStats const& stats, std::size_t postings) {
4451
double bits_per_freq = stats.freqs * 8.0 / postings;
4552
spdlog::info("Documents: {} bytes, {} bits per element", stats.docs, bits_per_doc);
4653
spdlog::info("Frequencies: {} bytes, {} bits per element", stats.freqs, bits_per_freq);
47-
stats_line()("size", stats.docs + stats.freqs)("docs_size", stats.docs)(
48-
"freqs_size", stats.freqs)("bits_per_doc", bits_per_doc)("bits_per_freq", bits_per_freq);
54+
std::cout << pisa::stats_builder()
55+
.add("size", stats.docs + stats.freqs)
56+
.add("docs_size", stats.docs)
57+
.add("freqs_size", stats.freqs)
58+
.add("bits_per_doc", bits_per_doc)
59+
.add("bits_per_freq", bits_per_freq)
60+
.build();
4961
}
5062

5163
} // namespace pisa
include/pisa/util/stats_builder.hpp

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
// Copyright 2026 PISA Developers
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#pragma once

#include <concepts>
#include <cstdint>
#include <memory>
#include <ostream>
#include <sstream>
#include <string>
#include <string_view>
20+
21+
namespace pisa {
22+
23+
namespace detail {
24+
/**
 * Abstract backend interface held by `StatsBuilder` through a
 * `std::unique_ptr`.
 *
 * There is one `add` overload per kind of value that can be recorded under a
 * key (boolean, signed and unsigned 64-bit integer, floating point, C string,
 * and `std::string`); `build` returns the accumulated result as a string.
 */
class StatsBuilderInterface {
  public:
    /**
     * Virtual so that an implementation owned via
     * `std::unique_ptr<StatsBuilderInterface>` is destroyed through its own
     * destructor. Deleting a derived object through a base pointer whose
     * destructor is not virtual is undefined behavior.
     */
    virtual ~StatsBuilderInterface() = default;

    virtual void add(std::string const& key, bool value) = 0;
    virtual void add(std::string const& key, std::int64_t value) = 0;
    virtual void add(std::string const& key, std::uint64_t value) = 0;
    virtual void add(std::string const& key, double value) = 0;
    virtual void add(std::string const& key, const char* value) = 0;
    virtual void add(std::string const& key, std::string value) = 0;

    /** Returns everything added so far, rendered as a string. */
    [[nodiscard]] virtual auto build() const -> std::string = 0;
};
34+
} // namespace detail
35+
36+
template <typename T>
37+
concept Streamable = requires(std::ostream& os, T value) {
38+
{ os << value } -> std::convertible_to<std::ostream&>;
39+
};
40+
41+
/**
42+
* Builds a simple key-value JSON for printing statistics.
43+
*/
44+
class StatsBuilder {
45+
private:
46+
std::unique_ptr<::pisa::detail::StatsBuilderInterface> m_impl;
47+
48+
explicit StatsBuilder(std::unique_ptr<::pisa::detail::StatsBuilderInterface> impl);
49+
50+
public:
51+
StatsBuilder();
52+
auto add(std::string const& key, bool value) -> StatsBuilder&;
53+
auto add(std::string const& key, int value) -> StatsBuilder&;
54+
auto add(std::string const& key, unsigned int value) -> StatsBuilder&;
55+
auto add(std::string const& key, long value) -> StatsBuilder&;
56+
auto add(std::string const& key, unsigned long value) -> StatsBuilder&;
57+
auto add(std::string const& key, double value) -> StatsBuilder&;
58+
auto add(std::string const& key, const char* value) -> StatsBuilder&;
59+
auto add(std::string const& key, std::string value) -> StatsBuilder&;
60+
auto add(std::string const& key, std::string_view value) -> StatsBuilder&;
61+
62+
template <typename T>
63+
requires Streamable<T>
64+
auto add(std::string const& key, T const& value) -> StatsBuilder& {
65+
std::ostringstream out;
66+
out << value;
67+
add(key, out.str());
68+
return *this;
69+
}
70+
71+
/** Builds the JSON. */
72+
[[nodiscard]] auto build() const -> std::string;
73+
};
74+
75+
[[nodiscard]] auto stats_builder() -> StatsBuilder;
76+
77+
} // namespace pisa

include/pisa/util/util.hpp

Lines changed: 0 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,6 @@
22

33
#include <cassert>
44
#include <chrono>
5-
#include <cmath>
6-
#include <cstdlib>
7-
#include <cstring>
8-
#include <iostream>
9-
#include <map>
10-
#include <vector>
115

126
#include "util/broadword.hpp"
137

@@ -112,90 +106,4 @@ function_iterator<State, AdvanceFunctor, ValueFunctor> make_function_iterator(
112106
);
113107
}
114108

115-
struct stats_line {
116-
stats_line() { std::cout << "{"; }
117-
stats_line(stats_line const&) = default;
118-
stats_line(stats_line&&) noexcept = default;
119-
stats_line& operator=(stats_line const&) = default;
120-
stats_line& operator=(stats_line&&) noexcept = default;
121-
~stats_line() { std::cout << "}" << std::endl; }
122-
123-
template <typename K, typename T>
124-
stats_line& operator()(K const& key, T const& value) {
125-
if (!first) {
126-
std::cout << ", ";
127-
} else {
128-
first = false;
129-
}
130-
131-
emit(key);
132-
std::cout << ": ";
133-
emit(value);
134-
return *this;
135-
}
136-
137-
template <typename T>
138-
stats_line& operator()(T const& obj) {
139-
return obj.dump(*this);
140-
}
141-
142-
private:
143-
template <typename T>
144-
void emit(T const& v) const {
145-
std::cout << v;
146-
}
147-
148-
// XXX properly escape strings
149-
void emit(const char* s) const { std::cout << '"' << s << '"'; }
150-
151-
void emit(std::string const& s) const { emit(s.c_str()); }
152-
153-
template <typename T>
154-
void emit(std::vector<T> const& v) const {
155-
std::cout << "[";
156-
bool first = true;
157-
for (auto const& i: v) {
158-
if (first) {
159-
first = false;
160-
} else {
161-
std::cout << ", ";
162-
}
163-
emit(i);
164-
}
165-
std::cout << "]";
166-
}
167-
168-
template <typename K, typename V>
169-
void emit(std::map<K, V> const& m) const {
170-
std::vector<std::pair<K, V>> v(m.begin(), m.end());
171-
emit(v);
172-
}
173-
174-
template <typename Tuple, size_t Pos>
175-
typename std::enable_if<Pos != 0, void>::type emit_tuple_helper(Tuple const& t) const {
176-
emit_tuple_helper<Tuple, Pos - 1>(t);
177-
std::cout << ", ";
178-
emit(std::get<Pos>(t));
179-
}
180-
181-
template <typename Tuple, size_t Pos>
182-
typename std::enable_if<Pos == 0, void>::type emit_tuple_helper(Tuple const& t) const {
183-
emit(std::get<0>(t));
184-
}
185-
186-
template <typename... Tp>
187-
void emit(std::tuple<Tp...> const& t) const {
188-
std::cout << "[";
189-
emit_tuple_helper<std::tuple<Tp...>, sizeof...(Tp) - 1>(t);
190-
std::cout << "]";
191-
}
192-
193-
template <typename T1, typename T2>
194-
void emit(std::pair<T1, T2> const& p) const {
195-
emit(std::make_tuple(p.first, p.second));
196-
}
197-
198-
bool first{true};
199-
};
200-
201109
} // namespace pisa

src/block_inverted_index.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -183,9 +183,11 @@ void BlockIndexBuilder::build(binary_freq_collection const& input, std::string c
183183
double elapsed_secs = (get_time_usecs() - tick) / 1000000;
184184
spdlog::info("Index compressed in {} seconds", elapsed_secs);
185185

186-
stats_line()("type", m_block_codec->get_name())("worker_threads", std::thread::hardware_concurrency())(
187-
"construction_time", elapsed_secs
188-
);
186+
std::cout << pisa::stats_builder()
187+
.add("type", m_block_codec->get_name())
188+
.add("worker_threads", std::thread::hardware_concurrency())
189+
.add("construction_time", elapsed_secs)
190+
.build();
189191

190192
if (m_check) {
191193
BlockInvertedIndex index(

src/compress.cpp

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,10 @@ template <typename Collection>
2323
void dump_index_specific_stats(Collection const&, std::string const&) {}
2424

2525
void dump_index_specific_stats(pisa::pefuniform_index const& coll, std::string const& type) {
26-
pisa::stats_line()("type", type)("log_partition_size", int(coll.params().log_partition_size));
26+
std::cout << pisa::stats_builder()
27+
.add("type", type)
28+
.add("log_partition_size", int(coll.params().log_partition_size))
29+
.build();
2730
}
2831

2932
void dump_index_specific_stats(pisa::pefopt_index const& coll, std::string const& type) {
@@ -41,9 +44,11 @@ void dump_index_specific_stats(pisa::pefopt_index const& coll, std::string const
4144
}
4245
}
4346

44-
pisa::stats_line()("type", type)("docs_avg_part", long_postings / docs_partitions)(
45-
"freqs_avg_part", long_postings / freqs_partitions
46-
);
47+
std::cout << pisa::stats_builder()
48+
.add("type", type)
49+
.add("docs_avg_part", long_postings / docs_partitions)
50+
.add("freqs_avg_part", long_postings / freqs_partitions)
51+
.build();
4752
}
4853

4954
template <typename CollectionType, typename Wand>
@@ -173,9 +178,11 @@ void compress_index(
173178
double elapsed_secs = (get_time_usecs() - tick) / 1000000;
174179
spdlog::info("{} collection built in {} seconds", seq_type, elapsed_secs);
175180

176-
stats_line()("type", seq_type)("worker_threads", std::thread::hardware_concurrency())(
177-
"construction_time", elapsed_secs
178-
);
181+
std::cout << pisa::stats_builder()
182+
.add("type", seq_type)
183+
.add("worker_threads", std::thread::hardware_concurrency())
184+
.add("construction_time", elapsed_secs)
185+
.build();
179186

180187
dump_stats(coll, seq_type, postings);
181188
dump_index_specific_stats(coll, seq_type);

src/sharding.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ void rearrange_sequences(
148148
) {
149149
spdlog::info("Rearranging documents");
150150
if (not shard_count) {
151-
*shard_count = *std::max_element(mapping.begin(), mapping.end()) + 1;
151+
shard_count = *std::max_element(mapping.begin(), mapping.end()) + 1;
152152
}
153153
std::ifstream is(input_basename);
154154
std::ifstream dis(fmt::format("{}.documents", input_basename));

0 commit comments

Comments
 (0)