Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 9 additions & 6 deletions include/pisa/dec_time_prediction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,17 @@

#include <algorithm>
#include <array>
#include <cmath>
#include <cstdint>
#include <sstream>
#include <string>
#include <vector>

#include "boost/preprocessor/seq/enum.hpp"
#include "boost/preprocessor/seq/for_each.hpp"
#include "boost/preprocessor/stringize.hpp"

#include "util/util.hpp"
#include "util/broadword.hpp"
#include "util/json_stats.hpp"

#define PISA_FEATURE_TYPES (n)(size)(sum_of_logs)(entropy)(nonzeros)(max_b)(pfor_b)(pfor_exceptions)

Expand Down Expand Up @@ -53,12 +56,12 @@ namespace pisa { namespace time_prediction {
float& operator[](feature_type f) { return m_features[(size_t)f]; }
float const& operator[](feature_type f) const { return m_features[(size_t)f]; }

stats_line& dump(stats_line& sl) const {
pisa::JsonStats& dump(pisa::JsonStats& stats) const {
for (size_t i = 0; i < num_features; ++i) {
auto ft = static_cast<feature_type>(i);
sl(feature_name(ft), (*this)[ft]);
stats.add(feature_name(ft), (*this)[ft]);
}
return sl;
return stats;
}

protected:
Expand Down Expand Up @@ -95,7 +98,7 @@ namespace pisa { namespace time_prediction {
float m_bias{0.0};
};

inline void values_statistics(std::vector<uint32_t> values, feature_vector& f) {
inline void values_statistics(std::vector<std::uint32_t> values, feature_vector& f) {
std::sort(values.begin(), values.end());
f[feature_type::n] = values.size();
if (values.empty()) {
Expand Down
21 changes: 16 additions & 5 deletions include/pisa/util/index_build_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#include "block_inverted_index.hpp"
#include "freq_index.hpp"
#include "mappable/mapper.hpp"
#include "util/util.hpp"
#include "util/json_stats.hpp"

namespace pisa {

Expand Down Expand Up @@ -34,8 +34,14 @@ void dump_stats(Collection& coll, std::string const& type, uint64_t postings) {
spdlog::info("Documents: {} bytes, {} bits per element", docs_size, bits_per_doc);
spdlog::info("Frequencies: {} bytes, {} bits per element", freqs_size, bits_per_freq);

stats_line()("type", type)("size", docs_size + freqs_size)("docs_size", docs_size)(
"freqs_size", freqs_size)("bits_per_doc", bits_per_doc)("bits_per_freq", bits_per_freq);
std::cout << pisa::json_stats()
.add("type", type)
.add("size", docs_size + freqs_size)
.add("docs_size", docs_size)
.add("freqs_size", freqs_size)
.add("bits_per_doc", bits_per_doc)
.add("bits_per_freq", bits_per_freq)
.str();
}

inline void dump_stats(SizeStats const& stats, std::size_t postings) {
Expand All @@ -44,8 +50,13 @@ inline void dump_stats(SizeStats const& stats, std::size_t postings) {
double bits_per_freq = stats.freqs * 8.0 / postings;
spdlog::info("Documents: {} bytes, {} bits per element", stats.docs, bits_per_doc);
spdlog::info("Frequencies: {} bytes, {} bits per element", stats.freqs, bits_per_freq);
stats_line()("size", stats.docs + stats.freqs)("docs_size", stats.docs)(
"freqs_size", stats.freqs)("bits_per_doc", bits_per_doc)("bits_per_freq", bits_per_freq);
std::cout << pisa::json_stats()
.add("size", stats.docs + stats.freqs)
.add("docs_size", stats.docs)
.add("freqs_size", stats.freqs)
.add("bits_per_doc", bits_per_doc)
.add("bits_per_freq", bits_per_freq)
.str();
}

} // namespace pisa
79 changes: 79 additions & 0 deletions include/pisa/util/json_stats.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
// Copyright 2026 PISA Developers
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <concepts>
#include <cstdint>
#include <memory>
#include <ostream>
#include <sstream>
#include <string>
#include <string_view>

namespace pisa {

namespace detail {
/**
 * Abstract backend behind `JsonStats`: one pure-virtual `add` per supported
 * value type, plus `str()` to obtain the accumulated text.
 *
 * `JsonStats` owns its backend through `std::unique_ptr<StatsInterface>`, so
 * the concrete object is always destroyed through a pointer to this base.
 */
class StatsInterface {
  public:
    // Required: without a virtual destructor, deleting a derived backend
    // through `std::unique_ptr<StatsInterface>` is undefined behaviour and
    // the derived destructor (and its members' destructors) would not run.
    virtual ~StatsInterface() = default;

    virtual void add(std::string const& key, bool value) = 0;
    virtual void add(std::string const& key, std::int64_t value) = 0;
    virtual void add(std::string const& key, std::uint64_t value) = 0;
    virtual void add(std::string const& key, double value) = 0;
    virtual void add(std::string const& key, const char* value) = 0;
    virtual void add(std::string const& key, std::string value) = 0;

    /** Returns the textual form of everything added so far. */
    [[nodiscard]] virtual auto str() const -> std::string = 0;
};
} // namespace detail

/**
 * Satisfied by any type `T` whose values can be written to a `std::ostream`
 * with `operator<<`, where the result of that expression is convertible to
 * `std::ostream&`.
 */
template <typename T>
concept Streamable = requires(std::ostream& os, T value) {
{ os << value } -> std::convertible_to<std::ostream&>;
};

/**
 * A simple key-value JSON for printing statistics.
 *
 * Values are registered with the chainable `add` overloads and the resulting
 * text is obtained with `str()`. The actual storage/serialization lives behind
 * `m_impl`; the non-template members are only declared here.
 */
class JsonStats {
  private:
    std::unique_ptr<::pisa::detail::StatsInterface> m_impl;

    explicit JsonStats(std::unique_ptr<::pisa::detail::StatsInterface> impl);

  public:
    JsonStats();
    auto add(std::string const& key, bool value) -> JsonStats&;
    auto add(std::string const& key, int value) -> JsonStats&;
    auto add(std::string const& key, unsigned int value) -> JsonStats&;
    auto add(std::string const& key, long value) -> JsonStats&;
    auto add(std::string const& key, unsigned long value) -> JsonStats&;
    auto add(std::string const& key, double value) -> JsonStats&;
    auto add(std::string const& key, const char* value) -> JsonStats&;
    auto add(std::string const& key, std::string value) -> JsonStats&;
    auto add(std::string const& key, std::string_view value) -> JsonStats&;

    /**
     * Fallback for any streamable type without a dedicated overload.
     *
     * Floating-point types other than `double` (`float`, `long double`) are an
     * exact match for this template and therefore win overload resolution over
     * the `double` overload, which would need a promotion/conversion. They are
     * forwarded to the `double` overload so they are recorded as numbers
     * instead of being formatted into a string.
     *
     * Every other type is formatted with `operator<<` and added as a string.
     *
     * @param key    name under which the value is recorded
     * @param value  value to record
     * @return `*this`, to allow chaining
     */
    template <typename T>
        requires Streamable<T>
    auto add(std::string const& key, T const& value) -> JsonStats& {
        if constexpr (std::floating_point<T>) {
            // Resolves to the non-template `double` overload (non-templates
            // are preferred on an exact match), so this cannot recurse.
            add(key, static_cast<double>(value));
        } else {
            std::ostringstream out;
            out << value;
            add(key, out.str());
        }
        return *this;
    }

    /** Returns the JSON string. */
    [[nodiscard]] auto str() const -> std::string;
};

/**
 * Factory returning a `JsonStats` by value; only declared in this header.
 * NOTE(review): presumably yields an empty stats object ready for chained
 * `add` calls — confirm against the definition.
 */
[[nodiscard]] auto json_stats() -> JsonStats;

} // namespace pisa
92 changes: 0 additions & 92 deletions include/pisa/util/util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,6 @@

#include <cassert>
#include <chrono>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <map>
#include <vector>

#include "util/broadword.hpp"

Expand Down Expand Up @@ -112,90 +106,4 @@ function_iterator<State, AdvanceFunctor, ValueFunctor> make_function_iterator(
);
}

/**
 * Streams a single JSON object to `std::cout`.
 *
 * The constructor prints the opening brace, every `operator()(key, value)`
 * call prints one `key: value` member (comma separated), and the destructor
 * prints the closing brace followed by `std::endl`.
 *
 * Keys/values that are C strings or `std::string` are printed as quoted JSON
 * strings with `"`, `\` and control characters escaped. Vectors, maps, pairs
 * and tuples are printed as JSON arrays; anything else is printed with
 * `operator<<` as is.
 */
struct stats_line {
    stats_line() { std::cout << "{"; }
    stats_line(stats_line const&) = default;
    stats_line(stats_line&&) noexcept = default;
    stats_line& operator=(stats_line const&) = default;
    stats_line& operator=(stats_line&&) noexcept = default;
    ~stats_line() { std::cout << "}" << std::endl; }

    /** Prints one `key: value` member, preceded by `, ` unless it is the first. */
    template <typename K, typename T>
    stats_line& operator()(K const& key, T const& value) {
        if (!first) {
            std::cout << ", ";
        } else {
            first = false;
        }

        emit(key);
        std::cout << ": ";
        emit(value);
        return *this;
    }

    /** Lets an object add its own members through its `dump(stats_line&)` method. */
    template <typename T>
    stats_line& operator()(T const& obj) {
        return obj.dump(*this);
    }

  private:
    /** Generic fallback: relies on the type's own `operator<<`. */
    template <typename T>
    void emit(T const& v) const {
        std::cout << v;
    }

    /**
     * Prints one character of a JSON string, escaping what must not appear
     * verbatim between the quotes: `"`, `\` and control characters below 0x20.
     */
    void emit_escaped(char c) const {
        switch (c) {
        case '"': std::cout << "\\\""; return;
        case '\\': std::cout << "\\\\"; return;
        case '\b': std::cout << "\\b"; return;
        case '\f': std::cout << "\\f"; return;
        case '\n': std::cout << "\\n"; return;
        case '\r': std::cout << "\\r"; return;
        case '\t': std::cout << "\\t"; return;
        default: break;
        }
        auto const code = static_cast<unsigned char>(c);
        if (code < 0x20) {
            static constexpr char hex[] = "0123456789abcdef";
            std::cout << "\\u00" << hex[code >> 4] << hex[code & 0x0F];
        } else {
            std::cout << c;
        }
    }

    /** Prints a NUL-terminated string as a quoted, escaped JSON string. */
    void emit(const char* s) const {
        std::cout << '"';
        for (; *s != '\0'; ++s) {
            emit_escaped(*s);
        }
        std::cout << '"';
    }

    /** Prints the whole string (embedded NULs included) as a JSON string. */
    void emit(std::string const& s) const {
        std::cout << '"';
        for (char const c: s) {
            emit_escaped(c);
        }
        std::cout << '"';
    }

    /** Prints the elements as a JSON array. */
    template <typename T>
    void emit(std::vector<T> const& v) const {
        std::cout << "[";
        bool first = true;
        for (auto const& i: v) {
            if (first) {
                first = false;
            } else {
                std::cout << ", ";
            }
            emit(i);
        }
        std::cout << "]";
    }

    /** Prints a map as an array of `[key, value]` arrays, in map order. */
    template <typename K, typename V>
    void emit(std::map<K, V> const& m) const {
        std::vector<std::pair<K, V>> v(m.begin(), m.end());
        emit(v);
    }

    // Recursive case: print elements 0..Pos-1, then a separator and element Pos.
    template <typename Tuple, size_t Pos>
    typename std::enable_if<Pos != 0, void>::type emit_tuple_helper(Tuple const& t) const {
        emit_tuple_helper<Tuple, Pos - 1>(t);
        std::cout << ", ";
        emit(std::get<Pos>(t));
    }

    // Base case: the first element has no leading separator.
    template <typename Tuple, size_t Pos>
    typename std::enable_if<Pos == 0, void>::type emit_tuple_helper(Tuple const& t) const {
        emit(std::get<0>(t));
    }

    /** Prints the tuple's elements as a JSON array. */
    template <typename... Tp>
    void emit(std::tuple<Tp...> const& t) const {
        std::cout << "[";
        emit_tuple_helper<std::tuple<Tp...>, sizeof...(Tp) - 1>(t);
        std::cout << "]";
    }

    /** Prints a pair as a two-element JSON array. */
    template <typename T1, typename T2>
    void emit(std::pair<T1, T2> const& p) const {
        emit(std::make_tuple(p.first, p.second));
    }

    // True until the first member has been printed; controls the separator.
    bool first{true};
};

} // namespace pisa
8 changes: 5 additions & 3 deletions src/block_inverted_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -183,9 +183,11 @@ void BlockIndexBuilder::build(binary_freq_collection const& input, std::string c
double elapsed_secs = (get_time_usecs() - tick) / 1000000;
spdlog::info("Index compressed in {} seconds", elapsed_secs);

stats_line()("type", m_block_codec->get_name())("worker_threads", std::thread::hardware_concurrency())(
"construction_time", elapsed_secs
);
std::cout << pisa::json_stats()
.add("type", m_block_codec->get_name())
.add("worker_threads", std::thread::hardware_concurrency())
.add("construction_time", elapsed_secs)
.str();

if (m_check) {
BlockInvertedIndex index(
Expand Down
22 changes: 15 additions & 7 deletions src/compress.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "linear_quantizer.hpp"
#include "type_safe.hpp"
#include "util/index_build_utils.hpp"
#include "util/json_stats.hpp"
#include "util/progress.hpp"
#include "util/verify_collection.hpp"
#include "wand_data.hpp"
Expand All @@ -23,7 +24,10 @@ template <typename Collection>
void dump_index_specific_stats(Collection const&, std::string const&) {}

void dump_index_specific_stats(pisa::pefuniform_index const& coll, std::string const& type) {
pisa::stats_line()("type", type)("log_partition_size", int(coll.params().log_partition_size));
std::cout << pisa::json_stats()
.add("type", type)
.add("log_partition_size", int(coll.params().log_partition_size))
.str();
}

void dump_index_specific_stats(pisa::pefopt_index const& coll, std::string const& type) {
Expand All @@ -41,9 +45,11 @@ void dump_index_specific_stats(pisa::pefopt_index const& coll, std::string const
}
}

pisa::stats_line()("type", type)("docs_avg_part", long_postings / docs_partitions)(
"freqs_avg_part", long_postings / freqs_partitions
);
std::cout << pisa::json_stats()
.add("type", type)
.add("docs_avg_part", long_postings / docs_partitions)
.add("freqs_avg_part", long_postings / freqs_partitions)
.str();
}

template <typename CollectionType, typename Wand>
Expand Down Expand Up @@ -173,9 +179,11 @@ void compress_index(
double elapsed_secs = (get_time_usecs() - tick) / 1000000;
spdlog::info("{} collection built in {} seconds", seq_type, elapsed_secs);

stats_line()("type", seq_type)("worker_threads", std::thread::hardware_concurrency())(
"construction_time", elapsed_secs
);
std::cout << pisa::json_stats()
.add("type", seq_type)
.add("worker_threads", std::thread::hardware_concurrency())
.add("construction_time", elapsed_secs)
.str();

dump_stats(coll, seq_type, postings);
dump_index_specific_stats(coll, seq_type);
Expand Down
2 changes: 1 addition & 1 deletion src/sharding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ void rearrange_sequences(
) {
spdlog::info("Rearranging documents");
if (not shard_count) {
*shard_count = *std::max_element(mapping.begin(), mapping.end()) + 1;
shard_count = *std::max_element(mapping.begin(), mapping.end()) + 1;
}
std::ifstream is(input_basename);
std::ifstream dis(fmt::format("{}.documents", input_basename));
Expand Down
Loading