From cb6ff29fbdb8b0ea89a5d0eab4160b571317c508 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Mon, 17 Feb 2025 16:48:43 +0100 Subject: [PATCH 01/28] WIP --- .../keyvi/compression/compression_selector.h | 20 +++++++++++++++++ .../keyvi/compression/compression_strategy.h | 8 +------ .../compression/snappy_compression_strategy.h | 6 ----- keyvi/include/keyvi/dictionary/match.h | 22 +++++++++++++------ 4 files changed, 36 insertions(+), 20 deletions(-) diff --git a/keyvi/include/keyvi/compression/compression_selector.h b/keyvi/include/keyvi/compression/compression_selector.h index 5eea0f87e..4b0af45a9 100644 --- a/keyvi/include/keyvi/compression/compression_selector.h +++ b/keyvi/include/keyvi/compression/compression_selector.h @@ -84,6 +84,26 @@ inline decompress_func_t decompressor_by_code(const std::string& s) { } } +typedef void (*compress_buffer_func_t)(buffer_t*, const char*, size_t); + +inline compress_buffer_func_t decompressor_by_code(CompressionAlgorithm algorithm) { + switch (algorithm) { + case NO_COMPRESSION: + TRACE("unpack uncompressed string"); + return RawCompressionStrategy::DoCompress; + //case ZLIB_COMPRESSION: + // TRACE("unpack zlib compressed string"); + // return ZlibCompressionStrategy::DoCompress; + case SNAPPY_COMPRESSION: + TRACE("unpack snappy compressed string"); + return SnappyCompressionStrategy::DoCompress; + default: + throw std::invalid_argument("Invalid compression algorith " + + boost::lexical_cast(static_cast(algorithm))); + } +} + + } /* namespace compression */ } /* namespace keyvi */ diff --git a/keyvi/include/keyvi/compression/compression_strategy.h b/keyvi/include/keyvi/compression/compression_strategy.h index 721f96d80..1e25748a5 100644 --- a/keyvi/include/keyvi/compression/compression_strategy.h +++ b/keyvi/include/keyvi/compression/compression_strategy.h @@ -34,7 +34,7 @@ namespace keyvi { namespace compression { -enum CompressionCode { +enum CompressionAlgorithm { NO_COMPRESSION = 0, ZLIB_COMPRESSION = 1, SNAPPY_COMPRESSION = 2, @@ -90,12 +90,6 @@ struct RawCompressionStrategy final : public CompressionStrategy { std::memcpy(buffer->data() + 1, raw, raw_size); } - static inline std::string DoCompress(const char* raw, size_t raw_size) { - buffer_t buf; - DoCompress(&buf, raw, raw_size); - return std::string(buf.data(), buf.size()); - } - inline std::string Decompress(const std::string& compressed) { return DoDecompress(compressed); } static inline std::string DoDecompress(const std::string& compressed) { return compressed.substr(1); } diff --git a/keyvi/include/keyvi/compression/snappy_compression_strategy.h b/keyvi/include/keyvi/compression/snappy_compression_strategy.h index 39e393a8c..277903946 100644 --- a/keyvi/include/keyvi/compression/snappy_compression_strategy.h +++ b/keyvi/include/keyvi/compression/snappy_compression_strategy.h @@ -47,12 +47,6 @@ struct SnappyCompressionStrategy final : public CompressionStrategy { buffer->resize(output_length + 1); } - static inline std::string DoCompress(const char* raw, size_t raw_size) { - buffer_t buf; - DoCompress(&buf, raw, raw_size); - return std::string(buf.data(), buf.size()); - } - inline std::string Decompress(const std::string& compressed) { return DoDecompress(compressed); } static std::string DoDecompress(const std::string& compressed) { diff --git a/keyvi/include/keyvi/dictionary/match.h b/keyvi/include/keyvi/dictionary/match.h index db8b2a684..d1e9f3715 100644 --- a/keyvi/include/keyvi/dictionary/match.h +++ b/keyvi/include/keyvi/dictionary/match.h @@ -32,6 +32,7 @@ #include #include +#include "keyvi/compression/compression_strategy.h" #include "keyvi/dictionary/fsa/automata.h" #include "keyvi/util/json_value.h" @@ -196,12 +197,23 @@ struct Match { return fsa_->GetRawValueAsString(state_); } - std::string GetMsgPackedValueAsString() const { + std::string GetMsgPackedValueAsString( + const compression::CompressionAlgorithm compression_algorithm = compression::CompressionAlgorithm::NO_COMPRESSION) const { const std::string raw_value = GetRawValueAsString(); if (raw_value.empty()) { return raw_value; } + + if (raw_value[0] == compression_algorithm) { + return raw_value.substr(1); + } else if (compression_algorithm == compression::CompressionAlgorithm::NO_COMPRESSION) { + const compression::decompress_func_t decompressor = compression::decompressor_by_code(raw_value); + return decompressor(raw_value); + } + + // todo: recompress const compression::decompress_func_t decompressor = compression::decompressor_by_code(raw_value); + return decompressor(raw_value); } @@ -210,9 +222,7 @@ struct Match { * * @param value */ - void SetRawValue(const std::string& value) { - raw_value_ = value; - } + void SetRawValue(const std::string& value) { raw_value_ = value; } private: size_t start_ = 0; @@ -230,9 +240,7 @@ struct Match { template friend match_t index::internal::FirstFilteredMatch(const MatcherT&, const DeletedT&); - fsa::automata_t& GetFsa() { - return fsa_; - } + fsa::automata_t& GetFsa() { return fsa_; } }; } /* namespace dictionary */ From 8d708c0cae3e6548ea6ebbbc3c4d7bf0ae548a21 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Sat, 22 Feb 2025 09:22:24 +0100 Subject: [PATCH 02/28] switch to unique ptr --- .../keyvi/compression/compression_selector.h | 18 +++++++----------- .../keyvi/compression/compression_strategy.h | 3 +++ keyvi/include/keyvi/dictionary/match.h | 8 ++++---- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/keyvi/include/keyvi/compression/compression_selector.h b/keyvi/include/keyvi/compression/compression_selector.h index 4b0af45a9..6da7886cb 100644 --- a/keyvi/include/keyvi/compression/compression_selector.h +++ b/keyvi/include/keyvi/compression/compression_selector.h @@ -25,6 +25,7 @@ #ifndef KEYVI_COMPRESSION_COMPRESSION_SELECTOR_H_ #define KEYVI_COMPRESSION_COMPRESSION_SELECTOR_H_ +#include #include #include @@ -84,26 +85,21 @@ inline decompress_func_t decompressor_by_code(const std::string& s) { } } -typedef void (*compress_buffer_func_t)(buffer_t*, const char*, size_t); - -inline compress_buffer_func_t decompressor_by_code(CompressionAlgorithm algorithm) { +/** Returns an instance of a compression strategy by name. */ +inline compression_strategy_t compression_strategy_by_code(const CompressionAlgorithm algorithm) { switch (algorithm) { case NO_COMPRESSION: - TRACE("unpack uncompressed string"); - return RawCompressionStrategy::DoCompress; - //case ZLIB_COMPRESSION: - // TRACE("unpack zlib compressed string"); - // return ZlibCompressionStrategy::DoCompress; + return std::make_unique(); + case ZLIB_COMPRESSION: + return std::make_unique(); case SNAPPY_COMPRESSION: - TRACE("unpack snappy compressed string"); - return SnappyCompressionStrategy::DoCompress; + return std::make_unique(); default: throw std::invalid_argument("Invalid compression algorith " + boost::lexical_cast(static_cast(algorithm))); } } - } /* namespace compression */ } /* namespace keyvi */ diff --git a/keyvi/include/keyvi/compression/compression_strategy.h b/keyvi/include/keyvi/compression/compression_strategy.h index 1e25748a5..45ee2d531 100644 --- a/keyvi/include/keyvi/compression/compression_strategy.h +++ b/keyvi/include/keyvi/compression/compression_strategy.h @@ -26,6 +26,7 @@ #define KEYVI_COMPRESSION_COMPRESSION_STRATEGY_H_ #include +#include #include #include @@ -77,6 +78,8 @@ struct CompressionStrategy { virtual uint64_t GetFileVersionMin() const = 0; }; +using compression_strategy_t = std::unique_ptr; + /** * A compression strategy that does almost nothing; i.e. it only adds * the length field. diff --git a/keyvi/include/keyvi/dictionary/match.h b/keyvi/include/keyvi/dictionary/match.h index d1e9f3715..6a4969344 100644 --- a/keyvi/include/keyvi/dictionary/match.h +++ b/keyvi/include/keyvi/dictionary/match.h @@ -197,8 +197,8 @@ struct Match { return fsa_->GetRawValueAsString(state_); } - std::string GetMsgPackedValueAsString( - const compression::CompressionAlgorithm compression_algorithm = compression::CompressionAlgorithm::NO_COMPRESSION) const { + std::string GetMsgPackedValueAsString(const compression::CompressionAlgorithm compression_algorithm = + compression::CompressionAlgorithm::NO_COMPRESSION) const { const std::string raw_value = GetRawValueAsString(); if (raw_value.empty()) { return raw_value; @@ -211,9 +211,9 @@ struct Match { return decompressor(raw_value); } - // todo: recompress + // todo: recompress const compression::decompress_func_t decompressor = compression::decompressor_by_code(raw_value); - + return decompressor(raw_value); } From 28491ec2b828968a0a808fc2ba912f052a586cf2 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Wed, 26 Feb 2025 23:17:30 +0100 Subject: [PATCH 03/28] add compression support in GetMsgPackedValueAsString --- .../keyvi/compression/compression_selector.h | 14 +++++---- .../keyvi/compression/compression_strategy.h | 6 ++++ keyvi/include/keyvi/dictionary/fsa/automata.h | 6 ++++ .../dictionary/fsa/internal/ivalue_store.h | 29 +++++++++++++++++-- .../fsa/internal/json_value_store.h | 24 +++++++++++++++ keyvi/include/keyvi/dictionary/match.h | 17 +---------- keyvi/include/keyvi/util/float_vector_value.h | 2 +- keyvi/include/keyvi/util/json_value.h | 14 ++------- keyvi/include/keyvi/util/msgpack_util.h | 24 +++++++++++++++ 9 files changed, 100 insertions(+), 36 deletions(-) diff --git a/keyvi/include/keyvi/compression/compression_selector.h b/keyvi/include/keyvi/compression/compression_selector.h index 6da7886cb..be61717b9 100644 --- a/keyvi/include/keyvi/compression/compression_selector.h +++ b/keyvi/include/keyvi/compression/compression_selector.h @@ -65,8 +65,8 @@ inline CompressionStrategy* compression_strategy(const std::string& name = "") { typedef std::string (*decompress_func_t)(const std::string&); typedef void (CompressionStrategy::*compress_mem_fn_t)(buffer_t*, const char*, size_t); -inline decompress_func_t decompressor_by_code(const std::string& s) { - switch (s[0]) { +inline decompress_func_t decompressor_by_code(const char code) { + switch (code) { case NO_COMPRESSION: TRACE("unpack uncompressed string"); return RawCompressionStrategy::DoDecompress; @@ -81,12 +81,16 @@ inline decompress_func_t decompressor_by_code(const std::string& s) { return ZstdCompressionStrategy::DoDecompress; default: throw std::invalid_argument("Invalid compression code " + - boost::lexical_cast(static_cast(s[0]))); + boost::lexical_cast(static_cast(code))); } } -/** Returns an instance of a compression strategy by name. */ -inline compression_strategy_t compression_strategy_by_code(const CompressionAlgorithm algorithm) { +inline decompress_func_t decompressor_from_string(const std::string& s) { + return decompressor_by_code(s[0]); +} + +/** Returns an instance of a compression strategy by enum. */ +inline compression_strategy_t compression_strategy_by_enum(const CompressionAlgorithm algorithm) { switch (algorithm) { case NO_COMPRESSION: return std::make_unique(); diff --git a/keyvi/include/keyvi/compression/compression_strategy.h b/keyvi/include/keyvi/compression/compression_strategy.h index 45ee2d531..7cf5c225b 100644 --- a/keyvi/include/keyvi/compression/compression_strategy.h +++ b/keyvi/include/keyvi/compression/compression_strategy.h @@ -65,6 +65,12 @@ struct CompressionStrategy { return std::string(buf.data(), buf.size()); } + inline std::string CompressWithoutHeader(const std::string& raw) { + buffer_t buf; + Compress(&buf, raw.data(), raw.size()); + return std::string(buf.data() + 1, buf.size() - 1); + } + /** * By the time this function is called, the length field added in Compress() * will have been removed. diff --git a/keyvi/include/keyvi/dictionary/fsa/automata.h b/keyvi/include/keyvi/dictionary/fsa/automata.h index 8b531e32d..6d7c98dfb 100644 --- a/keyvi/include/keyvi/dictionary/fsa/automata.h +++ b/keyvi/include/keyvi/dictionary/fsa/automata.h @@ -394,6 +394,12 @@ class Automata final { return value_store_reader_->GetRawValueAsString(state_value); } + std::string GetMsgPackedValueAsString(uint64_t state_value, const compression::CompressionAlgorithm compression_algorithm = + compression::CompressionAlgorithm::NO_COMPRESSION) const { + assert(value_store_reader_); + return value_store_reader_->GetMsgPackedValueAsString(state_value, compression_algorithm); + } + std::string GetStatistics() const { return dictionary_properties_->GetStatistics(); } diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/ivalue_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/ivalue_store.h index 1175cb024..3b8550e46 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/ivalue_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/ivalue_store.h @@ -33,6 +33,7 @@ #include #include +#include "keyvi/compression/compression_selector.h" #include "keyvi/dictionary/dictionary_merger_fwd.h" #include "keyvi/dictionary/fsa/internal/value_store_properties.h" #include "keyvi/dictionary/fsa/internal/value_store_types.h" @@ -111,15 +112,39 @@ class IValueStoreReader { * Get Value as string in raw format * * Note: The raw format is an implementation detail of keyvi, not an official binary interface. - * Value store implementers can override this method for performance reasons. + * Value store implementers can override this method with an optimized version. * * @param fsa_value - * @return the value as string without any decompression + * @return the value as binary encoded string */ virtual std::string GetRawValueAsString(uint64_t fsa_value) const { return keyvi::util::EncodeJsonValue(GetValueAsString(fsa_value)); } + /** + * Get Value as msgpack string + * + * Value store implementers can override this method with an optimized version. + * + * @param fsa_value + * @return the value as msgpack string + */ + virtual std::string GetMsgPackedValueAsString(uint64_t fsa_value, + const compression::CompressionAlgorithm compression_algorithm = + compression::CompressionAlgorithm::NO_COMPRESSION) const { + const std::string msgpacked_value = keyvi::util::JsonStringToMsgPack(GetValueAsString(fsa_value)); + + if (compression_algorithm == compression::CompressionAlgorithm::NO_COMPRESSION) { + return msgpacked_value; + } + + // compress the value + const compression::compression_strategy_t compressor = + compression::compression_strategy_by_enum(compression_algorithm); + + return compressor->Compress(msgpacked_value); + } + /** * Get Value as string (for dumping or communication) * diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h index 78f0b149f..705f474f9 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h @@ -372,6 +372,30 @@ class JsonValueStoreReader final : public IValueStoreReader { return keyvi::util::decodeVarIntString(strings_ + fsa_value); } + std::string GetMsgPackedValueAsString(uint64_t fsa_value, + const compression::CompressionAlgorithm compression_algorithm = + compression::CompressionAlgorithm::NO_COMPRESSION) const override { + size_t value_size; + const char* value_ptr = keyvi::util::decodeVarIntString(strings_ + fsa_value, &value_size); + + if (value_ptr[0] == compression_algorithm) { + return std::string(value_ptr[1], value_size - 1); + } + + // decompress + const compression::decompress_func_t decompressor = compression::decompressor_by_code(value_ptr[0]); + std::string msgpacked_value = decompressor(std::string(value_ptr, value_size)); + + if (compression_algorithm == compression::CompressionAlgorithm::NO_COMPRESSION) { + return msgpacked_value; + } + // compress + const compression::compression_strategy_t compressor = + compression::compression_strategy_by_enum(compression_algorithm); + + return compressor->CompressWithoutHeader(msgpacked_value); + } + std::string GetValueAsString(uint64_t fsa_value) const override { TRACE("JsonValueStoreReader GetValueAsString"); std::string packed_string = keyvi::util::decodeVarIntString(strings_ + fsa_value); diff --git a/keyvi/include/keyvi/dictionary/match.h b/keyvi/include/keyvi/dictionary/match.h index 6a4969344..15d12dccb 100644 --- a/keyvi/include/keyvi/dictionary/match.h +++ b/keyvi/include/keyvi/dictionary/match.h @@ -199,22 +199,7 @@ struct Match { std::string GetMsgPackedValueAsString(const compression::CompressionAlgorithm compression_algorithm = compression::CompressionAlgorithm::NO_COMPRESSION) const { - const std::string raw_value = GetRawValueAsString(); - if (raw_value.empty()) { - return raw_value; - } - - if (raw_value[0] == compression_algorithm) { - return raw_value.substr(1); - } else if (compression_algorithm == compression::CompressionAlgorithm::NO_COMPRESSION) { - const compression::decompress_func_t decompressor = compression::decompressor_by_code(raw_value); - return decompressor(raw_value); - } - - // todo: recompress - const compression::decompress_func_t decompressor = compression::decompressor_by_code(raw_value); - - return decompressor(raw_value); + return fsa_->GetMsgPackedValueAsString(state_, compression_algorithm); } /** diff --git a/keyvi/include/keyvi/util/float_vector_value.h b/keyvi/include/keyvi/util/float_vector_value.h index a36317331..8f9231a96 100644 --- a/keyvi/include/keyvi/util/float_vector_value.h +++ b/keyvi/include/keyvi/util/float_vector_value.h @@ -35,7 +35,7 @@ namespace keyvi { namespace util { inline std::vector DecodeFloatVector(const std::string& encoded_value) { - compression::decompress_func_t decompressor = compression::decompressor_by_code(encoded_value); + compression::decompress_func_t decompressor = compression::decompressor_from_string(encoded_value); std::string unompressed_string_value = decompressor(encoded_value); const size_t vector_size = unompressed_string_value.size() / sizeof(uint32_t); diff --git a/keyvi/include/keyvi/util/json_value.h b/keyvi/include/keyvi/util/json_value.h index 2fb35f3cf..6463dc011 100644 --- a/keyvi/include/keyvi/util/json_value.h +++ b/keyvi/include/keyvi/util/json_value.h @@ -42,7 +42,7 @@ namespace util { /** Decompresses (if needed) and decodes a json value stored in a JsonValueStore. */ inline std::string DecodeJsonValue(const std::string& encoded_value) { - compression::decompress_func_t decompressor = compression::decompressor_by_code(encoded_value); + compression::decompress_func_t decompressor = compression::decompressor_from_string(encoded_value); std::string packed_string = decompressor(encoded_value); TRACE("unpacking %s", packed_string.c_str()); @@ -64,17 +64,7 @@ inline void EncodeJsonValue(std::functionclear(); - rapidjson::Document json_document; - json_document.Parse(raw_value.c_str()); - - if (!json_document.HasParseError()) { - TRACE("Got json"); - msgpack::packer packer(msgpack_buffer); - JsonToMsgPack(json_document, &packer, single_precision_float); - } else { - TRACE("Got a normal string"); - msgpack::pack(msgpack_buffer, raw_value); - } + JsonStringToMsgPack(raw_value, msgpack_buffer, single_precision_float); // compression if (msgpack_buffer->size() > compression_threshold) { long_compress(buffer, msgpack_buffer->data(), msgpack_buffer->size()); diff --git a/keyvi/include/keyvi/util/msgpack_util.h b/keyvi/include/keyvi/util/msgpack_util.h index 8ce3a6ce4..55ce46fab 100644 --- a/keyvi/include/keyvi/util/msgpack_util.h +++ b/keyvi/include/keyvi/util/msgpack_util.h @@ -147,6 +147,30 @@ inline void MsgPackDump(Writer* writer, const msgpack::object& o) { } } +inline void JsonStringToMsgPack(const std::string& raw_value, msgpack::v1::sbuffer* msgpack_buffer, + bool single_precision_float) { + rapidjson::Document json_document; + json_document.Parse(raw_value.c_str()); + + if (!json_document.HasParseError()) { + TRACE("Got json"); + msgpack::packer packer(msgpack_buffer); + JsonToMsgPack(json_document, &packer, single_precision_float); + } else { + TRACE("Got a normal string"); + msgpack::pack(msgpack_buffer, raw_value); + } +} + +inline std::string JsonStringToMsgPack(const std::string& raw_value, bool single_precision_float = false +) { +msgpack::sbuffer msgpack_buffer; +compression::buffer_t buffer; + +JsonStringToMsgPack(raw_value, &msgpack_buffer, single_precision_float); +return std::string(reinterpret_cast(buffer.data()), buffer.size()); +} + } /* namespace util */ } /* namespace keyvi */ From 3d2878272668a996c397bb82aee77f31d54c84eb Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Wed, 26 Feb 2025 23:19:22 +0100 Subject: [PATCH 04/28] format --- keyvi/include/keyvi/dictionary/fsa/automata.h | 17 ++++++----------- keyvi/include/keyvi/util/msgpack_util.h | 14 +++++++------- 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/keyvi/include/keyvi/dictionary/fsa/automata.h b/keyvi/include/keyvi/dictionary/fsa/automata.h index 6d7c98dfb..2a7eaaa1d 100644 --- a/keyvi/include/keyvi/dictionary/fsa/automata.h +++ b/keyvi/include/keyvi/dictionary/fsa/automata.h @@ -394,19 +394,16 @@ class Automata final { return value_store_reader_->GetRawValueAsString(state_value); } - std::string GetMsgPackedValueAsString(uint64_t state_value, const compression::CompressionAlgorithm compression_algorithm = - compression::CompressionAlgorithm::NO_COMPRESSION) const { + std::string GetMsgPackedValueAsString(uint64_t state_value, + const compression::CompressionAlgorithm compression_algorithm = + compression::CompressionAlgorithm::NO_COMPRESSION) const { assert(value_store_reader_); return value_store_reader_->GetMsgPackedValueAsString(state_value, compression_algorithm); } - std::string GetStatistics() const { - return dictionary_properties_->GetStatistics(); - } + std::string GetStatistics() const { return dictionary_properties_->GetStatistics(); } - const std::string& GetManifest() const { - return dictionary_properties_->GetManifest(); - } + const std::string& GetManifest() const { return dictionary_properties_->GetManifest(); } const uint64_t GetVersion() const { return dictionary_properties_->GetVersion(); @@ -472,9 +469,7 @@ class Automata final { friend class keyvi::dictionary::SecondaryKeyDictionary; - const dictionary_properties_t& GetDictionaryProperties() const { - return dictionary_properties_; - } + const dictionary_properties_t& GetDictionaryProperties() const { return dictionary_properties_; } }; // shared pointer diff --git a/keyvi/include/keyvi/util/msgpack_util.h b/keyvi/include/keyvi/util/msgpack_util.h index 55ce46fab..120f90469 100644 --- a/keyvi/include/keyvi/util/msgpack_util.h +++ b/keyvi/include/keyvi/util/msgpack_util.h @@ -25,6 +25,7 @@ #ifndef KEYVI_UTIL_MSGPACK_UTIL_H_ #define KEYVI_UTIL_MSGPACK_UTIL_H_ #include +#include #include "msgpack.hpp" #include "rapidjson/document.h" @@ -148,7 +149,7 @@ inline void MsgPackDump(Writer* writer, const msgpack::object& o) { } inline void JsonStringToMsgPack(const std::string& raw_value, msgpack::v1::sbuffer* msgpack_buffer, - bool single_precision_float) { + bool single_precision_float) { rapidjson::Document json_document; json_document.Parse(raw_value.c_str()); @@ -162,13 +163,12 @@ inline void JsonStringToMsgPack(const std::string& raw_value, msgpack::v1::sbuff } } -inline std::string JsonStringToMsgPack(const std::string& raw_value, bool single_precision_float = false -) { -msgpack::sbuffer msgpack_buffer; -compression::buffer_t buffer; +inline std::string JsonStringToMsgPack(const std::string& raw_value, bool single_precision_float = false) { + msgpack::sbuffer msgpack_buffer; + compression::buffer_t buffer; -JsonStringToMsgPack(raw_value, &msgpack_buffer, single_precision_float); -return std::string(reinterpret_cast(buffer.data()), buffer.size()); + JsonStringToMsgPack(raw_value, &msgpack_buffer, single_precision_float); + return std::string(reinterpret_cast(buffer.data()), buffer.size()); } } /* namespace util */ From 6c9d974987f2a38cd967c915f29d74087f53af30 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Wed, 26 Feb 2025 23:45:46 +0100 Subject: [PATCH 05/28] renamings --- keyvi/include/keyvi/compression/compression_selector.h | 10 +++++----- .../keyvi/dictionary/fsa/internal/ivalue_store.h | 4 ++-- .../keyvi/dictionary/fsa/internal/json_value_store.h | 5 +++-- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/keyvi/include/keyvi/compression/compression_selector.h b/keyvi/include/keyvi/compression/compression_selector.h index be61717b9..dbd961778 100644 --- a/keyvi/include/keyvi/compression/compression_selector.h +++ b/keyvi/include/keyvi/compression/compression_selector.h @@ -65,8 +65,8 @@ inline CompressionStrategy* compression_strategy(const std::string& name = "") { typedef std::string (*decompress_func_t)(const std::string&); typedef void (CompressionStrategy::*compress_mem_fn_t)(buffer_t*, const char*, size_t); -inline decompress_func_t decompressor_by_code(const char code) { - switch (code) { +inline decompress_func_t decompressor_by_code(const CompressionAlgorithm algorithm) { + switch (algorithm) { case NO_COMPRESSION: TRACE("unpack uncompressed string"); return RawCompressionStrategy::DoDecompress; @@ -81,16 +81,16 @@ inline decompress_func_t decompressor_by_code(const char code) { return ZstdCompressionStrategy::DoDecompress; default: throw std::invalid_argument("Invalid compression code " + - boost::lexical_cast(static_cast(code))); + boost::lexical_cast(static_cast(algorithm))); } } inline decompress_func_t decompressor_from_string(const std::string& s) { - return decompressor_by_code(s[0]); + return decompressor_by_code(static_cast(s[0])); } /** Returns an instance of a compression strategy by enum. */ -inline compression_strategy_t compression_strategy_by_enum(const CompressionAlgorithm algorithm) { +inline compression_strategy_t compression_strategy_by_code(const CompressionAlgorithm algorithm) { switch (algorithm) { case NO_COMPRESSION: return std::make_unique(); diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/ivalue_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/ivalue_store.h index 3b8550e46..baf7de73b 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/ivalue_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/ivalue_store.h @@ -140,9 +140,9 @@ class IValueStoreReader { // compress the value const compression::compression_strategy_t compressor = - compression::compression_strategy_by_enum(compression_algorithm); + compression::compression_strategy_by_code(compression_algorithm); - return compressor->Compress(msgpacked_value); + return compressor->CompressWithoutHeader(msgpacked_value); } /** diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h index 705f474f9..7d57e1af3 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h @@ -383,7 +383,8 @@ class JsonValueStoreReader final : public IValueStoreReader { } // decompress - const compression::decompress_func_t decompressor = compression::decompressor_by_code(value_ptr[0]); + const compression::decompress_func_t decompressor = + compression::decompressor_by_code(static_cast(value_ptr[0])); std::string msgpacked_value = decompressor(std::string(value_ptr, value_size)); if (compression_algorithm == compression::CompressionAlgorithm::NO_COMPRESSION) { @@ -391,7 +392,7 @@ class JsonValueStoreReader final : public IValueStoreReader { } // compress const compression::compression_strategy_t compressor = - compression::compression_strategy_by_enum(compression_algorithm); + compression::compression_strategy_by_code(compression_algorithm); return compressor->CompressWithoutHeader(msgpacked_value); } From 8291e618bd11f8bca023a9d4dc015fdb01bde35c Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Mon, 10 Mar 2025 10:35:52 +0100 Subject: [PATCH 06/28] fix JsonStringToMsgPack --- keyvi/include/keyvi/compression/compression_selector.h | 4 ++-- .../include/keyvi/dictionary/fsa/internal/json_value_store.h | 4 ++++ keyvi/include/keyvi/util/msgpack_util.h | 3 +-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/keyvi/include/keyvi/compression/compression_selector.h b/keyvi/include/keyvi/compression/compression_selector.h index dbd961778..6aa0c6269 100644 --- a/keyvi/include/keyvi/compression/compression_selector.h +++ b/keyvi/include/keyvi/compression/compression_selector.h @@ -80,7 +80,7 @@ inline decompress_func_t decompressor_by_code(const CompressionAlgorithm algorit TRACE("unpack zstd compressed string"); return ZstdCompressionStrategy::DoDecompress; default: - throw std::invalid_argument("Invalid compression code " + + throw std::invalid_argument("Invalid compression algorithm " + boost::lexical_cast(static_cast(algorithm))); } } @@ -99,7 +99,7 @@ inline compression_strategy_t compression_strategy_by_code(const CompressionAlgo case SNAPPY_COMPRESSION: return std::make_unique(); default: - throw std::invalid_argument("Invalid compression algorith " + + throw std::invalid_argument("Invalid compression algorithm " + boost::lexical_cast(static_cast(algorithm))); } } diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h index 7d57e1af3..9e8d178f6 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h @@ -378,6 +378,10 @@ class JsonValueStoreReader final : public IValueStoreReader { size_t value_size; const char* value_ptr = keyvi::util::decodeVarIntString(strings_ + fsa_value, &value_size); + if (value_size == 0) { + return std::string(); + } + if (value_ptr[0] == compression_algorithm) { return std::string(value_ptr[1], value_size - 1); } diff --git a/keyvi/include/keyvi/util/msgpack_util.h b/keyvi/include/keyvi/util/msgpack_util.h index 120f90469..a353f915b 100644 --- a/keyvi/include/keyvi/util/msgpack_util.h +++ b/keyvi/include/keyvi/util/msgpack_util.h @@ -165,10 +165,9 @@ inline void JsonStringToMsgPack(const std::string& raw_value, msgpack::v1::sbuff inline std::string JsonStringToMsgPack(const std::string& raw_value, bool single_precision_float = false) { msgpack::sbuffer msgpack_buffer; - compression::buffer_t buffer; JsonStringToMsgPack(raw_value, &msgpack_buffer, single_precision_float); - return std::string(reinterpret_cast(buffer.data()), buffer.size()); + return std::string(reinterpret_cast(msgpack_buffer.data()), msgpack_buffer.size()); } } /* namespace util */ From 6494546132263e62cd1bd226435b40836817d327 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Fri, 14 Mar 2025 23:07:02 +0100 Subject: [PATCH 07/28] add msgpack support for loaded match objects --- keyvi/include/keyvi/dictionary/match.h | 24 ++++++++++++++++++++++++ python/src/addons/Match.pyx | 6 +++++- python/src/pxds/match.pxd | 4 ++-- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/keyvi/include/keyvi/dictionary/match.h b/keyvi/include/keyvi/dictionary/match.h index 15d12dccb..14d2584fe 100644 --- a/keyvi/include/keyvi/dictionary/match.h +++ b/keyvi/include/keyvi/dictionary/match.h @@ -199,6 +199,30 @@ struct Match { std::string GetMsgPackedValueAsString(const compression::CompressionAlgorithm compression_algorithm = compression::CompressionAlgorithm::NO_COMPRESSION) const { + if (!fsa_) { + if (raw_value_.empty()) { + return raw_value_; + } + + if (raw_value_[0] == compression_algorithm) { + return raw_value_.substr(1); + } + + // decompress + const compression::decompress_func_t decompressor = + compression::decompressor_by_code(static_cast(raw_value_[0])); + std::string msgpacked_value = decompressor(raw_value_); + + if (compression_algorithm == compression::CompressionAlgorithm::NO_COMPRESSION) { + return msgpacked_value; + } + // compress + const compression::compression_strategy_t compressor = + compression::compression_strategy_by_code(compression_algorithm); + + return compressor->CompressWithoutHeader(msgpacked_value); + } + return fsa_->GetMsgPackedValueAsString(state_, compression_algorithm); } diff --git a/python/src/addons/Match.pyx b/python/src/addons/Match.pyx index 372714270..3c7163ecf 100644 --- a/python/src/addons/Match.pyx +++ b/python/src/addons/Match.pyx @@ -169,7 +169,11 @@ def GetRawValueAsString(self, *args): """deprecated, use get_raw_value_as_string""" - return call_deprecated_method("GetRawValueAsString", "raw_value_as_string", self.raw_value_as_string, *args) + return call_deprecated_method("GetRawValueAsString", "dumps", self.dumps, *args) + + def raw_value_as_string(self, *args): + """deprecated, use get_raw_value_as_string""" + return call_deprecated_method("raw_value_as_string", "dumps", self.dumps, *args) def __bool__(self): return not self.inst.get().IsEmpty() diff --git a/python/src/pxds/match.pxd b/python/src/pxds/match.pxd index 775c5d289..9a6d36ac4 100644 --- a/python/src/pxds/match.pxd +++ b/python/src/pxds/match.pxd @@ -19,8 +19,8 @@ cdef extern from "keyvi/dictionary/match.h" namespace "keyvi::dictionary": void SetMatchedString (libcpp_utf8_string matched_string) # wrap-ignore PyObject* GetAttributePy(libcpp_utf8_string) except + nogil # wrap-ignore libcpp_utf8_output_string GetValueAsString() except + # wrap-as:value_as_string - libcpp_string GetRawValueAsString() except + # wrap-as:raw_value_as_string - libcpp_string GetMsgPackedValueAsString() except + # wrap-ignore + libcpp_string GetRawValueAsString() except + # wrap-as:dumps + libcpp_string GetMsgPackedValueAsString() except + # wrap-as:msgpacked_value_as_string void SetRawValue(libcpp_utf8_string) except + # wrap-ignore void SetAttribute(libcpp_utf8_string, libcpp_utf8_string) except + # wrap-ignore void SetAttribute(libcpp_utf8_string, float) except + # wrap-ignore From 5fd6d9de404bfec7220b323d61665bd35cdd013d Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Fri, 14 Mar 2025 23:17:17 +0100 Subject: [PATCH 08/28] fix style --- keyvi/include/keyvi/dictionary/fsa/automata.h | 12 +++++++++--- keyvi/include/keyvi/dictionary/match.h | 8 ++++++-- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/keyvi/include/keyvi/dictionary/fsa/automata.h b/keyvi/include/keyvi/dictionary/fsa/automata.h index 2a7eaaa1d..3041f0f3b 100644 --- a/keyvi/include/keyvi/dictionary/fsa/automata.h +++ b/keyvi/include/keyvi/dictionary/fsa/automata.h @@ -401,9 +401,13 @@ class Automata final { return value_store_reader_->GetMsgPackedValueAsString(state_value, compression_algorithm); } - std::string GetStatistics() const { return dictionary_properties_->GetStatistics(); } + std::string GetStatistics() const { + return dictionary_properties_->GetStatistics(); + } - const std::string& GetManifest() const { return dictionary_properties_->GetManifest(); } + const std::string& GetManifest() const { + return dictionary_properties_->GetManifest(); + } const uint64_t GetVersion() const { return dictionary_properties_->GetVersion(); @@ -469,7 +473,9 @@ class Automata final { friend class keyvi::dictionary::SecondaryKeyDictionary; - const dictionary_properties_t& GetDictionaryProperties() const { return dictionary_properties_; } + const dictionary_properties_t& GetDictionaryProperties() const { + return dictionary_properties_; + } }; // shared pointer diff --git a/keyvi/include/keyvi/dictionary/match.h b/keyvi/include/keyvi/dictionary/match.h index 14d2584fe..ee80d990d 100644 --- a/keyvi/include/keyvi/dictionary/match.h +++ b/keyvi/include/keyvi/dictionary/match.h @@ -231,7 +231,9 @@ struct Match { * * @param value */ - void SetRawValue(const std::string& value) { raw_value_ = value; } + void SetRawValue(const std::string& value) { + raw_value_ = value; + } private: size_t start_ = 0; @@ -249,7 +251,9 @@ struct Match { template friend match_t index::internal::FirstFilteredMatch(const MatcherT&, const DeletedT&); - fsa::automata_t& GetFsa() { return fsa_; } + fsa::automata_t& GetFsa() { + return fsa_; + } }; } /* namespace dictionary */ From f3ae390310778ec033a4256a18405a55160126b7 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Fri, 14 Mar 2025 23:45:05 +0100 Subject: [PATCH 09/28] fix accidental regressions --- keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h | 2 +- python/src/pxds/match.pxd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h index 9e8d178f6..dbbf2777c 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h @@ -383,7 +383,7 @@ class JsonValueStoreReader final : public IValueStoreReader { } if (value_ptr[0] == compression_algorithm) { - return std::string(value_ptr[1], value_size - 1); + return std::string(value_ptr + 1, value_size - 1); } // decompress diff --git a/python/src/pxds/match.pxd b/python/src/pxds/match.pxd index 9a6d36ac4..ab56edeb4 100644 --- a/python/src/pxds/match.pxd +++ b/python/src/pxds/match.pxd @@ -19,7 +19,7 @@ cdef extern from "keyvi/dictionary/match.h" namespace "keyvi::dictionary": void SetMatchedString (libcpp_utf8_string matched_string) # wrap-ignore PyObject* GetAttributePy(libcpp_utf8_string) except + nogil # wrap-ignore libcpp_utf8_output_string GetValueAsString() except + # wrap-as:value_as_string - libcpp_string GetRawValueAsString() except + # wrap-as:dumps + libcpp_string GetRawValueAsString() except + # wrap-as:raw_value_as_string libcpp_string GetMsgPackedValueAsString() except + # wrap-as:msgpacked_value_as_string void SetRawValue(libcpp_utf8_string) except + # wrap-ignore void SetAttribute(libcpp_utf8_string, libcpp_utf8_string) except + # wrap-ignore From 2ccb4d47dce0b93ff7b0a327d0dff8d0ca730c78 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Fri, 14 Mar 2025 23:59:40 +0100 Subject: [PATCH 10/28] add test coverage for msgpacked_value_as_string() --- python/tests/match_object_test.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/tests/match_object_test.py b/python/tests/match_object_test.py index 9c907a2ac..49daa7b8b 100644 --- a/python/tests/match_object_test.py +++ b/python/tests/match_object_test.py @@ -2,6 +2,7 @@ # Usage: py.test tests import keyvi +import msgpack from test_tools import tmp_dictionary import warnings @@ -36,6 +37,7 @@ def test_raw_serialization(): d = m.dumps() m2 = keyvi.Match.loads(d) assert m2.value_as_string() == '{"a":2}' + assert msgpack.loads(m.msgpacked_value_as_string()) == {"a": 2} with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") assert m.GetValueAsString() == '{"a":2}' @@ -130,6 +132,7 @@ def test_get_value(): assert m.value == {"a": 2} m = d["abd"] assert m.value == {"a": 3} + assert msgpack.loads(m.msgpacked_value_as_string()) == {"a": 3} def test_get_value_int(): @@ -141,6 +144,7 @@ def test_get_value_int(): assert m.value == 42 m = d["abd"] assert m.value == 21 + assert msgpack.loads(m.msgpacked_value_as_string()) == 21 def test_get_value_key_only(): @@ -152,6 +156,7 @@ def test_get_value_key_only(): assert m.value == '' m = d["abd"] assert m.value == '' + assert msgpack.loads(m.msgpacked_value_as_string()) == '' def test_get_value_string(): @@ -163,6 +168,7 @@ def test_get_value_string(): assert m.value == "aaaaa" m = d["abd"] assert m.value == "bbbbb" + assert msgpack.loads(m.msgpacked_value_as_string()) == "bbbbb" def test_matched_string(): From e87efc0f2dc080f0fa8059eff849e7a09d57d3da Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Sat, 15 Mar 2025 19:12:48 +0100 Subject: [PATCH 11/28] expose compression algorithm in python extension and add tests --- python/src/pxds/match.pxd | 2 + python/src/py/keyvi/__init__.py | 2 +- python/tests/match_object_test.py | 61 ++++++++++++++++++++++++------- 3 files changed, 51 insertions(+), 14 deletions(-) diff --git a/python/src/pxds/match.pxd b/python/src/pxds/match.pxd index ab56edeb4..62f47eb8b 100644 --- a/python/src/pxds/match.pxd +++ b/python/src/pxds/match.pxd @@ -4,6 +4,7 @@ from libcpp.string cimport string as libcpp_utf8_string from libcpp.string cimport string as libcpp_utf8_output_string from libcpp cimport bool from cpython.ref cimport PyObject +from compression cimport CompressionAlgorithm cdef extern from "keyvi/dictionary/match.h" namespace "keyvi::dictionary": cdef cppclass Match: @@ -21,6 +22,7 @@ cdef extern from "keyvi/dictionary/match.h" namespace "keyvi::dictionary": libcpp_utf8_output_string GetValueAsString() except + # wrap-as:value_as_string libcpp_string GetRawValueAsString() except + # wrap-as:raw_value_as_string libcpp_string GetMsgPackedValueAsString() except + # wrap-as:msgpacked_value_as_string + libcpp_string GetMsgPackedValueAsString(CompressionAlgorithm) except + # wrap-as:msgpacked_value_as_string void SetRawValue(libcpp_utf8_string) except + # wrap-ignore void SetAttribute(libcpp_utf8_string, libcpp_utf8_string) except + # wrap-ignore void SetAttribute(libcpp_utf8_string, float) except + # wrap-ignore diff --git a/python/src/py/keyvi/__init__.py b/python/src/py/keyvi/__init__.py index 647e52b97..352602f7e 100644 --- a/python/src/py/keyvi/__init__.py +++ b/python/src/py/keyvi/__init__.py @@ -20,4 +20,4 @@ from keyvi._version import __version__ # global keyvi concepts -from keyvi._core import MatchIterator, Match, loading_strategy_types +from keyvi._core import MatchIterator, Match, loading_strategy_types, CompressionAlgorithm diff --git a/python/tests/match_object_test.py b/python/tests/match_object_test.py index 49daa7b8b..9ed012db5 100644 --- a/python/tests/match_object_test.py +++ b/python/tests/match_object_test.py @@ -5,7 +5,7 @@ import msgpack from test_tools import tmp_dictionary import warnings - +import zlib from keyvi.compiler import ( JsonDictionaryCompiler, @@ -31,7 +31,7 @@ def test_raw_serialization(): c = JsonDictionaryCompiler({"memory_limit_mb": "10"}) c.add("abc", '{"a" : 2}') c.add("abd", '{"a" : 3}') - with tmp_dictionary(c, 'match_object_json.kv') as d: + with tmp_dictionary(c, "match_object_json.kv") as d: m = d["abc"] assert m.value_as_string() == '{"a":2}' d = m.dumps() @@ -65,8 +65,8 @@ def test_unicode_attributes(): def test_bytes_attributes(): m = keyvi.Match() - bytes_key = bytes(u"äöü".encode('utf-8')) - bytes_value = bytes(u"äöüöäü".encode('utf-8')) + bytes_key = bytes("äöü".encode("utf-8")) + bytes_value = bytes("äöüöäü".encode("utf-8")) m[bytes_key] = 22 assert m[bytes_key] == 22 m["k2"] = bytes_value @@ -75,14 +75,14 @@ def test_bytes_attributes(): def test_double_attributes(): m = keyvi.Match() - bytes_key = bytes("abc".encode('utf-8')) + bytes_key = bytes("abc".encode("utf-8")) m[bytes_key] = 42.0 assert m[bytes_key] == 42.0 def test_boolean_attributes(): m = keyvi.Match() - bytes_key = bytes("def".encode('utf-8')) + bytes_key = bytes("def".encode("utf-8")) m[bytes_key] = True assert m[bytes_key] == True @@ -127,48 +127,83 @@ def test_get_value(): c = JsonDictionaryCompiler({"memory_limit_mb": "10"}) c.add("abc", '{"a" : 2}') c.add("abd", '{"a" : 3}') - with tmp_dictionary(c, 'match_object_json.kv') as d: + with tmp_dictionary(c, "match_object_json.kv") as d: m = d["abc"] assert m.value == {"a": 2} m = d["abd"] assert m.value == {"a": 3} assert msgpack.loads(m.msgpacked_value_as_string()) == {"a": 3} + assert msgpack.loads( + zlib.decompress( + m.msgpacked_value_as_string(keyvi.CompressionAlgorithm.ZLIB_COMPRESSION) + ) + ) == {"a": 3} def test_get_value_int(): c = CompletionDictionaryCompiler({"memory_limit_mb": "10"}) c.add("abc", 42) c.add("abd", 21) - with tmp_dictionary(c, 'match_object_int.kv') as d: + with tmp_dictionary(c, "match_object_int.kv") as d: m = d["abc"] assert m.value == 42 m = d["abd"] assert m.value == 21 assert msgpack.loads(m.msgpacked_value_as_string()) == 21 + assert ( + msgpack.loads( + zlib.decompress( + m.msgpacked_value_as_string( + keyvi.CompressionAlgorithm.ZLIB_COMPRESSION + ) + ) + ) + == 21 + ) def test_get_value_key_only(): c = KeyOnlyDictionaryCompiler({"memory_limit_mb": "10"}) c.add("abc") c.add("abd") - with tmp_dictionary(c, 'match_object_key_only.kv') as d: + with tmp_dictionary(c, "match_object_key_only.kv") as d: m = d["abc"] - assert m.value == '' + assert m.value == "" m = d["abd"] - assert m.value == '' - assert msgpack.loads(m.msgpacked_value_as_string()) == '' + assert m.value == "" + assert msgpack.loads(m.msgpacked_value_as_string()) == "" + assert ( + msgpack.loads( + zlib.decompress( + m.msgpacked_value_as_string( + keyvi.CompressionAlgorithm.ZLIB_COMPRESSION + ) + ) + ) + == "" + ) def test_get_value_string(): c = StringDictionaryCompiler({"memory_limit_mb": "10"}) c.add("abc", "aaaaa") c.add("abd", "bbbbb") - with tmp_dictionary(c, 'match_object_string.kv') as d: + with tmp_dictionary(c, "match_object_string.kv") as d: m = d["abc"] assert m.value == "aaaaa" m = d["abd"] assert m.value == "bbbbb" assert msgpack.loads(m.msgpacked_value_as_string()) == "bbbbb" + assert ( + msgpack.loads( + zlib.decompress( + m.msgpacked_value_as_string( + keyvi.CompressionAlgorithm.ZLIB_COMPRESSION + ) + ) + ) + == "bbbbb" + ) def test_matched_string(): From d014ff13e5df221c5ec02331d98556f9f9f13fee Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Sat, 15 Mar 2025 19:16:43 +0100 Subject: [PATCH 12/28] add missing file --- python/src/pxds/compression.pxd | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 python/src/pxds/compression.pxd diff --git a/python/src/pxds/compression.pxd b/python/src/pxds/compression.pxd new file mode 100644 index 000000000..5935ef726 --- /dev/null +++ b/python/src/pxds/compression.pxd @@ -0,0 +1,5 @@ +cdef extern from "keyvi/compression/compression_strategy.h" namespace "keyvi::compression": + ctypedef enum CompressionAlgorithm: + NO_COMPRESSION, + ZLIB_COMPRESSION, + SNAPPY_COMPRESSION From 8fff4174dd5c951df3377104fe242b759e69420f Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Sun, 16 Mar 2025 22:52:03 +0100 Subject: [PATCH 13/28] move CompressionAlgorithm into separate file --- .../keyvi/compression/compression_algorithm.h | 33 +++++++++++++++++++ .../keyvi/compression/compression_selector.h | 1 + .../keyvi/compression/compression_strategy.h | 7 +--- 3 files changed, 35 insertions(+), 6 deletions(-) create mode 100644 keyvi/include/keyvi/compression/compression_algorithm.h diff --git a/keyvi/include/keyvi/compression/compression_algorithm.h b/keyvi/include/keyvi/compression/compression_algorithm.h new file mode 100644 index 000000000..4a347ed14 --- /dev/null +++ b/keyvi/include/keyvi/compression/compression_algorithm.h @@ -0,0 +1,33 @@ +/* * keyvi - A key value store. + * + * Copyright 2025 Hendrik Muhs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef KEYVI_COMPRESSION_COMPRESSION_ALGORITHM_H_ +#define KEYVI_COMPRESSION_COMPRESSION_ALGORITHM_H_ + +namespace keyvi { +namespace compression { + +enum CompressionAlgorithm { + NO_COMPRESSION = 0, + ZLIB_COMPRESSION = 1, + SNAPPY_COMPRESSION = 2, +}; + +} /* namespace compression */ +} /* namespace keyvi */ + +#endif // KEYVI_COMPRESSION_COMPRESSION_ALGORITHM_H_ diff --git a/keyvi/include/keyvi/compression/compression_selector.h b/keyvi/include/keyvi/compression/compression_selector.h index 6aa0c6269..44bf09279 100644 --- a/keyvi/include/keyvi/compression/compression_selector.h +++ b/keyvi/include/keyvi/compression/compression_selector.h @@ -31,6 +31,7 @@ #include #include +#include "keyvi/compression/compression_algorithm.h" #include "keyvi/compression/compression_strategy.h" #include "keyvi/compression/snappy_compression_strategy.h" #include "keyvi/compression/zlib_compression_strategy.h" diff --git a/keyvi/include/keyvi/compression/compression_strategy.h b/keyvi/include/keyvi/compression/compression_strategy.h index 7cf5c225b..2e9f2a8e0 100644 --- a/keyvi/include/keyvi/compression/compression_strategy.h +++ b/keyvi/include/keyvi/compression/compression_strategy.h @@ -30,17 +30,12 @@ #include #include +#include "keyvi/compression/compression_algorithm.h" #include "keyvi/dictionary/fsa/internal/constants.h" namespace keyvi { namespace compression { - -enum CompressionAlgorithm { - NO_COMPRESSION = 0, - ZLIB_COMPRESSION = 1, - SNAPPY_COMPRESSION = 2, ZSTD_COMPRESSION = 3, -}; // buffer type which is realloc-able typedef std::vector buffer_t; From fdff3102819e8e044c350ec049ae945cb1c67835 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Fri, 28 Mar 2025 09:43:23 +0100 Subject: [PATCH 14/28] add rust bindings --- keyvi/bin/keyvi_c/c_api.cpp | 23 +++++++++++++++++++++++ keyvi/include/keyvi/c_api/c_api.h | 5 +++++ rust/Cargo.toml | 1 + rust/build.rs | 3 +++ rust/src/keyvi_match.rs | 14 ++++++++++++++ rust/src/lib.rs | 2 ++ rust/tests/tests.rs | 16 ++++++++++++++++ 7 files changed, 64 insertions(+) diff --git a/keyvi/bin/keyvi_c/c_api.cpp b/keyvi/bin/keyvi_c/c_api.cpp index 87fb0ee8e..4eece67dd 100644 --- a/keyvi/bin/keyvi_c/c_api.cpp +++ b/keyvi/bin/keyvi_c/c_api.cpp @@ -182,6 +182,29 @@ keyvi_bytes keyvi_match_get_msgpacked_value(const struct keyvi_match* match) { return keyvi_bytes{data_size, static_cast(data_ptr)}; } +keyvi_bytes keyvi_match_get_msgpacked_value_compressed(const struct keyvi_match* match, + keyvi::compression::CompressionAlgorithm compression) { + const keyvi_bytes empty_keyvi_bytes{0, nullptr}; + + if (!match->obj_) { + return empty_keyvi_bytes; + } + + const std::string compressed_value = match->obj_->GetMsgPackedValueAsString(compression); + + const size_t data_size = compressed_value.size(); + if (0 == data_size) { + return empty_keyvi_bytes; + } + auto* data_ptr = malloc(data_size); + if (nullptr == data_ptr) { + return empty_keyvi_bytes; + } + memcpy(data_ptr, compressed_value.c_str(), data_size); + + return keyvi_bytes{data_size, static_cast(data_ptr)}; +} + char* keyvi_match_get_matched_string(const keyvi_match* match) { return std_2_c_string(match->obj_ ? match->obj_->GetMatchedString() : ""); } diff --git a/keyvi/include/keyvi/c_api/c_api.h b/keyvi/include/keyvi/c_api/c_api.h index 32cce1724..4200f32bb 100644 --- a/keyvi/include/keyvi/c_api/c_api.h +++ b/keyvi/include/keyvi/c_api/c_api.h @@ -32,6 +32,8 @@ extern "C" { #include #include +#include "keyvi/compression/compression_algorithm.h" + struct keyvi_dictionary; struct keyvi_match; struct keyvi_match_iterator; @@ -92,6 +94,9 @@ char* keyvi_match_get_value_as_string(const struct keyvi_match*); keyvi_bytes keyvi_match_get_msgpacked_value(const struct keyvi_match*); +keyvi_bytes keyvi_match_get_msgpacked_value_compressed(const struct keyvi_match*, + keyvi::compression::CompressionAlgorithm); + char* keyvi_match_get_matched_string(const struct keyvi_match*); ////////////////////// diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 125fb8cb3..295dfc8a5 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -19,3 +19,4 @@ serde_json = ">=1.0" [dev-dependencies] rayon = "0.9" rand = ">=0.4" +snap = "1.1" diff --git a/rust/build.rs b/rust/build.rs index 07139f9eb..90202dcf9 100644 --- a/rust/build.rs +++ b/rust/build.rs @@ -49,8 +49,10 @@ fn main() { .header("keyvi_core/keyvi/include/keyvi/c_api/c_api.h") .clang_arg("-x") .clang_arg("c++") + .clang_arg("-Ikeyvi_core/keyvi/include") .enable_cxx_namespaces() .layout_tests(true) + .rustified_enum("keyvi::compression::CompressionAlgorithm") .allowlist_function("keyvi_bytes_destroy") .allowlist_function("keyvi_string_destroy") .allowlist_function("keyvi_create_dictionary") @@ -65,6 +67,7 @@ fn main() { .allowlist_function("keyvi_match_destroy") .allowlist_function("keyvi_match_get_matched_string") .allowlist_function("keyvi_match_get_msgpacked_value") + .allowlist_function("keyvi_match_get_msgpacked_value_compressed") .allowlist_function("keyvi_match_get_score") .allowlist_function("keyvi_match_get_value_as_string") .allowlist_function("keyvi_match_is_empty") diff --git a/rust/src/keyvi_match.rs b/rust/src/keyvi_match.rs index dcf759644..802889f47 100644 --- a/rust/src/keyvi_match.rs +++ b/rust/src/keyvi_match.rs @@ -73,6 +73,20 @@ impl KeyviMatch { msgpacked_value } + pub fn get_msgpacked_value_compressed(&self, compression_algorithm: root::keyvi::compression::CompressionAlgorithm) -> Vec { + let kv_bytes = unsafe { root::keyvi_match_get_msgpacked_value_compressed(self.match_ptr_, compression_algorithm) }; + let msgpacked_value = if kv_bytes.data_size == 0 { + Vec::new() + } else { + unsafe { + slice::from_raw_parts(kv_bytes.data_ptr, kv_bytes.data_size as usize).to_vec() + } + }; + unsafe { root::keyvi_bytes_destroy(kv_bytes) }; + + msgpacked_value + } + pub fn is_empty(&self) -> bool { unsafe { root::keyvi_match_is_empty(self.match_ptr_) } } diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 8790adc31..3bc4b9c7c 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -36,3 +36,5 @@ pub mod dictionary; pub mod keyvi_match; pub mod keyvi_match_iterator; pub mod keyvi_string; + +pub type Compression = bindings::root::keyvi::compression::CompressionAlgorithm; \ No newline at end of file diff --git a/rust/tests/tests.rs b/rust/tests/tests.rs index 99af74578..582d8b538 100644 --- a/rust/tests/tests.rs +++ b/rust/tests/tests.rs @@ -1,6 +1,7 @@ extern crate rand; extern crate rayon; extern crate serde_json; +extern crate snap; extern crate keyvi; @@ -10,6 +11,7 @@ mod tests { use rand::Rng; use rayon::prelude::*; use serde_json::Value; + use snap::raw::Decoder; use keyvi::dictionary; @@ -85,6 +87,20 @@ mod tests { assert!(m.get_value_as_string().is_empty()); } + #[test] + fn match_msgpacked_value_compressed_array() { + let m = dictionary::Dictionary::new("test_data/test.kv") + .unwrap() + .get("a"); + + assert_eq!(m.get_msgpacked_value_compressed(keyvi::Compression::NO_COMPRESSION), vec![146, 12, 13]); + + let mut snap_decoder = Decoder::new(); + let value_compressed = m.get_msgpacked_value_compressed(keyvi::Compression::SNAPPY_COMPRESSION); + let value_uncompressed = snap_decoder.decompress_vec(&value_compressed); + assert_eq!(value_uncompressed.unwrap(), vec![146, 12, 13]); + } + #[test] fn match_value() { let d = dictionary::Dictionary::new("test_data/test.kv").unwrap(); From 2d2c4780cc02ac9db87ef262036ff7f7d7e1fe9e Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Fri, 28 Mar 2025 09:46:10 +0100 Subject: [PATCH 15/28] cargo fmt --- rust/src/keyvi_match.rs | 9 +++++++-- rust/src/lib.rs | 2 +- rust/tests/tests.rs | 8 ++++++-- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/rust/src/keyvi_match.rs b/rust/src/keyvi_match.rs index 802889f47..f090ab9a7 100644 --- a/rust/src/keyvi_match.rs +++ b/rust/src/keyvi_match.rs @@ -73,8 +73,13 @@ impl KeyviMatch { msgpacked_value } - pub fn get_msgpacked_value_compressed(&self, compression_algorithm: root::keyvi::compression::CompressionAlgorithm) -> Vec { - let kv_bytes = unsafe { root::keyvi_match_get_msgpacked_value_compressed(self.match_ptr_, compression_algorithm) }; + pub fn get_msgpacked_value_compressed( + &self, + compression_algorithm: root::keyvi::compression::CompressionAlgorithm, + ) -> Vec { + let kv_bytes = unsafe { + root::keyvi_match_get_msgpacked_value_compressed(self.match_ptr_, compression_algorithm) + }; let msgpacked_value = if kv_bytes.data_size == 0 { Vec::new() } else { diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 3bc4b9c7c..976c8b78f 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -37,4 +37,4 @@ pub mod keyvi_match; pub mod keyvi_match_iterator; pub mod keyvi_string; -pub type Compression = bindings::root::keyvi::compression::CompressionAlgorithm; \ No newline at end of file +pub type Compression = bindings::root::keyvi::compression::CompressionAlgorithm; diff --git a/rust/tests/tests.rs b/rust/tests/tests.rs index 582d8b538..b53d6777d 100644 --- a/rust/tests/tests.rs +++ b/rust/tests/tests.rs @@ -93,10 +93,14 @@ mod tests { .unwrap() .get("a"); - assert_eq!(m.get_msgpacked_value_compressed(keyvi::Compression::NO_COMPRESSION), vec![146, 12, 13]); + assert_eq!( + m.get_msgpacked_value_compressed(keyvi::Compression::NO_COMPRESSION), + vec![146, 12, 13] + ); let mut snap_decoder = Decoder::new(); - let value_compressed = m.get_msgpacked_value_compressed(keyvi::Compression::SNAPPY_COMPRESSION); + let value_compressed = + m.get_msgpacked_value_compressed(keyvi::Compression::SNAPPY_COMPRESSION); let value_uncompressed = snap_decoder.decompress_vec(&value_compressed); assert_eq!(value_uncompressed.unwrap(), vec![146, 12, 13]); } From f90462cd6e4901916b3284a09b893924e32b783d Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Sun, 30 Mar 2025 20:41:36 +0200 Subject: [PATCH 16/28] fix merge --- keyvi/include/keyvi/compression/compression_algorithm.h | 1 + keyvi/include/keyvi/compression/compression_selector.h | 2 ++ keyvi/include/keyvi/compression/compression_strategy.h | 3 +-- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/keyvi/include/keyvi/compression/compression_algorithm.h b/keyvi/include/keyvi/compression/compression_algorithm.h index 4a347ed14..2a3186df9 100644 --- a/keyvi/include/keyvi/compression/compression_algorithm.h +++ b/keyvi/include/keyvi/compression/compression_algorithm.h @@ -25,6 +25,7 @@ enum CompressionAlgorithm { NO_COMPRESSION = 0, ZLIB_COMPRESSION = 1, SNAPPY_COMPRESSION = 2, + ZSTD_COMPRESSION = 3, }; } /* namespace compression */ diff --git a/keyvi/include/keyvi/compression/compression_selector.h b/keyvi/include/keyvi/compression/compression_selector.h index 44bf09279..c1e0d50aa 100644 --- a/keyvi/include/keyvi/compression/compression_selector.h +++ b/keyvi/include/keyvi/compression/compression_selector.h @@ -99,6 +99,8 @@ inline compression_strategy_t compression_strategy_by_code(const CompressionAlgo return std::make_unique(); case SNAPPY_COMPRESSION: return std::make_unique(); + case ZSTD_COMPRESSION: + return std::make_unique(); default: throw std::invalid_argument("Invalid compression algorithm " + boost::lexical_cast(static_cast(algorithm))); diff --git a/keyvi/include/keyvi/compression/compression_strategy.h b/keyvi/include/keyvi/compression/compression_strategy.h index 2e9f2a8e0..6205d0ddf 100644 --- a/keyvi/include/keyvi/compression/compression_strategy.h +++ b/keyvi/include/keyvi/compression/compression_strategy.h @@ -35,10 +35,9 @@ namespace keyvi { namespace compression { - ZSTD_COMPRESSION = 3, // buffer type which is realloc-able -typedef std::vector buffer_t; +using buffer_t = std::vector; /** * The base class of every compression strategy. From 534ef25a0a1732d0ac1e3e6f23bac195be83d2b0 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Mon, 31 Mar 2025 23:46:09 +0200 Subject: [PATCH 17/28] remove generic GetMsgPackedValueAsString --- .../fsa/internal/float_vector_value_store.h | 31 +++++++++++++++++++ .../internal/int_inner_weights_value_store.h | 19 ++++++++++++ .../dictionary/fsa/internal/int_value_store.h | 19 ++++++++++++ .../dictionary/fsa/internal/ivalue_store.h | 18 ++--------- .../fsa/internal/null_value_store.h | 15 +++++++++ .../fsa/internal/string_value_store.h | 22 +++++++++++++ keyvi/include/keyvi/util/msgpack_util.h | 11 +++++++ 7 files changed, 119 insertions(+), 16 deletions(-) diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/float_vector_value_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/float_vector_value_store.h index f8d2b0d15..857b3764d 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/float_vector_value_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/float_vector_value_store.h @@ -391,6 +391,37 @@ class FloatVectorValueStoreReader final : public IValueStoreReader { return keyvi::util::FloatVectorAsString(keyvi::util::DecodeFloatVector(packed_string), ", "); } + std::string GetMsgPackedValueAsString(uint64_t fsa_value, + const compression::CompressionAlgorithm compression_algorithm = + compression::CompressionAlgorithm::NO_COMPRESSION) const { + + size_t value_size; + const char* value_ptr = keyvi::util::decodeVarIntString(strings_ + fsa_value, &value_size); + + if (value_size == 0) { + return std::string(); + } + + if (value_ptr[0] == compression_algorithm) { + return std::string(value_ptr + 1, value_size - 1); + } + + // decompress + const compression::decompress_func_t decompressor = + compression::decompressor_by_code(static_cast(value_ptr[0])); + std::string msgpacked_value = decompressor(std::string(value_ptr, value_size)); + + if (compression_algorithm == compression::CompressionAlgorithm::NO_COMPRESSION) { + return msgpacked_value; + } + + // compress + const compression::compression_strategy_t compressor = + compression::compression_strategy_by_code(compression_algorithm); + + return compressor->CompressWithoutHeader(msgpacked_value); + } + void CheckCompatibility(const IValueStoreReader& other) override { if (other.GetValueStoreType() != GetValueStoreType()) { throw std::invalid_argument("Dictionaries must have the same value store type"); diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/int_inner_weights_value_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/int_inner_weights_value_store.h index 5497c5868..53578836e 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/int_inner_weights_value_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/int_inner_weights_value_store.h @@ -28,9 +28,11 @@ #include #include +#include "keyvi/compression/compression_selector.h" #include "keyvi/dictionary/fsa/internal/constants.h" #include "keyvi/dictionary/fsa/internal/ivalue_store.h" #include "keyvi/dictionary/fsa/internal/value_store_types.h" +#include "keyvi/util/msgpack_util.h" // #define ENABLE_TRACING #include "keyvi/dictionary/util/trace.h" @@ -111,6 +113,23 @@ class IntInnerWeightsValueStoreReader final : public IValueStoreReader { std::string GetValueAsString(uint64_t fsa_value) const override { return std::to_string(fsa_value); } + std::string GetRawValueAsString(uint64_t fsa_value) const override { + // TODO: replace with std::format once we have C++20 + return compression::compression_strategy_by_code(compression::CompressionAlgorithm::NO_COMPRESSION) + ->Compress(keyvi::util::ValueToMsgPack(fsa_value)); + } + + std::string GetMsgPackedValueAsString(uint64_t fsa_value, + const compression::CompressionAlgorithm compression_algorithm = + compression::CompressionAlgorithm::NO_COMPRESSION) const override { + if (compression_algorithm == compression::CompressionAlgorithm::NO_COMPRESSION) { + return keyvi::util::ValueToMsgPack(fsa_value); + } + + return compression::compression_strategy_by_code(compression_algorithm) + ->CompressWithoutHeader(keyvi::util::ValueToMsgPack(fsa_value)); + } + uint32_t GetWeight(uint64_t fsa_value) const override { return static_cast(fsa_value); } }; diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/int_value_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/int_value_store.h index ea3d48fc2..1a36a891b 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/int_value_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/int_value_store.h @@ -28,9 +28,11 @@ #include #include +#include "keyvi/compression/compression_selector.h" #include "keyvi/dictionary/fsa/internal/constants.h" #include "keyvi/dictionary/fsa/internal/ivalue_store.h" #include "keyvi/dictionary/fsa/internal/value_store_types.h" +#include "keyvi/util/msgpack_util.h" // #define ENABLE_TRACING #include "keyvi/dictionary/util/trace.h" @@ -108,6 +110,23 @@ class IntValueStoreReader final : public IValueStoreReader { } std::string GetValueAsString(uint64_t fsa_value) const override { return std::to_string(fsa_value); } + + std::string GetRawValueAsString(uint64_t fsa_value) const override { + // TODO: replace with std::format once we have C++20 + return compression::compression_strategy_by_code(compression::CompressionAlgorithm::NO_COMPRESSION) + ->Compress(keyvi::util::ValueToMsgPack(fsa_value)); + } + + std::string GetMsgPackedValueAsString(uint64_t fsa_value, + const compression::CompressionAlgorithm compression_algorithm = + compression::CompressionAlgorithm::NO_COMPRESSION) const override { + if (compression_algorithm == compression::CompressionAlgorithm::NO_COMPRESSION) { + return keyvi::util::ValueToMsgPack(fsa_value); + } + + return compression::compression_strategy_by_code(compression_algorithm) + ->CompressWithoutHeader(keyvi::util::ValueToMsgPack(fsa_value)); + } }; template <> diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/ivalue_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/ivalue_store.h index baf7de73b..912a8aef2 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/ivalue_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/ivalue_store.h @@ -117,9 +117,7 @@ class IValueStoreReader { * @param fsa_value * @return the value as binary encoded string */ - virtual std::string GetRawValueAsString(uint64_t fsa_value) const { - return keyvi::util::EncodeJsonValue(GetValueAsString(fsa_value)); - } + virtual std::string GetRawValueAsString(uint64_t fsa_value) const = 0; /** * Get Value as msgpack string @@ -131,19 +129,7 @@ class IValueStoreReader { */ virtual std::string GetMsgPackedValueAsString(uint64_t fsa_value, const compression::CompressionAlgorithm compression_algorithm = - compression::CompressionAlgorithm::NO_COMPRESSION) const { - const std::string msgpacked_value = keyvi::util::JsonStringToMsgPack(GetValueAsString(fsa_value)); - - if (compression_algorithm == compression::CompressionAlgorithm::NO_COMPRESSION) { - return msgpacked_value; - } - - // compress the value - const compression::compression_strategy_t compressor = - compression::compression_strategy_by_code(compression_algorithm); - - return compressor->CompressWithoutHeader(msgpacked_value); - } + compression::CompressionAlgorithm::NO_COMPRESSION) const = 0; /** * Get Value as string (for dumping or communication) diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/null_value_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/null_value_store.h index 7dc1d730e..322291130 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/null_value_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/null_value_store.h @@ -100,6 +100,21 @@ class NullValueStoreReader final : public IValueStoreReader { attributes_t GetValueAsAttributeVector(uint64_t fsa_value) const override { return attributes_t(); } std::string GetValueAsString(uint64_t fsa_value) const override { return ""; } + + std::string GetRawValueAsString(uint64_t fsa_value) const override { + return "\x00\xc0"; + } + + std::string GetMsgPackedValueAsString(uint64_t fsa_value, + const compression::CompressionAlgorithm compression_algorithm = + compression::CompressionAlgorithm::NO_COMPRESSION) const override { + if (compression_algorithm == compression::CompressionAlgorithm::NO_COMPRESSION) { + return "\xc0"; + } + + return compression::compression_strategy_by_code(compression_algorithm) + ->CompressWithoutHeader("\xc0"); + } }; template <> diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/string_value_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/string_value_store.h index 3b40dda08..82a30c4c8 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/string_value_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/string_value_store.h @@ -284,6 +284,28 @@ class StringValueStoreReader final : public IValueStoreReader { std::string GetValueAsString(uint64_t fsa_value) const override { return std::string(strings_ + fsa_value); } + std::string GetRawValueAsString(uint64_t fsa_value) const override { + // TODO: replace with std::format once we have C++20 + return compression::compression_strategy_by_code(compression::CompressionAlgorithm::NO_COMPRESSION) + ->Compress(keyvi::util::ValueToMsgPack(std::string(strings_ + fsa_value))); + } + + std::string GetMsgPackedValueAsString(uint64_t fsa_value, + const compression::CompressionAlgorithm compression_algorithm = + compression::CompressionAlgorithm::NO_COMPRESSION) const override { + std::string msgpacked_value = keyvi::util::ValueToMsgPack(std::string(strings_ + fsa_value)); + + if (compression_algorithm == compression::CompressionAlgorithm::NO_COMPRESSION) { + return msgpacked_value; + } + + // compress + const compression::compression_strategy_t compressor = + compression::compression_strategy_by_code(compression_algorithm); + + return compressor->CompressWithoutHeader(msgpacked_value); + } + private: boost::interprocess::mapped_region* strings_region_; const char* strings_; diff --git a/keyvi/include/keyvi/util/msgpack_util.h b/keyvi/include/keyvi/util/msgpack_util.h index a353f915b..2de6e7390 100644 --- a/keyvi/include/keyvi/util/msgpack_util.h +++ b/keyvi/include/keyvi/util/msgpack_util.h @@ -170,6 +170,17 @@ inline std::string JsonStringToMsgPack(const std::string& raw_value, bool single return std::string(reinterpret_cast(msgpack_buffer.data()), msgpack_buffer.size()); } +template +inline std::string ValueToMsgPack(const T& value) { + msgpack::sbuffer msgpack_buffer; + compression::buffer_t buffer; + + msgpack::packer pk(&msgpack_buffer); + pk.pack(value); + + return std::string(msgpack_buffer.data(), msgpack_buffer.size()); +} + } /* namespace util */ } /* namespace keyvi */ From 80f5dc753e7508a0c6f5cd6316c4848a74ebae4a Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Thu, 3 Apr 2025 08:09:45 +0200 Subject: [PATCH 18/28] fix errors --- .../dictionary/fsa/internal/float_vector_value_store.h | 1 - .../fsa/internal/int_inner_weights_value_store.h | 2 +- .../keyvi/dictionary/fsa/internal/int_value_store.h | 2 +- .../keyvi/dictionary/fsa/internal/string_value_store.h | 2 +- keyvi/include/keyvi/util/msgpack_util.h | 4 +++- python/tests/dictionary/string_dictionary_merger_test.py | 2 +- python/tests/match_object_test.py | 8 ++++---- 7 files changed, 11 insertions(+), 10 deletions(-) diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/float_vector_value_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/float_vector_value_store.h index 857b3764d..a54569e16 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/float_vector_value_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/float_vector_value_store.h @@ -394,7 +394,6 @@ class FloatVectorValueStoreReader final : public IValueStoreReader { std::string GetMsgPackedValueAsString(uint64_t fsa_value, const compression::CompressionAlgorithm compression_algorithm = compression::CompressionAlgorithm::NO_COMPRESSION) const { - size_t value_size; const char* value_ptr = keyvi::util::decodeVarIntString(strings_ + fsa_value, &value_size); diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/int_inner_weights_value_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/int_inner_weights_value_store.h index 53578836e..ece5e2123 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/int_inner_weights_value_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/int_inner_weights_value_store.h @@ -114,7 +114,7 @@ class IntInnerWeightsValueStoreReader final : public IValueStoreReader { std::string GetValueAsString(uint64_t fsa_value) const override { return std::to_string(fsa_value); } std::string GetRawValueAsString(uint64_t fsa_value) const override { - // TODO: replace with std::format once we have C++20 + // TODO(hendrik): replace with std::format once we have C++20 return compression::compression_strategy_by_code(compression::CompressionAlgorithm::NO_COMPRESSION) ->Compress(keyvi::util::ValueToMsgPack(fsa_value)); } diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/int_value_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/int_value_store.h index 1a36a891b..94f5a9c4a 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/int_value_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/int_value_store.h @@ -112,7 +112,7 @@ class IntValueStoreReader final : public IValueStoreReader { std::string GetValueAsString(uint64_t fsa_value) const override { return std::to_string(fsa_value); } std::string GetRawValueAsString(uint64_t fsa_value) const override { - // TODO: replace with std::format once we have C++20 + // TODO(hendrik): replace with std::format once we have C++20 return compression::compression_strategy_by_code(compression::CompressionAlgorithm::NO_COMPRESSION) ->Compress(keyvi::util::ValueToMsgPack(fsa_value)); } diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/string_value_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/string_value_store.h index 82a30c4c8..c6011a671 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/string_value_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/string_value_store.h @@ -285,7 +285,7 @@ class StringValueStoreReader final : public IValueStoreReader { std::string GetValueAsString(uint64_t fsa_value) const override { return std::string(strings_ + fsa_value); } std::string GetRawValueAsString(uint64_t fsa_value) const override { - // TODO: replace with std::format once we have C++20 + // TODO(hendrik): replace with std::format once we have C++20 return compression::compression_strategy_by_code(compression::CompressionAlgorithm::NO_COMPRESSION) ->Compress(keyvi::util::ValueToMsgPack(std::string(strings_ + fsa_value))); } diff --git a/keyvi/include/keyvi/util/msgpack_util.h b/keyvi/include/keyvi/util/msgpack_util.h index 2de6e7390..101a85c02 100644 --- a/keyvi/include/keyvi/util/msgpack_util.h +++ b/keyvi/include/keyvi/util/msgpack_util.h @@ -31,6 +31,9 @@ #include "rapidjson/document.h" #include "rapidjson/writer.h" +// #define ENABLE_TRACING +#include "keyvi/dictionary/util/trace.h" + /** * Utility classes for msgpack. * @@ -173,7 +176,6 @@ inline std::string JsonStringToMsgPack(const std::string& raw_value, bool single template inline std::string ValueToMsgPack(const T& value) { msgpack::sbuffer msgpack_buffer; - compression::buffer_t buffer; msgpack::packer pk(&msgpack_buffer); pk.pack(value); diff --git a/python/tests/dictionary/string_dictionary_merger_test.py b/python/tests/dictionary/string_dictionary_merger_test.py index 5c8da42c1..219da5207 100644 --- a/python/tests/dictionary/string_dictionary_merger_test.py +++ b/python/tests/dictionary/string_dictionary_merger_test.py @@ -46,7 +46,7 @@ def generate_keyvi(key_values, filename): dictionary_compiler = StringDictionaryCompiler({"memory_limit_mb": "10"}) for key, value in key_values.items(): - dictionary_compiler.add(key, json.dumps(value)) + dictionary_compiler.add(key, value) dictionary_compiler.compile() dictionary_compiler.write_to_file(filename) diff --git a/python/tests/match_object_test.py b/python/tests/match_object_test.py index 9ed012db5..95e053972 100644 --- a/python/tests/match_object_test.py +++ b/python/tests/match_object_test.py @@ -168,10 +168,10 @@ def test_get_value_key_only(): c.add("abd") with tmp_dictionary(c, "match_object_key_only.kv") as d: m = d["abc"] - assert m.value == "" + assert m.value is None m = d["abd"] - assert m.value == "" - assert msgpack.loads(m.msgpacked_value_as_string()) == "" + assert m.value is None + assert msgpack.loads(m.msgpacked_value_as_string()) is None assert ( msgpack.loads( zlib.decompress( @@ -180,7 +180,7 @@ def test_get_value_key_only(): ) ) ) - == "" + is None ) From dcc4c6b446b7966f27a63c32a54901d04a07605c Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Thu, 3 Apr 2025 08:14:24 +0200 Subject: [PATCH 19/28] fix format --- .../dictionary/fsa/internal/float_vector_value_store.h | 4 ++-- keyvi/include/keyvi/dictionary/fsa/internal/ivalue_store.h | 2 +- .../keyvi/dictionary/fsa/internal/null_value_store.h | 7 ++----- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/float_vector_value_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/float_vector_value_store.h index a54569e16..7e120db88 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/float_vector_value_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/float_vector_value_store.h @@ -407,7 +407,7 @@ class FloatVectorValueStoreReader final : public IValueStoreReader { // decompress const compression::decompress_func_t decompressor = - compression::decompressor_by_code(static_cast(value_ptr[0])); + compression::decompressor_by_code(static_cast(value_ptr[0])); std::string msgpacked_value = decompressor(std::string(value_ptr, value_size)); if (compression_algorithm == compression::CompressionAlgorithm::NO_COMPRESSION) { @@ -416,7 +416,7 @@ class FloatVectorValueStoreReader final : public IValueStoreReader { // compress const compression::compression_strategy_t compressor = - compression::compression_strategy_by_code(compression_algorithm); + compression::compression_strategy_by_code(compression_algorithm); return compressor->CompressWithoutHeader(msgpacked_value); } diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/ivalue_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/ivalue_store.h index 912a8aef2..909cf91c5 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/ivalue_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/ivalue_store.h @@ -117,7 +117,7 @@ class IValueStoreReader { * @param fsa_value * @return the value as binary encoded string */ - virtual std::string GetRawValueAsString(uint64_t fsa_value) const = 0; + virtual std::string GetRawValueAsString(uint64_t fsa_value) const = 0; /** * Get Value as msgpack string diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/null_value_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/null_value_store.h index 322291130..1523f9eae 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/null_value_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/null_value_store.h @@ -101,9 +101,7 @@ class NullValueStoreReader final : public IValueStoreReader { std::string GetValueAsString(uint64_t fsa_value) const override { return ""; } - std::string GetRawValueAsString(uint64_t fsa_value) const override { - return "\x00\xc0"; - } + std::string GetRawValueAsString(uint64_t fsa_value) const override { return "\x00\xc0"; } std::string GetMsgPackedValueAsString(uint64_t fsa_value, const compression::CompressionAlgorithm compression_algorithm = @@ -112,8 +110,7 @@ class NullValueStoreReader final : public IValueStoreReader { return "\xc0"; } - return compression::compression_strategy_by_code(compression_algorithm) - ->CompressWithoutHeader("\xc0"); + return compression::compression_strategy_by_code(compression_algorithm)->CompressWithoutHeader("\xc0"); } }; From b6f59b49201b4a778fe6f1b0238e242f5f6be134 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Thu, 3 Apr 2025 08:26:21 +0200 Subject: [PATCH 20/28] add missing override --- .../keyvi/dictionary/fsa/internal/float_vector_value_store.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/float_vector_value_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/float_vector_value_store.h index 7e120db88..39f50f61e 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/float_vector_value_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/float_vector_value_store.h @@ -393,7 +393,7 @@ class FloatVectorValueStoreReader final : public IValueStoreReader { std::string GetMsgPackedValueAsString(uint64_t fsa_value, const compression::CompressionAlgorithm compression_algorithm = - compression::CompressionAlgorithm::NO_COMPRESSION) const { + compression::CompressionAlgorithm::NO_COMPRESSION) const override { size_t value_size; const char* value_ptr = keyvi::util::decodeVarIntString(strings_ + fsa_value, &value_size); From a4135545e0fcc44dd4ce4c354a5bacf254b005e9 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Thu, 17 Apr 2025 17:58:30 +0200 Subject: [PATCH 21/28] address review comments, add python tests --- .../dictionary/fsa/internal/null_value_store.h | 2 ++ python/requirements.txt | 2 ++ python/src/pxds/compression.pxd | 5 +++-- python/tests/match_object_test.py | 17 +++++++++++++++++ 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/null_value_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/null_value_store.h index 1523f9eae..17527eba5 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/null_value_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/null_value_store.h @@ -101,11 +101,13 @@ class NullValueStoreReader final : public IValueStoreReader { std::string GetValueAsString(uint64_t fsa_value) const override { return ""; } + // shortcut: `\00` for no compression, `\xc0` for nil/null in msgpack std::string GetRawValueAsString(uint64_t fsa_value) const override { return "\x00\xc0"; } std::string GetMsgPackedValueAsString(uint64_t fsa_value, const compression::CompressionAlgorithm compression_algorithm = compression::CompressionAlgorithm::NO_COMPRESSION) const override { + // `\xc0` == msgpack nil if (compression_algorithm == compression::CompressionAlgorithm::NO_COMPRESSION) { return "\xc0"; } diff --git a/python/requirements.txt b/python/requirements.txt index b2af40ec1..70a21c6d5 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -2,3 +2,5 @@ autowrap>=0.16.0 msgpack>=1.0.0 pytest>=7.1.1 cython>=3.0 +python-snappy>=0.7.3 +zstd>=1.5.6.7 \ No newline at end of file diff --git a/python/src/pxds/compression.pxd b/python/src/pxds/compression.pxd index 5935ef726..78555781a 100644 --- a/python/src/pxds/compression.pxd +++ b/python/src/pxds/compression.pxd @@ -1,5 +1,6 @@ -cdef extern from "keyvi/compression/compression_strategy.h" namespace "keyvi::compression": +cdef extern from "keyvi/compression/compression_algorithm.h" namespace "keyvi::compression": ctypedef enum CompressionAlgorithm: NO_COMPRESSION, ZLIB_COMPRESSION, - SNAPPY_COMPRESSION + SNAPPY_COMPRESSION, + ZSTD_COMPRESSION diff --git a/python/tests/match_object_test.py b/python/tests/match_object_test.py index 95e053972..b3a604d56 100644 --- a/python/tests/match_object_test.py +++ b/python/tests/match_object_test.py @@ -6,6 +6,8 @@ from test_tools import tmp_dictionary import warnings import zlib +import snappy +import zstd from keyvi.compiler import ( JsonDictionaryCompiler, @@ -138,6 +140,21 @@ def test_get_value(): m.msgpacked_value_as_string(keyvi.CompressionAlgorithm.ZLIB_COMPRESSION) ) ) == {"a": 3} + assert msgpack.loads( + snappy.decompress( + m.msgpacked_value_as_string( + keyvi.CompressionAlgorithm.SNAPPY_COMPRESSION + ) + ) + ) == {"a": 3} + assert msgpack.loads( + zstd.decompress( + m.msgpacked_value_as_string(keyvi.CompressionAlgorithm.ZSTD_COMPRESSION) + ) + ) == {"a": 3} + assert msgpack.loads( + m.msgpacked_value_as_string(keyvi.CompressionAlgorithm.NO_COMPRESSION) + ) == {"a": 3} def test_get_value_int(): From 9f23322c11e0f3306e64cdb796fe05fb55e7daab Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Fri, 18 Apr 2025 17:22:43 +0200 Subject: [PATCH 22/28] move requirements into venv --- .github/workflows/python-cibuildwheel.yml | 2 +- python/requirements.txt | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/python-cibuildwheel.yml b/.github/workflows/python-cibuildwheel.yml index 68d1b9f6d..5b3b371d0 100644 --- a/.github/workflows/python-cibuildwheel.yml +++ b/.github/workflows/python-cibuildwheel.yml @@ -102,7 +102,7 @@ jobs: CIBW_BEFORE_BUILD: pip install -r python/requirements.txt # testing - CIBW_TEST_REQUIRES: pytest + CIBW_TEST_REQUIRES: pytest python-snappy zstd CIBW_TEST_COMMAND: > python -m pytest {package}/tests && python -m pytest {package}/integration-tests diff --git a/python/requirements.txt b/python/requirements.txt index 70a21c6d5..b2af40ec1 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -2,5 +2,3 @@ autowrap>=0.16.0 msgpack>=1.0.0 pytest>=7.1.1 cython>=3.0 -python-snappy>=0.7.3 -zstd>=1.5.6.7 \ No newline at end of file From 85b979814dc12a19425d300f39057773d84a9c23 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Fri, 18 Apr 2025 17:34:36 +0200 Subject: [PATCH 23/28] install test packages for pip --- .github/workflows/python-cibuildwheel.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/python-cibuildwheel.yml b/.github/workflows/python-cibuildwheel.yml index 5b3b371d0..416d47911 100644 --- a/.github/workflows/python-cibuildwheel.yml +++ b/.github/workflows/python-cibuildwheel.yml @@ -139,6 +139,7 @@ jobs: python setup.py sdist -d wheelhouse && \ python -m pip uninstall -y autowrap && \ python -m pip install wheelhouse/*.tar.gz -v && \ + python -m pip install python-snappy zstd && \ python -m pytest tests && \ python -m pip uninstall -y keyvi From 454f9813f0a4c7a3b410a6753c1a45633d73d517 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Fri, 18 Apr 2025 17:46:05 +0200 Subject: [PATCH 24/28] pin boost on mac --- .github/workflows/keyvi.yml | 2 +- .github/workflows/python-cibuildwheel.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/keyvi.yml b/.github/workflows/keyvi.yml index 945dc2016..276002cc6 100644 --- a/.github/workflows/keyvi.yml +++ b/.github/workflows/keyvi.yml @@ -33,7 +33,7 @@ jobs: brew update # workaround for https://github.com/actions/setup-python/issues/577 brew list -1 | grep python | while read formula; do brew unlink $formula; brew link --overwrite $formula; done - brew install zlib snappy boost + brew install zlib snappy boost@1.85 - name: checkout from git uses: actions/checkout@v4 diff --git a/.github/workflows/python-cibuildwheel.yml b/.github/workflows/python-cibuildwheel.yml index 416d47911..525972b0e 100644 --- a/.github/workflows/python-cibuildwheel.yml +++ b/.github/workflows/python-cibuildwheel.yml @@ -64,7 +64,7 @@ jobs: run: | brew update && \ brew list -1 | grep python | while read formula; do brew unlink $formula; brew link --overwrite $formula; done && \ - brew install ccache zlib snappy boost + brew install ccache zlib snappy boost@1.85 - name: set mac deployment target X64 if: runner.os == 'macOS' && runner.arch == 'X64' From e2f5529e39753a5530afef8db7ea24120540246b Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Wed, 23 Apr 2025 11:01:04 +0200 Subject: [PATCH 25/28] link boost on mac --- .github/workflows/keyvi.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/keyvi.yml b/.github/workflows/keyvi.yml index 276002cc6..5388a425a 100644 --- a/.github/workflows/keyvi.yml +++ b/.github/workflows/keyvi.yml @@ -34,6 +34,7 @@ jobs: # workaround for https://github.com/actions/setup-python/issues/577 brew list -1 | grep python | while read formula; do brew unlink $formula; brew link --overwrite $formula; done brew install zlib snappy boost@1.85 + brew link boost@1.85 - name: checkout from git uses: actions/checkout@v4 From 87f8a098a3cdb90f9b9256e7f73933322eb19826 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Wed, 23 Apr 2025 11:05:54 +0200 Subject: [PATCH 26/28] link boost also for python workflows --- .github/workflows/python-cibuildwheel.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/python-cibuildwheel.yml b/.github/workflows/python-cibuildwheel.yml index 525972b0e..1699e9641 100644 --- a/.github/workflows/python-cibuildwheel.yml +++ b/.github/workflows/python-cibuildwheel.yml @@ -65,6 +65,7 @@ jobs: brew update && \ brew list -1 | grep python | while read formula; do brew unlink $formula; brew link --overwrite $formula; done && \ brew install ccache zlib snappy boost@1.85 + brew link boost@1.85 - name: set mac deployment target X64 if: runner.os == 'macOS' && runner.arch == 'X64' From ace835dd2fc5d5269aed101ccfa3fd543767f014 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Wed, 23 Apr 2025 11:45:39 +0200 Subject: [PATCH 27/28] temporarily revert #318 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0226ea0b7..452d3f09d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,7 +72,7 @@ else () message(FATAL_ERROR "Can not find Boost") endif () if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") - set(_KEYVI_LINK_LIBRARIES_STATIC "${_KEYVI_LINK_LIBRARIES_STATIC} boost_program_options boost_iostreams boost_filesystem boost_system boost_regex boost_thread") + set(_KEYVI_LINK_LIBRARIES_STATIC "${_KEYVI_LINK_LIBRARIES_STATIC} boost_program_options boost_iostreams boost_filesystem boost_system boost_regex boost_thread-mt") else () set(_KEYVI_LINK_LIBRARIES_DYNAMIC "${_KEYVI_LINK_LIBRARIES_DYNAMIC} boost_program_options boost_iostreams boost_filesystem boost_system boost_regex boost_thread") endif () From fb5243df173766261f245f2f866c5182e62ea31c Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Fri, 25 Apr 2025 22:52:18 +0200 Subject: [PATCH 28/28] add rust tests --- rust/Cargo.toml | 2 ++ rust/tests/tests.rs | 26 ++++++++++++++++++++++---- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 295dfc8a5..991c79d75 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -20,3 +20,5 @@ serde_json = ">=1.0" rayon = "0.9" rand = ">=0.4" snap = "1.1" +zstd = "0.13" +flate2 = "1.0" \ No newline at end of file diff --git a/rust/tests/tests.rs b/rust/tests/tests.rs index b53d6777d..2d0bbd33f 100644 --- a/rust/tests/tests.rs +++ b/rust/tests/tests.rs @@ -1,16 +1,21 @@ +extern crate flate2; extern crate rand; extern crate rayon; extern crate serde_json; extern crate snap; +extern crate zstd; extern crate keyvi; #[cfg(test)] mod tests { + use std::io::Read; + + use flate2::read::ZlibDecoder; use rand; use rand::Rng; use rayon::prelude::*; - use serde_json::Value; + use serde_json::{value, Value}; use snap::raw::Decoder; use keyvi::dictionary; @@ -99,10 +104,23 @@ mod tests { ); let mut snap_decoder = Decoder::new(); - let value_compressed = + let value_compressed_snap = m.get_msgpacked_value_compressed(keyvi::Compression::SNAPPY_COMPRESSION); - let value_uncompressed = snap_decoder.decompress_vec(&value_compressed); - assert_eq!(value_uncompressed.unwrap(), vec![146, 12, 13]); + let value_uncompressed_snap = snap_decoder.decompress_vec(&value_compressed_snap); + assert_eq!(value_uncompressed_snap.unwrap(), vec![146, 12, 13]); + + let value_compressed_zstd = + m.get_msgpacked_value_compressed(keyvi::Compression::ZSTD_COMPRESSION); + let value_uncompressed_zstd: Vec = + zstd::decode_all(value_compressed_zstd.as_slice()).unwrap(); + assert_eq!(value_uncompressed_zstd, vec![146, 12, 13]); + + let value_compressed_zlib = + m.get_msgpacked_value_compressed(keyvi::Compression::ZLIB_COMPRESSION); + let mut zlib_decoder = ZlibDecoder::new(value_compressed_zlib.as_slice()); + let mut value_uncompressed_zlib: Vec = Vec::new(); + let _ = zlib_decoder.read_to_end(&mut value_uncompressed_zlib); + assert_eq!(value_uncompressed_zlib, vec![146, 12, 13]); } #[test]