diff --git a/.github/workflows/keyvi.yml b/.github/workflows/keyvi.yml index 945dc2016..5388a425a 100644 --- a/.github/workflows/keyvi.yml +++ b/.github/workflows/keyvi.yml @@ -33,7 +33,8 @@ jobs: brew update # workaround for https://github.com/actions/setup-python/issues/577 brew list -1 | grep python | while read formula; do brew unlink $formula; brew link --overwrite $formula; done - brew install zlib snappy boost + brew install zlib snappy boost@1.85 + brew link boost@1.85 - name: checkout from git uses: actions/checkout@v4 diff --git a/.github/workflows/python-cibuildwheel.yml b/.github/workflows/python-cibuildwheel.yml index d27db3976..bf8fd288c 100644 --- a/.github/workflows/python-cibuildwheel.yml +++ b/.github/workflows/python-cibuildwheel.yml @@ -64,7 +64,8 @@ jobs: run: | brew update && \ brew list -1 | grep python | while read formula; do brew unlink $formula; brew link --overwrite $formula; done && \ - brew install ccache zlib snappy boost + brew install ccache zlib snappy boost@1.85 + brew link boost@1.85 - name: set mac deployment target X64 if: runner.os == 'macOS' && runner.arch == 'X64' @@ -102,7 +103,7 @@ jobs: CIBW_BEFORE_BUILD: pip install -r python/requirements.txt # testing - CIBW_TEST_REQUIRES: pytest + CIBW_TEST_REQUIRES: pytest python-snappy zstd CIBW_TEST_COMMAND: > python -m pytest {package}/tests && python -m pytest {package}/integration-tests @@ -139,6 +140,7 @@ jobs: python setup.py sdist -d wheelhouse && \ python -m pip uninstall -y autowrap && \ python -m pip install wheelhouse/*.tar.gz -v && \ + python -m pip install python-snappy zstd && \ python -m pytest tests && \ python -m pip uninstall -y keyvi diff --git a/CMakeLists.txt b/CMakeLists.txt index 0226ea0b7..452d3f09d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,7 +72,7 @@ else () message(FATAL_ERROR "Can not find Boost") endif () if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") - set(_KEYVI_LINK_LIBRARIES_STATIC "${_KEYVI_LINK_LIBRARIES_STATIC} boost_program_options boost_iostreams boost_filesystem boost_system boost_regex boost_thread") + set(_KEYVI_LINK_LIBRARIES_STATIC "${_KEYVI_LINK_LIBRARIES_STATIC} boost_program_options boost_iostreams boost_filesystem boost_system boost_regex boost_thread-mt") else () set(_KEYVI_LINK_LIBRARIES_DYNAMIC "${_KEYVI_LINK_LIBRARIES_DYNAMIC} boost_program_options boost_iostreams boost_filesystem boost_system boost_regex boost_thread") endif () diff --git a/keyvi/bin/keyvi_c/c_api.cpp b/keyvi/bin/keyvi_c/c_api.cpp index 87fb0ee8e..4eece67dd 100644 --- a/keyvi/bin/keyvi_c/c_api.cpp +++ b/keyvi/bin/keyvi_c/c_api.cpp @@ -182,6 +182,29 @@ keyvi_bytes keyvi_match_get_msgpacked_value(const struct keyvi_match* match) { return keyvi_bytes{data_size, static_cast(data_ptr)}; } +keyvi_bytes keyvi_match_get_msgpacked_value_compressed(const struct keyvi_match* match, + keyvi::compression::CompressionAlgorithm compression) { + const keyvi_bytes empty_keyvi_bytes{0, nullptr}; + + if (!match->obj_) { + return empty_keyvi_bytes; + } + + const std::string compressed_value = match->obj_->GetMsgPackedValueAsString(compression); + + const size_t data_size = compressed_value.size(); + if (0 == data_size) { + return empty_keyvi_bytes; + } + auto* data_ptr = malloc(data_size); + if (nullptr == data_ptr) { + return empty_keyvi_bytes; + } + memcpy(data_ptr, compressed_value.c_str(), data_size); + + return keyvi_bytes{data_size, static_cast(data_ptr)}; +} + char* keyvi_match_get_matched_string(const keyvi_match* match) { return std_2_c_string(match->obj_ ? match->obj_->GetMatchedString() : ""); } diff --git a/keyvi/include/keyvi/c_api/c_api.h b/keyvi/include/keyvi/c_api/c_api.h index 32cce1724..4200f32bb 100644 --- a/keyvi/include/keyvi/c_api/c_api.h +++ b/keyvi/include/keyvi/c_api/c_api.h @@ -32,6 +32,8 @@ extern "C" { #include #include +#include "keyvi/compression/compression_algorithm.h" + struct keyvi_dictionary; struct keyvi_match; struct keyvi_match_iterator; @@ -92,6 +94,9 @@ char* keyvi_match_get_value_as_string(const struct keyvi_match*); keyvi_bytes keyvi_match_get_msgpacked_value(const struct keyvi_match*); +keyvi_bytes keyvi_match_get_msgpacked_value_compressed(const struct keyvi_match*, + keyvi::compression::CompressionAlgorithm); + char* keyvi_match_get_matched_string(const struct keyvi_match*); ////////////////////// diff --git a/keyvi/include/keyvi/compression/compression_algorithm.h b/keyvi/include/keyvi/compression/compression_algorithm.h new file mode 100644 index 000000000..2a3186df9 --- /dev/null +++ b/keyvi/include/keyvi/compression/compression_algorithm.h @@ -0,0 +1,34 @@ +/* * keyvi - A key value store. + * + * Copyright 2025 Hendrik Muhs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef KEYVI_COMPRESSION_COMPRESSION_ALGORITHM_H_ +#define KEYVI_COMPRESSION_COMPRESSION_ALGORITHM_H_ + +namespace keyvi { +namespace compression { + +enum CompressionAlgorithm { + NO_COMPRESSION = 0, + ZLIB_COMPRESSION = 1, + SNAPPY_COMPRESSION = 2, + ZSTD_COMPRESSION = 3, +}; + +} /* namespace compression */ +} /* namespace keyvi */ + +#endif // KEYVI_COMPRESSION_COMPRESSION_ALGORITHM_H_ diff --git a/keyvi/include/keyvi/compression/compression_selector.h b/keyvi/include/keyvi/compression/compression_selector.h index 5eea0f87e..c1e0d50aa 100644 --- a/keyvi/include/keyvi/compression/compression_selector.h +++ b/keyvi/include/keyvi/compression/compression_selector.h @@ -25,11 +25,13 @@ #ifndef KEYVI_COMPRESSION_COMPRESSION_SELECTOR_H_ #define KEYVI_COMPRESSION_COMPRESSION_SELECTOR_H_ +#include #include #include #include +#include "keyvi/compression/compression_algorithm.h" #include "keyvi/compression/compression_strategy.h" #include "keyvi/compression/snappy_compression_strategy.h" #include "keyvi/compression/zlib_compression_strategy.h" @@ -64,8 +66,8 @@ inline CompressionStrategy* compression_strategy(const std::string& name = "") { typedef std::string (*decompress_func_t)(const std::string&); typedef void (CompressionStrategy::*compress_mem_fn_t)(buffer_t*, const char*, size_t); -inline decompress_func_t decompressor_by_code(const std::string& s) { - switch (s[0]) { +inline decompress_func_t decompressor_by_code(const CompressionAlgorithm algorithm) { + switch (algorithm) { case NO_COMPRESSION: TRACE("unpack uncompressed string"); return RawCompressionStrategy::DoDecompress; @@ -79,8 +81,29 @@ inline decompress_func_t decompressor_by_code(const std::string& s) { TRACE("unpack zstd compressed string"); return ZstdCompressionStrategy::DoDecompress; default: - throw std::invalid_argument("Invalid compression code " + - boost::lexical_cast(static_cast(s[0]))); + throw std::invalid_argument("Invalid compression algorithm " + + boost::lexical_cast(static_cast(algorithm))); + } +} + +inline decompress_func_t decompressor_from_string(const std::string& s) { + return decompressor_by_code(static_cast(s[0])); +} + +/** Returns an instance of a compression strategy by enum. */ +inline compression_strategy_t compression_strategy_by_code(const CompressionAlgorithm algorithm) { + switch (algorithm) { + case NO_COMPRESSION: + return std::make_unique(); + case ZLIB_COMPRESSION: + return std::make_unique(); + case SNAPPY_COMPRESSION: + return std::make_unique(); + case ZSTD_COMPRESSION: + return std::make_unique(); + default: + throw std::invalid_argument("Invalid compression algorithm " + + boost::lexical_cast(static_cast(algorithm))); } } diff --git a/keyvi/include/keyvi/compression/compression_strategy.h b/keyvi/include/keyvi/compression/compression_strategy.h index 721f96d80..6205d0ddf 100644 --- a/keyvi/include/keyvi/compression/compression_strategy.h +++ b/keyvi/include/keyvi/compression/compression_strategy.h @@ -26,23 +26,18 @@ #define KEYVI_COMPRESSION_COMPRESSION_STRATEGY_H_ #include +#include #include #include +#include "keyvi/compression/compression_algorithm.h" #include "keyvi/dictionary/fsa/internal/constants.h" namespace keyvi { namespace compression { -enum CompressionCode { - NO_COMPRESSION = 0, - ZLIB_COMPRESSION = 1, - SNAPPY_COMPRESSION = 2, - ZSTD_COMPRESSION = 3, -}; - // buffer type which is realloc-able -typedef std::vector buffer_t; +using buffer_t = std::vector; /** * The base class of every compression strategy. @@ -64,6 +59,12 @@ struct CompressionStrategy { return std::string(buf.data(), buf.size()); } + inline std::string CompressWithoutHeader(const std::string& raw) { + buffer_t buf; + Compress(&buf, raw.data(), raw.size()); + return std::string(buf.data() + 1, buf.size() - 1); + } + /** * By the time this function is called, the length field added in Compress() * will have been removed. @@ -77,6 +78,8 @@ struct CompressionStrategy { virtual uint64_t GetFileVersionMin() const = 0; }; +using compression_strategy_t = std::unique_ptr; + /** * A compression strategy that does almost nothing; i.e. it only adds * the length field. @@ -90,12 +93,6 @@ struct RawCompressionStrategy final : public CompressionStrategy { std::memcpy(buffer->data() + 1, raw, raw_size); } - static inline std::string DoCompress(const char* raw, size_t raw_size) { - buffer_t buf; - DoCompress(&buf, raw, raw_size); - return std::string(buf.data(), buf.size()); - } - inline std::string Decompress(const std::string& compressed) { return DoDecompress(compressed); } static inline std::string DoDecompress(const std::string& compressed) { return compressed.substr(1); } diff --git a/keyvi/include/keyvi/compression/snappy_compression_strategy.h b/keyvi/include/keyvi/compression/snappy_compression_strategy.h index 39e393a8c..277903946 100644 --- a/keyvi/include/keyvi/compression/snappy_compression_strategy.h +++ b/keyvi/include/keyvi/compression/snappy_compression_strategy.h @@ -47,12 +47,6 @@ struct SnappyCompressionStrategy final : public CompressionStrategy { buffer->resize(output_length + 1); } - static inline std::string DoCompress(const char* raw, size_t raw_size) { - buffer_t buf; - DoCompress(&buf, raw, raw_size); - return std::string(buf.data(), buf.size()); - } - inline std::string Decompress(const std::string& compressed) { return DoDecompress(compressed); } static std::string DoDecompress(const std::string& compressed) { diff --git a/keyvi/include/keyvi/dictionary/fsa/automata.h b/keyvi/include/keyvi/dictionary/fsa/automata.h index 8b531e32d..3041f0f3b 100644 --- a/keyvi/include/keyvi/dictionary/fsa/automata.h +++ b/keyvi/include/keyvi/dictionary/fsa/automata.h @@ -394,6 +394,13 @@ class Automata final { return value_store_reader_->GetRawValueAsString(state_value); } + std::string GetMsgPackedValueAsString(uint64_t state_value, + const compression::CompressionAlgorithm compression_algorithm = + compression::CompressionAlgorithm::NO_COMPRESSION) const { + assert(value_store_reader_); + return value_store_reader_->GetMsgPackedValueAsString(state_value, compression_algorithm); + } + std::string GetStatistics() const { return dictionary_properties_->GetStatistics(); } diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/float_vector_value_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/float_vector_value_store.h index f8d2b0d15..39f50f61e 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/float_vector_value_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/float_vector_value_store.h @@ -391,6 +391,36 @@ class FloatVectorValueStoreReader final : public IValueStoreReader { return keyvi::util::FloatVectorAsString(keyvi::util::DecodeFloatVector(packed_string), ", "); } + std::string GetMsgPackedValueAsString(uint64_t fsa_value, + const compression::CompressionAlgorithm compression_algorithm = + compression::CompressionAlgorithm::NO_COMPRESSION) const override { + size_t value_size; + const char* value_ptr = keyvi::util::decodeVarIntString(strings_ + fsa_value, &value_size); + + if (value_size == 0) { + return std::string(); + } + + if (value_ptr[0] == compression_algorithm) { + return std::string(value_ptr + 1, value_size - 1); + } + + // decompress + const compression::decompress_func_t decompressor = + compression::decompressor_by_code(static_cast(value_ptr[0])); + std::string msgpacked_value = decompressor(std::string(value_ptr, value_size)); + + if (compression_algorithm == compression::CompressionAlgorithm::NO_COMPRESSION) { + return msgpacked_value; + } + + // compress + const compression::compression_strategy_t compressor = + compression::compression_strategy_by_code(compression_algorithm); + + return compressor->CompressWithoutHeader(msgpacked_value); + } + void CheckCompatibility(const IValueStoreReader& other) override { if (other.GetValueStoreType() != GetValueStoreType()) { throw std::invalid_argument("Dictionaries must have the same value store type"); diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/int_inner_weights_value_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/int_inner_weights_value_store.h index 5497c5868..ece5e2123 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/int_inner_weights_value_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/int_inner_weights_value_store.h @@ -28,9 +28,11 @@ #include #include +#include "keyvi/compression/compression_selector.h" #include "keyvi/dictionary/fsa/internal/constants.h" #include "keyvi/dictionary/fsa/internal/ivalue_store.h" #include "keyvi/dictionary/fsa/internal/value_store_types.h" +#include "keyvi/util/msgpack_util.h" // #define ENABLE_TRACING #include "keyvi/dictionary/util/trace.h" @@ -111,6 +113,23 @@ class IntInnerWeightsValueStoreReader final : public IValueStoreReader { std::string GetValueAsString(uint64_t fsa_value) const override { return std::to_string(fsa_value); } + std::string GetRawValueAsString(uint64_t fsa_value) const override { + // TODO(hendrik): replace with std::format once we have C++20 + return compression::compression_strategy_by_code(compression::CompressionAlgorithm::NO_COMPRESSION) + ->Compress(keyvi::util::ValueToMsgPack(fsa_value)); + } + + std::string GetMsgPackedValueAsString(uint64_t fsa_value, + const compression::CompressionAlgorithm compression_algorithm = + compression::CompressionAlgorithm::NO_COMPRESSION) const override { + if (compression_algorithm == compression::CompressionAlgorithm::NO_COMPRESSION) { + return keyvi::util::ValueToMsgPack(fsa_value); + } + + return compression::compression_strategy_by_code(compression_algorithm) + ->CompressWithoutHeader(keyvi::util::ValueToMsgPack(fsa_value)); + } + uint32_t GetWeight(uint64_t fsa_value) const override { return static_cast(fsa_value); } }; diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/int_value_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/int_value_store.h index ea3d48fc2..94f5a9c4a 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/int_value_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/int_value_store.h @@ -28,9 +28,11 @@ #include #include +#include "keyvi/compression/compression_selector.h" #include "keyvi/dictionary/fsa/internal/constants.h" #include "keyvi/dictionary/fsa/internal/ivalue_store.h" #include "keyvi/dictionary/fsa/internal/value_store_types.h" +#include "keyvi/util/msgpack_util.h" // #define ENABLE_TRACING #include "keyvi/dictionary/util/trace.h" @@ -108,6 +110,23 @@ class IntValueStoreReader final : public IValueStoreReader { } std::string GetValueAsString(uint64_t fsa_value) const override { return std::to_string(fsa_value); } + + std::string GetRawValueAsString(uint64_t fsa_value) const override { + // TODO(hendrik): replace with std::format once we have C++20 + return compression::compression_strategy_by_code(compression::CompressionAlgorithm::NO_COMPRESSION) + ->Compress(keyvi::util::ValueToMsgPack(fsa_value)); + } + + std::string GetMsgPackedValueAsString(uint64_t fsa_value, + const compression::CompressionAlgorithm compression_algorithm = + compression::CompressionAlgorithm::NO_COMPRESSION) const override { + if (compression_algorithm == compression::CompressionAlgorithm::NO_COMPRESSION) { + return keyvi::util::ValueToMsgPack(fsa_value); + } + + return compression::compression_strategy_by_code(compression_algorithm) + ->CompressWithoutHeader(keyvi::util::ValueToMsgPack(fsa_value)); + } }; template <> diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/ivalue_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/ivalue_store.h index 1175cb024..909cf91c5 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/ivalue_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/ivalue_store.h @@ -33,6 +33,7 @@ #include #include +#include "keyvi/compression/compression_selector.h" #include "keyvi/dictionary/dictionary_merger_fwd.h" #include "keyvi/dictionary/fsa/internal/value_store_properties.h" #include "keyvi/dictionary/fsa/internal/value_store_types.h" @@ -111,14 +112,24 @@ class IValueStoreReader { * Get Value as string in raw format * * Note: The raw format is an implementation detail of keyvi, not an official binary interface. - * Value store implementers can override this method for performance reasons. + * Value store implementers can override this method with an optimized version. * * @param fsa_value - * @return the value as string without any decompression + * @return the value as binary encoded string */ - virtual std::string GetRawValueAsString(uint64_t fsa_value) const { - return keyvi::util::EncodeJsonValue(GetValueAsString(fsa_value)); - } + virtual std::string GetRawValueAsString(uint64_t fsa_value) const = 0; + + /** + * Get Value as msgpack string + * + * Value store implementers can override this method with an optimized version. + * + * @param fsa_value + * @return the value as msgpack string + */ + virtual std::string GetMsgPackedValueAsString(uint64_t fsa_value, + const compression::CompressionAlgorithm compression_algorithm = + compression::CompressionAlgorithm::NO_COMPRESSION) const = 0; /** * Get Value as string (for dumping or communication) diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h index 78f0b149f..dbbf2777c 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h @@ -372,6 +372,35 @@ class JsonValueStoreReader final : public IValueStoreReader { return keyvi::util::decodeVarIntString(strings_ + fsa_value); } + std::string GetMsgPackedValueAsString(uint64_t fsa_value, + const compression::CompressionAlgorithm compression_algorithm = + compression::CompressionAlgorithm::NO_COMPRESSION) const override { + size_t value_size; + const char* value_ptr = keyvi::util::decodeVarIntString(strings_ + fsa_value, &value_size); + + if (value_size == 0) { + return std::string(); + } + + if (value_ptr[0] == compression_algorithm) { + return std::string(value_ptr + 1, value_size - 1); + } + + // decompress + const compression::decompress_func_t decompressor = + compression::decompressor_by_code(static_cast(value_ptr[0])); + std::string msgpacked_value = decompressor(std::string(value_ptr, value_size)); + + if (compression_algorithm == compression::CompressionAlgorithm::NO_COMPRESSION) { + return msgpacked_value; + } + // compress + const compression::compression_strategy_t compressor = + compression::compression_strategy_by_code(compression_algorithm); + + return compressor->CompressWithoutHeader(msgpacked_value); + } + std::string GetValueAsString(uint64_t fsa_value) const override { TRACE("JsonValueStoreReader GetValueAsString"); std::string packed_string = keyvi::util::decodeVarIntString(strings_ + fsa_value); diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/null_value_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/null_value_store.h index 7dc1d730e..17527eba5 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/null_value_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/null_value_store.h @@ -100,6 +100,20 @@ class NullValueStoreReader final : public IValueStoreReader { attributes_t GetValueAsAttributeVector(uint64_t fsa_value) const override { return attributes_t(); } std::string GetValueAsString(uint64_t fsa_value) const override { return ""; } + + // shortcut: `\00` for no compression, `\xc0` for nil/null in msgpack + std::string GetRawValueAsString(uint64_t fsa_value) const override { return "\x00\xc0"; } + + std::string GetMsgPackedValueAsString(uint64_t fsa_value, + const compression::CompressionAlgorithm compression_algorithm = + compression::CompressionAlgorithm::NO_COMPRESSION) const override { + // `\xc0` == msgpack nil + if (compression_algorithm == compression::CompressionAlgorithm::NO_COMPRESSION) { + return "\xc0"; + } + + return compression::compression_strategy_by_code(compression_algorithm)->CompressWithoutHeader("\xc0"); + } }; template <> diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/string_value_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/string_value_store.h index 3b40dda08..c6011a671 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/string_value_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/string_value_store.h @@ -284,6 +284,28 @@ class StringValueStoreReader final : public IValueStoreReader { std::string GetValueAsString(uint64_t fsa_value) const override { return std::string(strings_ + fsa_value); } + std::string GetRawValueAsString(uint64_t fsa_value) const override { + // TODO(hendrik): replace with std::format once we have C++20 + return compression::compression_strategy_by_code(compression::CompressionAlgorithm::NO_COMPRESSION) + ->Compress(keyvi::util::ValueToMsgPack(std::string(strings_ + fsa_value))); + } + + std::string GetMsgPackedValueAsString(uint64_t fsa_value, + const compression::CompressionAlgorithm compression_algorithm = + compression::CompressionAlgorithm::NO_COMPRESSION) const override { + std::string msgpacked_value = keyvi::util::ValueToMsgPack(std::string(strings_ + fsa_value)); + + if (compression_algorithm == compression::CompressionAlgorithm::NO_COMPRESSION) { + return msgpacked_value; + } + + // compress + const compression::compression_strategy_t compressor = + compression::compression_strategy_by_code(compression_algorithm); + + return compressor->CompressWithoutHeader(msgpacked_value); + } + private: boost::interprocess::mapped_region* strings_region_; const char* strings_; diff --git a/keyvi/include/keyvi/dictionary/match.h b/keyvi/include/keyvi/dictionary/match.h index db8b2a684..ee80d990d 100644 --- a/keyvi/include/keyvi/dictionary/match.h +++ b/keyvi/include/keyvi/dictionary/match.h @@ -32,6 +32,7 @@ #include #include +#include "keyvi/compression/compression_strategy.h" #include "keyvi/dictionary/fsa/automata.h" #include "keyvi/util/json_value.h" @@ -196,13 +197,33 @@ struct Match { return fsa_->GetRawValueAsString(state_); } - std::string GetMsgPackedValueAsString() const { - const std::string raw_value = GetRawValueAsString(); - if (raw_value.empty()) { - return raw_value; + std::string GetMsgPackedValueAsString(const compression::CompressionAlgorithm compression_algorithm = + compression::CompressionAlgorithm::NO_COMPRESSION) const { + if (!fsa_) { + if (raw_value_.empty()) { + return raw_value_; + } + + if (raw_value_[0] == compression_algorithm) { + return raw_value_.substr(1); + } + + // decompress + const compression::decompress_func_t decompressor = + compression::decompressor_by_code(static_cast(raw_value_[0])); + std::string msgpacked_value = decompressor(raw_value_); + + if (compression_algorithm == compression::CompressionAlgorithm::NO_COMPRESSION) { + return msgpacked_value; + } + // compress + const compression::compression_strategy_t compressor = + compression::compression_strategy_by_code(compression_algorithm); + + return compressor->CompressWithoutHeader(msgpacked_value); } - const compression::decompress_func_t decompressor = compression::decompressor_by_code(raw_value); - return decompressor(raw_value); + + return fsa_->GetMsgPackedValueAsString(state_, compression_algorithm); } /** diff --git a/keyvi/include/keyvi/util/float_vector_value.h b/keyvi/include/keyvi/util/float_vector_value.h index a36317331..8f9231a96 100644 --- a/keyvi/include/keyvi/util/float_vector_value.h +++ b/keyvi/include/keyvi/util/float_vector_value.h @@ -35,7 +35,7 @@ namespace keyvi { namespace util { inline std::vector DecodeFloatVector(const std::string& encoded_value) { - compression::decompress_func_t decompressor = compression::decompressor_by_code(encoded_value); + compression::decompress_func_t decompressor = compression::decompressor_from_string(encoded_value); std::string unompressed_string_value = decompressor(encoded_value); const size_t vector_size = unompressed_string_value.size() / sizeof(uint32_t); diff --git a/keyvi/include/keyvi/util/json_value.h b/keyvi/include/keyvi/util/json_value.h index 2fb35f3cf..6463dc011 100644 --- a/keyvi/include/keyvi/util/json_value.h +++ b/keyvi/include/keyvi/util/json_value.h @@ -42,7 +42,7 @@ namespace util { /** Decompresses (if needed) and decodes a json value stored in a JsonValueStore. */ inline std::string DecodeJsonValue(const std::string& encoded_value) { - compression::decompress_func_t decompressor = compression::decompressor_by_code(encoded_value); + compression::decompress_func_t decompressor = compression::decompressor_from_string(encoded_value); std::string packed_string = decompressor(encoded_value); TRACE("unpacking %s", packed_string.c_str()); @@ -64,17 +64,7 @@ inline void EncodeJsonValue(std::functionclear(); - rapidjson::Document json_document; - json_document.Parse(raw_value.c_str()); - - if (!json_document.HasParseError()) { - TRACE("Got json"); - msgpack::packer packer(msgpack_buffer); - JsonToMsgPack(json_document, &packer, single_precision_float); - } else { - TRACE("Got a normal string"); - msgpack::pack(msgpack_buffer, raw_value); - } + JsonStringToMsgPack(raw_value, msgpack_buffer, single_precision_float); // compression if (msgpack_buffer->size() > compression_threshold) { long_compress(buffer, msgpack_buffer->data(), msgpack_buffer->size()); diff --git a/keyvi/include/keyvi/util/msgpack_util.h b/keyvi/include/keyvi/util/msgpack_util.h index 8ce3a6ce4..101a85c02 100644 --- a/keyvi/include/keyvi/util/msgpack_util.h +++ b/keyvi/include/keyvi/util/msgpack_util.h @@ -25,11 +25,15 @@ #ifndef KEYVI_UTIL_MSGPACK_UTIL_H_ #define KEYVI_UTIL_MSGPACK_UTIL_H_ #include +#include #include "msgpack.hpp" #include "rapidjson/document.h" #include "rapidjson/writer.h" +// #define ENABLE_TRACING +#include "keyvi/dictionary/util/trace.h" + /** * Utility classes for msgpack. * @@ -147,6 +151,38 @@ inline void MsgPackDump(Writer* writer, const msgpack::object& o) { } } +inline void JsonStringToMsgPack(const std::string& raw_value, msgpack::v1::sbuffer* msgpack_buffer, + bool single_precision_float) { + rapidjson::Document json_document; + json_document.Parse(raw_value.c_str()); + + if (!json_document.HasParseError()) { + TRACE("Got json"); + msgpack::packer packer(msgpack_buffer); + JsonToMsgPack(json_document, &packer, single_precision_float); + } else { + TRACE("Got a normal string"); + msgpack::pack(msgpack_buffer, raw_value); + } +} + +inline std::string JsonStringToMsgPack(const std::string& raw_value, bool single_precision_float = false) { + msgpack::sbuffer msgpack_buffer; + + JsonStringToMsgPack(raw_value, &msgpack_buffer, single_precision_float); + return std::string(reinterpret_cast(msgpack_buffer.data()), msgpack_buffer.size()); +} + +template +inline std::string ValueToMsgPack(const T& value) { + msgpack::sbuffer msgpack_buffer; + + msgpack::packer pk(&msgpack_buffer); + pk.pack(value); + + return std::string(msgpack_buffer.data(), msgpack_buffer.size()); +} + } /* namespace util */ } /* namespace keyvi */ diff --git a/python/src/addons/Match.pyx b/python/src/addons/Match.pyx index 372714270..3c7163ecf 100644 --- a/python/src/addons/Match.pyx +++ b/python/src/addons/Match.pyx @@ -169,7 +169,11 @@ def GetRawValueAsString(self, *args): """deprecated, use get_raw_value_as_string""" - return call_deprecated_method("GetRawValueAsString", "raw_value_as_string", self.raw_value_as_string, *args) + return call_deprecated_method("GetRawValueAsString", "dumps", self.dumps, *args) + + def raw_value_as_string(self, *args): + """deprecated, use get_raw_value_as_string""" + return call_deprecated_method("raw_value_as_string", "dumps", self.dumps, *args) def __bool__(self): return not self.inst.get().IsEmpty() diff --git a/python/src/pxds/compression.pxd b/python/src/pxds/compression.pxd new file mode 100644 index 000000000..78555781a --- /dev/null +++ b/python/src/pxds/compression.pxd @@ -0,0 +1,6 @@ +cdef extern from "keyvi/compression/compression_algorithm.h" namespace "keyvi::compression": + ctypedef enum CompressionAlgorithm: + NO_COMPRESSION, + ZLIB_COMPRESSION, + SNAPPY_COMPRESSION, + ZSTD_COMPRESSION diff --git a/python/src/pxds/match.pxd b/python/src/pxds/match.pxd index 775c5d289..62f47eb8b 100644 --- a/python/src/pxds/match.pxd +++ b/python/src/pxds/match.pxd @@ -4,6 +4,7 @@ from libcpp.string cimport string as libcpp_utf8_string from libcpp.string cimport string as libcpp_utf8_output_string from libcpp cimport bool from cpython.ref cimport PyObject +from compression cimport CompressionAlgorithm cdef extern from "keyvi/dictionary/match.h" namespace "keyvi::dictionary": cdef cppclass Match: @@ -20,7 +21,8 @@ cdef extern from "keyvi/dictionary/match.h" namespace "keyvi::dictionary": PyObject* GetAttributePy(libcpp_utf8_string) except + nogil # wrap-ignore libcpp_utf8_output_string GetValueAsString() except + # wrap-as:value_as_string libcpp_string GetRawValueAsString() except + # wrap-as:raw_value_as_string - libcpp_string GetMsgPackedValueAsString() except + # wrap-ignore + libcpp_string GetMsgPackedValueAsString() except + # wrap-as:msgpacked_value_as_string + libcpp_string GetMsgPackedValueAsString(CompressionAlgorithm) except + # wrap-as:msgpacked_value_as_string void SetRawValue(libcpp_utf8_string) except + # wrap-ignore void SetAttribute(libcpp_utf8_string, libcpp_utf8_string) except + # wrap-ignore void SetAttribute(libcpp_utf8_string, float) except + # wrap-ignore diff --git a/python/src/py/keyvi/__init__.py b/python/src/py/keyvi/__init__.py index 647e52b97..352602f7e 100644 --- a/python/src/py/keyvi/__init__.py +++ b/python/src/py/keyvi/__init__.py @@ -20,4 +20,4 @@ from keyvi._version import __version__ # global keyvi concepts -from keyvi._core import MatchIterator, Match, loading_strategy_types +from keyvi._core import MatchIterator, Match, loading_strategy_types, CompressionAlgorithm diff --git a/python/tests/dictionary/string_dictionary_merger_test.py b/python/tests/dictionary/string_dictionary_merger_test.py index 5c8da42c1..219da5207 100644 --- a/python/tests/dictionary/string_dictionary_merger_test.py +++ b/python/tests/dictionary/string_dictionary_merger_test.py @@ -46,7 +46,7 @@ def generate_keyvi(key_values, filename): dictionary_compiler = StringDictionaryCompiler({"memory_limit_mb": "10"}) for key, value in key_values.items(): - dictionary_compiler.add(key, json.dumps(value)) + dictionary_compiler.add(key, value) dictionary_compiler.compile() dictionary_compiler.write_to_file(filename) diff --git a/python/tests/match_object_test.py b/python/tests/match_object_test.py index 9c907a2ac..b3a604d56 100644 --- a/python/tests/match_object_test.py +++ b/python/tests/match_object_test.py @@ -2,9 +2,12 @@ # Usage: py.test tests import keyvi +import msgpack from test_tools import tmp_dictionary import warnings - +import zlib +import snappy +import zstd from keyvi.compiler import ( JsonDictionaryCompiler, @@ -30,12 +33,13 @@ def test_raw_serialization(): c = JsonDictionaryCompiler({"memory_limit_mb": "10"}) c.add("abc", '{"a" : 2}') c.add("abd", '{"a" : 3}') - with tmp_dictionary(c, 'match_object_json.kv') as d: + with tmp_dictionary(c, "match_object_json.kv") as d: m = d["abc"] assert m.value_as_string() == '{"a":2}' d = m.dumps() m2 = keyvi.Match.loads(d) assert m2.value_as_string() == '{"a":2}' + assert msgpack.loads(m.msgpacked_value_as_string()) == {"a": 2} with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") assert m.GetValueAsString() == '{"a":2}' @@ -63,8 +67,8 @@ def test_unicode_attributes(): def test_bytes_attributes(): m = keyvi.Match() - bytes_key = bytes(u"äöü".encode('utf-8')) - bytes_value = bytes(u"äöüöäü".encode('utf-8')) + bytes_key = bytes("äöü".encode("utf-8")) + bytes_value = bytes("äöüöäü".encode("utf-8")) m[bytes_key] = 22 assert m[bytes_key] == 22 m["k2"] = bytes_value @@ -73,14 +77,14 @@ def test_bytes_attributes(): def test_double_attributes(): m = keyvi.Match() - bytes_key = bytes("abc".encode('utf-8')) + bytes_key = bytes("abc".encode("utf-8")) m[bytes_key] = 42.0 assert m[bytes_key] == 42.0 def test_boolean_attributes(): m = keyvi.Match() - bytes_key = bytes("def".encode('utf-8')) + bytes_key = bytes("def".encode("utf-8")) m[bytes_key] = True assert m[bytes_key] == True @@ -125,44 +129,98 @@ def test_get_value(): c = JsonDictionaryCompiler({"memory_limit_mb": "10"}) c.add("abc", '{"a" : 2}') c.add("abd", '{"a" : 3}') - with tmp_dictionary(c, 'match_object_json.kv') as d: + with tmp_dictionary(c, "match_object_json.kv") as d: m = d["abc"] assert m.value == {"a": 2} m = d["abd"] assert m.value == {"a": 3} + assert msgpack.loads(m.msgpacked_value_as_string()) == {"a": 3} + assert msgpack.loads( + zlib.decompress( + m.msgpacked_value_as_string(keyvi.CompressionAlgorithm.ZLIB_COMPRESSION) + ) + ) == {"a": 3} + assert msgpack.loads( + snappy.decompress( + m.msgpacked_value_as_string( + keyvi.CompressionAlgorithm.SNAPPY_COMPRESSION + ) + ) + ) == {"a": 3} + assert msgpack.loads( + zstd.decompress( + m.msgpacked_value_as_string(keyvi.CompressionAlgorithm.ZSTD_COMPRESSION) + ) + ) == {"a": 3} + assert msgpack.loads( + m.msgpacked_value_as_string(keyvi.CompressionAlgorithm.NO_COMPRESSION) + ) == {"a": 3} def test_get_value_int(): c = CompletionDictionaryCompiler({"memory_limit_mb": "10"}) c.add("abc", 42) c.add("abd", 21) - with tmp_dictionary(c, 'match_object_int.kv') as d: + with tmp_dictionary(c, "match_object_int.kv") as d: m = d["abc"] assert m.value == 42 m = d["abd"] assert m.value == 21 + assert msgpack.loads(m.msgpacked_value_as_string()) == 21 + assert ( + msgpack.loads( + zlib.decompress( + m.msgpacked_value_as_string( + keyvi.CompressionAlgorithm.ZLIB_COMPRESSION + ) + ) + ) + == 21 + ) def test_get_value_key_only(): c = KeyOnlyDictionaryCompiler({"memory_limit_mb": "10"}) c.add("abc") c.add("abd") - with tmp_dictionary(c, 'match_object_key_only.kv') as d: + with tmp_dictionary(c, "match_object_key_only.kv") as d: m = d["abc"] - assert m.value == '' + assert m.value is None m = d["abd"] - assert m.value == '' + assert m.value is None + assert msgpack.loads(m.msgpacked_value_as_string()) is None + assert ( + msgpack.loads( + zlib.decompress( + m.msgpacked_value_as_string( + keyvi.CompressionAlgorithm.ZLIB_COMPRESSION + ) + ) + ) + is None + ) def test_get_value_string(): c = StringDictionaryCompiler({"memory_limit_mb": "10"}) c.add("abc", "aaaaa") c.add("abd", "bbbbb") - with tmp_dictionary(c, 'match_object_string.kv') as d: + with tmp_dictionary(c, "match_object_string.kv") as d: m = d["abc"] assert m.value == "aaaaa" m = d["abd"] assert m.value == "bbbbb" + assert msgpack.loads(m.msgpacked_value_as_string()) == "bbbbb" + assert ( + msgpack.loads( + zlib.decompress( + m.msgpacked_value_as_string( + keyvi.CompressionAlgorithm.ZLIB_COMPRESSION + ) + ) + ) + == "bbbbb" + ) def test_matched_string(): diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 125fb8cb3..991c79d75 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -19,3 +19,6 @@ serde_json = ">=1.0" [dev-dependencies] rayon = "0.9" rand = ">=0.4" +snap = "1.1" +zstd = "0.13" +flate2 = "1.0" \ No newline at end of file diff --git a/rust/build.rs b/rust/build.rs index 07139f9eb..90202dcf9 100644 --- a/rust/build.rs +++ b/rust/build.rs @@ -49,8 +49,10 @@ fn main() { .header("keyvi_core/keyvi/include/keyvi/c_api/c_api.h") .clang_arg("-x") .clang_arg("c++") + .clang_arg("-Ikeyvi_core/keyvi/include") .enable_cxx_namespaces() .layout_tests(true) + .rustified_enum("keyvi::compression::CompressionAlgorithm") .allowlist_function("keyvi_bytes_destroy") .allowlist_function("keyvi_string_destroy") .allowlist_function("keyvi_create_dictionary") @@ -65,6 +67,7 @@ fn main() { .allowlist_function("keyvi_match_destroy") .allowlist_function("keyvi_match_get_matched_string") .allowlist_function("keyvi_match_get_msgpacked_value") + .allowlist_function("keyvi_match_get_msgpacked_value_compressed") .allowlist_function("keyvi_match_get_score") .allowlist_function("keyvi_match_get_value_as_string") .allowlist_function("keyvi_match_is_empty") diff --git a/rust/src/keyvi_match.rs b/rust/src/keyvi_match.rs index dcf759644..f090ab9a7 100644 --- a/rust/src/keyvi_match.rs +++ b/rust/src/keyvi_match.rs @@ -73,6 +73,25 @@ impl KeyviMatch { msgpacked_value } + pub fn get_msgpacked_value_compressed( + &self, + compression_algorithm: root::keyvi::compression::CompressionAlgorithm, + ) -> Vec { + let kv_bytes = unsafe { + root::keyvi_match_get_msgpacked_value_compressed(self.match_ptr_, compression_algorithm) + }; + let msgpacked_value = if kv_bytes.data_size == 0 { + Vec::new() + } else { + unsafe { + slice::from_raw_parts(kv_bytes.data_ptr, kv_bytes.data_size as usize).to_vec() + } + }; + unsafe { root::keyvi_bytes_destroy(kv_bytes) }; + + msgpacked_value + } + pub fn is_empty(&self) -> bool { unsafe { root::keyvi_match_is_empty(self.match_ptr_) } } diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 8790adc31..976c8b78f 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -36,3 +36,5 @@ pub mod dictionary; pub mod keyvi_match; pub mod keyvi_match_iterator; pub mod keyvi_string; + +pub type Compression = bindings::root::keyvi::compression::CompressionAlgorithm; diff --git a/rust/tests/tests.rs b/rust/tests/tests.rs index 99af74578..2d0bbd33f 100644 --- a/rust/tests/tests.rs +++ b/rust/tests/tests.rs @@ -1,15 +1,22 @@ +extern crate flate2; extern crate rand; extern crate rayon; extern crate serde_json; +extern crate snap; +extern crate zstd; extern crate keyvi; #[cfg(test)] mod tests { + use std::io::Read; + + use flate2::read::ZlibDecoder; use rand; use rand::Rng; use rayon::prelude::*; - use serde_json::Value; + use serde_json::{value, Value}; + use snap::raw::Decoder; use keyvi::dictionary; @@ -85,6 +92,37 @@ mod tests { assert!(m.get_value_as_string().is_empty()); } + #[test] + fn match_msgpacked_value_compressed_array() { + let m = dictionary::Dictionary::new("test_data/test.kv") + .unwrap() + .get("a"); + + assert_eq!( + m.get_msgpacked_value_compressed(keyvi::Compression::NO_COMPRESSION), + vec![146, 12, 13] + ); + + let mut snap_decoder = Decoder::new(); + let value_compressed_snap = + m.get_msgpacked_value_compressed(keyvi::Compression::SNAPPY_COMPRESSION); + let value_uncompressed_snap = snap_decoder.decompress_vec(&value_compressed_snap); + assert_eq!(value_uncompressed_snap.unwrap(), vec![146, 12, 13]); + + let value_compressed_zstd = + m.get_msgpacked_value_compressed(keyvi::Compression::ZSTD_COMPRESSION); + let value_uncompressed_zstd: Vec = + zstd::decode_all(value_compressed_zstd.as_slice()).unwrap(); + assert_eq!(value_uncompressed_zstd, vec![146, 12, 13]); + + let value_compressed_zlib = + m.get_msgpacked_value_compressed(keyvi::Compression::ZLIB_COMPRESSION); + let mut zlib_decoder = ZlibDecoder::new(value_compressed_zlib.as_slice()); + let mut value_uncompressed_zlib: Vec = Vec::new(); + let _ = zlib_decoder.read_to_end(&mut value_uncompressed_zlib); + assert_eq!(value_uncompressed_zlib, vec![146, 12, 13]); + } + #[test] fn match_value() { let d = dictionary::Dictionary::new("test_data/test.kv").unwrap();