From 7b1be5ad685ec098fe7f7a5dd9138312f4832b1c Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Mon, 10 Feb 2025 20:50:38 +0100 Subject: [PATCH 01/10] add zstd compression support --- CMakeLists.txt | 25 ++++-- cmake_modules/FindZSTD.cmake | 19 +++++ .../keyvi/compression/compression_selector.h | 6 ++ .../keyvi/compression/compression_strategy.h | 1 + .../compression/zstd_compression_strategy.h | 83 +++++++++++++++++++ .../zstd_compression_strategy_test.cpp | 46 ++++++++++ 6 files changed, 174 insertions(+), 6 deletions(-) create mode 100644 cmake_modules/FindZSTD.cmake create mode 100644 keyvi/include/keyvi/compression/zstd_compression_strategy.h create mode 100644 keyvi/tests/keyvi/compression/zstd_compression_strategy_test.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 0608bbd33..0226ea0b7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -118,6 +118,19 @@ else () set(_KEYVI_LINK_LIBRARIES_DYNAMIC "${_KEYVI_LINK_LIBRARIES_DYNAMIC} snappy") endif () +# Zstd +find_package(ZSTD REQUIRED) +if (ZSTD_FOUND) + list(APPEND KEYVI_INCLUDES "${ZSTD_INCLUDE_DIRS}") +else () + message(FATAL_ERROR "Can not find zstd") +endif (ZSTD_FOUND) +if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") + set(_KEYVI_LINK_LIBRARIES_STATIC "${_KEYVI_LINK_LIBRARIES_STATIC} zstd") +else () + set(_KEYVI_LINK_LIBRARIES_DYNAMIC "${_KEYVI_LINK_LIBRARIES_DYNAMIC} zstd") +endif () + # rapidjson list(APPEND KEYVI_INCLUDES "${CMAKE_CURRENT_SOURCE_DIR}/keyvi/3rdparty/rapidjson/include") @@ -143,7 +156,7 @@ string(REPLACE " " ";" _KEYVI_COMPILE_DEFINITIONS_LIST "${_KEYVI_COMPILE_DEFINIT # keyvicompiler add_executable(keyvicompiler keyvi/bin/keyvicompiler/keyvicompiler.cpp) -target_link_libraries(keyvicompiler ${Boost_LIBRARIES} ${ZLIB_LIBRARIES} ${Snappy_LIBRARY} ${_OS_LIBRARIES}) +target_link_libraries(keyvicompiler ${Boost_LIBRARIES} ${ZLIB_LIBRARIES} ${Snappy_LIBRARY} ${ZSTD_LIBRARIES} ${_OS_LIBRARIES}) target_compile_options(keyvicompiler PRIVATE ${_KEYVI_CXX_FLAGS_LIST}) 
target_compile_definitions(keyvicompiler PRIVATE ${_KEYVI_COMPILE_DEFINITIONS_LIST}) target_include_directories(keyvicompiler PRIVATE "$") @@ -152,7 +165,7 @@ install (TARGETS keyvicompiler DESTINATION bin COMPONENT applications OPTIONAL) # keyviinspector add_executable(keyviinspector keyvi/bin/keyviinspector/keyviinspector.cpp) -target_link_libraries(keyviinspector ${Boost_LIBRARIES} ${ZLIB_LIBRARIES} ${Snappy_LIBRARY} ${_OS_LIBRARIES}) +target_link_libraries(keyviinspector ${Boost_LIBRARIES} ${ZLIB_LIBRARIES} ${Snappy_LIBRARY} ${ZSTD_LIBRARIES} ${_OS_LIBRARIES}) target_compile_options(keyviinspector PRIVATE ${_KEYVI_CXX_FLAGS_LIST}) target_compile_definitions(keyviinspector PRIVATE ${_KEYVI_COMPILE_DEFINITIONS_LIST}) target_include_directories(keyviinspector PRIVATE "$") @@ -161,7 +174,7 @@ install (TARGETS keyviinspector DESTINATION bin COMPONENT applications OPTIONAL) # keyvimerger add_executable(keyvimerger keyvi/bin/keyvimerger/keyvimerger.cpp) -target_link_libraries(keyvimerger ${Boost_LIBRARIES} ${ZLIB_LIBRARIES} ${Snappy_LIBRARY} ${_OS_LIBRARIES}) +target_link_libraries(keyvimerger ${Boost_LIBRARIES} ${ZLIB_LIBRARIES} ${Snappy_LIBRARY} ${ZSTD_LIBRARIES} ${_OS_LIBRARIES}) target_compile_options(keyvimerger PRIVATE ${_KEYVI_CXX_FLAGS_LIST}) target_compile_definitions(keyvimerger PRIVATE ${_KEYVI_COMPILE_DEFINITIONS_LIST}) target_include_directories(keyvimerger PRIVATE "$") @@ -170,7 +183,7 @@ install (TARGETS keyvimerger DESTINATION bin COMPONENT applications) # keyvi_c add_library(keyvi_c SHARED keyvi/bin/keyvi_c/c_api.cpp) -target_link_libraries(keyvi_c ${Boost_LIBRARIES} ${ZLIB_LIBRARIES} ${Snappy_LIBRARY} ${_OS_LIBRARIES}) +target_link_libraries(keyvi_c ${Boost_LIBRARIES} ${ZLIB_LIBRARIES} ${Snappy_LIBRARY} ${ZSTD_LIBRARIES} ${_OS_LIBRARIES}) target_compile_options(keyvi_c PRIVATE ${_KEYVI_CXX_FLAGS_LIST}) target_compile_definitions(keyvi_c PRIVATE ${_KEYVI_COMPILE_DEFINITIONS_LIST}) target_include_directories(keyvi_c PRIVATE "$") @@ -178,7 +191,7 @@ 
target_include_directories(keyvi_c PRIVATE "$ # unit tests FILE(GLOB_RECURSE UNIT_TEST_SOURCES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} keyvi/tests/keyvi/*.cpp) add_executable(unit_test_all ${UNIT_TEST_SOURCES}) -target_link_libraries(unit_test_all ${Boost_LIBRARIES} ${ZLIB_LIBRARIES} ${Snappy_LIBRARY} ${_OS_LIBRARIES}) +target_link_libraries(unit_test_all ${Boost_LIBRARIES} ${ZLIB_LIBRARIES} ${Snappy_LIBRARY} ${ZSTD_LIBRARIES} ${_OS_LIBRARIES}) target_compile_options(unit_test_all PRIVATE ${_KEYVI_CXX_FLAGS_LIST}) target_compile_definitions(unit_test_all PRIVATE ${_KEYVI_COMPILE_DEFINITIONS_LIST}) target_include_directories(unit_test_all PRIVATE "$") @@ -224,7 +237,7 @@ add_library(keyvi INTERFACE) target_include_directories(keyvi INTERFACE "$") target_compile_definitions(keyvi INTERFACE ${_KEYVI_COMPILE_DEFINITIONS_LIST}) -target_link_libraries(keyvi INTERFACE ${Boost_LIBRARIES} ${ZLIB_LIBRARIES} ${Snappy_LIBRARY} ${_OS_LIBRARIES}) +target_link_libraries(keyvi INTERFACE ${Boost_LIBRARIES} ${ZLIB_LIBRARIES} ${Snappy_LIBRARY} ${ZSTD_LIBRARIES} ${_OS_LIBRARIES}) ### docs diff --git a/cmake_modules/FindZSTD.cmake b/cmake_modules/FindZSTD.cmake new file mode 100644 index 000000000..ebee04fcf --- /dev/null +++ b/cmake_modules/FindZSTD.cmake @@ -0,0 +1,19 @@ +# Find Zstd, a compression library; sets ZSTD_FOUND, ZSTD_INCLUDE_DIRS and ZSTD_LIBRARIES +find_package(PkgConfig) + +pkg_check_modules(ZSTD_PKGCONF libzstd) + +find_path(ZSTD_INCLUDE_DIRS + NAMES zstd.h + HINTS ${ZSTD_PKGCONF_INCLUDE_DIRS} # pkg-config result: searched before the system locations +) + +find_library(ZSTD_LIBRARIES + NAMES zstd + HINTS ${ZSTD_PKGCONF_LIBRARY_DIRS} # pkg-config result: searched before the system locations +) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(ZSTD DEFAULT_MSG ZSTD_INCLUDE_DIRS ZSTD_LIBRARIES) + +mark_as_advanced(ZSTD_INCLUDE_DIRS ZSTD_LIBRARIES) diff --git a/keyvi/include/keyvi/compression/compression_selector.h b/keyvi/include/keyvi/compression/compression_selector.h index 42579627b..5eea0f87e 100644 --- a/keyvi/include/keyvi/compression/compression_selector.h +++ 
b/keyvi/include/keyvi/compression/compression_selector.h @@ -33,6 +33,7 @@ #include "keyvi/compression/compression_strategy.h" #include "keyvi/compression/snappy_compression_strategy.h" #include "keyvi/compression/zlib_compression_strategy.h" +#include "keyvi/compression/zstd_compression_strategy.h" // #define ENABLE_TRACING #include "keyvi/dictionary/util/trace.h" @@ -51,6 +52,8 @@ inline CompressionStrategy* compression_strategy(const std::string& name = "") { return new ZlibCompressionStrategy(); // compression level? } else if (lower_name == "snappy") { return new SnappyCompressionStrategy(); + } else if (lower_name == "zstd") { + return new ZstdCompressionStrategy(); } else if (lower_name == "" || lower_name == "none" || lower_name == "raw") { return new RawCompressionStrategy(); } else { @@ -72,6 +75,9 @@ inline decompress_func_t decompressor_by_code(const std::string& s) { case SNAPPY_COMPRESSION: TRACE("unpack snappy compressed string"); return SnappyCompressionStrategy::DoDecompress; + case ZSTD_COMPRESSION: + TRACE("unpack zstd compressed string"); + return ZstdCompressionStrategy::DoDecompress; default: throw std::invalid_argument("Invalid compression code " + boost::lexical_cast(static_cast(s[0]))); diff --git a/keyvi/include/keyvi/compression/compression_strategy.h b/keyvi/include/keyvi/compression/compression_strategy.h index 533cf5107..2a7756026 100644 --- a/keyvi/include/keyvi/compression/compression_strategy.h +++ b/keyvi/include/keyvi/compression/compression_strategy.h @@ -36,6 +36,7 @@ enum CompressionCode { NO_COMPRESSION = 0, ZLIB_COMPRESSION = 1, SNAPPY_COMPRESSION = 2, + ZSTD_COMPRESSION = 3, }; // buffer type which is realloc-able diff --git a/keyvi/include/keyvi/compression/zstd_compression_strategy.h b/keyvi/include/keyvi/compression/zstd_compression_strategy.h new file mode 100644 index 000000000..6f2945709 --- /dev/null +++ b/keyvi/include/keyvi/compression/zstd_compression_strategy.h @@ -0,0 +1,83 @@ +/* * keyvi - A key value store. 
+ * + * Copyright 2015 Hendrik Muhs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * zstd_compression_strategy.h + * + * Created on: September 10, 2016 + * Author: Hendrik Muhs + */ + +#ifndef KEYVI_COMPRESSION_ZSTD_COMPRESSION_STRATEGY_H_ +#define KEYVI_COMPRESSION_ZSTD_COMPRESSION_STRATEGY_H_ + +#include + +#include + +#ifndef ZSTD_DEFAULT_CLEVEL + +/*-===== Pre-defined compression levels =====-*/ +#define ZSTD_DEFAULT_CLEVEL 3 +#define ZSTD_MAX_CLEVEL 22 +#endif + +#include "keyvi/compression/compression_strategy.h" + +// #define ENABLE_TRACING +#include "keyvi/dictionary/util/trace.h" + +namespace keyvi { +namespace compression { + +/** A compression strategy that wraps zstd. 
*/ +struct ZstdCompressionStrategy final : public CompressionStrategy { + ZstdCompressionStrategy(int compression_level = ZSTD_DEFAULT_CLEVEL) : compression_level_(compression_level) {} + + inline void Compress(buffer_t* buffer, const char* raw, size_t raw_size) { DoCompress(buffer, raw, raw_size); } + + inline void DoCompress(buffer_t* buffer, const char* raw, size_t raw_size) { + size_t output_length = ZSTD_compressBound(raw_size); + buffer->resize(output_length + 1); + buffer->data()[0] = static_cast(ZSTD_COMPRESSION); + + output_length = ZSTD_compress(buffer->data() + 1, output_length, raw, raw_size, compression_level_); + buffer->resize(output_length + 1); + } + + inline std::string Decompress(const std::string& compressed) { return DoDecompress(compressed); } + + static std::string DoDecompress(const std::string& compressed) { + std::string uncompressed; + + size_t dest_size = ZSTD_getFrameContentSize(&compressed.data()[1], compressed.size() - 1); + uncompressed.resize(dest_size); + ZSTD_decompress(&uncompressed[0], dest_size, &compressed.data()[1], compressed.size() - 1); + + return uncompressed; + } + + std::string name() const { return "zstd"; } + + private: + int compression_level_; +}; + +} /* namespace compression */ +} /* namespace keyvi */ + +#endif // KEYVI_COMPRESSION_ZSTD_COMPRESSION_STRATEGY_H_ diff --git a/keyvi/tests/keyvi/compression/zstd_compression_strategy_test.cpp b/keyvi/tests/keyvi/compression/zstd_compression_strategy_test.cpp new file mode 100644 index 000000000..26063f155 --- /dev/null +++ b/keyvi/tests/keyvi/compression/zstd_compression_strategy_test.cpp @@ -0,0 +1,46 @@ +/* keyvi - A key value store. + * + * Copyright 2025 Hendrik Muhs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +#include "keyvi/compression/compression_selector.h" + +namespace keyvi { +namespace compression { + +BOOST_AUTO_TEST_SUITE(ZstdCompressionStrategyTests) + +BOOST_AUTO_TEST_CASE(SimpleCompressAndUncompress) { + const char* input = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; + + std::unique_ptr cs; + cs.reset(compression_strategy("zstd")); + + auto compressed = cs->Compress(input); + BOOST_CHECK(compressed.size() < strlen(input)); + + auto uncompressed = cs->Decompress(compressed); + BOOST_CHECK_EQUAL(input, uncompressed); +} + +BOOST_AUTO_TEST_SUITE_END() + +} // namespace compression +} // namespace keyvi From 891ec6cba86c9a2f4946c6bb3328007064ddb30e Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Mon, 10 Feb 2025 21:36:51 +0100 Subject: [PATCH 02/10] add python test --- python/tests/json/json_dictionary_test.py | 56 +++++++++++------------ 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/python/tests/json/json_dictionary_test.py b/python/tests/json/json_dictionary_test.py index fea32f261..e38c670af 100644 --- a/python/tests/json/json_dictionary_test.py +++ b/python/tests/json/json_dictionary_test.py @@ -3,6 +3,7 @@ from test_tools import tmp_dictionary import sys import os +import pytest from keyvi.compiler import JsonDictionaryCompiler @@ -16,36 +17,27 @@ def test_simple(): c.add("abd", '{"a" : 3}') # use python syntax ala __setitem__ c["abd"] = '{"a" : 3}' - with tmp_dictionary(c, 'simple_json.kv') as d: + with tmp_dictionary(c, "simple_json.kv") as d: assert len(d) 
== 2 assert d["abc"].value_as_string() == '{"a":2}' assert d["abd"].value_as_string() == '{"a":3}' -def test_simple_zlib(): +@pytest.mark.parametrize("compression", ["zlib", "snappy", "zstd"]) +def test_simple_compression(compression: str): c = JsonDictionaryCompiler( - {"memory_limit_mb": "10", 'compression': 'z', 'compression_threshold': '0'}) - c.add("abc", '{"a" : 2}') - c.add("abd", '{"a" : 3}') - with tmp_dictionary(c, 'simple_json_z.kv') as d: + { + "memory_limit_mb": "10", + "compression": compression, + "compression_threshold": "0", + } + ) + with tmp_dictionary(c, f"simple_json_{compression}.kv") as d: assert len(d) == 2 assert d["abc"].value_as_string() == '{"a":2}' assert d["abd"].value_as_string() == '{"a":3}' - m = d.statistics()['Value Store'] - assert m['__compression'] == "zlib" - - -def test_simple_snappy(): - c = JsonDictionaryCompiler( - {"memory_limit_mb": "10", 'compression': 'snappy', 'compression_threshold': '0'}) - c.add("abc", '{"a" : 2}') - c.add("abd", '{"a" : 3}') - with tmp_dictionary(c, 'simple_json_snappy.kv') as d: - assert len(d) == 2 - assert d["abc"].value_as_string() == '{"a":2}' - assert d["abd"].value_as_string() == '{"a":3}' - m = d.statistics()['Value Store'] - assert m['__compression'] == "snappy" + m = d.statistics()["Value Store"] + assert m["__compression"] == compression def test_unicode_compile(): @@ -54,30 +46,38 @@ def test_unicode_compile(): c.add("üüüüüüabd", '{"a" : 3}') c.add(u"ääääädäd", '{"b" : 33}') - with tmp_dictionary(c, 'simple_json.kv') as d: + with tmp_dictionary(c, "simple_json.kv") as d: assert len(d) == 3 assert d["üöä"].value_as_string() == '{"y":2}' - assert d[u"üöä"].value_as_string() == '{"y":2}' + assert d["üöä"].value_as_string() == '{"y":2}' assert d["üüüüüüabd"].value_as_string() == '{"a":3}' assert d["ääääädäd"].value_as_string() == '{"b":33}' def test_float_compaction(): cs = JsonDictionaryCompiler( - {"memory_limit_mb": "10", 'floating_point_precision': 'single'}) + {"memory_limit_mb": 
"10", "floating_point_precision": "single"} + ) cd = JsonDictionaryCompiler({"memory_limit_mb": "10"}) # add a couple of floats to both cs.add('aa', '[1.7008715758978892, 1.8094465532317732, 1.6098250864350536, 1.6369107966501981, 1.7736887965234107, 1.606682751740542, 1.6186427703265525, 1.7939763843449683, 1.5973550162469434, 1.6799721708726192, 1.8199786239525833, 1.7956178070065245, 1.7269879953863045]') cd.add('aa', '[1.7008715758978892, 1.8094465532317732, 1.6098250864350536, 1.6369107966501981, 1.7736887965234107, 1.606682751740542, 1.6186427703265525, 1.7939763843449683, 1.5973550162469434, 1.6799721708726192, 1.8199786239525833, 1.7956178070065245, 1.7269879953863045]') + "[1.7008715758978892, 1.8094465532317732, 1.6098250864350536, 1.6369107966501981, 1.7736887965234107, 1.606682751740542, 1.6186427703265525, 1.7939763843449683, 1.5973550162469434, 1.6799721708726192, 1.8199786239525833, 1.7956178070065245, 1.7269879953863045]", + ) + cd.Add( + "aa", + "[1.7008715758978892, 1.8094465532317732, 1.6098250864350536, 1.6369107966501981, 1.7736887965234107, 1.606682751740542, 1.6186427703265525, 1.7939763843449683, 1.5973550162469434, 1.6799721708726192, 1.8199786239525833, 1.7956178070065245, 1.7269879953863045]", + ) - with tmp_dictionary(cs, 'json_single_precision_float.kv') as ds: - with tmp_dictionary(cd, 'json_double_precision_float.kv') as dd: + with tmp_dictionary(cs, "json_single_precision_float.kv") as ds: + with tmp_dictionary(cd, "json_double_precision_float.kv") as dd: # first some basic checks assert len(ds) == 1 assert len(dd) == 1 # simple test the length of the value store which shall be smaller for single floats stats_s = ds.statistics() stats_d = dd.statistics() - assert int(stats_s['Value Store']['size']) < int( - stats_d['Value Store']['size']) + assert int(stats_s["Value Store"]["size"]) < int( + stats_d["Value Store"]["size"] + ) From 5e0fe8c66623c9acf30f5471950dff4b0914ac20 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Thu, 20 Feb 
2025 09:14:38 +0100 Subject: [PATCH 03/10] change name for default version --- .../keyvi/compression/compression_strategy.h | 7 +++++++ .../compression/snappy_compression_strategy.h | 3 +++ .../compression/zlib_compression_strategy.h | 3 +++ .../compression/zstd_compression_strategy.h | 4 ++++ .../keyvi/dictionary/dictionary_merger.h | 3 ++- .../keyvi/dictionary/dictionary_properties.h | 2 ++ keyvi/include/keyvi/dictionary/fsa/generator.h | 5 ++++- .../keyvi/dictionary/fsa/internal/constants.h | 8 ++++---- .../fsa/internal/float_vector_value_store.h | 9 +++++++++ .../internal/int_inner_weights_value_store.h | 6 ++++++ .../dictionary/fsa/internal/int_value_store.h | 4 ++++ .../dictionary/fsa/internal/ivalue_store.h | 2 ++ .../dictionary/fsa/internal/json_value_store.h | 18 ++++++++++++++++++ .../dictionary/fsa/internal/null_value_store.h | 5 +++++ .../fsa/internal/string_value_store.h | 5 +++++ 15 files changed, 78 insertions(+), 6 deletions(-) diff --git a/keyvi/include/keyvi/compression/compression_strategy.h b/keyvi/include/keyvi/compression/compression_strategy.h index 2a7756026..721f96d80 100644 --- a/keyvi/include/keyvi/compression/compression_strategy.h +++ b/keyvi/include/keyvi/compression/compression_strategy.h @@ -29,6 +29,8 @@ #include #include +#include "keyvi/dictionary/fsa/internal/constants.h" + namespace keyvi { namespace compression { @@ -70,6 +72,9 @@ struct CompressionStrategy { /** The "name" of the compression strategy. 
*/ virtual std::string name() const = 0; + + /** The minimum version this compressor requires */ + virtual uint64_t GetFileVersionMin() const = 0; }; /** @@ -96,6 +101,8 @@ struct RawCompressionStrategy final : public CompressionStrategy { static inline std::string DoDecompress(const std::string& compressed) { return compressed.substr(1); } std::string name() const { return "raw"; } + + uint64_t GetFileVersionMin() const { return KEYVI_FILE_VERSION_MIN; } }; } /* namespace compression */ diff --git a/keyvi/include/keyvi/compression/snappy_compression_strategy.h b/keyvi/include/keyvi/compression/snappy_compression_strategy.h index cf7868251..39e393a8c 100644 --- a/keyvi/include/keyvi/compression/snappy_compression_strategy.h +++ b/keyvi/include/keyvi/compression/snappy_compression_strategy.h @@ -30,6 +30,7 @@ #include #include "keyvi/compression/compression_strategy.h" +#include "keyvi/dictionary/fsa/internal/constants.h" namespace keyvi { namespace compression { @@ -61,6 +62,8 @@ struct SnappyCompressionStrategy final : public CompressionStrategy { } std::string name() const { return "snappy"; } + + uint64_t GetFileVersionMin() const { return KEYVI_FILE_VERSION_MIN; } }; } /* namespace compression */ diff --git a/keyvi/include/keyvi/compression/zlib_compression_strategy.h b/keyvi/include/keyvi/compression/zlib_compression_strategy.h index 5cd3a1b82..16a701b11 100644 --- a/keyvi/include/keyvi/compression/zlib_compression_strategy.h +++ b/keyvi/include/keyvi/compression/zlib_compression_strategy.h @@ -31,6 +31,7 @@ #include #include "keyvi/compression/compression_strategy.h" +#include "keyvi/dictionary/fsa/internal/constants.h" // #define ENABLE_TRACING #include "keyvi/dictionary/util/trace.h" @@ -131,6 +132,8 @@ struct ZlibCompressionStrategy final : public CompressionStrategy { std::string name() const { return "zlib"; } + uint64_t GetFileVersionMin() const { return KEYVI_FILE_VERSION_MIN; } + private: z_stream zstream_compress_; }; diff --git 
a/keyvi/include/keyvi/compression/zstd_compression_strategy.h b/keyvi/include/keyvi/compression/zstd_compression_strategy.h index 6f2945709..57c744bf0 100644 --- a/keyvi/include/keyvi/compression/zstd_compression_strategy.h +++ b/keyvi/include/keyvi/compression/zstd_compression_strategy.h @@ -29,6 +29,8 @@ #include +#include "keyvi/dictionary/fsa/internal/constants.h" + #ifndef ZSTD_DEFAULT_CLEVEL /*-===== Pre-defined compression levels =====-*/ @@ -73,6 +75,8 @@ struct ZstdCompressionStrategy final : public CompressionStrategy { std::string name() const { return "zstd"; } + uint64_t GetFileVersionMin() const { return 3; } + private: int compression_level_; }; diff --git a/keyvi/include/keyvi/dictionary/dictionary_merger.h b/keyvi/include/keyvi/dictionary/dictionary_merger.h index bc62f4e26..5dd44b953 100644 --- a/keyvi/include/keyvi/dictionary/dictionary_merger.h +++ b/keyvi/include/keyvi/dictionary/dictionary_merger.h @@ -204,7 +204,8 @@ class DictionaryMerger final { } void CompleteMerge() { - ValueStoreMergeT* value_store = new ValueStoreMergeT(params_); + ValueStoreMergeT* value_store = new ValueStoreMergeT(inputFiles_, params_); + generator_ = GeneratorAdapter::template CreateGenerator>( GetTotalSparseArraySize(), params_, value_store); diff --git a/keyvi/include/keyvi/dictionary/dictionary_properties.h b/keyvi/include/keyvi/dictionary/dictionary_properties.h index 2c10aa1e3..7892298a3 100644 --- a/keyvi/include/keyvi/dictionary/dictionary_properties.h +++ b/keyvi/include/keyvi/dictionary/dictionary_properties.h @@ -147,6 +147,8 @@ class DictionaryProperties { const std::string& GetSpecializedDictionaryProperties() const { return specialized_dictionary_properties_; } + uint64_t GetVersion() const { return version_; } + std::string GetStatistics() const { rapidjson::StringBuffer string_buffer; rapidjson::Writer writer(string_buffer); diff --git a/keyvi/include/keyvi/dictionary/fsa/generator.h b/keyvi/include/keyvi/dictionary/fsa/generator.h index 
03a171ff2..4c519a846 100644 --- a/keyvi/include/keyvi/dictionary/fsa/generator.h +++ b/keyvi/include/keyvi/dictionary/fsa/generator.h @@ -295,7 +295,10 @@ class Generator final { stream << KEYVI_FILE_MAGIC; - keyvi::dictionary::DictionaryProperties p(KEYVI_FILE_VERSION_CURRENT, start_state_, number_of_keys_added_, + // value stores can ask for a higher version + const uint64_t file_version = std::max(KEYVI_FILE_VERSION_DEFAULT, value_store_->GetFileVersionMin()); + + keyvi::dictionary::DictionaryProperties p(file_version, start_state_, number_of_keys_added_, number_of_states_, value_store_->GetValueStoreType(), persistence_->GetVersion(), persistence_->GetSize(), manifest_, specialized_dictionary_properties_); diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/constants.h b/keyvi/include/keyvi/dictionary/fsa/internal/constants.h index 7100ae116..e2aae4389 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/constants.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/constants.h @@ -35,11 +35,11 @@ static const char KEYVI_FILE_MAGIC[] = "KEYVIFSA"; static const size_t KEYVI_FILE_MAGIC_LEN = 8; // min version of the file -static const int KEYVI_FILE_VERSION_MIN = 2; +static const uint64_t KEYVI_FILE_VERSION_MIN = 2; // max version of the file we support -static const int KEYVI_FILE_VERSION_MAX = 2; -// the current version of the file format -static const int KEYVI_FILE_VERSION_CURRENT = 2; +static const uint64_t KEYVI_FILE_VERSION_MAX = 3; +// the default version of the file format +static const uint64_t KEYVI_FILE_VERSION_DEFAULT = 2; // min version of the persistence part static const int KEYVI_FILE_PERSISTENCE_VERSION_MIN = 2; diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/float_vector_value_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/float_vector_value_store.h index 8b9b05cac..f8d2b0d15 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/float_vector_value_store.h +++ 
b/keyvi/include/keyvi/dictionary/fsa/internal/float_vector_value_store.h @@ -25,6 +25,7 @@ #include "keyvi/compression/compression_selector.h" #include "keyvi/dictionary/dictionary_properties.h" +#include "keyvi/dictionary/fsa/internal/constants.h" #include "keyvi/dictionary/fsa/internal/ivalue_store.h" #include "keyvi/dictionary/fsa/internal/lru_generation_cache.h" #include "keyvi/dictionary/fsa/internal/memory_map_flags.h" @@ -63,6 +64,8 @@ class FloatVectorValueStoreBase { uint32_t GetMergeWeight(uint64_t fsa_value) { return 0; } + uint64_t GetFileVersionMin() const { return KEYVI_FILE_VERSION_MIN; } + static value_store_t GetValueStoreType() { return value_store_t::FLOAT_VECTOR; } protected: @@ -218,6 +221,8 @@ class FloatVectorValueStoreMergeBase { uint32_t GetMergeWeight(uint64_t fsa_value) { return 0; } + uint64_t GetFileVersionMin() const { return KEYVI_FILE_VERSION_MIN; } + static value_store_t GetValueStoreType() { return value_store_t::FLOAT_VECTOR; } protected: @@ -229,6 +234,10 @@ class FloatVectorValueStoreMergeBase { class FloatVectorValueStoreMerge final : public FloatVectorValueStoreMergeBase { public: explicit FloatVectorValueStoreMerge(const keyvi::util::parameters_t& parameters = keyvi::util::parameters_t()) + : FloatVectorValueStoreMerge({}, parameters) {} + + explicit FloatVectorValueStoreMerge(const std::vector& inputFiles, + const keyvi::util::parameters_t& parameters = keyvi::util::parameters_t()) : hash_(keyvi::util::mapGetMemory(parameters, MEMORY_LIMIT_KEY, DEFAULT_MEMORY_LIMIT_VALUE_STORE)) { temporary_directory_ = keyvi::util::mapGetTemporaryPath(parameters); diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/int_inner_weights_value_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/int_inner_weights_value_store.h index fff3ec46d..5497c5868 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/int_inner_weights_value_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/int_inner_weights_value_store.h @@ -28,6 +28,7 @@ 
#include #include +#include "keyvi/dictionary/fsa/internal/constants.h" #include "keyvi/dictionary/fsa/internal/ivalue_store.h" #include "keyvi/dictionary/fsa/internal/value_store_types.h" @@ -59,8 +60,11 @@ class IntInnerWeightsValueStoreBase { uint32_t GetMergeWeight(uint64_t fsa_value) { return fsa_value; } void CloseFeeding() {} + void Write(std::ostream& stream) const {} + uint64_t GetFileVersionMin() const { return KEYVI_FILE_VERSION_MIN; } + static value_store_t GetValueStoreType() { return value_store_t::INT_WITH_WEIGHTS; } }; @@ -75,6 +79,8 @@ class IntInnerWeightsValueStore final : public IntInnerWeightsValueStoreBase { class IntInnerWeightsValueStoreMerge final : public IntInnerWeightsValueStoreBase { public: explicit IntInnerWeightsValueStoreMerge(const keyvi::util::parameters_t& parameters = keyvi::util::parameters_t()) {} + explicit IntInnerWeightsValueStoreMerge(const std::vector&, + const keyvi::util::parameters_t& parameters = keyvi::util::parameters_t()) {} uint64_t AddValueMerge(const char* p, uint64_t v, bool* no_minimization) { return v; } }; diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/int_value_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/int_value_store.h index beddd8e5a..ea3d48fc2 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/int_value_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/int_value_store.h @@ -28,6 +28,7 @@ #include #include +#include "keyvi/dictionary/fsa/internal/constants.h" #include "keyvi/dictionary/fsa/internal/ivalue_store.h" #include "keyvi/dictionary/fsa/internal/value_store_types.h" @@ -60,6 +61,7 @@ class IntValueStoreBase { void CloseFeeding() {} void Write(std::ostream& stream) const {} + uint64_t GetFileVersionMin() const { return KEYVI_FILE_VERSION_MIN; } static value_store_t GetValueStoreType() { return value_store_t::INT; } }; @@ -76,6 +78,8 @@ class IntValueStore final : public IntValueStoreBase { class IntValueStoreMerge final : public IntValueStoreBase { public: 
explicit IntValueStoreMerge(const keyvi::util::parameters_t& parameters = keyvi::util::parameters_t()) {} + explicit IntValueStoreMerge(const std::vector&, + const keyvi::util::parameters_t& parameters = keyvi::util::parameters_t()) {} uint64_t AddValueMerge(const char* p, uint64_t v, bool* no_minimization) { return v; } }; diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/ivalue_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/ivalue_store.h index cee376dc9..1175cb024 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/ivalue_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/ivalue_store.h @@ -71,6 +71,8 @@ struct ValueStoreComponents {}; * void CloseFeeding() * * void Write(std::ostream& stream) + * + * uint64_t GetFileVersionMin() */ /** diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h index 9d3e5d41b..c0c0c254d 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h @@ -194,6 +194,8 @@ class JsonValueStore final : public JsonValueStoreMinimizationBase { values_extern_->Write(stream, values_buffer_size_); } + uint64_t GetFileVersionMin() const { return compressor_->GetFileVersionMin(); } + private: /* * Compressors & the associated compression functions. 
Ugly, but @@ -228,6 +230,13 @@ class JsonValueStoreMerge final : public JsonValueStoreMinimizationBase { public: explicit JsonValueStoreMerge(const keyvi::util::parameters_t& parameters = keyvi::util::parameters_t()) : JsonValueStoreMinimizationBase(parameters) {} + explicit JsonValueStoreMerge(const std::vector& inputFiles, + const keyvi::util::parameters_t& parameters = keyvi::util::parameters_t()) + : JsonValueStoreMinimizationBase(parameters) { + for (const auto& file_name : inputFiles) { + file_version_min_ = std::max(file_version_min_, DictionaryProperties::FromFile(file_name).GetVersion()); + } + } uint64_t AddValueMerge(const char* payload, uint64_t fsa_value, bool* no_minimization) { size_t buffer_size; @@ -268,6 +277,11 @@ class JsonValueStoreMerge final : public JsonValueStoreMinimizationBase { values_extern_->Write(stream, values_buffer_size_); } + + uint64_t GetFileVersionMin() const { return file_version_min_; } + + private: + uint64_t file_version_min_ = 0; }; class JsonValueStoreAppendMerge final : public JsonValueStoreBase { @@ -284,6 +298,7 @@ class JsonValueStoreAppendMerge final : public JsonValueStoreBase { number_of_values_ += properties_.back().GetValueStoreProperties().GetNumberOfValues(); number_of_unique_values_ += properties_.back().GetValueStoreProperties().GetNumberOfUniqueValues(); values_buffer_size_ += properties_.back().GetValueStoreProperties().GetSize(); + file_version_min_ = std::max(file_version_min_, properties_.back().GetVersion()); } } @@ -305,10 +320,13 @@ class JsonValueStoreAppendMerge final : public JsonValueStoreBase { } } + uint64_t GetFileVersionMin() const { return file_version_min_; } + private: std::vector input_files_; std::vector properties_; std::vector offsets_; + uint64_t file_version_min_ = 0; }; class JsonValueStoreReader final : public IValueStoreReader { diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/null_value_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/null_value_store.h index 
8b945d7d8..7dc1d730e 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/null_value_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/null_value_store.h @@ -28,6 +28,7 @@ #include #include +#include "keyvi/dictionary/fsa/internal/constants.h" #include "keyvi/dictionary/fsa/internal/ivalue_store.h" #include "keyvi/dictionary/fsa/internal/value_store_types.h" @@ -57,6 +58,8 @@ class NullValueStoreBase { void Write(std::ostream& stream) const {} + uint64_t GetFileVersionMin() const { return KEYVI_FILE_VERSION_MIN; } + static value_store_t GetValueStoreType() { return value_store_t::KEY_ONLY; } }; @@ -72,6 +75,8 @@ class NullValueStore final : public NullValueStoreBase { class NullValueStoreMerge final : public NullValueStoreBase { public: explicit NullValueStoreMerge(const keyvi::util::parameters_t& parameters = keyvi::util::parameters_t()) {} + explicit NullValueStoreMerge(const std::vector&, + const keyvi::util::parameters_t& parameters = keyvi::util::parameters_t()) {} uint64_t AddValueMerge(const char* p, uint64_t v, bool* no_minimization) { return 0; } }; diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/string_value_store.h b/keyvi/include/keyvi/dictionary/fsa/internal/string_value_store.h index 1f6f85be2..3b40dda08 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/string_value_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/string_value_store.h @@ -33,6 +33,7 @@ #include #include "keyvi/dictionary/dictionary_properties.h" +#include "keyvi/dictionary/fsa/internal/constants.h" #include "keyvi/dictionary/fsa/internal/ivalue_store.h" #include "keyvi/dictionary/fsa/internal/lru_generation_cache.h" #include "keyvi/dictionary/fsa/internal/memory_map_flags.h" @@ -70,6 +71,8 @@ class StringValueStoreBase { static value_store_t GetValueStoreType() { return value_store_t::STRING; } + uint64_t GetFileVersionMin() const { return KEYVI_FILE_VERSION_MIN; } + protected: size_t number_of_values_ = 0; size_t number_of_unique_values_ = 0; @@ 
-167,6 +170,8 @@ class StringValueStore final : public StringValueStoreMinimizationBase { class StringValueStoreMerge final : public StringValueStoreMinimizationBase { public: explicit StringValueStoreMerge(const keyvi::util::parameters_t& parameters = keyvi::util::parameters_t()) {} + explicit StringValueStoreMerge(const std::vector& inputFiles, + const keyvi::util::parameters_t& parameters = keyvi::util::parameters_t()) {} uint64_t AddValueMerge(const char* payload, uint64_t fsa_value, bool* no_minimization) { const char* value = payload + fsa_value; From 0f972279d868405c598595a977e3895572545a51 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Thu, 20 Feb 2025 11:26:03 +0100 Subject: [PATCH 04/10] checkstyle --- keyvi/include/keyvi/dictionary/fsa/generator.h | 7 +++---- .../keyvi/dictionary/fsa/internal/json_value_store.h | 1 + 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/keyvi/include/keyvi/dictionary/fsa/generator.h b/keyvi/include/keyvi/dictionary/fsa/generator.h index 4c519a846..aebdf19dd 100644 --- a/keyvi/include/keyvi/dictionary/fsa/generator.h +++ b/keyvi/include/keyvi/dictionary/fsa/generator.h @@ -298,10 +298,9 @@ class Generator final { // value stores can ask for a higher version const uint64_t file_version = std::max(KEYVI_FILE_VERSION_DEFAULT, value_store_->GetFileVersionMin()); - keyvi::dictionary::DictionaryProperties p(file_version, start_state_, number_of_keys_added_, - number_of_states_, value_store_->GetValueStoreType(), - persistence_->GetVersion(), persistence_->GetSize(), manifest_, - specialized_dictionary_properties_); + keyvi::dictionary::DictionaryProperties p(file_version, start_state_, number_of_keys_added_, number_of_states_, + value_store_->GetValueStoreType(), persistence_->GetVersion(), + persistence_->GetSize(), manifest_, specialized_dictionary_properties_); p.WriteAsJsonV2(stream); // write data from persistence diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h 
b/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h index c0c0c254d..78f0b149f 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/json_value_store.h @@ -25,6 +25,7 @@ #ifndef KEYVI_DICTIONARY_FSA_INTERNAL_JSON_VALUE_STORE_H_ #define KEYVI_DICTIONARY_FSA_INTERNAL_JSON_VALUE_STORE_H_ +#include #include #include #include From fbc86d109a7d475559514be7ee4a6d7796385f39 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Thu, 20 Feb 2025 15:26:13 +0100 Subject: [PATCH 05/10] add python test --- python/tests/json/json_dictionary_test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/tests/json/json_dictionary_test.py b/python/tests/json/json_dictionary_test.py index e38c670af..6bcd4002f 100644 --- a/python/tests/json/json_dictionary_test.py +++ b/python/tests/json/json_dictionary_test.py @@ -44,7 +44,7 @@ def test_unicode_compile(): c = JsonDictionaryCompiler({"memory_limit_mb": "10"}) c.add("üöä", '{"y" : 2}') c.add("üüüüüüabd", '{"a" : 3}') - c.add(u"ääääädäd", '{"b" : 33}') + c.add("ääääädäd", '{"b" : 33}') with tmp_dictionary(c, "simple_json.kv") as d: assert len(d) == 3 @@ -61,11 +61,11 @@ def test_float_compaction(): cd = JsonDictionaryCompiler({"memory_limit_mb": "10"}) # add a couple of floats to both - cs.add('aa', '[1.7008715758978892, 1.8094465532317732, 1.6098250864350536, 1.6369107966501981, 1.7736887965234107, 1.606682751740542, 1.6186427703265525, 1.7939763843449683, 1.5973550162469434, 1.6799721708726192, 1.8199786239525833, 1.7956178070065245, 1.7269879953863045]') - cd.add('aa', '[1.7008715758978892, 1.8094465532317732, 1.6098250864350536, 1.6369107966501981, 1.7736887965234107, 1.606682751740542, 1.6186427703265525, 1.7939763843449683, 1.5973550162469434, 1.6799721708726192, 1.8199786239525833, 1.7956178070065245, 1.7269879953863045]') + cs.add( + "aa", "[1.7008715758978892, 1.8094465532317732, 1.6098250864350536, 1.6369107966501981, 
1.7736887965234107, 1.606682751740542, 1.6186427703265525, 1.7939763843449683, 1.5973550162469434, 1.6799721708726192, 1.8199786239525833, 1.7956178070065245, 1.7269879953863045]", ) - cd.Add( + cd.add( "aa", "[1.7008715758978892, 1.8094465532317732, 1.6098250864350536, 1.6369107966501981, 1.7736887965234107, 1.606682751740542, 1.6186427703265525, 1.7939763843449683, 1.5973550162469434, 1.6799721708726192, 1.8199786239525833, 1.7956178070065245, 1.7269879953863045]", ) From a5fb3b2f167abad228adfac9c3faaad9e4560b02 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Thu, 20 Feb 2025 16:52:30 +0100 Subject: [PATCH 06/10] -re-add accidentally removed lines after merge --- python/tests/json/json_dictionary_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/tests/json/json_dictionary_test.py b/python/tests/json/json_dictionary_test.py index 6bcd4002f..ac4997e66 100644 --- a/python/tests/json/json_dictionary_test.py +++ b/python/tests/json/json_dictionary_test.py @@ -32,6 +32,8 @@ def test_simple_compression(compression: str): "compression_threshold": "0", } ) + c.add("abc", '{"a" : 2}') + c.add("abd", '{"a" : 3}') with tmp_dictionary(c, f"simple_json_{compression}.kv") as d: assert len(d) == 2 assert d["abc"].value_as_string() == '{"a":2}' From 7d2763bbaad4c2f32aa18ff41f2c119ceafacdfe Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Thu, 20 Feb 2025 23:12:39 +0100 Subject: [PATCH 07/10] add a test for versions by compression --- keyvi/include/keyvi/dictionary/dictionary.h | 2 + keyvi/include/keyvi/dictionary/fsa/automata.h | 14 ++-- .../dictionary/dictionary_version_test.cpp | 79 +++++++++++++++++++ 3 files changed, 86 insertions(+), 9 deletions(-) create mode 100644 keyvi/tests/keyvi/dictionary/dictionary_version_test.cpp diff --git a/keyvi/include/keyvi/dictionary/dictionary.h b/keyvi/include/keyvi/dictionary/dictionary.h index 5ef2f1f6a..fec3a3e1b 100644 --- a/keyvi/include/keyvi/dictionary/dictionary.h +++ b/keyvi/include/keyvi/dictionary/dictionary.h 
@@ -72,6 +72,8 @@ class Dictionary final { uint64_t GetSize() const { return fsa_->GetNumberOfKeys(); } + uint64_t GetVersion() const { return fsa_->GetVersion(); } + /** * A simple Contains method to check whether a key is in the dictionary. * diff --git a/keyvi/include/keyvi/dictionary/fsa/automata.h b/keyvi/include/keyvi/dictionary/fsa/automata.h index b25bc2b9c..64565dfb1 100644 --- a/keyvi/include/keyvi/dictionary/fsa/automata.h +++ b/keyvi/include/keyvi/dictionary/fsa/automata.h @@ -394,13 +394,11 @@ class Automata final { return value_store_reader_->GetRawValueAsString(state_value); } - std::string GetStatistics() const { - return dictionary_properties_->GetStatistics(); - } + std::string GetStatistics() const { return dictionary_properties_->GetStatistics(); } - const std::string& GetManifest() const { - return dictionary_properties_->GetManifest(); - } + const std::string& GetManifest() const { return dictionary_properties_->GetManifest(); } + + const uint64_t GetVersion() const { return dictionary_properties_->GetVersion(); } private: dictionary_properties_t dictionary_properties_; @@ -462,9 +460,7 @@ class Automata final { friend class keyvi::dictionary::SecondaryKeyDictionary; - const dictionary_properties_t& GetDictionaryProperties() const { - return dictionary_properties_; - } + const dictionary_properties_t& GetDictionaryProperties() const { return dictionary_properties_; } }; // shared pointer diff --git a/keyvi/tests/keyvi/dictionary/dictionary_version_test.cpp b/keyvi/tests/keyvi/dictionary/dictionary_version_test.cpp new file mode 100644 index 000000000..11a4bbec7 --- /dev/null +++ b/keyvi/tests/keyvi/dictionary/dictionary_version_test.cpp @@ -0,0 +1,79 @@ +// +// keyvi - A key value store. +// +// Copyright 2025 Hendrik Muhs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +/* + * dictionary_version_test.cpp + * + * Created on: Feb 20, 2025 + * Author: hendrik + */ + +#include +#include + +#include + +#include "keyvi/dictionary/dictionary.h" +#include "keyvi/dictionary/dictionary_compiler.h" +#include "keyvi/dictionary/dictionary_index_compiler.h" +#include "keyvi/dictionary/dictionary_types.h" +#include "keyvi/dictionary/fsa/automata.h" +#include "keyvi/dictionary/fsa/entry_iterator.h" +#include "keyvi/dictionary/fsa/internal/int_inner_weights_value_store.h" +#include "keyvi/dictionary/fsa/internal/int_value_store.h" +#include "keyvi/dictionary/fsa/internal/json_value_store.h" +#include "keyvi/dictionary/fsa/internal/sparse_array_persistence.h" +#include "keyvi/util/configuration.h" +#include "keyvi/util/float_vector_value.h" + +namespace keyvi { +namespace dictionary { + +BOOST_AUTO_TEST_SUITE(DictionaryVersionTests) + +void check_dictionary_version_with_compression(const std::string& compression, uint64_t expected_version) { + keyvi::dictionary::DictionaryCompiler compiler( + {{"memory_limit_mb", "10"}, {"compression", compression}, {"compression_threshold", "0"}}); + + for (size_t i = 0; i < 10; ++i) { + compiler.Add("key-" + std::to_string(i), "{\"id\":" + std::to_string(i) + "}"); + } + compiler.Compile(); + + boost::filesystem::path temp_path = boost::filesystem::temp_directory_path(); + temp_path /= boost::filesystem::unique_path("dictionary-unit-test-dictionarycompiler-%%%%-%%%%-%%%%-%%%%"); + const std::string file_name = temp_path.string(); + + compiler.WriteToFile(file_name); + + const Dictionary 
d(file_name); + BOOST_CHECK_EQUAL(expected_version, d.GetVersion()); + BOOST_CHECK(std::remove(file_name.c_str()) == 0); +} + +BOOST_AUTO_TEST_CASE(CheckVersionsByCompression) { + check_dictionary_version_with_compression("raw", 2); + check_dictionary_version_with_compression("zlib", 2); + check_dictionary_version_with_compression("snappy", 2); + check_dictionary_version_with_compression("zstd", 3); +} + +BOOST_AUTO_TEST_SUITE_END() + +} /* namespace dictionary */ +} /* namespace keyvi */ From db4fe6ed3bf94253d95fecfea8ba4b7e0c426e46 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Thu, 20 Feb 2025 23:18:58 +0100 Subject: [PATCH 08/10] remove unnecessary headers --- .../tests/keyvi/dictionary/dictionary_version_test.cpp | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/keyvi/tests/keyvi/dictionary/dictionary_version_test.cpp b/keyvi/tests/keyvi/dictionary/dictionary_version_test.cpp index 11a4bbec7..0f6dfdd74 100644 --- a/keyvi/tests/keyvi/dictionary/dictionary_version_test.cpp +++ b/keyvi/tests/keyvi/dictionary/dictionary_version_test.cpp @@ -24,22 +24,12 @@ */ #include -#include #include #include "keyvi/dictionary/dictionary.h" #include "keyvi/dictionary/dictionary_compiler.h" -#include "keyvi/dictionary/dictionary_index_compiler.h" #include "keyvi/dictionary/dictionary_types.h" -#include "keyvi/dictionary/fsa/automata.h" -#include "keyvi/dictionary/fsa/entry_iterator.h" -#include "keyvi/dictionary/fsa/internal/int_inner_weights_value_store.h" -#include "keyvi/dictionary/fsa/internal/int_value_store.h" -#include "keyvi/dictionary/fsa/internal/json_value_store.h" -#include "keyvi/dictionary/fsa/internal/sparse_array_persistence.h" -#include "keyvi/util/configuration.h" -#include "keyvi/util/float_vector_value.h" namespace keyvi { namespace dictionary { From 5325cfe41a2dff32c5111e25bc1c3b054135d487 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Thu, 20 Feb 2025 23:22:52 +0100 Subject: [PATCH 09/10] apply clang-format-14 --- 
keyvi/include/keyvi/dictionary/fsa/automata.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/keyvi/include/keyvi/dictionary/fsa/automata.h b/keyvi/include/keyvi/dictionary/fsa/automata.h index 64565dfb1..8b531e32d 100644 --- a/keyvi/include/keyvi/dictionary/fsa/automata.h +++ b/keyvi/include/keyvi/dictionary/fsa/automata.h @@ -394,11 +394,17 @@ class Automata final { return value_store_reader_->GetRawValueAsString(state_value); } - std::string GetStatistics() const { return dictionary_properties_->GetStatistics(); } + std::string GetStatistics() const { + return dictionary_properties_->GetStatistics(); + } - const std::string& GetManifest() const { return dictionary_properties_->GetManifest(); } + const std::string& GetManifest() const { + return dictionary_properties_->GetManifest(); + } - const uint64_t GetVersion() const { return dictionary_properties_->GetVersion(); } + const uint64_t GetVersion() const { + return dictionary_properties_->GetVersion(); + } private: dictionary_properties_t dictionary_properties_; @@ -460,7 +466,9 @@ class Automata final { friend class keyvi::dictionary::SecondaryKeyDictionary; - const dictionary_properties_t& GetDictionaryProperties() const { return dictionary_properties_; } + const dictionary_properties_t& GetDictionaryProperties() const { + return dictionary_properties_; + } }; // shared pointer From a03f0823c1ce0c3ed9f363bbc5c0eae8d4718a83 Mon Sep 17 00:00:00 2001 From: Hendrik Muhs Date: Fri, 21 Feb 2025 21:13:51 +0100 Subject: [PATCH 10/10] simplify versioning by removing a variable --- keyvi/include/keyvi/dictionary/fsa/generator.h | 2 +- keyvi/include/keyvi/dictionary/fsa/internal/constants.h | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/keyvi/include/keyvi/dictionary/fsa/generator.h b/keyvi/include/keyvi/dictionary/fsa/generator.h index aebdf19dd..305d64a7e 100644 --- a/keyvi/include/keyvi/dictionary/fsa/generator.h +++ 
b/keyvi/include/keyvi/dictionary/fsa/generator.h @@ -296,7 +296,7 @@ class Generator final { stream << KEYVI_FILE_MAGIC; // value stores can ask for a higher version - const uint64_t file_version = std::max(KEYVI_FILE_VERSION_DEFAULT, value_store_->GetFileVersionMin()); + const uint64_t file_version = std::max(KEYVI_FILE_VERSION_MIN, value_store_->GetFileVersionMin()); keyvi::dictionary::DictionaryProperties p(file_version, start_state_, number_of_keys_added_, number_of_states_, value_store_->GetValueStoreType(), persistence_->GetVersion(), diff --git a/keyvi/include/keyvi/dictionary/fsa/internal/constants.h b/keyvi/include/keyvi/dictionary/fsa/internal/constants.h index e2aae4389..cfbd81a12 100644 --- a/keyvi/include/keyvi/dictionary/fsa/internal/constants.h +++ b/keyvi/include/keyvi/dictionary/fsa/internal/constants.h @@ -34,12 +34,10 @@ static const char KEYVI_FILE_MAGIC[] = "KEYVIFSA"; static const size_t KEYVI_FILE_MAGIC_LEN = 8; -// min version of the file +// min version of the file format static const uint64_t KEYVI_FILE_VERSION_MIN = 2; -// max version of the file we support +// max version of the file format supported static const uint64_t KEYVI_FILE_VERSION_MAX = 3; -// the default version of the file format -static const uint64_t KEYVI_FILE_VERSION_DEFAULT = 2; // min version of the persistence part static const int KEYVI_FILE_PERSISTENCE_VERSION_MIN = 2;