Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
cb6ff29
WIP
hendrikmuhs Feb 17, 2025
8d708c0
switch to unique ptr
hendrikmuhs Feb 22, 2025
28491ec
add compression support in GetMsgPackedValueAsString
hendrikmuhs Feb 26, 2025
3d28782
format
hendrikmuhs Feb 26, 2025
6c9d974
renamings
hendrikmuhs Feb 26, 2025
8291e61
fix JsonStringToMsgPack
hendrikmuhs Mar 10, 2025
6494546
add msgpack support for loaded match objects
hendrikmuhs Mar 14, 2025
5fd6d9d
fix style
hendrikmuhs Mar 14, 2025
f3ae390
fix accidental regressions
hendrikmuhs Mar 14, 2025
2ccb4d4
add test coverage for msgpacked_value_as_string()
hendrikmuhs Mar 14, 2025
e87efc0
expose compression algorithm in python extension and add tests
hendrikmuhs Mar 15, 2025
d014ff1
add missing file
hendrikmuhs Mar 15, 2025
8fff417
move CompressionAlgorithm into separate file
hendrikmuhs Mar 16, 2025
fdff310
add rust bindings
hendrikmuhs Mar 28, 2025
2d2c478
cargo fmt
hendrikmuhs Mar 28, 2025
f90462c
fix merge
hendrikmuhs Mar 30, 2025
534ef25
remove generic GetMsgPackedValueAsString
hendrikmuhs Mar 31, 2025
80f5dc7
fix errors
hendrikmuhs Apr 3, 2025
dcc4c6b
fix format
hendrikmuhs Apr 3, 2025
b6f59b4
add missing override
hendrikmuhs Apr 3, 2025
a413554
address review comments, add python tests
hendrikmuhs Apr 17, 2025
9f23322
move requirements into venv
hendrikmuhs Apr 18, 2025
85b9798
install test packages for pip
hendrikmuhs Apr 18, 2025
454f981
pin boost on mac
hendrikmuhs Apr 18, 2025
e2f5529
link boost on mac
hendrikmuhs Apr 23, 2025
87f8a09
link boost also for python workflows
hendrikmuhs Apr 23, 2025
ace835d
temporarily revert #318
hendrikmuhs Apr 23, 2025
8f86b06
Merge branch 'master' into getmsgpackedvalue-compressed
hendrikmuhs Apr 23, 2025
fb5243d
add rust tests
hendrikmuhs Apr 25, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/keyvi.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ jobs:
brew update
# workaround for https://github.com/actions/setup-python/issues/577
brew list -1 | grep python | while read formula; do brew unlink $formula; brew link --overwrite $formula; done
brew install zlib snappy boost
brew install zlib snappy [email protected]
brew link [email protected]
- name: checkout from git
uses: actions/checkout@v4

Expand Down
6 changes: 4 additions & 2 deletions .github/workflows/python-cibuildwheel.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ jobs:
run: |
brew update && \
brew list -1 | grep python | while read formula; do brew unlink $formula; brew link --overwrite $formula; done && \
brew install ccache zlib snappy boost
brew install ccache zlib snappy [email protected]
brew link [email protected]

- name: set mac deployment target X64
if: runner.os == 'macOS' && runner.arch == 'X64'
Expand Down Expand Up @@ -102,7 +103,7 @@ jobs:
CIBW_BEFORE_BUILD: pip install -r python/requirements.txt

# testing
CIBW_TEST_REQUIRES: pytest
CIBW_TEST_REQUIRES: pytest python-snappy zstd
CIBW_TEST_COMMAND: >
python -m pytest {package}/tests &&
python -m pytest {package}/integration-tests
Expand Down Expand Up @@ -139,6 +140,7 @@ jobs:
python setup.py sdist -d wheelhouse && \
python -m pip uninstall -y autowrap && \
python -m pip install wheelhouse/*.tar.gz -v && \
python -m pip install python-snappy zstd && \
python -m pytest tests && \
python -m pip uninstall -y keyvi

Expand Down
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ else ()
message(FATAL_ERROR "Can not find Boost")
endif ()
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
set(_KEYVI_LINK_LIBRARIES_STATIC "${_KEYVI_LINK_LIBRARIES_STATIC} boost_program_options boost_iostreams boost_filesystem boost_system boost_regex boost_thread")
set(_KEYVI_LINK_LIBRARIES_STATIC "${_KEYVI_LINK_LIBRARIES_STATIC} boost_program_options boost_iostreams boost_filesystem boost_system boost_regex boost_thread-mt")
else ()
set(_KEYVI_LINK_LIBRARIES_DYNAMIC "${_KEYVI_LINK_LIBRARIES_DYNAMIC} boost_program_options boost_iostreams boost_filesystem boost_system boost_regex boost_thread")
endif ()
Expand Down
23 changes: 23 additions & 0 deletions keyvi/bin/keyvi_c/c_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,29 @@ keyvi_bytes keyvi_match_get_msgpacked_value(const struct keyvi_match* match) {
return keyvi_bytes{data_size, static_cast<const uint8_t*>(data_ptr)};
}

// Returns the msgpacked value of `match`, compressed with `compression`, copied
// into a buffer allocated with malloc().
//
// The result is {0, nullptr} when `match` is null or holds no object, when the
// value is empty, when the allocation fails, or when producing the value throws
// (for example the value store readers throw std::invalid_argument for an
// unknown compression algorithm).
//
// NOTE(review): the returned buffer is malloc'ed and not released here; the
// caller presumably frees it through the API's bytes-destroy function - confirm.
keyvi_bytes keyvi_match_get_msgpacked_value_compressed(const struct keyvi_match* match,
                                                       keyvi::compression::CompressionAlgorithm compression) {
  const keyvi_bytes empty_keyvi_bytes{0, nullptr};

  if (nullptr == match || !match->obj_) {
    return empty_keyvi_bytes;
  }

  try {
    const std::string compressed_value = match->obj_->GetMsgPackedValueAsString(compression);

    const size_t data_size = compressed_value.size();
    if (0 == data_size) {
      return empty_keyvi_bytes;
    }
    auto* data_ptr = malloc(data_size);
    if (nullptr == data_ptr) {
      return empty_keyvi_bytes;
    }
    memcpy(data_ptr, compressed_value.data(), data_size);

    return keyvi_bytes{data_size, static_cast<const uint8_t*>(data_ptr)};
  } catch (...) {
    // This function is called through the C API (and language bindings built on
    // it); a C++ exception must not propagate across that boundary.
    return empty_keyvi_bytes;
  }
}

char* keyvi_match_get_matched_string(const keyvi_match* match) {
return std_2_c_string(match->obj_ ? match->obj_->GetMatchedString() : "");
}
Expand Down
5 changes: 5 additions & 0 deletions keyvi/include/keyvi/c_api/c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ extern "C" {
#include <stddef.h>
#include <stdint.h>

#include "keyvi/compression/compression_algorithm.h"

struct keyvi_dictionary;
struct keyvi_match;
struct keyvi_match_iterator;
Expand Down Expand Up @@ -92,6 +94,9 @@ char* keyvi_match_get_value_as_string(const struct keyvi_match*);

keyvi_bytes keyvi_match_get_msgpacked_value(const struct keyvi_match*);

// Returns the msgpacked value of the match, compressed with the given algorithm.
// The result is {0, nullptr} if the match holds no object, the value is empty or
// the buffer allocation fails; otherwise the data points to a malloc'ed copy.
// NOTE(review): ownership of the returned buffer presumably passes to the
// caller - confirm which API function is expected to release it.
keyvi_bytes keyvi_match_get_msgpacked_value_compressed(const struct keyvi_match*,
keyvi::compression::CompressionAlgorithm);

char* keyvi_match_get_matched_string(const struct keyvi_match*);

//////////////////////
Expand Down
34 changes: 34 additions & 0 deletions keyvi/include/keyvi/compression/compression_algorithm.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/* * keyvi - A key value store.
*
* Copyright 2025 Hendrik Muhs<[email protected]>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef KEYVI_COMPRESSION_COMPRESSION_ALGORITHM_H_
#define KEYVI_COMPRESSION_COMPRESSION_ALGORITHM_H_

namespace keyvi {
namespace compression {

/**
 * Identifiers of the supported compression algorithms.
 *
 * The numeric values are significant: the leading byte of a compressed value is
 * converted to this enum to pick the matching decompressor, so renumbering the
 * enumerators would presumably break reading of already written data.
 */
enum CompressionAlgorithm {
NO_COMPRESSION = 0,      // handled by RawCompressionStrategy
ZLIB_COMPRESSION = 1,    // handled by ZlibCompressionStrategy
SNAPPY_COMPRESSION = 2,  // handled by SnappyCompressionStrategy
ZSTD_COMPRESSION = 3,    // handled by ZstdCompressionStrategy
};

} /* namespace compression */
} /* namespace keyvi */

#endif // KEYVI_COMPRESSION_COMPRESSION_ALGORITHM_H_
31 changes: 27 additions & 4 deletions keyvi/include/keyvi/compression/compression_selector.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,13 @@
#ifndef KEYVI_COMPRESSION_COMPRESSION_SELECTOR_H_
#define KEYVI_COMPRESSION_COMPRESSION_SELECTOR_H_

#include <memory>
#include <string>

#include <boost/algorithm/string.hpp>
#include <boost/lexical_cast.hpp>

#include "keyvi/compression/compression_algorithm.h"
#include "keyvi/compression/compression_strategy.h"
#include "keyvi/compression/snappy_compression_strategy.h"
#include "keyvi/compression/zlib_compression_strategy.h"
Expand Down Expand Up @@ -64,8 +66,8 @@ inline CompressionStrategy* compression_strategy(const std::string& name = "") {
typedef std::string (*decompress_func_t)(const std::string&);
typedef void (CompressionStrategy::*compress_mem_fn_t)(buffer_t*, const char*, size_t);

inline decompress_func_t decompressor_by_code(const std::string& s) {
switch (s[0]) {
inline decompress_func_t decompressor_by_code(const CompressionAlgorithm algorithm) {
switch (algorithm) {
case NO_COMPRESSION:
TRACE("unpack uncompressed string");
return RawCompressionStrategy::DoDecompress;
Expand All @@ -79,8 +81,29 @@ inline decompress_func_t decompressor_by_code(const std::string& s) {
TRACE("unpack zstd compressed string");
return ZstdCompressionStrategy::DoDecompress;
default:
throw std::invalid_argument("Invalid compression code " +
boost::lexical_cast<std::string>(static_cast<int>(s[0])));
throw std::invalid_argument("Invalid compression algorithm " +
boost::lexical_cast<std::string>(static_cast<int>(algorithm)));
}
}

/**
 * Returns the decompression function for a compressed value, selected by the
 * value's leading header byte.
 *
 * @param s the compressed value including its one-byte compression header; for
 *          an empty string the header reads as 0 (NO_COMPRESSION)
 * @return the static decompress function of the matching strategy
 * @throws std::invalid_argument if the header byte is not a known algorithm
 */
inline decompress_func_t decompressor_from_string(const std::string& s) {
  // Validate the raw byte before converting it: CompressionAlgorithm has no
  // fixed underlying type, so casting a value outside of its enumerator range
  // is undefined behavior and the default branch of decompressor_by_code could
  // not be relied upon to reject it.
  const int code = static_cast<int>(s[0]);
  if (code < static_cast<int>(NO_COMPRESSION) || code > static_cast<int>(ZSTD_COMPRESSION)) {
    throw std::invalid_argument("Invalid compression algorithm " + boost::lexical_cast<std::string>(code));
  }

  return decompressor_by_code(static_cast<CompressionAlgorithm>(code));
}

/**
 * Creates the compression strategy that implements the given algorithm.
 *
 * @param algorithm the algorithm to create a strategy for
 * @return owning pointer to a newly created strategy instance
 * @throws std::invalid_argument if algorithm is none of the known enumerators
 */
inline compression_strategy_t compression_strategy_by_code(const CompressionAlgorithm algorithm) {
  compression_strategy_t strategy;

  switch (algorithm) {
    case NO_COMPRESSION:
      strategy = std::make_unique<RawCompressionStrategy>();
      break;
    case ZLIB_COMPRESSION:
      strategy = std::make_unique<ZlibCompressionStrategy>();
      break;
    case SNAPPY_COMPRESSION:
      strategy = std::make_unique<SnappyCompressionStrategy>();
      break;
    case ZSTD_COMPRESSION:
      strategy = std::make_unique<ZstdCompressionStrategy>();
      break;
    default:
      throw std::invalid_argument("Invalid compression algorithm " +
                                  boost::lexical_cast<std::string>(static_cast<int>(algorithm)));
  }

  return strategy;
}

Expand Down
25 changes: 11 additions & 14 deletions keyvi/include/keyvi/compression/compression_strategy.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,23 +26,18 @@
#define KEYVI_COMPRESSION_COMPRESSION_STRATEGY_H_

#include <cstring>
#include <memory>
#include <string>
#include <vector>

#include "keyvi/compression/compression_algorithm.h"
#include "keyvi/dictionary/fsa/internal/constants.h"

namespace keyvi {
namespace compression {

enum CompressionCode {
NO_COMPRESSION = 0,
ZLIB_COMPRESSION = 1,
SNAPPY_COMPRESSION = 2,
ZSTD_COMPRESSION = 3,
};

// buffer type which is realloc-able
typedef std::vector<char> buffer_t;
using buffer_t = std::vector<char>;

/**
* The base class of every compression strategy.
Expand All @@ -64,6 +59,12 @@ struct CompressionStrategy {
return std::string(buf.data(), buf.size());
}

inline std::string CompressWithoutHeader(const std::string& raw) {
buffer_t buf;
Compress(&buf, raw.data(), raw.size());
return std::string(buf.data() + 1, buf.size() - 1);
}

/**
* By the time this function is called, the length field added in Compress()
* will have been removed.
Expand All @@ -77,6 +78,8 @@ struct CompressionStrategy {
virtual uint64_t GetFileVersionMin() const = 0;
};

using compression_strategy_t = std::unique_ptr<CompressionStrategy>;

/**
* A compression strategy that does almost nothing; i.e. it only adds
* the length field.
Expand All @@ -90,12 +93,6 @@ struct RawCompressionStrategy final : public CompressionStrategy {
std::memcpy(buffer->data() + 1, raw, raw_size);
}

static inline std::string DoCompress(const char* raw, size_t raw_size) {
buffer_t buf;
DoCompress(&buf, raw, raw_size);
return std::string(buf.data(), buf.size());
}

inline std::string Decompress(const std::string& compressed) { return DoDecompress(compressed); }

static inline std::string DoDecompress(const std::string& compressed) { return compressed.substr(1); }
Expand Down
6 changes: 0 additions & 6 deletions keyvi/include/keyvi/compression/snappy_compression_strategy.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,6 @@ struct SnappyCompressionStrategy final : public CompressionStrategy {
buffer->resize(output_length + 1);
}

static inline std::string DoCompress(const char* raw, size_t raw_size) {
buffer_t buf;
DoCompress(&buf, raw, raw_size);
return std::string(buf.data(), buf.size());
}

inline std::string Decompress(const std::string& compressed) { return DoDecompress(compressed); }

static std::string DoDecompress(const std::string& compressed) {
Expand Down
7 changes: 7 additions & 0 deletions keyvi/include/keyvi/dictionary/fsa/automata.h
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,13 @@ class Automata final {
return value_store_reader_->GetRawValueAsString(state_value);
}

/**
 * Returns the msgpacked value for the given state value by delegating to the
 * value store reader, optionally compressed with compression_algorithm.
 */
std::string GetMsgPackedValueAsString(uint64_t state_value,
                                      const compression::CompressionAlgorithm compression_algorithm =
                                          compression::CompressionAlgorithm::NO_COMPRESSION) const {
  assert(value_store_reader_);

  std::string msgpacked_value = value_store_reader_->GetMsgPackedValueAsString(state_value, compression_algorithm);
  return msgpacked_value;
}

std::string GetStatistics() const {
return dictionary_properties_->GetStatistics();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,36 @@ class FloatVectorValueStoreReader final : public IValueStoreReader {
return keyvi::util::FloatVectorAsString(keyvi::util::DecodeFloatVector(packed_string), ", ");
}

/**
 * Returns the stored value for fsa_value encoded with the requested compression
 * algorithm and without the leading compression header byte.
 *
 * If the value is already stored with the requested algorithm, the payload is
 * returned as is. Otherwise it is decompressed and, unless NO_COMPRESSION was
 * requested, compressed again with the requested algorithm.
 *
 * @param fsa_value offset of the value in the value store
 * @param compression_algorithm the compression the result should use
 * @return the (re-)encoded value, empty if the stored value is empty
 * @throws std::invalid_argument if the stored header byte or the requested
 *         algorithm is not a known compression algorithm
 */
std::string GetMsgPackedValueAsString(uint64_t fsa_value,
                                      const compression::CompressionAlgorithm compression_algorithm =
                                          compression::CompressionAlgorithm::NO_COMPRESSION) const override {
  size_t value_size;
  const char* value_ptr = keyvi::util::decodeVarIntString(strings_ + fsa_value, &value_size);

  if (value_size == 0) {
    return std::string();
  }

  // stored compression matches the requested one: only strip the header byte
  if (value_ptr[0] == compression_algorithm) {
    return std::string(value_ptr + 1, value_size - 1);
  }

  // Validate the stored header byte before converting it: CompressionAlgorithm
  // has no fixed underlying type, so casting a value outside of its enumerator
  // range is undefined behavior.
  const int stored_code = static_cast<int>(value_ptr[0]);
  if (stored_code < static_cast<int>(compression::CompressionAlgorithm::NO_COMPRESSION) ||
      stored_code > static_cast<int>(compression::CompressionAlgorithm::ZSTD_COMPRESSION)) {
    throw std::invalid_argument("Invalid compression algorithm " + std::to_string(stored_code));
  }

  // decompress
  const compression::decompress_func_t decompressor =
      compression::decompressor_by_code(static_cast<compression::CompressionAlgorithm>(stored_code));
  std::string msgpacked_value = decompressor(std::string(value_ptr, value_size));

  if (compression_algorithm == compression::CompressionAlgorithm::NO_COMPRESSION) {
    return msgpacked_value;
  }

  // compress
  const compression::compression_strategy_t compressor =
      compression::compression_strategy_by_code(compression_algorithm);

  return compressor->CompressWithoutHeader(msgpacked_value);
}

void CheckCompatibility(const IValueStoreReader& other) override {
if (other.GetValueStoreType() != GetValueStoreType()) {
throw std::invalid_argument("Dictionaries must have the same value store type");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,11 @@
#include <string>
#include <vector>

#include "keyvi/compression/compression_selector.h"
#include "keyvi/dictionary/fsa/internal/constants.h"
#include "keyvi/dictionary/fsa/internal/ivalue_store.h"
#include "keyvi/dictionary/fsa/internal/value_store_types.h"
#include "keyvi/util/msgpack_util.h"

// #define ENABLE_TRACING
#include "keyvi/dictionary/util/trace.h"
Expand Down Expand Up @@ -111,6 +113,23 @@ class IntInnerWeightsValueStoreReader final : public IValueStoreReader {

std::string GetValueAsString(uint64_t fsa_value) const override { return std::to_string(fsa_value); }

std::string GetRawValueAsString(uint64_t fsa_value) const override {
// TODO(hendrik): replace with std::format once we have C++20
return compression::compression_strategy_by_code(compression::CompressionAlgorithm::NO_COMPRESSION)
->Compress(keyvi::util::ValueToMsgPack(fsa_value));
}

/**
 * Returns fsa_value encoded as msgpack; for any algorithm other than
 * NO_COMPRESSION the msgpack bytes are additionally compressed with that
 * algorithm, without the compression header.
 */
std::string GetMsgPackedValueAsString(uint64_t fsa_value,
                                      const compression::CompressionAlgorithm compression_algorithm =
                                          compression::CompressionAlgorithm::NO_COMPRESSION) const override {
  std::string msgpacked_value = keyvi::util::ValueToMsgPack(fsa_value);

  if (compression_algorithm != compression::CompressionAlgorithm::NO_COMPRESSION) {
    const compression::compression_strategy_t compressor =
        compression::compression_strategy_by_code(compression_algorithm);
    msgpacked_value = compressor->CompressWithoutHeader(msgpacked_value);
  }

  return msgpacked_value;
}

uint32_t GetWeight(uint64_t fsa_value) const override { return static_cast<uint32_t>(fsa_value); }
};

Expand Down
19 changes: 19 additions & 0 deletions keyvi/include/keyvi/dictionary/fsa/internal/int_value_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,11 @@
#include <string>
#include <vector>

#include "keyvi/compression/compression_selector.h"
#include "keyvi/dictionary/fsa/internal/constants.h"
#include "keyvi/dictionary/fsa/internal/ivalue_store.h"
#include "keyvi/dictionary/fsa/internal/value_store_types.h"
#include "keyvi/util/msgpack_util.h"

// #define ENABLE_TRACING
#include "keyvi/dictionary/util/trace.h"
Expand Down Expand Up @@ -108,6 +110,23 @@ class IntValueStoreReader final : public IValueStoreReader {
}

std::string GetValueAsString(uint64_t fsa_value) const override { return std::to_string(fsa_value); }

std::string GetRawValueAsString(uint64_t fsa_value) const override {
// TODO(hendrik): replace with std::format once we have C++20
return compression::compression_strategy_by_code(compression::CompressionAlgorithm::NO_COMPRESSION)
->Compress(keyvi::util::ValueToMsgPack(fsa_value));
}

/**
 * Returns fsa_value encoded as msgpack; for any algorithm other than
 * NO_COMPRESSION the msgpack bytes are additionally compressed with that
 * algorithm, without the compression header.
 */
std::string GetMsgPackedValueAsString(uint64_t fsa_value,
                                      const compression::CompressionAlgorithm compression_algorithm =
                                          compression::CompressionAlgorithm::NO_COMPRESSION) const override {
  std::string msgpacked_value = keyvi::util::ValueToMsgPack(fsa_value);

  if (compression_algorithm != compression::CompressionAlgorithm::NO_COMPRESSION) {
    const compression::compression_strategy_t compressor =
        compression::compression_strategy_by_code(compression_algorithm);
    msgpacked_value = compressor->CompressWithoutHeader(msgpacked_value);
  }

  return msgpacked_value;
}
};

template <>
Expand Down
Loading
Loading