Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 0 additions & 10 deletions components/core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -402,12 +402,6 @@ set(SOURCE_FILES_clp_s_unitTest
src/clp_s/SchemaWriter.hpp
src/clp_s/search/AddTimestampConditions.cpp
src/clp_s/search/AddTimestampConditions.hpp
src/clp_s/search/clp_search/EncodedVariableInterpreter.cpp
src/clp_s/search/clp_search/EncodedVariableInterpreter.hpp
src/clp_s/search/clp_search/Grep.cpp
src/clp_s/search/clp_search/Grep.hpp
src/clp_s/search/clp_search/Query.cpp
src/clp_s/search/clp_search/Query.hpp
src/clp_s/search/EvaluateRangeIndexFilters.cpp
src/clp_s/search/EvaluateRangeIndexFilters.hpp
src/clp_s/search/EvaluateTimestampIndex.cpp
Expand All @@ -429,10 +423,6 @@ set(SOURCE_FILES_clp_s_unitTest
src/clp_s/TimestampEntry.hpp
src/clp_s/Utils.cpp
src/clp_s/Utils.hpp
src/clp_s/VariableDecoder.cpp
src/clp_s/VariableDecoder.hpp
src/clp_s/VariableEncoder.cpp
src/clp_s/VariableEncoder.hpp
src/clp_s/ZstdCompressor.cpp
src/clp_s/ZstdCompressor.hpp
src/clp_s/ZstdDecompressor.cpp
Expand Down
2 changes: 2 additions & 0 deletions components/core/cmake/Options/options.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,7 @@ function(set_clp_s_clp_dependencies_dependencies)
CLP_NEED_BOOST
CLP_NEED_CURL
CLP_NEED_FMT
CLP_NEED_LOG_SURGEON
CLP_NEED_MSGPACKCXX
CLP_NEED_NLOHMANN_JSON
CLP_NEED_OPENSSL
Expand Down Expand Up @@ -311,6 +312,7 @@ endfunction()
function(set_clp_s_search_dependencies)
set_clp_need_flags(
CLP_NEED_ABSL
CLP_NEED_LOG_SURGEON
CLP_NEED_SIMDJSON
CLP_NEED_SPDLOG
)
Expand Down
2 changes: 0 additions & 2 deletions components/core/src/clp/StringReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
#include <cassert>
#include <cerrno>

#include <boost/filesystem.hpp>

using std::string;

namespace clp {
Expand Down
6 changes: 0 additions & 6 deletions components/core/src/clp/ir/parsing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,6 @@ bool is_delim(signed char c) {
|| ('A' <= c && c <= 'Z') || '\\' == c || '_' == c || ('a' <= c && c <= 'z'));
}

bool is_variable_placeholder(char c) {
return (enum_to_underlying_type(VariablePlaceholder::Integer) == c)
|| (enum_to_underlying_type(VariablePlaceholder::Dictionary) == c)
|| (enum_to_underlying_type(VariablePlaceholder::Float) == c);
}

bool is_var(std::string_view value) {
size_t begin_pos = 0;
size_t end_pos = 0;
Expand Down
11 changes: 10 additions & 1 deletion components/core/src/clp/ir/parsing.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
#include <string>
#include <string_view>

#include "../type_utils.hpp"
#include "types.hpp"

namespace clp::ir {
/**
* Checks if the given character is a delimiter
Expand All @@ -23,10 +26,16 @@ namespace clp::ir {
bool is_delim(signed char c);

/**
* NOTE: This method is marked inline for a ~50% performance improvement to
* `append_constant_to_logtype`.
* @param c
* @return Whether the character is a variable placeholder
*/
bool is_variable_placeholder(char c);
inline bool is_variable_placeholder(char c) {
return (enum_to_underlying_type(VariablePlaceholder::Integer) == c)
|| (enum_to_underlying_type(VariablePlaceholder::Dictionary) == c)
|| (enum_to_underlying_type(VariablePlaceholder::Float) == c);
}

/**
* NOTE: This method is marked inline for a 1-2% performance improvement
Expand Down
28 changes: 23 additions & 5 deletions components/core/src/clp_s/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,22 @@ set(
../clp/cli_utils.cpp
../clp/cli_utils.hpp
../clp/Defs.h
../clp/EncodedVariableInterpreter.cpp
../clp/EncodedVariableInterpreter.hpp
../clp/ErrorCode.hpp
../clp/ffi/encoding_methods.cpp
../clp/ffi/encoding_methods.hpp
../clp/ffi/encoding_methods.inc
../clp/ffi/ir_stream/byteswap.hpp
../clp/ffi/ir_stream/decoding_methods.cpp
../clp/ffi/ir_stream/decoding_methods.hpp
../clp/ffi/ir_stream/decoding_methods.inc
../clp/ffi/ir_stream/Deserializer.hpp
../clp/ffi/ir_stream/encoding_methods.cpp
../clp/ffi/ir_stream/encoding_methods.hpp
../clp/ffi/ir_stream/ir_unit_deserialization_methods.cpp
../clp/ffi/ir_stream/ir_unit_deserialization_methods.hpp
../clp/ffi/ir_stream/protocol_constants.hpp
../clp/ffi/ir_stream/Serializer.cpp
../clp/ffi/ir_stream/Serializer.hpp
../clp/ffi/ir_stream/search/AstEvaluationResult.hpp
Expand All @@ -48,17 +56,28 @@ set(
../clp/FileDescriptor.hpp
../clp/FileReader.cpp
../clp/FileReader.hpp
../clp/GrepCore.cpp
../clp/GrepCore.hpp
../clp/hash_utils.cpp
../clp/hash_utils.hpp
../clp/ir/LogEvent.hpp
../clp/ir/types.hpp
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's alphabetically reorder these two

../clp/ir/constants.hpp
../clp/ir/EncodedTextAst.cpp
../clp/ir/EncodedTextAst.hpp
../clp/ir/parsing.cpp
../clp/ir/parsing.hpp
../clp/ir/parsing.inc
../clp/LogSurgeonReader.cpp
../clp/LogSurgeonReader.hpp
../clp/NetworkReader.cpp
../clp/NetworkReader.hpp
../clp/networking/socket_utils.cpp
../clp/networking/socket_utils.hpp
../clp/Query.cpp
../clp/Query.hpp
../clp/QueryToken.cpp
../clp/QueryToken.hpp
../clp/ReaderInterface.cpp
../clp/ReaderInterface.hpp
../clp/ReadOnlyMemoryMappedFile.cpp
Expand All @@ -69,10 +88,12 @@ set(
../clp/streaming_archive/Constants.hpp
../clp/streaming_compression/zstd/Decompressor.cpp
../clp/streaming_compression/zstd/Decompressor.hpp
../clp/StringReader.cpp
../clp/StringReader.hpp
../clp/Thread.cpp
../clp/Thread.hpp
../clp/TraceableException.hpp
../clp/time_types.hpp
../clp/TraceableException.hpp
../clp/type_utils.hpp
../clp/utf8_utils.cpp
../clp/utf8_utils.hpp
Expand All @@ -92,6 +113,7 @@ if(CLP_BUILD_CLP_S_CLP_DEPENDENCIES)
clp_s_clp_dependencies
PUBLIC
clp::string_utils
log_surgeon::log_surgeon
ystdlib::containers
PRIVATE
Boost::regex
Expand Down Expand Up @@ -223,8 +245,6 @@ set(
TraceableException.hpp
Utils.cpp
Utils.hpp
VariableEncoder.cpp
VariableEncoder.hpp
)

if(CLP_BUILD_CLP_S_ARCHIVEWRITER)
Expand Down Expand Up @@ -286,8 +306,6 @@ set(
TraceableException.hpp
Utils.cpp
Utils.hpp
VariableDecoder.cpp
VariableDecoder.hpp
)

if(CLP_BUILD_CLP_S_ARCHIVEREADER)
Expand Down
13 changes: 9 additions & 4 deletions components/core/src/clp_s/ColumnReader.cpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#include "ColumnReader.hpp"

#include "../clp/EncodedVariableInterpreter.hpp"
#include "BufferViewReader.hpp"
#include "ColumnWriter.hpp"
#include "Utils.hpp"
#include "VariableDecoder.hpp"

namespace clp_s {
void Int64ColumnReader::load(BufferViewReader& reader, uint64_t num_messages) {
Expand Down Expand Up @@ -113,9 +113,14 @@ ClpStringColumnReader::extract_string_value_into_buffer(uint64_t cur_message, st
}

int64_t encoded_vars_offset = ClpStringColumnWriter::get_encoded_offset(value);
auto encoded_vars = m_encoded_vars.sub_span(encoded_vars_offset, entry.get_num_vars());
auto encoded_vars = m_encoded_vars.sub_span(encoded_vars_offset, entry.get_num_variables());

VariableDecoder::decode_variables_into_message(entry, *m_var_dict, encoded_vars, buffer);
clp::EncodedVariableInterpreter::decode_variables_into_message(
entry,
*m_var_dict,
encoded_vars,
buffer
);
}

void ClpStringColumnReader::extract_escaped_string_value_into_buffer(
Expand Down Expand Up @@ -149,7 +154,7 @@ UnalignedMemSpan<int64_t> ClpStringColumnReader::get_encoded_vars(uint64_t cur_m

int64_t encoded_vars_offset = ClpStringColumnWriter::get_encoded_offset(value);

return m_encoded_vars.sub_span(encoded_vars_offset, entry.get_num_vars());
return m_encoded_vars.sub_span(encoded_vars_offset, entry.get_num_variables());
}

void VariableStringColumnReader::load(BufferViewReader& reader, uint64_t num_messages) {
Expand Down
30 changes: 19 additions & 11 deletions components/core/src/clp_s/ColumnWriter.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
#include "ColumnWriter.hpp"

#include <cstdint>
#include <variant>

#include "../clp/Defs.h"
#include "../clp/EncodedVariableInterpreter.hpp"
#include "ParsedMessage.hpp"
#include "ZstdCompressor.hpp"

namespace clp_s {
size_t Int64ColumnWriter::add_value(ParsedMessage::variable_t& value) {
m_values.push_back(std::get<int64_t>(value));
Expand Down Expand Up @@ -49,15 +57,16 @@ void BooleanColumnWriter::store(ZstdCompressor& compressor) {
}

size_t ClpStringColumnWriter::add_value(ParsedMessage::variable_t& value) {
std::string string_var = std::get<std::string>(value);
uint64_t id;
uint64_t offset = m_encoded_vars.size();
VariableEncoder::encode_and_add_to_dictionary(
string_var,
uint64_t offset{m_encoded_vars.size()};
m_temp_var_dict_ids.clear();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel that making this to be local is better than creating a class member specifically for a temporary usage purpose.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I prefer class member over local in this instance since it helps avoid a large number of memory allocations in some frequently run code.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fair

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fwiw, I agree with Haiqi. Unless we have metrics to back up the importance of this it feels very much like a premature optimization.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's fair -- I'm kicking off some compression benchmarks now that I'll check on later today. I'll make the change depending on the results.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah it seems to help a bit with some datasets but probably not enough to justify it. I'll get rid of the class member.

clp::EncodedVariableInterpreter::encode_and_add_to_dictionary(
std::get<std::string>(value),
m_logtype_entry,
*m_var_dict,
m_encoded_vars
m_encoded_vars,
m_temp_var_dict_ids
);
clp::logtype_dictionary_id_t id{};
m_log_dict->add_entry(m_logtype_entry, id);
auto encoded_id = encode_log_dict_id(id, offset);
m_logtypes.push_back(encoded_id);
Expand All @@ -74,15 +83,14 @@ void ClpStringColumnWriter::store(ZstdCompressor& compressor) {
}

size_t VariableStringColumnWriter::add_value(ParsedMessage::variable_t& value) {
std::string string_var = std::get<std::string>(value);
uint64_t id;
m_var_dict->add_entry(string_var, id);
clp::variable_dictionary_id_t id{};
m_var_dict->add_entry(std::get<std::string>(value), id);
m_variables.push_back(id);
return sizeof(int64_t);
return sizeof(clp::variable_dictionary_id_t);
}

void VariableStringColumnWriter::store(ZstdCompressor& compressor) {
size_t size = m_variables.size() * sizeof(int64_t);
size_t size = m_variables.size() * sizeof(clp::variable_dictionary_id_t);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we can use auto?

it's not related to your change, but since we already have some auto used in this file, replacing size_t with auto doesn't harm

compressor.write(reinterpret_cast<char const*>(m_variables.data()), size);
}

Expand Down
15 changes: 8 additions & 7 deletions components/core/src/clp_s/ColumnWriter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
#include <utility>
#include <variant>

#include "../clp/Defs.h"
#include "DictionaryWriter.hpp"
#include "FileWriter.hpp"
#include "ParsedMessage.hpp"
#include "TimestampDictionaryWriter.hpp"
#include "VariableEncoder.hpp"
#include "ZstdCompressor.hpp"

namespace clp_s {
Expand Down Expand Up @@ -141,15 +141,15 @@ class ClpStringColumnWriter : public BaseColumnWriter {
* @param encoded_id
* @return the encoded log dict id
*/
static int64_t get_encoded_log_dict_id(uint64_t encoded_id) {
return (int64_t)encoded_id & cLogDictIdMask;
static clp::logtype_dictionary_id_t get_encoded_log_dict_id(uint64_t encoded_id) {
return static_cast<clp::logtype_dictionary_id_t>(encoded_id & cLogDictIdMask);
}

/**
* @param encoded_id
* @return The encoded offset
*/
static int64_t get_encoded_offset(uint64_t encoded_id) {
static uint64_t get_encoded_offset(uint64_t encoded_id) {
return ((int64_t)encoded_id & cOffsetMask) >> cOffsetBitPosition;
}

Expand All @@ -160,8 +160,8 @@ class ClpStringColumnWriter : public BaseColumnWriter {
* @param offset
* @return The encoded log dict id
*/
static int64_t encode_log_dict_id(uint64_t id, uint64_t offset) {
return ((int64_t)id) | ((int64_t)offset) << cOffsetBitPosition;
static uint64_t encode_log_dict_id(clp::logtype_dictionary_id_t id, uint64_t offset) {
return static_cast<uint64_t>(id) | (offset << cOffsetBitPosition);
}

static constexpr int cOffsetBitPosition = 24;
Expand All @@ -174,6 +174,7 @@ class ClpStringColumnWriter : public BaseColumnWriter {

std::vector<int64_t> m_logtypes;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

seems to be a type mismatch?

at here, the return type of encode_log_dict_id is uint64_t.

You might also want to double check type for m_encoded_vars.

Not sure if we need to make a specific type for encode_log_dict_id like we do in clp.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch -- I'm adding a using encoded_log_dict_id_t = uint64_t that I'll use all over and changed the encoded variables to clp::encoded_variable_t respectively.``

std::vector<int64_t> m_encoded_vars;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

somewhere in encode_and_add_to_dictionary we are implicilty converting encoded_var_t to int64_t. when we push an encoded_var to this vector This doesn't cause any issue because encoded_var_t is currently the same as int64_t, but to prevent future issues, I think we should either do a explicit cast, or use clp::encoded_var_t as the vector datatype?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changing this to clp::encoded_var_t.

std::vector<clp::variable_dictionary_id_t> m_temp_var_dict_ids;
};

class VariableStringColumnWriter : public BaseColumnWriter {
Expand All @@ -193,7 +194,7 @@ class VariableStringColumnWriter : public BaseColumnWriter {

private:
std::shared_ptr<VariableDictionaryWriter> m_var_dict;
std::vector<int64_t> m_variables;
std::vector<clp::variable_dictionary_id_t> m_variables;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shall we rename this to m_variable_ids (or m_variables_ids?)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I'll go with m_var_dict_ids since I see that use xyz_var_dict_ids used in clp/Query.hpp at least.

};

class DateStringColumnWriter : public BaseColumnWriter {
Expand Down
Loading
Loading