diff --git a/CHANGELOG.md b/CHANGELOG.md index bbcc8d708d..2ad610c1a9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,9 @@ Increment the: * [SDK] Implements options for the ParentBasedSampler with default values [#3553](https://github.com/open-telemetry/opentelemetry-cpp/pull/3553) +* [SDK] Add bundle version of utf8_range to validate attributes + [#3512](https://github.com/open-telemetry/opentelemetry-cpp/pull/3512) + * [SDK] View should not have a unit [#3552](https://github.com/open-telemetry/opentelemetry-cpp/pull/3552) diff --git a/exporters/otlp/src/otlp_populate_attribute_utils.cc b/exporters/otlp/src/otlp_populate_attribute_utils.cc index bc9a7d618a..0e78176e58 100644 --- a/exporters/otlp/src/otlp_populate_attribute_utils.cc +++ b/exporters/otlp/src/otlp_populate_attribute_utils.cc @@ -13,6 +13,7 @@ #include "opentelemetry/nostd/string_view.h" #include "opentelemetry/nostd/variant.h" #include "opentelemetry/sdk/common/attribute_utils.h" +#include "opentelemetry/sdk/common/attribute_validity.h" #include "opentelemetry/sdk/instrumentationscope/instrumentation_scope.h" #include "opentelemetry/sdk/resource/resource.h" #include "opentelemetry/version.h" @@ -85,8 +86,18 @@ void OtlpPopulateAttributeUtils::PopulateAnyValue( } else if (nostd::holds_alternative(value)) { - proto_value->set_string_value(nostd::get(value).data(), - nostd::get(value).size()); + if (allow_bytes && + !opentelemetry::sdk::common::AttributeIsValidString(nostd::get(value))) + { + proto_value->set_bytes_value( + reinterpret_cast(nostd::get(value).data()), + nostd::get(value).size()); + } + else + { + proto_value->set_string_value(nostd::get(value).data(), + nostd::get(value).size()); + } } else if (nostd::holds_alternative>(value)) { @@ -159,7 +170,15 @@ void OtlpPopulateAttributeUtils::PopulateAnyValue( auto array_value = proto_value->mutable_array_value(); for (const auto &val : nostd::get>(value)) { - array_value->add_values()->set_string_value(val.data(), val.size()); + if 
(allow_bytes && !opentelemetry::sdk::common::AttributeIsValidString(val)) + { + array_value->add_values()->set_bytes_value(reinterpret_cast(val.data()), + val.size()); + } + else + { + array_value->add_values()->set_string_value(val.data(), val.size()); + } } } } @@ -224,7 +243,17 @@ void OtlpPopulateAttributeUtils::PopulateAnyValue( } else if (nostd::holds_alternative(value)) { - proto_value->set_string_value(nostd::get(value)); + if (allow_bytes && + !opentelemetry::sdk::common::AttributeIsValidString(nostd::get(value))) + { + proto_value->set_bytes_value( + reinterpret_cast(nostd::get(value).data()), + nostd::get(value).size()); + } + else + { + proto_value->set_string_value(nostd::get(value)); + } } else if (nostd::holds_alternative>(value)) { @@ -281,7 +310,15 @@ void OtlpPopulateAttributeUtils::PopulateAnyValue( auto array_value = proto_value->mutable_array_value(); for (const auto &val : nostd::get>(value)) { - array_value->add_values()->set_string_value(val); + if (allow_bytes && !opentelemetry::sdk::common::AttributeIsValidString(val)) + { + array_value->add_values()->set_bytes_value(reinterpret_cast(val.data()), + val.size()); + } + else + { + array_value->add_values()->set_string_value(val); + } } } } diff --git a/sdk/include/opentelemetry/sdk/common/attribute_validity.h b/sdk/include/opentelemetry/sdk/common/attribute_validity.h new file mode 100644 index 0000000000..3e7bbfcaac --- /dev/null +++ b/sdk/include/opentelemetry/sdk/common/attribute_validity.h @@ -0,0 +1,129 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include + +#include "opentelemetry/common/attribute_value.h" +#include "opentelemetry/common/key_value_iterable.h" +#include "opentelemetry/nostd/function_ref.h" +#include "opentelemetry/nostd/span.h" +#include "opentelemetry/nostd/string_view.h" +#include "opentelemetry/sdk/common/attribute_utils.h" +#include "opentelemetry/version.h" + 
+OPENTELEMETRY_BEGIN_NAMESPACE +namespace sdk +{ +namespace common +{ + +OPENTELEMETRY_EXPORT bool AttributeIsValidString(nostd::string_view value) noexcept; + +/** + * Validate if an attribute value is valid. + */ +struct AttributeValidator +{ + bool operator()(bool /*v*/) noexcept { return true; } + bool operator()(int32_t /*v*/) noexcept { return true; } + bool operator()(uint32_t /*v*/) noexcept { return true; } + bool operator()(int64_t /*v*/) noexcept { return true; } + bool operator()(uint64_t /*v*/) noexcept { return true; } + bool operator()(double /*v*/) noexcept { return true; } + bool operator()(nostd::string_view v) noexcept { return AttributeIsValidString(v); } + bool operator()(std::string v) noexcept { return AttributeIsValidString(v); } + bool operator()(const char *v) noexcept { return AttributeIsValidString(v); } + bool operator()(nostd::span /*v*/) noexcept { return true; } + bool operator()(nostd::span /*v*/) noexcept { return true; } + bool operator()(nostd::span /*v*/) noexcept { return true; } + bool operator()(nostd::span /*v*/) noexcept { return true; } + bool operator()(nostd::span /*v*/) noexcept { return true; } + bool operator()(nostd::span /*v*/) noexcept { return true; } + bool operator()(nostd::span /*v*/) noexcept { return true; } + bool operator()(nostd::span v) noexcept + { + for (const auto &s : v) + { + if (!AttributeIsValidString(s)) + { + return false; + } + } + return true; + } + bool operator()(const std::vector & /*v*/) noexcept { return true; } + bool operator()(const std::vector & /*v*/) noexcept { return true; } + bool operator()(const std::vector & /*v*/) noexcept { return true; } + bool operator()(const std::vector & /*v*/) noexcept { return true; } + bool operator()(const std::vector & /*v*/) noexcept { return true; } + bool operator()(const std::vector &v) + { + for (const auto &s : v) + { + if (!AttributeIsValidString(s)) + { + return false; + } + } + return true; + } + bool operator()(const std::vector & /*v*/) 
noexcept { return true; } + bool operator()(const std::vector & /*v*/) noexcept { return true; } + + OPENTELEMETRY_EXPORT static bool IsValid(const std::string &value) noexcept; + + OPENTELEMETRY_EXPORT static bool IsValid(nostd::string_view value) noexcept; + + OPENTELEMETRY_EXPORT static bool IsValid(const OwnedAttributeValue &value) noexcept; + + OPENTELEMETRY_EXPORT static bool IsValid( + const opentelemetry::common::AttributeValue &value) noexcept; + + OPENTELEMETRY_EXPORT static bool IsAllValid(const AttributeMap &attributes) noexcept; + + OPENTELEMETRY_EXPORT static bool IsAllValid(const OrderedAttributeMap &attributes) noexcept; + + OPENTELEMETRY_EXPORT static void Filter(AttributeMap &attributes, nostd::string_view log_hint); + + OPENTELEMETRY_EXPORT static void Filter(OrderedAttributeMap &attributes, + nostd::string_view log_hint); +}; + +/** + * Supports internal iteration over a collection of key-value pairs and filtering of invalid + * attributes. + */ +class OPENTELEMETRY_EXPORT KeyValueFilterIterable : public opentelemetry::common::KeyValueIterable +{ +public: + KeyValueFilterIterable(const opentelemetry::common::KeyValueIterable &origin, + opentelemetry::nostd::string_view log_hint) noexcept; + + ~KeyValueFilterIterable() override; + + bool ForEachKeyValue( + opentelemetry::nostd::function_ref callback) + const noexcept override; + + size_t size() const noexcept override; + +private: + // Pointer to the original KeyValueIterable + const opentelemetry::common::KeyValueIterable *origin_; + + // Size of valid attributes + mutable size_t size_; + + // Log hint for invalid attributes + opentelemetry::nostd::string_view log_hint_; +}; + +} // namespace common +} // namespace sdk +OPENTELEMETRY_END_NAMESPACE diff --git a/sdk/include/opentelemetry/sdk/instrumentationscope/instrumentation_scope.h b/sdk/include/opentelemetry/sdk/instrumentationscope/instrumentation_scope.h index 68e9d10d40..2b18e9dc35 100644 --- 
a/sdk/include/opentelemetry/sdk/instrumentationscope/instrumentation_scope.h +++ b/sdk/include/opentelemetry/sdk/instrumentationscope/instrumentation_scope.h @@ -12,6 +12,8 @@ #include "opentelemetry/nostd/unique_ptr.h" #include "opentelemetry/nostd/variant.h" #include "opentelemetry/sdk/common/attribute_utils.h" +#include "opentelemetry/sdk/common/attribute_validity.h" +#include "opentelemetry/sdk/common/global_log_handler.h" #include "opentelemetry/version.h" OPENTELEMETRY_BEGIN_NAMESPACE @@ -42,6 +44,7 @@ class InstrumentationScope nostd::string_view schema_url = "", InstrumentationScopeAttributes &&attributes = {}) { + common::AttributeValidator::Filter(attributes, "[InstrumentationScope]"); return nostd::unique_ptr( new InstrumentationScope{name, version, schema_url, std::move(attributes)}); } @@ -60,8 +63,19 @@ class InstrumentationScope nostd::string_view schema_url, const InstrumentationScopeAttributes &attributes) { - return nostd::unique_ptr(new InstrumentationScope{ - name, version, schema_url, InstrumentationScopeAttributes(attributes)}); + // Copy attributes only when we find some invalid attributes and try to remove them. + if (common::AttributeValidator::IsAllValid(attributes)) + { + return nostd::unique_ptr(new InstrumentationScope{ + name, version, schema_url, InstrumentationScopeAttributes(attributes)}); + } + else + { + InstrumentationScopeAttributes copy_attributes = attributes; + common::AttributeValidator::Filter(copy_attributes, "[InstrumentationScope]"); + return nostd::unique_ptr(new InstrumentationScope{ + name, version, schema_url, InstrumentationScopeAttributes(copy_attributes)}); + } } /** @@ -88,6 +102,19 @@ class InstrumentationScope result->attributes_.reserve(opentelemetry::nostd::size(arg)); for (auto &argv : arg) { + if (!common::AttributeValidator::IsValid(argv.first)) + { + OTEL_INTERNAL_LOG_WARN("[InstrumentationScope] Invalid attribute key " + << std::string{argv.first} << ". 
This attribute will be ignored."); + continue; + } + + if (!common::AttributeValidator::IsValid(argv.second)) + { + OTEL_INTERNAL_LOG_WARN("[InstrumentationScope] Invalid attribute value for " + << std::string{argv.first} << ". This attribute will be ignored."); + continue; + } result->SetAttribute(argv.first, argv.second); } @@ -148,6 +175,19 @@ class InstrumentationScope void SetAttribute(nostd::string_view key, const opentelemetry::common::AttributeValue &value) noexcept { + if (!common::AttributeValidator::IsValid(key)) + { + OTEL_INTERNAL_LOG_WARN("[InstrumentationScope] Invalid attribute key " + << std::string{key} << ". This attribute will be ignored."); + return; + } + + if (!common::AttributeValidator::IsValid(value)) + { + OTEL_INTERNAL_LOG_WARN("[InstrumentationScope] Invalid attribute value for " + << std::string{key} << ". This attribute will be ignored."); + return; + } attributes_[std::string(key)] = nostd::visit(opentelemetry::sdk::common::AttributeConverter(), value); } diff --git a/sdk/include/opentelemetry/sdk/metrics/view/attributes_processor.h b/sdk/include/opentelemetry/sdk/metrics/view/attributes_processor.h index 7ab8cafb13..ae2d140d46 100644 --- a/sdk/include/opentelemetry/sdk/metrics/view/attributes_processor.h +++ b/sdk/include/opentelemetry/sdk/metrics/view/attributes_processor.h @@ -10,6 +10,7 @@ #include "opentelemetry/common/attribute_value.h" #include "opentelemetry/common/key_value_iterable.h" #include "opentelemetry/nostd/string_view.h" +#include "opentelemetry/sdk/common/attribute_validity.h" #include "opentelemetry/sdk/metrics/state/filtered_ordered_attribute_map.h" #include "opentelemetry/version.h" @@ -50,7 +51,8 @@ class DefaultAttributesProcessor : public AttributesProcessor MetricAttributes process( const opentelemetry::common::KeyValueIterable &attributes) const noexcept override { - MetricAttributes result(attributes); + MetricAttributes result( + opentelemetry::sdk::common::KeyValueFilterIterable(attributes, "[Metrics] 
")); return result; } @@ -78,7 +80,9 @@ class FilteringAttributesProcessor : public AttributesProcessor const opentelemetry::common::KeyValueIterable &attributes) const noexcept override { MetricAttributes result; - attributes.ForEachKeyValue( + opentelemetry::sdk::common::KeyValueFilterIterable validate_attributes{attributes, + "[Metrics] "}; + validate_attributes.ForEachKeyValue( [&](nostd::string_view key, opentelemetry::common::AttributeValue value) noexcept { if (allowed_attribute_keys_.find(key.data()) != allowed_attribute_keys_.end()) { diff --git a/sdk/src/common/BUILD b/sdk/src/common/BUILD index 19b47034f7..804cdd7992 100644 --- a/sdk/src/common/BUILD +++ b/sdk/src/common/BUILD @@ -3,6 +3,36 @@ package(default_visibility = ["//visibility:public"]) +cc_library( + name = "utf8_range", + srcs = [ + "internal/utf8_range/uft8_range.cc", + ], + hdrs = [ + "internal/utf8_range/utf8_range.h", + "internal/utf8_range/utf8_range_neon.inc", + "internal/utf8_range/utf8_range_sse.inc", + ], + include_prefix = "src/common", + deps = [ + "//api", + ], +) + +cc_library( + name = "attribute_validity", + srcs = [ + "attribute_validity.cc", + ], + include_prefix = "src/common", + deps = [ + "//api", + "//sdk:headers", + "//sdk/src/common:global_log_handler", + "//sdk/src/common:utf8_range", + ], +) + cc_library( name = "random", srcs = [ diff --git a/sdk/src/common/CMakeLists.txt b/sdk/src/common/CMakeLists.txt index 4a3b59aefa..0d08148c27 100644 --- a/sdk/src/common/CMakeLists.txt +++ b/sdk/src/common/CMakeLists.txt @@ -1,8 +1,14 @@ # Copyright The OpenTelemetry Authors # SPDX-License-Identifier: Apache-2.0 -set(COMMON_SRCS random.cc global_log_handler.cc env_variables.cc base64.cc - disabled.cc) +set(COMMON_SRCS + random.cc + global_log_handler.cc + env_variables.cc + base64.cc + disabled.cc + attribute_validity.cc + internal/utf8_range/uft8_range.cc) if(WIN32) list(APPEND COMMON_SRCS platform/fork_windows.cc) else() diff --git a/sdk/src/common/attribute_validity.cc 
b/sdk/src/common/attribute_validity.cc new file mode 100644 index 0000000000..6772b3d072 --- /dev/null +++ b/sdk/src/common/attribute_validity.cc @@ -0,0 +1,229 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +#include "opentelemetry/sdk/common/attribute_validity.h" + +#include +#include +#include +#include + +#include "opentelemetry/nostd/variant.h" +#include "opentelemetry/sdk/common/global_log_handler.h" +#include "opentelemetry/version.h" + +#include "src/common/internal/utf8_range/utf8_range.h" + +OPENTELEMETRY_BEGIN_NAMESPACE +namespace sdk +{ +namespace common +{ + +namespace +{ +static AttributeValidator &GetSharedAttributeValidator() noexcept +{ + static AttributeValidator validator; + return validator; +} +} // namespace + +OPENTELEMETRY_EXPORT bool AttributeIsValidString(nostd::string_view value) noexcept +{ + return 0 != utf8_range::utf8_range_IsValid(value.data(), value.size()); +} + +OPENTELEMETRY_EXPORT bool AttributeValidator::IsValid(const std::string &value) noexcept +{ + return AttributeIsValidString(value); +} + +OPENTELEMETRY_EXPORT bool AttributeValidator::IsValid(nostd::string_view value) noexcept +{ + return AttributeIsValidString(value); +} + +OPENTELEMETRY_EXPORT bool AttributeValidator::IsValid(const OwnedAttributeValue &value) noexcept +{ +#if OPENTELEMETRY_HAVE_EXCEPTIONS + try + { +#endif + + return nostd::visit(GetSharedAttributeValidator(), value); + +#if OPENTELEMETRY_HAVE_EXCEPTIONS + } + catch (...) + { + return false; + } +#endif +} + +OPENTELEMETRY_EXPORT bool AttributeValidator::IsValid( + const opentelemetry::common::AttributeValue &value) noexcept +{ +#if OPENTELEMETRY_HAVE_EXCEPTIONS + try + { +#endif + + return nostd::visit(GetSharedAttributeValidator(), value); + +#if OPENTELEMETRY_HAVE_EXCEPTIONS + } + catch (...) 
+ { + return false; + } +#endif +} + +OPENTELEMETRY_EXPORT bool AttributeValidator::IsAllValid(const AttributeMap &attributes) noexcept +{ + for (const auto &kv : attributes) + { + if (!AttributeValidator::IsValid(kv.second)) + { + return false; + } + } + return true; +} + +OPENTELEMETRY_EXPORT bool AttributeValidator::IsAllValid( + const OrderedAttributeMap &attributes) noexcept +{ + for (const auto &kv : attributes) + { + if (!AttributeValidator::IsValid(kv.second)) + { + return false; + } + } + return true; +} + +OPENTELEMETRY_EXPORT void AttributeValidator::Filter(AttributeMap &attributes, + nostd::string_view log_hint) +{ + std::unordered_set invalid_keys; + for (auto &kv : attributes) + { + if (!common::AttributeValidator::IsValid(kv.first)) + { + OTEL_INTERNAL_LOG_WARN(log_hint << " Invalid attribute key " << kv.first + << ". This attribute will be ignored."); + + invalid_keys.insert(kv.first); + continue; + } + + if (!common::AttributeValidator::IsValid(kv.second)) + { + OTEL_INTERNAL_LOG_WARN(log_hint << " Invalid attribute value for " << kv.first + << ". This attribute will be ignored."); + + invalid_keys.insert(kv.first); + } + } + + for (auto &invalid_key : invalid_keys) + { + attributes.erase(invalid_key); + } +} + +OPENTELEMETRY_EXPORT void AttributeValidator::Filter(OrderedAttributeMap &attributes, + nostd::string_view log_hint) +{ + std::unordered_set invalid_keys; + for (auto &kv : attributes) + { + if (!common::AttributeValidator::IsValid(kv.first)) + { + OTEL_INTERNAL_LOG_WARN(log_hint << " Invalid attribute key " << kv.first + << ". This attribute will be ignored."); + + invalid_keys.insert(kv.first); + continue; + } + + if (!common::AttributeValidator::IsValid(kv.second)) + { + OTEL_INTERNAL_LOG_WARN(log_hint << " Invalid attribute value for " << kv.first + << ". 
This attribute will be ignored."); + + invalid_keys.insert(kv.first); + } + } + + for (auto &invalid_key : invalid_keys) + { + attributes.erase(invalid_key); + } +} + +KeyValueFilterIterable::KeyValueFilterIterable( + const opentelemetry::common::KeyValueIterable &origin, + opentelemetry::nostd::string_view log_hint) noexcept + : origin_(&origin), size_(static_cast(-1)), log_hint_(log_hint) +{} + +KeyValueFilterIterable::~KeyValueFilterIterable() {} + +bool KeyValueFilterIterable::ForEachKeyValue( + opentelemetry::nostd::function_ref callback) + const noexcept +{ + size_t size = 0; + bool ret = + origin_->ForEachKeyValue([&size, &callback, this](opentelemetry::nostd::string_view k, + opentelemetry::common::AttributeValue v) { + if (AttributeValidator::IsValid(k) && AttributeValidator::IsValid(v)) + { + ++size; + return callback(k, v); + } + + OTEL_INTERNAL_LOG_WARN(log_hint_ << " Invalid value for: " << k << ". It will be ignored."); + return true; + }); + + // If it return true, we already iterated over all key-values. The the size can be updated. + if (ret) + { + size_ = size; + } + + return ret; +} + +size_t KeyValueFilterIterable::size() const noexcept +{ + // Use cached size if it was already calculated. + if (size_ != static_cast(-1)) + { + return size_; + } + + size_t size = 0; + origin_->ForEachKeyValue( + [&size](opentelemetry::nostd::string_view k, opentelemetry::common::AttributeValue v) { + if (AttributeValidator::IsValid(k) && AttributeValidator::IsValid(v)) + { + ++size; + } + return true; + }); + + size_ = size; + return size_; +} + +} // namespace common +} // namespace sdk +OPENTELEMETRY_END_NAMESPACE diff --git a/sdk/src/common/internal/utf8_range/README.md b/sdk/src/common/internal/utf8_range/README.md new file mode 100644 index 0000000000..f469cbe027 --- /dev/null +++ b/sdk/src/common/internal/utf8_range/README.md @@ -0,0 +1,7 @@ +# Notes on utf8_range implementation + +This is a snapshot of utf8_range from Google protobuf `v31.1`. 
/* Returns true when |c| is a UTF-8 continuation (trail) byte, i.e. it has the
 * bit pattern 10xxxxxx and therefore lies in [0x80, 0xBF].
 *
 * Both bounds are checked. Because |c| is unsigned, an upper-bound-only
 * comparison (c <= 0xBF) would also accept every ASCII byte (0x00..0x7F) as a
 * trail byte and let sequences such as C2 41 pass validation.
 */
static FORCE_INLINE_ATTR bool utf8_range_IsTrailByteOk(unsigned char c)
{
  return (c & 0xC0u) == 0x80u;
}
+ * If return_position is set to true, returns the length in bytes of the prefix + of |data| that is all structurally valid UTF-8. + */ +static size_t utf8_range_ValidateUTF8Naive(const char *data, const char *end, int return_position) +{ + /* We return err_pos in the loop which is always 0 if !return_position */ + size_t err_pos = 0; + size_t codepoint_bytes = 0; + /* The early check is done because of early continue's on codepoints of all + * sizes, i.e. we first check for ascii and if it is, we call continue, then + * for 2 byte codepoints, etc. This is done in order to reduce indentation and + * improve readability of the codepoint validity check. + */ + while (data + codepoint_bytes < end) + { + if (return_position) + { + err_pos += codepoint_bytes; + } + data += codepoint_bytes; + const size_t len = end - data; + const unsigned char byte1 = data[0]; + + /* We do not skip many ascii bytes at the same time as this function is + used for tail checking (< 16 bytes) and for non x86 platforms. We also + don't think that cases where non-ASCII codepoints are followed by ascii + happen often. For small strings it also introduces some penalty. For + purely ascii UTF8 strings (which is the overwhelming case) we call + SkipAscii function which is multiplatform and extremely fast. + */ + /* [00..7F] ASCII -> 1 byte */ + if (utf8_range_AsciiIsAscii(byte1)) + { + codepoint_bytes = 1; + continue; + } + /* [C2..DF], [80..BF] -> 2 bytes */ + if (len >= 2 && byte1 >= 0xC2 && byte1 <= 0xDF && + utf8_range_IsTrailByteOk(static_cast(data[1]))) + { + codepoint_bytes = 2; + continue; + } + if (len >= 3) + { + const unsigned char byte2 = data[1]; + const unsigned char byte3 = data[2]; + + /* Is byte2, byte3 between [0x80, 0xBF] + * Check for 0x80 was done above. 
+ */ + if (!utf8_range_IsTrailByteOk(byte2) || !utf8_range_IsTrailByteOk(byte3)) + { + return err_pos; + } + + if (/* E0, A0..BF, 80..BF */ + ((byte1 == 0xE0 && byte2 >= 0xA0) || + /* E1..EC, 80..BF, 80..BF */ + (byte1 >= 0xE1 && byte1 <= 0xEC) || + /* ED, 80..9F, 80..BF */ + (byte1 == 0xED && byte2 <= 0x9F) || + /* EE..EF, 80..BF, 80..BF */ + (byte1 >= 0xEE && byte1 <= 0xEF))) + { + codepoint_bytes = 3; + continue; + } + if (len >= 4) + { + const unsigned char byte4 = data[3]; + /* Is byte4 between 0x80 ~ 0xBF */ + if (!utf8_range_IsTrailByteOk(byte4)) + { + return err_pos; + } + + if (/* F0, 90..BF, 80..BF, 80..BF */ + ((byte1 == 0xF0 && byte2 >= 0x90) || + /* F1..F3, 80..BF, 80..BF, 80..BF */ + (byte1 >= 0xF1 && byte1 <= 0xF3) || + /* F4, 80..8F, 80..BF, 80..BF */ + (byte1 == 0xF4 && byte2 <= 0x8F))) + { + codepoint_bytes = 4; + continue; + } + } + } + return err_pos; + } + if (return_position) + { + err_pos += codepoint_bytes; + } + /* if return_position is false, this returns 1. + * if return_position is true, this returns err_pos. + */ + return err_pos + (1 - return_position); +} + +#if defined(__SSE4_1__) || (defined(__ARM_NEON) && defined(__ARM_64BIT_STATE)) +/* Returns the number of bytes needed to skip backwards to get to the first + byte of codepoint. + */ +static inline int utf8_range_CodepointSkipBackwards(int32_t codepoint_word) +{ + const int8_t *const codepoint = (const int8_t *)(&codepoint_word); + if (!utf8_range_IsTrailByteOk(static_cast(codepoint[3]))) + { + return 1; + } + else if (!utf8_range_IsTrailByteOk(static_cast(codepoint[2]))) + { + return 2; + } + else if (!utf8_range_IsTrailByteOk(static_cast(codepoint[1]))) + { + return 3; + } + return 0; +} +#endif // __SSE4_1__ + +/* Skipping over ASCII as much as possible, per 8 bytes. It is intentional + as most strings to check for validity consist only of 1 byte codepoints. 
+ */ +static inline const char *utf8_range_SkipAscii(const char *data, const char *end) +{ + while (8 <= end - data && (utf8_range_UnalignedLoad64(data) & 0x8080808080808080) == 0) + { + data += 8; + } + while (data < end && utf8_range_AsciiIsAscii(*data)) + { + ++data; + } + return data; +} + +#if defined(__SSE4_1__) +# include "src/common/internal/utf8_range/utf8_range_sse.inc" +#elif defined(__ARM_NEON) && defined(__ARM_64BIT_STATE) +# include "src/common/internal/utf8_range/utf8_range_neon.inc" +#endif + +static FORCE_INLINE_ATTR size_t utf8_range_Validate(const char *data, + size_t len, + int return_position) +{ + if (len == 0) + return 1 - return_position; + // Save buffer start address for later use + const char *const data_original = data; + const char *const end = data + len; + data = utf8_range_SkipAscii(data, end); + /* SIMD algorithm always outperforms the naive version for any data of + length >=16. + */ + if (end - data < 16) + { + return (return_position ? (data - data_original) : 0) + + utf8_range_ValidateUTF8Naive(data, end, return_position); + } +#if defined(__SSE4_1__) || (defined(__ARM_NEON) && defined(__ARM_64BIT_STATE)) + return utf8_range_ValidateUTF8Simd(data_original, data, end, return_position); +#else + return (return_position ? 
(data - data_original) : 0) + + utf8_range_ValidateUTF8Naive(data, end, return_position); +#endif +} + +int utf8_range_IsValid(const char *data, size_t len) +{ + return utf8_range_Validate(data, len, /*return_position=*/0) != 0; +} + +size_t utf8_range_ValidPrefix(const char *data, size_t len) +{ + return utf8_range_Validate(data, len, /*return_position=*/1); +} + +} // namespace utf8_range +} // namespace sdk +OPENTELEMETRY_END_NAMESPACE diff --git a/sdk/src/common/internal/utf8_range/utf8_range.h b/sdk/src/common/internal/utf8_range/utf8_range.h new file mode 100644 index 0000000000..1b77e0b2ab --- /dev/null +++ b/sdk/src/common/internal/utf8_range/utf8_range.h @@ -0,0 +1,26 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include + +OPENTELEMETRY_BEGIN_NAMESPACE +namespace sdk +{ +namespace utf8_range +{ + +// Returns 1 if the sequence of characters is a valid UTF-8 sequence, otherwise +// 0. +int utf8_range_IsValid(const char *data, size_t len); + +// Returns the length in bytes of the prefix of str that is all +// structurally valid UTF-8. +size_t utf8_range_ValidPrefix(const char *data, size_t len); + +} // namespace utf8_range +} // namespace sdk +OPENTELEMETRY_END_NAMESPACE diff --git a/sdk/src/common/internal/utf8_range/utf8_range_neon.inc b/sdk/src/common/internal/utf8_range/utf8_range_neon.inc new file mode 100644 index 0000000000..b1a8729549 --- /dev/null +++ b/sdk/src/common/internal/utf8_range/utf8_range_neon.inc @@ -0,0 +1,129 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +#include + +/* This code is almost the same as SSE implementation, please reference + * utf8-range-sse.inc for detailed explanation. + * The only difference is the range adjustment step. NEON code is more + * straightforward. 
+ */ + +static FORCE_INLINE_ATTR inline size_t utf8_range_ValidateUTF8Simd(const char *data_original, + const char *data, + const char *end, + int return_position) +{ + const uint8x16_t first_len_tbl = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3, + }; + const uint8x16_t first_range_tbl = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, + }; + const uint8x16_t range_min_tbl = { + 0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80, + 0xC2, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + }; + const uint8x16_t range_max_tbl = { + 0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F, + 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }; + /* Range adjustment in NEON uint8x16x2 table. Note that lanes are interleaved + * in register. The table below is plotted vertically to ease understanding. + * The 1st column is for E0~EF, 2nd column for F0~FF. + */ + // clang-format off + const uint8_t range_adjust_tbl_data[] = { + /* index -> 0~15 16~31 <- index */ + /* E0 -> */ 2, 3, /* <- F0 */ + 0, 0, + 0, 0, + 0, 0, + 0, 4, /* <- F4 */ + 0, 0, + 0, 0, + 0, 0, + 0, 0, + 0, 0, + 0, 0, + 0, 0, + 0, 0, + /* ED -> */ 3, 0, + 0, 0, + 0, 0, + }; + // clang-format on + const uint8x16x2_t range_adjust_tbl = vld2q_u8(range_adjust_tbl_data); + + const uint8x16_t const_1 = vdupq_n_u8(1); + const uint8x16_t const_2 = vdupq_n_u8(2); + const uint8x16_t const_e0 = vdupq_n_u8(0xE0); + + uint8x16_t prev_input = vdupq_n_u8(0); + uint8x16_t prev_first_len = vdupq_n_u8(0); + uint8x16_t error = vdupq_n_u8(0); + + while (end - data >= 16) + { + const uint8x16_t input = vld1q_u8((const uint8_t *)data); + + const uint8x16_t high_nibbles = vshrq_n_u8(input, 4); + + const uint8x16_t first_len = vqtbl1q_u8(first_len_tbl, high_nibbles); + + uint8x16_t range = vqtbl1q_u8(first_range_tbl, high_nibbles); + + range = vorrq_u8(range, vextq_u8(prev_first_len, first_len, 15)); + + uint8x16_t shift2 = vextq_u8(prev_first_len, first_len, 14); + shift2 = vqsubq_u8(shift2, const_1); + range = vorrq_u8(range, shift2); + + 
uint8x16_t shift3 = vextq_u8(prev_first_len, first_len, 13); + shift3 = vqsubq_u8(shift3, const_2); + range = vorrq_u8(range, shift3); + + uint8x16_t shift1 = vextq_u8(prev_input, input, 15); + shift1 = vsubq_u8(shift1, const_e0); + range = vaddq_u8(range, vqtbl2q_u8(range_adjust_tbl, shift1)); + + const uint8x16_t min_range = vqtbl1q_u8(range_min_tbl, range); + const uint8x16_t max_range = vqtbl1q_u8(range_max_tbl, range); + + if (return_position) + { + error = vcltq_u8(input, min_range); + error = vorrq_u8(error, vcgtq_u8(input, max_range)); + if (vmaxvq_u32(vreinterpretq_u32_u8(error))) + { + break; + } + } + else + { + error = vorrq_u8(error, vcltq_u8(input, min_range)); + error = vorrq_u8(error, vcgtq_u8(input, max_range)); + } + + prev_input = input; + prev_first_len = first_len; + + data += 16; + } + + if (return_position && data == data_original) + { + return utf8_range_ValidateUTF8Naive(data, end, return_position); + } + const int32_t prev = vgetq_lane_s32(vreinterpretq_s32_u8(prev_input), 3); + data -= utf8_range_CodepointSkipBackwards(prev); + if (return_position) + { + return (data - data_original) + utf8_range_ValidateUTF8Naive(data, end, return_position); + } + if (vmaxvq_u32(vreinterpretq_u32_u8(error))) + { + return 0; + } + return utf8_range_ValidateUTF8Naive(data, end, return_position); +} diff --git a/sdk/src/common/internal/utf8_range/utf8_range_sse.inc b/sdk/src/common/internal/utf8_range/utf8_range_sse.inc new file mode 100644 index 0000000000..90e5deb70e --- /dev/null +++ b/sdk/src/common/internal/utf8_range/utf8_range_sse.inc @@ -0,0 +1,277 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include + +static FORCE_INLINE_ATTR inline size_t utf8_range_ValidateUTF8Simd(const char *data_original, + const char *data, + const char *end, + int return_position) +{ + /* This code checks that utf-8 ranges are structurally valid 16 bytes at once + * using superscalar instructions. 
+ * The mapping between ranges of codepoint and their corresponding utf-8 + * sequences is below. + */ + + /* + * U+0000...U+007F 00...7F + * U+0080...U+07FF C2...DF 80...BF + * U+0800...U+0FFF E0 A0...BF 80...BF + * U+1000...U+CFFF E1...EC 80...BF 80...BF + * U+D000...U+D7FF ED 80...9F 80...BF + * U+E000...U+FFFF EE...EF 80...BF 80...BF + * U+10000...U+3FFFF F0 90...BF 80...BF 80...BF + * U+40000...U+FFFFF F1...F3 80...BF 80...BF 80...BF + * U+100000...U+10FFFF F4 80...8F 80...BF 80...BF + */ + + /* First we compute the type for each byte, as given by the table below. + * This type will be used as an index later on. + */ + + /* + * Index Min Max Byte Type + * 0 00 7F Single byte sequence + * 1,2,3 80 BF Second, third and fourth byte for many of the sequences. + * 4 A0 BF Second byte after E0 + * 5 80 9F Second byte after ED + * 6 90 BF Second byte after F0 + * 7 80 8F Second byte after F4 + * 8 C2 F4 First non ASCII byte + * 9..15 7F 80 Invalid byte + */ + + /* After the first step we compute the index for all bytes, then we permute + the bytes according to their indices to check the ranges from the range + table. + * The range for a given type can be found in the range_min_table and + range_max_table, the range for type/index X is in range_min_table[X] ... + range_max_table[X]. + */ + + /* Algorithm: + * Put index zero to all bytes. + * Find all non ASCII characters, give them index 8. + * For each tail byte in a codepoint sequence, give it an index corresponding + to the 1 based index from the end. + * If the first byte of the codepoint is in the [C0...DF] range, we write + index 1 in the following byte. + * If the first byte of the codepoint is in the range [E0...EF], we write + indices 2 and 1 in the next two bytes. + * If the first byte of the codepoint is in the range [F0...FF] we write + indices 3,2,1 into the next three bytes. 
+ * For finding the number of bytes we need to look at high nibbles (4 bits) + and do the lookup from the table, it can be done with shift by 4 + shuffle + instructions. We call it `first_len`. + * Then we shift first_len by 8 bits to get the indices of the 2nd bytes. + * Saturating sub 1 and shift by 8 bits to get the indices of the 3rd bytes. + * Again to get the indices of the 4th bytes. + * Take OR of all 4 values and check within range. + */ + /* For example: + * input C3 80 68 E2 80 20 A6 F0 A0 80 AC 20 F0 93 80 80 + * first_len 1 0 0 2 0 0 0 3 0 0 0 0 3 0 0 0 + * 1st byte 8 0 0 8 0 0 0 8 0 0 0 0 8 0 0 0 + * 2nd byte 0 1 0 0 2 0 0 0 3 0 0 0 0 3 0 0 // Shift + * 3rd byte 0 0 0 0 0 1 0 0 0 2 0 0 0 0 2 0 // Shift + sub + * 4th byte 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 // Shift + sub + * Index 8 1 0 8 2 1 0 8 3 2 1 0 8 3 2 1 // OR of results + */ + + /* Checking for errors: + * Error checking is done by looking up the high nibble (4 bits) of each byte + against an error checking table. + * Because the lookup value for the second byte depends on the value of the + first byte in codepoint, we use saturated operations to adjust the index. + * Specifically we need to add 2 for E0, 3 for ED, 3 for F0 and 4 for F4 to + match the correct index. + * If we subtract from all bytes EF then E0 -> 241, ED -> 254, F0 -> 1, + F4 -> 5 + * Do saturating sub 240, then E0 -> 1, ED -> 14 and we can do lookup to + match the adjustment + * Add saturating 112, then F0 -> 113, F4 -> 117, all that were >= 16 will + be at least 128 and lookup in ef_fe_table will return 0, while F0 and F4 + select entries 1 and 5, i.e. adjustments 3 and 4 respectively + */ + /* + * Then just check the appropriate ranges with greater/smaller equal + instructions. Check tail with a naive algorithm. + * To save from previous 16 byte checks we just align previous_first_len to + get correct continuations of the codepoints.
+ */ + + /* + * Map high nibble of "First Byte" to legal character length minus 1 + * 0x00 ~ 0xBF --> 0 + * 0xC0 ~ 0xDF --> 1 + * 0xE0 ~ 0xEF --> 2 + * 0xF0 ~ 0xFF --> 3 + */ + const __m128i first_len_table = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3); + + /* Map "First Byte" to 8-th item of range table (0xC2 ~ 0xF4) */ + const __m128i first_range_table = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8); + + /* + * Range table, map range index to min and max values + */ + const __m128i range_min_table = _mm_setr_epi8(0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80, + 0xC2, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F); + + const __m128i range_max_table = _mm_setr_epi8(0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F, + 0xF4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + + /* + * Tables for fast handling of four special First Bytes(E0,ED,F0,F4), after + * which the Second Byte are not 80~BF. It contains "range index adjustment". + * +------------+---------------+------------------+----------------+ + * | First Byte | original range| range adjustment | adjusted range | + * +------------+---------------+------------------+----------------+ + * | E0 | 2 | 2 | 4 | + * +------------+---------------+------------------+----------------+ + * | ED | 2 | 3 | 5 | + * +------------+---------------+------------------+----------------+ + * | F0 | 3 | 3 | 6 | + * +------------+---------------+------------------+----------------+ + * | F4 | 3 | 4 | 7 | + * +------------+---------------+------------------+----------------+ + */ + + /* df_ee_table[1] -> E0, df_ee_table[14] -> ED as ED - E0 = 13 */ + // The values represent the adjustment in the Range Index table for a correct + // index. + const __m128i df_ee_table = _mm_setr_epi8(0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0); + + /* ef_fe_table[1] -> F0, ef_fe_table[5] -> F4, F4 - F0 = 4 */ + // The values represent the adjustment in the Range Index table for a correct + // index.
+ const __m128i ef_fe_table = _mm_setr_epi8(0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + + __m128i prev_input = _mm_set1_epi8(0); + __m128i prev_first_len = _mm_set1_epi8(0); + __m128i error = _mm_set1_epi8(0); + + while (end - data >= 16) + { + const __m128i input = _mm_loadu_si128((const __m128i *)(data)); + + /* high_nibbles = input >> 4 */ + const __m128i high_nibbles = _mm_and_si128(_mm_srli_epi16(input, 4), _mm_set1_epi8(0x0F)); + + /* first_len = legal character length minus 1 */ + /* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */ + /* first_len = first_len_table[high_nibbles] */ + __m128i first_len = _mm_shuffle_epi8(first_len_table, high_nibbles); + + /* First Byte: set range index to 8 for bytes within 0xC0 ~ 0xFF */ + /* range = first_range_table[high_nibbles] */ + __m128i range = _mm_shuffle_epi8(first_range_table, high_nibbles); + + /* Second Byte: set range index to first_len */ + /* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */ + /* range |= (first_len, prev_first_len) << 1 byte */ + range = _mm_or_si128(range, _mm_alignr_epi8(first_len, prev_first_len, 15)); + + /* Third Byte: set range index to saturate_sub(first_len, 1) */ + /* 0 for 00~7F, 0 for C0~DF, 1 for E0~EF, 2 for F0~FF */ + __m128i tmp1; + __m128i tmp2; + /* tmp1 = saturate_sub(first_len, 1) */ + tmp1 = _mm_subs_epu8(first_len, _mm_set1_epi8(1)); + /* tmp2 = saturate_sub(prev_first_len, 1) */ + tmp2 = _mm_subs_epu8(prev_first_len, _mm_set1_epi8(1)); + /* range |= (tmp1, tmp2) << 2 bytes */ + range = _mm_or_si128(range, _mm_alignr_epi8(tmp1, tmp2, 14)); + + /* Fourth Byte: set range index to saturate_sub(first_len, 2) */ + /* 0 for 00~7F, 0 for C0~DF, 0 for E0~EF, 1 for F0~FF */ + /* tmp1 = saturate_sub(first_len, 2) */ + tmp1 = _mm_subs_epu8(first_len, _mm_set1_epi8(2)); + /* tmp2 = saturate_sub(prev_first_len, 2) */ + tmp2 = _mm_subs_epu8(prev_first_len, _mm_set1_epi8(2)); + /* range |= (tmp1, tmp2) << 3 bytes */ + range = _mm_or_si128(range, _mm_alignr_epi8(tmp1, 
tmp2, 13)); + + /* + * Now we have below range indices calculated + * Correct cases: + * - 8 for C0~FF + * - 3 for 1st byte after F0~FF + * - 2 for 1st byte after E0~EF or 2nd byte after F0~FF + * - 1 for 1st byte after C0~DF or 2nd byte after E0~EF or + * 3rd byte after F0~FF + * - 0 for others + * Error cases: + * 9~15 for non ascii First Byte overlapping + * E.g., F1 80 C2 90 --> 8 3 10 1, where 10 indicates error + */ + + /* Adjust Second Byte range for special First Bytes(E0,ED,F0,F4) */ + /* Overlaps lead to index 9~15, which are illegal in range table */ + __m128i shift1; + __m128i pos; + __m128i range2; + /* shift1 = (input, prev_input) << 1 byte */ + shift1 = _mm_alignr_epi8(input, prev_input, 15); + pos = _mm_sub_epi8(shift1, _mm_set1_epi8(0xEF)); + /* + * shift1: | EF F0 ... FE | FF 00 ... ... DE | DF E0 ... EE | + * pos: | 0 1 15 | 16 17 239| 240 241 255| + * pos-240: | 0 0 0 | 0 0 0 | 0 1 15 | + * pos+112: | 112 113 127| >= 128 | >= 128 | + */ + tmp1 = _mm_subs_epu8(pos, _mm_set1_epi8(-16)); + range2 = _mm_shuffle_epi8(df_ee_table, tmp1); + tmp2 = _mm_adds_epu8(pos, _mm_set1_epi8(112)); + range2 = _mm_add_epi8(range2, _mm_shuffle_epi8(ef_fe_table, tmp2)); + + range = _mm_add_epi8(range, range2); + + /* Load min and max values per calculated range index */ + __m128i min_range = _mm_shuffle_epi8(range_min_table, range); + __m128i max_range = _mm_shuffle_epi8(range_max_table, range); + + /* Check value range */ + if (return_position) + { + error = _mm_cmplt_epi8(input, min_range); + error = _mm_or_si128(error, _mm_cmpgt_epi8(input, max_range)); + /* 5% performance drop from this conditional branch */ + if (!_mm_testz_si128(error, error)) + { + break; + } + } + else + { + error = _mm_or_si128(error, _mm_cmplt_epi8(input, min_range)); + error = _mm_or_si128(error, _mm_cmpgt_epi8(input, max_range)); + } + + prev_input = input; + prev_first_len = first_len; + + data += 16; + } + /* If no 16-byte block was consumed, there is nothing to skip backwards over */ + if
(return_position && data == data_original) + { + return utf8_range_ValidateUTF8Naive(data, end, return_position); + } + /* Find previous codepoint (not 80~BF) */ + data -= utf8_range_CodepointSkipBackwards(_mm_extract_epi32(prev_input, 3)); + if (return_position) + { + return (data - data_original) + utf8_range_ValidateUTF8Naive(data, end, return_position); + } + /* Test if there was any error */ + if (!_mm_testz_si128(error, error)) + { + return 0; + } + /* Check the tail */ + return utf8_range_ValidateUTF8Naive(data, end, return_position); +} diff --git a/sdk/src/resource/BUILD b/sdk/src/resource/BUILD index 8845629990..1d3d840f47 100644 --- a/sdk/src/resource/BUILD +++ b/sdk/src/resource/BUILD @@ -10,6 +10,7 @@ cc_library( deps = [ "//api", "//sdk:headers", + "//sdk/src/common:attribute_validity", "//sdk/src/common:env_variables", ], ) diff --git a/sdk/src/resource/resource.cc b/sdk/src/resource/resource.cc index 95b41498f8..3fd1171c50 100644 --- a/sdk/src/resource/resource.cc +++ b/sdk/src/resource/resource.cc @@ -1,11 +1,14 @@ // Copyright The OpenTelemetry Authors // SPDX-License-Identifier: Apache-2.0 +#include #include #include #include #include "opentelemetry/nostd/variant.h" +#include "opentelemetry/sdk/common/attribute_validity.h" +#include "opentelemetry/sdk/common/global_log_handler.h" #include "opentelemetry/sdk/resource/resource.h" #include "opentelemetry/sdk/resource/resource_detector.h" #include "opentelemetry/sdk/version/version.h" @@ -27,8 +30,28 @@ Resource::Resource(const ResourceAttributes &attributes) noexcept {} Resource::Resource(const ResourceAttributes &attributes, const std::string &schema_url) noexcept - : attributes_(attributes), schema_url_(schema_url) -{} + : schema_url_(schema_url) +{ + attributes_.reserve(attributes.size()); + for (auto &kv : attributes) + { + if (!common::AttributeValidator::IsValid(kv.first)) + { + OTEL_INTERNAL_LOG_WARN("[Resource] Invalid attribute key " + << kv.first << ". 
This attribute will be ignored."); + continue; + } + + if (!common::AttributeValidator::IsValid(kv.second)) + { + OTEL_INTERNAL_LOG_WARN("[Resource] Invalid attribute value for " + << kv.first << ". This attribute will be ignored."); + continue; + } + + attributes_[kv.first] = kv.second; + } +} Resource Resource::Merge(const Resource &other) const noexcept { @@ -49,7 +72,8 @@ Resource Resource::Create(const ResourceAttributes &attributes, const std::strin std::string default_service_name = "unknown_service"; auto it_process_executable_name = resource.attributes_.find(semconv::process::kProcessExecutableName); - if (it_process_executable_name != resource.attributes_.end()) + if (it_process_executable_name != resource.attributes_.end() && + nostd::holds_alternative(it_process_executable_name->second)) { default_service_name += ":" + nostd::get(it_process_executable_name->second); } diff --git a/sdk/src/trace/BUILD b/sdk/src/trace/BUILD index 7262a1c0a0..67f5fbdede 100644 --- a/sdk/src/trace/BUILD +++ b/sdk/src/trace/BUILD @@ -11,6 +11,7 @@ cc_library( deps = [ "//api", "//sdk:headers", + "//sdk/src/common:attribute_validity", "//sdk/src/common:disabled", "//sdk/src/common:global_log_handler", "//sdk/src/common:random", diff --git a/sdk/src/trace/span.cc b/sdk/src/trace/span.cc index 3509b164da..daa25d5b4e 100644 --- a/sdk/src/trace/span.cc +++ b/sdk/src/trace/span.cc @@ -2,9 +2,12 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include #include #include "opentelemetry/nostd/function_ref.h" +#include "opentelemetry/sdk/common/attribute_validity.h" +#include "opentelemetry/sdk/common/global_log_handler.h" #include "opentelemetry/sdk/trace/processor.h" #include "opentelemetry/sdk/trace/recordable.h" #include "opentelemetry/trace/span_id.h" @@ -75,14 +78,17 @@ Span::Span(std::shared_ptr &&tracer, recordable_->SetTraceFlags(span_context_->trace_flags()); - attributes.ForEachKeyValue([&](nostd::string_view key, common::AttributeValue value) noexcept { - 
recordable_->SetAttribute(key, value); - return true; - }); + opentelemetry::sdk::common::KeyValueFilterIterable attributes_filter(attributes, "[Trace Span] "); + attributes_filter.ForEachKeyValue( + [&](nostd::string_view key, common::AttributeValue value) noexcept { + recordable_->SetAttribute(key, value); + return true; + }); links.ForEachKeyValue([&](const opentelemetry::trace::SpanContext &span_context, const common::KeyValueIterable &attributes) { - recordable_->AddLink(span_context, attributes); + recordable_->AddLink(span_context, opentelemetry::sdk::common::KeyValueFilterIterable( + attributes, "[Trace Span Link] ")); return true; }); @@ -106,6 +112,20 @@ void Span::SetAttribute(nostd::string_view key, const common::AttributeValue &va return; } + if (!sdk::common::AttributeValidator::IsValid(key)) + { + OTEL_INTERNAL_LOG_WARN("[Trace Span] Invalid span attribute key " + << key << ". This attribute will be ignored."); + return; + } + + if (!sdk::common::AttributeValidator::IsValid(value)) + { + OTEL_INTERNAL_LOG_WARN("[Trace Span] Invalid span attribute value for " + << key << ". 
This attribute will be ignored."); + return; + } + recordable_->SetAttribute(key, value); } @@ -136,7 +156,8 @@ void Span::AddEvent(nostd::string_view name, const common::KeyValueIterable &att { return; } - recordable_->AddEvent(name, attributes); + recordable_->AddEvent( + name, opentelemetry::sdk::common::KeyValueFilterIterable(attributes, "[Trace Span Event] ")); } void Span::AddEvent(nostd::string_view name, @@ -148,7 +169,9 @@ void Span::AddEvent(nostd::string_view name, { return; } - recordable_->AddEvent(name, timestamp, attributes); + recordable_->AddEvent( + name, timestamp, + opentelemetry::sdk::common::KeyValueFilterIterable(attributes, "[Trace Span Event] ")); } #if OPENTELEMETRY_ABI_VERSION_NO >= 2 @@ -161,7 +184,8 @@ void Span::AddLink(const opentelemetry::trace::SpanContext &target, return; } - recordable_->AddLink(target, attrs); + recordable_->AddLink( + target, opentelemetry::sdk::common::KeyValueFilterIterable(attrs, "[Trace Span Link] ")); } void Span::AddLinks(const opentelemetry::trace::SpanContextKeyValueIterable &links) noexcept @@ -174,7 +198,8 @@ void Span::AddLinks(const opentelemetry::trace::SpanContextKeyValueIterable &lin links.ForEachKeyValue([&](const opentelemetry::trace::SpanContext &span_context, const common::KeyValueIterable &attributes) { - recordable_->AddLink(span_context, attributes); + recordable_->AddLink(span_context, opentelemetry::sdk::common::KeyValueFilterIterable( + attributes, "[Trace Span Link] ")); return true; }); } diff --git a/sdk/test/instrumentationscope/BUILD b/sdk/test/instrumentationscope/BUILD index 39a61564a7..4c053bf65b 100644 --- a/sdk/test/instrumentationscope/BUILD +++ b/sdk/test/instrumentationscope/BUILD @@ -10,6 +10,7 @@ cc_test( deps = [ "//api", "//sdk:headers", + "//sdk/src/common:attribute_validity", "@com_google_googletest//:gtest_main", ], ) diff --git a/sdk/test/instrumentationscope/CMakeLists.txt b/sdk/test/instrumentationscope/CMakeLists.txt index 659728300b..1743836ec4 100644 --- 
a/sdk/test/instrumentationscope/CMakeLists.txt +++ b/sdk/test/instrumentationscope/CMakeLists.txt @@ -5,8 +5,9 @@ include(GoogleTest) foreach(testname instrumentationscope_test) add_executable(${testname} "${testname}.cc") - target_link_libraries(${testname} ${GTEST_BOTH_LIBRARIES} - ${CMAKE_THREAD_LIBS_INIT} opentelemetry_sdk) + target_link_libraries( + ${testname} ${GTEST_BOTH_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT} + opentelemetry_common opentelemetry_sdk) gtest_add_tests( TARGET ${testname} TEST_PREFIX instrumentationscope. diff --git a/sdk/test/instrumentationscope/instrumentationscope_test.cc b/sdk/test/instrumentationscope/instrumentationscope_test.cc index 3d3bf057c5..c48520e623 100644 --- a/sdk/test/instrumentationscope/instrumentationscope_test.cc +++ b/sdk/test/instrumentationscope/instrumentationscope_test.cc @@ -61,6 +61,20 @@ TEST(InstrumentationScope, CreateInstrumentationScope) } } +TEST(InstrumentationScope, CreateInstrumentationScopeWithInvalidAttributes) +{ + std::string library_name = "opentelemetry-cpp"; + std::string library_version = "0.1.0"; + std::string schema_url = "https://opentelemetry.io/schemas/1.2.0"; + auto instrumentation_scope = + InstrumentationScope::Create(library_name, library_version, schema_url, + {{"attribute-key1", "attribute-value"}, + {"invalid-key\xff", "valid-value"}, + {"valid-key", "invalid-value\xff"}}); + + EXPECT_EQ(instrumentation_scope->GetAttributes().size(), 1); +} + TEST(InstrumentationScope, CreateInstrumentationScopeWithLoopForAttributes) { std::string library_name = "opentelemetry-cpp"; @@ -195,6 +209,25 @@ TEST(InstrumentationScope, SetAttribute) } } +TEST(InstrumentationScope, SetInvalidAttribute) +{ + std::string library_name = "opentelemetry-cpp"; + std::string library_version = "0.1.0"; + std::string schema_url = "https://opentelemetry.io/schemas/1.2.0"; + auto instrumentation_scope = + InstrumentationScope::Create(library_name, library_version, schema_url); + + 
EXPECT_EQ(instrumentation_scope->GetName(), library_name); + EXPECT_EQ(instrumentation_scope->GetVersion(), library_version); + EXPECT_EQ(instrumentation_scope->GetSchemaURL(), schema_url); + EXPECT_EQ(instrumentation_scope->GetAttributes().size(), 0); + + instrumentation_scope->SetAttribute("attribute-key1", "attribute-value"); + instrumentation_scope->SetAttribute("invalid-key\xff", "valid-value"); + instrumentation_scope->SetAttribute("valid-key", "invalid-value\xff"); + EXPECT_EQ(instrumentation_scope->GetAttributes().size(), 1); +} + TEST(InstrumentationScope, LegacyInstrumentationLibrary) { diff --git a/sdk/test/metrics/meter_test.cc b/sdk/test/metrics/meter_test.cc index 5fe99b41a8..07b293723b 100644 --- a/sdk/test/metrics/meter_test.cc +++ b/sdk/test/metrics/meter_test.cc @@ -656,6 +656,50 @@ TEST_F(MeterCreateInstrumentTest, ViewCorrectedDuplicateSyncInstrumentsByDescrip }); } +TEST_F(MeterCreateInstrumentTest, SyncInstrumentWithInvalidAttributes) +{ + InstrumentDescriptor descriptor{"my_counter", "desc", "unit", InstrumentType::kCounter, + InstrumentValueType::kDouble}; + AddDescriptionCorrectionView(descriptor.name_, descriptor.unit_, descriptor.type_, + descriptor.description_); + + auto counter1 = meter_->CreateDoubleCounter("my_counter", "desc", "unit"); + counter1->Add( + 1, + {{"key", "value1"}, {"invalid-key\xff", "valid-value"}, {"valid-key", "invalid-value\xff"}}); + + metric_reader_ptr_->Collect([](ResourceMetrics &metric_data) { + EXPECT_EQ(metric_data.scope_metric_data_.size(), 1); + // only one metric_data object expected after correction with the view + EXPECT_EQ(metric_data.scope_metric_data_[0].metric_data_.size(), 1); + EXPECT_EQ(metric_data.scope_metric_data_[0].metric_data_[0].point_data_attr_.size(), 1); + return true; + }); +} + +TEST_F(MeterCreateInstrumentTest, AsyncInstrumentWithInvalidAttributes) +{ + auto observable_counter1 = + meter_->CreateInt64ObservableCounter("observable_counter", "desc", "unit"); + auto callback1 = 
[](opentelemetry::metrics::ObserverResult observer, void * /* state */) { + auto observer_long = + nostd::get>>(observer); + observer_long->Observe(12, {{"key", "value1"}, + {"invalid-key\xff", "valid-value"}, + {"valid-key", "invalid-value\xff"}}); + }; + + observable_counter1->AddCallback(callback1, nullptr); + + metric_reader_ptr_->Collect([](ResourceMetrics &metric_data) { + EXPECT_EQ(metric_data.scope_metric_data_.size(), 1); + EXPECT_EQ(metric_data.scope_metric_data_[0].metric_data_.size(), 1); + auto &point_data_attr = metric_data.scope_metric_data_[0].metric_data_[0].point_data_attr_; + EXPECT_EQ(point_data_attr.size(), 1); + return true; + }); +} + TEST_F(MeterCreateInstrumentTest, IdenticalAsyncInstruments) { auto observable_counter1 = diff --git a/sdk/test/resource/resource_test.cc b/sdk/test/resource/resource_test.cc index 696509f892..07f04b0bd4 100644 --- a/sdk/test/resource/resource_test.cc +++ b/sdk/test/resource/resource_test.cc @@ -145,6 +145,36 @@ TEST(ResourceTest, create_with_schemaurl) EXPECT_EQ(received_schema_url, schema_url); } +TEST(ResourceTest, create_with_invalid_attributes) +{ + ResourceAttributes expected_attributes = { + {semconv::telemetry::kTelemetrySdkLanguage, "cpp"}, + {semconv::telemetry::kTelemetrySdkName, "opentelemetry"}, + {semconv::telemetry::kTelemetrySdkVersion, OPENTELEMETRY_SDK_VERSION}, + {semconv::service::kServiceName, "unknown_service"}, + }; + ResourceAttributes attributes = { + {semconv::telemetry::kTelemetrySdkLanguage, "cpp"}, + {semconv::telemetry::kTelemetrySdkName, "opentelemetry"}, + {semconv::telemetry::kTelemetrySdkVersion, OPENTELEMETRY_SDK_VERSION}, + {semconv::service::kServiceName, "unknown_service"}, + {"invalid_key\xff", "valid_value"}, + {"valid_key", "invalid_value\xff"}, + }; + auto resource = Resource::Create(attributes); + auto received_attributes = resource.GetAttributes(); + for (auto &e : received_attributes) + { + EXPECT_TRUE(expected_attributes.find(e.first) != expected_attributes.end()); + 
if (expected_attributes.find(e.first) != expected_attributes.end()) + { + EXPECT_EQ(opentelemetry::nostd::get(expected_attributes.find(e.first)->second), + opentelemetry::nostd::get(e.second)); + } + } + EXPECT_EQ(received_attributes.size(), expected_attributes.size()); +} + TEST(ResourceTest, Merge) { TestResource resource1(ResourceAttributes({{"service", "backend"}})); diff --git a/sdk/test/trace/tracer_test.cc b/sdk/test/trace/tracer_test.cc index 6e70c0130e..2c5314176f 100644 --- a/sdk/test/trace/tracer_test.cc +++ b/sdk/test/trace/tracer_test.cc @@ -398,6 +398,27 @@ TEST(Tracer, StartSpanWithAttributesCopy) ASSERT_EQ("c", strings[2]); } +TEST(Tracer, StartSpanWithInvalidAttributes) +{ + InMemorySpanExporter *exporter = new InMemorySpanExporter(); + std::shared_ptr span_data = exporter->GetData(); + auto tracer = initTracer(std::unique_ptr{exporter}); + + { + tracer + ->StartSpan("span 1", + { + {"attr1", "value1"}, + {"invalid_key\xff", "valid_value"}, + {"valid_key", "invalid_value\xff"}, + }) + ->End(); + } + + auto spans = span_data->GetSpans(); + ASSERT_EQ(1, spans.size()); +} + TEST(Tracer, GetSampler) { auto resource = Resource::Create({}); @@ -586,6 +607,30 @@ TEST(Tracer, StartSpanWithCustomConfig) #endif } +TEST(Tracer, SpanSetEventsWithInvalidAttributes) +{ + InMemorySpanExporter *exporter = new InMemorySpanExporter(); + std::shared_ptr span_data = exporter->GetData(); + auto tracer = initTracer(std::unique_ptr{exporter}); + + auto span = tracer->StartSpan("span 1"); + span->AddEvent("event 3", std::chrono::system_clock::now(), + { + {"attr1", 1}, + {"invalid_key\xff", "valid_value"}, + {"valid_key", "invalid_value\xff"}, + }); + span->End(); + + auto spans = span_data->GetSpans(); + ASSERT_EQ(1, spans.size()); + + auto &span_data_events = spans.at(0)->GetEvents(); + ASSERT_EQ(1, span_data_events.size()); + ASSERT_EQ("event 3", span_data_events[0].GetName()); + ASSERT_EQ(1, span_data_events[0].GetAttributes().size()); +} + TEST(Tracer, 
StartSpanWithCustomConfigDifferingConditionOrder) { std::shared_ptr noop_tracer = @@ -707,6 +752,27 @@ TEST(Tracer, SpanSetLinks) ASSERT_EQ(nostd::get(link2.GetAttributes().at("attr3")), "3"); ASSERT_EQ(nostd::get(link2.GetAttributes().at("attr4")), "4"); } + + { + + // Span link with invalid attributes + tracer + ->StartSpan("efg", {{"attr1", 1}}, + {{SpanContext(false, false), + { + {"attr2", 2}, + {"invalid_key\xff", "valid_value"}, + {"valid_key", "invalid_value\xff"}, + }}}) + ->End(); + auto spans = span_data->GetSpans(); + ASSERT_EQ(1, spans.size()); + + auto &span_data_links = spans.at(0)->GetLinks(); + ASSERT_EQ(1, span_data_links.size()); + auto link = span_data_links.at(0); + ASSERT_EQ(nostd::get(link.GetAttributes().at("attr2")), 2); + } } #if OPENTELEMETRY_ABI_VERSION_NO >= 2