diff --git a/README.md b/README.md index a61b52db44d8..6fd4a5f53b72 100644 --- a/README.md +++ b/README.md @@ -123,6 +123,7 @@ A/B testing, canary release, blue-green deployment, limit rate, defense against - **OPS friendly** - Zipkin tracing: [Zipkin](docs/en/latest/plugins/zipkin.md) + - OpenTelemetry tracing: [OpenTelemetry](docs/en/latest/plugins/opentelemetry.md) with plugin execution tracing - Open source APM: support [Apache SkyWalking](docs/en/latest/plugins/skywalking.md) - Works with external service discovery: In addition to the built-in etcd, it also supports [Consul](docs/en/latest/discovery/consul.md), [Consul_kv](docs/en/latest/discovery/consul_kv.md), [Nacos](docs/en/latest/discovery/nacos.md), [Eureka](docs/en/latest/discovery/eureka.md) and [Zookeeper (CP)](https://github.com/api7/apisix-seed/blob/main/docs/en/latest/zookeeper.md). - Monitoring And Metrics: [Prometheus](docs/en/latest/plugins/prometheus.md) diff --git a/apisix/plugin.lua b/apisix/plugin.lua index 789eb528d546..9a14fa66ddcf 100644 --- a/apisix/plugin.lua +++ b/apisix/plugin.lua @@ -38,6 +38,7 @@ local tostring = tostring local error = error local getmetatable = getmetatable local setmetatable = setmetatable +local string_format = string.format -- make linter happy to avoid error: getting the Lua global "load" -- luacheck: globals load, ignore lua_load local lua_load = load @@ -1169,6 +1170,9 @@ function _M.run_plugin(phase, plugins, api_ctx) return api_ctx end + -- Get OpenTelemetry plugin for tracing + local otel_plugin = _M.get("opentelemetry") + if phase ~= "log" and phase ~= "header_filter" and phase ~= "body_filter" @@ -1188,11 +1192,44 @@ function _M.run_plugin(phase, plugins, api_ctx) goto CONTINUE end + -- Start OpenTelemetry plugin span (with performance guard) + if otel_plugin and otel_plugin.start_plugin_span and api_ctx.otel_trace_plugins then + otel_plugin.start_plugin_span(api_ctx, plugins[i]["name"], phase) + end + run_meta_pre_function(conf, api_ctx, plugins[i]["name"]) plugin_run = true api_ctx._plugin_name = plugins[i]["name"] - local code, body = phase_func(conf, api_ctx) + local success = true + local error_msg = nil + local code, body + + -- Execute plugin with error handling + local ok, result = pcall(phase_func, conf, api_ctx) + if not ok then + -- Lua exception occurred + success = false + error_msg = string_format("plugin execution failed: %s", result) + code = 500 + body = nil + else + -- Plugin executed successfully, check return values + code, body = result, nil + if code and code >= 400 then + success = false + error_msg = string_format("plugin exited with status code %d", code) + end + end + api_ctx._plugin_name = nil + + -- Finish OpenTelemetry plugin span (with performance guard) + if otel_plugin and otel_plugin.finish_plugin_span and + api_ctx.otel_trace_plugins then + otel_plugin.finish_plugin_span(api_ctx, plugins[i]["name"], phase, success, + error_msg) + end + if code or body then if is_http then if code >= 400 then @@ -1226,11 +1263,30 @@ function _M.run_plugin(phase, plugins, api_ctx) local phase_func = plugins[i][phase] local conf = plugins[i + 1] if phase_func and meta_filter(api_ctx, plugins[i]["name"], conf) then + -- Start OpenTelemetry plugin span (with performance guard) + if otel_plugin and otel_plugin.start_plugin_span and api_ctx.otel_trace_plugins then + otel_plugin.start_plugin_span(api_ctx, plugins[i]["name"], phase) + end + plugin_run = true run_meta_pre_function(conf, api_ctx, plugins[i]["name"]) api_ctx._plugin_name = plugins[i]["name"] - phase_func(conf, api_ctx) + + local success = true + local error_msg = nil + local ok, err = pcall(phase_func, conf, api_ctx) + if not ok then + success = false + error_msg = err + end + api_ctx._plugin_name = nil + + -- Finish OpenTelemetry plugin span (with performance guard) + if otel_plugin and otel_plugin.finish_plugin_span and api_ctx.otel_trace_plugins then + otel_plugin.finish_plugin_span(api_ctx, plugins[i]["name"], phase, success, + error_msg) + end end end diff --git a/apisix/plugins/opentelemetry.lua b/apisix/plugins/opentelemetry.lua index d98ac44ae69d..0feca25ede8b 100644 --- a/apisix/plugins/opentelemetry.lua +++ b/apisix/plugins/opentelemetry.lua @@ -182,6 +182,20 @@ local schema = { type = "string", minLength = 1, } + }, + trace_plugins = { + type = "boolean", + description = "whether to trace individual plugin execution", + default = false + }, + plugin_span_kind = { + type = "string", + enum = {"internal", "server"}, + description = "span kind for plugin execution spans. " + .. "Some observability providers may exclude internal spans from metrics " + .. "and dashboards. Use 'server' if you need plugin spans included in " + .. "service-level metrics.", + default = "internal" } } } @@ -306,6 +320,119 @@ local function inject_attributes(attributes, wanted_attributes, source, with_pre end +-- Plugin tracing helper functions +local function create_plugin_span(tracer, plugin_name, phase, parent_context, api_ctx) + -- Use a more descriptive span name that includes the plugin and phase + local span_name = string_format("plugin.%s.%s", plugin_name, phase) + local attributes = { + attr.string("apisix.plugin.name", plugin_name), + attr.string("apisix.plugin.phase", phase), + attr.string("apisix.plugin.operation", "execute"), + -- Set resource name to distinguish plugin spans from main spans + attr.string("resource.name", span_name) + } + + -- Add request context attributes if available + if api_ctx and api_ctx.var then + local vars = api_ctx.var + table.insert(attributes, attr.string("http.method", vars.method or "unknown")) + table.insert(attributes, attr.string("http.target", vars.request_uri or "unknown")) + table.insert(attributes, attr.string("net.host.name", vars.host or "unknown")) + table.insert(attributes, attr.string("http.user_agent", vars.http_user_agent or "unknown")) + + -- Add route information if available + if api_ctx.route_id then + table.insert(attributes, attr.string("apisix.route_id", api_ctx.route_id)) + end + if api_ctx.route_name then + table.insert(attributes, attr.string("apisix.route_name", api_ctx.route_name)) + end + if api_ctx.curr_req_matched and api_ctx.curr_req_matched._path then + table.insert(attributes, attr.string("http.route", api_ctx.curr_req_matched._path)) + end + + -- Add service information if available + if api_ctx.service_id then + table.insert(attributes, attr.string("apisix.service_id", api_ctx.service_id)) + end + if api_ctx.service_name then + table.insert(attributes, attr.string("apisix.service_name", api_ctx.service_name)) + end + end + + -- Get the configured span kind or default to internal + local kind = span_kind.internal + if api_ctx.otel_plugin_span_kind then + kind = api_ctx.otel_plugin_span_kind + end + + -- Create child span with proper parent context + local ctx = tracer:start(parent_context, span_name, { + kind = kind, + attributes = attributes, + }) + + return ctx +end + + +local function finish_plugin_span(plugin_span_ctx, success, error_msg) + if not plugin_span_ctx then + return + end + + local span = plugin_span_ctx:span() + if not span then + return + end + + if not success then + span:set_status(span_status.ERROR, error_msg or "plugin execution failed") + end + + span:finish() +end + + +-- Store plugin spans in api_ctx for cleanup +local function store_plugin_span(api_ctx, plugin_name, phase, span_ctx) + if not api_ctx.otel_plugin_spans then + api_ctx.otel_plugin_spans = {} + end + + -- Use nested structure to avoid key collisions + if not api_ctx.otel_plugin_spans[plugin_name] then + api_ctx.otel_plugin_spans[plugin_name] = {} + end + api_ctx.otel_plugin_spans[plugin_name][phase] = { + span_ctx = span_ctx, + finished = false + } +end + + +local function cleanup_plugin_spans(api_ctx) + if not api_ctx.otel_plugin_spans then + return + end + + for plugin_name, phases in pairs(api_ctx.otel_plugin_spans) do + if phases then + for phase, span_data in pairs(phases) do + if span_data and not span_data.finished and span_data.span_ctx then + local span = span_data.span_ctx:span() + if span then + span:finish() + end + end + end + end + end + + api_ctx.otel_plugin_spans = nil +end + + function _M.rewrite(conf, api_ctx) local metadata = plugin.plugin_metadata(plugin_name) if metadata == nil then @@ -323,7 +450,7 @@ function _M.rewrite(conf, api_ctx) return end - local span_name = vars.method + local span_name = string_format("http.%s", vars.method) local attributes = { attr.string("net.host.name", vars.host), @@ -337,7 +464,7 @@ function _M.rewrite(conf, api_ctx) table.insert(attributes, attr.string("apisix.route_id", api_ctx.route_id)) table.insert(attributes, attr.string("apisix.route_name", api_ctx.route_name)) table.insert(attributes, attr.string("http.route", api_ctx.curr_req_matched._path)) - span_name = span_name .. " " .. api_ctx.curr_req_matched._path + span_name = string_format("http.%s %s", vars.method, api_ctx.curr_req_matched._path) end if api_ctx.service_id then @@ -378,6 +505,23 @@ function _M.rewrite(conf, api_ctx) api_ctx.otel_context_token = ctx:attach() + -- Store tracer and configuration for plugin tracing + if conf.trace_plugins then + api_ctx.otel_tracer = tracer + api_ctx.otel_main_context = ctx + api_ctx.otel_trace_plugins = true + + -- Map string span kind to span_kind constant + local kind_mapping = { + internal = span_kind.internal, + server = span_kind.server, + } + local span_kind_value = conf.plugin_span_kind or "internal" + api_ctx.otel_plugin_span_kind = kind_mapping[span_kind_value] or span_kind.internal + + -- Note: We don't create a span for the OpenTelemetry plugin itself to avoid recursion + end + -- inject trace context into the headers of upstream HTTP request trace_context_propagator:inject(ctx, ngx.req) end @@ -400,6 +544,14 @@ function _M.delayed_body_filter(conf, api_ctx) span:set_attributes(attr.int("http.status_code", upstream_status)) span:finish() + + -- Finish OpenTelemetry plugin span if it exists + if api_ctx.otel_trace_plugins then + _M.finish_plugin_span(api_ctx, "opentelemetry", "rewrite", true, nil) + end + + -- Cleanup plugin spans + cleanup_plugin_spans(api_ctx) end end @@ -419,7 +571,60 @@ function _M.log(conf, api_ctx) end span:finish() + + -- Cleanup plugin spans (guaranteed cleanup on request end) + cleanup_plugin_spans(api_ctx) + end +end + + +-- Public functions for plugin tracing integration +function _M.start_plugin_span(api_ctx, plugin_name, phase) + if not api_ctx.otel_trace_plugins or not api_ctx.otel_tracer then + return nil end + + -- Prevent recursion: don't trace the OpenTelemetry plugin itself + if plugin_name == "opentelemetry" then + return nil + end + + -- Always use the stored main context to ensure proper parent-child relationship + local parent_ctx = api_ctx.otel_main_context + if not parent_ctx then + return nil + end + + local plugin_span_ctx = create_plugin_span(api_ctx.otel_tracer, plugin_name, phase, parent_ctx, + api_ctx) + store_plugin_span(api_ctx, plugin_name, phase, plugin_span_ctx) + + return plugin_span_ctx +end + + +function _M.finish_plugin_span(api_ctx, plugin_name, phase, success, error_msg) + if not api_ctx.otel_trace_plugins or not api_ctx.otel_plugin_spans then + return + end + + -- Prevent recursion: don't trace the OpenTelemetry plugin itself + if plugin_name == "opentelemetry" then + return + end + + local plugin_data = api_ctx.otel_plugin_spans[plugin_name] + if not plugin_data then + return + end + + local span_data = plugin_data[phase] + if not span_data or span_data.finished then + return + end + + finish_plugin_span(span_data.span_ctx, success, error_msg) + span_data.finished = true end diff --git a/docs/en/latest/plugins/opentelemetry.md b/docs/en/latest/plugins/opentelemetry.md index 061c26212dd5..311958a0a913 100644 --- a/docs/en/latest/plugins/opentelemetry.md +++ b/docs/en/latest/plugins/opentelemetry.md @@ -222,3 +222,121 @@ You should see access log entries similar to the following when you generate req ```text {"time": "18/Feb/2024:15:09:00 +0000","opentelemetry_context_traceparent": "00-fbd0a38d4ea4a128ff1a688197bc58b0-8f4b9d9970a02629-01","opentelemetry_trace_id": "fbd0a38d4ea4a128ff1a688197bc58b0","opentelemetry_span_id": "af3dc7642104748a","remote_addr": "172.10.0.1"} ``` + +### Enable Plugin Execution Tracing + +The `trace_plugins` attribute allows you to trace individual plugin execution phases. When enabled (set to `true`), the OpenTelemetry plugin creates child spans for each plugin phase (rewrite, access, header_filter, body_filter, log) with comprehensive request context attributes. + +**Note**: Plugin tracing is **disabled by default** (`trace_plugins: false`). You must explicitly enable it to see plugin execution spans. + +#### Span Kind Configuration + +The `plugin_span_kind` attribute allows you to configure the span kind for plugin execution spans. Some observability providers may exclude `internal` spans from metrics and dashboards. + +- **Default**: `internal` - Standard internal operation span. +- **Alternative**: `server` - Treated as server-side operation, typically included in service-level metrics + +Create a Route with plugin tracing enabled: + +```shell +curl "http://127.0.0.1:9180/apisix/admin/routes/1" -X PUT \ + -H "X-API-KEY: ${admin_key}" \ + -H "Content-Type: application/json" \ + -d '{ + "uri": "/hello", + "plugins": { + "opentelemetry": { + "sampler": { + "name": "always_on" + }, + "trace_plugins": true + }, + "proxy-rewrite": { + "uri": "/get" + }, + "response-rewrite": { + "headers": { + "X-Response-Time": "$time_iso8601" + } + } + }, + "upstream": { + "type": "roundrobin", + "nodes": { + "127.0.0.1:1980": 1 + } + } + }' +``` + +When you make requests to this route, you will see: + +1. **Main request span**: `http.GET /hello` with request context +2. **Plugin execution spans**: + - `plugin.opentelemetry.rewrite` + - `plugin.proxy-rewrite.rewrite` + - `plugin.response-rewrite.header_filter` + - `plugin.response-rewrite.body_filter` + - `plugin.opentelemetry.log` + +Each plugin span includes: +- Plugin name and phase information +- HTTP method, URI, hostname, and user agent +- Route ID, route name, and matched path +- Service ID and service name (if available) + +#### Example with Custom Span Kind + +For observability providers that exclude internal spans from metrics, configure plugin spans as `server` type: + +```shell +curl "http://127.0.0.1:9180/apisix/admin/routes/1" -X PUT \ + -H "X-API-KEY: ${admin_key}" \ + -H "Content-Type: application/json" \ + -d '{ + "uri": "/hello", + "plugins": { + "opentelemetry": { + "sampler": { + "name": "always_on" + }, + "trace_plugins": true, + "plugin_span_kind": "server" + }, + "proxy-rewrite": { + "uri": "/get" + } + }, + "upstream": { + "type": "roundrobin", + "nodes": { + "127.0.0.1:1980": 1 + } + } + }' +``` + +Plugin tracing is disabled by default. If you don't need plugin tracing, you can omit the `trace_plugins` attribute: + +```shell +curl "http://127.0.0.1:9180/apisix/admin/routes/1" -X PUT \ + -H "X-API-KEY: ${admin_key}" \ + -H "Content-Type: application/json" \ + -d '{ + "uri": "/hello", + "plugins": { + "opentelemetry": { + "sampler": { + "name": "always_on" + }, + "trace_plugins": false + } + }, + "upstream": { + "type": "roundrobin", + "nodes": { + "127.0.0.1:1980": 1 + } + } + }' +``` diff --git a/t/plugin/opentelemetry-plugin-tracing.t b/t/plugin/opentelemetry-plugin-tracing.t new file mode 100644 index 000000000000..7dcfab03f71e --- /dev/null +++ b/t/plugin/opentelemetry-plugin-tracing.t @@ -0,0 +1,270 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +use t::APISIX 'no_plan'; +add_block_preprocessor(sub { + my ($block) = @_; + + if (!$block->extra_yaml_config) { + my $extra_yaml_config = <<_EOC_; +plugins: + - opentelemetry + - proxy-rewrite + - response-rewrite +_EOC_ + $block->set_value("extra_yaml_config", $extra_yaml_config); + } + + if (!$block->request) { + $block->set_value("request", "GET /t"); + } + + if (!defined $block->response_body) { + $block->set_value("response_body", "passed\n"); + } + $block; +}); +repeat_each(1); +no_long_string(); +no_root_location(); +log_level("debug"); + +run_tests; + +__DATA__ + +=== TEST 1: add plugin metadata with plugin tracing enabled +--- config + location /t { + content_by_lua_block { + local core = require("apisix.core") + local plugin = require("apisix.plugin") + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/plugin_metadata/opentelemetry', + ngx.HTTP_PUT, + [[{ + "collector": { + "address": "127.0.0.1:4318" + } + }]] + ) + if code >= 300 then + ngx.status = code + ngx.say(body) + return + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 2: create route with opentelemetry plugin and trace_plugins enabled +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/1', + ngx.HTTP_PUT, + [[{ + "uri": "/hello", + "plugins": { + "opentelemetry": { + "sampler": { + "name": "always_on" + }, + "trace_plugins": true + }, + "proxy-rewrite": { + "uri": "/get" + }, + "response-rewrite": { + "headers": { + "X-Response-Time": "$time_iso8601" + } + } + }, + "upstream": { + "type": "roundrobin", + "nodes": { + "127.0.0.1:1980": 1 + } + } + }]] + ) + if code >= 300 then + ngx.status = code + ngx.say(body) + return + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 3: test route with plugin tracing +--- request +GET /hello +--- response_body +passed +--- error_log +plugin execution span created +--- wait: 0.1 + + + +=== TEST 4: create route with opentelemetry plugin and trace_plugins disabled +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/2', + ngx.HTTP_PUT, + [[{ + "uri": "/hello2", + "plugins": { + "opentelemetry": { + "sampler": { + "name": "always_on" + }, + "trace_plugins": false + }, + "proxy-rewrite": { + "uri": "/get" + } + }, + "upstream": { + "type": "roundrobin", + "nodes": { + "127.0.0.1:1980": 1 + } + } + }]] + ) + if code >= 300 then + ngx.status = code + ngx.say(body) + return + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 5: test route without plugin tracing +--- request +GET /hello2 +--- response_body +passed +--- error_log +plugin execution span created +--- wait: 0.1 + + + +=== TEST 6: test schema validation for trace_plugins +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/3', + ngx.HTTP_PUT, + [[{ + "uri": "/hello3", + "plugins": { + "opentelemetry": { + "sampler": { + "name": "always_on" + }, + "trace_plugins": "invalid_value" + } + }, + "upstream": { + "type": "roundrobin", + "nodes": { + "127.0.0.1:1980": 1 + } + } + }]] + ) + if code >= 300 then + ngx.status = code + ngx.say(body) + return + end + ngx.say(body) + } + } +--- error_code: 400 +--- response_body +{"error_msg":"invalid configuration: property \"trace_plugins\" validation failed: wrong type: expected boolean, got string"} + + + +=== TEST 7: test default value for trace_plugins +--- config + location /t { + content_by_lua_block { + local t = require("lib.test_admin").test + local code, body = t('/apisix/admin/routes/4', + ngx.HTTP_PUT, + [[{ + "uri": "/hello4", + "plugins": { + "opentelemetry": { + "sampler": { + "name": "always_on" + } + } + }, + "upstream": { + "type": "roundrobin", + "nodes": { + "127.0.0.1:1980": 1 + } + } + }]] + ) + if code >= 300 then + ngx.status = code + ngx.say(body) + return + end + ngx.say(body) + } + } +--- response_body +passed + + + +=== TEST 8: test route with default trace_plugins (should be false) +--- request +GET /hello4 +--- response_body +passed +--- no_error_log +plugin execution span created +--- wait: 0.1