Skip to content

Commit dc806e2

Browse files
Propagate context internal tags through http headers (#3045)
* add tagset module * Allow encoding a subclass of dict * Propagate context meta through http headers * Add HTTPPropagator.extract benchmark * fix typing issues * expand benchmark test cases * fix flake8 issue * fix type casting * use ensure_str instead of ensure_text * add tests for invalid tags * add scaffolding for http inject benchmarks * replace existing benchmarks with new inject/extract scenarios * fix benchmark scenario * disallow leading commas, remove runtime type check * add warning logs * add test case for handling of unicode keys and values * remove unused imports * Update benchmarks/http_propagation_extract/config.yaml * do not propagate on any error * Update ddtrace/propagation/http.py * fix tag assertion * do not encode tags when we previously had an error Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
1 parent d6d1705 commit dc806e2

File tree

10 files changed

+448
-75
lines changed

10 files changed

+448
-75
lines changed
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
# No headers provided
2+
empty_headers: &default_values
3+
headers: "{}"
4+
extra_headers: 0
5+
wsgi_style: False
6+
7+
# 20 headers, but none that we expect
8+
medium_header_no_matches: &medium_header_no_matches
9+
headers: "{}"
10+
extra_headers: 20
11+
wsgi_style: False
12+
13+
# 100 headers, but none that we expect
14+
large_header_no_matches: &large_header_no_matches
15+
headers: "{}"
16+
extra_headers: 100
17+
wsgi_style: False
18+
19+
# Only trace id/span id/priority
20+
valid_headers_basic: &valid_headers_basic
21+
<<: *default_values
22+
headers: |
23+
{"x-datadog-trace-id": "1234", "x-datadog-span-id": "5678", "x-datadog-sampling-priority": "1"}
24+
25+
# All possible headers we expect
26+
valid_headers_all: &valid_headers_all
27+
<<: *default_values
28+
headers: |
29+
{"x-datadog-trace-id": "1234", "x-datadog-span-id": "5678", "x-datadog-sampling-priority": "1", "x-datadog-origin": "synthetics", "x-datadog-tags": "_dd.p.tag=value,_dd.p.some_long_key=some_long_value"}
30+
31+
# All valid/possible headers but 20 additional unrelated headers
32+
medium_valid_headers_all: &medium_valid_headers_all
33+
<<: *valid_headers_all
34+
extra_headers: 20
35+
36+
# All valid/possible headers but 100 additional unrelated headers
37+
large_valid_headers_all: &large_valid_headers_all
38+
<<: *valid_headers_all
39+
extra_headers: 100
40+
41+
# x-datadog-trace-id is invalid
42+
invalid_trace_id_header: &invalid_trace_id_header
43+
<<: *default_values
44+
headers: |
45+
{"x-datadog-trace-id": "trace_id", "x-datadog-span-id": "5678", "x-datadog-sampling-priority": "1", "x-datadog-origin": "synthetics", "x-datadog-tags": "_dd.p.tag=value,_dd.p.some_long_key=some_long_value"}
46+
47+
# x-datadog-span-id is invalid
48+
invalid_span_id_header: &invalid_span_id_header
49+
<<: *default_values
50+
headers: |
51+
{"x-datadog-trace-id": "1234", "x-datadog-span-id": "span_id", "x-datadog-sampling-priority": "1", "x-datadog-origin": "synthetics", "x-datadog-tags": "_dd.p.tag=value,_dd.p.some_long_key=some_long_value"}
52+
53+
# x-datadog-sampling-priority is invalid
54+
invalid_priority_header: &invalid_priority_header
55+
<<: *default_values
56+
headers: |
57+
{"x-datadog-trace-id": "1234", "x-datadog-span-id": "5678", "x-datadog-sampling-priority": "priority", "x-datadog-origin": "synthetics", "x-datadog-tags": "_dd.p.tag=value,_dd.p.some_long_key=some_long_value"}
58+
59+
# x-datadog-tags is invalid
60+
invalid_tags_header: &invalid_tags_header
61+
<<: *default_values
62+
headers: |
63+
{"x-datadog-trace-id": "1234", "x-datadog-span-id": "5678", "x-datadog-sampling-priority": "1", "x-datadog-origin": "synthetics", "x-datadog-tags": "_dd.p.tag=value,_dd.p.some_long_key=some_long_value,key=,=value,"}
64+
65+
66+
# Same scenarios as above but with HTTP_WSGI_STYLE_HEADERS
67+
wsgi_empty_headers:
68+
<<: *default_values
69+
wsgi_style: True
70+
71+
wsgi_medium_header_no_matches:
72+
<<: *medium_header_no_matches
73+
wsgi_style: True
74+
75+
wsgi_large_header_no_matches:
76+
<<: *large_header_no_matches
77+
wsgi_style: True
78+
79+
wsgi_valid_headers_basic:
80+
<<: *valid_headers_basic
81+
wsgi_style: True
82+
83+
wsgi_valid_headers_all:
84+
<<: *valid_headers_all
85+
wsgi_style: True
86+
87+
wsgi_medium_valid_headers_all:
88+
<<: *medium_valid_headers_all
89+
wsgi_style: True
90+
91+
wsgi_large_valid_headers_all:
92+
<<: *large_valid_headers_all
93+
wsgi_style: True
94+
95+
wsgi_invalid_trace_id_header:
96+
<<: *invalid_trace_id_header
97+
wsgi_style: True
98+
99+
# WSGI-style variant of invalid_span_id_header. It must merge the
# invalid_span_id_header anchor (not default_values) so that this scenario
# actually exercises an invalid x-datadog-span-id, mirroring every other
# wsgi_* entry which merges its corresponding non-WSGI anchor.
wsgi_invalid_span_id_header:
  <<: *invalid_span_id_header
  wsgi_style: True
102+
103+
wsgi_invalid_priority_header:
104+
<<: *invalid_priority_header
105+
wsgi_style: True
106+
107+
wsgi_invalid_tags_header:
108+
<<: *invalid_tags_header
109+
wsgi_style: True
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import json
2+
3+
import bm
4+
5+
from ddtrace.propagation import http
6+
from ddtrace.propagation import utils
7+
8+
9+
class HTTPPropagationExtract(bm.Scenario):
    """Benchmark scenario for ``HTTPPropagator.extract``.

    Builds a header mapping from the configured JSON blob (optionally
    rewritten into WSGI style and padded with unrelated headers), then
    times repeated ``extract`` calls against that mapping.
    """

    headers = bm.var(type=str)
    extra_headers = bm.var(type=int)
    wsgi_style = bm.var(type=bool)

    def generate_headers(self):
        # Start from the scenario's configured headers (a JSON object).
        parsed = json.loads(self.headers)
        if self.wsgi_style:
            # Rewrite every header name the way a WSGI server would present it.
            parsed = dict(
                (utils.get_wsgi_header(name), value) for name, value in parsed.items()
            )

        # Pad with unrelated headers that the propagator has to skip over.
        for idx in range(self.extra_headers):
            name = "x-test-header-{}".format(idx)
            if self.wsgi_style:
                name = utils.get_wsgi_header(name)
            parsed[name] = str(idx)

        return parsed

    def run(self):
        # Header generation happens once, outside the timed loop, so only
        # extract() itself is measured.
        headers = self.generate_headers()

        def _(loops):
            for _ in range(loops):
                http.HTTPPropagator.extract(headers)

        yield _
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
ids_only: &defaults
2+
sampling_priority: ""
3+
dd_origin: ""
4+
meta: ""
5+
6+
with_sampling_priority:
7+
<<: *defaults
8+
sampling_priority: "1"
9+
10+
with_dd_origin:
11+
<<: *defaults
12+
dd_origin: "synthetics"
13+
14+
with_priority_and_origin:
15+
<<: *defaults
16+
sampling_priority: "1"
17+
dd_origin: "synthetics"
18+
19+
with_tags:
20+
<<: *defaults
21+
meta: |
22+
{"_dd.p.test": "value", "_dd.p.sample": "value", "will.be": "skipped"}
23+
24+
with_all:
25+
<<: *defaults
26+
sampling_priority: "1"
27+
dd_origin: "synthetics"
28+
meta: |
29+
{"_dd.p.test": "value", "_dd.p.sample": "value", "will.be": "skipped"}
30+
31+
with_tags_invalid:
32+
<<: *defaults
33+
meta: |
34+
{"_dd.p.test": "value", "_dd.p.test=": "=value,"}
35+
36+
with_tags_max_size:
37+
<<: *defaults
38+
# The limit is 512, so one of these can be encoded, but not both
39+
meta: |
40+
{"_dd.p.test": "______________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________", "_dd.p.sample": "______________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________"}
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import json
2+
3+
import bm
4+
5+
from ddtrace.context import Context
6+
from ddtrace.propagation import http
7+
8+
9+
class HTTPPropagationInject(bm.Scenario):
    """Benchmark scenario for ``HTTPPropagator.inject``.

    Builds a single ``Context`` from the configured scenario values and
    times repeated ``inject`` calls into a fresh, empty carrier dict.
    """

    sampling_priority = bm.var(type=str)
    dd_origin = bm.var(type=str)
    meta = bm.var(type=str)

    def run(self):
        # An empty string means "not set" for each optional input.
        priority = int(self.sampling_priority) if self.sampling_priority != "" else None
        origin = self.dd_origin if self.dd_origin else None
        meta = json.loads(self.meta) if self.meta else None

        ctx = Context(
            trace_id=8336172473188639332,
            span_id=6804240797025004118,
            sampling_priority=priority,
            dd_origin=origin,
            meta=meta,
        )

        def _(loops):
            for _ in range(loops):
                # Just pass in a new/empty dict, we don't care about the result
                http.HTTPPropagator.inject(ctx, {})

        yield _

ddtrace/internal/_tagset.pyx

Lines changed: 25 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,27 @@ cpdef dict decode_tagset_string(str tagset):
135135

136136
return res
137137

138+
cdef bint _key_is_valid(str key):
    """Return 1 when ``key`` is non-empty and every character is an allowed key character."""
    # An empty (or None) key is never valid.
    if not key:
        return 0

    # Reject the key as soon as a single disallowed character is found.
    for ch in key:
        if not is_valid_key_char(ord(ch)):
            return 0
    return 1
147+
148+
149+
cdef bint _value_is_valid(str value):
150+
"""Helper to ensure a values's characters are all valid"""
151+
if not value:
152+
return 0
153+
154+
for c in value:
155+
if not is_valid_key_char(ord(c)):
156+
return 0
157+
return 1
158+
138159

139160
cpdef str encode_tagset_values(object values, int max_size=512):
140161
# type: (Dict[str, str], int) -> str
@@ -164,20 +185,10 @@ cpdef str encode_tagset_values(object values, int max_size=512):
164185
key = key.strip(" ")
165186
value = value.strip(" ")
166187

167-
if not key:
168-
raise TagsetEncodeError("Key cannot be empty")
169-
if not value:
170-
raise TagsetEncodeError("Value cannot be empty")
171-
172-
# Disallow " ", ",", and "=" in keys
173-
for c in (" ", ",", "="):
174-
if c in key:
175-
raise TagsetEncodeError("Unexpected {!r} in key {!r}".format(c, key))
176-
177-
# Disallow "," and "=" in keys
178-
for c in (",", "="):
179-
if c in value:
180-
raise TagsetEncodeError("Unexpected {!r} in value {!r}".format(c, value))
188+
if not _key_is_valid(key):
189+
raise TagsetEncodeError("Key is not valid: {!r}".format(key))
190+
if not _value_is_valid(value):
191+
raise TagsetEncodeError("Value is not valid: {!r}".format(value))
181192

182193
encoded = "{}={}".format(key, value)
183194
# Prefix every item except the first with `,` for separator

ddtrace/internal/compat.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
reload_module = six.moves.reload_module
4949

5050
ensure_text = six.ensure_text
51+
ensure_str = six.ensure_str
5152
stringify = six.text_type
5253
string_type = six.string_types[0]
5354
binary_type = six.binary_type

ddtrace/propagation/http.py

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,16 @@
11
from typing import Dict
22
from typing import FrozenSet
33
from typing import Optional
4+
from typing import Union
5+
from typing import cast
46

57
from ..context import Context
8+
from ..internal._tagset import TagsetDecodeError
9+
from ..internal._tagset import TagsetEncodeError
10+
from ..internal._tagset import TagsetMaxSizeError
11+
from ..internal._tagset import decode_tagset_string
12+
from ..internal._tagset import encode_tagset_values
13+
from ..internal.compat import ensure_str
614
from ..internal.logger import get_logger
715
from .utils import get_wsgi_header
816

@@ -15,6 +23,7 @@
1523
HTTP_HEADER_PARENT_ID = "x-datadog-parent-id"
1624
HTTP_HEADER_SAMPLING_PRIORITY = "x-datadog-sampling-priority"
1725
HTTP_HEADER_ORIGIN = "x-datadog-origin"
26+
HTTP_HEADER_TAGS = "x-datadog-tags"
1827

1928

2029
# Note that due to WSGI spec we have to also check for uppercased and prefixed
@@ -25,6 +34,7 @@
2534
[HTTP_HEADER_SAMPLING_PRIORITY, get_wsgi_header(HTTP_HEADER_SAMPLING_PRIORITY).lower()]
2635
)
2736
POSSIBLE_HTTP_HEADER_ORIGIN = frozenset([HTTP_HEADER_ORIGIN, get_wsgi_header(HTTP_HEADER_ORIGIN).lower()])
37+
POSSIBLE_HTTP_HEADER_TAGS = frozenset([HTTP_HEADER_TAGS, get_wsgi_header(HTTP_HEADER_TAGS).lower()])
2838

2939

3040
class HTTPPropagator(object):
@@ -60,6 +70,34 @@ def parent_call():
6070
if span_context.dd_origin is not None:
6171
headers[HTTP_HEADER_ORIGIN] = str(span_context.dd_origin)
6272

73+
# Do not try to encode tags if we have already tried and received an error
74+
if "_dd.propagation_error" in span_context._meta:
75+
return
76+
77+
# Only propagate tags that start with `_dd.p.`
78+
tags_to_encode = {} # type: Dict[str, str]
79+
for key, value in span_context._meta.items():
80+
# DEV: encoding will fail if the key or value are not `str`
81+
key = ensure_str(key)
82+
if key.startswith("_dd.p."):
83+
tags_to_encode[key] = ensure_str(value)
84+
85+
if tags_to_encode:
86+
encoded_tags = None
87+
88+
try:
89+
encoded_tags = encode_tagset_values(tags_to_encode)
90+
except TagsetMaxSizeError:
91+
# We hit the max size allowed, add a tag to the context to indicate this happened
92+
span_context._meta["_dd.propagation_error"] = "max_size"
93+
log.warning("failed to encode x-datadog-tags", exc_info=True)
94+
except TagsetEncodeError:
95+
# We hit an encoding error, add a tag to the context to indicate this happened
96+
span_context._meta["_dd.propagation_error"] = "encoding_error"
97+
log.warning("failed to encode x-datadog-tags", exc_info=True)
98+
if encoded_tags:
99+
headers[HTTP_HEADER_TAGS] = encoded_tags
100+
63101
@staticmethod
64102
def _extract_header_value(possible_header_names, headers, default=None):
65103
# type: (FrozenSet[str], Dict[str, str], Optional[str]) -> Optional[str]
@@ -117,6 +155,19 @@ def my_controller(url, headers):
117155
POSSIBLE_HTTP_HEADER_ORIGIN,
118156
normalized_headers,
119157
)
158+
meta = None
159+
tags_value = HTTPPropagator._extract_header_value(
160+
POSSIBLE_HTTP_HEADER_TAGS,
161+
normalized_headers,
162+
default="",
163+
)
164+
if tags_value:
165+
# Do not fail if the tags are malformed
166+
try:
167+
# We get a Dict[str, str], but need it to be Dict[Union[str, bytes], str] (e.g. _MetaDictType)
168+
meta = cast(Dict[Union[str, bytes], str], decode_tagset_string(tags_value))
169+
except TagsetDecodeError:
170+
log.debug("failed to decode x-datadog-tags: %r", tags_value, exc_info=True)
120171

121172
# Try to parse values into their expected types
122173
try:
@@ -131,15 +182,20 @@ def my_controller(url, headers):
131182
span_id=int(parent_span_id) or None, # type: ignore[arg-type]
132183
sampling_priority=sampling_priority, # type: ignore[arg-type]
133184
dd_origin=origin,
185+
meta=meta,
134186
)
135187
# If headers are invalid and cannot be parsed, return a new context and log the issue.
136188
except (TypeError, ValueError):
137189
log.debug(
138-
"received invalid x-datadog-* headers, trace-id: %r, parent-id: %r, priority: %r, origin: %r",
190+
(
191+
"received invalid x-datadog-* headers, "
192+
"trace-id: %r, parent-id: %r, priority: %r, origin: %r, tags: %r"
193+
),
139194
trace_id,
140195
parent_span_id,
141196
sampling_priority,
142197
origin,
198+
tags_value,
143199
)
144200
return Context()
145201
except Exception:

0 commit comments

Comments
 (0)