Skip to content

Commit fdc7270

Browse files
committed
Add a processing pipeline to AsyncWorker
This makes it possible to do some processing/filtering of the traces before sending them to the agent. Add a FilterRequestsOnUrl processor to remove traces of incoming requests that match a regexp.
1 parent c2df100 commit fdc7270

File tree

8 files changed

+229
-6
lines changed

8 files changed

+229
-6
lines changed

ddtrace/processors.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import re
2+
3+
from .ext import http
4+
5+
class FilterRequestsOnUrl(object):
    r"""Filter out traces from incoming http requests based on the request's url.

    This class takes as argument a list of regular expression patterns
    representing the urls to be excluded from tracing. A trace will be excluded
    if its root span contains a http.url tag and if this tag matches any of
    the provided regular expressions using the standard python regexp match
    semantic (https://docs.python.org/2/library/re.html#re.match).

    :param list regexps: the list of regular expressions (as strings) defining
        the urls that should be filtered out. (a single string is also accepted)

    Examples:

    To filter out http calls to domain api.example.com::

        FilterRequestsOnUrl(r'http://api\.example\.com')

    To filter out http calls to all first level subdomains from example.com::

        FilterRequestsOnUrl(r'http://.*\.example\.com')

    To filter out calls to both http://test.example.com and http://example.com/healthcheck::

        FilterRequestsOnUrl([r'http://test\.example\.com', r'http://example\.com/healthcheck'])
    """
    def __init__(self, regexps):
        # Accept a single pattern string as a convenience; normalize to a list.
        if isinstance(regexps, str):
            regexps = [regexps]
        self._regexps = [re.compile(regexp) for regexp in regexps]

    def process_trace(self, trace):
        """Return the trace, or None if it should be discarded.

        process_trace is called by the processing pipeline on each trace
        before it is sent to the agent, the returned value will be fed to the
        next step of the pipeline. If process_trace returns None, the whole
        trace is discarded.
        """
        for span in trace:
            # Only a root span (no parent) carries the incoming request url.
            if span.parent_id is None:
                url = span.get_tag(http.URL)
                if url is not None:
                    for regexp in self._regexps:
                        if regexp.match(url):
                            return None
        return trace

ddtrace/settings.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Key under which the processing pipeline (a list of trace processors) is
# looked up in the ``settings`` dict passed to ``Tracer.configure``.
PROCESSING_PIPELINE_KEY = "PROCESSING_PIPELINE"

# Shorter alias
PP_KEY = PROCESSING_PIPELINE_KEY

ddtrace/tracer.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from .sampler import AllSampler
88
from .writer import AgentWriter
99
from .span import Span
10+
from .settings import PP_KEY
1011
from . import compat
1112
from os import getpid
1213

@@ -70,7 +71,7 @@ async def web_handler(request):
7071
return self._context_provider(*args, **kwargs)
7172

7273
def configure(self, enabled=None, hostname=None, port=None, sampler=None,
73-
context_provider=None, wrap_executor=None):
74+
context_provider=None, wrap_executor=None, settings=None):
7475
"""
7576
Configure an existing Tracer the easy way.
7677
Allow to configure or reconfigure a Tracer instance.
@@ -90,8 +91,12 @@ def configure(self, enabled=None, hostname=None, port=None, sampler=None,
9091
if enabled is not None:
9192
self.enabled = enabled
9293

93-
if hostname is not None or port is not None:
94-
self.writer = AgentWriter(hostname or self.DEFAULT_HOSTNAME, port or self.DEFAULT_PORT)
94+
processing_pipeline = None
95+
if settings is not None and PP_KEY in settings:
96+
processing_pipeline = settings[PP_KEY]
97+
98+
if hostname is not None or port is not None or processing_pipeline is not None:
99+
self.writer = AgentWriter(hostname or self.DEFAULT_HOSTNAME, port or self.DEFAULT_PORT, processing_pipeline=processing_pipeline)
95100

96101
if sampler is not None:
97102
self.sampler = sampler

ddtrace/writer.py

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,12 @@
2222

2323
class AgentWriter(object):
2424

25-
def __init__(self, hostname='localhost', port=8126):
25+
def __init__(self, hostname='localhost', port=8126, processing_pipeline=None):
2626
self._pid = None
2727
self._traces = None
2828
self._services = None
2929
self._worker = None
30+
self._processing_pipeline = processing_pipeline
3031
self.api = api.API(hostname, port)
3132

3233
def write(self, spans=None, services=None):
@@ -52,17 +53,18 @@ def _reset_worker(self):
5253

5354
# ensure we have an active thread working on this queue
5455
if not self._worker or not self._worker.is_alive():
55-
self._worker = AsyncWorker(self.api, self._traces, self._services)
56+
self._worker = AsyncWorker(self.api, self._traces, self._services, processing_pipeline=self._processing_pipeline)
5657

5758

5859
class AsyncWorker(object):
5960

60-
def __init__(self, api, trace_queue, service_queue, shutdown_timeout=DEFAULT_TIMEOUT):
61+
def __init__(self, api, trace_queue, service_queue, shutdown_timeout=DEFAULT_TIMEOUT, processing_pipeline=None):
6162
self._trace_queue = trace_queue
6263
self._service_queue = service_queue
6364
self._lock = threading.Lock()
6465
self._thread = None
6566
self._shutdown_timeout = shutdown_timeout
67+
self._processing_pipeline = processing_pipeline
6668
self._last_error_ts = 0
6769
self.api = api
6870
self.start()
@@ -119,6 +121,13 @@ def _target(self):
119121

120122
while True:
121123
traces = self._trace_queue.pop()
124+
if traces:
125+
# Before sending the traces, make them go through the
126+
# processing pipeline
127+
try:
128+
traces = self._apply_processing_pipeline(traces)
129+
except Exception as err:
130+
log.error("error while processing traces:{0}".format(err))
122131
if traces:
123132
# If we have data, let's try to send it.
124133
try:
@@ -155,6 +164,19 @@ def _log_error_status(self, result, result_name):
155164
getattr(result, "status", None), getattr(result, "reason", None),
156165
getattr(result, "msg", None))
157166

167+
def _apply_processing_pipeline(self, traces):
168+
if self._processing_pipeline is not None:
169+
processed_traces = []
170+
for trace in traces:
171+
for processor in self._processing_pipeline:
172+
trace = processor.process_trace(trace)
173+
if trace is None:
174+
break
175+
if trace is not None:
176+
processed_traces.append(trace)
177+
return processed_traces
178+
return traces
179+
158180

159181
class Q(object):
160182
"""

docs/index.rst

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,28 @@ Users can pass along the parent_trace_id and parent_span_id via whatever method
309309
Advanced Usage
310310
--------------
311311

312+
Trace Filtering
313+
~~~~~~~~~~~~~~~
314+
315+
It is possible to filter or modify traces before they are sent to the agent by configuring the tracer with a processing pipeline. For instance, to filter out all traces of incoming requests to a specific url::
316+
317+
processing_pipeline = [FilterRequestsOnUrl(r'http://test\.example\.com')]
318+
Tracer.configure(settings={'PROCESSING_PIPELINE': processing_pipeline})
319+
320+
All the processors in the processing pipeline will be evaluated sequentially for each trace and the resulting trace will either be sent to the agent or discarded depending on the output of the pipeline.
321+
322+
**Use the standard processors**
323+
324+
The library comes with a FilterRequestsOnUrl processor that can be used to filter out incoming requests to specific urls:
325+
326+
.. autoclass:: ddtrace.processors.FilterRequestsOnUrl
327+
:members:
328+
329+
**Write a custom processor**
330+
331+
Creating your own processors is as simple as implementing a class with a process_trace method and adding it to the processing pipeline parameter of Tracer.configure. process_trace should either return a trace to be fed to the next step of the pipeline or None if the trace should be discarded. (see processors.py for example implementations)
332+
333+
312334
API
313335
~~~
314336

tests/test_integration.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
from nose.tools import eq_, ok_
1010

1111
from ddtrace.api import API
12+
from ddtrace.ext import http
13+
from ddtrace.processors import FilterRequestsOnUrl
14+
from ddtrace.settings import PP_KEY
1215
from ddtrace.span import Span
1316
from ddtrace.tracer import Tracer
1417
from ddtrace.encoding import JSONEncoder, MsgpackEncoder, get_encoder
@@ -200,6 +203,27 @@ def test_worker_http_error_logging(self):
200203
ok_('failed_to_send traces to Agent: HTTP error status 400, reason Bad Request, message Content-Type:'
201204
in logged_errors[0])
202205

206+
    def test_worker_filter_request(self):
        """End-to-end check that a FilterRequestsOnUrl processor configured
        via the settings dict keeps matching traces from reaching the agent.
        """
        self.tracer.configure(settings={PP_KEY: [FilterRequestsOnUrl(r'http://example\.com/health')]})
        # spy the send() method
        self.api = self.tracer.writer.api
        self.api._put = mock.Mock(self.api._put, wraps=self.api._put)

        # One trace whose http.url matches the filter pattern...
        span = self.tracer.trace('testing.filteredurl')
        span.set_tag(http.URL, 'http://example.com/health')
        span.finish()
        # ...and one whose url does not match.
        span = self.tracer.trace('testing.nonfilteredurl')
        span.set_tag(http.URL, 'http://example.com/api/resource')
        span.finish()
        self._wait_thread_flush()

        # Only the second trace should have been sent
        eq_(self.api._put.call_count, 1)
        # check and retrieve the right call
        endpoint, payload = self._get_endpoint_payload(self.api._put.call_args_list, '/v0.3/traces')
        eq_(endpoint, '/v0.3/traces')
        eq_(len(payload), 1)
        eq_(payload[0][0]['name'], 'testing.nonfilteredurl')
203227

204228
@skipUnless(
205229
os.environ.get('TEST_DATADOG_INTEGRATION', False),

tests/test_processors.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
from unittest import TestCase
2+
3+
from ddtrace.processors import FilterRequestsOnUrl
4+
from ddtrace.span import Span
5+
from ddtrace.ext.http import URL
6+
7+
class FilterRequestOnUrlTests(TestCase):
    """Unit tests for the FilterRequestsOnUrl processor.

    All patterns are raw strings: the original used plain strings containing
    ``\.``, an invalid escape sequence (DeprecationWarning on python >= 3.6).
    """

    def test_is_match(self):
        # A root span whose http.url matches the pattern: the trace is dropped.
        span = Span(name='Name', tracer=None)
        span.set_tag(URL, 'http://example.com')
        processor = FilterRequestsOnUrl(r'http://examp.*\.com')
        trace = processor.process_trace([span])
        self.assertIsNone(trace)

    def test_is_not_match(self):
        # A non-matching url: the trace passes through unchanged.
        span = Span(name='Name', tracer=None)
        span.set_tag(URL, 'http://anotherexample.com')
        processor = FilterRequestsOnUrl(r'http://examp.*\.com')
        trace = processor.process_trace([span])
        self.assertIsNotNone(trace)

    def test_list_match(self):
        # A list of patterns filters when any single pattern matches.
        span = Span(name='Name', tracer=None)
        span.set_tag(URL, 'http://anotherdomain.example.com')
        processor = FilterRequestsOnUrl([r'http://domain\.example\.com', r'http://anotherdomain\.example\.com'])
        trace = processor.process_trace([span])
        self.assertIsNone(trace)

    def test_list_no_match(self):
        # None of the patterns match: the trace is kept.
        span = Span(name='Name', tracer=None)
        span.set_tag(URL, 'http://cooldomain.example.com')
        processor = FilterRequestsOnUrl([r'http://domain\.example\.com', r'http://anotherdomain\.example\.com'])
        trace = processor.process_trace([span])
        self.assertIsNotNone(trace)

tests/test_writer.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
from unittest import TestCase
2+
3+
from ddtrace.span import Span
4+
from ddtrace.writer import AsyncWorker, Q
5+
6+
class RemoveAllProcessor():
    """Test processor that discards every trace it is given."""

    def process_trace(self, trace):
        """Return None so the pipeline drops the trace."""
        return None
9+
10+
class KeepAllProcessor():
    """Test processor that passes every trace through untouched."""

    def process_trace(self, trace):
        """Return the trace unchanged."""
        return trace
13+
14+
class AddTagProcessor():
    """Test processor that stamps a fixed-value tag onto every span."""

    def __init__(self, tag_name):
        # Name of the tag written onto each span of each trace.
        self.tag_name = tag_name

    def process_trace(self, trace):
        """Tag every span, then pass the trace through."""
        name = self.tag_name
        for span in trace:
            span.set_tag(name, "A value")
        return trace
21+
22+
class DummmyAPI():
    """Stand-in for the agent API that records traces instead of sending them."""

    def __init__(self):
        # Every trace handed to send_traces ends up here, in arrival order.
        self.traces = []

    def send_traces(self, traces):
        """Accumulate the given traces locally."""
        self.traces.extend(traces)
28+
29+
# Number of synthetic traces queued by AsyncWorkerTests.setUp.
N_TRACES = 11
30+
31+
class AsyncWorkerTests(TestCase):
    """Tests for the AsyncWorker processing-pipeline hook.

    Fix: the original built spans with ``parent_id=j-1 or None``, which gives
    ``-1`` (truthy) for j == 0 and ``None`` for j == 1 — so span 1, not span 0,
    was the root. The explicit conditional below makes span 0 the root.
    """

    def setUp(self):
        # Queue N_TRACES traces of 7 spans each; span 0 is the root
        # (parent_id None), every other span points at its predecessor.
        self.api = DummmyAPI()
        self.traces = Q()
        self.services = Q()
        for i in range(N_TRACES):
            self.traces.add([
                Span(tracer=None, name="name", trace_id=i, span_id=j,
                     parent_id=(j - 1) if j > 0 else None)
                for j in range(7)
            ])

    def test_processing_pipeline_keep_all(self):
        # A pass-through pipeline must deliver every trace to the API.
        processing_pipeline = [KeepAllProcessor()]
        worker = AsyncWorker(self.api, self.traces, self.services,
                             processing_pipeline=processing_pipeline)
        worker.stop()
        worker.join()
        self.assertEqual(len(self.api.traces), N_TRACES)

    def test_processing_pipeline_remove_all(self):
        # A pipeline returning None for every trace must drop them all.
        processing_pipeline = [RemoveAllProcessor()]
        worker = AsyncWorker(self.api, self.traces, self.services,
                             processing_pipeline=processing_pipeline)
        worker.stop()
        worker.join()
        self.assertEqual(len(self.api.traces), 0)

    def test_processing_pipeline_add_tag(self):
        # A mutating processor's changes must be visible on the sent spans.
        tag_name = "Tag"
        processing_pipeline = [AddTagProcessor(tag_name)]
        worker = AsyncWorker(self.api, self.traces, self.services,
                             processing_pipeline=processing_pipeline)
        worker.stop()
        worker.join()
        self.assertEqual(len(self.api.traces), N_TRACES)
        for trace in self.api.traces:
            for span in trace:
                self.assertIsNotNone(span.get_tag(tag_name))

0 commit comments

Comments
 (0)