Skip to content

Commit 830c7b3

Browse files
[AMLII-2019] Max samples per context for Histogram, Distribution and Timing metrics (Experimental Feature) (#863)
This experimental feature lets the user limit the number of samples kept per context for histogram, distribution, and timing metrics. It can be enabled with the statsd_max_samples_per_context flag. When enabled, and when aggregation is enabled, up to n samples are kept per context for Histogram, Distribution, and Timing metrics. The default value is 0, which means no limit.
1 parent 5482cf4 commit 830c7b3

File tree

8 files changed

+337
-38
lines changed

8 files changed

+337
-38
lines changed

datadog/__init__.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ def initialize(
4343
statsd_use_default_route=False, # type: bool
4444
statsd_socket_path=None, # type: Optional[str]
4545
statsd_namespace=None, # type: Optional[str]
46+
statsd_max_samples_per_context=0, # type: Optional[int]
4647
statsd_constant_tags=None, # type: Optional[List[str]]
4748
return_raw_response=False, # type: bool
4849
hostname_from_config=True, # type: bool
@@ -82,8 +83,12 @@ def initialize(
8283
(default: True).
8384
:type statsd_disable_aggregation: boolean
8485
86+
:param statsd_max_samples_per_context: Set the max samples per context for Histogram,
87+
Distribution and Timing metrics. Use with the statsd_disable_aggregation set to False.
88+
:type statsd_max_samples_per_context: int
89+
8590
:param statsd_aggregation_flush_interval: If aggregation is enabled, set the flush interval for
86-
aggregation/buffering
91+
aggregation/buffering (This feature is experimental)
8792
(default: 0.3 seconds)
8893
:type statsd_aggregation_flush_interval: float
8994
@@ -142,7 +147,7 @@ def initialize(
142147
if statsd_disable_aggregation:
143148
statsd.disable_aggregation()
144149
else:
145-
statsd.enable_aggregation(statsd_aggregation_flush_interval)
150+
statsd.enable_aggregation(statsd_aggregation_flush_interval, statsd_max_samples_per_context)
146151
statsd.disable_buffering = statsd_disable_buffering
147152
api._return_raw_response = return_raw_response
148153

datadog/dogstatsd/aggregator.py

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,28 @@
44
GaugeMetric,
55
SetMetric,
66
)
7+
from datadog.dogstatsd.max_sample_metric import (
8+
HistogramMetric,
9+
DistributionMetric,
10+
TimingMetric
11+
)
712
from datadog.dogstatsd.metric_types import MetricType
13+
from datadog.dogstatsd.max_sample_metric_context import MaxSampleMetricContexts
814

915

1016
class Aggregator(object):
11-
def __init__(self):
17+
def __init__(self, max_samples_per_context=0):
18+
self.max_samples_per_context = max_samples_per_context
1219
self.metrics_map = {
1320
MetricType.COUNT: {},
1421
MetricType.GAUGE: {},
1522
MetricType.SET: {},
1623
}
24+
self.max_sample_metric_map = {
25+
MetricType.HISTOGRAM: MaxSampleMetricContexts(HistogramMetric),
26+
MetricType.DISTRIBUTION: MaxSampleMetricContexts(DistributionMetric),
27+
MetricType.TIMING: MaxSampleMetricContexts(TimingMetric)
28+
}
1729
self._locks = {
1830
MetricType.COUNT: threading.RLock(),
1931
MetricType.GAUGE: threading.RLock(),
@@ -28,6 +40,18 @@ def flush_aggregated_metrics(self):
2840
self.metrics_map[metric_type] = {}
2941
for metric in current_metrics.values():
3042
metrics.extend(metric.get_data() if isinstance(metric, SetMetric) else [metric])
43+
44+
return metrics
45+
46+
def set_max_samples_per_context(self, max_samples_per_context=0):
    """Update the per-context cap on stored samples (0 means unlimited)."""
    self.max_samples_per_context = max_samples_per_context
48+
49+
def flush_aggregated_sampled_metrics(self):
    """Drain every max-sample metric context and return the flattened list of samples."""
    flushed = []
    for context in self.max_sample_metric_map.values():
        # Each context flushes to a list of per-metric sample lists; flatten them.
        for sample_batch in context.flush():
            flushed.extend(sample_batch)
    return flushed
3256

3357
def get_context(self, name, tags):
@@ -60,3 +84,27 @@ def add_metric(
6084
self.metrics_map[metric_type][context] = metric_class(
6185
name, value, tags, rate, timestamp
6286
)
87+
88+
def histogram(self, name, value, tags, rate):
    """Record one histogram sample through the max-sample aggregation path."""
    return self.add_max_sample_metric(MetricType.HISTOGRAM, name, value, tags, rate)
92+
93+
def distribution(self, name, value, tags, rate):
    """Record one distribution sample through the max-sample aggregation path."""
    return self.add_max_sample_metric(MetricType.DISTRIBUTION, name, value, tags, rate)
97+
98+
def timing(self, name, value, tags, rate):
    """Record one timing sample through the max-sample aggregation path."""
    return self.add_max_sample_metric(MetricType.TIMING, name, value, tags, rate)
102+
103+
def add_max_sample_metric(self, metric_type, name, value, tags, rate):
    """Route a single sample into the context store for the given metric type.

    A missing rate is treated as 1 (always keep); the current
    max_samples_per_context cap is forwarded so new contexts are sized correctly.
    """
    effective_rate = 1 if rate is None else rate
    context_key = self.get_context(name, tags)
    return self.max_sample_metric_map[metric_type].sample(
        name, value, tags, effective_rate, context_key, self.max_samples_per_context
    )

datadog/dogstatsd/base.py

Lines changed: 32 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ def __init__(
160160
telemetry_port=None, # type: Union[str, int]
161161
telemetry_socket_path=None, # type: Text
162162
max_buffer_len=0, # type: int
163+
max_metric_samples_per_context=0, # type: int
163164
container_id=None, # type: Optional[Text]
164165
origin_detection_enabled=True, # type: bool
165166
socket_timeout=0, # type: Optional[float]
@@ -236,9 +237,14 @@ def __init__(
236237
it overrides the default value.
237238
:type flush_interval: float
238239
239-
:disable_aggregation: If true, metrics (Count, Gauge, Set) are no longered aggregated by the client
240+
:disable_aggregation: If true, metrics (Count, Gauge, Set) are no longer aggregated by the client
240241
:type disable_aggregation: bool
241242
243+
:max_metric_samples_per_context: Sets the maximum amount of samples for Histogram, Distribution
244+
and Timings metrics (default 0). This feature should be used alongside aggregation. This feature
245+
is experimental.
246+
:type max_metric_samples_per_context: int
247+
242248
:disable_buffering: If set, metrics are no longered buffered by the client and
243249
all data is sent synchronously to the server
244250
:type disable_buffering: bool
@@ -450,7 +456,7 @@ def __init__(
450456
self._flush_interval = flush_interval
451457
self._flush_thread = None
452458
self._flush_thread_stop = threading.Event()
453-
self.aggregator = Aggregator()
459+
self.aggregator = Aggregator(max_metric_samples_per_context)
454460
# Indicates if the process is about to fork, so we shouldn't start any new threads yet.
455461
self._forking = False
456462

@@ -643,10 +649,11 @@ def disable_aggregation(self):
643649
self._stop_flush_thread()
644650
log.debug("Statsd aggregation is disabled")
645651

646-
def enable_aggregation(self, flush_interval=DEFAULT_BUFFERING_FLUSH_INTERVAL):
652+
def enable_aggregation(self, flush_interval=DEFAULT_BUFFERING_FLUSH_INTERVAL, max_samples_per_context=0):
647653
with self._config_lock:
648654
if not self._disable_aggregation:
649655
return
656+
self.aggregator.set_max_samples_per_context(max_samples_per_context)
650657
self._disable_aggregation = False
651658
self._flush_interval = flush_interval
652659
if self._disable_buffering:
@@ -826,6 +833,10 @@ def flush_aggregated_metrics(self):
826833
for m in metrics:
827834
self._report(m.name, m.metric_type, m.value, m.tags, m.rate, m.timestamp)
828835

836+
sampled_metrics = self.aggregator.flush_aggregated_sampled_metrics()
837+
for m in sampled_metrics:
838+
self._report(m.name, m.metric_type, m.value, m.tags, m.rate, m.timestamp, False)
839+
829840
def gauge(
830841
self,
831842
metric, # type: Text
@@ -960,7 +971,10 @@ def histogram(
960971
>>> statsd.histogram("uploaded.file.size", 1445)
961972
>>> statsd.histogram("album.photo.count", 26, tags=["gender:female"])
962973
"""
963-
self._report(metric, "h", value, tags, sample_rate)
974+
if not self._disable_aggregation and self.aggregator.max_samples_per_context != 0:
975+
self.aggregator.histogram(metric, value, tags, sample_rate)
976+
else:
977+
self._report(metric, "h", value, tags, sample_rate)
964978

965979
def distribution(
966980
self,
@@ -975,7 +989,10 @@ def distribution(
975989
>>> statsd.distribution("uploaded.file.size", 1445)
976990
>>> statsd.distribution("album.photo.count", 26, tags=["gender:female"])
977991
"""
978-
self._report(metric, "d", value, tags, sample_rate)
992+
if not self._disable_aggregation and self.aggregator.max_samples_per_context != 0:
993+
self.aggregator.distribution(metric, value, tags, sample_rate)
994+
else:
995+
self._report(metric, "d", value, tags, sample_rate)
979996

980997
def timing(
981998
self,
@@ -989,7 +1006,10 @@ def timing(
9891006
9901007
>>> statsd.timing("query.response.time", 1234)
9911008
"""
992-
self._report(metric, "ms", value, tags, sample_rate)
1009+
if not self._disable_aggregation and self.aggregator.max_samples_per_context != 0:
1010+
self.aggregator.timing(metric, value, tags, sample_rate)
1011+
else:
1012+
self._report(metric, "ms", value, tags, sample_rate)
9931013

9941014
def timed(self, metric=None, tags=None, sample_rate=None, use_ms=None):
9951015
"""
@@ -1093,7 +1113,7 @@ def _serialize_metric(
10931113
("|T" + text(timestamp)) if timestamp > 0 else "",
10941114
)
10951115

1096-
def _report(self, metric, metric_type, value, tags, sample_rate, timestamp=0):
1116+
def _report(self, metric, metric_type, value, tags, sample_rate, timestamp=0, sampling=True):
10971117
"""
10981118
Create a metric packet and send it.
10991119
@@ -1109,11 +1129,12 @@ def _report(self, metric, metric_type, value, tags, sample_rate, timestamp=0):
11091129
if self._telemetry:
11101130
self.metrics_count += 1
11111131

1112-
if sample_rate is None:
1113-
sample_rate = self.default_sample_rate
1132+
if sampling:
1133+
if sample_rate is None:
1134+
sample_rate = self.default_sample_rate
11141135

1115-
if sample_rate != 1 and random() > sample_rate:
1116-
return
1136+
if sample_rate != 1 and random() > sample_rate:
1137+
return
11171138
# timestamps (protocol v1.3) only allowed on gauges and counts
11181139
allows_timestamp = metric_type == MetricType.GAUGE or metric_type == MetricType.COUNT
11191140

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
import random
2+
from datadog.dogstatsd.metric_types import MetricType
3+
from datadog.dogstatsd.metrics import MetricAggregator
4+
from threading import Lock
5+
6+
7+
class MaxSampleMetric(object):
    """A metric context that keeps at most ``max_metric_samples`` samples.

    When the cap is positive, samples beyond the cap are kept via reservoir
    sampling so the retained set is a uniform sample of everything seen.
    A cap of 0 means "keep everything". ``flush()`` emits each kept sample
    with a rate of kept/seen so the server can compensate for the dropped ones.
    """

    def __init__(self, name, tags, metric_type, specified_rate=1.0, max_metric_samples=0):
        self.name = name
        self.tags = tags
        self.lock = Lock()
        self.metric_type = metric_type
        self.max_metric_samples = max_metric_samples
        self.specified_rate = specified_rate
        # Pre-sized buffer when capped; grow-on-demand list when unlimited.
        self.data = [None] * max_metric_samples if max_metric_samples > 0 else []
        self.stored_metric_samples = 0  # samples currently held in self.data
        self.total_metric_samples = 0   # samples offered (held + dropped/skipped)

    def sample(self, value):
        """Store ``value`` unconditionally. Caller must hold ``self.lock``."""
        if self.max_metric_samples == 0:
            self.data.append(value)
        else:
            self.data[self.stored_metric_samples] = value
        self.stored_metric_samples += 1
        self.total_metric_samples += 1

    def maybe_keep_sample_work_unsafe(self, value):
        """Offer ``value`` under reservoir sampling. Caller must hold ``self.lock``.

        While under the cap, the sample is always kept; once full, it replaces
        a random existing slot with probability cap/total (Algorithm R).
        """
        if self.max_metric_samples > 0:
            self.total_metric_samples += 1
            if self.stored_metric_samples < self.max_metric_samples:
                self.data[self.stored_metric_samples] = value
                self.stored_metric_samples += 1
            else:
                i = random.randint(0, self.total_metric_samples - 1)
                if i < self.max_metric_samples:
                    self.data[i] = value
        else:
            self.sample(value)

    def skip_sample(self):
        """Count a sample that was dropped by client-side rate sampling."""
        self.total_metric_samples += 1

    def flush(self):
        """Return the kept samples as MetricAggregator entries and keep state intact.

        The kept/seen rate is computed under the lock (it reads both counters,
        so computing it outside raced with concurrent samplers), and an empty
        context flushes to [] instead of dividing by zero.
        """
        with self.lock:
            if self.total_metric_samples == 0:
                return []
            rate = self.stored_metric_samples / self.total_metric_samples
            return [
                MetricAggregator(self.name, self.tags, rate, self.metric_type, self.data[i])
                for i in range(self.stored_metric_samples)
            ]
50+
51+
52+
class HistogramMetric(MaxSampleMetric):
    """MaxSampleMetric specialized for histogram ("h") samples."""

    def __init__(self, name, tags, rate=1.0, max_metric_samples=0):
        # Only binds the histogram metric type; all sampling logic lives in the base class.
        super(HistogramMetric, self).__init__(name, tags, MetricType.HISTOGRAM, rate, max_metric_samples)
55+
56+
57+
class DistributionMetric(MaxSampleMetric):
    """MaxSampleMetric specialized for distribution ("d") samples."""

    def __init__(self, name, tags, rate=1.0, max_metric_samples=0):
        # Only binds the distribution metric type; all sampling logic lives in the base class.
        super(DistributionMetric, self).__init__(name, tags, MetricType.DISTRIBUTION, rate, max_metric_samples)
60+
61+
62+
class TimingMetric(MaxSampleMetric):
    """MaxSampleMetric specialized for timing ("ms") samples."""

    def __init__(self, name, tags, rate=1.0, max_metric_samples=0):
        # Only binds the timing metric type; all sampling logic lives in the base class.
        super(TimingMetric, self).__init__(name, tags, MetricType.TIMING, rate, max_metric_samples)
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
from threading import Lock
2+
import random
3+
4+
5+
class MaxSampleMetricContexts:
    """Thread-safe store of MaxSampleMetric instances, keyed by metric context.

    One instance exists per metric type (histogram/distribution/timing); each
    distinct (name, tags) context gets its own capped metric object.
    """

    def __init__(self, max_sample_metric_type):
        # max_sample_metric_type is the metric class to instantiate per context
        # (HistogramMetric, DistributionMetric or TimingMetric).
        self.lock = Lock()
        self.values = {}
        self.max_sample_metric_type = max_sample_metric_type

    def flush(self):
        """Flush all contexts and reset the store.

        Returns a list of per-context sample lists (one entry per context).
        """
        # Swap the map under the lock so concurrent samplers start fresh
        # contexts; the per-metric flushes then happen outside our lock.
        with self.lock:
            flushed, self.values = self.values, {}
        return [metric.flush() for metric in flushed.values()]

    def sample(self, name, value, tags, rate, context_key, max_samples_per_context):
        """Offer one sample to the context's metric, creating it on first use."""
        keeping_sample = self.should_sample(rate)
        with self.lock:
            if context_key not in self.values:
                # Pass the cap by keyword: the constructors are
                # (name, tags, rate=1.0, max_metric_samples=0), so passing it
                # positionally landed in the rate slot and left the cap at 0.
                self.values[context_key] = self.max_sample_metric_type(
                    name, tags, max_metric_samples=max_samples_per_context
                )
            metric = self.values[context_key]
            # "with" guarantees the metric lock is released even if the
            # sampling call raises (bare acquire/release did not).
            with metric.lock:
                if keeping_sample:
                    metric.maybe_keep_sample_work_unsafe(value)
                else:
                    metric.skip_sample()

    def should_sample(self, rate):
        """Return True if a sample should be kept given the client-side rate."""
        if rate >= 1:
            return True
        return random.random() < rate

datadog/dogstatsd/metric_types.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,6 @@ class MetricType:
22
COUNT = "c"
33
GAUGE = "g"
44
SET = "s"
5+
HISTOGRAM = "h"
6+
DISTRIBUTION = "d"
7+
TIMING = "ms"

0 commit comments

Comments
 (0)