Skip to content

Commit f278483

Browse files
authored
perf(profiling): implement and use cheaper groupby (#4936)
## Description The use of the `itertools.groupby` function requires the collection that is passed to it to be sorted for the intended use. We make our own implementation of the `groupby` operation to avoid incurring in the extra cost of sorting, thus keeping the time complexity linear with the collection size. ## Checklist - [ ] Followed the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/contributing.html#Release-Note-Guidelines) when writing a release note. - [ ] Add additional sections for `feat` and `fix` pull requests. - [ ] [Library documentation](https://github.com/DataDog/dd-trace-py/tree/1.x/docs) and/or [Datadog's documentation site](https://github.com/DataDog/documentation/) is updated. Link to doc PR in description. ## Reviewer Checklist - [ ] Title is accurate. - [ ] Description motivates each change. - [ ] No unnecessary changes were introduced in this PR. - [ ] Avoid breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes unless absolutely necessary. - [ ] Tests provided or description of manual testing performed is included in the code or PR. - [ ] Release note has been added and follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/contributing.html#Release-Note-Guidelines), or else `changelog/no-changelog` label added. - [ ] All relevant GitHub issues are correctly linked. - [ ] Change contains telemetry where appropriate (logs, metrics, etc.). - [ ] Telemetry is meaningful, actionable and does not have the potential to leak sensitive data.
1 parent ea51750 commit f278483

File tree

6 files changed

+49
-2795
lines changed

6 files changed

+49
-2795
lines changed

ddtrace/profiling/exporter/pprof.pyx

Lines changed: 23 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import collections
2-
import itertools
32
import operator
43
import platform
54
import sysconfig
@@ -143,10 +142,8 @@ class _StringTable(object):
143142
return len(self._strings)
144143

145144

146-
def _none_to_str(value: typing.Any) -> str:
147-
if value is None:
148-
return ""
149-
return str(value)
145+
cdef str _none_to_str(object value):
146+
return "" if value is None else str(value)
150147

151148

152149
def _get_thread_name(thread_id: typing.Optional[int], thread_name: typing.Optional[str]) -> str:
@@ -155,6 +152,15 @@ def _get_thread_name(thread_id: typing.Optional[int], thread_name: typing.Option
155152
return thread_name
156153

157154

155+
cdef groupby(object collection, object key):
156+
cdef dict groups = {}
157+
158+
for item in collection:
159+
groups.setdefault(key(item), []).append(item)
160+
161+
return groups.items()
162+
163+
158164
class pprof_LocationType(object):
159165
# pprof_pb2.Location
160166
id: int
@@ -460,20 +466,15 @@ class _PprofConverter(object):
460466
"paths": sorted(lib_and_filename[1] for lib_and_filename in libs_and_filenames),
461467
}
462468
)
463-
for lib, libs_and_filenames in itertools.groupby(
464-
sorted(
465-
set(
466-
filter(
467-
lambda p: p[0] is not None,
468-
(
469-
(_packages.filename_to_package(filename), filename)
470-
for filename, lineno, funcname in self._locations
471-
),
472-
)
473-
),
474-
key=_ITEMGETTER_ZERO,
475-
),
476-
key=_ITEMGETTER_ZERO,
469+
for lib, libs_and_filenames in groupby(
470+
{
471+
_
472+
for _ in (
473+
(_packages.filename_to_package(filename), filename)
474+
for filename, lineno, funcname in self._locations
475+
)
476+
if _[0] is not None
477+
}, _ITEMGETTER_ZERO
477478
)
478479
] + STDLIB
479480

@@ -601,10 +602,7 @@ class PprofExporter(exporter.Exporter):
601602
def _group_stack_events(
602603
self, events: typing.Iterable[event.StackBasedEvent]
603604
) -> typing.Iterator[typing.Tuple[StackEventGroupKey, typing.Iterator[event.StackBasedEvent]]]:
604-
return itertools.groupby(
605-
sorted(events, key=self._stack_event_group_key),
606-
key=self._stack_event_group_key,
607-
)
605+
return groupby(events, self._stack_event_group_key)
608606

609607
def _lock_event_group_key(
610608
self,
@@ -627,10 +625,7 @@ class PprofExporter(exporter.Exporter):
627625
def _group_lock_events(
628626
self, events: typing.Iterable[_lock.LockEventBase]
629627
) -> typing.Iterator[typing.Tuple[LockEventGroupKey, typing.Iterator[_lock.LockEventBase]]]:
630-
return itertools.groupby(
631-
sorted(events, key=self._lock_event_group_key),
632-
key=self._lock_event_group_key,
633-
)
628+
return groupby(events, self._lock_event_group_key)
634629

635630
def _stack_exception_group_key(self, event: stack_event.StackExceptionSampleEvent) -> StackExceptionEventGroupKey:
636631
exc_type = event.exc_type
@@ -654,10 +649,7 @@ class PprofExporter(exporter.Exporter):
654649
) -> typing.Iterator[
655650
typing.Tuple[StackExceptionEventGroupKey, typing.Iterator[stack_event.StackExceptionSampleEvent]]
656651
]:
657-
return itertools.groupby(
658-
sorted(events, key=self._stack_exception_group_key),
659-
key=self._stack_exception_group_key,
660-
)
652+
return groupby(events, self._stack_exception_group_key)
661653

662654
def _get_event_trace_resource(self, event: event.StackBasedEvent) -> str:
663655
trace_resource = ""

mypy.ini

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
[mypy]
22
files = ddtrace,
33
ddtrace/profiling/_build.pyx,
4-
ddtrace/profiling/exporter/pprof.pyx,
54
docs
65
# mypy thinks .pyx files are scripts and errors out if it finds multiple scripts
76
scripts_are_modules = true

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ exclude = '''
3232
| ddtrace/profiling/_threading.pyx$
3333
| ddtrace/profiling/collector/stack.pyx$
3434
| ddtrace/profiling/exporter/pprof_.*pb2.py$
35+
| ddtrace/profiling/exporter/pprof.pyx$
3536
| ddtrace/vendor/
3637
| \.eggs
3738
| \.git

0 commit comments

Comments
 (0)