
Commit 7cfd09e

add latency, throughput, and size metrics for get/put/delete storage operations (#97277)
logs:

- request latency
- compressed and uncompressed sizes
- compressed and uncompressed throughput (bytes per second)
- compressed and uncompressed _inverse throughput_ (seconds per byte)
- compression ratio

throughputs and compression ratio require one or both of the size metrics. if those aren't known, the metrics that depend on them won't be logged. (a worked example with made-up numbers follows this message.)

nuances:

- profiling reads are in vroom, so i can't instrument them in this PR. i didn't see explicit deletes. also, i added a missing PUT case.
- there isn't a good way to get the uncompressed size for replays. their dashboard currently relies on their own metrics from ingestion for that.
- the nodestore metrics are the layer above where (de)compression is applied, so:
  - the latencies include (de)compression, whereas the other instances of these metrics do not.
  - while the compressed PUT size is still logged with [the one-off log arpad added](https://github.com/getsentry/sentry/blob/06e943abf26a0566339b00a25324b8e0d212cab6/src/sentry/utils/kvstore/bigtable.py#L244-L249), the compressed throughputs and compression ratio are not, because the metric emitter can't get to it.
- it's not currently clear to me where the right place is to instrument artifact-bundles and debug-files. the PUT size metrics arpad added are still intact, but i'll have to double back to add latency, throughput, and get/delete metrics.

### Legal Boilerplate

Look, I get it. The entity doing business as "Sentry" was incorporated in the State of Delaware in 2015 as Functional Software, Inc. and is gonna need some rights from me in order to utilize my contributions in this here PR. So here's the deal: I retain all rights, title and interest in and to my contributions, and by keeping this boilerplate intact I confirm that Sentry can use, modify, copy, and redistribute my contributions, under Sentry's choice of terms.

---------

Co-authored-by: Arpad Borsos <[email protected]>
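As referenced above, here is a worked example of the derived metrics. All numbers are illustrative, not taken from any real payload:

```python
# Illustrative numbers only: a 4 MiB payload that compresses to 1 MiB
# and takes 0.5 s to upload.
uncompressed_size = 4 * 1024 * 1024  # bytes
compressed_size = 1 * 1024 * 1024  # bytes
elapsed = 0.5  # seconds

# throughput: bytes per second, one value per known size
compressed_throughput = compressed_size / elapsed  # 2 MiB/s
uncompressed_throughput = uncompressed_size / elapsed  # 8 MiB/s

# inverse throughput: seconds per byte
compressed_inverse_throughput = elapsed / compressed_size  # ~4.77e-07 s/byte

# compression ratio: compressed over uncompressed, so smaller is better
compression_ratio = compressed_size / uncompressed_size  # 0.25
```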

8 files changed: +173, -86 lines

src/sentry/filestore/gcs.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -175,6 +175,8 @@ def __init__(self, name, mode, storage):
 
     @property
     def size(self):
+        if self.blob.size is None:
+            self.blob.reload()
         return self.blob.size
 
     @property
```
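Context for this change: in the google-cloud-storage client, `Blob.size` is populated from object metadata and stays `None` until that metadata has been fetched; `blob.reload()` issues the metadata request. This lazy fetch is the extra round-trip that the attachment `getfile` change below knowingly pays in order to report a compressed size.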

src/sentry/models/eventattachment.py

Lines changed: 13 additions & 16 deletions
```diff
@@ -18,9 +18,8 @@
 from sentry.db.models.manager.base_query_set import BaseQuerySet
 from sentry.models.files.utils import get_size_and_checksum, get_storage
 from sentry.objectstore import attachments
-from sentry.objectstore.metrics import measure_storage_put
+from sentry.objectstore.metrics import measure_storage_operation
 from sentry.options.rollout import in_random_rollout
-from sentry.utils import metrics
 
 # Attachment file types that are considered a crash report (PII relevant)
 CRASH_REPORT_TYPES = ("event.minidump", "event.applecrashreport")
@@ -121,7 +120,8 @@ def delete(self, *args: Any, **kwargs: Any) -> tuple[int, dict[str, int]]:
 
         elif self.blob_path.startswith(V1_PREFIX):
             storage = get_storage()
-            storage.delete(self.blob_path)
+            with measure_storage_operation("delete", "attachments"):
+                storage.delete(self.blob_path)
 
         elif self.blob_path.startswith(V2_PREFIX):
             organization_id = _get_organization(self.project_id)
@@ -143,7 +143,13 @@ def getfile(self) -> IO[bytes]:
 
         elif self.blob_path.startswith(V1_PREFIX):
             storage = get_storage()
-            compressed_blob = storage.open(self.blob_path)
+            with measure_storage_operation("get", "attachments", self.size) as metric_emitter:
+                compressed_blob = storage.open(self.blob_path)
+                # We want to log the compressed size here but we want to stream the payload.
+                # Accessing `.size` does additional metadata requests, for which we
+                # just swallow the costs.
+                metric_emitter.record_compressed_size(compressed_blob.size, "zstd")
+
             dctx = zstandard.ZstdDecompressor()
             return dctx.stream_reader(compressed_blob, read_across_frames=True)
 
@@ -168,17 +174,6 @@ def putfile(cls, project_id: int, attachment: CachedAttachment) -> PutfileResult:
         blob = BytesIO(data)
         size, checksum = get_size_and_checksum(blob)
 
-        # TODO: we measure the uncompressed size for inline stored attachments as well,
-        # however moving to V2 storage would mean we would eather double count
-        # when leaving this metric here in place, or miss inline-stored attachments
-        # when removing this metric and only rely on the one in the V2 Client API.
-        metrics.distribution(
-            "storage.put.size",
-            size,
-            tags={"usecase": "attachments", "compression": "none"},
-            unit="byte",
-        )
-
         if can_store_inline(data):
             blob_path = ":" + data.decode()
 
@@ -188,7 +183,9 @@ def putfile(cls, project_id: int, attachment: CachedAttachment) -> PutfileResult:
             storage = get_storage()
             compressed_blob = zstandard.compress(data)
 
-            with measure_storage_put(len(compressed_blob), "attachments", "zstd"):
+            with measure_storage_operation(
+                "put", "attachments", size, len(compressed_blob), "zstd"
+            ):
                 storage.save(blob_path, BytesIO(compressed_blob))
 
         else:
```
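The `compressed_blob.size` access in the `getfile` hunk is what exercises the `gcs.py` change above: when the blob's metadata hasn't been loaded yet, the `size` property now triggers a `reload()`, which is the extra metadata request the inline comment says it swallows.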

src/sentry/nodestore/base.py

Lines changed: 1 addition & 3 deletions
```diff
@@ -10,7 +10,6 @@
 from django.utils.functional import cached_property
 
 from sentry import options
-from sentry.objectstore.metrics import measure_storage_put
 from sentry.utils import json, metrics
 from sentry.utils.services import Service
 
@@ -232,8 +231,7 @@ def set_bytes(self, item_id: str, data: bytes, ttl: timedelta | None = None) ->
         >>> nodestore.set_bytes('key1', b"{'foo': 'bar'}")
         """
         metrics.distribution("nodestore.set_bytes", len(data))
-        with measure_storage_put(len(data), "nodestore"):
-            return self._set_bytes(item_id, data, ttl)
+        return self._set_bytes(item_id, data, ttl)
 
     def _set_bytes(self, item_id: str, data: bytes, ttl: timedelta | None = None) -> None:
         raise NotImplementedError
```
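The put measurement is dropped from this backend-agnostic base class and re-added in the Bigtable backend below, one layer down where the KV store applies (de)compression. That is the reason for the caveat in the commit message: nodestore latencies include (de)compression time, unlike the other instances of these metrics.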

src/sentry/nodestore/bigtable/backend.py

Lines changed: 20 additions & 5 deletions
```diff
@@ -7,6 +7,7 @@
 import sentry_sdk
 
 from sentry.nodestore.base import NodeStorage
+from sentry.objectstore.metrics import measure_storage_operation
 from sentry.utils.kvstore.bigtable import BigtableKVStorage
 
 
@@ -65,24 +66,37 @@ def __init__(
 
     @sentry_sdk.tracing.trace
     def _get_bytes(self, id: str) -> bytes | None:
-        return self.store.get(id)
+        # Note: This metric encapsulates any decompression performed by `self.store.get()`. Other
+        # instances of this metric stop measuring before decompression happens.
+        with measure_storage_operation("get", "nodestore") as metric_emitter:
+            result = self.store.get(id)
+            if result:
+                metric_emitter.record_uncompressed_size(len(result))
+            return result
 
     @sentry_sdk.tracing.trace
     def _get_bytes_multi(self, id_list: list[str]) -> dict[str, bytes | None]:
         rv: dict[str, bytes | None] = {id: None for id in id_list}
-        rv.update(self.store.get_many(id_list))
+        # Note: This metric encapsulates any decompression performed by `self.store.get_many()`. Other
+        # instances of this metric stop measuring before decompression happens.
+        with measure_storage_operation("get-multi", "nodestore"):
+            rv.update(self.store.get_many(id_list))
         return rv
 
     def _set_bytes(self, id: str, data: Any, ttl: timedelta | None = None) -> None:
-        self.store.set(id, data, ttl)
+        # Note: This metric encapsulates any compression performed by `self.store.put()`. Other
+        # instances of this metric start measuring after compression happens.
+        with measure_storage_operation("put", "nodestore", len(data)):
+            self.store.set(id, data, ttl)
 
     def delete(self, id: str) -> None:
         if self.skip_deletes:
             return
 
         with sentry_sdk.start_span(op="nodestore.bigtable.delete"):
             try:
-                self.store.delete(id)
+                with measure_storage_operation("delete", "nodestore"):
+                    self.store.delete(id)
             finally:
                 self._delete_cache_item(id)
 
@@ -98,7 +112,8 @@ def delete_multi(self, id_list: list[str]) -> None:
             return
 
         try:
-            self.store.delete_many(id_list)
+            with measure_storage_operation("delete-multi", "nodestore"):
+                self.store.delete_many(id_list)
         finally:
             self._delete_cache_items(id_list)
 
```
src/sentry/objectstore/metrics.py

Lines changed: 96 additions & 22 deletions
```diff
@@ -1,37 +1,111 @@
 import time
 from collections.abc import Generator
 from contextlib import contextmanager
-from dataclasses import dataclass
 
 from sentry.utils import metrics
 
 
-@dataclass
-class UploadMeasurement:
-    upload_size: int | None
-    compression: str | None
+class StorageMetricEmitter:
+    def __init__(self, operation: str, usecase: str):
+        self.operation = operation
+        self.usecase = usecase
+
+        # These may be set during or after the enclosed operation
+        self.start: int | None = None
+        self.elapsed: float | None = None
+        self.uncompressed_size: int | None = None
+        self.compressed_size: int | None = None
+        self.compression: str = "unknown"
+
+    def record_latency(self, elapsed: float):
+        tags = {"usecase": self.usecase}
+        metrics.timing(f"storage.{self.operation}.latency", elapsed, tags=tags, precise=True)
+        self.elapsed = elapsed
+
+    def record_uncompressed_size(self, value: int):
+        tags = {"usecase": self.usecase, "compression": "none"}
+        metrics.distribution(
+            f"storage.{self.operation}.size", value, tags=tags, unit="byte", precise=True
+        )
+        self.uncompressed_size = value
+
+    def record_compressed_size(self, value: int, compression: str = "unknown"):
+        tags = {"usecase": self.usecase, "compression": compression}
+        metrics.distribution(
+            f"storage.{self.operation}.size", value, tags=tags, unit="byte", precise=True
+        )
+        self.compressed_size = value
+        self.compression = compression
+
+    def maybe_record_compression_ratio(self):
+        if not self.uncompressed_size or not self.compressed_size:
+            return
+
+        tags = {"usecase": self.usecase, "compression": self.compression}
+        metrics.distribution(
+            f"storage.{self.operation}.compression_ratio",
+            self.compressed_size / self.uncompressed_size,
+            tags=tags,
+            precise=True,
+        )
+
+    def maybe_record_throughputs(self):
+        if not self.elapsed or self.elapsed <= 0:
+            return
+
+        sizes = []
+        if self.uncompressed_size:
+            sizes.append((self.uncompressed_size, "none"))
+        if self.compressed_size:
+            sizes.append((self.compressed_size, self.compression))
+
+        for size, compression in sizes:
+            tags = {"usecase": self.usecase, "compression": compression}
+            metrics.distribution(
+                f"storage.{self.operation}.throughput", size / self.elapsed, tags=tags, precise=True
+            )
+            metrics.distribution(
+                f"storage.{self.operation}.inverse_throughput",
+                self.elapsed / size,
+                tags=tags,
+                precise=True,
+            )
 
 
 @contextmanager
-def measure_storage_put(
-    upload_size: int | None, usecase: str, compression: str | None = None
-) -> Generator[UploadMeasurement]:
-    measurement = UploadMeasurement(upload_size, compression)
+def measure_storage_operation(
+    operation: str,
+    usecase: str,
+    uncompressed_size: int | None = None,
+    compressed_size: int | None = None,
+    compression: str = "unknown",
+) -> Generator[StorageMetricEmitter]:
+    """
+    Context manager which records the latency of the enclosed storage operation.
+    Can also record the compressed or uncompressed size of an object, the
+    compression ratio, the throughput, and the inverse throughput.
+
+    Yields a `StorageMetricEmitter` because for some operations (GET) the size
+    is not known until the inside of the enclosed block.
+    """
+    emitter = StorageMetricEmitter(operation, usecase)
+
+    if uncompressed_size:
+        emitter.record_uncompressed_size(uncompressed_size)
+    if compressed_size:
+        emitter.record_compressed_size(compressed_size, compression)
+
     start = time.monotonic()
+
+    # Yield an emitter in case the size becomes known inside the enclosed block
     try:
-        yield measurement
+        yield emitter
+
     finally:
         elapsed = time.monotonic() - start
-        metrics.timing("storage.put.latency", elapsed, tags={"usecase": usecase})
+        emitter.record_latency(elapsed)
 
-        if upload_size := measurement.upload_size:
-            metrics.distribution(
-                "storage.put.size",
-                upload_size,
-                tags={"usecase": usecase, "compression": measurement.compression or "none"},
-                unit="byte",
-            )
-        if elapsed > 0:
-            metrics.distribution(
-                "storage.put.throughput", upload_size / elapsed, tags={"usecase": usecase}
-            )
+        # If `uncompressed_size` and/or `compressed_size` have been set, we have
+        # extra metrics we can send.
+        emitter.maybe_record_compression_ratio()
+        emitter.maybe_record_throughputs()
```
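For reference, a minimal sketch of how the new context manager is meant to be used. The context-manager and emitter method names come from the diff above; the dict-backed "storage" and the payload are hypothetical stand-ins for this sketch:

```python
from io import BytesIO

import zstandard

from sentry.objectstore.metrics import measure_storage_operation

# A dict-backed stand-in for the real storage backend, just for this sketch.
blobs: dict[str, bytes] = {}

data = b"example payload" * 1024  # hypothetical uncompressed payload
compressed = zstandard.compress(data)

# Sizes known up front (PUT): pass them in. This emits latency, both sizes,
# both throughputs and inverse throughputs, and the compression ratio.
with measure_storage_operation("put", "attachments", len(data), len(compressed), "zstd"):
    blobs["some/path"] = compressed

# Size only known inside the block (GET): report it via the yielded emitter.
# Latency plus the compressed-size metrics are emitted; the compression ratio
# is skipped because the uncompressed size is never recorded.
with measure_storage_operation("get", "attachments") as metric_emitter:
    payload = blobs["some/path"]
    metric_emitter.record_compressed_size(len(payload), "zstd")
```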

src/sentry/objectstore/service.py

Lines changed: 24 additions & 26 deletions
```diff
@@ -16,8 +16,8 @@
     Metadata,
     format_expiration,
 )
-from sentry.objectstore.metrics import measure_storage_put
-from sentry.utils import jwt, metrics
+from sentry.objectstore.metrics import measure_storage_operation
+from sentry.utils import jwt
 
 Permission = Literal["read", "write"]
 
@@ -130,7 +130,7 @@ def put(
         for k, v in metadata.items():
             headers[f"{HEADER_META_PREFIX}{k}"] = v
 
-        with measure_storage_put(None, self._usecase, compression) as measurement:
+        with measure_storage_operation("put", self._usecase) as metric_emitter:
             response = self._pool.request(
                 "PUT",
                 f"/{id}" if id else "/",
@@ -142,15 +142,11 @@ def put(
             raise_for_status(response)
             res = response.json()
 
-            measurement.upload_size = body.tell()
-            if compression != "none":
-                metrics.distribution(
-                    "storage.put.size",
-                    original_body.tell(),
-                    tags={"usecase": self._usecase, "compression": "none"},
-                    unit="byte",
-                )
-
+            # Must do this after streaming `body` as that's what is responsible
+            # for advancing the seek position in both streams
+            metric_emitter.record_uncompressed_size(original_body.tell())
+            if compression and compression != "none":
+                metric_emitter.record_compressed_size(body.tell(), compression)
         return res["key"]
 
     def get(self, id: str, decompress: bool = True) -> GetResult:
@@ -163,14 +159,15 @@ def get(self, id: str, decompress: bool = True) -> GetResult:
         """
         headers = self._make_headers("read")
 
-        response = self._pool.request(
-            "GET",
-            f"/{id}",
-            headers=headers,
-            preload_content=False,
-            decode_content=False,
-        )
-        raise_for_status(response)
+        with measure_storage_operation("get", self._usecase):
+            response = self._pool.request(
+                "GET",
+                f"/{id}",
+                headers=headers,
+                preload_content=False,
+                decode_content=False,
+            )
+            raise_for_status(response)
         # OR: should I use `response.stream()`?
         stream = cast(IO[bytes], response)
         metadata = Metadata.from_headers(response.headers)
@@ -193,12 +190,13 @@ def delete(self, id: str):
         """
         headers = self._make_headers("write")
 
-        response = self._pool.request(
-            "DELETE",
-            f"/{id}",
-            headers=headers,
-        )
-        raise_for_status(response)
+        with measure_storage_operation("delete", self._usecase):
+            response = self._pool.request(
+                "DELETE",
+                f"/{id}",
+                headers=headers,
+            )
+            raise_for_status(response)
 
 
 class ClientError(Exception):
```
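One behavioral change worth noting in `put`: previously the (possibly compressed) body size was always reported and the uncompressed size only when a compression method was in use; now the uncompressed size is always recorded, and the compressed size only when `compression` is set and not `"none"`. The sizes are read from the streams' `tell()` positions, which is why they can only be recorded after the request body has been fully streamed.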
