LLM-Atlas/03_Engineering/03_Context_Engineering/02_Practical/shared/observability.py at main · HuangZurong/LLM-Atlas · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
"""
Context Observability
======================
Instruments context assembly with token usage tracking, cost attribution,
context diff logging, and OpenTelemetry-compatible span recording.

Use this alongside the other Practical files to gain visibility into what
your application is putting into the context window and what it costs.

Prerequisites:
  - 02_Practical/01_Context_Composition_Pipeline.py
  - 02_Practical/02_Token_Budget_Controller.py
"""

from __future__ import annotations

import json
import logging
import time
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Any

import tiktoken

logger = logging.getLogger(__name__)

_ENCODER = tiktoken.get_encoding("cl100k_base")


def count_tokens(text: str) -> int:
    return len(_ENCODER.encode(text))


# ---------------------------------------------------------------------------
# Pricing table (USD per 1M tokens, as of early 2025)
# Update from provider docs as prices change.
# ---------------------------------------------------------------------------

PRICING: dict[str, dict[str, float]] = {
    "claude-sonnet-4-5": {"input": 3.00, "output": 15.00, "cached_input": 0.30},
    "claude-haiku-3-5":  {"input": 0.80, "output": 4.00,  "cached_input": 0.08},
    "gpt-4o":            {"input": 2.50, "output": 10.00, "cached_input": 1.25},
    "gpt-4o-mini":       {"input": 0.15, "output": 0.60,  "cached_input": 0.075},
}


def token_cost(
    tokens: int,
    model: str,
    token_type: str = "input",  # "input" | "output" | "cached_input"
) -> float:
    """Return cost in USD for a given token count."""
    price_per_million = PRICING.get(model, {}).get(token_type, 0.0)
    return tokens * price_per_million / 1_000_000


# ---------------------------------------------------------------------------
# Context snapshot — one per LLM call
# ---------------------------------------------------------------------------

@dataclass
class ContextSnapshot:
    """Records the token composition of a single LLM call."""
    request_id: str
    session_id: str
    turn: int
    model: str
    timestamp: float = field(default_factory=time.time)

    # Layer token counts
    layer_tokens: dict[str, int] = field(default_factory=dict)

    # Cache info
    cached_tokens: int = 0
    cache_hit: bool = False

    # Output
    output_tokens: int = 0

    # Assembly metadata
    compression_applied: bool = False
    layers_trimmed: list[str] = field(default_factory=list)
    assembly_latency_ms: float = 0.0

    @property
    def total_input_tokens(self) -> int:
        return sum(self.layer_tokens.values())

    @property
    def uncached_input_tokens(self) -> int:
        return max(0, self.total_input_tokens - self.cached_tokens)

    @property
    def total_cost_usd(self) -> float:
        return (
            token_cost(self.uncached_input_tokens, self.model, "input")
            + token_cost(self.cached_tokens, self.model, "cached_input")
            + token_cost(self.output_tokens, self.model, "output")
        )

    def to_dict(self) -> dict:
        return {
            "request_id": self.request_id,
            "session_id": self.session_id,
            "turn": self.turn,
            "model": self.model,
            "timestamp": self.timestamp,
            "layer_tokens": self.layer_tokens,
            "total_input_tokens": self.total_input_tokens,
            "cached_tokens": self.cached_tokens,
            "cache_hit": self.cache_hit,
            "output_tokens": self.output_tokens,
            "total_cost_usd": round(self.total_cost_usd, 8),
            "compression_applied": self.compression_applied,
            "layers_trimmed": self.layers_trimmed,
            "assembly_latency_ms": round(self.assembly_latency_ms, 2),
        }


# ---------------------------------------------------------------------------
# Context diff — what changed between consecutive turns
# ---------------------------------------------------------------------------

@dataclass
class ContextDiff:
    """Records what changed in the context between two consecutive turns."""
    session_id: str
    from_turn: int
    to_turn: int
    token_delta: int
    layer_deltas: dict[str, int]  # layer_name → token change
    compression_applied: bool
    layers_trimmed: list[str]

    def to_dict(self) -> dict:
        return {
            "session_id": self.session_id,
            "from_turn": self.from_turn,
            "to_turn": self.to_turn,
            "token_delta": self.token_delta,
            "layer_deltas": self.layer_deltas,
            "compression_applied": self.compression_applied,
            "layers_trimmed": self.layers_trimmed,
        }


# ---------------------------------------------------------------------------
# Context observer — collects and analyzes snapshots
# ---------------------------------------------------------------------------

class ContextObserver:
    """
    Collects context snapshots, computes diffs, and generates reports.

    Usage:
        observer = ContextObserver(model="claude-sonnet-4-5", context_window=128_000)

        # Before each LLM call:
        snapshot = observer.record(
            request_id="req_001",
            session_id="sess_abc",
            turn=1,
            layer_tokens={"system": 500, "rag": 3200, "history": 2800, "query": 150},
            cached_tokens=500,
            output_tokens=320,
            assembly_latency_ms=45.2,
        )

        # Get reports:
        observer.print_report()
    """

    def __init__(
        self,
        model: str = "claude-sonnet-4-5",
        context_window: int = 128_000,
        output_reserve_ratio: float = 0.25,
        alert_thresholds: dict[str, float] | None = None,
    ) -> None:
        self.model = model
        self.context_window = context_window
        self.input_budget = int(context_window * (1 - output_reserve_ratio))
        self.alert_thresholds = alert_thresholds or {
            "utilization": 0.80,          # Alert if input > 80% of budget
            "cache_hit_rate": 0.30,        # Alert if cache hit rate < 30%
            "avg_relevance": 0.60,         # Alert if avg relevance score < 0.6
            "output_truncation_rate": 0.05, # Alert if >5% of responses truncated
        }

        self._snapshots: list[ContextSnapshot] = []
        self._session_last: dict[str, ContextSnapshot] = {}  # session_id → last snapshot

    # ------------------------------------------------------------------
    # Recording
    # ------------------------------------------------------------------

    def record(
        self,
        request_id: str,
        session_id: str,
        turn: int,
        layer_tokens: dict[str, int],
        cached_tokens: int = 0,
        output_tokens: int = 0,
        compression_applied: bool = False,
        layers_trimmed: list[str] | None = None,
        assembly_latency_ms: float = 0.0,
    ) -> ContextSnapshot:
        snapshot = ContextSnapshot(
            request_id=request_id,
            session_id=session_id,
            turn=turn,
            model=self.model,
            layer_tokens=layer_tokens,
            cached_tokens=cached_tokens,
            cache_hit=cached_tokens > 0,
            output_tokens=output_tokens,
            compression_applied=compression_applied,
            layers_trimmed=layers_trimmed or [],
            assembly_latency_ms=assembly_latency_ms,
        )
        self._snapshots.append(snapshot)

        # Compute and log diff if we have a previous snapshot for this session
        if session_id in self._session_last:
            diff = self._compute_diff(self._session_last[session_id], snapshot)
            if abs(diff.token_delta) > 500:
                logger.info(
                    "Session %s turn %d→%d: %+d tokens %s",
                    session_id, diff.from_turn, diff.to_turn, diff.token_delta,
                    "(compressed)" if diff.compression_applied else "",
                )

        self._session_last[session_id] = snapshot
        self._check_alerts(snapshot)
        return snapshot

    # ------------------------------------------------------------------
    # Diff computation
    # ------------------------------------------------------------------

    def _compute_diff(self, prev: ContextSnapshot, curr: ContextSnapshot) -> ContextDiff:
        all_layers = set(prev.layer_tokens) | set(curr.layer_tokens)
        layer_deltas = {
            layer: curr.layer_tokens.get(layer, 0) - prev.layer_tokens.get(layer, 0)
            for layer in all_layers
            if curr.layer_tokens.get(layer, 0) != prev.layer_tokens.get(layer, 0)
        }
        return ContextDiff(
            session_id=curr.session_id,
            from_turn=prev.turn,
            to_turn=curr.turn,
            token_delta=curr.total_input_tokens - prev.total_input_tokens,
            layer_deltas=layer_deltas,
            compression_applied=curr.compression_applied,
            layers_trimmed=curr.layers_trimmed,
        )

    # ------------------------------------------------------------------
    # Alerts
    # ------------------------------------------------------------------

    def _check_alerts(self, snapshot: ContextSnapshot) -> None:
        utilization = snapshot.total_input_tokens / self.input_budget
        if utilization > self.alert_thresholds["utilization"]:
            logger.warning(
                "HIGH UTILIZATION: request %s at %.1f%% of input budget (%d/%d tokens)",
                snapshot.request_id, utilization * 100,
                snapshot.total_input_tokens, self.input_budget,
            )

    # ------------------------------------------------------------------
    # Reporting
    # ------------------------------------------------------------------

    def summary(self) -> dict:
        if not self._snapshots:
            return {}

        total_requests = len(self._snapshots)
        total_input = sum(s.total_input_tokens for s in self._snapshots)
        total_output = sum(s.output_tokens for s in self._snapshots)
        total_cached = sum(s.cached_tokens for s in self._snapshots)
        total_cost = sum(s.total_cost_usd for s in self._snapshots)
        cache_hits = sum(1 for s in self._snapshots if s.cache_hit)
        compressed = sum(1 for s in self._snapshots if s.compression_applied)

        # Per-layer totals
        layer_totals: dict[str, int] = defaultdict(int)
        for s in self._snapshots:
            for layer, tokens in s.layer_tokens.items():
                layer_totals[layer] += tokens

        avg_input = total_input / total_requests
        avg_utilization = avg_input / self.input_budget

        return {
            "total_requests": total_requests,
            "avg_input_tokens": round(avg_input),
            "avg_utilization": f"{avg_utilization:.1%}",
            "cache_hit_rate": f"{cache_hits / total_requests:.1%}",
            "compression_rate": f"{compressed / total_requests:.1%}",
            "total_cost_usd": round(total_cost, 4),
            "avg_cost_per_request_usd": round(total_cost / total_requests, 6),
            "layer_token_totals": dict(sorted(layer_totals.items(), key=lambda x: -x[1])),
            "alerts": self._generate_alerts(avg_utilization, cache_hits / total_requests),
        }

    def _generate_alerts(self, avg_utilization: float, cache_hit_rate: float) -> list[str]:
        alerts = []
        if avg_utilization > self.alert_thresholds["utilization"]:
            alerts.append(f"Avg utilization {avg_utilization:.1%} exceeds threshold — review injection logic")
        if cache_hit_rate < self.alert_thresholds["cache_hit_rate"]:
            alerts.append(f"Cache hit rate {cache_hit_rate:.1%} below threshold — check static-first ordering")
        return alerts

    def cost_attribution(self) -> dict[str, dict]:
        """Break down total cost by context layer."""
        layer_costs: dict[str, float] = defaultdict(float)
        layer_tokens: dict[str, int] = defaultdict(int)

        for snapshot in self._snapshots:
            for layer, tokens in snapshot.layer_tokens.items():
                layer_tokens[layer] += tokens
                layer_costs[layer] += token_cost(tokens, self.model, "input")

        total_cost = sum(layer_costs.values())
        return {
            layer: {
                "total_tokens": layer_tokens[layer],
                "total_cost_usd": round(layer_costs[layer], 6),
                "cost_fraction": f"{layer_costs[layer] / total_cost:.1%}" if total_cost > 0 else "0%",
            }
            for layer in sorted(layer_costs, key=lambda x: -layer_costs[x])
        }

    def print_report(self) -> None:
        s = self.summary()
        print("=" * 60)
        print("Context Observability Report")
        print("=" * 60)
        print(json.dumps(s, indent=2))
        print("\nCost Attribution:")
        print(json.dumps(self.cost_attribution(), indent=2))


# ---------------------------------------------------------------------------
# OpenTelemetry-compatible span context manager
# ---------------------------------------------------------------------------

class ContextAssemblySpan:
    """
    Lightweight span for timing context assembly stages.
    Compatible with OpenTelemetry span interface — replace with
    opentelemetry.trace.get_tracer(...).start_as_current_span() in production.

    Usage:
        with ContextAssemblySpan("retrieve_rag") as span:
            results = await retrieve_rag(query)
            span.set_attribute("chunks_retrieved", len(results))
    """

    def __init__(self, name: str) -> None:
        self.name = name
        self._start: float = 0.0
        self._attributes: dict[str, Any] = {}

    def __enter__(self) -> "ContextAssemblySpan":
        self._start = time.perf_counter()
        return self

    def __exit__(self, *_: Any) -> None:
        duration_ms = (time.perf_counter() - self._start) * 1000
        self._attributes["duration_ms"] = round(duration_ms, 2)
        logger.debug("Span [%s]: %s", self.name, self._attributes)

    def set_attribute(self, key: str, value: Any) -> None:
        self._attributes[key] = value

    @property
    def duration_ms(self) -> float:
        return self._attributes.get("duration_ms", 0.0)


# ---------------------------------------------------------------------------
# Example usage
# ---------------------------------------------------------------------------

def demo() -> None:
    import uuid

    observer = ContextObserver(model="claude-sonnet-4-5", context_window=16_000)
    session_id = "sess_demo_001"

    # Simulate 5 turns of a conversation
    turns = [
        {"system": 500, "rag": 3200, "history": 0,    "query": 120},
        {"system": 500, "rag": 3200, "history": 800,  "query": 95},
        {"system": 500, "rag": 3200, "history": 1600, "query": 140},
        {"system": 500, "rag": 2800, "history": 2400, "query": 110},  # RAG reduced
        {"system": 500, "rag": 2800, "history": 1200, "query": 88},   # History compressed
    ]

    for i, layer_tokens in enumerate(turns, start=1):
        with ContextAssemblySpan("context_assembly") as span:
            span.set_attribute("turn", i)
            span.set_attribute("total_tokens", sum(layer_tokens.values()))

        observer.record(
            request_id=str(uuid.uuid4())[:8],
            session_id=session_id,
            turn=i,
            layer_tokens=layer_tokens,
            cached_tokens=500,  # System prompt always cached
            output_tokens=350,
            compression_applied=(i == 5),
            layers_trimmed=["old_history"] if i == 5 else [],
            assembly_latency_ms=span.duration_ms + 45,  # Add mock retrieval time
        )

    observer.print_report()


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    demo()