Fix bug in how tokens are counted when the streaming generateContent method is called. (#4152)

DylanRussell · web-flow · commit 1f64f15d07d0 · 2026-02-02T15:35:55.000-05:00
* Fix token count bug

* Add changelog

* Fix changelog
diff --git a/instrumentation-genai/opentelemetry-instrumentation-google-genai/CHANGELOG.md b/instrumentation-genai/opentelemetry-instrumentation-google-genai/CHANGELOG.md
@@ -6,6 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## Unreleased
+- Fix bug in how tokens are counted when using the streaming `generateContent` method ([#4152](https://github.com/open-telemetry/opentelemetry-python-contrib/pull/4152)).
 
 ## Version 0.6b0 (2026-01-27)
 
diff --git a/instrumentation-genai/opentelemetry-instrumentation-google-genai/src/opentelemetry/instrumentation/google_genai/generate_content.py b/instrumentation-genai/opentelemetry-instrumentation-google-genai/src/opentelemetry/instrumentation/google_genai/generate_content.py
@@ -430,9 +430,9 @@ def _maybe_update_token_counts(self, response: GenerateContentResponse):
             response, "usage_metadata.candidates_token_count"
         )
         if input_tokens and isinstance(input_tokens, int):
-            self._input_tokens += input_tokens
+            self._input_tokens = input_tokens
         if output_tokens and isinstance(output_tokens, int):
-            self._output_tokens += output_tokens
+            self._output_tokens = output_tokens
 
     def _maybe_update_error_type(self, response: GenerateContentResponse):
         if response.candidates:
diff --git a/instrumentation-genai/opentelemetry-instrumentation-google-genai/tests/generate_content/streaming_base.py b/instrumentation-genai/opentelemetry-instrumentation-google-genai/tests/generate_content/streaming_base.py
@@ -89,22 +89,18 @@ def test_handles_multiple_ressponses(self):
         choice_events = self.otel.get_events_named("gen_ai.choice")
         self.assertEqual(len(choice_events), 2)
 
-    def test_includes_token_counts_in_span_aggregated_from_responses(self):
-        # Configure multiple responses whose input/output tokens should be
-        # accumulated together when summarizing the end-to-end request.
-        #
-        #   Input: 1 + 3 + 5 => 4 + 5 => 9
-        #   Output: 2 + 4 + 6 => 6 + 6 => 12
-        self.configure_valid_response(input_tokens=1, output_tokens=2)
-        self.configure_valid_response(input_tokens=3, output_tokens=4)
-        self.configure_valid_response(input_tokens=5, output_tokens=6)
+    def test_includes_token_counts_in_span_not_aggregated_from_responses(self):
+        # Token counts must not be summed across streamed responses: each response already reports cumulative counts.
+        self.configure_valid_response(input_tokens=3, output_tokens=5)
+        self.configure_valid_response(input_tokens=3, output_tokens=5)
+        self.configure_valid_response(input_tokens=3, output_tokens=5)
 
         self.generate_content(model="gemini-2.0-flash", contents="Some input")
 
         self.otel.assert_has_span_named("generate_content gemini-2.0-flash")
         span = self.otel.get_span_named("generate_content gemini-2.0-flash")
-        self.assertEqual(span.attributes["gen_ai.usage.input_tokens"], 9)
-        self.assertEqual(span.attributes["gen_ai.usage.output_tokens"], 12)
+        self.assertEqual(span.attributes["gen_ai.usage.input_tokens"], 3)
+        self.assertEqual(span.attributes["gen_ai.usage.output_tokens"], 5)
 
     def test_new_semconv_log_has_extra_genai_attributes(self):
         patched_environ = patch.dict(