
Commit fc27ff4

fix(llma): cache cost calculation in the LangChain callback
1 parent 02e82a6 commit fc27ff4

File tree

4 files changed: +220 −11 lines changed


CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -1,3 +1,7 @@
+# 6.7.10 - 2025-10-24
+
+- fix(llma): cache cost calculation in the LangChain callback
+
 # 6.7.9 - 2025-10-22
 
 - fix(flags): multi-condition flags with static cohorts returning wrong variants

posthog/ai/langchain/callbacks.py

Lines changed: 6 additions & 5 deletions
@@ -750,12 +750,13 @@ def _parse_usage_model(
         "cache_read": "cache_read_tokens",
         "reasoning": "reasoning_tokens",
     }
-    return ModelUsage(
-        **{
-            dataclass_key: parsed_usage.get(mapped_key) or 0
-            for mapped_key, dataclass_key in field_mapping.items()
-        },
+    normalized_usage = ModelUsage(
+        **{dataclass_key: parsed_usage.get(mapped_key) or 0 for mapped_key, dataclass_key in field_mapping.items()},
     )
+    # input_tokens is the sum of input and cache read tokens.
+    if normalized_usage.input_tokens and normalized_usage.cache_read_tokens:
+        normalized_usage.input_tokens = max(normalized_usage.input_tokens - normalized_usage.cache_read_tokens, 0)
+    return normalized_usage
 
 
 def _parse_usage(response: LLMResult) -> ModelUsage:
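
For reference, a minimal standalone sketch of the normalization this hunk introduces. The `ModelUsage` dataclass below is a simplified stand-in for the SDK's own type, and `normalize` is a hypothetical helper used only for illustration, not the library's API:

```python
from dataclasses import dataclass


@dataclass
class ModelUsage:
    # Simplified stand-in; field names follow the diff above.
    input_tokens: int = 0
    output_tokens: int = 0
    cache_read_tokens: int = 0


def normalize(usage: ModelUsage) -> ModelUsage:
    # Mirrors the added logic: when the provider reports input_tokens
    # inclusive of cached reads, subtract the cache reads (clamped at zero)
    # so cost calculation does not count those tokens twice.
    if usage.input_tokens and usage.cache_read_tokens:
        usage.input_tokens = max(usage.input_tokens - usage.cache_read_tokens, 0)
    return usage


# Matches the updated OpenAI test below: 150 reported input tokens, 100 read from cache.
print(normalize(ModelUsage(input_tokens=150, output_tokens=40, cache_read_tokens=100)))
# -> ModelUsage(input_tokens=50, output_tokens=40, cache_read_tokens=100)
```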

posthog/test/ai/langchain/test_callbacks.py

Lines changed: 209 additions & 5 deletions
@@ -1564,9 +1564,9 @@ def test_anthropic_cache_write_and_read_tokens(mock_client):
             AIMessage(
                 content="Using cached analysis to provide quick response.",
                 usage_metadata={
-                    "input_tokens": 200,
+                    "input_tokens": 1200,
                     "output_tokens": 30,
-                    "total_tokens": 1030,
+                    "total_tokens": 1230,
                     "cache_read_input_tokens": 800,  # Anthropic cache read
                 },
             )
@@ -1583,7 +1583,7 @@ def test_anthropic_cache_write_and_read_tokens(mock_client):
     generation_props = generation_args["properties"]
 
     assert generation_args["event"] == "$ai_generation"
-    assert generation_props["$ai_input_tokens"] == 200
+    assert generation_props["$ai_input_tokens"] == 400
     assert generation_props["$ai_output_tokens"] == 30
     assert generation_props["$ai_cache_creation_input_tokens"] == 0
     assert generation_props["$ai_cache_read_input_tokens"] == 800
@@ -1625,7 +1625,7 @@ def test_openai_cache_read_tokens(mock_client):
     generation_props = generation_args["properties"]
 
     assert generation_args["event"] == "$ai_generation"
-    assert generation_props["$ai_input_tokens"] == 150
+    assert generation_props["$ai_input_tokens"] == 50
     assert generation_props["$ai_output_tokens"] == 40
     assert generation_props["$ai_cache_read_input_tokens"] == 100
     assert generation_props["$ai_cache_creation_input_tokens"] == 0
@@ -1707,7 +1707,7 @@ def test_combined_reasoning_and_cache_tokens(mock_client):
     generation_props = generation_args["properties"]
 
    assert generation_args["event"] == "$ai_generation"
-    assert generation_props["$ai_input_tokens"] == 500
+    assert generation_props["$ai_input_tokens"] == 200
     assert generation_props["$ai_output_tokens"] == 100
     assert generation_props["$ai_cache_read_input_tokens"] == 300
     assert generation_props["$ai_cache_creation_input_tokens"] == 0
@@ -1876,3 +1876,207 @@ def test_tool_definition(mock_client):
     assert props["$ai_latency"] == 1.0
     # Verify that tools are captured in the $ai_tools property
     assert props["$ai_tools"] == tools
+
+
+def test_cache_read_tokens_subtraction_from_input_tokens(mock_client):
+    """Test that cache_read_tokens are properly subtracted from input_tokens.
+
+    This tests the logic in callbacks.py lines 757-758:
+        if normalized_usage.input_tokens and normalized_usage.cache_read_tokens:
+            normalized_usage.input_tokens = max(normalized_usage.input_tokens - normalized_usage.cache_read_tokens, 0)
+    """
+    prompt = ChatPromptTemplate.from_messages(
+        [("user", "Use the cached prompt for this request")]
+    )
+
+    # Scenario 1: input_tokens includes cache_read_tokens (typical case)
+    # input_tokens=150 includes 100 cache_read tokens, so actual input is 50
+    model = FakeMessagesListChatModel(
+        responses=[
+            AIMessage(
+                content="Response using cached prompt context.",
+                usage_metadata={
+                    "input_tokens": 150,  # Total includes cache reads
+                    "output_tokens": 40,
+                    "total_tokens": 190,
+                    "cache_read_input_tokens": 100,  # 100 tokens read from cache
+                },
+            )
+        ]
+    )
+
+    callbacks = [CallbackHandler(mock_client)]
+    chain = prompt | model
+    result = chain.invoke({}, config={"callbacks": callbacks})
+
+    assert result.content == "Response using cached prompt context."
+    assert mock_client.capture.call_count == 3
+
+    generation_args = mock_client.capture.call_args_list[1][1]
+    generation_props = generation_args["properties"]
+
+    assert generation_args["event"] == "$ai_generation"
+    # Input tokens should be reduced: 150 - 100 = 50
+    assert generation_props["$ai_input_tokens"] == 50
+    assert generation_props["$ai_output_tokens"] == 40
+    assert generation_props["$ai_cache_read_input_tokens"] == 100
+
+
+def test_cache_read_tokens_subtraction_prevents_negative(mock_client):
+    """Test that cache_read_tokens subtraction doesn't result in negative input_tokens.
+
+    This tests the max(..., 0) part of the logic in callbacks.py lines 757-758.
+    """
+    prompt = ChatPromptTemplate.from_messages(
+        [("user", "Edge case with large cache read")]
+    )
+
+    # Edge case: cache_read_tokens >= input_tokens
+    # This could happen in some API responses where accounting differs
+    model = FakeMessagesListChatModel(
+        responses=[
+            AIMessage(
+                content="Response with edge case token counts.",
+                usage_metadata={
+                    "input_tokens": 80,
+                    "output_tokens": 20,
+                    "total_tokens": 100,
+                    "cache_read_input_tokens": 100,  # More than input_tokens
+                },
+            )
+        ]
+    )
+
+    callbacks = [CallbackHandler(mock_client)]
+    chain = prompt | model
+    result = chain.invoke({}, config={"callbacks": callbacks})
+
+    assert result.content == "Response with edge case token counts."
+    assert mock_client.capture.call_count == 3
+
+    generation_args = mock_client.capture.call_args_list[1][1]
+    generation_props = generation_args["properties"]
+
+    assert generation_args["event"] == "$ai_generation"
+    # Input tokens should be 0, not negative: max(80 - 100, 0) = 0
+    assert generation_props["$ai_input_tokens"] == 0
+    assert generation_props["$ai_output_tokens"] == 20
+    assert generation_props["$ai_cache_read_input_tokens"] == 100
+
+
+def test_no_cache_read_tokens_no_subtraction(mock_client):
+    """Test that when there are no cache_read_tokens, input_tokens remain unchanged.
+
+    This tests the conditional check before the subtraction in callbacks.py line 757.
+    """
+    prompt = ChatPromptTemplate.from_messages([("user", "Normal request without cache")])
+
+    # No cache usage - input_tokens should remain as-is
+    model = FakeMessagesListChatModel(
+        responses=[
+            AIMessage(
+                content="Response without cache.",
+                usage_metadata={
+                    "input_tokens": 100,
+                    "output_tokens": 30,
+                    "total_tokens": 130,
+                    # No cache_read_input_tokens
+                },
+            )
+        ]
+    )
+
+    callbacks = [CallbackHandler(mock_client)]
+    chain = prompt | model
+    result = chain.invoke({}, config={"callbacks": callbacks})
+
+    assert result.content == "Response without cache."
+    assert mock_client.capture.call_count == 3
+
+    generation_args = mock_client.capture.call_args_list[1][1]
+    generation_props = generation_args["properties"]
+
+    assert generation_args["event"] == "$ai_generation"
+    # Input tokens should remain unchanged at 100
+    assert generation_props["$ai_input_tokens"] == 100
+    assert generation_props["$ai_output_tokens"] == 30
+    assert generation_props["$ai_cache_read_input_tokens"] == 0
+
+
+def test_zero_input_tokens_with_cache_read(mock_client):
+    """Test edge case where input_tokens is 0 but cache_read_tokens exist.
+
+    This tests the falsy check in the conditional (line 757).
+    """
+    prompt = ChatPromptTemplate.from_messages([("user", "Edge case query")])
+
+    # Edge case: input_tokens is 0 (falsy), should skip subtraction
+    model = FakeMessagesListChatModel(
+        responses=[
+            AIMessage(
+                content="Response.",
+                usage_metadata={
+                    "input_tokens": 0,
+                    "output_tokens": 10,
+                    "total_tokens": 10,
+                    "cache_read_input_tokens": 50,
+                },
+            )
+        ]
+    )
+
+    callbacks = [CallbackHandler(mock_client)]
+    chain = prompt | model
+    result = chain.invoke({}, config={"callbacks": callbacks})
+
+    assert result.content == "Response."
+    assert mock_client.capture.call_count == 3
+
+    generation_args = mock_client.capture.call_args_list[1][1]
+    generation_props = generation_args["properties"]
+
+    assert generation_args["event"] == "$ai_generation"
+    # Input tokens should remain 0 (no subtraction because input_tokens is falsy)
+    assert generation_props["$ai_input_tokens"] == 0
+    assert generation_props["$ai_output_tokens"] == 10
+    assert generation_props["$ai_cache_read_input_tokens"] == 50
+
+
+def test_cache_write_tokens_not_subtracted_from_input(mock_client):
+    """Test that cache_creation_input_tokens (cache write) do NOT affect input_tokens.
+
+    Only cache_read_tokens should be subtracted from input_tokens, not cache_write_tokens.
+    """
+    prompt = ChatPromptTemplate.from_messages([("user", "Create cache")])
+
+    # Cache creation without cache read
+    model = FakeMessagesListChatModel(
+        responses=[
+            AIMessage(
+                content="Creating cache.",
+                usage_metadata={
+                    "input_tokens": 1000,
+                    "output_tokens": 20,
+                    "total_tokens": 1020,
+                    "cache_creation_input_tokens": 800,  # Cache write, not read
+                },
+            )
+        ]
+    )
+
+    callbacks = [CallbackHandler(mock_client)]
+    chain = prompt | model
+    result = chain.invoke({}, config={"callbacks": callbacks})
+
+    assert result.content == "Creating cache."
+    assert mock_client.capture.call_count == 3
+
+    generation_args = mock_client.capture.call_args_list[1][1]
+    generation_props = generation_args["properties"]
+
+    assert generation_args["event"] == "$ai_generation"
+    # Input tokens should NOT be reduced by cache_creation_input_tokens
+    assert generation_props["$ai_input_tokens"] == 1000
+    assert generation_props["$ai_output_tokens"] == 20
+    assert generation_props["$ai_cache_creation_input_tokens"] == 800
+    assert generation_props["$ai_cache_read_input_tokens"] == 0

posthog/version.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-VERSION = "6.7.9"
+VERSION = "6.7.10"
 
 if __name__ == "__main__":
     print(VERSION, end="")  # noqa: T201
