Merged
87 changes: 67 additions & 20 deletions newrelic/hooks/mlmodel_openai.py
@@ -129,11 +129,11 @@ def create_chat_completion_message_event(
span_id,
trace_id,
response_model,
request_model,
response_id,
request_id,
llm_metadata,
output_message_list,
all_token_counts,
):
settings = transaction.settings if transaction.settings is not None else global_settings()

@@ -153,11 +153,6 @@ def create_chat_completion_message_event(
"request_id": request_id,
"span_id": span_id,
"trace_id": trace_id,
"token_count": (
settings.ai_monitoring.llm_token_count_callback(request_model, message_content)
if settings.ai_monitoring.llm_token_count_callback
else None
),
"role": message.get("role"),
"completion_id": chat_completion_id,
"sequence": index,
@@ -166,6 +161,9 @@ def create_chat_completion_message_event(
"ingest_source": "Python",
}

if all_token_counts:
chat_completion_input_message_dict["token_count"] = 0

if settings.ai_monitoring.record_content.enabled:
chat_completion_input_message_dict["content"] = message_content

@@ -193,11 +191,6 @@ def create_chat_completion_message_event(
"request_id": request_id,
"span_id": span_id,
"trace_id": trace_id,
"token_count": (
settings.ai_monitoring.llm_token_count_callback(response_model, message_content)
if settings.ai_monitoring.llm_token_count_callback
else None
),
"role": message.get("role"),
"completion_id": chat_completion_id,
"sequence": index,
@@ -207,6 +200,9 @@ def create_chat_completion_message_event(
"is_response": True,
}

if all_token_counts:
chat_completion_output_message_dict["token_count"] = 0

if settings.ai_monitoring.record_content.enabled:
chat_completion_output_message_dict["content"] = message_content

@@ -280,15 +276,18 @@ def _record_embedding_success(transaction, embedding_id, linking_metadata, kwarg
else getattr(attribute_response, "organization", None)
)

response_total_tokens = attribute_response.get("usage", {}).get("total_tokens") if response else None

total_tokens = (
settings.ai_monitoring.llm_token_count_callback(response_model, input_)
if settings.ai_monitoring.llm_token_count_callback and input_
else response_total_tokens
)

full_embedding_response_dict = {
"id": embedding_id,
"span_id": span_id,
"trace_id": trace_id,
"token_count": (
settings.ai_monitoring.llm_token_count_callback(response_model, input_)
if settings.ai_monitoring.llm_token_count_callback
else None
),
"request.model": kwargs.get("model") or kwargs.get("engine"),
"request_id": request_id,
"duration": ft.duration * 1000,
@@ -313,6 +312,7 @@ def _record_embedding_success(transaction, embedding_id, linking_metadata, kwarg
"response.headers.ratelimitRemainingRequests": check_rate_limit_header(
response_headers, "x-ratelimit-remaining-requests", True
),
"response.usage.total_tokens": total_tokens,
"vendor": "openai",
"ingest_source": "Python",
}
@@ -475,12 +475,15 @@ def _handle_completion_success(transaction, linking_metadata, completion_id, kwa


def _record_completion_success(transaction, linking_metadata, completion_id, kwargs, ft, response_headers, response):
settings = transaction.settings if transaction.settings is not None else global_settings()
span_id = linking_metadata.get("span.id")
trace_id = linking_metadata.get("trace.id")

try:
if response:
response_model = response.get("model")
response_id = response.get("id")
token_usage = response.get("usage") or {}
output_message_list = []
finish_reason = None
choices = response.get("choices") or []
@@ -494,6 +497,7 @@ def _record_completion_success(transaction, linking_metadata, completion_id, kwa
else:
response_model = kwargs.get("response.model")
response_id = kwargs.get("id")
token_usage = {}
output_message_list = []
finish_reason = kwargs.get("finish_reason")
if "content" in kwargs:
@@ -505,10 +509,44 @@ def _record_completion_success(transaction, linking_metadata, completion_id, kwa
output_message_list = []
request_model = kwargs.get("model") or kwargs.get("engine")

request_id = response_headers.get("x-request-id")
organization = response_headers.get("openai-organization") or getattr(response, "organization", None)
messages = kwargs.get("messages") or [{"content": kwargs.get("prompt"), "role": "user"}]
input_message_list = list(messages)

# Extract token counts from response object
if token_usage:
response_prompt_tokens = token_usage.get("prompt_tokens")
response_completion_tokens = token_usage.get("completion_tokens")
response_total_tokens = token_usage.get("total_tokens")

else:
response_prompt_tokens = None
response_completion_tokens = None
response_total_tokens = None

# Calculate token counts by checking if a callback is registered and if we have the necessary content to pass
# to it. If not, then we use the token counts provided in the response object
input_message_content = " ".join([msg.get("content", "") for msg in input_message_list if msg.get("content")])
prompt_tokens = (
settings.ai_monitoring.llm_token_count_callback(request_model, input_message_content)
if settings.ai_monitoring.llm_token_count_callback and input_message_content
else response_prompt_tokens
)
output_message_content = " ".join([msg.get("content", "") for msg in output_message_list if msg.get("content")])
completion_tokens = (
settings.ai_monitoring.llm_token_count_callback(response_model, output_message_content)
if settings.ai_monitoring.llm_token_count_callback and output_message_content
else response_completion_tokens
)

total_tokens = (
prompt_tokens + completion_tokens if all([prompt_tokens, completion_tokens]) else response_total_tokens
)

all_token_counts = bool(prompt_tokens and completion_tokens and total_tokens)

request_id = response_headers.get("x-request-id")
organization = response_headers.get("openai-organization") or getattr(response, "organization", None)

full_chat_completion_summary_dict = {
"id": completion_id,
"span_id": span_id,
@@ -553,6 +591,12 @@ def _record_completion_success(transaction, linking_metadata, completion_id, kwa
),
"response.number_of_messages": len(input_message_list) + len(output_message_list),
}

if all_token_counts:
full_chat_completion_summary_dict["response.usage.prompt_tokens"] = prompt_tokens
full_chat_completion_summary_dict["response.usage.completion_tokens"] = completion_tokens
full_chat_completion_summary_dict["response.usage.total_tokens"] = total_tokens

llm_metadata = _get_llm_attributes(transaction)
full_chat_completion_summary_dict.update(llm_metadata)
transaction.record_custom_event("LlmChatCompletionSummary", full_chat_completion_summary_dict)
@@ -564,11 +608,11 @@ def _record_completion_success(transaction, linking_metadata, completion_id, kwa
span_id,
trace_id,
response_model,
request_model,
response_id,
request_id,
llm_metadata,
output_message_list,
all_token_counts,
)
except Exception:
_logger.warning(RECORD_EVENTS_FAILURE_LOG_MESSAGE, traceback.format_exception(*sys.exc_info()))
@@ -579,6 +623,7 @@ def _record_completion_error(transaction, linking_metadata, completion_id, kwarg
trace_id = linking_metadata.get("trace.id")
request_message_list = kwargs.get("messages", None) or []
notice_error_attributes = {}

try:
if OPENAI_V1:
response = getattr(exc, "response", None)
@@ -643,18 +688,20 @@ def _record_completion_error(transaction, linking_metadata, completion_id, kwarg
output_message_list = []
if "content" in kwargs:
output_message_list = [{"content": kwargs.get("content"), "role": kwargs.get("role")}]

create_chat_completion_message_event(
transaction,
request_message_list,
completion_id,
span_id,
trace_id,
kwargs.get("response.model"),
request_model,
response_id,
request_id,
llm_metadata,
output_message_list,
# We do not record token counts in error cases, so set all_token_counts to True so the pipeline tokenizer does not run
all_token_counts=True,
)
except Exception:
_logger.warning(RECORD_EVENTS_FAILURE_LOG_MESSAGE, traceback.format_exception(*sys.exc_info()))
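Taken together, the hook changes above set up a precedence rule for token counts: a registered llm_token_count_callback is used whenever there is message content to pass to it, the agent otherwise falls back to the usage block reported in the OpenAI response, and per-message token_count is pinned to 0 once summary-level counts are available. Below is a minimal sketch of that rule only; the helper name resolve_token_counts and the plain-dict usage argument are illustrative assumptions, not part of the agent's API.

def resolve_token_counts(callback, usage, request_model, response_model, input_content, output_content):
    # Illustrative only: the agent reads `callback` from
    # settings.ai_monitoring.llm_token_count_callback and `usage` from the
    # OpenAI response; this helper just restates the fallback order.
    usage = usage or {}
    prompt_tokens = (
        callback(request_model, input_content)
        if callback and input_content
        else usage.get("prompt_tokens")
    )
    completion_tokens = (
        callback(response_model, output_content)
        if callback and output_content
        else usage.get("completion_tokens")
    )
    # Derive the total only when both parts are known; otherwise keep
    # whatever total the response itself reported.
    total_tokens = (
        prompt_tokens + completion_tokens
        if prompt_tokens and completion_tokens
        else usage.get("total_tokens")
    )
    all_token_counts = bool(prompt_tokens and completion_tokens and total_tokens)
    return prompt_tokens, completion_tokens, total_tokens, all_token_counts

# With no callback registered, the response usage is reported unchanged: (53, 11, 64, True)
print(resolve_token_counts(None, {"prompt_tokens": 53, "completion_tokens": 11, "total_tokens": 64}, "gpt-3.5-turbo", "gpt-3.5-turbo-0613", "What is 2 + 4?", "6"))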
8 changes: 8 additions & 0 deletions tests/mlmodel_langchain/test_chain.py
@@ -359,6 +359,7 @@
"response.headers.ratelimitResetRequests": "20ms",
"response.headers.ratelimitRemainingTokens": 999992,
"response.headers.ratelimitRemainingRequests": 2999,
"response.usage.total_tokens": 8,
"vendor": "openai",
"ingest_source": "Python",
"input": "[[3923, 374, 220, 17, 489, 220, 19, 30]]",
@@ -382,6 +383,7 @@
"response.headers.ratelimitResetRequests": "20ms",
"response.headers.ratelimitRemainingTokens": 999998,
"response.headers.ratelimitRemainingRequests": 2999,
"response.usage.total_tokens": 1,
"vendor": "openai",
"ingest_source": "Python",
"input": "[[10590]]",
@@ -452,6 +454,9 @@
"response.headers.ratelimitResetRequests": "8.64s",
"response.headers.ratelimitRemainingTokens": 199912,
"response.headers.ratelimitRemainingRequests": 9999,
"response.usage.prompt_tokens": 73,
"response.usage.completion_tokens": 375,
"response.usage.total_tokens": 448,
"response.number_of_messages": 3,
},
],
@@ -467,6 +472,7 @@
"sequence": 0,
"response.model": "gpt-3.5-turbo-0125",
"vendor": "openai",
"token_count": 0,
"ingest_source": "Python",
"content": "You are a generator of quiz questions for a seminar. Use the following pieces of retrieved context to generate 5 multiple choice questions (A,B,C,D) on the subject matter. Use a three sentence maximum and keep the answer concise. Render the output as HTML\n\nWhat is 2 + 4?",
},
@@ -483,6 +489,7 @@
"sequence": 1,
"response.model": "gpt-3.5-turbo-0125",
"vendor": "openai",
"token_count": 0,
"ingest_source": "Python",
"content": "math",
},
@@ -499,6 +506,7 @@
"sequence": 2,
"response.model": "gpt-3.5-turbo-0125",
"vendor": "openai",
"token_count": 0,
"ingest_source": "Python",
"is_response": True,
"content": "```html\n<!DOCTYPE html>\n<html>\n<head>\n <title>Math Quiz</title>\n</head>\n<body>\n <h2>Math Quiz Questions</h2>\n <ol>\n <li>What is the result of 5 + 3?</li>\n <ul>\n <li>A) 7</li>\n <li>B) 8</li>\n <li>C) 9</li>\n <li>D) 10</li>\n </ul>\n <li>What is the product of 6 x 7?</li>\n <ul>\n <li>A) 36</li>\n <li>B) 42</li>\n <li>C) 48</li>\n <li>D) 56</li>\n </ul>\n <li>What is the square root of 64?</li>\n <ul>\n <li>A) 6</li>\n <li>B) 7</li>\n <li>C) 8</li>\n <li>D) 9</li>\n </ul>\n <li>What is the result of 12 / 4?</li>\n <ul>\n <li>A) 2</li>\n <li>B) 3</li>\n <li>C) 4</li>\n <li>D) 5</li>\n </ul>\n <li>What is the sum of 15 + 9?</li>\n <ul>\n <li>A) 22</li>\n <li>B) 23</li>\n <li>C) 24</li>\n <li>D) 25</li>\n </ul>\n </ol>\n</body>\n</html>\n```",
12 changes: 9 additions & 3 deletions tests/mlmodel_openai/test_chat_completion.py
@@ -15,7 +15,7 @@
import openai
from testing_support.fixtures import override_llm_token_callback_settings, reset_core_stats_engine, validate_attributes
from testing_support.ml_testing_utils import (
add_token_count_to_events,
add_token_counts_to_chat_events,
disabled_ai_monitoring_record_content_settings,
disabled_ai_monitoring_settings,
disabled_ai_monitoring_streaming_settings,
@@ -55,6 +55,9 @@
"response.organization": "new-relic-nkmd8b",
"request.temperature": 0.7,
"request.max_tokens": 100,
"response.usage.completion_tokens": 11,
"response.usage.total_tokens": 64,
"response.usage.prompt_tokens": 53,
"response.choices.finish_reason": "stop",
"response.headers.llmVersion": "2020-10-01",
"response.headers.ratelimitLimitRequests": 200,
@@ -81,6 +84,7 @@
"role": "system",
"completion_id": None,
"sequence": 0,
"token_count": 0,
"response.model": "gpt-3.5-turbo-0613",
"vendor": "openai",
"ingest_source": "Python",
@@ -99,6 +103,7 @@
"role": "user",
"completion_id": None,
"sequence": 1,
"token_count": 0,
"response.model": "gpt-3.5-turbo-0613",
"vendor": "openai",
"ingest_source": "Python",
@@ -117,6 +122,7 @@
"role": "assistant",
"completion_id": None,
"sequence": 2,
"token_count": 0,
"response.model": "gpt-3.5-turbo-0613",
"vendor": "openai",
"is_response": True,
@@ -172,7 +178,7 @@ def test_openai_chat_completion_sync_no_content(set_trace_info):

@reset_core_stats_engine()
@override_llm_token_callback_settings(llm_token_count_callback)
@validate_custom_events(add_token_count_to_events(chat_completion_recorded_events))
@validate_custom_events(add_token_counts_to_chat_events(chat_completion_recorded_events))
# One summary event, one system message, one user message, and one response message from the assistant
@validate_custom_event_count(count=4)
@validate_transaction_metrics(
@@ -343,7 +349,7 @@ def test_openai_chat_completion_async_no_content(loop, set_trace_info):

@reset_core_stats_engine()
@override_llm_token_callback_settings(llm_token_count_callback)
@validate_custom_events(add_token_count_to_events(chat_completion_recorded_events))
@validate_custom_events(add_token_counts_to_chat_events(chat_completion_recorded_events))
# One summary event, one system message, one user message, and one response message from the assistant
@validate_custom_event_count(count=4)
@validate_transaction_metrics(
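The token-callback tests above register llm_token_count_callback through the override_llm_token_callback_settings fixture and wrap the expected events with add_token_counts_to_chat_events. Outside the test suite, an application supplies its own callback with the same (model, content) -> int contract the hooks call. The sketch below is hedged: the tiktoken-based body and registration via newrelic.agent.set_llm_token_count_callback are assumptions for illustration, not taken from this change.

import newrelic.agent
import tiktoken

def count_llm_tokens(model, content):
    # Same (model, content) -> int contract the hooks use when computing
    # prompt_tokens and completion_tokens.
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Unknown model names fall back to a common encoding.
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(content))

newrelic.agent.set_llm_token_count_callback(count_llm_tokens)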