fix(langchain): handle secret str api keys [backport 2.0] (#7479)

github-actions[bot] · albertjan · Yun-Kim · web-flow · commit c17e063ec1a3 · 2023-11-06T14:07:57.000-05:00
Backport 6dc61f5 from #7430 to 2.0. Currently the Anthropic chain implementation in langchain uses a pydantic `SecretStr` as an API key; this is causing errors in our pipeline when ddtrace tries to format the API key. With this PR (langchain-ai/langchain#12542), the OpenAI implementation will also start using a `SecretStr`. I'm sure at that point there will be a few more people asking why things are broken. I'm struggling with setting up and running the tests; riot doesn't print anything, and I have no experience with the cassette testing methods. Can someone help with this? I think if we add a test that uses the Anthropic LLM, we will see the failure before this change, and this change will fix it. I've updated the type comment on the function, but the env doesn't know about Pydantic, so I don't know if this is a valid thing to do. ## Checklist - [X] Change(s) are motivated and described in the PR description. - [x] Testing strategy is described if automated tests are not included in the PR. - [X] Risk is outlined (performance impact, potential for breakage, maintainability, etc). - [X] Change is maintainable (easy to change, telemetry, documentation). - [X] [Library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) are followed. If no release note is required, add label `changelog/no-changelog`. - [X] Documentation is included (in-code, generated user docs, [public corp docs](https://github.com/DataDog/documentation/)). - [x] Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Title is accurate. - [x] No unnecessary changes are introduced. - [x] Description motivates each change. - [x] Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes unless absolutely necessary. - [x] Testing strategy adequately addresses listed risk(s). - [x] Change is maintainable (easy to change, telemetry, documentation). 
- [x] Release note makes sense to a user of the library. - [x] Reviewer has explicitly acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment. - [x] Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) - [x] If this PR touches code that signs or publishes builds or packages, or handles credentials of any kind, I've requested a review from `@DataDog/security-design-and-guidance`. - [x] This PR doesn't touch any of that. --------- Co-authored-by: Albert-Jan Nijburg <me@ajp.codes> Co-authored-by: Yun Kim <yun.kim@datadoghq.com> Co-authored-by: Yun Kim <35776586+Yun-Kim@users.noreply.github.com>
diff --git a/ddtrace/contrib/langchain/patch.py b/ddtrace/contrib/langchain/patch.py
@@ -7,6 +7,7 @@
 
 import langchain
 from langchain.callbacks.openai_info import get_openai_token_cost_for_model
+from pydantic import SecretStr
 
 from ddtrace import config
 from ddtrace.constants import ERROR_TYPE
@@ -140,8 +141,11 @@ def _extract_model_name(instance):
 
 
 def _format_api_key(api_key):
-    # type: (str) -> str
+    # type: (str | SecretStr) -> str
     """Obfuscate a given LLM provider API key by returning the last four characters."""
+    if hasattr(api_key, "get_secret_value"):
+        api_key = api_key.get_secret_value()
+
     if not api_key or len(api_key) < 4:
         return ""
     return "...%s" % api_key[-4:]
@@ -695,7 +699,7 @@ def traced_similarity_search(langchain, pin, func, instance, args, kwargs):
                 instance._index.configuration.server_variables.get("project_name", ""),
             )
             api_key = instance._index.configuration.api_key.get("ApiKeyAuth", "")
-            span.set_tag_str(API_KEY, "...%s" % api_key[-4:])  # override api_key for Pinecone
+            span.set_tag_str(API_KEY, _format_api_key(api_key))  # override api_key for Pinecone
         documents = func(*args, **kwargs)
         span.set_metric("langchain.response.document_count", len(documents))
         for idx, document in enumerate(documents):
diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt
@@ -175,6 +175,7 @@ proxying
 psutil
 psycopg
 py
+pydantic
 pyenv
 PyFrameObject
 pylibmc
diff --git a/releasenotes/notes/langchain-api-key-secret-str-b51ef4f3be0b7315.yaml b/releasenotes/notes/langchain-api-key-secret-str-b51ef4f3be0b7315.yaml
@@ -0,0 +1,4 @@
+---
+fixes:
+  - |
+    langchain: This fix resolves an issue with tagging pydantic `SecretStr` type api keys.
diff --git a/tests/contrib/langchain/test_langchain.py b/tests/contrib/langchain/test_langchain.py
@@ -53,6 +53,9 @@ def langchain(ddtrace_config_langchain, mock_logs, mock_metrics):
     with override_config("langchain", ddtrace_config_langchain):
         # ensure that mock OpenAI API key is passed in
         os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY", "<not-a-real-key>")
+        os.environ["COHERE_API_KEY"] = os.getenv("COHERE_API_KEY", "<not-a-real-key>")
+        os.environ["HUGGINGFACEHUB_API_TOKEN"] = os.getenv("HUGGINGFACEHUB_API_TOKEN", "<not-a-real-key>")
+        os.environ["AI21_API_KEY"] = os.getenv("AI21_API_KEY", "<not-a-real-key>")
         patch()
         import langchain
 
@@ -1078,9 +1081,7 @@ def test_pinecone_vectorstore_similarity_search(langchain, request_vcr):
             api_key=os.getenv("PINECONE_API_KEY", "<not-a-real-key>"),
             environment=os.getenv("PINECONE_ENV", "<not-a-real-env>"),
         )
-        embed = langchain.embeddings.OpenAIEmbeddings(
-            model="text-embedding-ada-002", openai_api_key=os.getenv("OPENAI_API_KEY", "<not-a-real-key>")
-        )
+        embed = langchain.embeddings.OpenAIEmbeddings(model="text-embedding-ada-002")
         index = pinecone.Index(index_name="langchain-retrieval")
         vectorstore = langchain.vectorstores.Pinecone(index, embed.embed_query, "text")
         vectorstore.similarity_search("Who was Alan Turing?", 1)
@@ -1100,9 +1101,7 @@ def test_pinecone_vectorstore_retrieval_chain(langchain, request_vcr):
             api_key=os.getenv("PINECONE_API_KEY", "<not-a-real-key>"),
             environment=os.getenv("PINECONE_ENV", "<not-a-real-env>"),
         )
-        embed = langchain.embeddings.OpenAIEmbeddings(
-            model="text-embedding-ada-002", openai_api_key=os.getenv("OPENAI_API_KEY", "<not-a-real-key>")
-        )
+        embed = langchain.embeddings.OpenAIEmbeddings(model="text-embedding-ada-002")
         index = pinecone.Index(index_name="langchain-retrieval")
         vectorstore = langchain.vectorstores.Pinecone(index, embed.embed_query, "text")
 
@@ -1127,9 +1126,7 @@ def test_pinecone_vectorstore_retrieval_chain_39(langchain, request_vcr):
             api_key=os.getenv("PINECONE_API_KEY", "<not-a-real-key>"),
             environment=os.getenv("PINECONE_ENV", "<not-a-real-env>"),
         )
-        embed = langchain.embeddings.OpenAIEmbeddings(
-            model="text-embedding-ada-002", openai_api_key=os.getenv("OPENAI_API_KEY", "<not-a-real-key>")
-        )
+        embed = langchain.embeddings.OpenAIEmbeddings(model="text-embedding-ada-002")
         index = pinecone.Index(index_name="langchain-retrieval")
         vectorstore = langchain.vectorstores.Pinecone(index, embed.embed_query, "text")
 
@@ -1152,9 +1149,7 @@ def test_vectorstore_similarity_search_metrics(langchain, request_vcr, mock_metr
             api_key=os.getenv("PINECONE_API_KEY", "<not-a-real-key>"),
             environment=os.getenv("PINECONE_ENV", "<not-a-real-env>"),
         )
-        embed = langchain.embeddings.OpenAIEmbeddings(
-            model="text-embedding-ada-002", openai_api_key=os.getenv("OPENAI_API_KEY", "<not-a-real-key>")
-        )
+        embed = langchain.embeddings.OpenAIEmbeddings(model="text-embedding-ada-002")
         index = pinecone.Index(index_name="langchain-retrieval")
         vectorstore = langchain.vectorstores.Pinecone(index, embed.embed_query, "text")
         vectorstore.similarity_search("Who was Alan Turing?", 1)
@@ -1205,9 +1200,7 @@ def test_vectorstore_logs(langchain, ddtrace_config_langchain, request_vcr, mock
             api_key=os.getenv("PINECONE_API_KEY", "<not-a-real-key>"),
             environment=os.getenv("PINECONE_ENV", "<not-a-real-env>"),
         )
-        embed = langchain.embeddings.OpenAIEmbeddings(
-            model="text-embedding-ada-002", openai_api_key=os.getenv("OPENAI_API_KEY", "<not-a-real-key>")
-        )
+        embed = langchain.embeddings.OpenAIEmbeddings(model="text-embedding-ada-002")
         index = pinecone.Index(index_name="langchain-retrieval")
         vectorstore = langchain.vectorstores.Pinecone(index, embed.embed_query, "text")
         vectorstore.similarity_search("Who was Alan Turing?", 1)

-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +---
 +fixes:
 +  - |
 +    langchain: This fix resolves an issue with tagging pydantic `SecretStr` type api keys.