
Commit 4f260c8

fix: added support for azure LLM (#133)
Fixes #114 and #126. Shoutout to @gabriead for helping test this out :)
1 parent 5de2347 commit 4f260c8

2 files changed (+36, -30 lines)


docs/quickstart.ipynb

Lines changed: 13 additions & 23 deletions
@@ -60,6 +60,8 @@
    "\n",
    "Ragas performs a `ground_truth`-free evaluation of your RAG pipelines. This is because, for most people, building a gold-labeled dataset that represents the distribution they see in production is a very expensive process.\n",
    "\n",
+   "**Note:** *While ragas was originally aimed at `ground_truth`-free evaluations, some aspects of the RAG pipeline need `ground_truth` in order to be measured. We're in the process of building testset generation features that will make this easier. Check out [issue#136](https://github.com/explodinggradients/ragas/issues/136) for more details.*\n",
+   "\n",
    "Hence, to work with ragas, all you need is the following data\n",
    "- question: `list[str]` - These are the questions your RAG pipeline will be evaluated on.\n",
    "- answer: `list[str]` - The answer generated by the RAG pipeline and given to the user.\n",
@@ -73,7 +75,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 1,
    "id": "b658e02f",
    "metadata": {},
    "outputs": [
@@ -87,7 +89,7 @@
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
-      "model_id": "e481f1b6ae824149aaf5afe96330fda3",
+      "model_id": "a2dfebb012dd4b79b3a6ed951ce0d406",
       "version_major": 2,
       "version_minor": 0
      },
@@ -109,7 +111,7 @@
      "})"
     ]
    },
-   "execution_count": 8,
+   "execution_count": 1,
    "metadata": {},
    "output_type": "execute_result"
   }
@@ -141,7 +143,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 3,
    "id": "f17bcf9d",
    "metadata": {},
    "outputs": [],
@@ -185,7 +187,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "id": "22eb6f97",
    "metadata": {},
    "outputs": [
@@ -200,7 +202,7 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "100%|█████████████████████████████████████████████████████████████| 1/1 [00:06<00:00, 6.57s/it]\n"
+     "100%|████████████████████████████████████████████████████████████| 2/2 [04:08<00:00, 124.31s/it]\n"
     ]
    },
    {
@@ -214,7 +216,7 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "100%|█████████████████████████████████████████████████████████████| 1/1 [00:28<00:00, 28.82s/it]\n"
+     "100%|████████████████████████████████████████████████████████████| 2/2 [06:29<00:00, 194.60s/it]\n"
     ]
    },
    {
@@ -228,7 +230,7 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "100%|█████████████████████████████████████████████████████████████| 1/1 [00:07<00:00, 7.53s/it]\n"
+     "100%|█████████████████████████████████████████████████████████████| 2/2 [01:16<00:00, 38.12s/it]\n"
     ]
    },
    {
@@ -242,7 +244,7 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "100%|█████████████████████████████████████████████████████████████| 1/1 [00:24<00:00, 24.13s/it]\n"
+     "100%|████████████████████████████████████████████████████████████| 2/2 [07:53<00:00, 236.95s/it]\n"
     ]
    },
    {
@@ -256,25 +258,15 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "100%|█████████████████████████████████████████████████████████████| 1/1 [00:07<00:00, 7.31s/it]\n"
+     " 50%|██████████████████████████████| 1/2 [00:46<00:46, 46.32s/it]"
     ]
-   },
-   {
-    "data": {
-     "text/plain": [
-      "{'ragas_score': 0.3482, 'context_relevancy': 0.1296, 'faithfulness': 0.8889, 'answer_relevancy': 0.9285, 'context_recall': 0.6370, 'harmfulness': 0.0000}"
-     ]
-    },
-    "execution_count": 10,
-    "metadata": {},
-    "output_type": "execute_result"
    }
   ],
   "source": [
    "from ragas import evaluate\n",
    "\n",
    "result = evaluate(\n",
-   "    fiqa_eval[\"baseline\"].select(range(3)),\n",
+   "    fiqa_eval[\"baseline\"],\n",
    "    metrics=[\n",
    "        context_relevancy,\n",
    "        faithfulness,\n",
@@ -454,8 +446,6 @@
   "source": [
    "And that's it!\n",
    "\n",
-   "You can check out the [ragas in action] notebook to get a feel of what is like to use it while trying to improve your pipelines.\n",
-   "\n",
    "If you have any suggestions/feedback/things you're not happy about, please do share them in the [issue section](https://github.com/explodinggradients/ragas/issues). We love hearing from you 😁"
   ]
  }
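
The notebook's evaluation step boils down to the pattern below. This is a minimal sketch rather than the notebook verbatim: the hand-built single-row dataset and the `from ragas.metrics import ...` path are assumptions matching the quickstart's data description, whereas the notebook itself evaluates the prebuilt `fiqa_eval["baseline"]` split.

    from datasets import Dataset

    from ragas import evaluate
    from ragas.metrics import (
        answer_relevancy,
        context_recall,
        context_relevancy,
        faithfulness,
    )

    # Hypothetical dataset in the column layout the quickstart describes:
    # question / answer / contexts, plus ground_truths for context_recall.
    data = Dataset.from_dict(
        {
            "question": ["How do I open a brokerage account?"],
            "answer": ["Most brokers let you open an account online in minutes."],
            "contexts": [["Brokerage accounts can usually be opened online."]],
            "ground_truths": [["You can open a brokerage account online."]],
        }
    )

    result = evaluate(
        data,
        metrics=[context_relevancy, faithfulness, answer_relevancy, context_recall],
    )
    print(result)  # dict-like scores, e.g. {'ragas_score': ..., 'faithfulness': ...}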

src/ragas/metrics/llms.py

Lines changed: 23 additions & 7 deletions
@@ -2,9 +2,9 @@

 import typing as t

-from langchain.chat_models import ChatOpenAI
+from langchain.chat_models import AzureChatOpenAI, ChatOpenAI
 from langchain.chat_models.base import BaseChatModel
-from langchain.llms import OpenAI
+from langchain.llms import AzureOpenAI, OpenAI
 from langchain.llms.base import BaseLLM
 from langchain.prompts import ChatPromptTemplate
 from langchain.schema import LLMResult
@@ -17,18 +17,33 @@ def isOpenAI(llm: BaseLLM | BaseChatModel) -> bool:
     return isinstance(llm, OpenAI) or isinstance(llm, ChatOpenAI)


+# have to specify it twice for runtime and static checks
+MULTIPLE_COMPLETION_SUPPORTED = [OpenAI, ChatOpenAI, AzureOpenAI, AzureChatOpenAI]
+MultipleCompletionSupportedLLM = t.Union[
+    OpenAI, ChatOpenAI, AzureOpenAI, AzureChatOpenAI
+]
+
+
+def multiple_completion_supported(llm: BaseLLM | BaseChatModel) -> bool:
+    for model in MULTIPLE_COMPLETION_SUPPORTED:
+        if isinstance(llm, model):
+            return True
+    return False
+
+
 def generate(
     prompts: list[ChatPromptTemplate],
     llm: BaseLLM | BaseChatModel,
-    n: t.Optional[int] = None,
+    n: int = 1,
     temperature: float = 0,
     callbacks: t.Optional[Callbacks] = None,
 ) -> LLMResult:
-    old_n = None
+    old_n: int = 1
     n_swapped = False
     llm.temperature = temperature
     if n is not None:
-        if isinstance(llm, OpenAI) or isinstance(llm, ChatOpenAI):
+        if multiple_completion_supported(llm):
+            llm = t.cast(MultipleCompletionSupportedLLM, llm)
             old_n = llm.n
             llm.n = n
             n_swapped = True
@@ -44,7 +59,8 @@ def generate(
     ps = [p.format_messages() for p in prompts]
     result = llm.generate(ps, callbacks=callbacks)

-    if (isinstance(llm, OpenAI) or isinstance(llm, ChatOpenAI)) and n_swapped:
-        llm.n = old_n  # type: ignore
+    if multiple_completion_supported(llm) and n_swapped:
+        llm = t.cast(MultipleCompletionSupportedLLM, llm)
+        llm.n = old_n

     return result
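
The net effect of the patch: `generate()` can now temporarily swap `n` on Azure-hosted models too, not just on `OpenAI`/`ChatOpenAI`. Below is a minimal sketch of driving the new path with an Azure deployment; the `AzureChatOpenAI` constructor arguments are placeholders in the langchain style of this period, so treat the exact parameter names and values as assumptions to verify against your installed version.

    from langchain.chat_models import AzureChatOpenAI
    from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate

    from ragas.metrics.llms import generate

    # Placeholder Azure settings -- substitute your own resource and deployment.
    llm = AzureChatOpenAI(
        deployment_name="my-gpt-35-deployment",
        openai_api_base="https://my-resource.openai.azure.com/",
        openai_api_version="2023-05-15",
        openai_api_key="...",
    )

    # generate() calls prompt.format_messages() with no arguments, so the
    # template must contain no unfilled variables.
    prompt = ChatPromptTemplate.from_messages(
        [HumanMessagePromptTemplate.from_template("What is a bond index fund?")]
    )

    # multiple_completion_supported(llm) is now True for AzureChatOpenAI, so
    # generate() swaps llm.n to 2 for this call and restores it afterwards.
    result = generate([prompt], llm, n=2)
    print(len(result.generations[0]))  # 2 completions for the single prompt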
