Skip to content

Commit 48ae599

Browse files
authored
Minor changes (#38)
* added readme
* added how to use
* added metrics to docs
* update readme
* set model_max_length
* fix import paths
1 parent ee6a295 commit 48ae599

File tree

2 files changed

+35
-39
lines changed

2 files changed

+35
-39
lines changed

experiments/assesments/metrics_assesments.ipynb

Lines changed: 34 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,19 @@
3232
},
3333
{
3434
"cell_type": "code",
35-
"execution_count": 62,
35+
"execution_count": 1,
3636
"id": "7bfb2480",
3737
"metadata": {},
38-
"outputs": [],
38+
"outputs": [
39+
{
40+
"name": "stderr",
41+
"output_type": "stream",
42+
"text": [
43+
"/opt/anaconda3/envs/alerts/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
44+
" from .autonotebook import tqdm as notebook_tqdm\n"
45+
]
46+
}
47+
],
3948
"source": [
4049
"import json\n",
4150
"from datasets import load_dataset\n",
@@ -55,7 +64,7 @@
5564
"metadata": {},
5665
"outputs": [],
5766
"source": [
58-
"os.chdir(\"/Users/shahules/belar/\")"
67+
"os.chdir('/Users/shahules/belar/src/')"
5968
]
6069
},
6170
{
@@ -135,7 +144,7 @@
135144
},
136145
{
137146
"cell_type": "code",
138-
"execution_count": 129,
147+
"execution_count": 7,
139148
"id": "f9f4280e",
140149
"metadata": {},
141150
"outputs": [
@@ -144,7 +153,7 @@
144153
"output_type": "stream",
145154
"text": [
146155
"Found cached dataset parquet (/Users/shahules/.cache/huggingface/datasets/explodinggradients___parquet/explodinggradients--ragas-wikiqa-5b5116e5cb909aca/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)\n",
147-
"100%|█| 1/1 [00:00<00:00, 58.\n"
156+
"100%|████████████████████████████████████████████████████| 1/1 [00:00<00:00, 242.78it/s]\n"
148157
]
149158
}
150159
],
@@ -162,7 +171,7 @@
162171
},
163172
{
164173
"cell_type": "code",
165-
"execution_count": 153,
174+
"execution_count": 8,
166175
"id": "eca20daf",
167176
"metadata": {},
168177
"outputs": [],
@@ -184,7 +193,7 @@
184193
},
185194
{
186195
"cell_type": "code",
187-
"execution_count": 8,
196+
"execution_count": 9,
188197
"id": "f3e35532",
189198
"metadata": {},
190199
"outputs": [],
@@ -216,7 +225,7 @@
216225
},
217226
{
218227
"cell_type": "code",
219-
"execution_count": 9,
228+
"execution_count": 10,
220229
"id": "335081e3",
221230
"metadata": {},
222231
"outputs": [],
@@ -252,7 +261,7 @@
252261
},
253262
{
254263
"cell_type": "code",
255-
"execution_count": 18,
264+
"execution_count": 11,
256265
"id": "b2642e5b",
257266
"metadata": {},
258267
"outputs": [],
@@ -267,7 +276,7 @@
267276
},
268277
{
269278
"cell_type": "code",
270-
"execution_count": 19,
279+
"execution_count": 13,
271280
"id": "26ca4af4",
272281
"metadata": {},
273282
"outputs": [
@@ -284,7 +293,7 @@
284293
"0"
285294
]
286295
},
287-
"execution_count": 19,
296+
"execution_count": 13,
288297
"metadata": {},
289298
"output_type": "execute_result"
290299
}
@@ -305,7 +314,7 @@
305314
},
306315
{
307316
"cell_type": "code",
308-
"execution_count": null,
317+
"execution_count": 14,
309318
"id": "ca1c56d6",
310319
"metadata": {},
311320
"outputs": [],
@@ -327,7 +336,7 @@
327336
},
328337
{
329338
"cell_type": "code",
330-
"execution_count": null,
339+
"execution_count": 15,
331340
"id": "cd7fed9c",
332341
"metadata": {},
333342
"outputs": [],
@@ -343,7 +352,7 @@
343352
},
344353
{
345354
"cell_type": "code",
346-
"execution_count": null,
355+
"execution_count": 16,
347356
"id": "35113558",
348357
"metadata": {},
349358
"outputs": [],
@@ -354,7 +363,7 @@
354363
},
355364
{
356365
"cell_type": "code",
357-
"execution_count": 16,
366+
"execution_count": 17,
358367
"id": "4e82d0df",
359368
"metadata": {},
360369
"outputs": [
@@ -368,10 +377,10 @@
368377
{
369378
"data": {
370379
"text/plain": [
371-
"3.514920235612768"
380+
"3.5533440372846865"
372381
]
373382
},
374-
"execution_count": 16,
383+
"execution_count": 17,
375384
"metadata": {},
376385
"output_type": "execute_result"
377386
}
@@ -399,40 +408,27 @@
399408
},
400409
{
401410
"cell_type": "code",
402-
"execution_count": 124,
411+
"execution_count": 13,
403412
"id": "cc263805",
404413
"metadata": {},
405414
"outputs": [],
406415
"source": [
407-
"from experimental.relevance import QGen"
416+
"from ragas.metrics.answer_relevance import QGen"
408417
]
409418
},
410419
{
411420
"cell_type": "code",
412-
"execution_count": 125,
421+
"execution_count": 14,
413422
"id": "38deaf06",
414423
"metadata": {},
415-
"outputs": [
416-
{
417-
"name": "stderr",
418-
"output_type": "stream",
419-
"text": [
420-
"/opt/anaconda3/envs/alerts/lib/python3.8/site-packages/transformers/models/t5/tokenization_t5_fast.py:155: FutureWarning: This tokenizer was incorrectly instantiated with a model max length of 512 which will be corrected in Transformers v5.\n",
421-
"For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.\n",
422-
"- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.\n",
423-
"- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.\n",
424-
"- To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your preferred value.\n",
425-
" warnings.warn(\n"
426-
]
427-
}
428-
],
424+
"outputs": [],
429425
"source": [
430426
"t5_qgen = QGen(\"t5-base\", \"cpu\")"
431427
]
432428
},
433429
{
434430
"cell_type": "code",
435-
"execution_count": 126,
431+
"execution_count": 15,
436432
"id": "45942810",
437433
"metadata": {},
438434
"outputs": [],
@@ -457,7 +453,7 @@
457453
},
458454
{
459455
"cell_type": "code",
460-
"execution_count": 127,
456+
"execution_count": 16,
461457
"id": "ab00e4fe",
462458
"metadata": {},
463459
"outputs": [],
@@ -522,12 +518,12 @@
522518
},
523519
{
524520
"cell_type": "code",
525-
"execution_count": 23,
521+
"execution_count": 17,
526522
"id": "b6d76ae2",
527523
"metadata": {},
528524
"outputs": [],
529525
"source": [
530-
"## import cross encoder"
526+
"from ragas.metrics.context_relevance import context_relavancy"
531527
]
532528
},
533529
{

src/ragas/metrics/answer_relevance.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
class QGen:
2525
def __init__(self, model_name: str, device: str) -> None:
2626
config = AutoConfig.from_pretrained(model_name)
27-
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
27+
self.tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
2828
if self.tokenizer.pad_token is None:
2929
self.tokenizer.pad_token = "[PAD]"
3030
architecture = np.intersect1d(

0 commit comments

Comments (0)