FasterDecoding
diff --git a/‎notebook/example.ipynb‎ ‎notebooks/example.ipynb‎notebook/example.ipynb renamed to notebooks/example.ipynb b/‎notebook/example.ipynb‎ ‎notebooks/example.ipynb‎notebook/example.ipynb renamed to notebooks/example.ipynb
diff --git a/‎notebook/snapkv.txt‎ ‎notebooks/snapkv.txt‎notebook/snapkv.txt renamed to notebooks/snapkv.txt b/‎notebook/snapkv.txt‎ ‎notebooks/snapkv.txt‎notebook/snapkv.txt renamed to notebooks/snapkv.txt
diff --git a/‎notebooks/test_snapkv.ipynb‎
Lines changed: 259 additions & 0 deletions b/‎notebooks/test_snapkv.ipynb‎
Lines changed: 259 additions & 0 deletions
@@ -0,0 +1,259 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/tianle/miniconda3/envs/code_attn/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "\n",
+    "# Environment defaults, set before the torch/transformers imports below so those\n",
+    "# libraries see the values when they load. setdefault() keeps any value that is\n",
+    "# already exported by the shell, so the notebook can be pointed at other GPUs or\n",
+    "# cache directories without editing this cell.\n",
+    "# NOTE(review): the fallback GPU ids and /work/tianle paths are specific to the\n",
+    "# original author's machine -- export your own values before starting Jupyter.\n",
+    "os.environ.setdefault(\"CUDA_VISIBLE_DEVICES\", \"5,6\")\n",
+    "os.environ.setdefault(\"HF_DATASETS_CACHE\", \"/work/tianle/huggingface/datasets\")\n",
+    "os.environ.setdefault(\"HF_HOME\", \"/work/tianle/huggingface\")\n",
+    "\n",
+    "import torch\n",
+    "from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig\n",
+    "import transformers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from snapkv.monkeypatch.monkeypatch import replace_llama, replace_mistral, replace_mixtral"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/work/data/tianle/share/SnapKV/snapkv/monkeypatch/monkeypatch.py:50: UserWarning: Transformers version 4.36.2 might not be compatible with SnapKV. SnapKV is tested with Transformers version ['4.37'].\n",
+      "  warnings.warn(f\"Transformers version {transformers_version} might not be compatible with SnapKV. SnapKV is tested with Transformers version {version_list}.\")\n"
+     ]
+    }
+   ],
+   "source": [
+    "replace_mixtral()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from fastchat.model import load_model, get_conversation_template\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Transformers version: 4.36.2\n"
+     ]
+    }
+   ],
+   "source": [
+    "from importlib.metadata import version\n",
+    "\n",
+    "# Print the installed transformers release. Any failure during the lookup is\n",
+    "# printed instead of raised, so this cell never stops a Run All.\n",
+    "try:\n",
+    "    transformers_version = version(\"transformers\")\n",
+    "    print(f\"Transformers version: {transformers_version}\")\n",
+    "except Exception as err:\n",
+    "    print(f\"Error: {err}\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation=\"flash_attention_2\"` instead.\n",
+      "Loading checkpoint shards: 100%|██████████| 19/19 [01:10<00:00,  3.71s/it]\n"
+     ]
+    }
+   ],
+   "source": [
+    "model = AutoModelForCausalLM.from_pretrained(\n",
+    "    \"mistralai/Mixtral-8x7B-Instruct-v0.1\",\n",
+    "    torch_dtype=torch.bfloat16,\n",
+    "    low_cpu_mem_usage=True,\n",
+    "    device_map=\"auto\",\n",
+    "    use_flash_attention_2=True\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokenizer = AutoTokenizer.from_pretrained(\"mistralai/Mixtral-8x7B-Instruct-v0.1\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the long context document used as the prompt body. The path is relative\n",
+    "# to the kernel's working directory. The encoding is given explicitly so the\n",
+    "# read does not depend on the platform's default text encoding.\n",
+    "with open('snapkv.txt', 'r', encoding='utf-8') as f:\n",
+    "    content = f.read().strip()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "question = \"\\n What is the repository of SnapKV?\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Build a fresh single-turn conversation: the document plus the question goes in\n",
+    "# under roles[0], then a message with no content is added under roles[1].\n",
+    "# NOTE(review): this uses FastChat's \"longchat\" template although the model in\n",
+    "# this notebook is Mixtral-8x7B-Instruct -- confirm the template is intended.\n",
+    "conv = get_conversation_template(\"longchat\")\n",
+    "conv.messages = []\n",
+    "user_role = conv.roles[0]\n",
+    "assistant_role = conv.roles[1]\n",
+    "conv.append_message(user_role, content + question)\n",
+    "conv.append_message(assistant_role, None)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prompt = conv.get_prompt()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "input_ids = tokenizer.encode(prompt, return_tensors='pt')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "input_ids_len = input_ids.size(1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
+      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
+     ]
+    }
+   ],
+   "source": [
+    "outputs = model.generate(input_ids.cuda(), max_new_tokens=200, do_sample=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The repository of SnapKV is available at <https://github.com/FasterDecoding/SnapKV>.\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(tokenizer.decode(outputs[0][input_ids_len:], skip_special_tokens=True))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "code_attn",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}