|
19 | 19 | }, |
20 | 20 | { |
21 | 21 | "cell_type": "code", |
22 | | - "execution_count": 2, |
| 22 | + "execution_count": null, |
23 | 23 | "metadata": {}, |
24 | 24 | "outputs": [], |
25 | 25 | "source": [ |
|
35 | 35 | "import numpy as np\n", |
36 | 36 | "import torch\n", |
37 | 37 | "from transformers import AutoModelForCausalLM, pipeline\n", |
38 | | - "from transformers import QuantizedCacheConfig, QuantoQuantizedCache, DynamicCache, QuantizedCache\n", |
| 38 | + "from transformers import DynamicCache, QuantizedCache\n", |
39 | 39 | "from transformers.utils.logging import disable_progress_bar\n", |
40 | 40 | "import transformers\n", |
41 | 41 | "\n", |
|
65 | 65 | }, |
66 | 66 | { |
67 | 67 | "cell_type": "code", |
68 | | - "execution_count": 5, |
| 68 | + "execution_count": null, |
69 | 69 | "metadata": {}, |
70 | 70 | "outputs": [], |
71 | 71 | "source": [ |
72 | 72 | "def get_size_of_cache(cache):\n", |
73 | | - " if isinstance(cache, QuantoQuantizedCache):\n", |
| 73 | + " if isinstance(cache, QuantizedCache):\n", |
74 | 74 | " # We cannot use x.element_size() * x.nelement() as below to calculate the size of the cache, \n", |
75 | 75 | " # as cache._quantized_value_cache[0].element_size() triggers a call of __torch_dispatch__,\n", |
76 | 76 | " # which, in turn, unpacks the internally packed tensor; and thus does not report the correct internal storage size.\n", |
77 | 77 | " # See also https://github.com/huggingface/optimum-quanto/blob/main/optimum/quanto/tensor/packed.py#L144\n", |
78 | 78 | "\n", |
79 | | - " # As QuantoQuantizedCache stores values, as well as shift and scale, \n", |
80 | | - " # we temporarily save the cache to disc and getthe size of the saved object\n", |
| 79 | + " # As QuantizedCache stores values, as well as shift and scale, \n", |
| 80 | + " # we temporarily save the cache to disc and get the size of the saved object\n", |
81 | 81 | " temp_file = \"tmp.pickle\"\n", |
82 | 82 | " with open(temp_file, \"wb\") as f:\n", |
83 | 83 | " pickle.dump(cache, f)\n", |
|
100 | 100 | }, |
101 | 101 | { |
102 | 102 | "cell_type": "code", |
103 | | - "execution_count": 6, |
| 103 | + "execution_count": null, |
104 | 104 | "metadata": {}, |
105 | 105 | "outputs": [], |
106 | 106 | "source": [ |
|
125 | 125 | " if cache_implementation == \"dynamic\":\n", |
126 | 126 | " cache = DynamicCache()\n", |
127 | 127 | " elif cache_implementation == \"quantized\":\n", |
128 | | - " cache = QuantoQuantizedCache(config=model.config, nbits=4)\n", |
| 128 | + " cache = QuantizedCache(backend=\"quanto\", config=model.config, nbits=4)\n", |
129 | 129 | " else:\n", |
130 | 130 | " raise NotImplementedError(f\"Cache {cache_implementation} not yet implemented\")\n", |
131 | 131 | "\n", |
|
0 commit comments