|
11 | 11 | }, |
12 | 12 | { |
13 | 13 | "cell_type": "code", |
14 | | - "execution_count": null, |
| 14 | + "execution_count": 1, |
15 | 15 | "metadata": {}, |
16 | 16 | "outputs": [], |
17 | 17 | "source": [ |
|
42 | 42 | }, |
43 | 43 | { |
44 | 44 | "cell_type": "code", |
45 | | - "execution_count": null, |
| 45 | + "execution_count": 2, |
46 | 46 | "metadata": {}, |
47 | 47 | "outputs": [], |
48 | 48 | "source": [ |
|
151 | 151 | }, |
152 | 152 | { |
153 | 153 | "cell_type": "code", |
154 | | - "execution_count": null, |
| 154 | + "execution_count": 3, |
155 | 155 | "metadata": {}, |
156 | | - "outputs": [], |
| 156 | + "outputs": [ |
| 157 | + { |
| 158 | + "name": "stdout", |
| 159 | + "output_type": "stream", |
| 160 | + "text": [ |
| 161 | + "Overriding medusa_num_heads as: 4\n" |
| 162 | + ] |
| 163 | + }, |
| 164 | + { |
| 165 | + "data": { |
| 166 | + "application/vnd.jupyter.widget-view+json": { |
| 167 | + "model_id": "ef69040c760f4e4b949e27b2c09526d2", |
| 168 | + "version_major": 2, |
| 169 | + "version_minor": 0 |
| 170 | + }, |
| 171 | + "text/plain": [ |
| 172 | + "Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]" |
| 173 | + ] |
| 174 | + }, |
| 175 | + "metadata": {}, |
| 176 | + "output_type": "display_data" |
| 177 | + }, |
| 178 | + { |
| 179 | + "name": "stderr", |
| 180 | + "output_type": "stream", |
| 181 | + "text": [ |
| 182 | + "You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565\n" |
| 183 | + ] |
| 184 | + } |
| 185 | + ], |
157 | 186 | "source": [ |
158 | 187 | "model_name = 'FasterDecoding/medusa-vicuna-7b-v1.3'\n", |
159 | 188 | "model = MedusaModel.from_pretrained(\n", |
|
180 | 209 | }, |
181 | 210 | { |
182 | 211 | "cell_type": "code", |
183 | | - "execution_count": null, |
| 212 | + "execution_count": 4, |
184 | 213 | "metadata": {}, |
185 | 214 | "outputs": [], |
186 | 215 | "source": [ |
|
200 | 229 | }, |
201 | 230 | { |
202 | 231 | "cell_type": "code", |
203 | | - "execution_count": null, |
| 232 | + "execution_count": 5, |
204 | 233 | "metadata": {}, |
205 | 234 | "outputs": [], |
206 | 235 | "source": [ |
|
218 | 247 | }, |
219 | 248 | { |
220 | 249 | "cell_type": "code", |
221 | | - "execution_count": null, |
| 250 | + "execution_count": 9, |
222 | 251 | "metadata": {}, |
223 | | - "outputs": [], |
| 252 | + "outputs": [ |
| 253 | + { |
| 254 | + "name": "stdout", |
| 255 | + "output_type": "stream", |
| 256 | + "text": [ |
| 257 | + "Output length: 403\n", |
| 258 | + "Compression ratio: tensor(2.4724, device='cuda:0')\n" |
| 259 | + ] |
| 260 | + } |
| 261 | + ], |
224 | 262 | "source": [ |
225 | 263 | "with torch.inference_mode():\n", |
226 | 264 | " input_ids = tokenizer([prompt]).input_ids\n", |
|
249 | 287 | }, |
250 | 288 | { |
251 | 289 | "cell_type": "code", |
252 | | - "execution_count": null, |
| 290 | + "execution_count": 10, |
253 | 291 | "metadata": {}, |
254 | | - "outputs": [], |
| 292 | + "outputs": [ |
| 293 | + { |
| 294 | + "name": "stdout", |
| 295 | + "output_type": "stream", |
| 296 | + "text": [ |
| 297 | + "Once upon a time, in a small village nestled in the Andes mountains, there lived a charming llama named Luna. Luna was known for her kind heart and her love of coffee. She would often spend her afternoons sipping on a steaming cup of joe at the local café, chatting with the villagers and enjoying the warmth of the sun on her back.\n", |
| 298 | + "\n", |
| 299 | + "One day, as Luna was grazing on some fresh grass, she noticed that her hair was starting to grow longer and thicker. At first, she didn't think much of it, but as the days went on, her hair continued to grow and change. It became thick and wiry, with sharp spikes protruding from it.\n", |
| 300 | + "\n", |
| 301 | + "Luna was confused and a little scared by her new appearance. She had always been a gentle creature, and now she looked like a monster. She knew that she couldn't stay in the village anymore, so she set off on a journey to find a new home.\n", |
| 302 | + "\n", |
| 303 | + "As she wandered through the mountains, Luna stumbled upon a beautiful clearing. In the center of the clearing stood a small cottage, with a sign hanging outside that read \"Café Llama.\" Luna knew that this was where she belonged.\n", |
| 304 | + "\n", |
| 305 | + "She transformed the cottage into a cozy coffee shop, serving the best coffee in the mountains. The villagers were amazed by Luna's transformation, and they flocked to her café to taste her delicious brews.\n", |
| 306 | + "\n", |
| 307 | + "Luna's Medusa-like hair became her signature style, and she quickly became known as the most charming llama in the land. She spent her days sipping coffee, chatting with customers, and enjoying the warmth of the sun on her back. And she knew that she had finally found her true home.</s>\n" |
| 308 | + ] |
| 309 | + } |
| 310 | + ], |
255 | 311 | "source": [ |
256 | 312 | "output = tokenizer.decode(\n", |
257 | 313 | " output_ids,\n", |
|
275 | 331 | }, |
276 | 332 | { |
277 | 333 | "cell_type": "code", |
278 | | - "execution_count": null, |
| 334 | + "execution_count": 11, |
279 | 335 | "metadata": {}, |
280 | | - "outputs": [], |
| 336 | + "outputs": [ |
| 337 | + { |
| 338 | + "name": "stdout", |
| 339 | + "output_type": "stream", |
| 340 | + "text": [ |
| 341 | + "==================================================\n", |
| 342 | + "Wall time init: 0.026\n", |
| 343 | + "Wall time medusa: 0.031\n", |
| 344 | + "Wall time Tree: 3.732\n", |
| 345 | + "Wall time Posterior: 0.025\n", |
| 346 | + "Wall time Update: 0.051\n", |
| 347 | + "--------------------------------------------------\n", |
| 348 | + "Wall time portion medusa: 0.008\n", |
| 349 | + "Wall time portion Tree: 0.965\n", |
| 350 | + "Wall time portion Posterior: 0.007\n", |
| 351 | + "Wall time portion Update: 0.013\n", |
| 352 | + "--------------------------------------------------\n", |
| 353 | + "Tokens/second: 104.247\n", |
| 354 | + "==================================================\n" |
| 355 | + ] |
| 356 | + } |
| 357 | + ], |
281 | 358 | "source": [ |
282 | 359 | "max_length = 50\n", |
283 | 360 | "\n", |
|
307 | 384 | "print(format_string(\"Tokens/second: \", new_token / time_total, max_length))\n", |
308 | 385 | "print('='*max_length)" |
309 | 386 | ] |
310 | | - }, |
311 | | - { |
312 | | - "cell_type": "code", |
313 | | - "execution_count": null, |
314 | | - "metadata": {}, |
315 | | - "outputs": [], |
316 | | - "source": [] |
317 | 387 | } |
318 | 388 | ], |
319 | 389 | "metadata": { |
|
0 commit comments