
Commit 18e2df6

generate
1 parent dc74137 commit 18e2df6

File tree

2 files changed: +139 -0 lines changed


_doc/conf.py

Lines changed: 1 addition & 0 deletions
@@ -277,6 +277,7 @@ def linkcode_resolve(domain, info):
 epkg_dictionary.update(
     {
         "arnir0/Tiny-LLM": "https://huggingface.co/arnir0/Tiny-LLM",
+        "microsoft/Phi-1.5": "https://huggingface.co/microsoft/phi-1_5",
         "microsoft/phi-2": "https://huggingface.co/microsoft/phi-2",
         "microsoft/Phi-3.5-mini-instruct": "https://huggingface.co/microsoft/Phi-3.5-mini-instruct",
         "microsoft/Phi-3.5-vision-instruct": "https://huggingface.co/microsoft/Phi-3.5-vision-instruct",

_doc/technical/plot_generate.py

Lines changed: 138 additions & 0 deletions
@@ -0,0 +1,138 @@
"""
.. _l-plot-generate:

==================================
From an LLM to processing a prompt
==================================

Method ``generate`` produces the model answer for a given prompt.
Let's implement our own to understand better how it works.

Example with Phi 1.5
====================

:epkg:`microsoft/Phi-1.5` is a small LLM. The example below runs it
on a short prompt.
"""

import time
import pandas
from tqdm import tqdm
from onnx_diagnostic.ext_test_case import unit_test_going
from onnx_diagnostic.helpers import string_type
from onnx_diagnostic.torch_models.hghub import get_untrained_model_with_inputs
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
data = []

print("-- load the model...")
# unit_test_going() returns True if UNITTEST_GOING is 1
if unit_test_going():
    model_id = "arnir0/Tiny-LLM"
    model = get_untrained_model_with_inputs(model_id)["model"]
    tokenizer = AutoTokenizer.from_pretrained(model_id)
else:
    model_id = "microsoft/phi-1_5"
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
model = model.to(device)
print("-- done.")

print("-- tokenize the prompt...")
inputs = tokenizer(
    '''def print_prime(n):
    """
    Print all primes between 1 and n
    """''',
    return_tensors="pt",
    return_attention_mask=False,
).to(device)
print("-- done.")

print("-- compute the answer...")
begin = time.perf_counter()
outputs = model.generate(**inputs, max_length=100)
duration = time.perf_counter() - begin
print(f"-- done in {duration}")
data.append(dict(name="generate", duration=duration))
print("output shape:", string_type(outputs, with_shape=True))
print("-- decode the answer...")
text = tokenizer.batch_decode(outputs)[0]
print("-- done.")
print(text)

# %%
# eos_token_id?
# =============
#
# This token marks the end of the answer.

print("eos_token_id=", tokenizer.eos_token_id)

# %%
# Custom method generate
# ======================


def simple_generate_with_cache(
    model, input_ids: torch.Tensor, eos_token_id: int, max_new_tokens: int = 100
):
    answer = []
    # First call.
    outputs = model(input_ids, use_cache=True)
    next_token_logits = outputs.logits[:, -1, :]
    past_key_values = outputs.past_key_values

    # Next calls.
    for _ in tqdm(list(range(max_new_tokens))):
        # The most probable next token is chosen.
        next_token_id = torch.argmax(next_token_logits, dim=-1, keepdim=True)
        # But we could sample it from a multinomial distribution instead:
        # <<< probs = torch.softmax(next_token_logits / temperature, dim=-1)
        # <<< top_probs, top_indices = torch.topk(probs, top_k)
        # <<< next_token_id = top_indices.gather(-1, torch.multinomial(top_probs, 1))

        # Let's add the predicted token to the answer.
        answer.append(next_token_id)

        # Feed only the new token, but with the cache.
        outputs = model(next_token_id, use_cache=True, past_key_values=past_key_values)
        next_token_logits = outputs.logits[:, -1, :]
        past_key_values = outputs.past_key_values

        input_ids = torch.cat([input_ids, next_token_id], dim=-1)

        if next_token_id.item() == eos_token_id:
            break

    return torch.cat(answer, dim=1)

print("-- compute the answer with custom generate...")
begin = time.perf_counter()
outputs = simple_generate_with_cache(
    model, inputs.input_ids, eos_token_id=tokenizer.eos_token_id, max_new_tokens=100
)
duration = time.perf_counter() - begin
print(f"-- done in {duration}")
data.append(dict(name="custom", duration=duration))

print("-- done.")
print("output shape:", string_type(outputs, with_shape=True))
print("-- decode the answer...")
text = tokenizer.batch_decode(outputs)[0]
print("-- done.")
print(text)

# %%
# Plots
# =====
df = pandas.DataFrame(data).set_index("name")
print(df)

# %%
ax = df.plot(kind="bar", title="Time (s) to answer a prompt: generate vs custom", rot=45)
ax.figure.tight_layout()
ax.figure.savefig("plot_generate.png")
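The commented lines inside ``simple_generate_with_cache`` hint at replacing the greedy ``torch.argmax`` selection with sampling. Below is a minimal sketch of that idea, assuming illustrative ``temperature`` and ``top_k`` values; the helper ``sample_next_token`` is not part of the commit.

import torch


def sample_next_token(
    next_token_logits: torch.Tensor, temperature: float = 0.8, top_k: int = 50
) -> torch.Tensor:
    # Scale the logits, keep the top_k most probable tokens,
    # then draw one of them proportionally to its probability.
    probs = torch.softmax(next_token_logits / temperature, dim=-1)
    top_probs, top_indices = torch.topk(probs, top_k)
    choice = torch.multinomial(top_probs, num_samples=1)
    return top_indices.gather(-1, choice)

Inside the loop, ``next_token_id = sample_next_token(next_token_logits)`` would replace the ``torch.argmax`` call; the built-in ``generate`` exposes the same behaviour through ``do_sample=True``, ``temperature`` and ``top_k``.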
