-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsft_generate.py
More file actions
86 lines (72 loc) · 3.65 KB
/
sft_generate.py
File metadata and controls
86 lines (72 loc) · 3.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import config
import pickle
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
import time
# Time the full generation run; the elapsed seconds end up in the CSV filename.
s = time.time()

# Load the merged SFT checkpoint. The generation loop below moves its inputs
# to CUDA, so the model must live on the same device or generate() raises a
# device-mismatch RuntimeError. (FIX: the original left the model on CPU.)
model = AutoModelForCausalLM.from_pretrained(f"{config.USER}/{config.SFT_MERGED_MODEL_NAME}")
model.to("cuda")
model.eval()  # inference only — disables dropout / training-mode layers

tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token  # model has no dedicated pad token; reuse EOS
# Llama tokenizers do not append EOS automatically; other families should not double-append.
tokenizer.add_eos_token = "llama" in config.MODEL_NAME.lower()

generation_config = GenerationConfig(
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
    min_new_tokens=1,  # NEW tokens generated (doesn't include prompt)
    max_new_tokens=125,  # NEW tokens generated (doesn't include prompt)
    # NOTE(review): HF documents contrastive search (penalty_alpha + top_k) as a
    # deterministic strategy selected when do_sample=False; with do_sample=True
    # this config runs top-k sampling instead — confirm which strategy is intended.
    do_sample=True,  # sample across tokens; if false, model uses greedy decoding
    temperature=1.0,  # how randomly model samples from list of available tokens
    # top_p=0.30,  # list of tokens (top X %) which model can sample from
    # num_beams=1,  # beam search; number of beams (X more runs through model)
    penalty_alpha=0.6,  # contrastive search param alpha https://huggingface.co/blog/introducing-csearch#5-contrastive-search
    top_k=4,  # contrastive search param `k` https://huggingface.co/blog/introducing-csearch#5-contrastive-search
    # repetition_penalty=1.0,  # 1=no penalty; penalty at >1
    # length_penalty=1.0,  # > 0.0 promotes longer sequences; < 0.0 encourages shorter sequences.
    # exponential_decay_length_penalty=(40, 1.1),  # increase likelihood of eos_token probability; increase starts at X generated tokens (X,_) and the penalty increases at Y (_,Y)
    num_return_sequences=2,  # how many times to perform independent generate from the prompt (default is 1)
    # eta_cutoff=8e-4,
    renormalize_logits=True,  # recommended by HF: https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig.renormalize_logits
    output_attentions=False,  # return attention tensors after generation
    output_hidden_states=False,  # return all hidden states of all layers
    output_scores=False,  # return the prediction scores
    return_dict_in_generate=False,  # return output as ModelOuput object instead of tuple
    use_cache=False,  # whether to re-use last passed key/value attentions
)
# Friends-style conversation openers used to spot-check the fine-tuned model.
# One prompt per line; split into the list the generation loop iterates over.
user_inputs = """\
So, how was the date?
Where are you going?
Do you like her?
How did the audition go?
So I bumped into her again yesterday, what should I do?
Hey Joey!
When are we playing foosball next?
Should I take the job?
Where you going?
What's up?
Do you think she likes me or you more?
Come on, tell me the truth.
Where's your favorite pizza place?
Come on, give me a slice of pizza!
Joey, are you okay?
What's the problem?
What does your ideal date look like?
Do you know her?
Tell me when your next audition is!
Whom do you live with?
Where are you from?
""".splitlines()
# Render each user line into the project's prompt template (empty response slot,
# so the model fills it in), then generate and collect every decoded output.
prompts = [config.TEMPLATE.format(prompt=text, response="") for text in user_inputs]

results = []
for prompt in prompts:
    batch = tokenizer(prompt, return_tensors="pt", padding=True).to("cuda")
    output_ids = model.generate(**batch, generation_config=generation_config)
    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    print("\n\n")
    print(decoded)
    results.append(decoded)

runtime = time.time() - s

# Persist the raw python objects ("wb": pickle requires binary mode).
with open("results", "wb") as f:
    pickle.dump(results, f)

# Also write a CSV copy; 'Θ' separator presumably avoids clashing with commas
# inside the generated text — confirm downstream readers use the same sep.
savepath = f"{config.SINGLE_MODEL_DIR}/generated_responses__{runtime}.csv"
pd.DataFrame(data={"results": results}).to_csv(savepath, index=False, sep="Θ")
print(f"Generated results saved to: {savepath}")