-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsft_generate.py
More file actions
86 lines (72 loc) · 3.65 KB
/
sft_generate.py
File metadata and controls
86 lines (72 loc) · 3.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import config
import pickle
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
import time
# Time the full generation run; the elapsed seconds end up in the CSV filename.
s = time.time()

# Load the merged SFT checkpoint. The generation loop below moves its inputs
# to CUDA, so the model must live on the same device or generate() raises a
# device-mismatch RuntimeError. (FIX: the original left the model on CPU.)
model = AutoModelForCausalLM.from_pretrained(f"{config.USER}/{config.SFT_MERGED_MODEL_NAME}")
model.to("cuda")
model.eval()  # inference only — disables dropout / training-mode layers

tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token  # model has no dedicated pad token; reuse EOS
# Llama tokenizers do not append EOS automatically; other families should not double-append.
tokenizer.add_eos_token = "llama" in config.MODEL_NAME.lower()

generation_config = GenerationConfig(
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
    min_new_tokens=1,  # NEW tokens generated (doesn't include prompt)
    max_new_tokens=125,  # NEW tokens generated (doesn't include prompt)
    # NOTE(review): HF documents contrastive search (penalty_alpha + top_k) as a
    # deterministic strategy selected when do_sample=False; with do_sample=True
    # this config runs top-k sampling instead — confirm which strategy is intended.
    do_sample=True,  # sample across tokens; if false, model uses greedy decoding
    temperature=1.0,  # how randomly model samples from list of available tokens
    # top_p=0.30,  # list of tokens (top X %) which model can sample from
    # num_beams=1,  # beam search; number of beams (X more runs through model)
    penalty_alpha=0.6,  # contrastive search param alpha https://huggingface.co/blog/introducing-csearch#5-contrastive-search
    top_k=4,  # contrastive search param `k` https://huggingface.co/blog/introducing-csearch#5-contrastive-search
    # repetition_penalty=1.0,  # 1=no penalty; penalty at >1
    # length_penalty=1.0,  # > 0.0 promotes longer sequences; < 0.0 encourages shorter sequences.
    # exponential_decay_length_penalty=(40, 1.1),  # increase likelihood of eos_token probability; increase starts at X generated tokens (X,_) and the penalty increases at Y (_,Y)
    num_return_sequences=2,  # how many times to perform independent generate from the prompt (default is 1)
    # eta_cutoff=8e-4,
    renormalize_logits=True,  # recommended by HF: https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig.renormalize_logits
    output_attentions=False,  # return attention tensors after generation
    output_hidden_states=False,  # return all hidden states of all layers
    output_scores=False,  # return the prediction scores
    return_dict_in_generate=False,  # return output as ModelOuput object instead of tuple
    use_cache=False,  # whether to re-use last passed key/value attentions
)
# Friends-style conversation openers used to spot-check the fine-tuned model.
# One prompt per line; split into the list the generation loop iterates over.
user_inputs = """\
So, how was the date?
Where are you going?
Do you like her?
How did the audition go?
So I bumped into her again yesterday, what should I do?
Hey Joey!
When are we playing foosball next?
Should I take the job?
Where you going?
What's up?
Do you think she likes me or you more?
Come on, tell me the truth.
Where's your favorite pizza place?
Come on, give me a slice of pizza!
Joey, are you okay?
What's the problem?
What does your ideal date look like?
Do you know her?
Tell me when your next audition is!
Whom do you live with?
Where are you from?
""".splitlines()
# Render each user line into the project's prompt template (empty response slot,
# so the model fills it in), then generate and collect every decoded output.
prompts = [config.TEMPLATE.format(prompt=text, response="") for text in user_inputs]

results = []
for prompt in prompts:
    batch = tokenizer(prompt, return_tensors="pt", padding=True).to("cuda")
    output_ids = model.generate(**batch, generation_config=generation_config)
    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    print("\n\n")
    print(decoded)
    results.append(decoded)

runtime = time.time() - s

# Persist the raw python objects ("wb": pickle requires binary mode).
with open("results", "wb") as f:
    pickle.dump(results, f)

# Also write a CSV copy; 'Θ' separator presumably avoids clashing with commas
# inside the generated text — confirm downstream readers use the same sep.
savepath = f"{config.SINGLE_MODEL_DIR}/generated_responses__{runtime}.csv"
pd.DataFrame(data={"results": results}).to_csv(savepath, index=False, sep="Θ")
print(f"Generated results saved to: {savepath}")