
Commit 1ef29f4

Merge pull request #250 from david-thrower/249-improve-text-generation-samples-in-cicd-scale-test
249 improve text generation samples in cicd scale test (Merge into an experimental branch).
2 parents c5d8644 + 462368e commit 1ef29f4

File tree

2 files changed (+120, -47 lines)


.github/workflows/automerge.yml

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@ name: Python application

 on:
   push:
-    branches: [ "main", "240-branch-to-diverge-cicd-scale-nlp-hpo-from-at-scale-study" ]
+    branches: [ "main", "249-improve-text-generation-samples-in-cicd-scale-test" ]

 permissions:
   contents: read

generative-proof-of-concept-CPU-preprocessing-in-memory.py

Lines changed: 119 additions & 46 deletions
@@ -79,6 +79,16 @@ def objective(trial: optuna.Trial) -> float:
 moities_to_try = 3 # ++ Accuracy, linear increase in computation time (Raise this before resorting to raising the next one)
 tries_per_moity = 1 # ++ Modest ++ Accuracy, quadratic increase in computation time

+## Generation time configurables: ##########
+
+GENERATION_PROMPT_LEN = 25
+MAX_NEW_TOKENS = 14
+RESULT_CUTOFF = 11 # Only print out verbose text samples when perplexity is < RESULT_CUTOFF
+
+if GENERATION_PROMPT_LEN + MAX_NEW_TOKENS > MAX_SEQ_LENGTH:
+    raise ValueError("Sequence length overflow: Generated text length (GENERATION_PROMPT_LEN + MAX_NEW_TOKENS) "
+                     "should be less than or equal to MAX_SEQ_LENGTH.")
+
 ##### HP Tuning Parameters: ######### (Parameters to be optimized by TPE or SOBOL)

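For readers skimming the diff, the guard added above simply enforces a token budget: the prompt length plus the number of newly generated tokens must fit within the model's context window. Below is a minimal standalone sketch of that arithmetic; the MAX_SEQ_LENGTH value of 64 is an illustrative placeholder, not the value configured in this script.

# Sketch of the token-budget check added in this hunk.
# MAX_SEQ_LENGTH = 64 is an assumed placeholder for illustration only.
MAX_SEQ_LENGTH = 64
GENERATION_PROMPT_LEN = 25
MAX_NEW_TOKENS = 14

# 25 prompt tokens + 14 new tokens = 39 <= 64, so this configuration fits the budget.
if GENERATION_PROMPT_LEN + MAX_NEW_TOKENS > MAX_SEQ_LENGTH:
    raise ValueError("GENERATION_PROMPT_LEN + MAX_NEW_TOKENS must be <= MAX_SEQ_LENGTH.")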
@@ -1020,8 +1030,9 @@ def call(self, inputs):

 # mlflow.keras.log_model(generator, artifact_path="generator")
 print("########### BEFORE SEARIALIZING THE GENERATIVE MODEL")
-
-def complete_text(text):
+
+# Utility function to generate text from greedy sampling:
+def complete_text_greedy(text: str, max_new_tokens: int = 10) -> str:
     input_ids = tokenizer(
         text,
         add_special_tokens=False
@@ -1030,62 +1041,124 @@ def complete_text(text):
     generated_tokens = generator.generate(
         token_ids=input_ids, # Just the actual tokens, no padding
         do_sample=False,
-        max_new_tokens=10
+        max_new_tokens=max_new_tokens
     )
     generated_text =\
         tokenizer.decode(generated_tokens).replace(text, "")
     return generated_text
-
-test_text = "I saw the sun and it was as"
-response = complete_text(test_text)
-
-print(f"I ask the generator: {test_text}... It responds:")
-print(response)
-
-counter = 0
-for sample in non_instruct_samples:
-
-
-    # Tokenize the text without padding first to get actual tokens
-    sample_tokenized = tokenizer(
-        sample,
+
+# Utility function to generate text from beam sampling:
+def complete_text_beam(text: str,
+                       max_new_tokens: int = 10,
+                       temperature: float = 0.75,
+                       top_k: int = 75,
+                       top_p: float = 0.98,
+                       repetition_penalty: float = None,
+                       presence_penalty: float = 1.3,
+                       frequency_penalty: float = 1.4) -> str:
+
+    input_ids = tokenizer(
+        text,
         add_special_tokens=False
     )['input_ids']
-    start_generate_index = int(np.ceil(len(sample_tokenized) * 0.5))
-    half_sample_tokenized = sample_tokenized[:start_generate_index]
-
-    # Convert to Python list of integers
-    if hasattr(half_sample_tokenized, 'numpy'):
-        token_ids = half_sample_tokenized.numpy().tolist()
-    else:
-        token_ids = [int(token_id) for token_id in half_sample_tokenized]
-
-    print(f"Actual token count: {len(token_ids)}")
-    print(f"First 10 tokens: {token_ids[:10]}")
-
-    # Now pass the list of integers to your generate method
+
     generated_tokens = generator.generate(
-        token_ids=token_ids, # Just the actual tokens, no padding
+        token_ids=input_ids, # Just the actual tokens, no padding
         do_sample=True,
-        max_new_tokens=20,
-        temperature=0.73,
-        # One set of recommendations
-        top_k=75,
-        top_p=0.97,
-        # Previous semi-working values
-        # top_k=40,
-        # top_p=0.985,
+        max_new_tokens=max_new_tokens,
+        temperature=temperature,
+        top_k=top_k,
+        top_p=top_p,
         # repetition_penalty=1.2,
-        presence_penalty=1.2,
-        frequency_penalty=1.4
+        presence_penalty=presence_penalty,
+        frequency_penalty=frequency_penalty
     )
+    generated_text =\
+        tokenizer.decode(generated_tokens).replace(text, "")
+    return generated_text
+
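As context for the sampling knobs that complete_text_beam exposes (temperature, top_k, top_p), the sketch below shows one common way such filtering is applied to next-token logits. It is a generic NumPy illustration with made-up inputs, not the generator.generate implementation used in this repository.

import numpy as np

def sample_next_token(logits, temperature=0.75, top_k=75, top_p=0.98, rng=None):
    """Generic temperature + top-k + top-p (nucleus) sampling over one logits vector."""
    rng = rng or np.random.default_rng()
    # Temperature scaling: lower values sharpen the distribution, higher values flatten it.
    scaled = np.asarray(logits, dtype=float) / max(temperature, 1e-8)
    probs = np.exp(scaled - scaled.max())
    probs /= probs.sum()
    # Top-k: zero out everything except the k most probable tokens.
    keep = np.argsort(probs)[-min(top_k, probs.size):]
    mask = np.zeros_like(probs, dtype=bool)
    mask[keep] = True
    probs = np.where(mask, probs, 0.0)
    # Top-p (nucleus): keep the smallest high-probability set whose mass reaches top_p.
    order = np.argsort(probs)[::-1]
    cumulative = np.cumsum(probs[order])
    cutoff = int(np.searchsorted(cumulative, top_p * probs.sum())) + 1
    final = np.zeros_like(probs)
    final[order[:cutoff]] = probs[order[:cutoff]]
    final /= final.sum()
    return int(rng.choice(probs.size, p=final))

# Illustrative call with a fake 10-token vocabulary:
fake_logits = [2.0, 1.5, 0.3, -1.0, 0.0, 0.2, -0.5, 1.1, 0.8, -2.0]
print(sample_next_token(fake_logits, temperature=0.75, top_k=5, top_p=0.9))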
+test_text = "I saw the sun and it was as shining on the"
+response = complete_text_greedy(test_text)
+print(f"I ask the generator (greedy): {test_text}... It responds: '{response}'.")
+response = complete_text_beam(test_text)
+print(f"I ask the generator (Beam defaults - max_new_tokens: 10, temperature: 0.75, top_k: 75, top_p: 0.98, repetition_penalty: None, presence_penalty: 1.3, frequency_penalty: 1.4): {test_text}... It responds: '{response}'.")
+
+def test_text(test_prompt: str, max_new_tokens: int, sample_number: int, result: float, result_cutoff) -> None:
+    """
+    If result < result_cutoff, run a matrix of different sampling values and print the resulting text for human subjective evaluation.
+
+    Parameters:
+    - test_prompt: a string to prompt generation
+    - max_new_tokens: int, number of tokens to generate unless a stop token is generated.
+    - sample_number: Metadata for sample...
+    - result: Perplexity score from this run
+    - result_cutoff: Perplexity score below which a trial is considered worth running this on
+
+    """
+    if result < result_cutoff:
+        response1 = response = complete_text_greedy(text=test_prompt, max_new_tokens=max_new_tokens)
+        print(f"Sample {sample_number}: I ask the generator (greedy): {test_prompt}... It responds: '{response1}'.")
+        response_2 = complete_text_beam(text=test_prompt, max_new_tokens=max_new_tokens)
+        print(f"Sample {sample_number}: I ask the generator (Beam defaults - max_new_tokens: 10, temperature: 0.75, top_k: 75, top_p: 0.98, repetition_penalty: None, presence_penalty: 1.3, frequency_penalty: 1.4): {test_prompt}... It responds: '{response_2}'.")
+        response_3 = complete_text_beam(text=test_prompt, max_new_tokens=max_new_tokens, temperature=0.6, top_k=75, top_p=0.98, repetition_penalty=None, presence_penalty=1.3, frequency_penalty=1.4)
+        print(f"Sample {sample_number}: I ask the generator (Beam: - max_new_tokens: 10, temperature=0.6, top_k=75, top_p=0.98, repetition_penalty=None, presence_penalty=1.3, frequency_penalty=1.4): {test_prompt}... It responds: '{response_3}'.")
+        response_4 = complete_text_beam(text=test_prompt, max_new_tokens=max_new_tokens, temperature=0.7, top_k=75, top_p=0.98, repetition_penalty=None, presence_penalty=1.3, frequency_penalty=1.4)
+        print(f"Sample {sample_number}: I ask the generator (Beam: - max_new_tokens: 10, temperature=0.7, top_k=75, top_p=0.98, repetition_penalty=None, presence_penalty=1.3, frequency_penalty=1.4): {test_prompt}... It responds: '{response_4}'.")
+        response_5 = complete_text_beam(text=test_prompt, max_new_tokens=max_new_tokens, temperature=0.7, top_k=75, top_p=0.97, repetition_penalty=None, presence_penalty=1.3, frequency_penalty=1.4)
+        print(f"Sample {sample_number}: I ask the generator (Beam: - max_new_tokens: 10, temperature=0.7, top_k=75, top_p=0.97, repetition_penalty=None, presence_penalty=1.3, frequency_penalty=1.4): {test_prompt}... It responds: '{response_5}'.")
+
+
+prompt_samples = [
+    "In the beginning God created the ",
+    "And the earth was without form, and",
+    "And God said, Let there be light: and there ",
+    "And God said, Let the waters under the heaven be gathered"]
+
+
+counter = 0
+for sample in prompt_samples:
+    test_text(test_prompt=sample, max_new_tokens=MAX_NEW_TOKENS, sample_number=counter, result=result, result_cutoff=RESULT_CUTOFF)
+
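The five near-identical complete_text_beam calls in test_text above amount to a small grid search over sampling settings, gated on the trial's perplexity. The sketch below expresses that same evaluation matrix as a data-driven loop; evaluate_prompt is a hypothetical name, the sketch assumes the complete_text_greedy and complete_text_beam helpers defined in this diff, and it is an illustrative refactoring rather than part of the commit.

# Sketch: the same perplexity-gated evaluation matrix as a loop over a settings grid.
# Assumes complete_text_greedy and complete_text_beam (defined above) are in scope.
BEAM_SETTINGS = [
    {},  # beam defaults: temperature=0.75, top_k=75, top_p=0.98
    {"temperature": 0.6, "top_p": 0.98},
    {"temperature": 0.7, "top_p": 0.98},
    {"temperature": 0.7, "top_p": 0.97},
]

def evaluate_prompt(test_prompt, max_new_tokens, sample_number, result, result_cutoff):
    if result >= result_cutoff:
        return  # Only print verbose samples for promising (low-perplexity) trials.
    greedy = complete_text_greedy(text=test_prompt, max_new_tokens=max_new_tokens)
    print(f"Sample {sample_number} (greedy): {test_prompt} -> '{greedy}'")
    for settings in BEAM_SETTINGS:
        completion = complete_text_beam(text=test_prompt, max_new_tokens=max_new_tokens, **settings)
        print(f"Sample {sample_number} (beam {settings or 'defaults'}): {test_prompt} -> '{completion}'")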
+    # # Tokenize the text without padding first to get actual tokens
+    # sample_tokenized = tokenizer(
+    #     sample,
+    #     add_special_tokens=False
+    # )['input_ids']
+    # start_generate_index = int(np.ceil(len(sample_tokenized) * 0.5))
+    # half_sample_tokenized = sample_tokenized[:start_generate_index]
+
+    # # Convert to Python list of integers
+    # if hasattr(half_sample_tokenized, 'numpy'):
+    #     token_ids = half_sample_tokenized.numpy().tolist()
+    # else:
+    #     token_ids = [int(token_id) for token_id in half_sample_tokenized]
+
+    # print(f"Actual token count: {len(token_ids)}")
+    # print(f"First 10 tokens: {token_ids[:10]}")
+
+    # # Now pass the list of integers to your generate method
+    # generated_tokens = generator.generate(
+    #     token_ids=token_ids, # Just the actual tokens, no padding
+    #     do_sample=True,
+    #     max_new_tokens=20,
+    #     temperature=0.73,
+    #     # One set of recommendations
+    #     top_k=75,
+    #     top_p=0.97,
+    #     # Previous semi-working values
+    #     # top_k=40,
+    #     # top_p=0.985,
+    #     # repetition_penalty=1.2,
+    #     presence_penalty=1.2,
+    #     frequency_penalty=1.4
+    # )

-    # Decode the result
-    half_sample = tokenizer.decode(half_sample_tokenized)
-    full_generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=False)\
-        .replace(half_sample, "")
+    # # Decode the result
+    # half_sample = tokenizer.decode(half_sample_tokenized)
+    # full_generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=False)\
+    #     .replace(half_sample, "")

-    print(f"PROMPT number {counter}: {half_sample}; RESPONSE: {full_generated_text}")
+    # print(f"PROMPT number {counter}: {half_sample}; RESPONSE: {full_generated_text}")
     counter += 1
 mlflow.log_metric("perplexity", result, step=trial.number)
 del(best_model_found)
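One detail worth noting: GENERATION_PROMPT_LEN from the configurables hunk is only consumed by the length guard in the hunks shown here. The sketch below shows one plausible way it could be used, trimming a prompt to that many tokens before generating so the budget the guard enforces actually holds. This is a hypothetical usage sketch, not code from the commit; it assumes the script's tokenizer and the complete_text_greedy helper defined above.

# Hypothetical helper (not in the commit): trim a prompt to GENERATION_PROMPT_LEN tokens
# so that prompt + MAX_NEW_TOKENS stays within the MAX_SEQ_LENGTH budget checked earlier.
def generate_with_budget(prompt: str) -> str:
    token_ids = tokenizer(prompt, add_special_tokens=False)['input_ids']
    trimmed_ids = token_ids[:GENERATION_PROMPT_LEN]
    trimmed_prompt = tokenizer.decode(trimmed_ids)
    return complete_text_greedy(trimmed_prompt, max_new_tokens=MAX_NEW_TOKENS)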
