Skip to content

Commit c106c67

Browse files
committed
Option to omit stop sequence from text output
Introduces an optional bool include_stop_sequence field in the stopping criteria section of the generate request API, along with a new global configuration option that controls the default behaviour for requests that don't specify it. For now that default is to include the stop sequence text, for backwards compatibility. Note that this default is applied in the launcher; at the router level, the default is to omit the stop sequence. There is also a new stop_sequence field in the generate API response message, which contains the matched stop sequence iff stop_reason == STOP_SEQUENCE. In addition, when a matched stop sequence is included in the output, the text is now trimmed to the end of the stop sequence rather than potentially including additional characters from the last generated token.
1 parent 79d38a1 commit c106c67

File tree

11 files changed

+473
-100
lines changed

11 files changed

+473
-100
lines changed

integration_tests/test_cases_bloom560m.yaml

Lines changed: 84 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -844,8 +844,9 @@
844844
- generatedTokenCount: 17
845845
inputTokenCount: 45
846846
stopReason: STOP_SEQUENCE
847+
stopSequence: "🤖 \nTran"
847848
text: "Convert movie titles into emoji.\n\nBack to the Future:
848-
👨👴🚗🕒 \nBatman: 🤵🦇 \nTransformers: 🚗🤖 \nStar Wars: 🤗🤖 \nBatman: 🤗🤖 \nTransform"
849+
👨👴🚗🕒 \nBatman: 🤵🦇 \nTransformers: 🚗🤖 \nStar Wars: 🤗🤖 \nBatman: 🤗🤖 \nTran"
849850

850851

851852
# Single token input
@@ -1087,6 +1088,7 @@
10871088
- generatedTokenCount: 14
10881089
inputTokenCount: 6
10891090
stopReason: STOP_SEQUENCE
1091+
stopSequence: confused
10901092
text: The first time I saw the movie, I was a little bit confused
10911093

10921094
# Stop sequence
@@ -1104,9 +1106,90 @@
11041106
- generatedTokenCount: 10
11051107
inputTokenCount: 6
11061108
stopReason: STOP_SEQUENCE
1109+
stopSequence: I was
11071110
text: The first time I saw the movie, I was
11081111

11091112

1113+
# Stop sequence, omitted
1114+
- name: Stop sequence omitted
1115+
request:
1116+
params:
1117+
stopping:
1118+
maxNewTokens: 20
1119+
includeStopSequence: false
1120+
stopSequences:
1121+
- " movie"
1122+
requests:
1123+
- {"text": "A very long story:\n"}
1124+
response:
1125+
responses:
1126+
- generatedTokenCount: 7
1127+
inputTokenCount: 6
1128+
stopReason: STOP_SEQUENCE
1129+
stopSequence: " movie"
1130+
text: 'The first time I saw the'
1131+
1132+
# Stop sequence partial token
1133+
- name: Stop sequence partial token
1134+
request:
1135+
params:
1136+
stopping:
1137+
maxNewTokens: 20
1138+
includeStopSequence: true
1139+
stopSequences:
1140+
- "confu"
1141+
requests:
1142+
- {"text": "A very long story:\n"}
1143+
response:
1144+
responses:
1145+
- generatedTokenCount: 14
1146+
inputTokenCount: 6
1147+
stopReason: STOP_SEQUENCE
1148+
stopSequence: "confu"
1149+
text: "The first time I saw the movie, I was a little bit confu"
1150+
1151+
1152+
# Long stop sequence, omitted
1153+
- name: Long stop sequence omitted
1154+
request:
1155+
params:
1156+
stopping:
1157+
maxNewTokens: 80
1158+
includeStopSequence: false
1159+
stopSequences:
1160+
- "w the movie, I was a little bit confused. I "
1161+
requests:
1162+
- {"text": "A very long story:\n"}
1163+
response:
1164+
responses:
1165+
- generatedTokenCount: 17
1166+
inputTokenCount: 6
1167+
stopReason: STOP_SEQUENCE
1168+
stopSequence: 'w the movie, I was a little bit confused. I '
1169+
text: The first time I sa
1170+
1171+
1172+
# Long stop sequence partial token
1173+
- name: Long stop sequence partial token
1174+
request:
1175+
params:
1176+
stopping:
1177+
maxNewTokens: 80
1178+
includeStopSequence: true
1179+
stopSequences:
1180+
- "to exp"
1181+
requests:
1182+
- {"text": "A very long story:\n"}
1183+
response:
1184+
responses:
1185+
- generatedTokenCount: 22
1186+
inputTokenCount: 6
1187+
stopReason: STOP_SEQUENCE
1188+
stopSequence: to exp
1189+
text: "The first time I saw the movie, I was a little bit confused. I wasn\u2019\
1190+
t sure what to exp"
1191+
1192+
11101193
# Repetition penalty
11111194
- name: Repetition penalty
11121195
request:

integration_tests/test_cases_mt0small.yaml

Lines changed: 105 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -681,6 +681,7 @@
681681
- generatedTokenCount: 6
682682
inputTokenCount: 33
683683
stopReason: STOP_SEQUENCE
684+
stopSequence: "🎭"
684685
text: "Convert movie titles into emoji.\n\nBack to the Future:
685686
👨👴🚗🕒 \nBatman: 🤵🦇 \nTransformers: 🚗🤖 \nStar Wars:\n\nTransformers: 🎬🎭"
686687

@@ -897,7 +898,8 @@
897898
- generatedTokenCount: 8
898899
inputTokenCount: 8
899900
stopReason: STOP_SEQUENCE
900-
text: 'The very long story is written'
901+
stopSequence: written
902+
text: The very long story is written
901903

902904
- name: Stop sequence 2
903905
request:
@@ -914,8 +916,88 @@
914916
- generatedTokenCount: 16
915917
inputTokenCount: 8
916918
stopReason: STOP_SEQUENCE
919+
stopSequence: '.'
917920
text: 'The very long story is written by a very long story.'
918921

922+
# Stop sequence, omitted
923+
- name: Stop sequence omitted
924+
request:
925+
params:
926+
stopping:
927+
maxNewTokens: 20
928+
includeStopSequence: false
929+
stopSequences:
930+
- "written"
931+
requests:
932+
- {"text": "A very long story:\n"}
933+
response:
934+
responses:
935+
- generatedTokenCount: 8
936+
inputTokenCount: 8
937+
stopReason: STOP_SEQUENCE
938+
stopSequence: written
939+
text: 'The very long story is '
940+
941+
# Stop sequence partial token
942+
- name: Stop sequence partial token
943+
request:
944+
params:
945+
stopping:
946+
maxNewTokens: 20
947+
includeStopSequence: true
948+
stopSequences:
949+
- "tte"
950+
requests:
951+
- {"text": "A very long story:\n"}
952+
response:
953+
responses:
954+
- generatedTokenCount: 8
955+
inputTokenCount: 8
956+
stopReason: STOP_SEQUENCE
957+
stopSequence: tte
958+
text: 'The very long story is writte'
959+
960+
961+
# Long stop sequence, omitted
962+
- name: Long stop sequence omitted
963+
request:
964+
params:
965+
stopping:
966+
maxNewTokens: 80
967+
includeStopSequence: false
968+
stopSequences:
969+
- "story. The story is written by a very lon"
970+
requests:
971+
- {"text": "A very long story:\n"}
972+
response:
973+
responses:
974+
- generatedTokenCount: 27
975+
inputTokenCount: 8
976+
stopReason: STOP_SEQUENCE
977+
stopSequence: "story. The story is written by a very lon"
978+
text: 'The very long story is written by a very long '
979+
980+
# Long stop sequence partial token
981+
- name: Long stop sequence partial token
982+
request:
983+
params:
984+
stopping:
985+
maxNewTokens: 80
986+
includeStopSequence: true
987+
stopSequences:
988+
- "story. The story is written by a very lon"
989+
requests:
990+
- {"text": "A very long story:\n"}
991+
response:
992+
responses:
993+
- generatedTokenCount: 27
994+
inputTokenCount: 8
995+
stopReason: STOP_SEQUENCE
996+
stopSequence: "story. The story is written by a very lon"
997+
text: The very long story is written by a very long story. The story is written
998+
by a very lon
999+
1000+
9191001
# Repetition penalty
9201002
- name: Repetition penalty
9211003
request:
@@ -1004,26 +1086,28 @@
10041086
- {"text": "Somewhere,\nover the rainbow,\nthere is"}
10051087
response:
10061088
responses:
1007-
- generatedTokenCount: 7
1008-
inputTokenCount: 8
1009-
seed: '22'
1010-
stopReason: STOP_SEQUENCE
1011-
text: 'The very long story is '
1012-
- generatedTokenCount: 3
1013-
inputTokenCount: 2
1014-
seed: '22'
1015-
stopReason: STOP_SEQUENCE
1016-
text: The Giant
1017-
- generatedTokenCount: 10
1018-
inputTokenCount: 5
1019-
seed: '22'
1020-
stopReason: MAX_TOKENS
1021-
text: One of the largest cities in the world
1022-
- generatedTokenCount: 4
1023-
inputTokenCount: 11
1024-
seed: '22'
1025-
stopReason: EOS_TOKEN
1026-
text: The rainbow
1089+
- generatedTokenCount: 7
1090+
inputTokenCount: 8
1091+
seed: '22'
1092+
stopReason: STOP_SEQUENCE
1093+
stopSequence: 'is '
1094+
text: 'The very long story is '
1095+
- generatedTokenCount: 3
1096+
inputTokenCount: 2
1097+
seed: '22'
1098+
stopReason: STOP_SEQUENCE
1099+
stopSequence: ant
1100+
text: The Giant
1101+
- generatedTokenCount: 10
1102+
inputTokenCount: 5
1103+
seed: '22'
1104+
stopReason: MAX_TOKENS
1105+
text: One of the largest cities in the world
1106+
- generatedTokenCount: 4
1107+
inputTokenCount: 11
1108+
seed: '22'
1109+
stopReason: EOS_TOKEN
1110+
text: The rainbow
10271111

10281112
## TODO move to separate test
10291113

launcher/src/main.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,9 @@ struct Args {
6767
output_special_tokens: bool,
6868
#[clap(default_value = "1.0", long, short, env)]
6969
cuda_process_memory_fraction: f32,
70+
// Default for default_include_stop_seqs is true for now, for backwards compatibility
71+
#[clap(default_value = "true", long, env, action = clap::ArgAction::Set)]
72+
default_include_stop_seqs: bool,
7073
}
7174

7275
fn main() -> ExitCode {
@@ -233,6 +236,10 @@ fn main() -> ExitCode {
233236
argv.push("--output-special-tokens".into());
234237
}
235238

239+
if args.default_include_stop_seqs {
240+
argv.push("--default-include-stop-seqs".into());
241+
}
242+
236243
let mut webserver = match Popen::create(
237244
&argv,
238245
PopenConfig {

proto/generation.proto

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ message GenerationResponse {
5454
uint32 generated_token_count = 2;
5555
string text = 4;
5656
StopReason stop_reason = 7;
57+
// The stop sequence encountered, iff stop_reason == STOP_SEQUENCE
58+
string stop_sequence = 11;
5759
// Random seed used, not applicable for greedy requests
5860
uint64 seed = 10;
5961

@@ -121,6 +123,8 @@ message StoppingCriteria {
121123
// Default (0) means no time limit
122124
uint32 time_limit_millis = 3;
123125
repeated string stop_sequences = 4;
126+
// If not specified, default behavior depends on server setting
127+
optional bool include_stop_sequence = 5;
124128

125129
//more to come
126130
}

router/client/src/client.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -83,9 +83,9 @@ impl Client {
8383
.instrument(info_span!("model_info"))
8484
.await?
8585
.into_inner();
86-
ModelType::from_i32(response.model_type)
86+
ModelType::try_from(response.model_type)
8787
.map(|mt| (mt, response.eos_token, response.batch_padding))
88-
.ok_or(ClientError::Generation("Unrecognized model type".to_string()))
88+
.map_err(|_| ClientError::Generation("Unrecognized model type".to_string()))
8989
}
9090

9191
/// Get model health
@@ -99,9 +99,9 @@ impl Client {
9999
/// Get shard model info
100100
#[instrument(skip(self))]
101101
pub async fn prefix_lookup(&mut self, prefix_id: String) -> Result<u32> {
102-
let mut request = tonic::Request::new(PrefixLookupRequest {
103-
prefix_id
104-
});
102+
let mut request = tonic::Request::new(
103+
PrefixLookupRequest { prefix_id }
104+
);
105105
request.set_timeout(PREFIX_LOOKUP_TIMEOUT);
106106
let response = self.stub
107107
.prefix_lookup(request)

0 commit comments

Comments
 (0)