Skip to content

Commit c106c67

Browse files
committed
Option to omit stop sequence from text output
Introduces an optional bool include_stop_sequence field in the stopping criteria section of the generate request API, along with a new global configuration option that controls the default behaviour for requests that don't specify it. For now that default is to include the stop sequence text, for backwards compatibility. Note that this default is applied in the launcher; at the router level, the default is to omit the stop sequence. There is also a new stop_sequence field in the generate API response message, which contains the matched stop sequence iff stop_reason == STOP_SEQUENCE. In addition, when a matched stop sequence is included in the output, the text is now trimmed to the end of the stop sequence rather than potentially including additional characters from the last generated token.
1 parent 79d38a1 commit c106c67

File tree

11 files changed

+473
-100
lines changed

11 files changed

+473
-100
lines changed

integration_tests/test_cases_bloom560m.yaml

Lines changed: 84 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -844,8 +844,9 @@
844844
- generatedTokenCount: 17
845845
inputTokenCount: 45
846846
stopReason: STOP_SEQUENCE
847+
stopSequence: "🤖 \nTran"
847848
text: "Convert movie titles into emoji.\n\nBack to the Future:
848-
👨👴🚗🕒 \nBatman: 🤵🦇 \nTransformers: 🚗🤖 \nStar Wars: 🤗🤖 \nBatman: 🤗🤖 \nTransform"
849+
👨👴🚗🕒 \nBatman: 🤵🦇 \nTransformers: 🚗🤖 \nStar Wars: 🤗🤖 \nBatman: 🤗🤖 \nTran"
849850

850851

851852
# Single token input
@@ -1087,6 +1088,7 @@
10871088
- generatedTokenCount: 14
10881089
inputTokenCount: 6
10891090
stopReason: STOP_SEQUENCE
1091+
stopSequence: confused
10901092
text: The first time I saw the movie, I was a little bit confused
10911093

10921094
# Stop sequence
@@ -1104,9 +1106,90 @@
11041106
- generatedTokenCount: 10
11051107
inputTokenCount: 6
11061108
stopReason: STOP_SEQUENCE
1109+
stopSequence: I was
11071110
text: The first time I saw the movie, I was
11081111

11091112

1113+
# Stop sequence, omitted
1114+
- name: Stop sequence omitted
1115+
request:
1116+
params:
1117+
stopping:
1118+
maxNewTokens: 20
1119+
includeStopSequence: false
1120+
stopSequences:
1121+
- " movie"
1122+
requests:
1123+
- {"text": "A very long story:\n"}
1124+
response:
1125+
responses:
1126+
- generatedTokenCount: 7
1127+
inputTokenCount: 6
1128+
stopReason: STOP_SEQUENCE
1129+
stopSequence: " movie"
1130+
text: 'The first time I saw the'
1131+
1132+
# Stop sequence partial token
1133+
- name: Stop sequence partial token
1134+
request:
1135+
params:
1136+
stopping:
1137+
maxNewTokens: 20
1138+
includeStopSequence: true
1139+
stopSequences:
1140+
- "confu"
1141+
requests:
1142+
- {"text": "A very long story:\n"}
1143+
response:
1144+
responses:
1145+
- generatedTokenCount: 14
1146+
inputTokenCount: 6
1147+
stopReason: STOP_SEQUENCE
1148+
stopSequence: "confu"
1149+
text: "The first time I saw the movie, I was a little bit confu"
1150+
1151+
1152+
# Long stop sequence, omitted
1153+
- name: Long stop sequence omitted
1154+
request:
1155+
params:
1156+
stopping:
1157+
maxNewTokens: 80
1158+
includeStopSequence: false
1159+
stopSequences:
1160+
- "w the movie, I was a little bit confused. I "
1161+
requests:
1162+
- {"text": "A very long story:\n"}
1163+
response:
1164+
responses:
1165+
- generatedTokenCount: 17
1166+
inputTokenCount: 6
1167+
stopReason: STOP_SEQUENCE
1168+
stopSequence: 'w the movie, I was a little bit confused. I '
1169+
text: The first time I sa
1170+
1171+
1172+
# Long stop sequence partial token
1173+
- name: Long stop sequence partial token
1174+
request:
1175+
params:
1176+
stopping:
1177+
maxNewTokens: 80
1178+
includeStopSequence: true
1179+
stopSequences:
1180+
- "to exp"
1181+
requests:
1182+
- {"text": "A very long story:\n"}
1183+
response:
1184+
responses:
1185+
- generatedTokenCount: 22
1186+
inputTokenCount: 6
1187+
stopReason: STOP_SEQUENCE
1188+
stopSequence: to exp
1189+
text: "The first time I saw the movie, I was a little bit confused. I wasn\u2019\
1190+
t sure what to exp"
1191+
1192+
11101193
# Repetition penalty
11111194
- name: Repetition penalty
11121195
request:

integration_tests/test_cases_mt0small.yaml

Lines changed: 105 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -681,6 +681,7 @@
681681
- generatedTokenCount: 6
682682
inputTokenCount: 33
683683
stopReason: STOP_SEQUENCE
684+
stopSequence: "🎭"
684685
text: "Convert movie titles into emoji.\n\nBack to the Future:
685686
👨👴🚗🕒 \nBatman: 🤵🦇 \nTransformers: 🚗🤖 \nStar Wars:\n\nTransformers: 🎬🎭"
686687

@@ -897,7 +898,8 @@
897898
- generatedTokenCount: 8
898899
inputTokenCount: 8
899900
stopReason: STOP_SEQUENCE
900-
text: 'The very long story is written'
901+
stopSequence: written
902+
text: The very long story is written
901903

902904
- name: Stop sequence 2
903905
request:
@@ -914,8 +916,88 @@
914916
- generatedTokenCount: 16
915917
inputTokenCount: 8
916918
stopReason: STOP_SEQUENCE
919+
stopSequence: '.'
917920
text: 'The very long story is written by a very long story.'
918921

922+
# Stop sequence, omitted
923+
- name: Stop sequence omitted
924+
request:
925+
params:
926+
stopping:
927+
maxNewTokens: 20
928+
includeStopSequence: false
929+
stopSequences:
930+
- "written"
931+
requests:
932+
- {"text": "A very long story:\n"}
933+
response:
934+
responses:
935+
- generatedTokenCount: 8
936+
inputTokenCount: 8
937+
stopReason: STOP_SEQUENCE
938+
stopSequence: written
939+
text: 'The very long story is '
940+
941+
# Stop sequence partial token
942+
- name: Stop sequence partial token
943+
request:
944+
params:
945+
stopping:
946+
maxNewTokens: 20
947+
includeStopSequence: true
948+
stopSequences:
949+
- "tte"
950+
requests:
951+
- {"text": "A very long story:\n"}
952+
response:
953+
responses:
954+
- generatedTokenCount: 8
955+
inputTokenCount: 8
956+
stopReason: STOP_SEQUENCE
957+
stopSequence: tte
958+
text: 'The very long story is writte'
959+
960+
961+
# Long stop sequence, omitted
962+
- name: Long stop sequence omitted
963+
request:
964+
params:
965+
stopping:
966+
maxNewTokens: 80
967+
includeStopSequence: false
968+
stopSequences:
969+
- "story. The story is written by a very lon"
970+
requests:
971+
- {"text": "A very long story:\n"}
972+
response:
973+
responses:
974+
- generatedTokenCount: 27
975+
inputTokenCount: 8
976+
stopReason: STOP_SEQUENCE
977+
stopSequence: "story. The story is written by a very lon"
978+
text: 'The very long story is written by a very long '
979+
980+
# Long stop sequence partial token
981+
- name: Long stop sequence partial token
982+
request:
983+
params:
984+
stopping:
985+
maxNewTokens: 80
986+
includeStopSequence: true
987+
stopSequences:
988+
- "story. The story is written by a very lon"
989+
requests:
990+
- {"text": "A very long story:\n"}
991+
response:
992+
responses:
993+
- generatedTokenCount: 27
994+
inputTokenCount: 8
995+
stopReason: STOP_SEQUENCE
996+
stopSequence: "story. The story is written by a very lon"
997+
text: The very long story is written by a very long story. The story is written
998+
by a very lon
999+
1000+
9191001
# Repetition penalty
9201002
- name: Repetition penalty
9211003
request:
@@ -1004,26 +1086,28 @@
10041086
- {"text": "Somewhere,\nover the rainbow,\nthere is"}
10051087
response:
10061088
responses:
1007-
- generatedTokenCount: 7
1008-
inputTokenCount: 8
1009-
seed: '22'
1010-
stopReason: STOP_SEQUENCE
1011-
text: 'The very long story is '
1012-
- generatedTokenCount: 3
1013-
inputTokenCount: 2
1014-
seed: '22'
1015-
stopReason: STOP_SEQUENCE
1016-
text: The Giant
1017-
- generatedTokenCount: 10
1018-
inputTokenCount: 5
1019-
seed: '22'
1020-
stopReason: MAX_TOKENS
1021-
text: One of the largest cities in the world
1022-
- generatedTokenCount: 4
1023-
inputTokenCount: 11
1024-
seed: '22'
1025-
stopReason: EOS_TOKEN
1026-
text: The rainbow
1089+
- generatedTokenCount: 7
1090+
inputTokenCount: 8
1091+
seed: '22'
1092+
stopReason: STOP_SEQUENCE
1093+
stopSequence: 'is '
1094+
text: 'The very long story is '
1095+
- generatedTokenCount: 3
1096+
inputTokenCount: 2
1097+
seed: '22'
1098+
stopReason: STOP_SEQUENCE
1099+
stopSequence: ant
1100+
text: The Giant
1101+
- generatedTokenCount: 10
1102+
inputTokenCount: 5
1103+
seed: '22'
1104+
stopReason: MAX_TOKENS
1105+
text: One of the largest cities in the world
1106+
- generatedTokenCount: 4
1107+
inputTokenCount: 11
1108+
seed: '22'
1109+
stopReason: EOS_TOKEN
1110+
text: The rainbow
10271111

10281112
## TODO move to separate test
10291113

launcher/src/main.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,9 @@ struct Args {
6767
output_special_tokens: bool,
6868
#[clap(default_value = "1.0", long, short, env)]
6969
cuda_process_memory_fraction: f32,
70+
// Default for default_include_stop_seqs is true for now, for backwards compatibility
71+
#[clap(default_value = "true", long, env, action = clap::ArgAction::Set)]
72+
default_include_stop_seqs: bool,
7073
}
7174

7275
fn main() -> ExitCode {
@@ -233,6 +236,10 @@ fn main() -> ExitCode {
233236
argv.push("--output-special-tokens".into());
234237
}
235238

239+
if args.default_include_stop_seqs {
240+
argv.push("--default-include-stop-seqs".into());
241+
}
242+
236243
let mut webserver = match Popen::create(
237244
&argv,
238245
PopenConfig {

proto/generation.proto

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ message GenerationResponse {
5454
uint32 generated_token_count = 2;
5555
string text = 4;
5656
StopReason stop_reason = 7;
57+
// The stop sequence encountered, iff stop_reason == STOP_SEQUENCE
58+
string stop_sequence = 11;
5759
// Random seed used, not applicable for greedy requests
5860
uint64 seed = 10;
5961

@@ -121,6 +123,8 @@ message StoppingCriteria {
121123
// Default (0) means no time limit
122124
uint32 time_limit_millis = 3;
123125
repeated string stop_sequences = 4;
126+
// If not specified, default behavior depends on server setting
127+
optional bool include_stop_sequence = 5;
124128

125129
//more to come
126130
}

router/client/src/client.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -83,9 +83,9 @@ impl Client {
8383
.instrument(info_span!("model_info"))
8484
.await?
8585
.into_inner();
86-
ModelType::from_i32(response.model_type)
86+
ModelType::try_from(response.model_type)
8787
.map(|mt| (mt, response.eos_token, response.batch_padding))
88-
.ok_or(ClientError::Generation("Unrecognized model type".to_string()))
88+
.map_err(|_| ClientError::Generation("Unrecognized model type".to_string()))
8989
}
9090

9191
/// Get model health
@@ -99,9 +99,9 @@ impl Client {
9999
/// Get shard model info
100100
#[instrument(skip(self))]
101101
pub async fn prefix_lookup(&mut self, prefix_id: String) -> Result<u32> {
102-
let mut request = tonic::Request::new(PrefixLookupRequest {
103-
prefix_id
104-
});
102+
let mut request = tonic::Request::new(
103+
PrefixLookupRequest { prefix_id }
104+
);
105105
request.set_timeout(PREFIX_LOOKUP_TIMEOUT);
106106
let response = self.stub
107107
.prefix_lookup(request)

0 commit comments

Comments
 (0)