
Commit 0103d69

tjohnson31415 authored and njhill committed
feat: implement CombinedKVCausalLMBatch to support GPTBigCode
Signed-off-by: Travis Johnson <[email protected]>
1 parent ad3fb90 commit 0103d69
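
GPTBigCode uses multi-query attention, so each layer's KV cache is a single fused tensor rather than the separate key/value pair other causal LMs carry, which is why a dedicated batch class is needed. The sketch below is illustrative only, not the code from this commit; the shapes follow the Hugging Face GPTBigCode implementation's multi-query path.

```python
import torch

batch_size, num_heads, seq_len, head_dim = 4, 16, 32, 64

# Typical causal LM: per-layer past is a (key, value) pair of tensors,
# each shaped (batch, num_heads, seq_len, head_dim).
key = torch.zeros(batch_size, num_heads, seq_len, head_dim)
value = torch.zeros(batch_size, num_heads, seq_len, head_dim)
past_layer = (key, value)

# GPTBigCode with multi-query attention: one shared KV head, with key and
# value fused along the last dimension of a single tensor per layer,
# shaped (batch, seq_len, 2 * head_dim).
combined_past_layer = torch.zeros(batch_size, seq_len, 2 * head_dim)
k, v = combined_past_layer.split(head_dim, dim=-1)  # views, no copy

# Batch concatenation and filtering therefore manipulate one tensor per
# layer instead of two, which is what a combined-KV batch class must handle.
merged = torch.cat([combined_past_layer, combined_past_layer], dim=0)
```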

File tree

4 files changed: +414 -84 lines

Binary file not shown.

integration_tests/text_generation_tests/test_cases_tinystarcoderpy.yaml

Lines changed: 259 additions & 0 deletions
@@ -0,0 +1,259 @@
# Test empty requests
- name: Empty 1
  request: {}
  response: {}
- name: Empty 2
  request:
    params: {}
    requests: []
  response: {}

# Simple
- name: Simple
  request:
    requests:
      - {"text": "def hello_world():\n"}
  response:
    responses:
      - generatedTokenCount: 14
        inputTokenCount: 6
        stopReason: EOS_TOKEN
        text: "\tprint(\"Hello World!\")\n\nhello_world()\n"

# Basic Greedy (implicit)
- name: Basic Greedy, max new tokens (implicit)
  request:
    requests:
      - {"text": "'''Implement the class Shape'''\n"}
  response:
    responses:
      - generatedTokenCount: 20
        inputTokenCount: 7
        stopReason: MAX_TOKENS
        text: "\nclass Shape(object):\n    '''Shape class'''\n\n    def __init__(self, x,"

# Basic Greedy (explicit)
- name: Basic Greedy, max new tokens (explicit)
  request:
    params:
      method: GREEDY
      stopping: {"maxNewTokens": 24}
    requests:
      - {"text": "'''Implement the class Shape'''\n"}
  response:
    responses:
      - generatedTokenCount: 24
        inputTokenCount: 7
        stopReason: MAX_TOKENS
        text: "\nclass Shape(object):\n    '''Shape class'''\n\n    def __init__(self, x, y, z):"

# Multiple inputs with token info
- name: Multiple inputs with token info
  request:
    params:
      method: GREEDY
      stopping: {"maxNewTokens": 2}
      response:
        generatedTokens: true
        tokenLogprobs: true
        topNTokens: 2
    requests:
      - {"text": "def hello_world():\n"}
      - {"text": "def merge_lists("}
      - {"text": "if __name__ == \""}
  response:
    responses:
      - generatedTokenCount: 2
        inputTokenCount: 6
        stopReason: MAX_TOKENS
        text: "\tprint"
        tokens:
          - logprob: -0.08069111
            text: "\u0109"
            topTokens:
              - logprob: -0.08069111
                text: "\u0109"
              - logprob: -3.2008388
                text: '#'
          - logprob: -0.89866674
            text: print
            topTokens:
              - logprob: -0.89866674
                text: print
              - logprob: -1.8317665
                text: return
      - generatedTokenCount: 2
        inputTokenCount: 5
        stopReason: MAX_TOKENS
        text: l1
        tokens:
          - logprob: -1.9720234
            text: l
            topTokens:
              - logprob: -1.9720234
                text: l
              - logprob: -2.3360019
                text: list
          - logprob: -0.24351147
            text: '1'
            topTokens:
              - logprob: -0.24351147
                text: '1'
              - logprob: -2.4751484
                text: ','
      - generatedTokenCount: 2
        inputTokenCount: 6
        stopReason: MAX_TOKENS
        text: 'main":'
        tokens:
          - logprob: -1.5838054
            text: main
            topTokens:
              - logprob: -1.5838054
                text: main
              - logprob: -3.0222993
                text: test
          - logprob: -0.18766436
            text: '":'
            topTokens:
              - logprob: -0.18766436
                text: '":'
              - logprob: -2.5319178
                text: '"'


# Prompt prefix
- name: Greedy with tuned prompt prefix
  # Prompt prefixes with multi-shard not yet supported
  singleShardOnly: true
  request:
    # Prefix is "def hello_world():\n"
    prefixId: tiny_starcoder
    params:
      method: GREEDY
    requests:
      - {"text": "\tprint"}
  response:
    responses:
      - generatedTokenCount: 12
        inputTokenCount: 2
        stopReason: EOS_TOKEN
        text: "(\"Hello World!\")\n\nhello_world()\n"


# Prompt prefix returning input and generated tokens
- name: Greedy with tuned prompt prefix and returned tokens
  # Prompt prefixes with multi-shard not yet supported
  singleShardOnly: true
  request:
    # Prefix is "def hello_world():\n"
    prefixId: tiny_starcoder
    params:
      method: GREEDY
      stopping: {"maxNewTokens": 2}
      response:
        inputTokens: true
        generatedTokens: true
        tokenLogprobs: true
        tokenRanks: true
        topNTokens: 2
    requests:
      - {"text": "\tprint(\"Hello"}
  response:
    responses:
      - generatedTokenCount: 2
        inputTokenCount: 4
        text: ' World!")'
        stopReason: MAX_TOKENS
        inputTokens:
          - logprob: NaN
            text: <|endoftext|>
          - logprob: -10.14109
            rank: 2574
            text: <|endoftext|>
            topTokens:
              - logprob: -3.447822
                text: "\u0120_"
              - logprob: -3.672276
                text: "\u0120__"
          - logprob: -12.594888
            rank: 1165
            text: <|endoftext|>
            topTokens:
              - logprob: -1.1129533
                text: _
              - logprob: -1.2004529
                text: (
          - logprob: -13.206944
            rank: 4837
            text: <|endoftext|>
            topTokens:
              - logprob: -0.32641557
                text: world
              - logprob: -4.8018546
                text: server
          - logprob: -11.724733
            rank: 76
            text: <|endoftext|>
            topTokens:
              - logprob: -0.70839006
                text: '():'
              - logprob: -0.9568966
                text: (
          - logprob: -11.811299
            rank: 122
            text: <|endoftext|>
            topTokens:
              - logprob: -0.15292865
                text: "\u010A\u0120\u0120\u0120"
              - logprob: -3.31403
                text: "\u010D\u010A\u0120\u0120\u0120"
          - logprob: -0.080691434
            rank: 1
            text: "\u0109"
            topTokens:
              - logprob: -0.080691434
                text: "\u0109"
              - logprob: -3.2008343
                text: '#'
          - logprob: -0.8986669
            rank: 1
            text: print
            topTokens:
              - logprob: -0.8986669
                text: print
              - logprob: -1.8317685
                text: return
          - logprob: -0.67005044
            rank: 1
            text: ("
            topTokens:
              - logprob: -0.67005044
                text: ("
              - logprob: -1.3652618
                text: ('
          - logprob: -0.6229511
            rank: 1
            text: Hello
            topTokens:
              - logprob: -0.6229511
                text: Hello
              - logprob: -1.4623008
                text: hello
        tokens:
          - logprob: -0.61369985
            rank: 1
            text: "\u0120World"
            topTokens:
              - logprob: -0.61369985
                text: "\u0120World"
              - logprob: -1.7381792
                text: ','
          - logprob: -0.7115159
            rank: 1
            text: '!")'
            topTokens:
              - logprob: -0.7115159
                text: '!")'
              - logprob: -1.0358996
                text: '")'

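Each case above pairs a request dict with the expected response dict. The sketch below shows roughly how such a case maps onto the generation gRPC API, mirroring the `json_format.ParseDict` pattern visible in `test_server.py` below; the stub, generated `pb2` module, and the `Generate` RPC name are assumptions here, not shown in this commit.

```python
import yaml
from google.protobuf import json_format


def run_case(stub, pb2, case):
    """Drive one YAML test case through the generation service.

    `stub` is a connected gRPC stub and `pb2` the generated protobuf module;
    both come from harness setup not shown in this commit, and the RPC name
    `Generate` is an assumption.
    """
    # Build the request message straight from the YAML "request" mapping.
    message = json_format.ParseDict(case.get("request", {}), pb2.BatchedGenerationRequest())
    response = stub.Generate(message)
    # The YAML "response" mapping is the expected JSON form of the reply.
    assert json_format.MessageToDict(response) == case.get("response", {})


def load_cases(path="test_cases_tinystarcoderpy.yaml"):
    with open(path) as f:
        return yaml.load(f, Loader=yaml.Loader)
```
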
integration_tests/text_generation_tests/test_server.py

Lines changed: 13 additions & 3 deletions
@@ -19,6 +19,7 @@
 
 INCLUDE_STREAMING = True
 TESTS_TIMEOUT = 300.0  # 5 mins
+TESTS_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), ".."))
 
 
 def start_server(
@@ -65,6 +66,7 @@ def start_server(
 
     env = os.environ.copy()
     env["RUST_BACKTRACE"] = "full"
+    env["PREFIX_STORE_PATH"] = os.path.join(TESTS_DIR, "prompt_prefixes")
     if not include_cache_env_vars:
         env.pop("TRANSFORMERS_CACHE", None)
         env.pop("HUGGING_FACE_HUB_CACHE", None)
@@ -122,7 +124,7 @@ def server_fixture(request):
 @pytest.fixture
 def test_cases(request):
     filename = request.node.get_closest_marker("test_case_file").args[0]
-    with open(filename) as f:
+    with open(os.path.join(TESTS_DIR, filename)) as f:
         return yaml.load(f, Loader=yaml.Loader)
 
 
@@ -290,7 +292,7 @@ async def run_test_cases_async(test_cases, seq2seq_model=False, sharded=False):
 async def _test_multi_input_seeds(stub):
     # Ensure that sending a batch of identical inputs in sampling mode results
     # in different output seeds and texts
-    with open("test_cases_common.yaml") as f:
+    with open(os.path.join(TESTS_DIR, "test_cases_common.yaml")) as f:
         test_case = yaml.load(f, Loader=yaml.Loader)
     request = test_case["seed_test"]["request"]
     message = json_format.ParseDict(request, pb2.BatchedGenerationRequest())
@@ -326,6 +328,15 @@ async def test_bloom(server_fixture, test_cases):
 async def test_mt0(server_fixture, test_cases):
     await run_test_cases_async(test_cases, seq2seq_model=True)
 
+# test with tiny GPTBigCode model for the merged kv cache
+@pytest.mark.model("bigcode/tiny_starcoder_py")
+@pytest.mark.extensions(".safetensors,.json")
+@pytest.mark.shards(1)
+@pytest.mark.test_case_file("test_cases_tinystarcoderpy.yaml")
+@pytest.mark.asyncio
+async def test_gptbigcode(server_fixture, test_cases):
+    await run_test_cases_async(test_cases)
+
 
 # Test distributed inference - two shards
 @pytest.mark.model("bigscience/bloom-560m")
@@ -375,4 +386,3 @@ def event_loop():
     loop = asyncio.new_event_loop()
     yield loop
     loop.close()
-
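
Together, these path changes make the suite location-independent: test-case YAML files are opened relative to `TESTS_DIR`, and the server is pointed at the bundled `prompt_prefixes` store, which is where `prefixId: tiny_starcoder` in the YAML above resolves. A minimal sketch of that lookup under an assumed store layout (one entry per prefix id; the actual file format is the binary not shown above):

```python
import os


def resolve_prefix_path(prefix_id: str) -> str:
    # PREFIX_STORE_PATH is exported by start_server() in the diff above.
    store = os.environ["PREFIX_STORE_PATH"]
    # Assumed layout: one entry per prefixId inside the store directory.
    path = os.path.join(store, prefix_id)
    if not os.path.exists(path):
        raise ValueError(f"unknown prompt prefix: {prefix_id}")
    return path
```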
