
Commit 795c087

Merge pull request #73 from opendatahub-io/main
Sync odh/main with odh/release
2 parents: daaa6b6 + 102f77d

File tree

23 files changed (+3347, -1197 lines)

Cargo.lock

Lines changed: 288 additions & 203 deletions
Generated file; diff not rendered by default.

Dockerfile

Lines changed: 8 additions & 2 deletions
@@ -1,6 +1,6 @@
 ## Global Args #################################################################
 ARG BASE_UBI_IMAGE_TAG=9.3-1610
-ARG PROTOC_VERSION=25.2
+ARG PROTOC_VERSION=25.3
 ARG PYTORCH_INDEX="https://download.pytorch.org/whl"
 # ARG PYTORCH_INDEX="https://download.pytorch.org/whl/nightly"
 ARG AUTO_GPTQ_VERSION=0.7.1
@@ -86,7 +86,7 @@ ENV LIBRARY_PATH="$CUDA_HOME/lib64/stubs"
 
 ## Rust builder ################################################################
 # Specific debian version so that compatible glibc version is used
-FROM rust:1.77-bullseye as rust-builder
+FROM rust:1.77.2-bullseye as rust-builder
 ARG PROTOC_VERSION
 
 ENV CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
@@ -164,6 +164,9 @@ RUN cd server && \
     make gen-server && \
     pip install ".[accelerate]" --no-cache-dir
 
+# temp: install newer transformers lib that optimum clashes with
+RUN pip install transformers==4.40.0 tokenizers==0.19.1 --no-cache-dir
+
 # Patch codegen model changes into transformers
 RUN cp server/transformers_patch/modeling_codegen.py ${SITE_PACKAGES}/transformers/models/codegen/modeling_codegen.py
 
@@ -288,6 +291,9 @@ COPY server server
 # Ref: https://onnxruntime.ai/docs/install/#install-onnx-runtime-gpu-cuda-12x
 RUN cd server && make gen-server && pip install ".[accelerate, ibm-fms, onnx-gpu, quantize]" --no-cache-dir --extra-index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
 
+# temp: install newer transformers lib that optimum clashes with
+RUN pip install transformers==4.40.0 tokenizers==0.19.1 --no-cache-dir
+
 # Patch codegen model changes into transformers 4.35
 RUN cp server/transformers_patch/modeling_codegen.py ${SITE_PACKAGES}/transformers/models/codegen/modeling_codegen.py
 

integration_tests/Makefile

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 gen-client:
 	# Compile protos
-	pip install grpcio-tools==1.60.0 mypy-protobuf==3.5.0 'types-protobuf>=3.20.4' --no-cache-dir
+	pip install grpcio-tools==1.62.2 mypy-protobuf==3.5.0 'types-protobuf>=3.20.4' --no-cache-dir
 	mkdir text_generation_tests/pb || true
 	python -m grpc_tools.protoc -I../proto --python_out=text_generation_tests/pb \
 		--grpc_python_out=text_generation_tests/pb --mypy_out=text_generation_tests/pb ../proto/generation.proto
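The `gen-client` target shells out to `grpc_tools.protoc` to produce the `text_generation_tests/pb` package that the tests and `sample_client.py` below import. The same generation can be driven from Python if that is more convenient in a test fixture; a rough equivalent of the recipe, assuming it runs from `integration_tests/` with grpcio-tools and mypy-protobuf installed:

    import os
    from grpc_tools import protoc

    # mirrors the Makefile recipe: compile ../proto/generation.proto into
    # text_generation_tests/pb with message, gRPC and mypy stub outputs
    os.makedirs("text_generation_tests/pb", exist_ok=True)
    protoc.main([
        "protoc",
        "-I../proto",
        "--python_out=text_generation_tests/pb",
        "--grpc_python_out=text_generation_tests/pb",
        "--mypy_out=text_generation_tests/pb",
        "../proto/generation.proto",
    ])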

integration_tests/poetry.lock

Lines changed: 303 additions & 308 deletions
Generated file; diff not rendered by default.

integration_tests/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ python = ">=3.11"
 
 [tool.poetry.group.dev.dependencies]
 protobuf = "^4.25.3"
-grpcio-tools = "^1.62.1"
+grpcio-tools = "^1.62.2"
 pytest = "^8.1.1"
 pytest-asyncio = "^0.23.6"
 requests = "^2.31.0"

integration_tests/sample_client.py

Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
+import time
+import grpc
+from google.protobuf import json_format
+from text_generation_tests.pb import generation_pb2_grpc as gpb2, generation_pb2 as pb2
+
+
+def get_streaming_response_tgis(response):
+    stop = False
+    generated_tokens = 0
+    while not stop:
+        try:
+            x = next(response)
+            timestamp = time.time_ns()
+            data = json_format.MessageToDict(x)
+            # skip first response (tokenizer output only)
+            if "inputTokenCount" not in data:
+                n_tokens = data["generatedTokenCount"] - generated_tokens
+                generated_tokens = data["generatedTokenCount"]
+                yield data, n_tokens, timestamp, True, None
+        except Exception as e:
+            timestamp = time.time_ns()
+            yield None, 0, timestamp, False, e
+
+
+channel = grpc.insecure_channel("localhost:8033")
+stub = gpb2.GenerationServiceStub(channel)
+max_new_tokens = 100
+
+template = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{}\n\n### Response:"
+num_req = 0
+while True:
+    prompt_input = input(f"\n{num_req}) Enter a prompt:\n")
+
+    print("-" * 40)
+    print("Output:")
+    prompt = template.format(prompt_input)
+    sample_request = {
+        "model_id": "dummy-model-name",
+        "request": {"text": prompt},
+        "params": {
+            "method": "GREEDY",
+            "stopping": {
+                "max_new_tokens": max_new_tokens,
+                "min_new_tokens": max_new_tokens,
+            },
+        },
+    }
+    message = json_format.ParseDict(sample_request, pb2.SingleGenerationRequest())
+    output = []
+    total_time = 0
+    response = stub.GenerateStream(message)
+    response_generator = get_streaming_response_tgis(response)
+    t0 = time.time_ns()
+    response = ""
+    stop = False
+    while not stop:
+        r, n_tokens, t, ok, err = next(response_generator)
+
+        if not ok:
+            stop = True
+            # check if we have reached end of stream
+            if type(err) is StopIteration:
+                continue
+        duration = (t - t0) / 1000.0 / 1000.0
+        record = {
+            "response": r,
+            "ok": ok,
+            "error": str(err),
+            "timestamp": t,
+            "duration_ms": duration,
+            "n_tokens": n_tokens,
+        }
+        total_time += duration
+        response += r["text"]
+        output.append(record)
+        t0 = t
+
+    # print(json.dumps(output, indent=4))
+    print("-" * 40)
+    print(response)
+    print("-" * 40)
+    print(f"Total_time : {total_time}ms")
+    print(f"Time_per_token : {total_time/max_new_tokens}ms")
+    print("-" * 40)
+    num_req += 1
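The script above exercises only the streaming GenerateStream RPC and assumes a server listening on localhost:8033. A non-streaming counterpart would look roughly like the sketch below; it further assumes that ../proto/generation.proto also defines a unary Generate RPC taking a BatchedGenerationRequest with a repeated "requests" field (as in upstream TGIS), which this diff does not itself show:

    import grpc
    from google.protobuf import json_format
    from text_generation_tests.pb import generation_pb2_grpc as gpb2, generation_pb2 as pb2

    channel = grpc.insecure_channel("localhost:8033")
    stub = gpb2.GenerationServiceStub(channel)

    # assumed message/field names (Generate, BatchedGenerationRequest, requests);
    # the params block mirrors the streaming example above
    request = json_format.ParseDict(
        {
            "model_id": "dummy-model-name",
            "requests": [{"text": "Hello"}],
            "params": {"method": "GREEDY", "stopping": {"max_new_tokens": 16}},
        },
        pb2.BatchedGenerationRequest(),
    )
    print(json_format.MessageToDict(stub.Generate(request)))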

launcher/Cargo.toml

Lines changed: 2 additions & 2 deletions
@@ -6,10 +6,10 @@ authors = ["Olivier Dehaene"]
 description = "Text Generation Launcher"
 
 [dependencies]
-clap = { version = "4.5.3", features = ["derive", "env"] }
+clap = { version = "4.5.4", features = ["derive", "env"] }
 ctrlc = { version = "3.4.4", features = ["termination"] }
 nix = { version = "0.28.0", features = ["process", "signal"] }
-serde_json = "^1.0.114"
+serde_json = "^1.0.11"
 tracing = "0.1.40"
 tracing-subscriber = { version = "0.3.18", features = ["json", "env-filter"] }
 uuid = { version = "1.8.0", features = ["v4", "fast-rng"] }

router/Cargo.toml

Lines changed: 13 additions & 12 deletions
@@ -17,33 +17,34 @@ path = "src/main.rs"
 axum = { version = "0.6.20", features = ["json"] }
 axum-tracing-opentelemetry = "0.10.0"
 text-generation-client = { path = "client" }
-clap = { version = "^4.5.2", features = ["derive", "env"] }
+clap = { version = "^4.5.4", features = ["derive", "env"] }
 futures = "^0.3.30"
 flume = "^0.11.0"
 metrics = "0.21.1"
 metrics-exporter-prometheus = { version = "0.12.2", features = [] }
-moka = { version = "0.12.5", features = ["future"] }
+moka = { version = "0.12.6", features = ["future"] }
 nohash-hasher = "^0.2.0"
-num = "^0.4.1"
+num = "^0.4.2"
 num_cpus = "^1.16.0"
 hyper = "^0.14.28" # Override to address CVE-2023-26964
+h2 = "^0.3.26" # Override to address CVEs
 openssl = "^0.10.64" # Override to address WS-2023-0082, WS-2023-0083, WS-2023-0195
-openssl-sys = "^0.9.101" # Override to address WS-2023-0082, WS-2023-0083, WS-2023-0195
+openssl-sys = "^0.9.102" # Override to address WS-2023-0082, WS-2023-0083, WS-2023-0195
 rustls-webpki = "0.102.2" # Override to address WS-2023-0305, CVE-2018-16875
 rand = "^0.8.5"
-serde = "^1.0.197"
-serde_json = "^1.0.114"
+serde = "^1.0.198"
+serde_json = "^1.0.116"
 thiserror = "^1.0.57"
-tokenizers = "0.15.2"
-tokio = { version = "1.36.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync", "fs"] }
-tokio-rustls = "^0.25.0"
-rustls = "0.22.2"
+tokenizers = "0.19.1"
+tokio = { version = "1.37.0", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync", "fs"] }
+tokio-rustls = "^0.26.0"
+rustls = "0.22.4"
 tracing = "^0.1.40"
-prost = "^0.12.3"
+prost = "^0.12.4"
 tonic = { version = "^0.11.0", features = ["tls"] }
 tracing-subscriber = { version = "0.3.18", features = ["json", "env-filter"] }
 tracing-opentelemetry = "0.23.0"
-tokio-stream ="^0.1.14"
+tokio-stream ="^0.1.15"
 unicode-segmentation = "^1.11.0"
 unicode-truncate = "^0.2.0"
 opentelemetry = "0.22.0"

router/client/Cargo.toml

Lines changed: 2 additions & 2 deletions
@@ -6,9 +6,9 @@ build="build.rs"
 
 [dependencies]
 futures = "^0.3.30"
-prost = "^0.12.3"
+prost = "^0.12.4"
 thiserror = "^1.0.58"
-tokio = { version = "1.36.0", features = ["sync"] }
+tokio = { version = "1.37.0", features = ["sync"] }
 tonic = "^0.11.0"
 tower = "^0.4.13"
 tracing = "^0.1.40"

router/src/batcher.rs

Lines changed: 13 additions & 4 deletions
@@ -839,10 +839,19 @@ impl<'a> TokenProcessor<'a> {
         let request_id = output.request_id;
         let next_token_id = output.token_id;
 
-        let e = self
-            .entries
-            .get_mut(&request_id)
-            .expect("ID not found. This is a bug.");
+        let e = self.entries.get_mut(&request_id);
+
+        // if a client cancelled a request and speculative decoding is
+        // enabled, it's possible that the request will get removed
+        // from entries table, but there can still be tokens in outputs stream
+        // corresponding to that request. ideally we could defer removing
+        // the request_id from the entries table until all tokens have been
+        // processed...but for now let's just ignore them.
+        if e.is_none() {
+            continue;
+        }
+
+        let e = e.unwrap();
 
         let is_stream = e.stream_tx.is_some();
         let stop_seqs = &e.request.parameters.stop_seqs;
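The new comment explains why the lookup is now allowed to miss: when a client cancels a request while speculative decoding is enabled, the entry is removed from the table while a few of its tokens may still be in flight on the outputs stream, and those orphaned outputs are simply skipped. A tiny Python illustration of the same guard (not the router's code, just the shape of the fix):

    entries = {7: []}  # request_id -> tokens collected so far

    def process(outputs):
        for request_id, token_id in outputs:
            entry = entries.get(request_id)
            if entry is None:
                # entry already removed (e.g. cancelled request): ignore late tokens
                continue
            entry.append(token_id)

    # the token for request 9 arrives after its entry was removed and is dropped
    process([(7, 101), (9, 202), (7, 103)])
    print(entries)  # {7: [101, 103]}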
