Skip to content

Commit 2e1a223

Browse files
committed
Add num_input_tokens additional output
1 parent 366e668 commit 2e1a223

File tree

3 files changed

+42
-2
lines changed

3 files changed

+42
-2
lines changed

ci/L0_additional_outputs_vllm/additional_outputs_test.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ def _get_inputs(
4444
sampling_parameters=None,
4545
return_finish_reason=None,
4646
return_cumulative_logprob=None,
47+
return_num_input_tokens=None,
4748
return_num_output_tokens=None,
4849
):
4950
inputs = []
@@ -76,6 +77,12 @@ def _get_inputs(
7677
np.array([return_cumulative_logprob], dtype=bool)
7778
)
7879

80+
if return_num_input_tokens is not None:
81+
inputs.append(grpcclient.InferInput("return_num_input_tokens", [1], "BOOL"))
82+
inputs[-1].set_data_from_numpy(
83+
np.array([return_num_input_tokens], dtype=bool)
84+
)
85+
7986
if return_num_output_tokens is not None:
8087
inputs.append(
8188
grpcclient.InferInput("return_num_output_tokens", [1], "BOOL")
@@ -135,6 +142,18 @@ def _assert_cumulative_logprob(self, return_cumulative_logprob):
135142
assert cumulative_logprob != prev_cumulative_logprob
136143
prev_cumulative_logprob = cumulative_logprob
137144

145+
def _assert_num_input_tokens(self, return_num_input_tokens):
146+
for response in self._responses:
147+
result, error = response["result"], response["error"]
148+
assert error is None
149+
num_input_tokens_np = result.as_numpy(name="num_input_tokens")
150+
if return_num_input_tokens is None or return_num_input_tokens == False:
151+
assert num_input_tokens_np is None
152+
continue
153+
num_input_tokens = num_input_tokens_np.astype(int)
154+
assert num_input_tokens > 0
155+
assert num_input_tokens <= len(self._prompt)
156+
138157
def _assert_num_output_tokens(self, return_num_output_tokens):
139158
for response in self._responses:
140159
result, error = response["result"], response["error"]
@@ -166,12 +185,14 @@ def _assert_num_output_tokens(self, return_num_output_tokens):
166185
@pytest.mark.parametrize("stream", [True, False])
167186
@pytest.mark.parametrize("return_finish_reason", [None, True, False])
168187
@pytest.mark.parametrize("return_cumulative_logprob", [None, True, False])
188+
@pytest.mark.parametrize("return_num_input_tokens", [None, True, False])
169189
@pytest.mark.parametrize("return_num_output_tokens", [None, True, False])
170190
def test_additional_outputs(
171191
self,
172192
stream,
173193
return_finish_reason,
174194
return_cumulative_logprob,
195+
return_num_input_tokens,
175196
return_num_output_tokens,
176197
):
177198
inputs = self._get_inputs(
@@ -180,10 +201,12 @@ def test_additional_outputs(
180201
sampling_parameters=self._sampling_parameters,
181202
return_finish_reason=return_finish_reason,
182203
return_cumulative_logprob=return_cumulative_logprob,
204+
return_num_input_tokens=return_num_input_tokens,
183205
return_num_output_tokens=return_num_output_tokens,
184206
)
185207
self._llm_infer(inputs)
186208
self._assert_text_output_valid()
187209
self._assert_finish_reason(return_finish_reason)
188210
self._assert_cumulative_logprob(return_cumulative_logprob)
211+
self._assert_num_input_tokens(return_num_input_tokens)
189212
self._assert_num_output_tokens(return_num_output_tokens)

ci/L0_additional_outputs_vllm/test.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.3/' models/v
4040
RET=0
4141

4242
# Test
43-
SERVER_LOG="vllm_opt.server.log"
43+
SERVER_LOG="additional_outputs_test.server.log"
4444
SERVER_ARGS="--model-repository=models"
4545
run_server
4646
if [ "$SERVER_PID" == "0" ]; then

src/model.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,12 @@ def _auto_complete_inputs_and_outputs(auto_complete_model_config):
104104
"dims": [1],
105105
"optional": True,
106106
},
107+
{
108+
"name": "return_num_input_tokens",
109+
"data_type": "TYPE_BOOL",
110+
"dims": [1],
111+
"optional": True,
112+
},
107113
{
108114
"name": "return_num_output_tokens",
109115
"data_type": "TYPE_BOOL",
@@ -125,6 +131,7 @@ def _auto_complete_inputs_and_outputs(auto_complete_model_config):
125131
{"name": "text_output", "data_type": "TYPE_STRING", "dims": [-1]},
126132
{"name": "finish_reason", "data_type": "TYPE_STRING", "dims": [-1]},
127133
{"name": "cumulative_logprob", "data_type": "TYPE_FP32", "dims": [-1]},
134+
{"name": "num_input_tokens", "data_type": "TYPE_UINT32", "dims": [1]},
128135
{"name": "num_output_tokens", "data_type": "TYPE_UINT32", "dims": [-1]},
129136
]
130137

@@ -377,10 +384,11 @@ def _get_input_tensors(self, request):
377384
else:
378385
parameters = request.parameters()
379386

380-
# return_finish_reason, return_cumulative_logprob, return_num_output_tokens
387+
# additional outputs
381388
additional_outputs = {
382389
"return_finish_reason": None,
383390
"return_cumulative_logprob": None,
391+
"return_num_input_tokens": None,
384392
"return_num_output_tokens": None,
385393
}
386394
for tensor_name in additional_outputs.keys():
@@ -496,6 +504,15 @@ def _create_response(
496504
)
497505
)
498506

507+
# num_input_tokens
508+
if additional_outputs["return_num_input_tokens"]:
509+
num_input_tokens = len(request_output.prompt_token_ids)
510+
output_tensors.append(
511+
pb_utils.Tensor(
512+
"num_input_tokens", np.asarray(num_input_tokens, dtype=np.uint32)
513+
)
514+
)
515+
499516
# num_output_tokens
500517
if additional_outputs["return_num_output_tokens"]:
501518
if prev_request_output is None:

0 commit comments

Comments (0)