@@ -1427,39 +1427,29 @@ inference request. For example,
 import triton_python_backend_utils as pb_utils
 
 class TritonPythonModel:
-    ...
+    ...
     def execute(self, requests):
-        ...
-        infer_request = pb_utils.InferenceRequest(
-            model_name='model_name',
-            requested_output_names=['REQUESTED_OUTPUT'],
-            inputs=[<pb_utils.Tensor object>])
+        ...
+        bls_response_iterator = bls_request.exec(decoupled=True)
+        ...
+        bls_response_iterator.cancel()
+        ...
+```
 
-        # Execute the infer_request and wait for the response. Here we are
-        # running a BLS request on a decoupled model, hence setting the parameter
-        # 'decoupled' to 'True'.
-        infer_responses = infer_request.exec(decoupled=True)
+You may also call the `cancel()` method on the response iterator returned from
+the `async_exec()` method of the inference request. For example,
 
-        response_tensors_received = []
-        for infer_response in infer_responses:
-            # Check if the inference response indicates an error.
-            # vLLM backend uses the CANCELLED error code when a request is cancelled.
-            # TensorRT-LLM backend does not use error codes; instead, it sends the
-            # TRITONSERVER_RESPONSE_COMPLETE_FINAL flag to the iterator.
-            if infer_response.has_error():
-                if infer_response.error().code() == pb_utils.TritonError.CANCELLED:
-                    print("request has been cancelled.")
-                    break
-
-            # Collect the output tensor from the model's response
-            output = pb_utils.get_output_tensor_by_name(
-                infer_response, 'REQUESTED_OUTPUT')
-            response_tensors_received.append(output)
-
-            # Check if we have received enough inference output tensors
-            # and then cancel the response iterator
-            if has_enough_response(response_tensors_received):
-                infer_responses.cancel()
+```python
+import triton_python_backend_utils as pb_utils
+
+class TritonPythonModel:
+    ...
+    async def execute(self, requests):
+        ...
+        bls_response_iterator = await bls_request.async_exec(decoupled=True)
+        ...
+        bls_response_iterator.cancel()
+        ...
 ```
 
 Note: Whether the decoupled model returns a cancellation error and stops executing
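
For reference, here is a minimal sketch of how the abbreviated example above might be filled in, combining the `exec(decoupled=True)` iterator and `cancel()` call from the new text with the CANCELLED error check shown in the text it replaces. The model name, tensor names, and the early-stop threshold are placeholders rather than part of the documented API, and how a cancelled request is surfaced remains backend dependent (vLLM reports the CANCELLED error code; TensorRT-LLM signals completion through the final response flag instead):

```python
import numpy as np
import triton_python_backend_utils as pb_utils


class TritonPythonModel:

    def execute(self, requests):
        responses = []
        for request in requests:
            # 'INPUT', 'model_name', and 'REQUESTED_OUTPUT' are placeholder
            # names; substitute the tensor and model names of the deployment.
            bls_request = pb_utils.InferenceRequest(
                model_name='model_name',
                requested_output_names=['REQUESTED_OUTPUT'],
                inputs=[pb_utils.get_input_tensor_by_name(request, 'INPUT')])

            # A decoupled BLS call returns an iterator over zero or more
            # responses instead of a single response object.
            bls_response_iterator = bls_request.exec(decoupled=True)

            collected = []
            for bls_response in bls_response_iterator:
                if bls_response.has_error():
                    # Depending on the backend, a cancelled request may be
                    # reported through the CANCELLED error code.
                    if bls_response.error().code() == pb_utils.TritonError.CANCELLED:
                        break
                    raise pb_utils.TritonModelException(
                        bls_response.error().message())

                output = pb_utils.get_output_tensor_by_name(
                    bls_response, 'REQUESTED_OUTPUT')
                if output is not None:
                    collected.append(output.as_numpy())

                # Ask the decoupled model to stop early once enough output has
                # arrived; the threshold of 4 responses is arbitrary.
                if len(collected) == 4:
                    bls_response_iterator.cancel()

            # Return whatever was collected before the cancellation took
            # effect; the concatenation assumes the partial outputs share
            # their trailing dimensions.
            output_array = (np.concatenate(collected)
                            if collected else np.zeros(0, dtype=np.float32))
            responses.append(pb_utils.InferenceResponse(
                output_tensors=[pb_utils.Tensor('OUTPUT', output_array)]))
        return responses
```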