@@ -1457,6 +1457,28 @@ message ModelSequenceBatching
14571457 //@@ wait for additional requests for batching. Default is 0.
14581458 //@@
14591459 uint64 max_queue_delay_microseconds = 3 ;
1460+
1461+ //@@ .. cpp:var:: bool preserve_ordering
1462+ //@@
1463+ //@@ Should the dynamic batcher preserve the ordering of responses to
1464+ //@@ match the order of requests received by the scheduler. Default is
1465+ //@@ false. If true, the responses will be returned in the same order
1466+ //@@ that the requests were sent to the scheduler. If false, the responses
1467+ //@@ may be returned in arbitrary order. This option is specifically
1468+ //@@ needed when a sequence of related inference requests (i.e. inference
1469+ //@@ requests with the same correlation ID) are sent to the dynamic
1470+ //@@ batcher to ensure that the sequence responses are in the correct
1471+ //@@ order.
1472+ //@@
1473+ //@@ When using decoupled models, setting this to true may hold back
1474+ //@@ responses from independent sequences until the previous request
1475+ //@@ completes, hurting overall
1476+ //@@ performance. If using GRPC streaming protocol, the stream ordering
1477+ //@@ guarantee may be sufficient alone to ensure the responses for each
1478+ //@@ sequence are returned in sequence-order without blocking based on
1479+ //@@ independent requests, depending on the use case.
1480+ //@@
1481+ bool preserve_ordering = 4 ;
14601482 }
14611483
14621484 //@@ .. cpp:var:: oneof strategy_choice
0 commit comments