
Commit 3ecda56

Add reshape+batching and dynamic batching support for string I/O (#69)
* reshape+batching and dynamic batching support for string I/O
* Address comment
* Address comment
1 parent 663ee99 commit 3ecda56

2 files changed: +33 −53 lines changed


README.md

Lines changed: 0 additions & 2 deletions
@@ -231,5 +231,3 @@ a List of Strings as input(s) / produces a List of String as output(s). For thes
 Triton allows users to pass String input(s)/recieve String output(s) using the String
 datatype. As a limitation of using List instead of Tensor for String I/O, only for
 1-dimensional input(s)/output(s) are supported for I/O of String type.
-Batching is not allowed for PyTorch models with String I/O. For these models,
-the user must specify `max_batch_size: 0` in the configuration.

src/libtorch.cc

Lines changed: 33 additions & 51 deletions
@@ -535,6 +535,9 @@ class ModelInstanceState : public BackendModelInstance {
 
   // If the input to the tensor is a dictionary of tensors.
   bool is_dict_input_;
+
+  // If the model supports batching.
+  bool supports_batching_;
 };
 
 TRITONSERVER_Error*
@@ -607,6 +610,7 @@ ModelInstanceState::ModelInstanceState(
       expected_input_cnt += 1;
     }
   }
+  supports_batching_ = model_state_->MaxBatchSize() > 0;
 
   THROW_IF_BACKEND_INSTANCE_ERROR(ValidateInputs(expected_input_cnt));
   THROW_IF_BACKEND_INSTANCE_ERROR(ValidateOutputs());
@@ -787,7 +791,6 @@ ModelInstanceState::ValidateInputs(const size_t expected_input_cnt)
         "specified.");
   }
 
-  bool supports_batching = model_state_->MaxBatchSize() > 0;
   NamingConvention naming_convention;
   RETURN_IF_ERROR(GetNamingConvention(&naming_convention, allowed_inputs));
 
@@ -837,8 +840,7 @@ ModelInstanceState::ValidateInputs(const size_t expected_input_cnt)
              .c_str());
     }
 
-    // Validate shape for String inputs. Only allow 1 dimension and no
-    // batching.
+    // Validate shape for String inputs. Only allow 1 dimension.
     if (io_dtype == "TYPE_STRING") {
       // If a reshape is provided for the input then use that when
       // validating the model shapes.
@@ -850,7 +852,7 @@ ModelInstanceState::ValidateInputs(const size_t expected_input_cnt)
         RETURN_IF_ERROR(ParseShape(io, "dims", &dims));
       }
 
-      if ((dims.size() > 1) || supports_batching) {
+      if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) {
        return TRITONSERVER_ErrorNew(
            TRITONSERVER_ERROR_INTERNAL,
            ("Triton only supports 1 dimensional List of String as input for "
@@ -880,7 +882,6 @@ ModelInstanceState::ValidateOutputs()
         "specified.");
   }
 
-  const bool supports_batching = model_state_->MaxBatchSize() > 0;
   NamingConvention naming_convention;
   RETURN_IF_ERROR(GetNamingConvention(&naming_convention, {}));
 
@@ -917,8 +918,7 @@ ModelInstanceState::ValidateOutputs()
              .c_str());
     }
 
-    // Validate shape for String outputs. Only allow 1 dimension and no
-    // batching.
+    // Validate shape for String outputs. Only allow 1 dimension.
     if (io_dtype == "TYPE_STRING") {
       // If a reshape is provided for the output then use that when
       // validating the model shapes.
@@ -930,7 +930,7 @@ ModelInstanceState::ValidateOutputs()
         RETURN_IF_ERROR(ParseShape(io, "dims", &dims));
       }
 
-      if ((dims.size() > 1) || supports_batching) {
+      if ((dims.size() + (supports_batching_ ? 1 : 0)) > 1) {
        return TRITONSERVER_ErrorNew(
            TRITONSERVER_ERROR_INTERNAL,
            ("Triton only supports 1 dimensional List of String as output for "
@@ -1015,7 +1015,7 @@ ModelInstanceState::ProcessRequests(
   for (size_t i = 0; i < request_count; i++) {
     if (max_batch_size > 0) {
       // Retrieve the batch size from one of the inputs, if the model
-      // supports batching, the first dimension size is batch size
+      // supports batching, the first dimension size is batch size.
       TRITONBACKEND_Input* input;
       TRITONSERVER_Error* err =
           TRITONBACKEND_RequestInputByIndex(requests[i], 0 /* index */, &input);
@@ -1294,7 +1294,7 @@ ModelInstanceState::Execute(
       if (list_output.elementType()->kind() != c10::TypeKind::StringType) {
         throw std::invalid_argument(
             "output at index " + std::to_string(op_index) +
-            " must be of type Tensor or List[str], recieved List[" +
+            " must be of type Tensor or List[str], received List[" +
             list_output.elementType()->str() + "]");
       }
       output_tensors->push_back(m_op);
@@ -1310,7 +1310,7 @@ ModelInstanceState::Execute(
     auto list_output = model_outputs_.toList();
     if (list_output.elementType()->kind() != c10::TypeKind::StringType) {
       throw std::invalid_argument(
-          "output must be of type Tensor or List[str], recieved List[" +
+          "output must be of type Tensor or List[str], received List[" +
          list_output.elementType()->str() + "]");
     }
     output_tensors->push_back(model_outputs_);
@@ -1505,8 +1505,7 @@ GetContiguousInputContent(
 }
 
 void
-FillStringTensor(
-    torch::List<std::string>* input_list, const size_t idx, const size_t cnt)
+FillStringTensor(torch::List<std::string>* input_list, const size_t cnt)
 {
   for (size_t c = 0; c < cnt; ++c) {
     input_list->push_back("");
@@ -1517,9 +1516,8 @@ bool
 SetStringInputTensor(
     torch::List<std::string>* input_list, TRITONBACKEND_Input* input,
     const char* name, const uint32_t buffer_count,
-    const size_t request_element_cnt, const size_t tensor_offset,
-    TRITONBACKEND_Response** response, cudaStream_t stream,
-    const char* host_policy_name)
+    const size_t request_element_cnt, TRITONBACKEND_Response** response,
+    cudaStream_t stream, const char* host_policy_name)
 {
   bool cuda_copy = false;
   size_t element_idx = 0;
@@ -1537,9 +1535,7 @@ SetStringInputTensor(
         stream, &cuda_copy);
     if (err != nullptr) {
       RESPOND_AND_SET_NULL_IF_ERROR(response, err);
-      FillStringTensor(
-          input_list, tensor_offset + element_idx,
-          request_element_cnt - element_idx);
+      FillStringTensor(input_list, request_element_cnt - element_idx);
       return cuda_copy;
     }
 
@@ -1564,9 +1560,6 @@ SetStringInputTensor(
               std::to_string(element_idx + 1) + " for inference input '" +
               name + "', expecting " + std::to_string(request_element_cnt))
              .c_str()));
-      FillStringTensor(
-          input_list, tensor_offset + element_idx,
-          request_element_cnt - element_idx);
      return cuda_copy;
     }
 
@@ -1585,9 +1578,7 @@ SetStringInputTensor(
               std::to_string(len) + " but only " +
               std::to_string(content_byte_size) + " bytes available")
              .c_str()));
-      FillStringTensor(
-          input_list, tensor_offset + element_idx,
-          request_element_cnt - element_idx);
+      FillStringTensor(input_list, request_element_cnt - element_idx);
      return cuda_copy;
     }
 
@@ -1608,9 +1599,9 @@ SetStringInputTensor(
           " strings for inference input '" + name + "', got " +
           std::to_string(element_idx))
          .c_str()));
-    FillStringTensor(
-        input_list, tensor_offset + element_idx,
-        request_element_cnt - element_idx);
+    if (element_idx < request_element_cnt) {
+      FillStringTensor(input_list, request_element_cnt - element_idx);
+    }
   }
 
   return cuda_copy;
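Note the new guard before the final padding call: both counts are size_t, so computing request_element_cnt - element_idx when more strings were parsed than expected would wrap around rather than go negative. A small standalone illustration of that hazard (not backend code) follows.

#include <cstddef>
#include <iostream>

int main()
{
  const size_t request_element_cnt = 4;  // elements the request should contain
  const size_t element_idx = 6;          // elements actually parsed
  // Unsigned subtraction wraps: this prints 18446744073709551614 on a
  // 64-bit platform, which is why the padding call above is made only
  // when element_idx < request_element_cnt.
  std::cout << request_element_cnt - element_idx << "\n";
  return 0;
}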
@@ -1620,7 +1611,7 @@ bool
 SetStringOutputBuffer(
     torch::List<torch::jit::IValue>* tensor, TRITONBACKEND_Response** response,
     TRITONBACKEND_Output* response_output, const size_t tensor_element_count,
-    const size_t tensor_offset, cudaStream_t stream, std::string* serialized)
+    cudaStream_t stream, std::string* serialized)
 {
   bool cuda_copy = false;
 
@@ -1677,8 +1668,6 @@ ModelInstanceState::SetInputTensors(
     std::vector<torch::jit::IValue>* input_tensors,
     std::vector<BackendMemory*>* input_memories, bool* cuda_copy)
 {
-  const int max_batch_size = model_state_->MaxBatchSize();
-
   // InferenceMode should be used to guard all tensors operations
   torch::InferenceMode infer_guard(model_state_->EnabledInferenceMode());
 
@@ -1705,7 +1694,7 @@ ModelInstanceState::SetInputTensors(
     // The shape for the entire input patch, [total_batch_size, ...]
     std::vector<int64_t> batchn_shape(
         input_shape, input_shape + input_dims_count);
-    if (max_batch_size != 0) {
+    if (supports_batching_) {
       batchn_shape[0] = total_batch_size;
     }
 
@@ -1735,20 +1724,10 @@ ModelInstanceState::SetInputTensors(
 
 
     if (input_datatype == TRITONSERVER_TYPE_BYTES) {
-      if (batchn_shape.size() != 1) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_INTERNAL, ("Triton only supports 1 dimensional "
-                                          "List of string as input for '" +
-                                          std::string(input_name) + "'")
-                                             .c_str());
-      }
-
       // Create the PyTorch list to hold the strings.
       torch::List<std::string> input_list;
       input_list.reserve(batchn_shape[0]);
 
-      size_t tensor_offset = 0;
-
       for (size_t idx = 0; idx < request_count; idx++) {
         TRITONBACKEND_Input* input;
         RESPOND_AND_SET_NULL_IF_ERROR(
@@ -1767,9 +1746,7 @@ ModelInstanceState::SetInputTensors(
 
         *cuda_copy |= SetStringInputTensor(
             &input_list, input, input_name, buffer_count, batch_element_cnt,
-            tensor_offset, &((*responses)[idx]), CudaStream(),
-            HostPolicyName().c_str());
-        tensor_offset += batch_element_cnt;
+            &((*responses)[idx]), CudaStream(), HostPolicyName().c_str());
       }
 
       (*input_tensors)[input_index_map_[input_name]] = input_list;
@@ -1864,18 +1841,25 @@ ModelInstanceState::ReadOutputTensors(
 
     } else if (output_tensors[op_index].isList()) {
       // Custom handling for string/bytes tensor...
-
       torch::List<torch::jit::IValue> output_list =
           output_tensors[op_index].toList();
 
       // Get output shape
       std::vector<int64_t> batchn_shape{(int64_t)output_list.size()};
 
-      size_t tensor_offset = 0;
-
       for (size_t idx = 0; idx < responses->size(); idx++) {
+        auto& request = requests[idx];
         auto& response = (*responses)[idx];
 
+        if (supports_batching_ != 0) {
+          TRITONBACKEND_Input* input;
+          TRITONBACKEND_RequestInputByIndex(request, 0 /* index*/, &input);
+          const int64_t* shape;
+          TRITONBACKEND_InputProperties(
+              input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr);
+          batchn_shape[0] = shape[0];
+        }
+
         const size_t tensor_element_cnt = GetElementCount(batchn_shape);
 
         // Only need an response tensor for requested outputs.
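With batching, the model returns one flattened List[str] covering every request in the execution, so the loop above recovers each request's element count from the batch dimension of that request's first input instead of assuming the whole list belongs to a single request. A rough sketch of the consistency this relies on, using a hypothetical helper and assumed per-request inputs (not backend code):

#include <cstddef>
#include <cstdint>
#include <vector>

// per_request_batch_dim[i] is shape[0] of request i's first input. Assuming
// a 1-D String output, each request contributes exactly that many elements,
// so the per-request slices should cover the flattened output list exactly.
static bool OutputListCoversBatch(
    const std::vector<int64_t>& per_request_batch_dim, size_t output_list_size)
{
  int64_t total = 0;
  for (const int64_t batch_dim : per_request_batch_dim) {
    total += batch_dim;
  }
  return total == static_cast<int64_t>(output_list_size);
}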
@@ -1889,10 +1873,8 @@ ModelInstanceState::ReadOutputTensors(
           string_buffer.emplace_back(new std::string());
           cuda_copy |= SetStringOutputBuffer(
               &output_list, &response, response_output, tensor_element_cnt,
-              tensor_offset, CudaStream(), string_buffer.back().get());
+              CudaStream(), string_buffer.back().get());
         }
-
-        tensor_offset += tensor_element_cnt;
       }
     } else {
       return TRITONSERVER_ErrorNew(
0 commit comments

Comments
 (0)