[ml service] Support asynchronous output from sub-plugin in ML service extension single API

songgot · songgot · commit dd435c807c1c · 2025-07-25T13:47:25.000+09:00
- Add invoke async callback to receive results from sub-plugins asynchronously.

Signed-off-by: hyunil park <hyunil46.park@samsung.com>
diff --git a/c/include/nnstreamer-tizen-internal.h b/c/include/nnstreamer-tizen-internal.h
@@ -26,6 +26,8 @@ extern "C" {
  */
 int ml_pipeline_construct_internal (const char *pipeline_description, ml_pipeline_state_cb cb, void *user_data, ml_pipeline_h *pipe);
 
+typedef void (*ml_single_invoke_async_cb) (void *handle, void *output, void *user_data); /**< The callback function pointer to be called every time the sub-plugin generates a new output tensor asynchronously. NOTE(review): the callback registered by the ml-service extension is declared with two parameters (handle, output) - confirm that the parameter list here matches what the sub-plugin actually calls. */
+
 /**
  * @brief An information to create single-shot instance.
  */
@@ -39,6 +41,8 @@ typedef struct {
   char *fw_name;                 /**< The explicit framework name given by user */
   int invoke_dynamic;            /**< True for supporting invoke with flexible output. */
   int invoke_async;              /**< The sub-plugin must support asynchronous output to use this option. If set to TRUE, the sub-plugin can generate multiple outputs asynchronously per single input. Otherwise, only synchronous single-output is expected and async callback/handle are ignored. */
+  void *invoke_async_data;     /**< User data to be passed to async callback. */
+  ml_single_invoke_async_cb invoke_async_cb;    /**< Callback function to be called when the sub-plugin generates an output asynchronously. */
 } ml_single_preset;
 
 /**
diff --git a/c/src/ml-api-inference-single.c b/c/src/ml-api-inference-single.c
@@ -1088,6 +1088,20 @@ ml_single_open_custom (ml_single_h * single, ml_single_preset * info)
     g_object_set (filter_obj, "custom", info->custom_option, NULL);
   }
 
+  if (single_h->klass && info->invoke_async) {
+    if (info->invoke_async_cb != NULL && info->invoke_async_data!= NULL) {
+      NNSFilterInvokeAsyncCallback invoke_async_cb =
+          (NNSFilterInvokeAsyncCallback) info->invoke_async_cb;
+      single_h->klass->set_invoke_async_callback (single_h->filter,
+          invoke_async_cb, info->invoke_async_data);
+    } else {
+      _ml_error_report
+        ("The parameters invoke_async_cb and invoke_async_data in the info argument are invalid");
+      status = ML_ERROR_INVALID_PARAMETER;
+      goto error;
+    }
+  }
+
   /* 4. Start the nnfw to get inout configurations if needed */
   if (!single_h->klass->start (single_h->filter)) {
     _ml_error_report
@@ -1235,6 +1249,14 @@ ml_single_open_with_option (ml_single_h * single, const ml_option_h option)
     if (strcasecmp ((gchar *) value, "TRUE") == 0)
       info.invoke_async = TRUE;
   }
+  if (info.invoke_async) {
+    if (ML_ERROR_NONE == ml_option_get (option, "invoke_async_cb", &value)) {
+      info.invoke_async_cb = (ml_single_invoke_async_cb) value;
+    }
+    if (ML_ERROR_NONE == ml_option_get (option, "invoke_async_cb_data", &value)) {
+      info.invoke_async_data = (void *) value;
+    }
+  }
 
   return ml_single_open_custom (single, &info);
 }
diff --git a/c/src/ml-api-service-extension.c b/c/src/ml-api-service-extension.c
@@ -234,6 +234,52 @@ _ml_extension_destroy_tensors_info (void *data)
     ml_tensors_info_destroy (info);
 }
 
+/**
+ * @brief Internal function to invoke model asynchronously. It is called by the sub-plugin with the ml-service handle (registered as the callback data) and a newly generated output.
+ * @note The output is read as a NUL-terminated string and stored in a single uint8 tensor. output->data is released with g_free() once the parameters are validated. NOTE(review): the created tensors data is not destroyed after the new-data event - confirm who owns it.
+ */
+static void
+_ml_extension_invoke_async_callback (void *handle, GstTensorMemory * output)
+{
+  int ret;
+  size_t len;
+  ml_service_s *mls = (ml_service_s *) handle;
+  ml_tensors_info_h info = NULL;
+  ml_tensors_data_h data = NULL;
+  ml_tensor_dimension dimension = { 0 };
+
+  if (!output || !output->data || !mls) {
+    _ml_loge ("Invalid callback parameters.");
+    return;
+  }
+  /** @todo Use the tensor information received from the sub-plugin. This should not be a problem for llama.cpp */
+  dimension[0] = len = strlen ((char *) output->data);
+  ret = ml_tensors_info_create (&info);
+  if (ret != ML_ERROR_NONE) {
+    _ml_loge ("Failed to create tensors info. error: %d", ret);
+    goto done;
+  }
+  ml_tensors_info_set_count (info, 1U);
+  ml_tensors_info_set_tensor_type (info, 0U, ML_TENSOR_TYPE_UINT8);
+  ml_tensors_info_set_tensor_dimension (info, 0U, dimension);
+  ret = ml_tensors_data_create (info, &data);
+  if (ret != ML_ERROR_NONE) {
+    _ml_loge ("Failed to create tensors data. error: %d", ret);
+    goto done;
+  }
+  ret = ml_tensors_data_set_tensor_data (data, 0U, output->data, len);
+  if (ret != ML_ERROR_NONE) {
+    _ml_loge ("Failed to set tensor data. error: %d", ret);
+    ml_tensors_data_destroy (data);
+    goto done;
+  }
+  _ml_service_invoke_event_new_data (mls, NULL, data);
+done:
+  if (info)
+    ml_tensors_info_destroy (info);     /* Still caller-owned after the data is created. */
+  g_free (output->data);
+}
+
 /**
  * @brief Internal function to parse single-shot info from json.
  */
@@ -352,8 +398,14 @@ _ml_extension_conf_parse_single (ml_service_s * mls, JsonObject * single)
     const gchar *invoke_async =
         json_object_get_string_member (single, "invoke_async");
 
-    if (STR_IS_VALID (invoke_async))
+    if (STR_IS_VALID (invoke_async)) {
       ml_option_set (option, "invoke_async", g_strdup (invoke_async), g_free);
+      if (strcasecmp (invoke_async, "TRUE") == 0) {     /* Compared only after validation; strcasecmp() must not get NULL. */
+        ml_option_set (option, "invoke_async_cb",
+            (void *) _ml_extension_invoke_async_callback, NULL);
+        ml_option_set (option, "invoke_async_cb_data", (void *) mls, NULL);
+      }
+    }
   }
 
 error:
diff --git a/tests/capi/unittest_capi_service_extension.cc b/tests/capi/unittest_capi_service_extension.cc
@@ -496,6 +496,31 @@ TEST_REQUIRE_TFLITE (MLServiceExtension, scenarioConfigLlamacpp)
   EXPECT_EQ (status, ML_ERROR_NONE);
 }
 
+/**
+ * @brief Usage of ml-service extension API with the asynchronous llamacpp config. Returns early when the model file does not exist.
+ */
+TEST_REQUIRE_TFLITE (MLServiceExtension, scenarioConfigLlamacppAsync)
+{
+  g_autofree gchar *gguf_path = _get_model_path ("llama-2-7b-chat.Q2_K.gguf");
+
+  if (!g_file_test (gguf_path, G_FILE_TEST_EXISTS)) {
+    g_critical ("Skipping scenarioConfigLlamacppAsync test due to missing model file. "
+                "Please download model file from https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF");
+    return;
+  }
+
+  g_autofree gchar *conf_path = get_config_path ("config_single_llamacpp_async.conf");
+  ml_service_h service;
+  int status;
+
+  status = ml_service_new (conf_path, &service);
+  ASSERT_EQ (status, ML_ERROR_NONE);
+
+  _extension_test_llamacpp (service, FALSE);
+
+  status = ml_service_destroy (service);
+  EXPECT_EQ (status, ML_ERROR_NONE);
+}
 
 /**
  * @brief Usage of ml-service extension API.
diff --git a/tests/test_models/config/config_single_llamacpp_async.conf b/tests/test_models/config/config_single_llamacpp_async.conf
@@ -0,0 +1,10 @@
+{
+    "single" :
+    {
+        "framework" : "llamacpp",
+        "model" : ["../tests/test_models/models/llama-2-7b-chat.Q2_K.gguf"],
+        "custom" : "num_predict:32",
+        "invoke_dynamic" : "true",
+        "invoke_async" : "true"
+    }
+}