Commit 0f5023e

add more stats: request/failure/model count on algo/action level (#159)
* add more stats: request/failure/model count on algo/action level
  Signed-off-by: Yaliang Wu <[email protected]>
* fix missing stats when get all stats
  Signed-off-by: Yaliang Wu <[email protected]>
1 parent e55fffa commit 0f5023e

19 files changed, +310 -157 lines changed

plugin/src/main/java/org/opensearch/ml/action/stats/MLStatsNodesRequest.java

Lines changed: 14 additions & 0 deletions
@@ -24,9 +24,14 @@ public class MLStatsNodesRequest extends BaseNodesRequest<MLStatsNodesRequest> {
 
     @Getter
     private Set<String> statsToBeRetrieved;
+    /**
+     * If set this field as true, will retrieve all stats.
+     */
+    private boolean retrieveAllStats = false;
 
     public MLStatsNodesRequest(StreamInput in) throws IOException {
         super(in);
+        retrieveAllStats = in.readBoolean();
         statsToBeRetrieved = in.readSet(StreamInput::readString);
     }
 
@@ -50,6 +55,14 @@ public MLStatsNodesRequest(DiscoveryNode... nodes) {
         statsToBeRetrieved = new HashSet<>();
     }
 
+    public boolean isRetrieveAllStats() {
+        return retrieveAllStats;
+    }
+
+    public void setRetrieveAllStats(boolean retrieveAllStats) {
+        this.retrieveAllStats = retrieveAllStats;
+    }
+
     /**
      * Adds a stat to the set of stats to be retrieved
      *
@@ -82,6 +95,7 @@ public void readFrom(StreamInput in) throws IOException {
     @Override
     public void writeTo(StreamOutput out) throws IOException {
         super.writeTo(out);
+        out.writeBoolean(retrieveAllStats);
         out.writeStringCollection(statsToBeRetrieved);
     }
 }
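A minimal caller-side sketch (hypothetical, not part of this commit) of how the two retrieval modes would be chosen when building a stats request; it assumes the MLStatsNodesRequest(DiscoveryNode...) constructor and addAll(Set<String>) shown in the surrounding context:

import java.util.Set;

import org.opensearch.cluster.node.DiscoveryNode;
import org.opensearch.ml.action.stats.MLStatsNodesRequest;

public class StatsRequestSketch {
    // Build a node-stats request: either ask for everything via the new flag,
    // or fall back to the pre-existing explicit stat-name set.
    MLStatsNodesRequest buildRequest(DiscoveryNode[] nodes, Set<String> requestedStats, boolean allStats) {
        MLStatsNodesRequest request = new MLStatsNodesRequest(nodes);
        if (allStats) {
            // New in this commit: each node returns every stat it knows about,
            // including counters created lazily after this request was built.
            request.setRetrieveAllStats(true);
        } else {
            request.addAll(requestedStats);
        }
        return request;
    }
}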

plugin/src/main/java/org/opensearch/ml/action/stats/MLStatsNodesTransportAction.java

Lines changed: 3 additions & 2 deletions
@@ -90,14 +90,15 @@ protected MLStatsNodeResponse nodeOperation(MLStatsNodeRequest request) {
     private MLStatsNodeResponse createMLStatsNodeResponse(MLStatsNodesRequest mlStatsNodesRequest) {
         Map<String, Object> statValues = new HashMap<>();
         Set<String> statsToBeRetrieved = mlStatsNodesRequest.getStatsToBeRetrieved();
+        boolean retrieveAllStats = mlStatsNodesRequest.isRetrieveAllStats();
 
-        if (statsToBeRetrieved.contains(InternalStatNames.JVM_HEAP_USAGE.getName())) {
+        if (retrieveAllStats || statsToBeRetrieved.contains(InternalStatNames.JVM_HEAP_USAGE.getName())) {
             long heapUsedPercent = jvmService.stats().getMem().getHeapUsedPercent();
             statValues.put(InternalStatNames.JVM_HEAP_USAGE.getName(), heapUsedPercent);
         }
 
         for (String statName : mlStats.getNodeStats().keySet()) {
-            if (statsToBeRetrieved.contains(statName)) {
+            if (retrieveAllStats || statsToBeRetrieved.contains(statName)) {
                 statValues.put(statName, mlStats.getStats().get(statName).getValue());
             }
         }
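The node-side selection rule above reduces to a simple predicate; a tiny standalone sketch (hypothetical helper, not in the commit) of the same logic:

import java.util.Set;

final class StatSelectionSketch {
    // A stat is reported when "all stats" was requested, or when its name was asked for explicitly.
    static boolean shouldInclude(boolean retrieveAllStats, Set<String> statsToBeRetrieved, String statName) {
        return retrieveAllStats || statsToBeRetrieved.contains(statName);
    }
}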

plugin/src/main/java/org/opensearch/ml/plugin/MachineLearningPlugin.java

Lines changed: 6 additions & 5 deletions
@@ -9,6 +9,7 @@
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
 import java.util.function.Supplier;
 
 import org.opensearch.action.ActionRequest;
@@ -89,7 +90,6 @@
 import org.opensearch.watcher.ResourceWatcherService;
 
 import com.google.common.collect.ImmutableList;
-import com.google.common.collect.ImmutableMap;
 
 public class MachineLearningPlugin extends Plugin implements ActionPlugin {
     public static final String TASK_THREAD_POOL = "OPENSEARCH_ML_TASK_THREAD_POOL";
@@ -157,10 +157,11 @@ public Collection<Object> createComponents(
         JvmService jvmService = new JvmService(environment.settings());
         MLCircuitBreakerService mlCircuitBreakerService = new MLCircuitBreakerService(jvmService).init();
 
-        Map<String, MLStat<?>> stats = ImmutableMap
-            .<String, MLStat<?>>builder()
-            .put(StatNames.ML_EXECUTING_TASK_COUNT.getName(), new MLStat<>(false, new CounterSupplier()))
-            .build();
+        Map<String, MLStat<?>> stats = new ConcurrentHashMap<>();
+        stats.put(StatNames.ML_EXECUTING_TASK_COUNT, new MLStat<>(false, new CounterSupplier()));
+        stats.put(StatNames.ML_TOTAL_REQUEST_COUNT, new MLStat<>(false, new CounterSupplier()));
+        stats.put(StatNames.ML_TOTAL_FAILURE_COUNT, new MLStat<>(false, new CounterSupplier()));
+        stats.put(StatNames.ML_TOTAL_MODEL_COUNT, new MLStat<>(false, new CounterSupplier()));
        this.mlStats = new MLStats(stats);
 
         mlIndicesHandler = new MLIndicesHandler(clusterService, client);
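The switch from an ImmutableMap to a ConcurrentHashMap is what lets algorithm/action-level counters be registered lazily and safely from concurrent task threads, while the cluster-level totals stay eagerly registered. A sketch of the same initialization pattern (mirrors the hunk above; assumes the plugin's MLStat, MLStats, StatNames, and CounterSupplier types):

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import org.opensearch.ml.stats.MLStat;
import org.opensearch.ml.stats.MLStats;
import org.opensearch.ml.stats.StatNames;
import org.opensearch.ml.stats.suppliers.CounterSupplier;

class StatsRegistrySketch {
    MLStats buildStats() {
        // Mutable, thread-safe registry: totals are registered up front,
        // per-algorithm/action counters are added later via createCounterStatIfAbsent.
        Map<String, MLStat<?>> stats = new ConcurrentHashMap<>();
        stats.put(StatNames.ML_TOTAL_REQUEST_COUNT, new MLStat<>(false, new CounterSupplier()));
        stats.put(StatNames.ML_TOTAL_FAILURE_COUNT, new MLStat<>(false, new CounterSupplier()));
        return new MLStats(stats);
    }
}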

plugin/src/main/java/org/opensearch/ml/rest/RestStatsMLAction.java

Lines changed: 1 addition & 1 deletion
@@ -78,7 +78,7 @@ MLStatsNodesRequest getRequest(RestRequest request) {
 
         Set<String> validStats = mlStats.getStats().keySet();
         if (isAllStatsRequested(requestedStats)) {
-            mlStatsRequest.addAll(validStats);
+            mlStatsRequest.setRetrieveAllStats(true);
         } else {
             mlStatsRequest.addAll(getStatsToBeRetrieved(request, validStats, requestedStats));
         }
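Using setRetrieveAllStats(true) instead of copying validStats matters because validStats is only a snapshot of the stat keys that exist when the REST request is parsed; per-algorithm counters created later would be missing from an "all stats" response built from that snapshot, which appears to be the "fix missing stats when get all stats" part of the commit message. A hypothetical illustration, assuming the plugin's MLStats type and an example stat key:

import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

import org.opensearch.ml.stats.MLStats;

class SnapshotProblemSketch {
    void demo() {
        MLStats mlStats = new MLStats(new ConcurrentHashMap<>());
        // Snapshot of stat names taken when the request is built (empty here).
        Set<String> snapshot = new HashSet<>(mlStats.getStats().keySet());
        // A counter created afterwards, e.g. by the first predict call for an algorithm
        // ("ml_kmeans_predict_request_count" is a hypothetical key).
        mlStats.createCounterStatIfAbsent("ml_kmeans_predict_request_count").increment();
        // The snapshot does not contain the new key, so an "all stats" request built from it
        // would silently skip the counter; setRetrieveAllStats(true) avoids the snapshot entirely.
        boolean missed = !snapshot.contains("ml_kmeans_predict_request_count"); // true
    }
}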
plugin/src/main/java/org/opensearch/ml/stats/ActionName.java

Lines changed: 12 additions & 0 deletions

@@ -0,0 +1,12 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.ml.stats;
+
+public enum ActionName {
+    TRAIN,
+    PREDICT,
+    TRAIN_PREDICT;
+}

plugin/src/main/java/org/opensearch/ml/stats/MLStats.java

Lines changed: 22 additions & 0 deletions
@@ -7,9 +7,12 @@
 
 import java.util.HashMap;
 import java.util.Map;
+import java.util.function.Supplier;
 
 import lombok.Getter;
 
+import org.opensearch.ml.stats.suppliers.CounterSupplier;
+
 /**
  * This class is the main entry-point for access to the stats that the ML plugin keeps track of.
  */
@@ -40,6 +43,25 @@ public MLStat<?> getStat(String key) throws IllegalArgumentException {
         return stats.get(key);
     }
 
+    /**
+     * Get stat or create counter stat if absent.
+     * @param key stat key
+     * @return existing MLStat or new MLStat
+     */
+    public MLStat<?> createCounterStatIfAbsent(String key) {
+        return createStatIfAbsent(key, () -> new MLStat<>(false, new CounterSupplier()));
+    }
+
+    /**
+     * Get stat or create if absent.
+     * @param key stat key
+     * @param supplier supplier to create MLStat
+     * @return existing MLStat or new MLStat
+     */
+    public synchronized MLStat<?> createStatIfAbsent(String key, Supplier<MLStat> supplier) {
+        return stats.computeIfAbsent(key, k -> supplier.get());
+    }
+
     /**
      * Get a map of the stats that are kept at the node level
      *
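A small usage sketch for the new helper (hypothetical; mirrors how MLPredictTaskRunner uses it below): repeated calls with the same key return the same counter, so increments accumulate rather than creating a fresh stat each time.

import java.util.concurrent.ConcurrentHashMap;

import org.opensearch.ml.stats.MLStat;
import org.opensearch.ml.stats.MLStats;

class CreateIfAbsentSketch {
    void demo() {
        MLStats mlStats = new MLStats(new ConcurrentHashMap<>());
        // First call creates the counter, the second call returns the existing one.
        MLStat<?> counter = mlStats.createCounterStatIfAbsent("ml_kmeans_train_request_count");
        counter.increment();
        mlStats.createCounterStatIfAbsent("ml_kmeans_train_request_count").increment();
        // Both increments landed on the same underlying counter.
        Object value = mlStats.getStat("ml_kmeans_train_request_count").getValue();
    }
}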

plugin/src/main/java/org/opensearch/ml/stats/StatNames.java

Lines changed: 18 additions & 21 deletions
@@ -5,35 +5,32 @@
 
 package org.opensearch.ml.stats;
 
-import java.util.HashSet;
-import java.util.Set;
+import java.util.Locale;
 
-import lombok.Getter;
+import org.opensearch.ml.common.parameter.FunctionName;
 
 /**
  * Enum containing names of all stats
  */
-public enum StatNames {
-    ML_EXECUTING_TASK_COUNT("ml_executing_task_count");
+public class StatNames {
+    public static String ML_EXECUTING_TASK_COUNT = "ml_executing_task_count";
+    public static String ML_TOTAL_REQUEST_COUNT = "ml_total_request_count";
+    public static String ML_TOTAL_FAILURE_COUNT = "ml_total_failure_count";
+    public static String ML_TOTAL_MODEL_COUNT = "ml_total_model_count";
+
+    public static String requestCountStat(FunctionName functionName, ActionName actionName) {
+        return String.format("ml_%s_%s_request_count", functionName, actionName, Locale.ROOT).toLowerCase(Locale.ROOT);
+    }
 
-    @Getter
-    private String name;
+    public static String failureCountStat(FunctionName functionName, ActionName actionName) {
+        return String.format("ml_%s_%s_failure_count", functionName, actionName, Locale.ROOT).toLowerCase(Locale.ROOT);
+    }
 
-    StatNames(String name) {
-        this.name = name;
+    public static String executingRequestCountStat(FunctionName functionName, ActionName actionName) {
+        return String.format("ml_%s_%s_executing_request_count", functionName, actionName, Locale.ROOT).toLowerCase(Locale.ROOT);
     }
 
-    /**
-     * Get set of stat names
-     *
-     * @return set of stat names
-     */
-    public static Set<String> getNames() {
-        Set<String> names = new HashSet<>();
-
-        for (StatNames statName : StatNames.values()) {
-            names.add(statName.getName());
-        }
-        return names;
+    public static String modelCountStat(FunctionName functionName) {
+        return String.format("ml_%s_model_count", functionName, Locale.ROOT).toLowerCase(Locale.ROOT);
     }
 }
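For reference, the stat keys these helpers produce (hypothetical example, assuming FunctionName defines a KMEANS value; the format strings come from the class above):

import org.opensearch.ml.common.parameter.FunctionName;
import org.opensearch.ml.stats.ActionName;
import org.opensearch.ml.stats.StatNames;

class StatNameSketch {
    void demo() {
        // Enum names are lowercased into the key, e.g. KMEANS/PREDICT below.
        String requestCount = StatNames.requestCountStat(FunctionName.KMEANS, ActionName.PREDICT);
        // -> "ml_kmeans_predict_request_count"
        String failureCount = StatNames.failureCountStat(FunctionName.KMEANS, ActionName.PREDICT);
        // -> "ml_kmeans_predict_failure_count"
        String modelCount = StatNames.modelCountStat(FunctionName.KMEANS);
        // -> "ml_kmeans_model_count"
    }
}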

plugin/src/main/java/org/opensearch/ml/task/MLPredictTaskRunner.java

Lines changed: 64 additions & 71 deletions
@@ -10,6 +10,10 @@
 import static org.opensearch.ml.permission.AccessController.getUserContext;
 import static org.opensearch.ml.plugin.MachineLearningPlugin.TASK_THREAD_POOL;
 import static org.opensearch.ml.stats.StatNames.ML_EXECUTING_TASK_COUNT;
+import static org.opensearch.ml.stats.StatNames.ML_TOTAL_FAILURE_COUNT;
+import static org.opensearch.ml.stats.StatNames.ML_TOTAL_REQUEST_COUNT;
+import static org.opensearch.ml.stats.StatNames.failureCountStat;
+import static org.opensearch.ml.stats.StatNames.requestCountStat;
 
 import java.time.Instant;
 import java.util.Base64;
@@ -46,6 +50,7 @@
 import org.opensearch.ml.common.transport.prediction.MLPredictionTaskRequest;
 import org.opensearch.ml.engine.MLEngine;
 import org.opensearch.ml.indices.MLInputDatasetHandler;
+import org.opensearch.ml.stats.ActionName;
 import org.opensearch.ml.stats.MLStats;
 import org.opensearch.threadpool.ThreadPool;
 import org.opensearch.transport.TransportService;
@@ -146,86 +151,74 @@ private void predict(
     ) {
         ActionListener<MLTaskResponse> internalListener = wrappedCleanupListener(listener, mlTask.getTaskId());
         // track ML task count and add ML task into cache
-        mlStats.getStat(ML_EXECUTING_TASK_COUNT.getName()).increment();
+        mlStats.getStat(ML_EXECUTING_TASK_COUNT).increment();
+        mlStats.getStat(ML_TOTAL_REQUEST_COUNT).increment();
+        mlStats.createCounterStatIfAbsent(requestCountStat(mlTask.getFunctionName(), ActionName.PREDICT)).increment();
         mlTaskManager.add(mlTask);
-        MLInput mlInput = request.getMlInput();
 
         // run predict
-        try {
+        if (request.getModelId() != null) {
             // search model by model id.
-            Model model = new Model();
-            if (request.getModelId() != null) {
+            try (ThreadContext.StoredContext context = threadPool.getThreadContext().stashContext()) {
+                MLInput mlInput = request.getMlInput();
+                ActionListener<GetResponse> getResponseListener = ActionListener.wrap(r -> {
+                    if (r == null || !r.isExists()) {
+                        internalListener.onFailure(new ResourceNotFoundException("No model found, please check the modelId."));
+                        return;
+                    }
+                    Map<String, Object> source = r.getSourceAsMap();
+                    User requestUser = getUserContext(client);
+                    User resourceUser = User.parse((String) source.get(USER));
+                    if (!checkUserPermissions(requestUser, resourceUser, request.getModelId())) {
+                        // The backend roles of request user and resource user doesn't have intersection
+                        OpenSearchException e = new OpenSearchException(
+                            "User: " + requestUser.getName() + " does not have permissions to run predict by model: " + request.getModelId()
+                        );
+                        log.debug(e);
+                        handlePredictFailure(mlTask, internalListener, e, false);
+                        return;
+                    }
+
+                    Model model = new Model();
+                    model.setName((String) source.get(MLModel.MODEL_NAME));
+                    model.setVersion((Integer) source.get(MLModel.MODEL_VERSION));
+                    byte[] decoded = Base64.getDecoder().decode((String) source.get(MLModel.MODEL_CONTENT));
+                    model.setContent(decoded);
+
+                    // run predict
+                    mlTaskManager.updateTaskState(mlTask.getTaskId(), MLTaskState.RUNNING, mlTask.isAsync());
+                    MLOutput output = MLEngine
+                        .predict(mlInput.toBuilder().inputDataset(new DataFrameInputDataset(inputDataFrame)).build(), model);
+                    if (output instanceof MLPredictionOutput) {
+                        ((MLPredictionOutput) output).setStatus(MLTaskState.COMPLETED.name());
+                    }
+
+                    // Once prediction complete, reduce ML_EXECUTING_TASK_COUNT and update task state
+                    handleAsyncMLTaskComplete(mlTask);
+                    MLTaskResponse response = MLTaskResponse.builder().output(output).build();
+                    internalListener.onResponse(response);
+                }, e -> {
+                    log.error("Failed to predict " + mlInput.getAlgorithm() + ", modelId: " + mlTask.getModelId(), e);
+                    handlePredictFailure(mlTask, internalListener, e, true);
+                });
                 GetRequest getRequest = new GetRequest(ML_MODEL_INDEX, mlTask.getModelId());
-                try (ThreadContext.StoredContext context = threadPool.getThreadContext().stashContext()) {
-                    ActionListener<GetResponse> getResponseListener = ActionListener.wrap(r -> {
-                        if (r == null || !r.isExists()) {
-                            internalListener.onFailure(new ResourceNotFoundException("No model found, please check the modelId."));
-                            return;
-                        }
-                        Map<String, Object> source = r.getSourceAsMap();
-                        User requestUser = getUserContext(client);
-                        User resourceUser = User.parse((String) source.get(USER));
-                        if (!checkUserPermissions(requestUser, resourceUser, request.getModelId())) {
-                            // The backend roles of request user and resource user doesn't have intersection
-                            OpenSearchException e = new OpenSearchException(
-                                "User: "
-                                    + requestUser.getName()
-                                    + " does not have permissions to run predict by model: "
-                                    + request.getModelId()
-                            );
-                            log.debug(e);
-                            handlePredictFailure(mlTask, internalListener, e);
-                            return;
-                        }
-
-                        model.setName((String) source.get(MLModel.MODEL_NAME));
-                        model.setVersion((Integer) source.get(MLModel.MODEL_VERSION));
-                        byte[] decoded = Base64.getDecoder().decode((String) source.get(MLModel.MODEL_CONTENT));
-                        model.setContent(decoded);
-
-                        // run predict
-                        MLOutput output;
-                        try {
-                            mlTaskManager.updateTaskState(mlTask.getTaskId(), MLTaskState.RUNNING, mlTask.isAsync());
-                            output = MLEngine
-                                .predict(mlInput.toBuilder().inputDataset(new DataFrameInputDataset(inputDataFrame)).build(), model);
-                            if (output instanceof MLPredictionOutput) {
-                                ((MLPredictionOutput) output).setStatus(MLTaskState.COMPLETED.name());
-                            }
-
-                            // Once prediction complete, reduce ML_EXECUTING_TASK_COUNT and update task state
-                            handleAsyncMLTaskComplete(mlTask);
-                        } catch (Exception e) {
-                            // todo need to specify what exception
-                            log.error("Failed to predict " + mlInput.getAlgorithm() + ", modelId: " + model.getName(), e);
-                            handlePredictFailure(mlTask, internalListener, e);
-                            return;
-                        }
-
-                        MLTaskResponse response = MLTaskResponse.builder().output(output).build();
-                        internalListener.onResponse(response);
-                    }, e -> {
-                        log.error("Failed to predict model " + mlTask.getModelId(), e);
-                        internalListener.onFailure(e);
-                    });
-                    client.get(getRequest, ActionListener.runBefore(getResponseListener, () -> context.restore()));
-                } catch (Exception e) {
-                    log.error("Failed to get model " + mlTask.getModelId(), e);
-                    internalListener.onFailure(e);
-                }
-            } else {
-                IllegalArgumentException e = new IllegalArgumentException("ModelId is invalid");
-                log.error("ModelId is invalid", e);
-                handlePredictFailure(mlTask, internalListener, e);
-                return;
+                client.get(getRequest, ActionListener.runBefore(getResponseListener, () -> context.restore()));
+            } catch (Exception e) {
+                log.error("Failed to get model " + mlTask.getModelId(), e);
+                handlePredictFailure(mlTask, internalListener, e, true);
             }
-        } catch (Exception e) {
-            log.error("Failed to predict " + mlInput.getAlgorithm(), e);
-            internalListener.onFailure(e);
+        } else {
+            IllegalArgumentException e = new IllegalArgumentException("ModelId is invalid");
+            log.error("ModelId is invalid", e);
+            handlePredictFailure(mlTask, internalListener, e, false);
         }
     }
 
-    private void handlePredictFailure(MLTask mlTask, ActionListener<MLTaskResponse> listener, Exception e) {
+    private void handlePredictFailure(MLTask mlTask, ActionListener<MLTaskResponse> listener, Exception e, boolean trackFailure) {
+        if (trackFailure) {
+            mlStats.createCounterStatIfAbsent(failureCountStat(mlTask.getFunctionName(), ActionName.PREDICT)).increment();
+            mlStats.getStat(ML_TOTAL_FAILURE_COUNT).increment();
+        }
         handleAsyncMLTaskFailure(mlTask, e);
        listener.onFailure(e);
     }
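The new trackFailure flag makes the failure counters selective: total and per-algorithm request counters are bumped once, up front, for every predict call, while failure counters are bumped only when trackFailure is true, so permission rejections and invalid model IDs are not counted against the algorithm. A standalone sketch of that counting pattern (hypothetical helper; assumes the plugin's MLStats, StatNames, ActionName, and FunctionName types):

import static org.opensearch.ml.stats.StatNames.ML_TOTAL_FAILURE_COUNT;
import static org.opensearch.ml.stats.StatNames.ML_TOTAL_REQUEST_COUNT;
import static org.opensearch.ml.stats.StatNames.failureCountStat;
import static org.opensearch.ml.stats.StatNames.requestCountStat;

import org.opensearch.ml.common.parameter.FunctionName;
import org.opensearch.ml.stats.ActionName;
import org.opensearch.ml.stats.MLStats;

class PredictStatsSketch {
    private final MLStats mlStats;

    PredictStatsSketch(MLStats mlStats) {
        this.mlStats = mlStats;
    }

    // Called once per predict request, regardless of outcome.
    void onPredictRequest(FunctionName functionName) {
        mlStats.getStat(ML_TOTAL_REQUEST_COUNT).increment();
        mlStats.createCounterStatIfAbsent(requestCountStat(functionName, ActionName.PREDICT)).increment();
    }

    // Called on failure; trackFailure is false for permission or invalid-modelId rejections.
    void onPredictFailure(FunctionName functionName, boolean trackFailure) {
        if (trackFailure) {
            mlStats.createCounterStatIfAbsent(failureCountStat(functionName, ActionName.PREDICT)).increment();
            mlStats.getStat(ML_TOTAL_FAILURE_COUNT).increment();
        }
    }
}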

plugin/src/main/java/org/opensearch/ml/task/MLTaskDispatcher.java

Lines changed: 4 additions & 4 deletions
@@ -56,7 +56,7 @@ public void dispatchTask(ActionListener<DiscoveryNode> listener) {
         // DiscoveryNode[] mlNodes = getEligibleMLNodes();
         DiscoveryNode[] mlNodes = getEligibleDataNodes();
         MLStatsNodesRequest MLStatsNodesRequest = new MLStatsNodesRequest(mlNodes);
-        MLStatsNodesRequest.addAll(ImmutableSet.of(ML_EXECUTING_TASK_COUNT.getName(), JVM_HEAP_USAGE.getName()));
+        MLStatsNodesRequest.addAll(ImmutableSet.of(ML_EXECUTING_TASK_COUNT, JVM_HEAP_USAGE.getName()));
 
         client.execute(MLStatsNodesAction.INSTANCE, MLStatsNodesRequest, ActionListener.wrap(mlStatsResponse -> {
             // Check JVM pressure
@@ -78,7 +78,7 @@ public void dispatchTask(ActionListener<DiscoveryNode> listener) {
         // Check # of executing ML task
         candidateNodeResponse = candidateNodeResponse
             .stream()
-            .filter(stat -> (Long) stat.getStatsMap().get(ML_EXECUTING_TASK_COUNT.getName()) < maxMLBatchTaskPerNode)
+            .filter(stat -> (Long) stat.getStatsMap().get(ML_EXECUTING_TASK_COUNT) < maxMLBatchTaskPerNode)
             .collect(Collectors.toList());
         if (candidateNodeResponse.size() == 0) {
             String errorMessage = "All nodes' executing ML task count reach limitation.";
@@ -91,8 +91,8 @@ public void dispatchTask(ActionListener<DiscoveryNode> listener) {
         Optional<MLStatsNodeResponse> targetNode = candidateNodeResponse
             .stream()
             .sorted((MLStatsNodeResponse r1, MLStatsNodeResponse r2) -> {
-                int result = ((Long) r1.getStatsMap().get(ML_EXECUTING_TASK_COUNT.getName()))
-                    .compareTo((Long) r2.getStatsMap().get(ML_EXECUTING_TASK_COUNT.getName()));
+                int result = ((Long) r1.getStatsMap().get(ML_EXECUTING_TASK_COUNT))
+                    .compareTo((Long) r2.getStatsMap().get(ML_EXECUTING_TASK_COUNT));
                 if (result == 0) {
                     // if multiple nodes have same running task count, choose the one with least
                     // JVM heap usage.
