
Commit 9567ebe

Merge pull request #401 from sunya-ch/server-api-rebase-patch-2
feat: add compute similarity and loose selection
2 parents 5f75c6f + ffdaee8 commit 9567ebe

12 files changed: +470 −24 lines changed

.github/workflows/unit-test.yml

Lines changed: 6 additions & 0 deletions
@@ -38,3 +38,9 @@ jobs:
       timeout-minutes: 5
     - name: Test offline trainer
       run: make test-offline-trainer
+    - name: Test model server select
+      run: make test-model-server-select
+      timeout-minutes: 5
+    - name: Test model server select via estimator
+      run: make test-model-server-estimator-select
+      timeout-minutes: 5

.gitignore

Lines changed: 2 additions & 0 deletions
@@ -150,3 +150,5 @@ model_training/*data*
 model_training/tekton/secret
 local-dev-cluster
 tmp
+tests/db-models
+db-models

Makefile

Lines changed: 36 additions & 1 deletion
@@ -119,12 +119,47 @@ test-offline-trainer: \
 	run-offline-trainer-client \
 	clean-offline-trainer
 
+# test model server select
+create-container-net:
+	@$(CTR_CMD) network create kepler-model-server-test
+
+run-model-server-with-db:
+	$(CTR_CMD) run -d --platform linux/amd64 \
+		--network kepler-model-server-test \
+		-p 8100:8100 \
+		--name model-server $(TEST_IMAGE) \
+		model-server
+	while ! docker logs model-server 2>&1 | grep -q 'Running on all'; do \
+		echo "... waiting for model-server to serve"; sleep 5; \
+	done
+
+run-estimator-with-model-server:
+	$(CTR_CMD) run -d --platform linux/amd64 \
+		--network kepler-model-server-test \
+		-e "MODEL_SERVER_ENABLE=true" \
+		-e "MODEL_SERVER_URL=http://model-server:8100" \
+		--name estimator $(TEST_IMAGE) \
+		/bin/bash -c "estimator --machine-spec tests/data/machine/spec.json"
+
+clean-container-net:
+	@$(CTR_CMD) network rm kepler-model-server-test
+
+run-select-client:
+	$(CTR_CMD) exec model-server \
+		hatch run test -vvv -s ./tests/model_select_test.py
+
+test-model-server-select: create-container-net run-model-server-with-db run-select-client clean-model-server clean-container-net
+
+test-model-server-estimator-select: create-container-net run-model-server-with-db run-estimator-with-model-server run-collector-client clean-estimator clean-model-server clean-container-net
+
 test: \
 	build-test \
 	test-pipeline \
 	test-estimator \
 	test-model-server \
-	test-offline-trainer
+	test-offline-trainer \
+	test-model-server-select \
+	test-model-server-estimator-select
 
 # set image
 set-image:
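Note: for local debugging outside the Makefile, the readiness wait that run-model-server-with-db performs with `docker logs ... | grep 'Running on all'` can be approximated in Python by polling the published port (8100, from the -p mapping above). This is an illustrative sketch only; the helper name and timeout values are not part of this commit.

import socket
import time


def wait_for_model_server(host: str = "localhost", port: int = 8100, timeout_s: int = 120) -> bool:
    # Poll the port published by run-model-server-with-db until it accepts
    # connections, mirroring the Makefile's wait loop (hypothetical helper).
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            with socket.create_connection((host, port), timeout=1):
                return True
        except OSError:
            print("... waiting for model-server to serve")
            time.sleep(5)
    return False


if __name__ == "__main__":
    assert wait_for_model_server(), "model-server did not become ready in time"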

src/kepler_model/server/model_server.py

Lines changed: 41 additions & 6 deletions
@@ -23,9 +23,11 @@
     CHECKPOINT_FOLDERNAME,
     METADATA_FILENAME,
     any_node_type,
+    default_pipelines,
     get_archived_file,
     get_largest_candidates,
     get_model_group_path,
+    get_node_type_from_name,
     get_pipeline_path,
     is_matched_type,
     is_valid_model,
@@ -51,11 +53,11 @@


 class ModelRequest:
-    def __init__(self, metrics, output_type, source="rapl-sysfs", node_type=-1, weight=False, trainer_name="", filter="", pipeline_name="", spec=None):
+    def __init__(self, metrics, output_type, source="rapl-sysfs", node_type=-1, weight=False, trainer_name="", filter="", pipeline_name="", spec=None, loose_node_type=True):
         # target source of power metric to be predicted (e.g., rapl-sysfs, acpi)
         self.source = convert_enery_source(source)
         # type of node to select a model learned from similar nodes (default: -1, applied universal model learned by all node_type (TODO))
-        self.node_type = node_type
+        self.node_type = int(node_type) if node_type or node_type == 0 else -1
         # list of available resource usage metrics to find applicable models (using a valid feature group that can be obtained from the list)
         self.metrics = metrics
         # specific trainer name (default: empty, selecting any of the best trainer)
@@ -72,6 +74,7 @@ def __init__(self, metrics, output_type, source="rapl-sysfs", node_type=-1, weig
         self.spec = NodeTypeSpec()
         if spec is not None:
             self.spec = NodeTypeSpec(**spec)
+        self.loose_node_type = loose_node_type

 # ModelListParams defines parameters for /best-models API
 class ModelListParam(enum.Enum):
@@ -102,10 +105,27 @@ class ModelListParam(enum.Enum):
     """


-def select_best_model(spec, valid_group_path: str, filters: dict, energy_source: str, pipeline_name: str="", trainer_name: str="", node_type: int=any_node_type, weight: bool=False):
-    model_names = [f for f in os.listdir(valid_group_path) if f != CHECKPOINT_FOLDERNAME and not os.path.isfile(os.path.join(valid_group_path, f)) and (trainer_name == "" or trainer_name in f)]
+def select_best_model(spec, valid_group_path: str, filters: dict, energy_source: str, pipeline_name: str="", trainer_name: str="", node_type: int=any_node_type, weight: bool=False, loose_node_type: bool=True):
+    # Set default pipeline if not specified
+    if pipeline_name == "" and energy_source in default_pipelines:
+        pipeline_name = default_pipelines[energy_source]
+
+    # Find initial model list filtered by trainer
+    initial_model_names = [f for f in os.listdir(valid_group_path) if f != CHECKPOINT_FOLDERNAME and not os.path.isfile(os.path.join(valid_group_path, f)) and os.path.exists(os.path.join(valid_group_path, f, METADATA_FILENAME + ".json")) and (trainer_name == "" or trainer_name in f)]
+    if node_type != any_node_type:
+        model_names = [name for name in initial_model_names if f"_{node_type}" in name]
+        if len(model_names) == 0:
+            if not loose_node_type:
+                return None, None
+            logger.warning(f"{valid_group_path} has no matched model for node type={node_type}, try all available models")
+            model_names = initial_model_names
+    else:
+        model_names = initial_model_names
+
+    # Filter weight models
     if weight:
         model_names = [name for name in model_names if name.split("_")[0] in weight_support_trainers]
+
     # Load metadata of trainers
     best_cadidate = None
     best_response = None
@@ -167,18 +187,33 @@ def get_model():
     output_type = ModelOutputType[req.output_type]
     best_model = None
     best_response = None
+    best_uncertainty = None
+    best_looseness = None
     # find best model comparing best candidate from each valid feature group complied with filtering conditions
     for fg in valid_fgs:
+        pipeline_name = pipelineName[energy_source]
         valid_group_path = get_model_group_path(model_toppath, output_type, fg, energy_source, pipeline_name=pipelineName[energy_source])
+        node_type = req.node_type
+        if req.node_type == any_node_type and req.spec is not None and not req.spec.is_none() and pipeline_name in nodeCollection:
+            node_type, uncertainty, looseness = nodeCollection[pipeline_name].get_node_type(req.spec, loose_search=True)
+        else:
+            uncertainty = 0
+            looseness = 0
         if os.path.exists(valid_group_path):
-            best_candidate, response = select_best_model(req.spec, valid_group_path, filters, energy_source, req.pipeline_name, req.trainer_name, req.node_type, req.weight)
+            best_candidate, response = select_best_model(req.spec, valid_group_path, filters, energy_source, req.pipeline_name, req.trainer_name, node_type, req.weight, loose_node_type=req.loose_node_type)
             if best_candidate is None:
                 continue
+            if node_type != any_node_type and best_model is not None and get_node_type_from_name(best_model['model_name']) == node_type:
+                if get_node_type_from_name(best_candidate['model_name']) != node_type:
+                    continue
            if best_model is None or best_model[ERROR_KEY] > best_candidate[ERROR_KEY]:
                 best_model = best_candidate
                 best_response = response
+                best_uncertainty = uncertainty
+                best_looseness = looseness
     if best_model is None:
         return make_response(f"cannot find model for {model_request} at the moment", 400)
+    logger.info(f"response: model {best_model['model_name']} by {best_model['features']} with {ERROR_KEY}={best_model[ERROR_KEY]} selected with uncertainty={best_uncertainty}, looseness={best_looseness}")
     if req.weight:
         try:
             response = app.response_class(response=json.dumps(best_response), status=200, mimetype="application/json")
@@ -234,7 +269,7 @@ def get_available_models():
             logger.debug(f"Searching feature group {fg}")
             valid_group_path = get_model_group_path(model_toppath, output_type, fg, energy_source, pipeline_name=pipelineName[energy_source])
             if os.path.exists(valid_group_path):
-                best_candidate, _ = select_best_model(None, valid_group_path, filters, energy_source, node_type=node_type)
+                best_candidate, _ = select_best_model(None, valid_group_path, filters, energy_source, node_type=node_type, loose_node_type=False)
                 if best_candidate is None:
                     continue
                 model_names[output_type.name][fg.name] = best_candidate["model_name"]
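Note: from a client's perspective, the new fields flow through the model request roughly as sketched below. This is a hedged example: the /model route, the spec key names, and the metric name are assumptions for illustration; only node_type, spec, and loose_node_type come from this diff, and port 8100 comes from the Makefile changes above.

import requests

# Hypothetical request payload mirroring ModelRequest; field values are placeholders.
payload = {
    "metrics": ["bpf_cpu_time_ms"],   # placeholder metric list
    "output_type": "AbsPower",
    "source": "rapl-sysfs",
    "node_type": -1,                  # any_node_type: let the server match via spec
    "loose_node_type": True,          # new flag: allow fallback when no exact node-type model exists
    "spec": {                         # assumed NodeTypeSpec-style keys
        "processor": "intel_xeon_platinum_8259cl",
        "cores": 8,
        "chips": 1,
        "memory": 32,
        "frequency": 2500,
    },
}

resp = requests.post("http://localhost:8100/model", json=payload, timeout=30)
print(resp.status_code)

With loose_node_type=True (the default), select_best_model falls back to all available models when none match the inferred node type; with loose_node_type=False it returns no candidate for that group, and the server answers 400 if no feature group yields a model.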

src/kepler_model/train/profiler/node_type_index.py

Lines changed: 120 additions & 15 deletions
@@ -5,7 +5,6 @@
 # node_type = index_collection.index_train_machine(machine_id, new_spec)
 # index_collection.save()

-import enum
 import logging
 import os
 import re
@@ -17,6 +16,8 @@

 from kepler_model.util.loader import load_json, load_node_type_index
 from kepler_model.util.saver import save_machine_spec, save_node_type_index
+from kepler_model.util.similarity import compute_jaccard_similarity, compute_looseness, compute_similarity, compute_uncertainty, find_best_candidate, get_candidate_score, get_num_of_none, get_similarity_weight
+from kepler_model.util.train_types import NodeAttribute

 logger = logging.getLogger(__name__)

@@ -100,13 +101,6 @@ def get_machine_spec(cmd_machine_spec_file: str):
         return spec
     return discover_spec_values()

-class NodeAttribute(str, enum.Enum):
-    PROCESSOR = "processor"
-    CORES = "cores"
-    CHIPS = "chips"
-    MEMORY = "memory"
-    FREQ = "frequency"
-
 def load_node_type_spec(node_type_index_json):
     node_type_spec_index = dict()
     if node_type_index_json is not None:
@@ -181,6 +175,34 @@ def cover(self, compare_spec):
                 return False
         return True

+    def get_uncertain_attribute_freq(self, compare_spec):
+        uncertain_attribute_freq = dict()
+        if not self.cover(compare_spec):
+            # not covered
+            return None
+        size = self.get_size()
+        for attr in NodeAttribute:
+            if compare_spec.attrs[attr] is None:
+                uncertain_attribute_freq[attr] = size
+        return uncertain_attribute_freq
+
+    def get_similarity(self, compare_spec, debug=False):
+        total_similarity = 0
+        for attr in NodeAttribute:
+            similarity = 0
+            # compare similar string
+            if compare_spec.attrs[attr] is not None and attr in [NodeAttribute.PROCESSOR]:
+                similarity = compute_jaccard_similarity(self.attrs[attr], compare_spec.attrs[attr])
+            # compare number
+            elif compare_spec.attrs[attr] is not None:
+                similarity = compute_similarity(self.attrs[attr], compare_spec.attrs[attr])
+            if debug:
+                print(attr, self.attrs[attr], compare_spec.attrs[attr], similarity, get_similarity_weight(attr))
+            total_similarity += (similarity*get_similarity_weight(attr))
+        if total_similarity > 1:
+            total_similarity = 1
+        return total_similarity
+
     def __str__(self):
         out_str = ""
         for attr in NodeAttribute:
@@ -218,7 +240,7 @@ def index_train_machine(self, machine_id, new_spec):
         if not new_spec.complete_info():
             print("Machine info not completed: ", str(new_spec))
             return -1
-        covered_index = self.get_node_type(new_spec)
+        covered_index, _, _ = self.get_node_type(new_spec)
         if covered_index == -1:
             covered_index = 0
             if len(self.node_type_index.keys()) > 0:
@@ -227,13 +249,31 @@ def index_train_machine(self, machine_id, new_spec):
         self.node_type_index[covered_index].add_member(machine_id)
         return covered_index

-    def get_node_type(self, compare_spec):
+    def get_node_type(self, in_spec: NodeTypeSpec, loose_search: bool=False):
         if len(self.node_type_index) == 0:
-            return -1
-        for index, node_type_spec in self.node_type_index.items():
-            if node_type_spec.cover(compare_spec):
-                return index
-        return -1
+            return -1, -1, -1
+        compare_spec = in_spec.copy()
+        num_of_none = get_num_of_none(compare_spec)
+        similarity_map, max_similarity, most_similar_index, has_candidate, candidate_uncertain_attribute_freq, candidate_uncertain_attribute_total = self._find_candidates(in_spec, loose_search)
+        if max_similarity == 1:
+            return most_similar_index, 0, 0
+        if has_candidate:
+            # covered
+            candidate_score = get_candidate_score(candidate_uncertain_attribute_freq, candidate_uncertain_attribute_total)
+            best_candidate_index, max_score = find_best_candidate(candidate_score)
+            uncertainty = compute_uncertainty(max_score, num_of_none)
+            return best_candidate_index, uncertainty, 0
+        elif loose_search:
+            if most_similar_index != -1:
+                candidate_uncertain_attribute_freq, candidate_uncertain_attribute_total, num_of_none = self._loose_search(compare_spec, similarity_map, max_similarity, most_similar_index, candidate_uncertain_attribute_freq, candidate_uncertain_attribute_total)
+                candidate_score = get_candidate_score(candidate_uncertain_attribute_freq, candidate_uncertain_attribute_total)
+                logger.debug(f"candidate score: {candidate_score}")
+                most_similar_score = candidate_score[most_similar_index]
+                uncertainty = compute_uncertainty(most_similar_score, num_of_none)
+                if max_similarity != -1:
+                    looseness = compute_looseness(max_similarity)
+                    return most_similar_index, uncertainty, looseness
+        return -1, -1, -1

     def get_json(self):
         json_obj = dict()
@@ -251,3 +291,68 @@ def copy(self):
         for node_type in removed_items:
             del node_collection.node_type_index[node_type]
         return node_collection
+
+    def _find_candidates(self, compare_spec, loose_search=False):
+        """
+        This function returns the most similar node_type index.
+        - a similarity value of the compare_spec to each node_type in the collection index is computed
+        - among candidates with a similarity value, the most frequently-found node_type is selected
+        - the loose_search flag allows adding a candidate even if the compare_spec is not covered
+        """
+        candidate_uncertain_attribute_freq = dict()
+        candidate_uncertain_attribute_total = dict()
+        most_similar_index = -1
+        max_similarity = -1
+        most_similar_freq = -1
+        completed_info = compare_spec.complete_info()
+        has_candidate = False
+        similarity_map = dict()
+        for attr in NodeAttribute:
+            candidate_uncertain_attribute_freq[attr] = []
+            candidate_uncertain_attribute_total[attr] = 0
+        for index, node_type_spec in self.node_type_index.items():
+            freq = node_type_spec.get_size()
+            if loose_search:
+                similarity = node_type_spec.get_similarity(compare_spec)
+                similarity_map[index] = similarity
+                if similarity > max_similarity or (similarity == max_similarity and most_similar_freq < freq):
+                    most_similar_index = index
+                    max_similarity = similarity
+                    most_similar_freq = freq
+                logger.debug(f"{index} - {node_type_spec}: {similarity}")
+            if node_type_spec.cover(compare_spec):
+                if completed_info:
+                    return similarity_map, 1, index, has_candidate, candidate_uncertain_attribute_freq, candidate_uncertain_attribute_total
+                else:
+                    for attr in NodeAttribute:
+                        if compare_spec.attrs[attr] is None:
+                            candidate_uncertain_attribute_freq[attr] += [(index, freq)]
+                            candidate_uncertain_attribute_total[attr] += freq
+                    has_candidate = True
+        return similarity_map, max_similarity, most_similar_index, has_candidate, candidate_uncertain_attribute_freq, candidate_uncertain_attribute_total
+
+    def _loose_search(self, compare_spec, similarity_map, max_similarity, most_similar_index, candidate_uncertain_attribute_freq, candidate_uncertain_attribute_total):
+        """
+        This function tries loosening the attributes that do not match the spec with maximum similarity and recomputes the uncertainty value of the selection.
+        """
+        num_of_none = get_num_of_none(compare_spec)
+        most_similar_spec = self.node_type_index[most_similar_index]
+        # remove uncovered spec
+        for attr in NodeAttribute:
+            if compare_spec.attrs[attr] != most_similar_spec.attrs[attr]:
+                logger.debug(f"Loosen {attr} ({compare_spec.attrs[attr]}-->{most_similar_spec.attrs[attr]})")
+                compare_spec.attrs[attr] = None
+                num_of_none += 1
+        # find uncertainty
+        for index, node_type_spec in self.node_type_index.items():
+            if node_type_spec.cover(compare_spec):
+                similarity = similarity_map[index]
+                freq = node_type_spec.get_size()
+                if similarity == max_similarity and freq > self.node_type_index[most_similar_index].get_size():
+                    logger.debug(f"change most similar index from {most_similar_index} to {index}")
+                    most_similar_index = index
+                for attr in NodeAttribute:
+                    if compare_spec.attrs[attr] is None:
+                        candidate_uncertain_attribute_freq[attr] += [(index, freq)]
+                        candidate_uncertain_attribute_total[attr] += freq
+        return candidate_uncertain_attribute_freq, candidate_uncertain_attribute_total, num_of_none
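Note: the compute_jaccard_similarity / compute_similarity helpers imported above live in kepler_model/util/similarity.py and are not part of this hunk. A simplified, standalone sketch of the kind of per-attribute scoring that get_similarity aggregates (token-set Jaccard for the processor string, ratio-based similarity for numeric attributes) follows; the exact formulas are assumptions for illustration, not the actual implementation.

import re


def jaccard_similarity(a: str, b: str) -> float:
    # token-set Jaccard index over lowercase alphanumeric tokens (illustrative)
    ta = set(re.split(r"[^a-z0-9]+", a.lower())) - {""}
    tb = set(re.split(r"[^a-z0-9]+", b.lower())) - {""}
    if not ta or not tb:
        return 0.0
    return len(ta & tb) / len(ta | tb)


def numeric_similarity(a: float, b: float) -> float:
    # ratio of the smaller to the larger value; 1.0 when equal (illustrative)
    if a == b:
        return 1.0
    if a == 0 or b == 0:
        return 0.0
    return min(a, b) / max(a, b)


print(jaccard_similarity("intel xeon platinum 8259cl", "intel xeon 8259cl"))  # 0.75
print(numeric_similarity(16, 8))                                              # 0.5

NodeTypeSpec.get_similarity then weights each per-attribute score with get_similarity_weight(attr) and caps the total at 1.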

src/kepler_model/util/config.py

Lines changed: 2 additions & 1 deletion
@@ -13,9 +13,10 @@
 #################################################

 import os
+
 import requests

-from .loader import base_model_url, default_pipelines, default_train_output_pipeline, get_pipeline_url, get_url, default_init_model_name
+from .loader import base_model_url, default_init_model_name, default_pipelines, default_train_output_pipeline, get_pipeline_url, get_url
 from .train_types import FeatureGroup, ModelOutputType, is_output_type_supported

 # must be writable (for shared volume mount)

src/kepler_model/util/loader.py

Lines changed: 13 additions & 1 deletion
@@ -112,6 +112,18 @@ def load_remote_pkl(url_path):
         logger.error(f"failed to load pkl url {url_path}: {e}")
         return None

+def load_remote_json(url_path):
+    if ".json" not in url_path:
+        url_path = url_path + ".json"
+    try:
+        response = urlopen(url_path)
+        response_data = response.read().decode('utf-8')
+        json_data = json.loads(response_data)
+        return json_data
+    except Exception as e:
+        logger.error(f"failed to load json url {url_path}: {e}")
+        return None
+
 def load_machine_spec(data_path, machine_id):
     machine_spec_path = os.path.join(data_path, MACHINE_SPEC_PATH)
     return load_json(machine_spec_path, machine_id)

@@ -216,7 +228,7 @@ def is_matched_type(nodeCollection, spec, pipeline_name, model_name, node_type,
         return True
     return False

-
+# get_largest_candidates return list of model_names that have maximum number of cores
 def get_largest_candidates(model_names, pipeline_name, nodeCollection, energy_source):
     pipeline_name = assure_pipeline_name(pipeline_name, energy_source, nodeCollection)
     if pipeline_name not in nodeCollection:
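Note: a brief usage sketch for the new load_remote_json helper; the URL below is a placeholder, not a real model-database path.

# Hypothetical usage: fetch a remote JSON document (".json" is appended when
# missing) and handle the None that load_remote_json returns on any error.
metadata = load_remote_json("https://example.com/models/rapl-sysfs/metadata")
if metadata is None:
    print("remote metadata unavailable, falling back to local files")
else:
    print(metadata.get("model_name"))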
