This repository was archived by the owner on Feb 20, 2023. It is now read-only.

Commit 711df3e

Save and load data_info along with the mini models (#1436)
1 parent 72bebca

11 files changed: +190 -177 lines
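
The change is mechanical but pervasive: metadata that the data_info module previously exposed as module-level globals (METRICS_OUTPUT_NUM, TARGET_CSV_INDEX, and so on) now hangs off a data_info.instance object, so the CSV-header layout the mini models were trained against can be serialized and restored together with them. Below is a minimal sketch of the pattern in Python; the class body and field semantics are assumptions for illustration, not the repository's actual DataInfo definition.

class DataInfo:
    """Header-derived metadata, formerly module-level globals (sketch)."""

    def __init__(self):
        self.METRICS_OUTPUT_NUM = 0     # assumed: count of trailing metric columns
        self.MINI_MODEL_TARGET_NUM = 0  # assumed: count of model target columns
        self.target_csv_index = {}      # assumed: Target enum -> CSV column index

    def parse_csv_header(self, headers, raw):
        # Populate the index maps from a CSV header row (details elided).
        ...

# Module-level singleton: call sites read data_info.instance.<attr>, and the
# whole object can be pickled alongside the trained models.
instance = DataInfo()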

script/model/data_class/grouped_op_unit_data.py

Lines changed: 23 additions & 26 deletions
@@ -46,8 +46,8 @@ def _default_get_global_data(filename, sample_interval=0):
     df = pd.read_csv(filename)
     file_name = os.path.splitext(os.path.basename(filename))[0]
 
-    x = df.iloc[:, :-data_info.METRICS_OUTPUT_NUM].values
-    y = df.iloc[:, -data_info.MINI_MODEL_TARGET_NUM:].values
+    x = df.iloc[:, :-data_info.instance.METRICS_OUTPUT_NUM].values
+    y = df.iloc[:, -data_info.instance.MINI_MODEL_TARGET_NUM:].values
 
     # Construct the new data
     opunit = OpUnit[file_name.upper()]
@@ -66,14 +66,14 @@ def _txn_get_mini_runner_data(filename, txn_sample_interval):
     # prepending a column of ones as the base transaction data feature
     base_x = pd.DataFrame(data=np.ones((df.shape[0], 1), dtype=int))
     df = pd.concat([base_x, df], axis=1)
-    x = df.iloc[:, :-data_info.METRICS_OUTPUT_NUM].values
-    y = df.iloc[:, -data_info.MINI_MODEL_TARGET_NUM:].values
-    start_times = df.iloc[:, data_info.TARGET_CSV_INDEX[data_info.Target.START_TIME]].values
-    cpu_ids = df.iloc[:, data_info.TARGET_CSV_INDEX[data_info.Target.CPU_ID]].values
+    x = df.iloc[:, :-data_info.instance.METRICS_OUTPUT_NUM].values
+    y = df.iloc[:, -data_info.instance.MINI_MODEL_TARGET_NUM:].values
+    start_times = df.iloc[:, data_info.instance.target_csv_index[data_info.instance.Target.START_TIME]].values
+    cpu_ids = df.iloc[:, data_info.instance.target_csv_index[data_info.instance.Target.CPU_ID]].values
 
     logging.info("Loaded file: {}".format(OpUnit[file_name.upper()]))
 
-    interval = data_info.CONTENDING_OPUNIT_INTERVAL
+    interval = data_info.instance.CONTENDING_OPUNIT_INTERVAL
 
     # Map from interval start time to the data in this interval
     interval_x_map = {}
@@ -123,15 +123,14 @@ def _pipeline_get_grouped_op_unit_data(filename, warmup_period, ee_sample_interv
     data_list = []
     with open(filename, "r") as f:
         reader = csv.reader(f, delimiter=",", skipinitialspace=True)
-        indexes = next(reader)
-        data_info.parse_csv_header(indexes, True)
-        features_vector_index = data_info.RAW_FEATURES_CSV_INDEX[ExecutionFeature.FEATURES]
-        input_output_boundary = data_info.RAW_FEATURES_CSV_INDEX[data_info.INPUT_OUTPUT_BOUNDARY]
-        input_end_boundary = len(data_info.INPUT_CSV_INDEX)
+        next(reader)
+        features_vector_index = data_info.instance.raw_features_csv_index[ExecutionFeature.FEATURES]
+        input_output_boundary = data_info.instance.raw_features_csv_index[data_info.instance.INPUT_OUTPUT_BOUNDARY]
+        input_end_boundary = len(data_info.instance.input_csv_index)
 
         for line in reader:
             # extract the time
-            cpu_time = line[data_info.RAW_TARGET_CSV_INDEX[Target.START_TIME]]
+            cpu_time = line[data_info.instance.raw_target_csv_index[Target.START_TIME]]
             if start_time is None:
                 start_time = cpu_time
 
@@ -144,7 +143,7 @@ def _pipeline_get_grouped_op_unit_data(filename, warmup_period, ee_sample_interv
             record = [d for i,d in enumerate(line) if i >= input_output_boundary]
             data = list(map(data_util.convert_string_to_numeric, record))
             x_multiple = data[:input_end_boundary]
-            metrics = np.array(data[-data_info.METRICS_OUTPUT_NUM:])
+            metrics = np.array(data[-data_info.instance.METRICS_OUTPUT_NUM:])
 
             # Get the opunits located within
             opunits = []
@@ -156,12 +155,12 @@ def _pipeline_get_grouped_op_unit_data(filename, warmup_period, ee_sample_interv
 
                 opunit = OpUnit[feature]
                 x_loc = [v[idx] if type(v) == list else v for v in x_multiple]
-                if x_loc[data_info.INPUT_CSV_INDEX[ExecutionFeature.NUM_ROWS]] == 0:
+                if x_loc[data_info.instance.input_csv_index[ExecutionFeature.NUM_ROWS]] == 0:
                     logging.info("Skipping {} OU with 0 tuple num".format(opunit.name))
                     continue
 
                 if opunit == OpUnit.CREATE_INDEX:
-                    concurrency = x_loc[data_info.CONCURRENCY_INDEX]
+                    concurrency = x_loc[data_info.instance.CONCURRENCY_INDEX]
                     # TODO(lin): we won't do sampling for CREATE_INDEX. We probably should encapsulate this when
                     # generating the data
                     sample_interval = 0
@@ -189,15 +188,13 @@ def _pipeline_get_grouped_op_unit_data(filename, warmup_period, ee_sample_interv
 def _interval_get_grouped_op_unit_data(filename):
     # In the default case, the data does not need any pre-processing and the file name indicates the opunit
     df = pd.read_csv(filename, skipinitialspace=True)
-    headers = list(df.columns.values)
-    data_info.parse_csv_header(headers, False)
     file_name = os.path.splitext(os.path.basename(filename))[0]
 
-    x = df.iloc[:, :-data_info.METRICS_OUTPUT_NUM].values
-    y = df.iloc[:, -data_info.MINI_MODEL_TARGET_NUM:].values
-    start_times = df.iloc[:, data_info.TARGET_CSV_INDEX[Target.START_TIME]].values
-    cpu_ids = df.iloc[:, data_info.TARGET_CSV_INDEX[Target.CPU_ID]].values
-    interval = data_info.PERIODIC_OPUNIT_INTERVAL
+    x = df.iloc[:, :-data_info.instance.METRICS_OUTPUT_NUM].values
+    y = df.iloc[:, -data_info.instance.MINI_MODEL_TARGET_NUM:].values
+    start_times = df.iloc[:, data_info.instance.target_csv_index[Target.START_TIME]].values
+    cpu_ids = df.iloc[:, data_info.instance.target_csv_index[Target.CPU_ID]].values
+    interval = data_info.instance.PERIODIC_OPUNIT_INTERVAL
 
     # Map from interval start time to the data in this interval
     interval_x_map = {}
@@ -248,9 +245,9 @@ def __init__(self, name, opunit_features, metrics, sample_interval=0, concurrenc
         """
         self.name = name
         self.opunit_features = opunit_features
-        self.y = metrics[-data_info.MINI_MODEL_TARGET_NUM:]
+        self.y = metrics[-data_info.instance.MINI_MODEL_TARGET_NUM:]
         self.y_pred = None
-        index_map = data_info.TARGET_CSV_INDEX
+        index_map = data_info.instance.target_csv_index
         self.start_time = metrics[index_map[Target.START_TIME]]
         self.end_time = self.start_time + self.y[index_map[Target.ELAPSED_US]] - 1
         self.cpu_id = int(metrics[index_map[Target.CPU_ID]])
@@ -282,7 +279,7 @@ def get_end_time(self, concurrent_counting_mode):
         if concurrent_counting_mode is ConcurrentCountingMode.EXACT:
             end_time = self.end_time
         if concurrent_counting_mode is ConcurrentCountingMode.ESTIMATED:
-            end_time = self.start_time + self.y_pred[data_info.TARGET_CSV_INDEX[Target.ELAPSED_US]] - 1
+            end_time = self.start_time + self.y_pred[data_info.instance.target_csv_index[Target.ELAPSED_US]] - 1
         if concurrent_counting_mode is ConcurrentCountingMode.INTERVAL:
             end_time = self.start_time + global_model_config.INTERVAL_START + global_model_config.INTERVAL_SIZE
         return end_time

script/model/data_class/opunit_data.py

Lines changed: 19 additions & 19 deletions
@@ -53,11 +53,11 @@ def _default_get_mini_runner_data(filename):
     # In the default case, the data does not need any pre-processing and the file name indicates the opunit
     df = pd.read_csv(filename, skipinitialspace=True)
     headers = list(df.columns.values)
-    data_info.parse_csv_header(headers, False)
+    data_info.instance.parse_csv_header(headers, False)
     file_name = os.path.splitext(os.path.basename(filename))[0]
 
-    x = df.iloc[:, :-data_info.METRICS_OUTPUT_NUM].values
-    y = df.iloc[:, -data_info.MINI_MODEL_TARGET_NUM:].values
+    x = df.iloc[:, :-data_info.instance.METRICS_OUTPUT_NUM].values
+    y = df.iloc[:, -data_info.instance.MINI_MODEL_TARGET_NUM:].values
 
     return [OpUnitData(OpUnit[file_name.upper()], x, y)]
 
@@ -70,18 +70,18 @@ def _txn_get_mini_runner_data(filename, model_results_path, txn_sample_interval)
     # prepending a column of ones as the base transaction data feature
     base_x = pd.DataFrame(data=np.ones((df.shape[0], 1), dtype=int))
     df = pd.concat([base_x, df], axis=1)
-    x = df.iloc[:, :-data_info.METRICS_OUTPUT_NUM].values
-    y = df.iloc[:, -data_info.MINI_MODEL_TARGET_NUM:].values
-    start_times = df.iloc[:, data_info.TARGET_CSV_INDEX[data_info.Target.START_TIME]].values
-    cpu_ids = df.iloc[:, data_info.TARGET_CSV_INDEX[data_info.Target.CPU_ID]].values
+    x = df.iloc[:, :-data_info.instance.METRICS_OUTPUT_NUM].values
+    y = df.iloc[:, -data_info.instance.MINI_MODEL_TARGET_NUM:].values
+    start_times = df.iloc[:, data_info.instance.TARGET_CSV_INDEX[data_info.instance.Target.START_TIME]].values
+    cpu_ids = df.iloc[:, data_info.instance.TARGET_CSV_INDEX[data_info.instance.Target.CPU_ID]].values
 
     logging.info("Loaded file: {}".format(OpUnit[file_name.upper()]))
 
     # change the data based on the interval for the periodically invoked operating units
     prediction_path = "{}/{}_txn_converted_data.csv".format(model_results_path, file_name)
     io_util.create_csv_file(prediction_path, [""])
 
-    interval = data_info.CONTENDING_OPUNIT_INTERVAL
+    interval = data_info.instance.CONTENDING_OPUNIT_INTERVAL
 
     # Map from interval start time to the data in this interval
     interval_x_map = {}
@@ -119,19 +119,19 @@ def _interval_get_mini_runner_data(filename, model_results_path):
     # In the default case, the data does not need any pre-processing and the file name indicates the opunit
     df = pd.read_csv(filename, skipinitialspace=True)
     headers = list(df.columns.values)
-    data_info.parse_csv_header(headers, False)
+    data_info.instance.parse_csv_header(headers, False)
     file_name = os.path.splitext(os.path.basename(filename))[0]
 
-    x = df.iloc[:, :-data_info.METRICS_OUTPUT_NUM].values
-    y = df.iloc[:, -data_info.MINI_MODEL_TARGET_NUM:].values
-    start_times = df.iloc[:, data_info.RAW_TARGET_CSV_INDEX[Target.START_TIME]].values
+    x = df.iloc[:, :-data_info.instance.METRICS_OUTPUT_NUM].values
+    y = df.iloc[:, -data_info.instance.MINI_MODEL_TARGET_NUM:].values
+    start_times = df.iloc[:, data_info.instance.RAW_TARGET_CSV_INDEX[Target.START_TIME]].values
     logging.info("Loaded file: {}".format(OpUnit[file_name.upper()]))
 
     # change the data based on the interval for the periodically invoked operating units
     prediction_path = "{}/{}_interval_converted_data.csv".format(model_results_path, file_name)
     io_util.create_csv_file(prediction_path, [""])
 
-    interval = data_info.PERIODIC_OPUNIT_INTERVAL
+    interval = data_info.instance.PERIODIC_OPUNIT_INTERVAL
 
     # Map from interval start time to the data in this interval
     interval_x_map = {}
@@ -179,17 +179,17 @@ def _execution_get_mini_runner_data(filename, model_map, predict_cache, trim):
     with open(filename, "r") as f:
         reader = csv.reader(f, delimiter=",", skipinitialspace=True)
         indexes = next(reader)
-        data_info.parse_csv_header(indexes, True)
-        features_vector_index = data_info.RAW_FEATURES_CSV_INDEX[ExecutionFeature.FEATURES]
-        raw_boundary = data_info.RAW_FEATURES_CSV_INDEX[data_info.INPUT_OUTPUT_BOUNDARY]
-        input_output_boundary = len(data_info.INPUT_CSV_INDEX)
+        data_info.instance.parse_csv_header(indexes, True)
+        features_vector_index = data_info.instance.raw_features_csv_index[ExecutionFeature.FEATURES]
+        raw_boundary = data_info.instance.raw_features_csv_index[data_info.instance.INPUT_OUTPUT_BOUNDARY]
+        input_output_boundary = len(data_info.instance.input_csv_index)
 
         for line in reader:
             # drop query_id, pipeline_id, num_features, features_vector
             record = [d for i, d in enumerate(line) if i >= raw_boundary]
             data = list(map(data_util.convert_string_to_numeric, record))
             x_multiple = data[:input_output_boundary]
-            y_merged = np.array(data[-data_info.MINI_MODEL_TARGET_NUM:])
+            y_merged = np.array(data[-data_info.instance.MINI_MODEL_TARGET_NUM:])
 
             # Get the opunits located within
             opunits = []
@@ -254,7 +254,7 @@ def _execution_get_mini_runner_data(filename, model_map, predict_cache, trim):
     for opunit, values in data_map.items():
         np_value = np.array(values)
         x = np_value[:, :input_output_boundary]
-        y = np_value[:, -data_info.MINI_MODEL_TARGET_NUM:]
+        y = np_value[:, -data_info.instance.MINI_MODEL_TARGET_NUM:]
         data_list.append(OpUnitData(opunit, x, y))
 
     return data_list
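
Note how the header-parsing entry point travels with its state in the hunks above: a free function that mutated module globals becomes a method that populates attributes on the shared instance. A short before/after sketch of the two call shapes (the method bodies are not part of this diff):

# Before: free function writing into data_info module globals.
data_info.parse_csv_header(headers, False)
y = df.iloc[:, -data_info.MINI_MODEL_TARGET_NUM:].values

# After: the same parse populates the picklable singleton, and reads go
# through it as well.
data_info.instance.parse_csv_header(headers, False)
y = df.iloc[:, -data_info.instance.MINI_MODEL_TARGET_NUM:].values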

script/model/endtoend_estimator.py

Lines changed: 10 additions & 9 deletions
@@ -45,6 +45,7 @@ def estimate(self):
                             self.model_results_path,
                             0,
                             False,
+                            False,
                             self.ee_sample_interval,
                             self.txn_sample_interval,
                             self.network_sample_interval)
@@ -88,7 +89,7 @@ def _model_prediction_with_derived_data(self, impact_data_list, model_name, mode
             data_list.append(d.target_grouped_op_unit_data)
             mini_model_y_pred.append(d.target_grouped_op_unit_data.y_pred)
             raw_y.append(d.target_grouped_op_unit_data.y)
-            predicted_elapsed_us = mini_model_y_pred[-1][data_info.TARGET_CSV_INDEX[Target.ELAPSED_US]]
+            predicted_elapsed_us = mini_model_y_pred[-1][data_info.instance.target_csv_index[Target.ELAPSED_US]]
             predicted_resource_util = None
             if model_name == "impact":
                 predicted_resource_util = d.get_y_pred()
@@ -100,9 +101,9 @@ def _model_prediction_with_derived_data(self, impact_data_list, model_name, mode
                 predicted_resource_util[:mini_model_y_pred[-1].shape[0]] -= self_resource
                 predicted_resource_util[predicted_resource_util < 0] = 0
             x.append(np.concatenate((mini_model_y_pred[-1] / predicted_elapsed_us,
-                predicted_resource_util,
-                d.resource_util_same_core_x)))
-            #x.append(np.concatenate((mini_model_y_pred[-1] / predicted_elapsed_us, predicted_resource_util)))
+                                     predicted_resource_util,
+                                     d.resource_util_same_core_x)))
+            # x.append(np.concatenate((mini_model_y_pred[-1] / predicted_elapsed_us, predicted_resource_util)))
             y.append(d.target_grouped_op_unit_data.y / (d.target_grouped_op_unit_data.y_pred +
                                                         global_model_config.RATIO_DIVISION_EPSILON))
 
@@ -154,7 +155,7 @@ def _record_results(self, x, y, y_pred, raw_y, mini_model_y_pred, label, data_li
         ratio_error = np.abs(raw_y - raw_y_pred) / (raw_y + epsilon)
         avg_ratio_error = np.average(ratio_error, axis=0)
         accumulated_percentage_error = np.abs(accumulated_raw_y - accumulated_raw_y_pred) / (
-            accumulated_raw_y + epsilon)
+                accumulated_raw_y + epsilon)
         original_accumulated_percentage_error = np.abs(accumulated_raw_y - np.sum(mini_model_y_pred, axis=0)) / (
                 accumulated_raw_y + epsilon)
 
@@ -175,8 +176,8 @@ def _record_results(self, x, y, y_pred, raw_y, mini_model_y_pred, label, data_li
         prediction_path = "{}/grouped_opunit_prediction.csv".format(self.model_results_path)
         io_util.create_csv_file(prediction_path, ["Pipeline", "", "Actual", "", "Predicted", "", "Ratio Error"])
         for i, data in enumerate(data_list):
-            io_util.write_csv_result(prediction_path, data.name, [""] + list(raw_y[i]) + [""] +
-                                         list(raw_y_pred[i]) + [""] + list(ratio_error[i]))
+            io_util.write_csv_result(prediction_path, data.name, [""] + list(raw_y[i]) + [""] +
+                                     list(raw_y_pred[i]) + [""] + list(ratio_error[i]))
 
         average_result_path = "{}/interval_average_prediction.csv".format(self.model_results_path)
         io_util.create_csv_file(average_result_path,
@@ -185,7 +186,7 @@ def _record_results(self, x, y, y_pred, raw_y, mini_model_y_pred, label, data_li
         interval_y_map = {}
         interval_y_pred_map = {}
         mark_list = None
-        #mark_list = _generate_mark_list(data_list)
+        # mark_list = _generate_mark_list(data_list)
         for i, data in enumerate(data_list):
             # Don't count the create index OU
             # TODO(lin): needs better way to evaluate... maybe add a id_query field to GroupedOpunitData
@@ -268,7 +269,7 @@ def _generate_mark_list(data_list):
     logging_util.init_logging(args.log)
 
     with open(args.mini_model_file, 'rb') as pickle_file:
-        model_map = pickle.load(pickle_file)
+        model_map, data_info.instance = pickle.load(pickle_file)
     with open(args.global_resource_model_file, 'rb') as pickle_file:
         resource_model = pickle.load(pickle_file)
     with open(args.global_impact_model_file, 'rb') as pickle_file:
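
The load site in the hunk above implies a matching save site on the training path, which this excerpt does not show (per the commit title, the two are saved and loaded together). A sketch of the presumed round trip, with a hypothetical file name; the tuple order matches the unpacking in the diff:

import pickle

# Training side (assumed): persist the models and the header metadata together.
with open("mini_model_map.pickle", "wb") as f:
    pickle.dump((model_map, data_info.instance), f)

# Estimator side (as in this commit): one load restores both, so the models
# are always paired with the exact column layout they were trained on.
with open("mini_model_map.pickle", "rb") as f:
    model_map, data_info.instance = pickle.load(f)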
