Skip to content

Commit 9758dd9

Browse files
committed
fix trainer/s3 bugs
Signed-off-by: Sunyanan Choochotkaew <[email protected]>
1 parent 5866956 commit 9758dd9

File tree

4 files changed: 31 additions and 28 deletions

4 files changed: 31 additions and 28 deletions

model_training/s3/s3-loader.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ def ibmcloud_list_keys(client, bucket_name, prefix):
2020

2121
def get_bucket_file_map(client, bucket_name, machine_id, mnt_path, pipeline_name, list_func):
2222
bucket_file_map = dict()
23+
top_key_path = ""
2324
if machine_id is not None and machine_id != "":
2425
top_key_path = "/" + machine_id
2526
# add data key map

src/train/pipeline.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ def prepare_data_from_input_list(self, input_query_results_list, energy_componen
117117

118118
def _train(self, abs_data, dyn_data, power_labels, energy_source, feature_group):
119119
# start the thread pool
120-
with ThreadPoolExecutor(2) as executor:
120+
with ThreadPoolExecutor(len(self.trainers)) as executor:
121121
futures = []
122122
for trainer in self.trainers:
123123
if trainer.feature_group_name != feature_group:

src/train/profiler/node_type_index.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ def generate_spec(data_path, machine_id):
4444
if "brand_raw" in cpu_info:
4545
processor = format_processor(cpu_info["brand_raw"])
4646
cores = psutil.cpu_count(logical=True)
47-
chips = psutil.cpu_count(logical=False)
47+
chips = int(cores/psutil.cpu_count(logical=False))
4848
memory = psutil.virtual_memory().total
4949
memory_gb = int(memory/GB)
5050
cpu_freq_mhz = round(psutil.cpu_freq(percpu=False).max/100)*100 # round to one decimal of GHz

src/train/trainer/__init__.py

Lines changed: 28 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -130,39 +130,41 @@ def load_model(self, node_type):
130130

131131
def process(self, data, power_labels, pipeline_lock):
132132
node_types = pd.unique(data[node_info_column])
133-
try:
134-
for node_type in node_types:
135-
node_type = int(node_type)
136-
save_path = self._get_save_path(str(node_type))
137-
self.node_scalers[node_type] = load_scaler(save_path)
138-
self.load_model(node_type)
139-
140-
node_type_filtered_data = data[data[node_info_column] == node_type]
141-
if self.node_scalers[node_type] is None:
142-
self.print_log("fit scaler to latest data {1} for node_type={0}".format(node_type, self.feature_group_name))
143-
# no profiled scaler
144-
x_values = node_type_filtered_data[self.features].values
145-
self.node_scalers[node_type] = MaxAbsScaler()
146-
self.node_scalers[node_type].fit(x_values)
147-
148-
X_test_map = dict()
149-
y_test_map = dict()
133+
for node_type in node_types:
134+
node_type = int(node_type)
135+
save_path = self._get_save_path(str(node_type))
136+
self.node_scalers[node_type] = load_scaler(save_path)
137+
self.load_model(node_type)
138+
139+
node_type_filtered_data = data[data[node_info_column] == node_type]
140+
if self.node_scalers[node_type] is None:
141+
self.print_log("fit scaler to latest data {1} for node_type={0}".format(node_type, self.feature_group_name))
142+
# no profiled scaler
143+
x_values = node_type_filtered_data[self.features].values
144+
self.node_scalers[node_type] = MaxAbsScaler()
145+
self.node_scalers[node_type].fit(x_values)
146+
147+
X_test_map = dict()
148+
y_test_map = dict()
149+
try:
150150
for component in self.energy_components:
151151
X_values, y_values = self.apply_ratio(component, node_type_filtered_data, power_labels)
152152
X_train, X_test, y_train, y_test = normalize_and_split(X_values, y_values, scaler=self.node_scalers[node_type])
153153
X_test_map[component] = X_test
154154
y_test_map[component] = y_test
155155
self.train(node_type, component, X_train, y_train)
156156
self.save_checkpoint(self.node_models[node_type][component], self._checkpoint_filepath(component, node_type))
157-
if self.should_archive(node_type):
158-
pipeline_lock.acquire()
159-
try:
160-
self.save_model_and_metadata(node_type, X_test_map, y_test_map)
161-
finally:
162-
pipeline_lock.release()
163-
except Exception as e:
164-
print(e)
165-
pipeline_lock.release()
157+
except Exception as err:
158+
self.print_log("failed to process {}: {}".format(node_type, err))
159+
continue
160+
if self.should_archive(node_type):
161+
pipeline_lock.acquire()
162+
try:
163+
self.save_model_and_metadata(node_type, X_test_map, y_test_map)
164+
except Exception as err:
165+
self.print_log("failed to save model {}: {}".format(node_type, err))
166+
finally:
167+
pipeline_lock.release()
166168

167169
def apply_ratio(self, component, node_type_filtered_data, power_labels):
168170
power_label = component_to_col(component)

0 commit comments

Comments
 (0)