import ast
import os

import h2o
import numpy as np
from tqdm import tqdm

from models import gbt
from utils.dataset import *
from utils.metrics import *
from utils.tools.census_core import load_census_table_pkl

class CVPerformance:
    """Container for cross-validation results keyed by hyper-parameter tuple."""

    def __init__(self):
        self.stats = {}

    @classmethod
    def write_single_performance_to_file(cls, output_file, hyper_param, performance):
        if output_file is not None:
            with open(output_file, 'a+') as file:
                log_msg = f"{hyper_param}"
                for metric, p in performance.items():
                    log_msg += f" | {metric}: {p}"
                log_msg += "\n"
                file.write(log_msg)

    def __len__(self):
        return len(self.stats)

    def _decode_hyper_param_str(self, hyper_param_str):
        return ast.literal_eval(hyper_param_str)

    def append(self, hyper_param, performance):
        assert isinstance(hyper_param, tuple), "Input hyper_param must be tuple"
        assert isinstance(performance, dict), "Input performance must be dict"
        self.stats[hyper_param] = performance

    def write_to_file(self, output_file):
        for hyper_param, performance in self.stats.items():
            CVPerformance.write_single_performance_to_file(output_file, hyper_param, performance)
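
    # Illustrative serialized line (one per hyper-parameter setting), assuming
    # the metric values are numpy arrays whose str() form is space-separated:
    #   (200, 5) | mean_RMSE_score: [0.12 0.08] | mean_R2_score: [0.91 0.95]
    # load_from_file() below parses lines of this shape back into self.stats.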

    def load_from_file(self, output_file):
        with open(output_file, 'r') as file:
            for row in file:
                row_list = row.rstrip('\n').split(' | ')
                hyper_params = self._decode_hyper_param_str(row_list[0])
                performance = {}
                for entry in row_list[1:]:
                    metric_name, value_str = entry.split(': ', 1)
                    # Values are stored as bracketed, space-separated floats
                    metric_value = [float(v) for v in value_str.strip('[]').split()]
                    performance[metric_name] = metric_value
                self.append(hyper_params, performance)
        return self

    def find_top_N_hyper_param(self, criteria, lower_is_better=True, N=1):
        score_dict = {}
        for hyper_param, performance in self.stats.items():
            score_dict[hyper_param] = criteria(performance)
        # Sort descending; when lower is better, the best N items sit at the tail
        sorted_items = sorted(score_dict.items(), key=lambda item: item[1], reverse=True)
        if lower_is_better:
            top_N_items = sorted_items[-N:]
        else:
            top_N_items = sorted_items[:N]
        return top_N_items

    def query_hyper_param(self, hyper_params):
        assert isinstance(hyper_params, (tuple, str)), "Input hyper_params must be str or tuple"
        if isinstance(hyper_params, str):
            query_key = self._decode_hyper_param_str(hyper_params)
        else:
            query_key = hyper_params
        if query_key in self.stats:
            return self.stats[query_key]
        return None
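

# Illustrative round-trip (a sketch; 'cv.log' is a hypothetical path):
#   cv = CVPerformance()
#   cv.append((200, 5), {'mean_RMSE_score': np.array([0.12, 0.08])})
#   cv.write_to_file('cv.log')
#   restored = CVPerformance().load_from_file('cv.log')
#   restored.query_hyper_param((200, 5))  # -> {'mean_RMSE_score': [0.12, 0.08]}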


def best_model_criteria_by_R2(performance):
    return np.mean(performance['mean_R2_score'])


def best_model_criteria_by_RMSE(performance):
    return np.mean(performance['mean_RMSE_score'])


def get_spatial_cross_validation_sets(data, num_folds=5):
    """
    Apply blocked spatial cross-validation to the input data. Each fold is
    built around a centroid sampled uniformly over the data's geospatial
    field; the rest of the fold consists of the N-1 data points nearest to
    that centroid, where N is the fold size derived from num_folds.
    Args:
        data (Dataset): input dataset to be cross-validated on
        num_folds (int, optional): number of folds. Defaults to 5.
    Returns: (list) validation set indices list
    """
    # Split len(data) into num_folds near-equal fold sizes
    fold_size, remainder = divmod(len(data), num_folds)
    validation_set_sizes = [fold_size + 1 if i < remainder else fold_size for i in range(num_folds)]
    # To get N-fold validation sets, first draw num_folds spatially uniform
    # samples as centroids for the blocks in each fold
    centroid_indices = data.spatial_sampling(
        method='uniform',
        num_samples=num_folds,
        masked_indices=[])
    validation_set_indices = []
    current_mask = []
    for i, validation_set_size in enumerate(validation_set_sizes):
        # Mask out all later centroids plus already-assigned points; earlier
        # centroids are already covered by current_mask
        current_fold_indices = data.spatial_sampling(
            method='blocked',
            num_samples=validation_set_size,
            center_index=centroid_indices[i],
            masked_indices=current_mask + centroid_indices[i + 1:])
        current_mask += current_fold_indices
        validation_set_indices.append(current_fold_indices)
    return validation_set_indices
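

# Example: with len(data) == 103 and num_folds == 5, divmod yields (20, 3),
# so validation_set_sizes == [21, 21, 21, 20, 20].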


def compute_performance(pred_results, ground_truth):
    """Return RMSE and R2 for each label class."""
    RMSE_score = rmse(pred_results, ground_truth, axis=0)
    R2_score = np.asarray([r2(pred_results[:, i], ground_truth[:, i]) for i in range(pred_results.shape[1])])
    return RMSE_score, R2_score


def run_cross_validation(model, data, validation_set_indices):
    """
    Run cross-validation and collect performance stats for the input model
    Args:
        model (model): model to train and evaluate on each fold
        data (Dataset): training dataset
        validation_set_indices (list): per-fold validation set indices
    Returns: (tuple) mean_RMSE_score, mean_R2_score averaged over folds
    """
    for i, current_validation_set_indices in enumerate(validation_set_indices):
        validation_subset = data.get_subset_by_indices(current_validation_set_indices)
        train_subset = data.remove_by_indices(current_validation_set_indices)
        model.train(train_subset, shuffle=True)
        pred_results, ground_truth = model.evaluate(validation_subset)
        RMSE_score, R2_score = compute_performance(pred_results, ground_truth)
        # Accumulate per-class scores across folds, then average at the end
        if i == 0:
            mean_RMSE_score, mean_R2_score = RMSE_score, R2_score
        else:
            mean_RMSE_score += RMSE_score
            mean_R2_score += R2_score
    mean_RMSE_score /= len(validation_set_indices)
    mean_R2_score /= len(validation_set_indices)
    return mean_RMSE_score, mean_R2_score
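

# Typical call sequence (a sketch):
#   folds = get_spatial_cross_validation_sets(data, num_folds=5)
#   mean_rmse, mean_r2 = run_cross_validation(model, data, folds)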


def grid_search(base_model, hyper_params_space, data, num_folds=5, output_file=None, rerun=False):
    """
    Apply grid search over spatial cross-validation of the input data
    Args:
        base_model (model): model whose hyper-parameters are searched
        hyper_params_space (iterable): iterable of hyper-parameter tuples
        data (Dataset): train set
        num_folds (int, optional): number of folds. Defaults to 5.
        output_file (str, optional): path of the CV results log file. Defaults to None.
        rerun (bool, optional): whether to rerun experiments already in output_file. Defaults to False.
    Returns: (dict) best hyper-parameters selected by RMSE and by R2
    """
    best_RMSE = None
    best_R2 = None
    best_model = {'by_RMSE': None, 'by_R2': None}
    validation_set_indices = get_spatial_cross_validation_sets(data, num_folds=num_folds)
    cv_performance = CVPerformance()
    if output_file is not None and os.path.exists(output_file):
        cv_performance.load_from_file(output_file)
        if not rerun and len(cv_performance) > 0:
            # If not rerunning, recover the historical best hyper-parameters
            best_hyper_param_RMSE, best_RMSE = cv_performance.find_top_N_hyper_param(
                best_model_criteria_by_RMSE, lower_is_better=True, N=1)[0]
            best_hyper_param_R2, best_R2 = cv_performance.find_top_N_hyper_param(
                best_model_criteria_by_R2, lower_is_better=False, N=1)[0]
            best_model['by_RMSE'] = best_hyper_param_RMSE
            best_model['by_R2'] = best_hyper_param_R2
            print(f"Best Model by RMSE: {best_hyper_param_RMSE} | RMSE: {best_RMSE}")
            print(f"Best Model by R2: {best_hyper_param_R2} | R2: {best_R2}")
    for hyper_params in tqdm(hyper_params_space, desc="Grid Search Process:"):
        if not rerun and cv_performance.query_hyper_param(hyper_params) is not None:
            continue  # already evaluated in a previous run
        base_model.reset_hyper_params(*hyper_params)
        mean_RMSE_score, mean_R2_score = run_cross_validation(base_model, data, validation_set_indices)
        avg_mean_RMSE_score = np.mean(mean_RMSE_score)
        avg_mean_R2_score = np.mean(mean_R2_score)
        # Log this setting's per-class scores to output_file
        performance = {
            'mean_RMSE_score': mean_RMSE_score,
            'mean_R2_score': mean_R2_score
        }
        CVPerformance.write_single_performance_to_file(output_file,
                                                       hyper_params,
                                                       performance)
        if best_RMSE is None or avg_mean_RMSE_score < best_RMSE:
            best_model['by_RMSE'] = hyper_params
            best_RMSE = avg_mean_RMSE_score
            print(f"Best Model by RMSE: {hyper_params} | RMSE: {best_RMSE} | R2: {avg_mean_R2_score}")
        if best_R2 is None or avg_mean_R2_score > best_R2:
            best_model['by_R2'] = hyper_params
            best_R2 = avg_mean_R2_score
            print(f"Best Model by R2: {hyper_params} | RMSE: {avg_mean_RMSE_score} | R2: {best_R2}")
    return best_model
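

# Illustrative search-space construction (a sketch; the parameter values are
# assumptions). Each element must be a tuple accepted by
# base_model.reset_hyper_params(*hyper_params):
#   import itertools
#   hyper_params_space = list(itertools.product([100, 200, 400], [3, 5, 7]))
#   best = grid_search(model, hyper_params_space, data, output_file='cv.log')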


def pipeline(training_cfg, land_cover_cfg):
    """
    Training pipeline:
    1. Load the input dataset for training from the census output
    2. If model parameters are specified in the config, load the model weights;
       otherwise, train on the input dataset from scratch
    3. Save a (direct) evaluation of model performance (for tuning or other reference)
    Args:
        training_cfg (dict): training settings from yaml
        land_cover_cfg (dict): land cover settings from yaml
    Returns: (GradientBoostingTree)
    """
    # Load census_table input as a Dataset obj
    input_dataset = Dataset(
        census_table=load_census_table_pkl(
            training_cfg['path_dir']['inputs']['census_table_input']),
        land_cover_code=land_cover_cfg['code']['MCD12Q1'],
        remove_land_cover_feature_index=training_cfg['feature_remove'],
        invalid_data=training_cfg['invalid_data_handle'])
    # Declare model structure
    # prob_est = gbt.OvRBernoulliGradientBoostingTree(
    #     ntrees=training_cfg['model']['gradient_boosting_tree']['ntrees'],
    #     max_depth=training_cfg['model']['gradient_boosting_tree']['max_depth'],
    #     nfolds=training_cfg['model']['gradient_boosting_tree']['nfolds'])
    prob_est = gbt.MultinomialGradientBoostingTree(
        ntrees=training_cfg['model']['gradient_boosting_tree']['ntrees'],
        max_depth=training_cfg['model']['gradient_boosting_tree']['max_depth'],
        nfolds=training_cfg['model']['gradient_boosting_tree']['nfolds'],
        min_rows=training_cfg['model']['gradient_boosting_tree']['min_rows'],
        learn_rate=training_cfg['model']['gradient_boosting_tree']['learn_rate'],
        sample_rate=training_cfg['model']['gradient_boosting_tree']['sample_rate'],
        col_sample_rate=training_cfg['model']['gradient_boosting_tree']['col_sample_rate'])
    # Load model weights if specified, otherwise train on input_dataset
    if training_cfg['path_dir']['inputs']['load_model'] is not None:
        try:
            prob_est.load(training_cfg['path_dir']['inputs']['load_model'])
            print('Model loaded from {}'.format(
                training_cfg['path_dir']['inputs']['load_model']))
        except h2o.exceptions.H2OResponseError:
            raise h2o.exceptions.H2OResponseError(
                'File {} is not a valid model path.'.format(
                    training_cfg['path_dir']['inputs']['load_model']))
    else:
        print('No parameters to load. Start training ...')
        prob_est.train(input_dataset)
        prob_est.save(training_cfg['path_dir']['outputs']['save_model'])
        print('Model saved at {}'.format(
            training_cfg['path_dir']['outputs']['save_model']))
    # Evaluate the model on the whole input dataset
    # Note: this generates raw predictions on the state-level census table,
    # so it directly reflects training performance
    # _, _ = prob_est.evaluate(input_dataset)
    # Return the trained/loaded estimator, as promised by the docstring
    return prob_est
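

# Illustrative invocation (a sketch; the yaml paths are assumptions, not
# paths confirmed by this repo):
#   import yaml
#   with open('configs/train.yaml') as f:
#       training_cfg = yaml.safe_load(f)
#   with open('configs/land_cover.yaml') as f:
#       land_cover_cfg = yaml.safe_load(f)
#   model = pipeline(training_cfg, land_cover_cfg)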