from mbpls.mbpls import MBPLS
import time
import rpy2  # required for the Ade4 reference implementation

# define Parameters
rand_seed = 25

# initialize
from matplotlib import pyplot as plt
import numpy as np
import pickle
import pandas as pd

np.random.seed(rand_seed)  # seed the RNG so the random benchmark data are reproducible
np.seterr(all='raise')  # raise exceptions on numerical errors instead of silently warning


def mbpls_ade4(data, num_comp=20):
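    """Fit an MBPLS model with the R package ade4 via rpy2.

    `data` is a list [x1, x2, y] of pandas DataFrames. The data frames are
    pushed into the R global environment, ade4's mbpls is run with `num_comp`
    components, and the start time of the R call is returned so the caller
    can compute the elapsed time.
    """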
    from rpy2 import robjects
    from rpy2.robjects import pandas2ri

    x1, x2, y = data

    pandas2ri.activate()  # enable automatic conversion between pandas and R data frames

    # push the data frames into the R global environment
    robjects.globalenv['x1'] = pandas2ri.py2ri_pandasdataframe(x1)
    robjects.globalenv['x2'] = pandas2ri.py2ri_pandasdataframe(x2)
    robjects.globalenv['ref'] = pandas2ri.py2ri_pandasdataframe(y)

    start_time = time.time()
    # execute the R code using the global environment variables defined above
    robjects.r(
        '''
        library(ade4)
        library(adegraphics)

        dudiY.act <- dudi.pca(ref, center = TRUE, scale = TRUE, scannf = FALSE, nf = 1)
        ktabX.act <- ktab.list.df(list(mainPerformance = x1, mainInput = x2))
        resmbpls.act <- mbpls(dudiY.act, ktabX.act, scale = TRUE,
                              option = "none", scannf = FALSE, nf = ''' + str(num_comp) + ''')

        bip <- resmbpls.act$bip
        ''')

    return start_time

###################################################################
# General settings
###################################################################

# How many runs?
reruns = 3

number_components = 20  # number of latent variables

feature_size_y = 10  # number of Y columns

standardize = True

# Should all model parameters be calculated (passed to MBPLS as calc_all)?
calc_all = False

# Verbosity
# 0-show nothing
# 1-show datasets
# 2-show methods
# 3-show run information
# 4-show time of each run
verbosity_level = 4  # higher verbosity levels include lower ones


# Calculation measurements
include_timing = True

# Datasets
x_sizes = np.arange(1000, 1001, 1000)  # numbers of samples (rows)
y_sizes = np.arange(100, 20001, 100)  # numbers of X features (columns), split across two blocks


# (display name, method argument passed to MBPLS); 'Ade4' is benchmarked through rpy2
methods = [
    ('Ade4', 'Ade4'),
    ('Unipals', 'UNIPALS'),
    ('Kernel', 'KERNEL'),
    ('Nipals', 'NIPALS'),
    ('Simpls', 'SIMPLS')
]


###################################################################
# General settings - END
###################################################################


# ANSI colour codes for terminal output
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

# Arrays to save the timing results and to track which grid points are done
# timing_results axes: (method, y_size index, x_size index, rerun)
timing_results = np.zeros(shape=(len(methods), len(y_sizes), len(x_sizes), reruns))
indicator_matrix = np.zeros(shape=(len(y_sizes), len(x_sizes)))

# Write fresh (empty) result files; running instances pick up work from these
initiate_files = True
if initiate_files:
    pickle.dump(indicator_matrix, open('indicator_matrix_colincrease.pkl', 'wb'))
    pickle.dump(timing_results, open('timing_results_colincrease.pkl', 'wb'))

# Human-readable descriptions of the dataset shapes processed by this instance
datasets_shape = []

finished = False

# Work loop: load the indicator matrix that shows which grid points have been
# processed already and claim the next open one.
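# Indicator matrix protocol (as used below): 0 = grid point not yet processed,
# 2 = claimed by an instance and in progress, 1 = finished and results merged
# back into the shared pickle files.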
while not finished:
    print('Loading indicator matrix data')
    indicator_matrix = pickle.load(open('indicator_matrix_colincrease.pkl', 'rb'))
    try:
        # claim the first grid point that no instance has started yet
        index = np.argwhere(indicator_matrix == 0)[0]
        feature_index = index[0]
        sample_index = index[1]
        indicator_matrix[feature_index, sample_index] = 2
        pickle.dump(indicator_matrix, open('indicator_matrix_colincrease.pkl', 'wb'))
        print('Saved indicator matrix data')
    except (ValueError, IndexError):
        print('No unfinished grid points left')
        finished = True
        break

    sample_size = x_sizes[sample_index]
    feature_size = y_sizes[feature_index]

    datasets_shape.append(
        'X-shape: ' + str(sample_size) + 'x' + str(feature_size) + '\n Y-shape: ' + str(
            sample_size) + 'x' + str(feature_size_y))

    print('Loading data')
    timing_results_tmp = pickle.load(open('timing_results_colincrease.pkl', 'rb'))

    for run in range(reruns):
        if verbosity_level > 2:
            print(bcolors.OKGREEN, "Starting run {:d} of {:d}".format(run + 1, reruns), bcolors.ENDC)

        if verbosity_level > 0:
            print(bcolors.OKGREEN, "Creating dataset size {:d}x{:d}"
                  .format(sample_size, feature_size), bcolors.ENDC)

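        # Random integer data: X is split into two equally sized blocks (x1, x2)
        # and Y has feature_size_y columns, matching the multiblock setup above.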
        dataset_x_1 = np.random.randint(1, 100, size=(sample_size, feature_size // 2))
        dataset_x_2 = np.random.randint(1, 100, size=(sample_size, feature_size // 2))
        dataset_y = np.random.randint(1, 100, size=(sample_size, feature_size_y))

        if verbosity_level > 0:
            print(bcolors.OKGREEN, "Starting calculations for dataset size {:d}x{:d}"
                  .format(sample_size, feature_size), bcolors.ENDC)

        for j, (methodname, method) in enumerate(methods):
            if methodname == 'Ade4':
                class_call = mbpls_ade4
            else:
                class_call = MBPLS(n_components=number_components, method=method, standardize=standardize,
                                   calc_all=calc_all)
            # Check whether this method still finishes in reasonable time; once its mean
            # run time exceeds 630 s it is skipped and recorded with the 9999 sentinel.
            if timing_results_tmp.mean(axis=3)[j].max() > 630:
                if include_timing:
                    timing_results[j, feature_index, sample_index, run] = 9999
                if verbosity_level > 1:
                    print(bcolors.OKGREEN, "Skipping the {:s} method because it has become too slow"
                          .format(methodname), bcolors.ENDC)
            else:
                if verbosity_level > 1:
                    print(bcolors.OKGREEN, "Calculating {:s}-method".format(methodname), bcolors.ENDC)
                if include_timing:
                    timing_results[j, feature_index, sample_index, run] = time.time()
                # The ade4 package can run into convergence problems that crash the call.
                # When that happens the call is retried with freshly drawn random data.
                if methodname == 'Ade4':
                    count = 0
                    while True:
                        try:
                            start_time = class_call([pd.DataFrame(dataset_x_1), pd.DataFrame(dataset_x_2),
                                                     pd.DataFrame(dataset_y)], number_components)
                            if include_timing:
                                timing_results[j, feature_index, sample_index, run] = \
                                    time.time() - start_time
                                if verbosity_level > 3:
                                    print("Task finished in {:.2f} seconds".format(
                                        timing_results[j, feature_index, sample_index, run]))
                            break
                        except Exception:
                            count += 1
                            if count > 100:
                                print("Aborting. The Ade4 method has failed more than 100 times on the same task.")
                                timing_results[j, feature_index, sample_index, run] = 9999
                                break
                            else:
                                print("Ade4 failed; retrying with a new random dataset.")
                                dataset_x_1 = np.random.randint(1, 100, size=(sample_size, feature_size // 2))
                                dataset_x_2 = np.random.randint(1, 100, size=(sample_size, feature_size // 2))
                                dataset_y = np.random.randint(1, 100, size=(sample_size, feature_size_y))
                else:
                    results = class_call.fit([dataset_x_1, dataset_x_2], dataset_y)
                    if include_timing:
                        timing_results[j, feature_index, sample_index, run] = \
                            time.time() - timing_results[j, feature_index, sample_index, run]

                        if verbosity_level > 3:
                            print("Task finished in {:.2f} seconds".format(
                                timing_results[j, feature_index, sample_index, run]))

        if verbosity_level > 2:
            print(bcolors.OKGREEN, "Finished run {:d} of {:d}".format(run + 1, reruns), bcolors.ENDC)


    # Merge this grid point's results back into the shared files and mark it as finished
    print('Loading data')
    indicator_matrix_shared = pickle.load(open('indicator_matrix_colincrease.pkl', 'rb'))
    timing_results_shared = pickle.load(open('timing_results_colincrease.pkl', 'rb'))

    indicator_matrix_shared[feature_index, sample_index] = 1
    for i in range(len(methods)):
        timing_results_shared[i, feature_index, sample_index, :] = \
            timing_results[i, feature_index, sample_index, :]

    print('Saving data')
    pickle.dump(indicator_matrix_shared, open('indicator_matrix_colincrease.pkl', 'wb'))
    pickle.dump(timing_results_shared, open('timing_results_colincrease.pkl', 'wb'))
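
    # Optional summary (a small sketch): print the mean run time per method for the
    # grid point just completed; skipped methods show up with the 9999 sentinel.
    if include_timing and verbosity_level > 0:
        for i, (methodname, _) in enumerate(methods):
            mean_time = timing_results[i, feature_index, sample_index, :].mean()
            print("{:s}: mean of {:.2f} seconds over {:d} runs".format(methodname, mean_time, reruns))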