
Commit 13007ac

Add runtime analysis scripts used for the benchmark section of the paper
The scripts used to obtain the results shown in the benchmark section of the paper were added, as well as a brief README file showing the required setup to run the tests.

new file: paper/benchmark/README.md
new file: paper/benchmark/runtime_analysis_colincrease.py
new file: paper/benchmark/runtime_analysis_rowincrease.py
new file: paper/benchmark/runtime_analysis_symmetric.py
1 parent 652e774 commit 13007ac

4 files changed: +722 additions, -0 deletions

paper/benchmark/README.md

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
To run the tests shown in the Benchmark section of the paper, the following additional requirements have to be fulfilled:
1. Install R
2. Install the following R packages
   1. 'ade4'
   2. 'sp'
   3. 'adegraphics'
3. Install rpy2
4. Set the path to the R installation (a short example is sketched below)
   - import os
   - os.environ['R_HOME'] = '/Your/path/to/lib/R'
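
For step 4, a minimal sketch of how this could look at the top of a benchmark script (assuming a Unix-style R installation; the path below is only a placeholder, and R_HOME is typically set before rpy2 is first imported):

import os
os.environ['R_HOME'] = '/Your/path/to/lib/R'  # placeholder; point this to your R installation

# import rpy2 only after R_HOME is set and check that the required R packages load
from rpy2.robjects.packages import importr
for pkg in ('ade4', 'sp', 'adegraphics'):
    importr(pkg)

If the three importr calls succeed, the setup for the benchmark scripts should be complete.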
paper/benchmark/runtime_analysis_colincrease.py

Lines changed: 237 additions & 0 deletions
@@ -0,0 +1,237 @@
from mbpls.mbpls import MBPLS
import time
import rpy2

# define Parameters
rand_seed = 25

# initialize
from matplotlib import pyplot as plt
import numpy as np
import pickle
import pandas as pd

np.seterr(all='raise')
np.random.seed(rand_seed)  # seed the random number generator so the generated datasets are reproducible


def mbpls_ade4(data, num_comp=20):
    # Run the MB-PLS implementation of the R package ade4 via rpy2 and return the
    # timestamp taken right before the R code is executed.
    from rpy2 import robjects
    from rpy2.robjects import pandas2ri
    import time

    x1, x2, y = data

    pandas2ri.activate()  # activate easy conversion between R and pandas dataframes

    # put the dataframes into the R global environment
    robjects.globalenv['x1'] = pandas2ri.py2ri_pandasdataframe(x1)
    robjects.globalenv['x2'] = pandas2ri.py2ri_pandasdataframe(x2)
    robjects.globalenv['ref'] = pandas2ri.py2ri_pandasdataframe(y)

    start_time = time.time()
    # execute R code using the global environment variables defined above
    robjects.r(
        '''
        library(ade4)
        library(adegraphics)

        dudiY.act <- dudi.pca(ref, center = TRUE, scale = TRUE, scannf = FALSE, nf = 1)
        ktabX.act <- ktab.list.df(list(mainPerformance = x1, mainInput = x2))
        resmbpls.act <- mbpls(dudiY.act, ktabX.act, scale = TRUE,
                              option = "none", scannf = FALSE, nf = ''' + str(num_comp) + ''')

        bip <- resmbpls.act$bip
        ''')

    return start_time

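# Note: mbpls_ade4 returns the start timestamp (taken just before the R code runs) rather than the
# elapsed time, so the caller computes the elapsed time itself, as the benchmark loop below does.
# A minimal, hypothetical call on small random dataframes would look like:
#
#     x1 = pd.DataFrame(np.random.rand(50, 10))
#     x2 = pd.DataFrame(np.random.rand(50, 10))
#     y = pd.DataFrame(np.random.rand(50, 3))
#     elapsed = time.time() - mbpls_ade4([x1, x2, y], num_comp=3)
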
###################################################################
# General settings
###################################################################

# How many runs?
reruns = 3

number_components = 20

feature_size_y = 10

standardize = True

# Should all parameters be calculated?
calc_all = False

# Verbosity
# 0 - show nothing
# 1 - show datasets
# 2 - show methods
# 3 - show run information
# 4 - show time of each run
verbosity_level = 4  # higher verbosity levels include the lower ones

# Calculation measurements
include_timing = True

# Datasets
x_sizes = np.arange(1000, 1001, 1000)   # sample sizes (rows); here a single size of 1000
y_sizes = np.arange(100, 20001, 100)    # total number of X features (columns), split over the two X blocks

methods = [
    ('Ade4', 'Ade4'),
    ('Unipals', 'UNIPALS'),
    ('Kernel', 'KERNEL'),
    ('Nipals', 'NIPALS'),
    ('Simpls', 'SIMPLS')
]

###################################################################
# General settings - END
###################################################################
class bcolors:
    # ANSI escape codes for colored console output
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

# Arrays to save timing
if include_timing:
    timing_results = np.zeros(shape=(len(methods), len(y_sizes), len(x_sizes), reruns))
    indicator_matrix = np.zeros(shape=(len(y_sizes), len(x_sizes)))

initiate_files = True
if initiate_files:
    # write fresh (empty) result files; existing progress in the pickle files is overwritten
    pickle.dump(indicator_matrix, open('indicator_matrix_colincrease.pkl', 'wb'))
    pickle.dump(timing_results, open('timing_results_colincrease.pkl', 'wb'))

# Derive shapes from datasets
datasets_shape = []

finished = False

# Load the indicator matrix that shows what has been done already
# and lock the chosen entry first
while not finished:
    while True:
        print('Loading indicator matrix data')
        indicator_matrix = pickle.load(open('indicator_matrix_colincrease.pkl', 'rb'))
        try:
            # pick the first combination that has not been processed yet
            # (0 = open, 2 = in progress, 1 = done) and mark it as in progress
            index = np.argwhere(indicator_matrix == 0)
            index = index[0]
            feature_index = index[0]
            sample_index = index[1]
            indicator_matrix[feature_index, sample_index] = 2
            pickle.dump(indicator_matrix, open('indicator_matrix_colincrease.pkl', 'wb'))
        except (ValueError, IndexError):
            print("Couldn't find indexes that are not finished")
            finished = True
            break
        print('Saved indicator matrix data')
        break

    if finished:
        break  # no open dataset-size combination left

    sample_size = x_sizes[sample_index]
    feature_size = y_sizes[feature_index]

    datasets_shape.append('X-shape: ' + str(sample_size) + 'x' + str(feature_size) +
                          '\n Y-shape: ' + str(sample_size) + 'x' + str(feature_size_y))

    print('Loading data')
    timing_results_tmp = pickle.load(open('timing_results_colincrease.pkl', 'rb'))

    for run in range(reruns):
        if verbosity_level > 2:
            print(bcolors.OKGREEN, "Starting run {:d} of {:d}".format(run + 1, reruns), bcolors.ENDC)

        if verbosity_level > 0:
            print(bcolors.OKGREEN, "Creating dataset size {:d}x{:d}"
                  .format(sample_size, feature_size), bcolors.ENDC)

        dataset_x_1 = np.random.randint(1, 100, size=(sample_size, feature_size // 2))
        dataset_x_2 = np.random.randint(1, 100, size=(sample_size, feature_size // 2))
        dataset_y = np.random.randint(1, 100, size=(sample_size, feature_size_y))

        if verbosity_level > 0:
            print(bcolors.OKGREEN, "Starting calculations for dataset size {:d}x{:d}"
                  .format(sample_size, feature_size), bcolors.ENDC)

        for j, (methodname, method) in enumerate(methods):
            if methodname == 'Ade4':
                class_call = mbpls_ade4
            else:
                class_call = MBPLS(n_components=number_components, method=method, standardize=standardize,
                                   calc_all=calc_all)
            # Check whether the method still runs in reasonable time; skip it once its mean
            # runtime has exceeded 630 seconds
            if timing_results_tmp.mean(axis=3)[j].max() > 630:
                if include_timing:
                    timing_results[j, feature_index, sample_index, run] = 9999
                if verbosity_level > 1:
                    print(bcolors.OKGREEN, "Skipping calculation for the {:s} method, because it has become too slow"
                          .format(methodname), bcolors.ENDC)
            else:
                if verbosity_level > 1:
                    print(bcolors.OKGREEN, "Calculating {:s} method".format(methodname), bcolors.ENDC)
                if include_timing:
                    timing_results[j, feature_index, sample_index, run] = time.time()
                # The ade4 package might run into convergence problems, which result in numerical errors
                # that crash the run. When this happens, the calculation is restarted with new random matrices.
                if method == 'Ade4':
                    count = 0
                    while True:
                        try:
                            results = class_call([pd.DataFrame(dataset_x_1), pd.DataFrame(dataset_x_2),
                                                  pd.DataFrame(dataset_y)], number_components)
                            if include_timing:
                                timing_results[j, feature_index, sample_index, run] = time.time() - results
                            if verbosity_level == 4:
                                print("Task finished in {:.2f} seconds".format(
                                    timing_results[j, feature_index, sample_index, run]))
                            break
                        except Exception:
                            count += 1
                            if count > 100:
                                print("Aborting. The Ade4 method has failed more than 100 times on the same task.")
                                timing_results[j, feature_index, sample_index, run] = 9999
                                break
                            else:
                                print("Ade4 failed. Trying again with new random data.")
                                dataset_x_1 = np.random.randint(1, 100, size=(sample_size, feature_size // 2))
                                dataset_x_2 = np.random.randint(1, 100, size=(sample_size, feature_size // 2))
                                dataset_y = np.random.randint(1, 100, size=(sample_size, feature_size_y))
                else:
                    results = class_call.fit([dataset_x_1, dataset_x_2], dataset_y)
                    if include_timing:
                        timing_results[j, feature_index, sample_index, run] = \
                            time.time() - timing_results[j, feature_index, sample_index, run]

                    if verbosity_level == 4:
                        print("Task finished in {:.2f} seconds".format(
                            timing_results[j, feature_index, sample_index, run]))

        if verbosity_level > 2:
            print(bcolors.OKGREEN, "Finished run {:d} of {:d}".format(run + 1, reruns), bcolors.ENDC)

    # write the results for this dataset-size combination back to the shared pickle files
    print('Loading data')
    indicator_matrix_shared = pickle.load(open('indicator_matrix_colincrease.pkl', 'rb'))
    timing_results_shared = pickle.load(open('timing_results_colincrease.pkl', 'rb'))

    indicator_matrix_shared[feature_index, sample_index] = 1
    for i in range(len(methods)):
        timing_results_shared[i, feature_index, sample_index, :] = \
            timing_results[i, feature_index, sample_index, :]

    print('Saving data')
    pickle.dump(indicator_matrix_shared, open('indicator_matrix_colincrease.pkl', 'wb'))
    pickle.dump(timing_results_shared, open('timing_results_colincrease.pkl', 'wb'))
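
For later analysis of the benchmark results (e.g. plotting), the pickled arrays written by this script can be reloaded; a minimal, hypothetical example using the file names from the script above:

import pickle

with open('timing_results_colincrease.pkl', 'rb') as f:
    timing = pickle.load(f)  # NumPy array of shape (methods, y_sizes, x_sizes, reruns)

mean_times = timing.mean(axis=3)  # average runtime over the reruns, per method and dataset size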
