diff --git a/Pilot1/NT3/make_csv.py b/Pilot1/NT3/make_csv.py
new file mode 100644
index 00000000..4b6b1f79
--- /dev/null
+++ b/Pilot1/NT3/make_csv.py
@@ -0,0 +1,62 @@
+import pandas as pd
+import pickle
+import argparse
+import glob, os
+from pathlib import Path
+import matplotlib.pyplot as plt
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-f", type=str, help="Run folder")
+    parser.add_argument("-c1", type=str, help="cluster 1 name")
+    parser.add_argument("-c2", type=str, help="cluster 2 name")
+    args = parser.parse_args()
+    return args
+
+def main():
+    args = get_args()
+    l1 = []
+    l2 = []
+    runs = glob.glob(args.f + "/EXP000/*/")
+    print(runs)
+    for r in runs:
+        print(r)
+        global_data = pd.read_csv(r + "training.log")
+        val_abs = global_data['val_abstention'].iloc[-1]
+        val_abs_acc = global_data['val_abstention_acc'].iloc[-1]
+        if os.path.exists(r + "cluster_trace.pkl"):
+            cluster_data = pickle.load(open(r + "cluster_trace.pkl", "rb"))
+        else:
+            continue
+        polluted_abs = cluster_data['Abs polluted']
+        val_abs_cluster = cluster_data['Abs val cluster']
+        val_abs_acc_cluster = cluster_data['Abs val acc']
+        ratio = float(r[-8:-5])  # noise fraction encoded in the run directory name
+        if args.c1 in r:
+            l1.append([ratio, val_abs, val_abs_acc, val_abs_cluster, val_abs_acc_cluster, polluted_abs])
+        elif args.c2 in r:
+            l2.append([ratio, val_abs, val_abs_acc, val_abs_cluster, val_abs_acc_cluster, polluted_abs])
+
+    df1 = pd.DataFrame(l1, columns=['Noise Fraction', 'Val Abs', 'Val Abs Acc', 'Val Abs Cluster', 'Val Abs Acc Cluster', 'Polluted Abs'])
+    df2 = pd.DataFrame(l2, columns=['Noise Fraction', 'Val Abs', 'Val Abs Acc', 'Val Abs Cluster', 'Val Abs Acc Cluster', 'Polluted Abs'])
+    print(df1)
+    df1.to_csv("cluster_1.csv")
+    df2.to_csv("cluster_2.csv")
+    plt.plot(df1['Noise Fraction'], df1['Val Abs'], marker='o', label='Val Abs')
+    plt.plot(df1['Noise Fraction'], df1['Val Abs Acc'], marker='o', label='Val Abs Acc')
+    plt.plot(df1['Noise Fraction'], df1['Val Abs Cluster'], marker='o', label='Val Abs Cluster')
+    plt.plot(df1['Noise Fraction'], df1['Val Abs Acc Cluster'], marker='o', label='Val Abs Acc Cluster')
+    plt.xlabel("Noise fraction")
+    plt.legend()
+    plt.savefig('c1.png')
+
+    plt.figure()  # start a new figure so c2.png does not also contain the c1 curves
+    plt.plot(df2['Noise Fraction'], df2['Val Abs'], marker='o', label='Val Abs')
+    plt.plot(df2['Noise Fraction'], df2['Val Abs Acc'], marker='o', label='Val Abs Acc')
+    plt.plot(df2['Noise Fraction'], df2['Val Abs Cluster'], marker='o', label='Val Abs Cluster')
+    plt.plot(df2['Noise Fraction'], df2['Val Abs Acc Cluster'], marker='o', label='Val Abs Acc Cluster')
+    plt.xlabel("Noise Fraction")
+    plt.legend()
+    plt.savefig('c2.png')
+if __name__ == "__main__":
+    main()
diff --git a/Pilot1/NT3/nt3_abstention_keras2.py b/Pilot1/NT3/nt3_abstention_keras2.py
index 7563d258..ff66aa4b 100644
--- a/Pilot1/NT3/nt3_abstention_keras2.py
+++ b/Pilot1/NT3/nt3_abstention_keras2.py
@@ -16,6 +16,7 @@
 import nt3 as bmk
 import candle
+import pickle
 
 additional_definitions = abs_definitions
 
@@ -51,7 +52,13 @@ def initialize_parameters(default_model='nt3_noise_model.txt'):
     gParameters = candle.finalize_parameters(nt3Bmk)
 
     return gParameters
-
+
+def load_data_cf(cf_path):
+    # Pickle file holds the train/test split and the counterfactual (cf) index info
+    print("Loading data...")
+    X_train, X_test, Y_train, Y_test, polluted_inds, cluster_inds = pickle.load(open(cf_path, 'rb'))
+    print('done')
+    return X_train, Y_train, X_test, Y_test, polluted_inds, cluster_inds
 
 def load_data(train_path, test_path, gParameters):
 
@@ -86,6 +93,38 @@ def load_data(train_path, test_path, gParameters):
     return X_train, Y_train, X_test, Y_test
+def evaluate_cf(model, nb_classes, output_dir, X_train, X_test, Y_train, Y_test, polluted_inds, cluster_inds, gParameters):
+    if len(polluted_inds) > 0:
+        y_pred = model.predict(X_test)
+        abstain_inds = []
+        for i in range(y_pred.shape[0]):
+            if np.argmax(y_pred[i]) == nb_classes:
+                abstain_inds.append(i)
+
+        # Cluster indices and polluted indices are with respect to the entire train + test dataset,
+        # whereas y_pred only covers the test dataset, so add an offset for correct indexing
+        offset_testset = Y_train.shape[0]
+        abstain_inds = [i + offset_testset for i in abstain_inds]
+        polluted_percentage = np.sum([el in polluted_inds for el in abstain_inds]) / np.max([len(abstain_inds), 1])
+        print("Percentage of abstained samples that were polluted {}".format(polluted_percentage))
+
+        cluster_inds_test = list(filter(lambda ind: ind >= offset_testset, cluster_inds))
+        cluster_inds_test_abstain = [el in abstain_inds for el in cluster_inds_test]
+        cluster_percentage = np.sum(cluster_inds_test_abstain) / len(cluster_inds_test)
+        print("Percentage of cluster (in test set) that was abstained {}".format(cluster_percentage))
+
+        unabstain_inds = []
+        for i in range(y_pred.shape[0]):
+            if np.argmax(y_pred[i]) != nb_classes and (i + offset_testset in cluster_inds_test):
+                unabstain_inds.append(i)
+        # Make sure the number of unabstained indices in the cluster test set plus the number of abstained
+        # indices in the cluster test set equals the number of cluster indices in the test set
+        assert len(unabstain_inds) + np.sum(cluster_inds_test_abstain) == len(cluster_inds_test)
+        score_cluster = 1 if len(unabstain_inds) == 0 else model.evaluate(X_test[unabstain_inds], Y_test[unabstain_inds])[1]
+        print("Accuracy of unabstained cluster {}".format(score_cluster))
+        if gParameters['noise_save_cf']:
+            pickle.dump({'Abs polluted': polluted_percentage, 'Abs val cluster': cluster_percentage, 'Abs val acc': score_cluster}, open("{}/cluster_trace.pkl".format(output_dir), "wb"))
 
 
 def run(gParameters):
 
@@ -96,7 +135,10 @@ def run(gParameters):
     train_file = candle.get_file(file_train, url + file_train, cache_subdir='Pilot1')
     test_file = candle.get_file(file_test, url + file_test, cache_subdir='Pilot1')
 
-    X_train, Y_train, X_test, Y_test = load_data(train_file, test_file, gParameters)
+    if gParameters['noise_cf'] is not None:
+        X_train, Y_train, X_test, Y_test, polluted_inds, cluster_inds = load_data_cf(gParameters['noise_cf'])
+    else:
+        X_train, Y_train, X_test, Y_test = load_data(train_file, test_file, gParameters)
 
     # only training set has noise
     X_train, Y_train = candle.add_noise(X_train, Y_train, gParameters)
 
@@ -274,6 +316,8 @@ def run(gParameters):
 
     score = model.evaluate(X_test, Y_test, verbose=0)
 
+    if gParameters['noise_cf'] is not None:
+        evaluate_cf(model, nb_classes, output_dir, X_train, X_test, Y_train, Y_test, polluted_inds, cluster_inds, gParameters)
     alpha_trace = open(output_dir + "/alpha_trace", "w+")
     for alpha in abstention_cbk.alphavalues:
         alpha_trace.write(str(alpha) + '\n')
diff --git a/Pilot1/NT3/nt3_baseline_keras2.py b/Pilot1/NT3/nt3_baseline_keras2.py
index 3eceba0a..4cb17472 100644
--- a/Pilot1/NT3/nt3_baseline_keras2.py
+++ b/Pilot1/NT3/nt3_baseline_keras2.py
@@ -15,9 +15,9 @@
 import nt3 as bmk
 import candle
+import pickle
 
-
-def initialize_parameters(default_model='nt3_default_model.txt'):
+def initialize_parameters(default_model='nt3_noise_model.txt'):
 
     # Build benchmark object
     nt3Bmk = bmk.BenchmarkNT3(
@@ -238,6 +238,10 @@ def run(gParameters):
     print("json %s: %.2f%%" % (loaded_model_json.metrics_names[1], score_json[1] * 100))
+
+    if gParameters['noise_save_cf']:
+        model.save('{}/{}.autosave.model'.format(output_dir, model_name))
+        pickle.dump([X_train, X_test, Y_train, Y_test], open('{}/{}.autosave.data.pkl'.format(output_dir, model_name), "wb"))
 
     return history
diff --git a/Pilot1/NT3/nt3_cf/README.md b/Pilot1/NT3/nt3_cf/README.md
new file mode 100644
index 00000000..284a1bb3
--- /dev/null
+++ b/Pilot1/NT3/nt3_cf/README.md
@@ -0,0 +1,30 @@
+NT3 with counterfactuals:
+Code to generate counterfactual examples given an input model and a dataset in pkl format. \
+It thresholds and clusters the counterfactuals, then injects the resulting noise into the dataset. \
+Workflow:
+1) Generate counterfactuals using cf_nb.py
+```
+python cf_nb.py
+```
+
+2) Create threshold pickle files using threshold.py (provide a threshold value between 0 and 1, see --help)
+```
+python threshold.py -d ../nt3.autosave.data.pkl -c cf_redo_all_reformat.pkl -t 0.9 -o threshold_0.9.pkl
+```
+
+3) Cluster the threshold files using gen_clusters.py
+```
+python gen_clusters.py -t_value 0.9 -t threshold_0.9.pkl
+```
+
+4) Inject noise into the dataset using inject_noise.py (provide a scale value to modify the amplitude of the noise, see --help)
+```
+python inject_noise.py -t threshold_0.9.pkl -c1 cf_class_0_cluster0.pkl -c2 cf_class_1_cluster0.pkl -scale 1.0 -r True -d ../nt3.autosave.data.pkl -f cf_failed_inds.pkl -o noise_data
+```
+
+Abstention with counterfactuals:
+Code is located in abstention/
+Workflow:
+1) Run the abstention model with nt3_abstention_keras2_cf.py, passing in a pickle file containing X (with noise) and y (the output of step 4 above)
+2) For a sweep over noise fractions, use run_abstention_sweep.sh
+3) To collect metrics (abstention, cluster abstention), run make_csv.py
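+
+For reference, one end-to-end pass might look like the following sketch. Output file names such as
+threshold_0.9.pkl, cf_class_0_cluster0.pkl, and the noise_data/*.pkl files are illustrative and depend
+on the threshold, cluster, and scale values you choose; the last command is run from abstention/ as in
+run_abstention_sweep.sh:
+```
+python cf_nb.py
+python threshold.py -d ../nt3.autosave.data.pkl -c cf_redo_all_reformat.pkl -t 0.9 -o threshold_0.9.pkl
+python gen_clusters.py -t_value 0.9 -t threshold_0.9.pkl
+python inject_noise.py -t threshold_0.9.pkl -c1 cf_class_0_cluster0.pkl -c2 cf_class_1_cluster0.pkl -scale 1.0 -r True -d ../nt3.autosave.data.pkl -f cf_failed_inds.pkl -o noise_data
+python nt3_abstention_keras2_cf.py --cf_noise ../noise_data/nt3.data.threshold.scale_1.0_class_0_cluster0.noise_0.5.pkl --output_dir cf_sweep --run_id RUN000 --epochs 100
+```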
diff --git a/Pilot1/NT3/nt3_cf/abstention/make_csv.py b/Pilot1/NT3/nt3_cf/abstention/make_csv.py
new file mode 100644
index 00000000..6ee2d98a
--- /dev/null
+++ b/Pilot1/NT3/nt3_cf/abstention/make_csv.py
@@ -0,0 +1,41 @@
+import pandas as pd
+import pickle
+import argparse
+import glob, os
+from pathlib import Path
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-f", type=str, help="Run folder")
+    parser.add_argument("-c1", type=str, help="cluster 1 name")
+    parser.add_argument("-c2", type=str, help="cluster 2 name")
+    args = parser.parse_args()
+    return args
+
+def main():
+    args = get_args()
+    l1 = []
+    l2 = []
+    runs = glob.glob(args.f + "/EXP000/*/")
+    print(runs)
+    for r in runs:
+        global_data = pd.read_csv(r + "training.log")
+        val_abs = global_data['val_abstention'].iloc[-1]
+        val_abs_acc = global_data['val_abstention_acc'].iloc[-1]
+        cluster_data = pickle.load(open(r + "cluster_trace.pkl", "rb"))
+        polluted_abs = cluster_data['Abs polluted']
+        val_abs_cluster = cluster_data['Abs val cluster']
+        val_abs_acc_cluster = cluster_data['Abs val acc']
+        ratio = float(r[-4:-1])  # noise fraction encoded in the run directory name
+        if args.c1 in r:
+            l1.append([ratio, val_abs, val_abs_acc, val_abs_cluster, val_abs_acc_cluster, polluted_abs])
+        elif args.c2 in r:
+            l2.append([ratio, val_abs, val_abs_acc, val_abs_cluster, val_abs_acc_cluster, polluted_abs])
+
+    df1 = pd.DataFrame(l1, columns=['Noise Fraction', 'Val Abs', 'Val Abs Acc', 'Val Abs Cluster', 'Val Abs Acc Cluster', 'Polluted Abs'])
+    df2 = pd.DataFrame(l2, columns=['Noise Fraction', 'Val Abs', 'Val Abs Acc', 'Val Abs Cluster', 'Val Abs Acc Cluster', 'Polluted Abs'])
+    print(df1)
+    df1.to_csv("cluster_1.csv")
+    df2.to_csv("cluster_2.csv")
+if __name__ == "__main__":
+    main()
diff --git a/Pilot1/NT3/nt3_cf/abstention/run_abstention_sweep.sh b/Pilot1/NT3/nt3_cf/abstention/run_abstention_sweep.sh
new file mode 100755
index 00000000..b7a9f611
--- /dev/null
+++ b/Pilot1/NT3/nt3_cf/abstention/run_abstention_sweep.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+for filename in /vol/ml/shahashka/xai-geom/nt3/nt3.data*; do
+    python nt3_abstention_keras2_cf.py --cf_noise $filename --output_dir cf_sweep_0906 --run_id ${filename:40:21} --epochs 100
+    #cp cf_sweep_0902/EXP000/RUN000/training.log ${filename}_training_0902.log
+done
diff --git a/Pilot1/NT3/nt3_cf/analyze.py b/Pilot1/NT3/nt3_cf/analyze.py
new file mode 100644
index 00000000..6e62783b
--- /dev/null
+++ b/Pilot1/NT3/nt3_cf/analyze.py
@@ -0,0 +1,33 @@
+# Script to analyze perturbations by cluster
+# Plot the perturbations by cluster
+# Plot the perturbation centroids
+
+import os
+import pickle
+import matplotlib.pyplot as plt
+import numpy as np
+directory = 'clusters_0911_0.5/'
+orig_dataset = pickle.load(open("nt3.autosave.data.pkl", 'rb'))[0]
+cf_dataset = pickle.load(open("threshold_0905.pkl", 'rb'))['perturbation vector']
+for filename in os.listdir(directory):
+    if filename.startswith("cf_class_0") or filename.startswith("cf_class_1"):
+        data = pickle.load(open(os.path.join(directory, filename), 'rb'))
+        x_range = np.arange(len(data['centroid perturb vector']))
+        ind_in_cluster = data['sample indices in this cluster'][0:5]
+        fig, ax = plt.subplots(3, figsize=(20, 15))
+        fig.suptitle("Perturbation Vectors for counterfactual class 1, cluster 1", fontsize=25)
+        for i, ax_i in zip(ind_in_cluster, ax):
+            d = cf_dataset[i]
+            ax_i.plot(x_range, d, label='perturbation vector')
+            ax_i.plot(x_range, data['centroid perturb vector'], label='centroid')
+            #ax_i.axhline(y=0.5*np.max(np.abs(d)), color='r', linestyle='-')
+            #ax_i.axhline(y=-0.5*np.max(np.abs(d)), color='r', linestyle='-')
+            ax_i.axvline(x=9603, color='r', linestyle='-', linewidth=5, alpha=0.3)
+
+            ax_i.set_title("sample {}".format(i))
+            ax_i.legend()
+        fig.supxlabel("Feature index", fontsize=18)
+        plt.savefig("centroids_{}.png".format(filename))
+
+    else:
+        continue
diff --git a/Pilot1/NT3/nt3_cf/cf_nb.py b/Pilot1/NT3/nt3_cf/cf_nb.py
new file mode 100644
index 00000000..2cd187b3
--- /dev/null
+++ b/Pilot1/NT3/nt3_cf/cf_nb.py
@@ -0,0 +1,56 @@
+import tensorflow as tf
+tf.get_logger().setLevel(40)  # suppress deprecation messages
+tf.compat.v1.disable_v2_behavior()  # disable TF2 behaviour as alibi code still relies on TF1 constructs
+from tensorflow.keras.models import Model, load_model
+import matplotlib.pyplot as plt
+import numpy as np
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = "1"
+from time import time
+from alibi.explainers import CounterFactual, CounterFactualProto
+print('TF version: ', tf.__version__)
+print('Eager execution enabled: ', tf.executing_eagerly())  # False
+print(tf.test.is_gpu_available())
+import pickle
+model_nt3 = tf.keras.models.load_model('../nt3.autosave.model')
+with open('../nt3.autosave.data.pkl', 'rb') as pickle_file:
+    X_train, X_test, Y_train, Y_test = pickle.load(pickle_file)
+
+shape_cf = (1,) + X_train.shape[1:]
+print(shape_cf)
+target_proba = 0.9
+tol = 0.1  # want counterfactuals with p(class) > 0.90
+target_class = 'other'  # any class other than the original prediction will do
+max_iter = 1000
+lam_init = 1e-1
+max_lam_steps = 20
+learning_rate_init = 0.1
+feature_range = (0, 1)
+cf = CounterFactual(model_nt3, shape=shape_cf, target_proba=target_proba, tol=tol,
+                    target_class=target_class, max_iter=max_iter, lam_init=lam_init,
+                    max_lam_steps=max_lam_steps, learning_rate_init=learning_rate_init,
+                    feature_range=feature_range)
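+# Note: the loop below sweeps every sample in the combined train + test array, asks the alibi
+# CounterFactual explainer for a counterfactual, and checkpoints the accumulated results to
+# cf_{i}.pkl every 100 samples; indices whose search fails are written to cf_failed_inds.pkl
+# so that inject_noise.py can account for the missing entries.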
+shape = X_train[0].shape[0] +results=[] +failed_inds = [] +X = np.concatenate([X_train,X_test]) + +for i in np.arange(0,X.shape[0]): + print(i) + x_sample=X[i:i+1] + print(x_sample.shape) + start = time() + try: + explanation = cf.explain(x_sample) + print('Counterfactual prediction: {}, {}'.format(explanation.cf['class'], explanation.cf['proba'])) + print("Actual prediction: {}".format(model_nt3.predict(x_sample))) + results.append([i, explanation.cf['X'],explanation.cf['class'], explanation.cf['proba']]) + test = model_nt3.predict(explanation.cf['X']) + print(test, explanation.cf['proba'], explanation.cf['class']) + except: + print("Failed cf generation") + failed_inds.append(i) + if i%100 == 0 and i is not 0: + pickle.dump(results, open("cf_{}.pkl".format(i), "wb")) + results = [] +pickle.dump(failed_inds, open("cf_failed_inds.pkl", "wb")) diff --git a/Pilot1/NT3/nt3_cf/environment.yml b/Pilot1/NT3/nt3_cf/environment.yml new file mode 100644 index 00000000..669ca25e --- /dev/null +++ b/Pilot1/NT3/nt3_cf/environment.yml @@ -0,0 +1,264 @@ +name: xai-geom-tf +channels: + - anaconda + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=1_gnu + - _tflow_select=2.1.0=gpu + - absl-py=0.11.0=py38h578d9bd_0 + - aiohttp=3.7.3=py38h497a2fe_1 + - anyio=2.0.2=py38h578d9bd_4 + - argon2-cffi=20.1.0=py38h497a2fe_2 + - astor=0.8.1=pyh9f0ad1d_0 + - astunparse=1.6.3=pyhd8ed1ab_0 + - async-timeout=3.0.1=py_1000 + - async_generator=1.10=py_0 + - attrs=20.3.0=pyhd3deb0d_0 + - babel=2.9.0=pyhd3deb0d_0 + - backcall=0.2.0=pyh9f0ad1d_0 + - backports=1.0=py_2 + - backports.functools_lru_cache=1.6.1=py_0 + - bleach=3.3.0=pyh44b312d_0 + - blinker=1.4=py_1 + - brotlipy=0.7.0=py38h497a2fe_1001 + - c-ares=1.17.1=h36c2ea0_0 + - ca-certificates=2020.10.14=0 + - cachetools=4.2.1=pyhd8ed1ab_0 + - certifi=2020.6.20=py38_0 + - cffi=1.14.4=py38ha65f79e_1 + - chardet=3.0.4=py38h924ce5b_1008 + - click=7.1.2=pyh9f0ad1d_0 + - cryptography=3.3.1=py38h2b97feb_1 + - cudatoolkit=10.1.243=h036e899_7 + - cudnn=7.6.5.32=hc0a50b0_1 + - cupti=10.1.168=0 + - cycler=0.10.0=py_2 + - dbus=1.13.6=hfdff14a_1 + - decorator=4.4.2=py_0 + - defusedxml=0.6.0=py_0 + - entrypoints=0.3=pyhd8ed1ab_1003 + - expat=2.2.10=h9c3ff4c_0 + - fontconfig=2.13.1=hba837de_1004 + - freetype=2.10.4=h0708190_1 + - gast=0.3.3=py_0 + - gettext=0.19.8.1=h0b5b191_1005 + - glib=2.66.4=hc4f0c31_2 + - glib-tools=2.66.4=hc4f0c31_2 + - google-auth=1.24.0=pyhd3deb0d_0 + - google-auth-oauthlib=0.4.1=py_2 + - google-pasta=0.2.0=pyh8c360ce_0 + - grpcio=1.35.0=py38hdd6454d_0 + - gst-plugins-base=1.14.5=h0935bb2_2 + - gstreamer=1.18.3=h3560a44_0 + - h5py=2.10.0=nompi_py38h7442b35_105 + - hdf5=1.10.6=nompi_h6a2412b_1114 + - icu=68.1=h58526e2_0 + - idna=2.10=pyh9f0ad1d_0 + - importlib-metadata=3.4.0=py38h578d9bd_0 + - importlib_metadata=3.4.0=hd8ed1ab_0 + - intel-openmp=2020.2=254 + - ipykernel=5.3.4=py38h5ca1d4c_0 + - ipython=7.20.0=py38h81c977d_0 + - ipython_genutils=0.2.0=py_1 + - jedi=0.18.0=py38h578d9bd_2 + - jinja2=2.11.3=pyh44b312d_0 + - jpeg=9d=h36c2ea0_0 + - json5=0.9.5=pyh9f0ad1d_0 + - jsonschema=3.2.0=py_2 + - jupyter_client=6.1.11=pyhd8ed1ab_1 + - jupyter_core=4.7.1=py38h578d9bd_0 + - jupyter_server=1.2.3=py38h578d9bd_1 + - jupyterlab=3.0.6=pyhd8ed1ab_0 + - jupyterlab_pygments=0.1.2=pyh9f0ad1d_0 + - jupyterlab_server=2.1.3=pyhd8ed1ab_0 + - keras-preprocessing=1.1.2=pyhd8ed1ab_0 + - kiwisolver=1.3.1=py38h1fd1430_1 + - krb5=1.17.2=h926e7f8_0 + - lcms2=2.11=hcbb858e_1 + - ld_impl_linux-64=2.35.1=hea4e1c9_2 + - 
libblas=3.9.0=7_openblas + - libcblas=3.9.0=7_openblas + - libclang=11.0.1=default_ha53f305_1 + - libcurl=7.71.1=hcdd3856_8 + - libedit=3.1.20191231=he28a2e2_2 + - libev=4.33=h516909a_1 + - libevent=2.1.10=hcdb4288_3 + - libffi=3.3=h58526e2_2 + - libgcc-ng=9.3.0=h2828fa1_18 + - libgfortran-ng=9.3.0=hff62375_18 + - libgfortran5=9.3.0=hff62375_18 + - libglib=2.66.4=h748fe8e_2 + - libgomp=9.3.0=h2828fa1_18 + - libiconv=1.16=h516909a_0 + - liblapack=3.9.0=7_openblas + - libllvm11=11.0.1=hf817b99_0 + - libnghttp2=1.43.0=h812cca2_0 + - libopenblas=0.3.12=pthreads_h4812303_1 + - libpng=1.6.37=h21135ba_2 + - libpq=12.3=h255efa7_3 + - libprotobuf=3.14.0=h780b84a_0 + - libsodium=1.0.18=h36c2ea0_1 + - libssh2=1.9.0=hab1572f_5 + - libstdcxx-ng=9.3.0=h6de172a_18 + - libtiff=4.2.0=hdc55705_0 + - libuuid=2.32.1=h7f98852_1000 + - libwebp-base=1.2.0=h7f98852_0 + - libxcb=1.13=h7f98852_1003 + - libxkbcommon=1.0.3=he3ba5ed_0 + - libxml2=2.9.10=h72842e0_3 + - lz4-c=1.9.3=h9c3ff4c_0 + - markdown=3.3.3=pyh9f0ad1d_0 + - markupsafe=1.1.1=py38h497a2fe_3 + - matplotlib=3.3.4=py38h578d9bd_0 + - matplotlib-base=3.3.4=py38h0efea84_0 + - mistune=0.8.4=py38h497a2fe_1003 + - mkl=2020.2=256 + - multidict=5.1.0=py38h497a2fe_1 + - mysql-common=8.0.22=ha770c72_3 + - mysql-libs=8.0.22=h935591d_3 + - nbclassic=0.2.6=pyhd8ed1ab_0 + - nbclient=0.5.1=py_0 + - nbconvert=6.0.7=py38h578d9bd_3 + - nbformat=5.1.2=pyhd8ed1ab_1 + - ncurses=6.2=h58526e2_4 + - nest-asyncio=1.4.3=pyhd8ed1ab_0 + - ninja=1.10.2=h4bd325d_0 + - notebook=6.2.0=py38h578d9bd_0 + - nspr=4.29=h9c3ff4c_1 + - nss=3.61=hb5efdd6_0 + - numpy=1.20.0=py38h18fd61f_0 + - oauthlib=3.0.1=py_0 + - olefile=0.46=pyh9f0ad1d_1 + - openssl=1.1.1i=h7f98852_0 + - opt_einsum=3.3.0=py_0 + - packaging=20.8=pyhd3deb0d_0 + - pandoc=2.11.4=h7f98852_0 + - pandocfilters=1.4.2=py_1 + - parso=0.8.1=pyhd8ed1ab_0 + - pcre=8.44=he1b5a44_0 + - pexpect=4.8.0=pyh9f0ad1d_2 + - pickleshare=0.7.5=py_1003 + - pillow=8.1.0=py38h357d4e7_1 + - pip=21.0.1=pyhd8ed1ab_0 + - prometheus_client=0.9.0=pyhd3deb0d_0 + - prompt-toolkit=3.0.14=pyha770c72_0 + - protobuf=3.14.0=py38h709712a_1 + - pthread-stubs=0.4=h36c2ea0_1001 + - ptyprocess=0.7.0=pyhd3deb0d_0 + - pyasn1=0.4.8=py_0 + - pyasn1-modules=0.2.7=py_0 + - pycparser=2.20=pyh9f0ad1d_2 + - pygments=2.7.4=pyhd8ed1ab_0 + - pyjwt=2.0.1=pyhd8ed1ab_0 + - pyopenssl=20.0.1=pyhd8ed1ab_0 + - pyparsing=2.4.7=pyh9f0ad1d_0 + - pyqt=5.12.3=py38h578d9bd_7 + - pyqt-impl=5.12.3=py38h7400c14_7 + - pyqt5-sip=4.19.18=py38h709712a_7 + - pyqtchart=5.12=py38h7400c14_7 + - pyqtwebengine=5.12.1=py38h7400c14_7 + - pyrsistent=0.17.3=py38h497a2fe_2 + - pysocks=1.7.1=py38h578d9bd_3 + - python=3.8.6=hffdb5ce_5_cpython + - python-dateutil=2.8.1=py_0 + - python_abi=3.8=1_cp38 + - pytz=2021.1=pyhd8ed1ab_0 + - pyzmq=22.0.1=py38h3d7ac18_0 + - qt=5.12.9=h9d6b050_2 + - readline=8.0=he28a2e2_2 + - requests=2.25.1=pyhd3deb0d_0 + - requests-oauthlib=1.3.0=pyh9f0ad1d_0 + - rsa=4.7=pyhd3deb0d_0 + - send2trash=1.5.0=py_0 + - setuptools=49.6.0=py38h578d9bd_3 + - sip=4.19.13=py38he6710b0_0 + - six=1.15.0=pyh9f0ad1d_0 + - sniffio=1.2.0=py38h578d9bd_1 + - sqlite=3.34.0=h74cdb3f_0 + - tensorboard-plugin-wit=1.8.0=pyh44b312d_0 + - tensorflow=2.2.0=gpu_py38hb782248_0 + - tensorflow-base=2.2.0=gpu_py38h83e3d50_0 + - tensorflow-gpu=2.2.0=h0d30ee6_0 + - termcolor=1.1.0=py_2 + - terminado=0.9.2=py38h578d9bd_0 + - testpath=0.4.4=py_0 + - tk=8.6.10=h21135ba_1 + - tornado=6.1=py38h497a2fe_1 + - traitlets=5.0.5=py_0 + - typing-extensions=3.7.4.3=0 + - typing_extensions=3.7.4.3=py_0 + - urllib3=1.26.3=pyhd8ed1ab_0 + - 
wcwidth=0.2.5=pyh9f0ad1d_2 + - webencodings=0.5.1=py_1 + - werkzeug=1.0.1=pyh9f0ad1d_0 + - wheel=0.36.2=pyhd3deb0d_0 + - wrapt=1.12.1=py38h497a2fe_3 + - xorg-libxau=1.0.9=h7f98852_0 + - xorg-libxdmcp=1.1.3=h7f98852_0 + - xz=5.2.5=h516909a_1 + - yarl=1.6.3=py38h497a2fe_1 + - zeromq=4.3.3=h58526e2_3 + - zipp=3.4.0=py_0 + - zlib=1.2.11=h516909a_1010 + - zstd=1.4.8=ha95c52a_1 + - pip: + - alibi==0.5.5 + - altair==4.1.0 + - astropy==4.2 + - beautifulsoup4==4.9.3 + - blis==0.7.4 + - catalogue==2.0.1 + - click-plugins==1.1.1 + - cligj==0.7.1 + - cloudpickle==1.6.0 + - cymem==2.0.5 + - descartes==1.1.0 + - eli5==0.11.0 + - fiona==1.8.18 + - geopandas==0.8.2 + - imageio==2.9.0 + - joblib==1.0.0 + - keras==2.4.3 + - llvmlite==0.35.0 + - munch==2.5.0 + - murmurhash==1.0.5 + - networkx==2.5 + - numba==0.52.0 + - opt-einsum==3.3.0 + - pandas==1.2.1 + - pathy==0.3.4 + - patsy==0.5.1 + - preshed==3.0.5 + - pydantic==1.7.3 + - pyerfa==1.7.1.1 + - pyproj==3.0.0.post1 + - python-graphviz==0.16 + - pywavelets==1.1.1 + - pyyaml==5.4.1 + - scikit-image==0.18.1 + - scikit-learn==0.24.1 + - scipy==1.4.1 + - shap==0.38.1 + - shapely==1.7.1 + - slicer==0.0.7 + - smart-open==3.0.0 + - soupsieve==2.1 + - spacy==3.0.0 + - spacy-legacy==3.0.1 + - spacy-lookups-data==1.0.0 + - srsly==2.4.0 + - statsmodels==0.12.2 + - tabulate==0.8.7 + - tensorboard==2.2.2 + - tensorflow-estimator==2.2.0 + - thinc==8.0.1 + - threadpoolctl==2.1.0 + - tifffile==2021.1.14 + - toolz==0.11.1 + - tqdm==4.56.0 + - typer==0.3.2 + - wasabi==0.8.2 +prefix: /vol/ml/shahashka/anaconda3/envs/xai-geom-tf + diff --git a/Pilot1/NT3/nt3_cf/gen_clusters.py b/Pilot1/NT3/nt3_cf/gen_clusters.py new file mode 100644 index 00000000..2dcd36a7 --- /dev/null +++ b/Pilot1/NT3/nt3_cf/gen_clusters.py @@ -0,0 +1,113 @@ + +import numpy as np +import pickle +import matplotlib.pyplot as plt +from sklearn.cluster import KMeans +from sklearn.decomposition import PCA +from sklearn.metrics import silhouette_score +import argparse + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("-t", type=str, help="threshod input file") + parser.add_argument("-t_value", type=float, help="threshold value") + args = parser.parse_args() + return args + +if __name__ == '__main__': + + args = get_args() + + thresholds_9 = pickle.load(open(args.t, 'rb')) + + perturb_vector=thresholds_9['perturbation vector'] + cf_class = thresholds_9['counterfactual class'] + indices = thresholds_9['sample index'] + + # split by class + perturb_vector_0=[] + perturb_vector_1=[] + indices_0 = [] + indices_1 = [] + for i,j,k in zip(perturb_vector, cf_class, indices): + if j==0: + perturb_vector_0.append(i) + indices_0.append(k) + else: + perturb_vector_1.append(i) + indices_1.append(k) + + indices_0 = np.array(indices_0) + indices_1 = np.array(indices_1) + sil = [] + print(len(perturb_vector_0), len(perturb_vector_1)) + kmax = np.min([len(perturb_vector_0), len(perturb_vector_1),10]) + data_2D = PCA(20).fit_transform(perturb_vector_0) + + # dissimilarity would not be defined for a single cluster, thus, minimum number of clusters should be 2 + for k in range(2, kmax + 1): + print(k) + kmeans = KMeans(n_clusters=k).fit(data_2D[:,0:2]) + labels = kmeans.labels_ + sil.append(silhouette_score(data_2D[:,0:2], labels, metric='euclidean')) + #plt.plot(np.arange(2, kmax+1), sil) + #plt.title("Silhouette scores to determine optimal k") + #plt.xlabel("k") + #plt.show() + k = np.argmax(sil) + 2 if len(sil) > 0 else kmax + print(k) + #data_2D = PCA(2).fit_transform(perturb_vector_0) + kmeans_0 = 
KMeans(n_clusters=k).fit(data_2D[:,0:2]) + labels_0 = kmeans_0.labels_ + colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k'] + for i in range(k): + plt.scatter(data_2D[:,0][labels_0==i], data_2D[:,1][labels_0==i], c=colors[i%len(colors)]) + plt.title("KMeans clusters with 2D PCA") + plt.savefig("CF_0.png") + k0 = k + sil=[] + data_2D = PCA(20).fit_transform(perturb_vector_1) + for k in range(2, kmax + 1): + kmeans = KMeans(n_clusters=k).fit(data_2D[:,0:2])#perturb_vector_1) + labels = kmeans.labels_ + sil.append(silhouette_score(data_2D[:,0:2], labels, metric='euclidean')) + #plt.plot(np.arange(2, kmax+1), sil) + #plt.title("Silhouette scores to determine optimal k") + #plt.xlabel("k") + #plt.show() + k = np.argmax(sil) + 2 if len(sil) > 0 else kmax + print(k) + #data_2D = PCA(2).fit_transform(perturb_vector_1) + kmeans_1 = KMeans(n_clusters=k).fit(data_2D[:,0:2])#perturb_vector_1) + labels_1 = kmeans_1.labels_ + colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k'] + for i in range(k): + plt.scatter(data_2D[:,0][labels_1==i], data_2D[:,1][labels_1==i], c=colors[i%len(colors)]) + plt.title("Perturbation vectors KMeans clusters with 2D PCA") + plt.savefig("CF 1.png") + +for i in range(len(kmeans_0.cluster_centers_)): + diff_0=kmeans_0.cluster_centers_[i] + max_value = np.max(np.abs(diff_0)) + ind_pos = np.where(diff_0 > args.t_value*max_value) + ind_neg = np.where(diff_0 < -1*args.t_value*max_value) + output = {'centroid perturb vector': diff_0, + 'positive threshold indices':ind_pos, + 'negative threshold indices':ind_neg, + 'sample indices in this cluster':indices_0[labels_0==i]} + print(output) + pickle.dump(output, + open("cf_class_0_cluster{}.pkl".format(i), "wb")) + +for i in range(len(kmeans_1.cluster_centers_)): + diff_1=kmeans_1.cluster_centers_[i] + max_value = np.max(np.abs(diff_1)) + ind_pos = np.where(diff_1 > args.t_value*max_value) + ind_neg = np.where(diff_1 < -1*args.t_value*max_value) + output = {'centroid perturb vector': diff_1, + 'positive threshold indices':ind_pos, + 'negative threshold indices':ind_neg, + 'sample indices in this cluster':indices_1[labels_1==i]} + print(output) + pickle.dump(output, + open("cf_class_1_cluster{}.pkl".format(i), "wb")) diff --git a/Pilot1/NT3/nt3_cf/inject_noise.py b/Pilot1/NT3/nt3_cf/inject_noise.py new file mode 100644 index 00000000..b8071579 --- /dev/null +++ b/Pilot1/NT3/nt3_cf/inject_noise.py @@ -0,0 +1,120 @@ +import pickle +import numpy as np +import copy +import argparse +import os +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("-t", type=str, help="threshold pickle file") + parser.add_argument("-c1", type=str, help="cluster 1 file") + parser.add_argument("-c2", type=str, help="cluster 2 file") + parser.add_argument("-scale", type=float, help="scale factor for noise injection") + parser.add_argument("-r", type=bool, help="flag to add random noise") + parser.add_argument("-o", type=str, help="folder for output files") + parser.add_argument("-d", type=str, help="nt3 data file") + parser.add_argument("-f", type=str, help="pickle file containing failed cf indices") + args = parser.parse_args() + return args + +# Choose a random set of indices to inject cf noise into +def random_noise(s,scale,size, cluster_inds, args): + X_train, X_test, y_train, y_test = pickle.load(open(args.d, 'rb')) + #X_data, y_data = pickle.load(open(args.d, 'rb')) + #X_data = np.concatenate([X_train, X_test]) + genes = np.random.choice(np.arange(X_train.shape[0]), replace=False, size=size) + noise = np.random.normal(0,1,size) + X_data_noise = 
copy.deepcopy(X_train) + s, _ = s.split(".") + cluster_name = s[3:] + for p in np.arange(0.1,1.0, 0.1): + for i in cluster_inds: + for j in range(size): + X_data_noise[i][genes[j]]+=noise[j] + # Now split back into train test for output + #X_train = X_data_noise[0:(int)(0.8*X_data.shape[0])] + #X_test = X_data_noise[(int)(0.8*X_data.shape[0]):] + pickle.dump([X_data_noise, X_test, y_train, y_test, [], cluster_inds], open("{}/nt3.data.random.scale_{}_{}.noise_{}.pkl".format(args.o,scale,cluster_name,round(p,1)), "wb")) + +def main(): + args = get_args() + isExist = os.path.exists(args.o) + if not isExist: + os.makedirs(args.o) + # For 2 clusters (with sparse injection feature vector) add CF noise to x% of samples + X_train, X_test, y_train, y_test = pickle.load(open(args.d, 'rb')) + print(X_train.shape, X_test.shape, y_train.shape, y_test.shape) + #X_data, y_data = pickle.load(open(args.d, 'rb')) + threshold_dataset = pickle.load(open(args.t, 'rb')) + perturb_dataset = threshold_dataset['perturbation vector'] + + + #combine for easier indexing later + #X_data = np.concatenate([X_train, X_test]) + + #account for failed indices + failed_indices = pickle.load(open(args.f, 'rb'))[0] + failed_indices=[919] + print(failed_indices) + for i in failed_indices: + perturb_dataset.insert(i, np.zeros(X_train.shape[1])) + perturb_dataset = np.array(perturb_dataset) + + _, cf1 = os.path.split(args.c1) + _, cf2 = os.path.split(args.c2) + cluster_files = [cf1, cf2] + perturb_dataset = perturb_dataset[0:X_train.shape[0]] + for i in range(len(cluster_files)): + print(cluster_files[i]) + d = pickle.load(open(cluster_files[i], "rb")) + cluster_inds = d['sample indices in this cluster'] + cluster_inds_noise = list(filter(lambda val: val < 1120, cluster_inds)) + + if args.r: + random_noise(cluster_files[i],args.scale,20, cluster_inds_noise, args) + + # Sweep through percentages + for p in np.arange(0.1,1.0, 0.1): + print("p={}".format(p)) + X_data_noise = copy.deepcopy(X_train) + + #Full cf injection + # Choose x% of the indices to be perturbed + selector = np.random.choice(a=cluster_inds_noise, replace=False, size = (int)(p*len(cluster_inds_noise))) + X_data_noise[selector]-= args.scale*perturb_dataset[selector][:,:,None] + + # Now split back into train test for output + #X_train = X_data_noise[0:(int)(0.8*X_data.shape[0])] + #X_test = X_data_noise[(int)(0.8*X_data.shape[0]):] + + s,_ = cluster_files[i].split(".") + cluster_name = s[3:] + pickle.dump([X_data_noise, X_test, y_train, y_test, selector, cluster_inds], open("{}/nt3.data.scale_{}_{}.noise_{}.pkl".format(args.o, args.scale,cluster_name, round(p,1)), "wb")) + + # Add cf noise only to those indices that passed the threshold value (instead of the full cf profile) + inds = [] + for j in d['positive threshold indices'][0]: + inds.append(j) + for j in d['negative threshold indices'][0]: + inds.append(j) + X_data_noise_2 = copy.deepcopy(X_train) + + all_inds = np.arange(X_train.shape[0]) + for j in all_inds: + if j not in inds: + perturb_dataset[:,j]=0 + X_data_noise_2[selector]-= args.scale*perturb_dataset[selector][:,:,None] + + # Now split back into train test + #X_train = X_data_noise_2[0:(int)(0.8*X_data.shape[0])] + #X_test = X_data_noise_2[(int)(0.8*X_data.shape[0]):] + + pickle.dump([X_data_noise_2, X_test, y_train, y_test, selector, cluster_inds], open("{}/nt3.data.threshold.scale_{}_{}.noise_{}.pkl".format(args.o, args.scale, cluster_name, round(p,1)), "wb")) + +if __name__ == "__main__": + main() + + + + + +# Save dataset file diff --git 
a/Pilot1/NT3/nt3_cf/test_cf_accuracy.py b/Pilot1/NT3/nt3_cf/test_cf_accuracy.py new file mode 100644 index 00000000..af9b5f19 --- /dev/null +++ b/Pilot1/NT3/nt3_cf/test_cf_accuracy.py @@ -0,0 +1,82 @@ +import tensorflow as tf +from tensorflow.keras.models import Model, load_model +import matplotlib.pyplot as plt +import numpy as np +import os +import pickle +import argparse +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("-m", type=str, help="model file") + parser.add_argument("-prefix", type=str, help="noise file prefix") + parser.add_argument("-prefix_rand", type=str, help="random noise file prefix") + parser.add_argument("-prefix_rand_cf", type=str, help="random noise along cf indices") + parser.add_argument("-folder", type=str, help="folder path to noise files") + parser.add_argument("-o", type=str, help="name of saved png") + parser.add_argument("-n", type=str, help="name of cluster") + args = parser.parse_args() + return args +def main(): + args = get_args() + model_nt3 = tf.keras.models.load_model(args.m) + + results = [] + for i in np.arange(0.1,1.0, 0.1): + cf_dataset = pickle.load(open("{}_{}.pkl".format(args.prefix, round(i,2)), "rb")) + X_cf_dataset = np.concatenate([cf_dataset[0], cf_dataset[1]]) + y_cf_dataset = np.concatenate([cf_dataset[2], cf_dataset[3]]) + #X_cf_dataset = cf_dataset[0] + #y_cf_dataset = cf_dataset[1] + cluster_inds = cf_dataset[-1] + print(model_nt3.metrics_names) + acc = model_nt3.evaluate(X_cf_dataset, y_cf_dataset) + cluster_acc = model_nt3.evaluate(X_cf_dataset[cluster_inds], y_cf_dataset[cluster_inds]) + print(i, acc, cluster_acc) + results.append([acc[1], cluster_acc[1]]) + results = np.array(results) +# plt.plot(np.arange(0.1,1.0,0.1), results[:,0], label="full dataset accuracy with cf pertubation", marker='o') + plt.plot(np.arange(0.1,1.0, 0.1), results[:,1], label="cluster accuracy with cf perturbation", marker='o') + + results = [] + for i in np.arange(0.1,1.0, 0.1): + cf_dataset = pickle.load(open("{}_{}.pkl".format(args.prefix_rand, round(i,2)), "rb")) + X_cf_dataset = np.concatenate([cf_dataset[0], cf_dataset[1]]) + y_cf_dataset = np.concatenate([cf_dataset[2], cf_dataset[3]]) + #X_cf_dataset = cf_dataset[0] + #y_cf_dataset = cf_dataset[1] + cluster_inds = cf_dataset[-1] + print(model_nt3.metrics_names) + acc = model_nt3.evaluate(X_cf_dataset, y_cf_dataset) + cluster_acc = model_nt3.evaluate(X_cf_dataset[cluster_inds], y_cf_dataset[cluster_inds]) + print(i, acc, cluster_acc) + results.append([acc[1], cluster_acc[1]]) + results = np.array(results) +# plt.plot(np.arange(0.1,1.0,0.1), results[:,0], label="full dataset accuracy with Gaussian noise (rand indices)", marker='o') + plt.plot(np.arange(0.1,1.0, 0.1), results[:,1], label="cluster accuracy with Gaussian noise (random features)", marker='o') + + results = [] + for i in np.arange(0.1,1.0, 0.1): + cf_dataset = pickle.load(open("{}_{}.pkl".format(args.prefix_rand_cf, round(i,2)), "rb")) + X_cf_dataset = np.concatenate([cf_dataset[0], cf_dataset[1]]) + y_cf_dataset = np.concatenate([cf_dataset[2], cf_dataset[3]]) + #X_cf_dataset = cf_dataset[0] + #y_cf_dataset = cf_dataset[1] + cluster_inds = cf_dataset[-1] + print(model_nt3.metrics_names) + acc = model_nt3.evaluate(X_cf_dataset, y_cf_dataset) + cluster_acc = model_nt3.evaluate(X_cf_dataset[cluster_inds], y_cf_dataset[cluster_inds]) + print(i, acc, cluster_acc) + results.append([acc[1], cluster_acc[1]]) + results = np.array(results) +# plt.plot(np.arange(0.1,1.0,0.1), results[:,0], label="full dataset accuracy with 
Gaussian noise (cf indices)", marker='o') + plt.plot(np.arange(0.1,1.0, 0.1), results[:,1], label="cluster accuracy with Gaussian noise (cf features)", marker='o') + + + plt.xlabel("Noise fraction in cluster") + plt.ylabel("Accuracy") + plt.legend() + plt.title("Model accuracy with counterfactual noise injection") + plt.savefig(args.o) + +if __name__ == "__main__": + main() diff --git a/Pilot1/NT3/nt3_cf/threshold.py b/Pilot1/NT3/nt3_cf/threshold.py new file mode 100644 index 00000000..cdd339e6 --- /dev/null +++ b/Pilot1/NT3/nt3_cf/threshold.py @@ -0,0 +1,70 @@ +# Example run python threshold.py -d nt3.autosave.data.pkl -c small_cf.pkl -t 0.2 -o small_threshold.pkl +import pickle +import numpy as np +import argparse + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('-d', type=str, + help='data input file', required=True) + parser.add_argument('-c', type=str, + help='counterfactual input file', required=True) + parser.add_argument('-o', type=str, + help='output file', required=True) + parser.add_argument('-t', type=float, + help='threshold value', required=True) + + args = parser.parse_args() + return args + +def threshold(t_value, X, y, cf): + pos = [] + neg = [] + cf_classes = [] + inds = [] + diffs = [] + for i in range(len(cf)): + test_y = X[i].flatten() + test_cf = cf[i][1].flatten() + + diff = test_y-test_cf + max_value = np.max(np.abs(diff)) + + ind_pos = np.where(diff > t_value*max_value) + ind_neg = np.where(diff < -t_value*max_value) + + cf_class = np.abs(1-np.argmax(y[i])) + + pos.append(ind_pos) + neg.append(ind_neg) + cf_classes.append(cf_class) + inds.append(cf[i][0]) + diffs.append(diff) + + return pos,neg,cf_classes,inds, diffs + +def main(): + args = get_args() + with open(args.d, 'rb') as pickle_file: + X_train,X_test, Y_train,Y_test = pickle.load(pickle_file) + + with open(args.c, 'rb') as pickle_file: + cf = pickle.load(pickle_file) + + X = np.concatenate([X_train,X_test]) + Y = np.concatenate([Y_train, Y_test]) +# X=X_test +# Y=Y_test + pos,neg,cf_classes,inds, diff = threshold(args.t, X, Y, cf) + + # Note that sample index is here to keep track of counterfactuals that succeeded, counterfactuals that failed are not included here + results = {'sample index': inds, + 'positive threshold indices': pos, + 'negative threshold indices':neg, + 'counterfactual class':cf_classes, + 'perturbation vector': diff} + pickle.dump(results, open(args.o, "wb")) + + +if __name__ == "__main__": + main() diff --git a/Pilot1/NT3/nt3_default_model.txt b/Pilot1/NT3/nt3_default_model.txt index 708b7051..c49c7645 100644 --- a/Pilot1/NT3/nt3_default_model.txt +++ b/Pilot1/NT3/nt3_default_model.txt @@ -10,7 +10,7 @@ out_activation = 'softmax' loss = 'categorical_crossentropy' optimizer = 'sgd' metrics = 'accuracy' -epochs = 400 +epochs = 10 batch_size = 20 learning_rate = 0.001 dropout = 0.1 @@ -31,3 +31,4 @@ timeout = 3600 ckpt_restart_mode = 'off' ckpt_save_interval = 0 ckpt_checksum = True +noise_save_cf = True diff --git a/Pilot1/NT3/nt3_noise_model.txt b/Pilot1/NT3/nt3_noise_model.txt index 6c8f1d73..3f2cc9f0 100644 --- a/Pilot1/NT3/nt3_noise_model.txt +++ b/Pilot1/NT3/nt3_noise_model.txt @@ -38,4 +38,5 @@ alpha = 0.3 alpha_scale_factor = 0.8 init_abs_epoch = 5 task_list = 0 -task_names = ['activation_4'] +noise_save_cf = True +task_names = ['activation_4'] \ No newline at end of file diff --git a/Pilot1/NT3/run_abstention_sweep.sh b/Pilot1/NT3/run_abstention_sweep.sh new file mode 100755 index 00000000..7978ecc1 --- /dev/null +++ 
b/Pilot1/NT3/run_abstention_sweep.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+#vals=0.1
+#for filename in /vol/ml/shahashka/temp/Benchmarks/Pilot1/NT3/nt3_cf/noise_both_clusters/nt3.data.threshold.*; do
+#    echo $filename
+#    python nt3_abstention_keras2.py --noise_cf $filename --output_dir cf_sweep_1104 --run_id $(basename $filename) --epochs 100
+#    #cp cf_sweep_0902/EXP000/RUN000/training.log ${filename}_training_0902.log
+#done
+
+for i in $(seq 0 0.1 1); do
+    echo $i
+    for j in $(seq 1 1 5); do
+        python nt3_baseline_keras2.py --label_noise $i --output_dir baseline_label_noise_$i --run_id RUN$j
+    done
+done
diff --git a/common/file_utils.py b/common/file_utils.py
index a1cfdb0b..b11bf90d 100644
--- a/common/file_utils.py
+++ b/common/file_utils.py
@@ -204,7 +204,6 @@ def directory_from_parameters(params, commonroot='Output'):
         String to specify the common folder to store results.
     """
-
     if commonroot in set(['.', './']):  # Same directory --> convert to absolute path
         outdir = os.path.abspath('.')
     else:  # Create path specified
diff --git a/common/parsing_utils.py b/common/parsing_utils.py
index 9d8b68cb..297a0368 100644
--- a/common/parsing_utils.py
+++ b/common/parsing_utils.py
@@ -97,6 +97,17 @@
      'help': 'set the run unique identifier.'}
 ]
 
+noise_conf = [
+    {'name': 'noise_save_cf',
+     'type': bool,
+     'default': False,
+     'help': 'save the model (TensorFlow saved model format) and data (pickle) objects for cf runs'},
+    {'name': 'noise_cf',
+     'type': str,
+     'default': None,
+     'help': 'pickle file to hold dataset with noise already added through counterfactuals'}
+]
+
 logging_conf = [
     {'name': 'verbose',
      'abv': 'v',
@@ -311,7 +322,7 @@
 ]
 
-registered_conf = [basic_conf, input_output_conf, logging_conf, data_preprocess_conf, model_conf, training_conf, cyclic_learning_conf, ckpt_conf]
+registered_conf = [basic_conf, input_output_conf, logging_conf, data_preprocess_conf, model_conf, training_conf, cyclic_learning_conf, ckpt_conf, noise_conf]
 
 
 def extract_keywords(lst_dict, kw):
@@ -375,7 +386,7 @@ class ArgumentStruct:
     or object entries) can be used.
     """
     def __init__(self, **entries):
-        self.__dict__.update(entries)
+        self.__dict__.update(entries) 
 
 
 class ListOfListsAction(argparse.Action):
@@ -562,7 +573,6 @@ def args_overwrite_config(args, config):
     for key in args_dict.keys():
         # try casting here
         params[key] = args_dict[key]
-
     if 'data_type' not in params:
         params['data_type'] = DEFAULT_DATATYPE
     else: