diff --git a/FAIR_universe_Higgs_tautau/Data_Loader.py b/FAIR_universe_Higgs_tautau/Data_Loader.py new file mode 100644 index 0000000..56cb792 --- /dev/null +++ b/FAIR_universe_Higgs_tautau/Data_Loader.py @@ -0,0 +1,192 @@ +import os +import argparse +import logging +import warnings +import numpy as np +import pandas as pd +import mplhep as hep +import uproot +from HiggsML.datasets import download_dataset +from HiggsML.systematics import systematics + +# Setup logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +# # Suppress warnings +# warnings.simplefilter(action='ignore', category=FutureWarning) + +hep.style.use(hep.style.ATLAS) + +def parse_args(): + parser = argparse.ArgumentParser(description="Download and process HiggsML data for analysis.") + + parser.add_argument( + "--url", + type=str, + default="https://zenodo.org/records/15131565/files/FAIR_Universe_HiggsML_data.zip", + help="URL to the dataset zip file." + ) + parser.add_argument( + "--output-dir", + type=str, + default="./saved_datasets/", + help="Directory to save the processed ROOT files." + ) + parser.add_argument( + "--train-size", + type=float, + default=0.35, + help="Fraction of dataset to use for training." + ) + parser.add_argument( + "--seed", + type=int, + default=42, + help="Random seed for sampling." + ) + + return parser.parse_args() + +def download_and_load(url, train_size): + """Downloads the dataset and loads the training set.""" + logger.info(f"Downloading dataset from {url}...") + data = download_dataset(url) + + logger.info(f"Loading training set with size fraction: {train_size}") + data.load_train_set(train_size=train_size) + + df_training_full = data.get_train_set() + del data # Clean up memory + return df_training_full + +def process_data(df, list_of_processes, seed): + """Filters specific processes and balances the dataset.""" + + all_labels = df["detailed_labels"].unique() + process_to_exclude = list(set(all_labels) - set(list_of_processes)) + logger.info(f"Excluding processes: {process_to_exclude}") + + # Filter dataframe + mask_process_exclusion = ~np.isin(df["detailed_labels"], process_to_exclude) + df_filtered = df[mask_process_exclusion].copy() + + counts = df_filtered["detailed_labels"].value_counts() + logger.info(f"Counts before balancing:\n{counts}") + + # Trim the dataset so all processes have equal entries + + # As in the notebook, balance to the number of ttbar events (the process with the lowest count) + min_process = counts.idxmin() + n_min = counts.min() + logger.info(f"Balancing to minimum process count ({min_process}): {n_min}") + + df_list = [] + for _, df_process in df_filtered.groupby('detailed_labels'): + weight_sum_orig = df_process.weights.sum() + + df_sampled = df_process.sample(n=n_min, random_state=seed) + + df_sampled['weights'] *= weight_sum_orig / df_sampled['weights'].sum() + + df_list.append(df_sampled) + del df_sampled + + df_balanced = pd.concat(df_list).reset_index(drop=True) + return df_balanced + +def apply_systematics(df, syst_settings): + """Generates variations of the dataset based on systematics.""" + dataset_dict = {} + + logger.info("Generating nominal dataset...") + dataset_dict['nominal'] = systematics( + data_set=df, + dopostprocess=False + ) + + for sample_name, syst_args in syst_settings.items(): + logger.info(f"Generating systematic variation: {sample_name}") + dataset_dict[sample_name] = systematics( + data_set=df, + dopostprocess=False, + **syst_args + ) + + return dataset_dict + +def 
save_root_files(dataset_dict, output_dir, processes, selections): + """Saves the datasets to ROOT files applying selections.""" + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + logger.info(f"Created output directory: {output_dir}") + + for sample, df in dataset_dict.items(): + output_path = os.path.join(output_dir, f"dataset_{sample}.root") + logger.info(f"Writing {output_path}...") + + with uproot.recreate(output_path) as ntuple: + for process in processes: + + df_process = df[df["detailed_labels"] == process].copy() + + + df_process = df_process.query(selections).copy() + + + columns_to_keep = list(set(df_process.columns.tolist()) - {"detailed_labels"}) + + + arrays = {col: df_process[col].to_numpy() for col in columns_to_keep} + + if arrays: + ntuple[f"tree_{process}"] = arrays + else: + logger.warning(f"No events found for {process} in {sample} after selection.") + +def main(): + args = parse_args() + + list_of_processes_to_model = ["htautau", "ztautau", "ttbar"] + + syst_settings = { + 'TES_up': {'tes': 1.02, 'seed': args.seed}, + 'TES_dn': {'tes': 0.98, 'seed': args.seed}, + 'JES_up': {'jes': 1.02, 'seed': args.seed}, + 'JES_dn': {'jes': 0.98, 'seed': args.seed} + } + + # Some common analysis selections to remove low-stats regions + selections = ( + "DER_mass_transverse_met_lep <= 250.0 and " + "DER_mass_vis <= 500.0 and " + "DER_sum_pt <= 1000 and " + "DER_pt_tot <= 250 and " + "DER_deltar_had_lep <= 4.5 and " + "DER_pt_h <= 400 and " + "DER_pt_ratio_lep_had <= 9.0" + ) + + # Execution Flow + try: + + df_training_full = download_and_load(args.url, args.train_size) + + df_training = process_data(df_training_full, list_of_processes_to_model, args.seed) + del df_training_full + + + dataset_dict = apply_systematics(df_training, syst_settings) + + + save_root_files(dataset_dict, args.output_dir, list_of_processes_to_model, selections) + + logger.info("Data loading workflow completed successfully.") + + except Exception as e: + logger.error(f"An error occurred: {e}", exc_info=True) + raise + +if __name__ == "__main__": + main() diff --git a/FAIR_universe_Higgs_tautau/Data_Preprocessing.py b/FAIR_universe_Higgs_tautau/Data_Preprocessing.py new file mode 100644 index 0000000..7a934a9 --- /dev/null +++ b/FAIR_universe_Higgs_tautau/Data_Preprocessing.py @@ -0,0 +1,153 @@ +import os, sys, importlib +import argparse +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import mplhep as hep +import yaml +import uproot + +from utils import plot_kinematic_features + +sys.path.append('../src') +import nsbi_common_utils +from nsbi_common_utils import configuration +from nsbi_common_utils import datasets + +import logging +import warnings +# Suppress warnings +warnings.simplefilter(action='ignore', category=FutureWarning) + +# Setup logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +hep.style.use(hep.style.ATLAS) + +def parse_args(): + parser = argparse.ArgumentParser(description="Download and process HiggsML data for analysis.") + + parser.add_argument( + "--config", + type=str, + default='./config.yml', + help="config file path" + ) + + + return parser.parse_args() + +def ds_helper(cfg, branches): + ''' + Uses nsbi_common_utils.datasets to load data. 
+ ''' + datasets_helper = nsbi_common_utils.datasets.datasets(config_path = cfg, + branches_to_load = branches) + return datasets_helper + +def process_data(df, input_features_by_jet, branches): + """Filters specific processes and balances the dataset.""" + + median_feature = {} + + for sample, sample_dataset in df["Nominal"].items(): + + median_feature[sample] = {} + + for nJets, feat_list in input_features_by_jet.items(): + for feature in feat_list: + + median_feature[sample][feature] = np.median(sample_dataset.loc[sample_dataset['PRI_n_jets'] >= nJets, feature]) + + logger.info(f"extracting additional branches from the engineered features") + branches_to_add = [] + + for region, sample_datasets in df.items(): + + for sample, sample_dataset in sample_datasets.items(): + + sample_dataset['njet_0'] = (sample_dataset['PRI_n_jets'] == 0).astype(int) + sample_dataset['njet_1'] = (sample_dataset['PRI_n_jets'] == 1).astype(int) + sample_dataset['njet_2'] = (sample_dataset['PRI_n_jets'] >= 2).astype(int) + + branches_to_add += ['njet_0', 'njet_1', 'njet_2'] + + for i, feat_list in input_features_by_jet.items(): + mask_i = (sample_dataset['PRI_n_jets'] >= i).astype(float) + sample_dataset[f'jet{i}_mask'] = mask_i + + branches_to_add += [f'jet{i}_mask'] + + for feat in feat_list: + sample_dataset[feat] = sample_dataset[feat].where(sample_dataset['PRI_n_jets'] >= i, median_feature[sample][feat]) + + for feat in branches.copy(): + + kin = sample_dataset[feat].to_numpy() + + if (np.amin(kin) > 0.0) and (np.amax(kin)>100): + log_feat = 'log_'+feat + sample_dataset[log_feat] = np.log(kin+10.0) + + if log_feat not in branches_to_add: + branches_to_add += [log_feat] + + df[region][sample] = sample_dataset + + return df, branches_to_add + + +def main(): + args = parse_args() + + # Specify branches to load from the ROOT ntuples + input_features_noJets = ['PRI_lep_pt', 'PRI_lep_eta', 'PRI_lep_phi', 'PRI_had_pt', 'PRI_had_eta', + 'PRI_had_phi', 'PRI_met', 'PRI_met_phi', 'DER_mass_transverse_met_lep', + 'DER_mass_vis', 'DER_pt_h', 'DER_deltar_had_lep', 'DER_pt_tot', 'DER_sum_pt', + 'DER_pt_ratio_lep_had', 'DER_met_phi_centrality'] + + input_features_1Jets = ['PRI_jet_leading_pt', 'PRI_jet_leading_eta', + 'PRI_jet_leading_phi', + 'PRI_jet_all_pt'] + + input_features_2Jets = ['PRI_jet_subleading_pt', + 'PRI_jet_subleading_eta', 'PRI_jet_subleading_phi', 'DER_deltaeta_jet_jet', 'DER_mass_jet_jet', + 'DER_prodeta_jet_jet', + 'DER_lep_eta_centrality'] + + input_features_nJets = ['PRI_n_jets'] + + branches_to_load = input_features_noJets \ + + input_features_1Jets \ + + input_features_2Jets \ + + input_features_nJets + input_features_by_jet = { + 1 : input_features_1Jets, + 2 : input_features_2Jets + } + + # Execution Flow + try: + logger.info(f"Loading and converting the dataset to Pandas DataFrame for processing...") + datasets_helper = ds_helper(args.config, branches_to_load) + datasets_all = datasets_helper.load_datasets_from_config(load_systematics = True) + + datasets_all, add_branches = process_data(datasets_all, input_features_by_jet, + branches=branches_to_load) + + logger.info(f"adding additional branches to the DataFrame") + datasets_helper.add_appended_branches(add_branches) + + datasets_helper.save_datasets(datasets_all, + save_systematics = True) + + + logger.info("Data Preprocessing workflow completed successfully.") + + except Exception as e: + logger.error(f"An error occurred: {e}", exc_info=True) + raise + +if __name__ == "__main__": + main() diff --git 
a/FAIR_universe_Higgs_tautau/Neural_Likelihood_Ratio_Estimation.py b/FAIR_universe_Higgs_tautau/Neural_Likelihood_Ratio_Estimation.py new file mode 100644 index 0000000..5d65f78 --- /dev/null +++ b/FAIR_universe_Higgs_tautau/Neural_Likelihood_Ratio_Estimation.py @@ -0,0 +1,200 @@ +import os +import sys +import argparse +import warnings +import logging +import numpy as np +import tensorflow as tf +import yaml +sys.path.append('../') + +# Load the package and modules for training and plotting +import nsbi_common_utils +from nsbi_common_utils import datasets, configuration +from nsbi_common_utils.training import density_ratio_trainer + + +import mplhep as hep +hep.style.use(hep.style.ATLAS) + +warnings.filterwarnings("ignore", category=RuntimeWarning) +warnings.filterwarnings("ignore", category=FutureWarning) + +# Setup logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + + + +def load_training_settings(config_path): + """ + Helper function to load training settings from the config_training_NN_parms.yml file. + """ + with open(config_path, 'r') as file: + full_config = yaml.safe_load(file) + + if 'training_settings' not in full_config: + raise KeyError(f"The configuration file {config_path} must contain a 'training_settings' section.") + + return full_config['training_settings'] + +def main(): + + parser = argparse.ArgumentParser(description="Neural Likelihood Ratio Estimation Trainer") + parser.add_argument('--config', type=str, default='./config.yml', + help='Path to the YAML configuration file') + parser.add_argument('--train', action='store_true', + help='Force training of new models (overrides training_NN_paras.py USE_SAVED_MODELS)') + parser.add_argument('--savedDataPath', type=str, default='./saved_datasets/', + help='path to the saved dataset') + parser.add_argument('--use-log-loss', action='store_true', + help='Use log loss instead of default loss') + parser.add_argument('--delete-existing-models', action='store_true', + help='Delete existing models before training') + parser.add_argument('--training-config', type=str, default='./config_training_NN_parms.yml', + help='Path to the training parameter YAML file') + args = parser.parse_args() + + + logger.info("Starting Neural Likelihood Ratio Estimation workflow.") + + logger.info(f"Loading configuration from {args.config}") + config = nsbi_common_utils.configuration.ConfigManager(file_path_string=args.config) + + features, features_scaling = config.get_training_features() + logger.info(f"Training features: {features}") + + logger.info("Initializing Datasets...") + branches_to_load = features + ['presel_score'] + + Datasets = nsbi_common_utils.datasets.datasets( + config_path=args.config, + branches_to_load=branches_to_load + ) + + logger.info("Loading datasets from config...") + dataset_incl_dict = Datasets.load_datasets_from_config(load_systematics=False) + dataset_incl_nominal = dataset_incl_dict["Nominal"].copy() + dataset_SR_nominal = Datasets.filter_region_dataset(dataset_incl_nominal, region="SR") + + PATH_TO_SAVED_DATA = args.savedDataPath + if not PATH_TO_SAVED_DATA.endswith('/'): + PATH_TO_SAVED_DATA += '/' + TRAINING_OUTPUT_PATH = f'{PATH_TO_SAVED_DATA}output_training_nominal/' + + basis_processes = config.get_basis_samples() + ref_processes = config.get_reference_samples() + logger.info(f"Basis processes: {basis_processes}") + logger.info(f"Reference processes: {ref_processes}") + + + NN_training_mix_model = {} + use_log_loss = 
args.use_log_loss + DELETE_EXISTING_MODELS = args.delete_existing_models + + if DELETE_EXISTING_MODELS: + logger.warning("DELETE_EXISTING_MODELS is True. Old models in the output directory will be removed.") + + path_to_ratios = {} + path_to_figures = {} + path_to_models = {} + + logger.info("Preparing datasets and initializing trainers...") + for process_type in basis_processes: + dataset_mix_model = Datasets.prepare_basis_training_dataset( + dataset_SR_nominal, [process_type], dataset_SR_nominal, ref_processes + ) + + output_name = f'{process_type}' + output_dir = f'{TRAINING_OUTPUT_PATH}general_output_{process_type}' + path_to_ratios[process_type] = f'{TRAINING_OUTPUT_PATH}output_ratios_{process_type}/' + path_to_figures[process_type] = f'{TRAINING_OUTPUT_PATH}output_figures_{process_type}/' + path_to_models[process_type] = f'{TRAINING_OUTPUT_PATH}output_model_params_{process_type}/' + + NN_training_mix_model[process_type] = density_ratio_trainer( + dataset_mix_model, + dataset_mix_model['weights_normed'].to_numpy(), + dataset_mix_model['train_labels'].to_numpy(), + features, + features_scaling, + [process_type, 'ref'], + output_dir, output_name, + path_to_figures=path_to_figures[process_type], + path_to_ratios=path_to_ratios[process_type], + path_to_models=path_to_models[process_type], + use_log_loss=use_log_loss, + delete_existing_models=DELETE_EXISTING_MODELS + ) + + del dataset_mix_model + + num_gpus = len(tf.config.list_physical_devices('GPU')) + logger.info(f"Num GPUs Available: {num_gpus}") + + if num_gpus == 0: + logger.warning("No GPUs found. Training might be slow.") + + + force_train = args.train + + # Load the raw training parameters dictionary + training_params_dict = load_training_settings(args.training_config) + + for count, process_type in enumerate(basis_processes): + logger.info(f"Processing {process_type}...") + + if process_type not in training_params_dict: + logger.error(f"Settings for process '{process_type}' not found in 'training_settings' section of YAML.") + raise KeyError(f"Missing config for {process_type}") + + settings = training_params_dict[process_type].copy() + + # Override load_trained_models if --train argument is present + if force_train: + logger.info("Force training mode enabled via CLI.") + settings['load_trained_models'] = False + else: + logger.info(f"Using load_trained_models={settings['load_trained_models']} from config.") + + logger.info(f"Starting training/loading for {process_type}") + NN_training_mix_model[process_type].train_ensemble(**settings) + + logger.info(f"Testing normalization for {process_type}...") + NN_training_mix_model[process_type].test_normalization() + + logger.info("Training/Loading complete.") + + logger.info("Merging dataframes for final evaluation.") + dataset_combined_SR = Datasets.merge_dataframe_dict_for_training( + dataset_SR_nominal, None, samples_to_merge=["htautau", "ztautau", "ttbar"] + ) + + path_to_saved_ratios = {} + + for process_type in basis_processes: + logger.info(f"Evaluating the density ratios p_c/p_ref on the full dataset and saving them for the inference step for {process_type}...") + path_to_saved_ratios[process_type] = NN_training_mix_model[process_type].evaluate_and_save_ratios( + dataset_combined_SR, + aggregation_type='median_score' + ) + + logger.info(f"Ratios saved to: {path_to_saved_ratios}") + + path_to_save_root = f"{PATH_TO_SAVED_DATA}dataset_Asimov_SR.root" + logger.info(f"Saving Asimov_SR dataset to {path_to_save_root}...") + 
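+    # Write the combined SR dataset to dataset_Asimov_SR.root; the systematic-uncertainty workflow reloads this Asimov dataset when evaluating the ratio networks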
nsbi_common_utils.datasets.save_dataframe_as_root( + dataset_combined_SR, + path_to_save=path_to_save_root, + tree_name="nominal" + ) + + # Save Asimov weights + path_to_save_weights = f"{PATH_TO_SAVED_DATA}asimov_weights.npy" + logger.info(f"Saving Asimov_weights to {path_to_save_weights}...") + np.save(path_to_save_weights, dataset_combined_SR["weights"].to_numpy()) + + logger.info("Workflow completed successfully.") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/FAIR_universe_Higgs_tautau/Parameter_Fitting_with_Systematics.py b/FAIR_universe_Higgs_tautau/Parameter_Fitting_with_Systematics.py new file mode 100644 index 0000000..56c830e --- /dev/null +++ b/FAIR_universe_Higgs_tautau/Parameter_Fitting_with_Systematics.py @@ -0,0 +1,116 @@ +import os, sys, importlib +import argparse +import pandas as pd +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler +import tensorflow as tf +tf.get_logger().setLevel('ERROR') +from tensorflow.keras.optimizers import Nadam +import mplhep as hep +import pickle +import matplotlib.pyplot as plt +import yaml + +sys.path.append('../src') +import nsbi_common_utils +from nsbi_common_utils import plotting, training, \ +inference, datasets, configuration, workspace_builder, model +from nsbi_common_utils.inference import inference, plot_NLL_scans + +import glob +import numpy as np + +import jax +import jax.numpy as jnp +jax.config.update("jax_enable_x64", True) + + +import logging +import warnings +# Suppress warnings +warnings.simplefilter(action='ignore', category=FutureWarning) + +# Setup logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +hep.style.use(hep.style.ATLAS) + +def parse_args(): + parser = argparse.ArgumentParser(description="Fit the Higgs signal-strength parameter with systematics for the HiggsML analysis.") + + parser.add_argument( + "--config_hist", + type=str, + default="./config_hist.yml", + help="Configuration file path for histogram settings." + ) + parser.add_argument( + "--config", + type=str, + default="./config.yml", + help="Configuration file path for the NSBI workspace." + ) + parser.add_argument( + "--measurement2fit", + type=str, + default="higgs_tautau_signal_strength", + help="Measurement to fit in the analysis." 
+ ) + + return parser.parse_args() + +def build_workspace(cfg_path): + """Builds the analysis workspace using nsbi_common_utils.workspace_builder.""" + logger.info(f"Building workspace from config: {cfg_path}") + ws_builder = nsbi_common_utils.workspace_builder.WorkspaceBuilder(config_path = cfg_path) + ws_builder.build() + return ws_builder + + +def inference_object(workspace_hist, workspace_nsbi, measurement2fit): + """Creates inference objects for the NSBI and histogram workspaces and performs both fits.""" + model_obj_hist = nsbi_common_utils.model.Model(workspace = workspace_hist, + measurement_to_fit = measurement2fit) + list_parameters, initial_parameter_values = model_obj_hist.get_model_parameters() + num_unconstrained_params = model_obj_hist.num_unconstrained_param + + model_obj_nsbi = nsbi_common_utils.model.Model(workspace = workspace_nsbi, + measurement_to_fit = measurement2fit) + + inference_obj_nsbi = nsbi_common_utils.inference.inference(model_nll = model_obj_nsbi.model, + initial_values = initial_parameter_values, + list_parameters = list_parameters, + num_unconstrained_params = num_unconstrained_params) + + logger.info("Fit results nsbi:\n%s", inference_obj_nsbi.perform_fit()) + + inference_obj_hist = nsbi_common_utils.inference.inference(model_nll = model_obj_hist.model, + initial_values = initial_parameter_values, + list_parameters = list_parameters, + num_unconstrained_params = num_unconstrained_params) + logger.info("Fit results hist:\n%s", inference_obj_hist.perform_fit()) + + return inference_obj_nsbi, inference_obj_hist + +def main(): + args = parse_args() + + # Execution Flow + try: + + workspace_hist = build_workspace(args.config_hist) + workspace_nsbi = build_workspace(args.config) + logger.info("The workspace nsbi:\n%s", workspace_nsbi) + + inference_obj_nsbi, inference_obj_hist = inference_object(workspace_hist, workspace_nsbi, + args.measurement2fit) + + logger.info("Parameter fitting workflow completed successfully.") + + except Exception as e: + logger.error(f"An error occurred: {e}", exc_info=True) + raise + +if __name__ == "__main__": + main() diff --git a/FAIR_universe_Higgs_tautau/Preselection_withNN.py b/FAIR_universe_Higgs_tautau/Preselection_withNN.py new file mode 100644 index 0000000..0a2c608 --- /dev/null +++ b/FAIR_universe_Higgs_tautau/Preselection_withNN.py @@ -0,0 +1,121 @@ +import os +import sys +import argparse +import logging +import numpy as np +import warnings +import tensorflow as tf + +sys.path.append('../') + +# Import custom modules +import nsbi_common_utils +from nsbi_common_utils import training, datasets, configuration +from nsbi_common_utils.training import preselection_network_trainer +from utils import calculate_preselection_observable + +tf.config.optimizer.set_jit(False) + +warnings.filterwarnings("ignore", category=RuntimeWarning) +warnings.filterwarnings("ignore", category=FutureWarning) + +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +def main(): + parser = argparse.ArgumentParser(description="Preselection Neural Network Training & Inference") + parser.add_argument('--config', type=str, default='./config.yml', + help='Path to the YAML configuration file') + parser.add_argument('--train', action='store_true', + help='Force training of a new preselection model (overrides loading existing model)') + parser.add_argument('--model-path', type=str, default='./saved_datasets/preselection_model/', + help='Path to save/load the preselection model') + + args = parser.parse_args() + 
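+    # Load the training features from the analysis configuration, then build the merged nominal dataset used to train the multi-class preselection network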
logger.info(f"Loading configuration from {args.config}") + config = nsbi_common_utils.configuration.ConfigManager(file_path_string=args.config) + + features, features_scaling = config.get_training_features() + logger.info(f"Training features loaded: {features}") + + logger.info("Initializing Datasets...") + datasets_helper = nsbi_common_utils.datasets.datasets( + config_path=args.config, + branches_to_load=features + ) + + train_label_sample_dict = { + "htautau": 0, + "ttbar": 1, + "ztautau": 2 + } + + dataset_incl_dict = datasets_helper.load_datasets_from_config(load_systematics=True) + + dataset_incl_nominal = dataset_incl_dict["Nominal"].copy() + + logger.info("Merging nominal samples for training...") + dataset_incl_nominal_training = datasets_helper.merge_dataframe_dict_for_training( + dataset_incl_nominal, + train_label_sample_dict, + samples_to_merge=dataset_incl_nominal.keys() + ) + + PATH_PRESEL_MODEL = args.model_path + FORCE_TRAIN = args.train + + # Initialize Trainer + preselectionTraining = preselection_network_trainer( + dataset_incl_nominal_training, + features, + features_scaling + ) + + model_exists = os.path.exists(PATH_PRESEL_MODEL) + + if FORCE_TRAIN or not model_exists: + logger.info(f"Starting Training (Force Train={FORCE_TRAIN}, Model Exists={model_exists})...") + + preselectionTraining.train( + test_size=0.2, + random_state=42, + path_to_save=PATH_PRESEL_MODEL, + batch_size=1024, + epochs=50, + learning_rate=0.1 + ) + logger.info(f"Training complete. Model saved to {PATH_PRESEL_MODEL}") + + else: + logger.info(f"Loading existing model from {PATH_PRESEL_MODEL}...") + preselectionTraining.assign_trained_model(PATH_PRESEL_MODEL) + + for region_name, dataset_sample_dict in dataset_incl_dict.items(): + logger.debug(f"Processing region: {region_name}") + + for sample_name, dataset in dataset_sample_dict.items(): + + # Get predictions (softmax outputs) + pred_NN_incl = preselectionTraining.predict(dataset) + + presel_score = calculate_preselection_observable( + pred_NN_incl, + train_label_sample_dict, + signal_processes = ['htautau'], + background_processes = ['ttbar', 'ztautau'], + pre_factor_dict = {'htautau': 1.0, 'ttbar': 1.0, 'ztautau': 1.0} + ) + + dataset_incl_dict[region_name][sample_name]['presel_score'] = presel_score + + logger.info("Saving datasets with new 'presel_score' branch...") + + datasets_helper.add_appended_branches(['presel_score']) + + datasets_helper.save_datasets(dataset_incl_dict, save_systematics=True) + + logger.info("Preselection workflow completed successfully.") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/FAIR_universe_Higgs_tautau/Systematic_Uncertainty_Estimation.py b/FAIR_universe_Higgs_tautau/Systematic_Uncertainty_Estimation.py new file mode 100644 index 0000000..6dc2ad8 --- /dev/null +++ b/FAIR_universe_Higgs_tautau/Systematic_Uncertainty_Estimation.py @@ -0,0 +1,227 @@ +import os +import sys +import argparse +import logging +import numpy as np +import yaml +import warnings +import random +import tensorflow as tf + +# Ensure the parent directory is in the path for imports +sys.path.append('../') + +# Import custom modules +import nsbi_common_utils +from nsbi_common_utils import datasets, configuration +from nsbi_common_utils.training import density_ratio_trainer + +import mplhep as hep +hep.style.use(hep.style.ATLAS) + +warnings.filterwarnings("ignore", category=RuntimeWarning) +warnings.filterwarnings("ignore", category=FutureWarning) + +logging.basicConfig(level=logging.INFO, format='%(asctime)s - 
%(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +def load_training_settings(config_path): + """ + Helper function to load training settings from the config_training_NN_parms.yml file. + """ + if not os.path.exists(config_path): + raise FileNotFoundError(f"Training config file not found: {config_path}") + + with open(config_path, 'r') as f: + data = yaml.safe_load(f) + + return data['systematic_uncertainty_training'] + +def main(): + parser = argparse.ArgumentParser(description="Systematic Uncertainty Estimation") + parser.add_argument('--config', type=str, default='./config.yml', + help='Path to the main dataset configuration file') + parser.add_argument('--training-config', type=str, default='./config_training_NN_parms.yml', + help='Path to the training parameter YAML file') + parser.add_argument('--savedDataPath', type=str, default='./saved_datasets/', + help='Path to save the generated ROOT files') + parser.add_argument('--region', type=str, default='SR', + help='Region to filter (e.g., SR)') + parser.add_argument('--train', action='store_true', + help='Run the Neural Network training for systematics') + + args = parser.parse_args() + + PATH_TO_SAVED_DATA = args.savedDataPath + if not PATH_TO_SAVED_DATA.endswith('/'): + PATH_TO_SAVED_DATA += '/' + + if not os.path.exists(PATH_TO_SAVED_DATA): + os.makedirs(PATH_TO_SAVED_DATA) + + logger.info(f"Loading configuration from {args.config}") + config = nsbi_common_utils.configuration.ConfigManager(file_path_string=args.config) + + features, features_scaling = config.get_training_features() + + logger.info("Initializing Datasets...") + branches_to_load = features + ['presel_score'] + + Datasets = nsbi_common_utils.datasets.datasets( + config_path=args.config, + branches_to_load=branches_to_load + ) + + logger.info("Loading full datasets (Nominal + Systematics)...") + dataset_incl_dict = Datasets.load_datasets_from_config(load_systematics=True) + + region = args.region + logger.info(f"Filtering datasets for region: {region}") + dataset_SR_dict = Datasets.filter_region_dataset(dataset_incl_dict, region=region) + + # ================================================================================= + # PART 1: SAVE ROOT FILES + # ================================================================================= + logger.info("--- Starting Part 1: Saving ROOT files for variations ---") + + samples_to_merge = ["htautau", "ztautau", "ttbar"] + + for variation_name, sample_dataset in dataset_SR_dict.items(): + if variation_name == 'Nominal': continue + + # Simple check to avoid re-saving if desired, or just overwrite + try: + dataset_SR = Datasets.merge_dataframe_dict_for_training( + sample_dataset, None, samples_to_merge=samples_to_merge + ) + filename = f"dataset_{variation_name}_{region}.root" + path_to_save = f"{PATH_TO_SAVED_DATA}{filename}" + + logger.info(f"Saving {filename}...") + nsbi_common_utils.datasets.save_dataframe_as_root( + dataset_SR, path_to_save=path_to_save, tree_name="nominal" + ) + except Exception as e: + logger.error(f"Failed to save {variation_name}: {e}") + + # ================================================================================= + # PART 2: SYSTEMATIC TRAINING + # ================================================================================= + if args.train: + logger.info("--- Starting Part 2: Neural Network Training for Systematics ---") + + logger.info(f"Loading systematic training parameters from {args.training_config}") + sys_training_params = 
load_training_settings(args.training_config) + + NN_training_syst_process = {} + path_to_ratios = {} + path_to_figures = {} + path_to_models = {} + + rnd_seed_traintestsplit = random.randint(0, 2**32 - 1) + + for process in config.get_basis_samples(): + + NN_training_syst_process[process] = {} + path_to_ratios[process] = {} + path_to_figures[process] = {} + path_to_models[process] = {} + + for dict_syst in config.config["Systematics"]: + syst = dict_syst["Name"] + NN_training_syst_process[process][syst] = {} + path_to_ratios[process][syst] = {} + path_to_figures[process][syst] = {} + path_to_models[process][syst] = {} + + for direction in ["Up", "Dn"]: + samples_to_train = config.get_samples_in_syst_for_training(syst, direction) + + if (process not in samples_to_train): + # Cleanup empty dicts + if not NN_training_syst_process[process][syst]: + del NN_training_syst_process[process][syst] + del path_to_ratios[process][syst] + del path_to_figures[process][syst] + del path_to_models[process][syst] + continue + + syst_key_name = syst + '_' + direction + if syst_key_name not in dataset_SR_dict: continue + + logger.info(f"Initializing: {process} vs {syst_key_name}") + + # Prepare Dataset: Ratio of Systematic / Nominal + dataset_syst_nom = Datasets.prepare_basis_training_dataset( + dataset_SR_dict[syst_key_name], + [process], + dataset_SR_dict["Nominal"], + [process] + ) + + top_path = f'{PATH_TO_SAVED_DATA}output_training_systematics/' + output_name = f'{process}_{syst}_{direction}' + output_dir = f'{top_path}general_output_{output_name}' + + path_to_ratios[process][syst][direction] = f'{top_path}output_ratios_{output_name}/' + path_to_figures[process][syst][direction] = f'{top_path}output_figures_{output_name}/' + path_to_models[process][syst][direction] = f'{top_path}output_model_params_{output_name}/' + + NN_training_syst_process[process][syst][direction] = density_ratio_trainer( + dataset_syst_nom, + dataset_syst_nom['weights_normed'].to_numpy(), + dataset_syst_nom['train_labels'].to_numpy(), + features, + features_scaling, + [syst+'_'+direction, process], + output_dir, output_name, + path_to_figures=path_to_figures[process][syst][direction], + path_to_ratios=path_to_ratios[process][syst][direction], + path_to_models=path_to_models[process][syst][direction], + delete_existing_models=False + ) + + logger.info("Executing Training Loop...") + + # If --train is passed, we FORCE training, ignoring 'load_trained_models' in YAML if needed + # sys_training_params['load_trained_models'] = False + + for process, process_dict in NN_training_syst_process.items(): + for syst, syst_dict in process_dict.items(): + for direction in syst_dict.keys(): + + logger.info(f"Training: {process} | {syst} | {direction}") + + NN_training_syst_process[process][syst][direction].train_ensemble(**sys_training_params) + + # NN_training_syst_process[process][syst][direction].test_normalization() + + logger.info("Evaluating Ratios on Asimov Dataset...") + path_to_load = f"{PATH_TO_SAVED_DATA}dataset_Asimov_SR.root" + + dataset_Asimov_SR = nsbi_common_utils.datasets.load_dataframe_from_root( + path_to_load=path_to_load, tree_name="nominal", branches_to_load=branches_to_load + ) + + ensemble_aggregation_type = 'mean_ratio' + path_to_saved_ratios_eval = {} # Renamed to avoid confusion + + for process in config.get_basis_samples(): + path_to_saved_ratios_eval[process] = {} + for dict_syst in config.config["Systematics"]: + syst = dict_syst["Name"] + if (process not in dict_syst["Samples"]) or (dict_syst["Type"] != 
"NormPlusShape"): continue + + path_to_saved_ratios_eval[process][syst] = {} + for direction in ["Up", "Dn"]: + + logger.info(f"Evaluating {process} {syst} {direction}") + path_to_saved_ratios_eval[process][syst][direction] = \ + NN_training_syst_process[process][syst][direction].evaluate_and_save_ratios( + dataset_Asimov_SR, aggregation_type=ensemble_aggregation_type + ) + + logger.info("Systematic workflow completed.") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/FAIR_universe_Higgs_tautau/config_training_NN_parms.yml b/FAIR_universe_Higgs_tautau/config_training_NN_parms.yml new file mode 100644 index 0000000..a8e8ce8 --- /dev/null +++ b/FAIR_universe_Higgs_tautau/config_training_NN_parms.yml @@ -0,0 +1,52 @@ + +training_settings: + # Define parameter values here + defaults: &training_defaults + hidden_layers: 4 + neurons: 1000 + number_of_epochs: 100 + batch_size: 512 + learning_rate: 0.1 + scalerType: 'MinMax' + calibration: False + num_bins_cal: 500 + callback: True + callback_patience: 30 + callback_factor: 0.01 + validation_split: 0.1 + holdout_split: 0.25 + verbose: 1 + plot_scaled_features: False + load_trained_models: True + recalibrate_output: False + type_of_calibration: 'histogram' + num_ensemble_members: 10 + summarize_model: True + + htautau: + <<: *training_defaults + + ttbar: + <<: *training_defaults + + ztautau: + <<: *training_defaults + +systematic_uncertainty_training: + calibration: True + num_bins_cal: 50 + load_trained_models: True + recalibrate_output: True + num_ensemble_members: 1 + plot_scaled_features: False + verbose: 1 + scalerType: 'MinMax' + hidden_layers: 4 + neurons: 1000 + number_of_epochs: 100 + batch_size: 10000 + learning_rate: 0.1 + type_of_calibration: 'histogram' + callback: True + callback_patience: 30 + callback_factor: 0.01 \ No newline at end of file