diff --git a/README.md b/README.md index 64da8cf1..9f02975e 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,8 @@ HGCalML =============================================================================== Requirements - * DeepJetCore 3.X (``https://github.com/DL4Jets/DeepJetCore``) - * DeepJetCore 3.X container (or latest version in general) + * DeepJetCore 4.X (``https://github.com/DL4Jets/DeepJetCore``) + * DeepJetCore 4.X container (or latest version in general) For CERN (or any machine with cvmfs mounted), a script to start the latest container use this script: ``` @@ -23,7 +23,7 @@ sing=`which singularity` unset PATH cd -$sing run -B /eos -B /afs $gpuopt /cvmfs/unpacked.cern.ch/registry.hub.docker.com/cernml4reco/deepjetcore3:latest +$sing run -B /eos -B /afs $gpuopt /cvmfs/unpacked.cern.ch/registry.hub.docker.com/cernml4reco/deepjetcore4:latest ``` The package follows the structure and logic of all DeepJetCore subpackages (also the example in DeepJetCore). So as a fresh starting point, it can be a good idea to follow the DeepJetCore example first. diff --git a/Train/config_trainer.py b/Train/config_trainer.py index 74b96613..2adf003f 100644 --- a/Train/config_trainer.py +++ b/Train/config_trainer.py @@ -21,14 +21,14 @@ from Layers import MixWhere from Layers import RaggedGravNet from Layers import PlotCoordinates -from Layers import DistanceWeightedMessagePassing +from Layers import DistanceWeightedMessagePassing, AccumulateNeighbours from Layers import LLFillSpace from Layers import LLExtendedObjectCondensation -from Layers import DictModel,RaggedDictModel +from Layers import DictModel from Layers import RaggedGlobalExchange from Layers import SphereActivation from Layers import Multi -from Layers import ShiftDistance +from Layers import ShiftDistance, KNN from Layers import LLRegulariseGravNetSpace from Regularizers import AverageDistanceRegularizer from model_blocks import tiny_pc_pool, condition_input @@ -132,18 +132,21 @@ def config_model(Inputs, td, debug_outdir=None, plot_debug_every=2000): x = ScaledGooeyBatchNorm2( fluidity_decay=0.01, max_viscosity=0.999999, + learn=True, no_gaus=False)([x, is_track]) x = ScaledGooeyBatchNorm2( fluidity_decay=0.01, max_viscosity=0.999999, invert_condition=True, + learn=True, no_gaus=False)([x, is_track]) c_coords = prime_coords c_coords = ScaledGooeyBatchNorm2( name='batchnorm_ccoords', fluidity_decay=0.01, + learn=True, max_viscosity=0.999999)(c_coords) c_coords = PlotCoordinates( plot_every=plot_debug_every, @@ -156,14 +159,20 @@ def config_model(Inputs, td, debug_outdir=None, plot_debug_every=2000): x = Concatenate()([x, c_coords, is_track]) x = Dense(64, name='dense_pre_loop', activation=DENSE_ACTIVATION)(x) - allfeat = [] + allfeat = [c_coords] print("Available keys: ", pre_processed.keys()) + ## testing ## + + #nidx,dist = KNN(8, use_approximate_knn=True)([prime_coords, rs]) + #x = Concatenate()([x, dist]) + #x = Concatenate()([x, DistanceWeightedMessagePassing([16])([x,nidx,dist]) ]) + ########################################################################### ### Loop over GravNet Layers ############################################## ########################################################################### - gravnet_regs = [0.01, 0.01, 0.01] + gravnet_reg = 0.01 for i in range(GRAVNET_ITERATIONS): @@ -189,14 +198,18 @@ def config_model(Inputs, td, debug_outdir=None, plot_debug_every=2000): )([x, rs]) gndist = LLRegulariseGravNetSpace( - scale=gravnet_regs[i], + scale=gravnet_reg, record_metrics=False, 
name=f'regularise_gravnet_{i}')([gndist, prime_coords, gnnidx]) - x_rand = random_sampling_block( - xgn, rs, gncoords, gnnidx, gndist, is_track, - reduction=6, layer_norm=True, name=f"RSU_{i}") - x_rand = ScaledGooeyBatchNorm2(**BATCHNORM_OPTIONS)(x_rand) + x = DistanceWeightedMessagePassing( + [32,32,32,32,32,32,32], + activation='elu')([x,gnnidx,gndist]) + + #x_rand = random_sampling_block( + # xgn, rs, gncoords, gnnidx, gndist, is_track, + # reduction=6, layer_norm=True, name=f"RSU_{i}") + #x_rand = ScaledGooeyBatchNorm2(**BATCHNORM_OPTIONS)(x_rand) gndist = AverageDistanceRegularizer( strength=1e-3, @@ -214,8 +227,8 @@ def config_model(Inputs, td, debug_outdir=None, plot_debug_every=2000): # x_rand = ScalarMultiply(0.1)(x_rand) # gndist = ScalarMultiply(0.01)(gndist) # gncoords = ScalarMultiply(0.01)(gncoords) - x = Concatenate()([x_pre, xgn, x_rand, gndist, gncoords]) - x = Dense(d_shape, + x = Concatenate()([x, x_pre, xgn, gndist, gncoords]) + x = Dense(2*d_shape, name=f"dense_post_gravnet_1_iteration_{i}", activation=DENSE_ACTIVATION, kernel_regularizer=DENSE_REGULARIZER)(x) @@ -270,7 +283,7 @@ def config_model(Inputs, td, debug_outdir=None, plot_debug_every=2000): pred_beta = LLExtendedObjectCondensation(scale=1., use_energy_weights=True, - record_metrics=False, + record_metrics=True, print_loss=True, name="ExtendedOCLoss", implementation = loss_implementation, @@ -304,7 +317,7 @@ def config_model(Inputs, td, debug_outdir=None, plot_debug_every=2000): # 'no_noise_rs': pre_processed['no_noise_rs'], } - return RaggedDictModel(inputs=Inputs, outputs=model_outputs) + return DictModel(inputs=Inputs, outputs=model_outputs) #return DictModel(inputs=Inputs, outputs=model_outputs) diff --git a/Train/config_trainer_jk.py b/Train/config_trainer_jk.py new file mode 100644 index 00000000..58dc16e3 --- /dev/null +++ b/Train/config_trainer_jk.py @@ -0,0 +1,404 @@ +""" +Flexible training script that should be mostly configured with a yaml config file +""" + +import os +import sys +import yaml +import shutil +from argparse import ArgumentParser + +import tensorflow as tf +from tensorflow.keras.layers import Concatenate, Dense, Dropout + +from DeepJetCore.training.DeepJet_callbacks import simpleMetricsCallback +from DeepJetCore.DJCLayers import StopGradient, ScalarMultiply + +from DeepJetCore.wandb_interface import wandb_wrapper as wandb + +import training_base_hgcal +from Layers import ScaledGooeyBatchNorm2, DummyLayer +from Layers import MixWhere, ElementWiseMultiply +from Layers import RaggedGravNet +from Layers import PlotCoordinates +from Layers import DistanceWeightedMessagePassing, AccumulateNeighbours +from Layers import LLFillSpace +from Layers import LLExtendedObjectCondensation, TranslationInvariantMP + +from tensorflow.keras import Model + +from Layers import RaggedGlobalExchange +from Layers import SphereActivation +from Layers import Multi +from Layers import ShiftDistance, KNN, MessagePassing +from Layers import LLRegulariseGravNetSpace +from Regularizers import AverageDistanceRegularizer +from model_blocks import tiny_pc_pool, condition_input +from model_blocks import extent_coords_if_needed +from model_blocks import create_outputs +from model_tools import apply_weights_from_path +from model_blocks import random_sampling_unit, random_sampling_block, random_sampling_block2 +from noise_filter import noise_filter +from callbacks import plotClusteringDuringTraining +from callbacks import plotClusterSummary +from callbacks import NanSweeper, DebugPlotRunner + + 
+#################################################################################################### +### Load Arguments and trainer ##################################################################### +#################################################################################################### + +parser = ArgumentParser('training') +parser.add_argument('configFile') +parser.add_argument('--no_wandb', help="Don't use wandb", action='store_true') +parser.add_argument('--run_name', help="wandb run name", default='test') +parser.add_argument('--wandb_project', help="wandb project name", default="Autumn_2023") + +train = training_base_hgcal.HGCalTraining(parser=parser) #this will add all the other standard args -> train.args + +CONFIGFILE = train.args.configFile +print(f"Using config File: \n{CONFIGFILE}") + +with open(CONFIGFILE, 'r') as f: + config = yaml.load(f, Loader=yaml.FullLoader) + +N_CLUSTER_SPACE_COORDINATES = config['General']['n_cluster_space_coordinates'] +N_GRAVNET_SPACE_COORDINATES = config['General']['n_gravnet_space_coordinates'] +GRAVNET_ITERATIONS = len(config['General']['gravnet']) +LOSS_OPTIONS = config['LossOptions'] +BATCHNORM_OPTIONS = config['BatchNormOptions'] +DENSE_ACTIVATION = config['DenseOptions']['activation'] +DENSE_REGULARIZER = tf.keras.regularizers.l2(config['DenseOptions']['kernel_regularizer_rate']) +DROPOUT = config['DenseOptions']['dropout'] + +wandb_config = { + "loss_implementation" : config['General']['oc_implementation'], + "gravnet_iterations" : GRAVNET_ITERATIONS, + "gravnet_space_coordinates" : N_GRAVNET_SPACE_COORDINATES, + "cluster_space_coordinates" : N_CLUSTER_SPACE_COORDINATES, + "loss_energy_weight" : config['LossOptions']['energy_loss_weight'], + "loss_classification_weight" : config['LossOptions']['classification_loss_weight'], + "loss_qmin" : config['LossOptions']['q_min'], + "loss_use_average_cc_pos" : config['LossOptions']['use_average_cc_pos'], + "loss_too_much_beta_scale" : config['LossOptions']['too_much_beta_scale'], + "loss_beta_scale" : config['LossOptions']['beta_loss_scale'], + "batch_max_viscosity" : config['BatchNormOptions']['max_viscosity'], + "dense_activation" : config['DenseOptions']['activation'], + "dense_kernel_reg" : config['DenseOptions']['kernel_regularizer_rate'] , + "dense_dropout" : config['DenseOptions']['dropout'], +} + +for i in range(GRAVNET_ITERATIONS): + wandb_config[f"gravnet_{i}_neighbours"] =config['General']['gravnet'][i]['n'] +for i in range(len(config['Training'])): + wandb_config[f"train_{i}_lr"] = config['Training'][i]['learning_rate'] + wandb_config[f"train_{i}_epochs"] = config['Training'][i]['epochs'] + wandb_config[f"train_{i}_batchsize"] = config['Training'][i]['batch_size'] + if i == 1: + wandb_config[f"train_{i}+_max_visc"] = 0.999 + wandb_config[f"train_{i}+_fluidity_decay"] = 0.1 + +if not train.args.no_wandb: + wandb.init( + project=train.args.wandb_project, + config=wandb_config, + ) + wandb.save(sys.argv[0]) # Save python file + wandb.save(train.args.configFile) # Save config file +else: + wandb.active=False + + +############################################################################### +### Define Model ############################################################## +############################################################################### + + +def config_model(Inputs, td, debug_outdir=None, plot_debug_every=500): + """ + Function that defines the model to train + """ + + ########################################################################### + ### 
Pre-processing step ################################################### + ########################################################################### + + orig_input = td.interpretAllModelInputs(Inputs) + pre_processed = condition_input(orig_input, no_scaling=False, no_prime=False) + + prime_coords = pre_processed['prime_coords'] + is_track = pre_processed['is_track'] + rs = pre_processed['row_splits'] + energy = pre_processed['rechit_energy'] + t_idx = pre_processed['t_idx'] + x = pre_processed['features'] + + # preprocessed features here are already normalised! (no_scaling=False above) + # Embed hits and track features differently + x = ScaledGooeyBatchNorm2( + learn=True, + no_bias_gamma=True, + no_gaus=False)([x, is_track]) + + x = ScaledGooeyBatchNorm2( + invert_condition=True, + learn=True, + no_bias_gamma=True, + no_gaus=False)([x, is_track]) + + c_coords = prime_coords + c_coords = ScaledGooeyBatchNorm2( + name='batchnorm_ccoords', + no_bias_gamma=True, + learn=True + )(c_coords) + + c_coords = PlotCoordinates( + plot_every=plot_debug_every, + outdir=debug_outdir, + name='input_c_coords', + # publish = publishpath + )([c_coords, energy, t_idx, rs]) + + x = Concatenate()([x, c_coords, is_track]) + x = Dense(24, name='dense_pre_pre_loop', activation=DENSE_ACTIVATION)(x) + print("Available keys: ", pre_processed.keys()) + + ## create a pre-information exchange + layerwise_coords = ElementWiseMultiply([[1.,1.,1000.]])(prime_coords) + flat_coords = ElementWiseMultiply([[1.,1.,0.]])(prime_coords) + + lw_nidx, lw_dist = KNN(32)([layerwise_coords, rs]) + flat_nidx, flat_dist = KNN(32)([flat_coords, rs]) + + # a few simple MP in different directions + + for _ in range(1): + xt = TranslationInvariantMP([16,16])([x, lw_nidx]) + x = Concatenate()([xt,x]) + xt = TranslationInvariantMP([16,16])([x, flat_nidx]) + x = Concatenate()([x,xt]) + + c_coords = extent_coords_if_needed(prime_coords, x, N_CLUSTER_SPACE_COORDINATES) + x = Dense(64, name='dense_pre_loop', activation=DENSE_ACTIVATION)(x) + allfeat = [c_coords, x] + gncoords = c_coords + + ########################################################################### + ### Loop over GravNet Layers ############################################## + ########################################################################### + + gravnet_reg = 1e-4 + + for i in range(GRAVNET_ITERATIONS): + + d_shape = x.shape[1]//2 + + x = Dense(d_shape,activation=DENSE_ACTIVATION, + kernel_regularizer=DENSE_REGULARIZER)(x) + x = Dense(d_shape,activation=DENSE_ACTIVATION, + kernel_regularizer=DENSE_REGULARIZER)(x) + + x = ScaledGooeyBatchNorm2(**BATCHNORM_OPTIONS)(x) + x_pre = x + x = Concatenate()([gncoords, x]) + + x, gncoords, gnnidx, gndist = RaggedGravNet( + name = f"Gravnet_{i}", # 76929, 42625, 42625 + n_neighbours=config['General']['gravnet'][i]['n'], + n_dimensions=N_GRAVNET_SPACE_COORDINATES, + n_filters=d_shape, + n_propagate=2*d_shape, + coord_initialiser_noise=1e-2, + feature_activation=None, #classic gravnet + # sumwnorm=True, + )([x, rs]) + + gndist = LLRegulariseGravNetSpace( + scale=gravnet_reg, + record_metrics=False, + name=f'regularise_gravnet_{i}')([gndist, prime_coords, gnnidx]) + + x = Concatenate()([x, TranslationInvariantMP( + [32,32], + activation='elu')([x,gnnidx,gndist]) + ]) + + gncoords = PlotCoordinates( + plot_every=plot_debug_every, + outdir=debug_outdir, + name='gn_coords_'+str(i) + )([gncoords, energy, t_idx, rs]) + + x = DummyLayer()([x,gncoords,gndist])#just make sure any layer above is not optimised away + + # afew of those again + 
xt = TranslationInvariantMP([16,16])([x, lw_nidx]) + x = Concatenate()([x,xt]) + xt = TranslationInvariantMP([16,16])([x, flat_nidx]) + x = Concatenate()([x,xt]) + + x = Concatenate()([x, x_pre]) + x = Dense(2*d_shape, + name=f"dense_post_gravnet_1_iteration_{i}", + activation=DENSE_ACTIVATION, + kernel_regularizer=DENSE_REGULARIZER)(x) + x = Dense(d_shape, + name=f"dense_post_gravnet_2_iteration_{i}", + activation=DENSE_ACTIVATION, + kernel_regularizer=DENSE_REGULARIZER)(x) + + x = ScaledGooeyBatchNorm2( + name=f"batchnorm_loop1_iteration_{i}", + **BATCHNORM_OPTIONS)(x) + + allfeat.append(x) + + if len(allfeat) > 1: + x = Concatenate()(allfeat) + else: + x = allfeat[0] + + ########################################################################### + ### Create output of model and define loss ################################ + ########################################################################### + + + # afew of those again + xt = TranslationInvariantMP([32,16])([x, lw_nidx]) + x = Concatenate()([x,xt]) + xt = TranslationInvariantMP([32,16])([x, flat_nidx]) + x = Concatenate()([x,xt]) + + x = Dense(64, + name=f"dense_final_{1}", + activation=DENSE_ACTIVATION, + kernel_regularizer=DENSE_REGULARIZER)(x) + x = Dense(64, + name=f"dense_final_{2}", + activation=DENSE_ACTIVATION, + kernel_regularizer=DENSE_REGULARIZER)(x) + x = Dense(64, + name=f"dense_final_{3}", + activation=DENSE_ACTIVATION, + kernel_regularizer=DENSE_REGULARIZER)(x) + x = ScaledGooeyBatchNorm2( + name=f"batchnorm_final", + **BATCHNORM_OPTIONS)(x) + + pred_beta, pred_ccoords, pred_dist, \ + pred_energy_corr, pred_energy_low_quantile, pred_energy_high_quantile, \ + pred_pos, pred_time, pred_time_unc, pred_id = \ + create_outputs(x, n_ccoords=N_CLUSTER_SPACE_COORDINATES, fix_distance_scale=True) + + # pred_ccoords = LLFillSpace(maxhits=2000, runevery=5, scale=0.01)([pred_ccoords, rs, t_idx]) + + + pred_beta = LLExtendedObjectCondensation(scale=1., + use_energy_weights=True, + record_metrics=True, + print_loss=False, + name="ExtendedOCLoss", + implementation = config['General']['oc_implementation'], + **LOSS_OPTIONS)( + [pred_beta, pred_ccoords, pred_dist, pred_energy_corr, pred_energy_low_quantile, + pred_energy_high_quantile, pred_pos, pred_time, pred_time_unc, pred_id, energy, + pre_processed['t_idx'] , pre_processed['t_energy'] , pre_processed['t_pos'] , + pre_processed['t_time'] , pre_processed['t_pid'] , pre_processed['t_spectator_weight'], + pre_processed['t_fully_contained'], pre_processed['t_rec_energy'], + pre_processed['t_is_unique'], pre_processed['row_splits']]) + + pred_ccoords = PlotCoordinates( + plot_every=plot_debug_every, + outdir = debug_outdir, + name='condensation' + )([pred_ccoords, pred_beta, pre_processed['t_idx'], rs]) + + model_outputs = { + 'pred_beta': pred_beta, + 'pred_ccoords': pred_ccoords, + 'pred_energy_corr_factor': pred_energy_corr, + 'pred_energy_low_quantile': pred_energy_low_quantile, + 'pred_energy_high_quantile': pred_energy_high_quantile, + 'pred_pos': pred_pos, + 'pred_time': pred_time, + 'pred_id': pred_id, + 'pred_dist': pred_dist, + 'rechit_energy': energy, + 'row_splits': pre_processed['row_splits'], + # 'no_noise_sel': pre_processed['no_noise_sel'], + # 'no_noise_rs': pre_processed['no_noise_rs'], + } + + return Model(inputs=Inputs, outputs=model_outputs) + #return DictModel(inputs=Inputs, outputs=model_outputs) + + +############################################################################### +### Set up training ########################################################### 
+############################################################################### + + + +if not train.modelSet(): + train.setModel( + config_model, + td=train.train_data.dataclass(), + debug_outdir=train.outputDir+'/intplots', + ) + train.setCustomOptimizer(tf.keras.optimizers.Nadam(clipnorm=1.)) + train.compileModel(learningrate=1e-4) + train.keras_model.summary() + +############################################################################### +### Callbacks ################################################################# +############################################################################### + + +samplepath = train.val_data.getSamplePath(train.val_data.samples[0]) +PUBLISHPATH = "" +PUBLISHPATH += [d for d in train.outputDir.split('/') if len(d)][-1] +RECORD_FREQUENCY = 10 +PLOT_FREQUENCY = 40 + +cb = [NanSweeper()] #this takes a bit of time checking each batch but could be worth it + +cb += [ + plotClusterSummary( + outputfile=train.outputDir + "/clustering/", + samplefile=train.val_data.getSamplePath(train.val_data.samples[0]), + after_n_batches=500 + ) + ] + +############################################################################### +### Actual Training ########################################################### +############################################################################### + +shutil.copyfile(CONFIGFILE, os.path.join(sys.argv[3], "config.yaml")) + +N_TRAINING_STAGES = len(config['Training']) +for i in range(N_TRAINING_STAGES): + print(f"Starting training stage {i}") + learning_rate = config['Training'][i]['learning_rate'] + epochs = config['Training'][i]['epochs'] + batch_size = config['Training'][i]['batch_size'] + train.change_learning_rate(learning_rate) + print(f"Training for {epochs} epochs") + print(f"Learning rate set to {learning_rate}") + print(f"Batch size: {batch_size}") + + if i == 1: + # change batchnorm + for layer in train.keras_model.layers: + if 'batchnorm' in layer.name: + layer.max_viscosity = 0.999 + layer.fluidity_decay = 0.01 + print("Here we are") + train.trainModel( + nepochs=epochs, + batchsize=batch_size, + add_progbar=True, + additional_callbacks=cb, + collect_gradients = 1 + ) diff --git a/Train/configuration/rsu.yaml b/Train/configuration/rsu.yaml index 305b8832..954a78f1 100644 --- a/Train/configuration/rsu.yaml +++ b/Train/configuration/rsu.yaml @@ -1,44 +1,44 @@ General: oc_implementation: 'hinge' gravnet: - - n: 32 - - n: 128 + - n: 64 - n: 64 n_cluster_space_coordinates: 3 n_gravnet_space_coordinates: 3 DenseOptions: activation: "elu" - kernel_regularizer_rate: 0.001 + kernel_regularizer_rate: 0.00000000001 dropout: 0.000001 BatchNormOptions: max_viscosity: 0.5 + learn: True LossOptions: - energy_loss_weight: .00 - q_min: 5.0 - use_average_cc_pos: 0.99 - classification_loss_weight: 0.0 + energy_loss_weight: 0.00000000001 + q_min: 2.1 + use_average_cc_pos: 0.99999 + classification_loss_weight: 0.00000000001 too_much_beta_scale: 0.0 - position_loss_weight: 0.0 - timing_loss_weight: 0.0 + position_loss_weight: 0.00000000001 + timing_loss_weight: 0.00000000001 beta_loss_scale: 1.0 Training: - stage_1: batch_size: 120000 - learning_rate: 0.0003 - epochs: 1 + learning_rate: 0.0005 + epochs: 4 - stage_2: - batch_size: 120000 + batch_size: 400000 learning_rate: 0.0001 epochs: 10 - stage_3: - batch_size: 120000 - learning_rate: 0.00001 + batch_size: 400000 + learning_rate: 0.00003 epochs: 20 diff --git a/modules/DebugLayers.py b/modules/DebugLayers.py index 7b2b791e..29f71b9a 100644 --- a/modules/DebugLayers.py +++ 
b/modules/DebugLayers.py @@ -125,7 +125,7 @@ def __init__(self, if len(outdir) < 1: self.plot_every=0 self.outdir = outdir - self.counter=-1 + self.counter=0 if not os.path.isdir(os.path.dirname(self.outdir)): #could not be created self.outdir='' diff --git a/modules/GravNetLayersRagged.py b/modules/GravNetLayersRagged.py index 7e2189b1..cb957408 100644 --- a/modules/GravNetLayersRagged.py +++ b/modules/GravNetLayersRagged.py @@ -33,6 +33,16 @@ # outshape *= 2 # return tf.reshape(out, [-1, outshape]),midx +def layernorm(x, return_norm=False): + #x = x - tf.reduce_mean(x,axis=-1, keepdims=True) + norm = tf.reduce_sum(x**2, axis=-1,keepdims=True) + norm = tf.sqrt(norm+1e-6) + if return_norm: + x = tf.concat([x / norm * tf.sqrt(tf.cast(x.shape[-1],'float32')), norm], axis=-1) + else: + x = x / norm * tf.sqrt(tf.cast(x.shape[-1],'float32')) + return x + class RandomSampling(tf.keras.layers.Layer): """ @@ -1314,7 +1324,7 @@ def call(self, inputs, training=None): class ScaledGooeyBatchNorm2(tf.keras.layers.Layer): def __init__(self, - viscosity=0.01, + viscosity=1e-9,#start at almost zero fluidity_decay=1e-4, max_viscosity=0.99999, no_gaus = True, @@ -1322,6 +1332,8 @@ def __init__(self, invert_condition=False, _promptnames=None, #compatibility, does nothing record_metrics=False, #compatibility, does nothing + learn = False, + no_bias_gamma = False, **kwargs): ''' Input features (or [features, condition]), output: normed features @@ -1350,6 +1362,8 @@ def __init__(self, self.epsilon = epsilon self.no_gaus = no_gaus self.invert_condition = invert_condition + self.learn = learn + self.no_bias_gamma = no_bias_gamma def compute_output_shape(self, input_shapes): #return input_shapes[0] @@ -1363,7 +1377,9 @@ def get_config(self): 'max_viscosity': self.max_viscosity, 'epsilon': self.epsilon, 'no_gaus': self.no_gaus, - 'invert_condition': self.invert_condition + 'invert_condition': self.invert_condition, + 'learn': self.learn, + 'no_bias_gamma': self.no_bias_gamma } base_config = super(ScaledGooeyBatchNorm2, self).get_config() return dict(list(base_config.items()) + list(config.items())) @@ -1378,14 +1394,14 @@ def build(self, input_shapes): shape = (1,)+input_shapes[1:] self.bias = self.add_weight(name = 'bias',shape = shape, - initializer = 'zeros', trainable = self.trainable) + initializer = 'zeros', trainable = self.trainable and not self.no_bias_gamma) self.gamma = self.add_weight(name = 'gamma',shape = shape, - initializer = 'ones', trainable = self.trainable) - + initializer = 'ones', trainable = self.trainable and not self.no_bias_gamma) + self.mean = self.add_weight(name = 'mean',shape = shape, - initializer = 'zeros', trainable = False) + initializer = 'zeros', trainable = self.learn and self.trainable) self.den = self.add_weight(name = 'den',shape = shape, - initializer = 'ones', trainable = False) + initializer = 'ones', trainable = self.learn and self.trainable) self.viscosity = tf.Variable(initial_value=self.viscosity_init, name='viscosity', trainable=False,dtype='float32') @@ -1403,6 +1419,8 @@ def _calc_mean_and_protect(self, x, mask, default): return x def _calc_update(self, old, new, training, visc=None): + if not self.trainable: + return old if visc is None: visc = self.viscosity delta = new-old @@ -1410,6 +1428,8 @@ def _calc_update(self, old, new, training, visc=None): return tf.keras.backend.in_train_phase(update,old,training=training) def _update_viscosity(self, training): + if not self.trainable: + return if self.fluidity_decay > 0: newvisc = self.viscosity + (self.max_viscosity 
- self.viscosity)*self.fluidity_decay newvisc = tf.keras.backend.in_train_phase(newvisc,self.viscosity,training=training) @@ -1422,15 +1442,14 @@ def _calc_out(self, x_in, cond): out = (x_in - ngmean) / (tf.abs(ngden) + self.epsilon) out = out*self.gamma + self.bias - if self.invert_condition: - return tf.where(cond<=0.5, out, x_in) - else: - return tf.where(cond>0.5, out, x_in) + return tf.where(cond>0.5, out, x_in) def call(self, inputs, training=None): if isinstance(inputs,list): x_in, cond = inputs cond = tf.where(cond > 0.5, tf.ones_like(cond), 0.) #make sure it's ones and zeros + if self.invert_condition: + cond = 1.-cond else: x_in = inputs cond = tf.ones_like(x_in[...,0:1]) @@ -1441,7 +1460,7 @@ def call(self, inputs, training=None): #x = x_in #maybe don't stop the gradient? x_m = self._calc_mean_and_protect(x, cond, self.mean) - diff_to_mean = tf.abs(x - self.mean) #self.mean or x_m + diff_to_mean = tf.abs(x - x_m) #self.mean) #self.mean or x_m -> self.mean over-corrects if not self.no_gaus: diff_to_mean = diff_to_mean**2 @@ -1450,14 +1469,18 @@ def call(self, inputs, training=None): if not self.no_gaus: x_std = tf.sqrt(x_std + self.epsilon) - - update = self._calc_update(self.mean,x_m,training) - tf.keras.backend.update(self.mean, update) - - update = self._calc_update(self.den,x_std,training) - tf.keras.backend.update(self.den, update) - - self._update_viscosity(training) + #tf.print(self.name, 'p_loss',p_loss, tf.reduce_mean(self.mean), tf.reduce_mean(self.den)) + if self.learn: + p_loss = tf.reduce_mean( (self.mean-x_m)**2 + (self.den-x_std)**2 ) + self.add_loss(p_loss) + else: + update = self._calc_update(self.mean,x_m,training) + tf.keras.backend.update(self.mean, update) + + update = self._calc_update(self.den,x_std,training) + tf.keras.backend.update(self.den, update) + + self._update_viscosity(training) out = self._calc_out(x_in, cond) @@ -1482,103 +1505,82 @@ def __init__(self,**kwargs): class ProcessFeatures(tf.keras.layers.Layer): + def __init__(self, - newformat=True,#compat can be restored but default is new format + is_hgcal = False, **kwargs): - """ - Inputs are: - - Features - - Call will return: - - processed features - - will apply some simple fixed preprocessing to the standard TrainData_OC features - """ - - from datastructures import TrainData_NanoML - self.td=TrainData_NanoML() - self.newformat = newformat - super(ProcessFeatures, self).__init__(**kwargs) + super().__init__(**kwargs) + self.is_hgcal = is_hgcal + ''' + 'recHitEnergy': feat[:,0:1] , #recHitEnergy, + 'recHitEta' : feat[:,1:2] , #recHitEta , + 'recHitID' : feat[:,2:3] , #recHitID, #indicator if it is track or not + 'recHitTheta' : feat[:,3:4] , #recHitTheta , + 'recHitR' : feat[:,4:5] , #recHitR , + 'recHitX' : feat[:,5:6] , #recHitX , + 'recHitY' : feat[:,6:7] , #recHitY , + 'recHitZ' : feat[:,7:8] , #recHitZ , + 'recHitTime' : feat[:,8:9] , #recHitTime + 'recHitHitR' : feat[:,9:10] , + ''' + if self.is_hgcal: + self.mean = tf.constant([[ + 0.1, + 0., # 2.6024690e+00, #this is abs + 0., + 0., # 1.5362334e-01, #this is abs + 0., # 3.4008121e+02, #this is abs + 0., + 0., + 0., # 3.3786691e+02 #this is abs + 0., + 0.1]], dtype='float32') + else: + self.mean = tf.constant([[ + 0.1, + 2.6, #this is abs + 0., + 1.5e-01, #this is abs + 3.5e+02, #this is abs + 0., + 0., + 3.5e+02, + 0., + 0.1]], dtype='float32') + + self.std = tf.constant([[ + 2.4, + 3.4e-01, + 1., #ID set to 1, this will be track charge? 
+ 6.4e-02, + 1.5e+01, + 4.0e+01, + 3.9e+01, + 1.3e+01, + 1., + 4.2e-01]], dtype='float32') + + def get_config(self): - config = {'newformat': self.newformat} + config = {'is_hgcal': self.is_hgcal} base_config = super(ProcessFeatures, self).get_config() return dict(list(base_config.items()) + list(config.items())) - + def compute_output_shape(self, input_shape): return input_shape def call(self, inputs): - ''' - 'recHitEnergy', - 'recHitEta', - 'isTrack', - 'recHitTheta', - 'recHitR', - 'recHitX', - 'recHitY', - 'recHitZ', - 'recHitTime', - 'recHitHitR' - ''' - feat = None - fdict = self.td.createFeatureDict(inputs, False) - - if not self.newformat: - #please make sure in TrainData_OC that this is consistent - fdict['recHitR'] /= 100. - fdict['recHitX'] /= 100. - fdict['recHitY'] /= 100. - fdict['recHitZ'] = (tf.abs(fdict['recHitZ'])-400)/100. - fdict['recHitEta'] = tf.abs(fdict['recHitEta']) - fdict['recHitTheta'] = 2.*tf.math.atan(tf.exp(-fdict['recHitEta'])) - fdict['recHitTime'] = tf.nn.relu(fdict['recHitTime'])/10. #remove -1 default - allf = [] - for k in fdict: - allf.append(fdict[k]) - feat = tf.concat(allf,axis=-1) - - mean = tf.constant([[0.0740814656, 2.46156192, 0., 0.207392946, 3.55599976, 0.0609507263, - -0.00401970092, -0.515379727, 0.0874295086]]) - std = tf.constant([[0.299679846, 0.382687777, 1., 0.0841238275, 0.250777304, 0.485394388, - 0.518072903, 0.240222782, 0.194716245]]) - feat -= mean - feat /= std - - return feat - - else: #new std format - fdict['recHitEta'] = tf.abs(fdict['recHitEta']) - fdict['recHitZ'] = tf.abs(fdict['recHitZ']) - fdict['recHitTheta'] = 2.*tf.math.atan(tf.exp(-fdict['recHitEta'])) - allf = [] - for k in fdict: - allf.append(fdict[k]) - feat = tf.concat(allf,axis=-1) - - mean = tf.constant([[ 3.95022651e-02, 2.46088736e+00, - 0.00000000e+00, - 1.56797723e+00, 3.54114793e+02, - 0.,#x - 0.,#y - 3.47267313e+02, - -3.73342582e-01, - 7.61663214e-01]]) - std = tf.constant([[ 0.16587503, 0.36627547, 1. , - 1.39035478, 22.55941696, - 50., 50., - 21.75297722, - 1.89301789, - 0.14808707]]) - - feat -= mean - feat /= std - - return feat - - ##old format below + #hard coded normalisation for HGCAL (or similar) only! + feat = inputs + + feat -= self.mean + feat /= self.std + + return feat + @@ -2405,6 +2407,8 @@ def compute_output_shape(self, input_shapes): def raw_call(coordinates, row_splits, K, radius, use_approximate_knn, min_bins, tfdist, myself): nbins=None if use_approximate_knn: + if min_bins is None: + min_bins = 11 bin_width = radius # default value for SlicingKnn kernel idx,dist,nbins = SlicingKnn(K+1, coordinates, row_splits, features_to_bin_on = (0,1), @@ -3301,7 +3305,7 @@ def build(self, input_shapes): super(RaggedGravNet, self).build(input_shape) - + @tf.function def create_output_features(self, x, neighbour_indices, distancesq): allfeat = [] features = x @@ -3316,20 +3320,11 @@ def create_output_features(self, x, neighbour_indices, distancesq): features = tf.concat(allfeat + [x], axis=-1) return self.output_feature_transform(features) - def priv_call(self, inputs, training=None): - x = inputs[0] - row_splits = inputs[1] - tf.assert_equal(x.shape.ndims, 2) - tf.assert_equal(row_splits.shape.ndims, 1) - if row_splits.shape[0] is not None: - tf.assert_equal(row_splits[-1], x.shape[0]) - - x_coord = x - if len(inputs) == 3: - x_coord = tf.concat([inputs[2], x], axis=-1) - + #@tf.function(reduce_retracing=True) #don't know why this is being retraced so often.. 
+ def priv_call(self, x, row_splits, x_coord): + coordinates = self.input_spatial_transform(x_coord) - neighbour_indices, distancesq, sidx, sdist = self.compute_neighbours_and_distancesq(coordinates, row_splits, training) + neighbour_indices, distancesq, sidx, sdist = self.compute_neighbours_and_distancesq(coordinates, row_splits) neighbour_indices = tf.reshape(neighbour_indices, [-1, self.n_neighbours]) #for proper output shape for keras distancesq = tf.reshape(distancesq, [-1, self.n_neighbours]) @@ -3338,8 +3333,14 @@ def priv_call(self, inputs, training=None): neighbour_indices, distancesq = sidx, sdist return outfeats, coordinates, neighbour_indices, distancesq - def call(self, inputs, training): - return self.priv_call(inputs, training) + def call(self, inputs, training=None): + x = inputs[0] + row_splits = inputs[1] + x_coord = x + if len(inputs) == 3: + x_coord = tf.concat([inputs[2], x], axis=-1) + + return self.priv_call(x, row_splits, x_coord) def compute_output_shape(self, input_shapes): if self.return_self: @@ -3353,9 +3354,8 @@ def compute_output_shape(self, input_shapes): (input_shapes[0][0], self.n_neighbours),\ (input_shapes[0][0], self.n_neighbours) - - - def compute_neighbours_and_distancesq(self, coordinates, row_splits, training): + #@tf.function + def compute_neighbours_and_distancesq(self, coordinates, row_splits): idx,dist = BinnedSelectKnn(self.n_neighbours+1, coordinates, row_splits, max_radius= -1.0, tf_compatible=False, @@ -3367,9 +3367,7 @@ def compute_neighbours_and_distancesq(self, coordinates, row_splits, training): dist = tf.where(idx<0,0.,dist) - if self.return_self: - return idx[:, 1:], dist[:, 1:], idx, dist - return idx[:, 1:], dist[:, 1:], None, None + return idx[:, 1:], dist[:, 1:], idx, dist def collect_neighbours(self, features, neighbour_indices, distancesq): @@ -3415,6 +3413,115 @@ def call(self, input): att = 2.* self.att_dense(input) return input * att +class TranslationInvariantMP(tf.keras.layers.Layer): + def __init__(self, + n_feature_transformation, + activation='elu', + mean=True, + layer_norm = False, + sum_weight=False, + **kwargs): + super(TranslationInvariantMP, self).__init__(**kwargs) + + self.n_feature_transformation = n_feature_transformation + self.activation = activation + self.mean = mean + self.layer_norm = layer_norm + self.sum_weight = sum_weight + self.feature_tranformation_dense = [] + for i in range(len(self.n_feature_transformation)): + with tf.name_scope(self.name + "/" + str(i)): + self.feature_tranformation_dense.append( + tf.keras.layers.Dense(n_feature_transformation[i], activation=activation, use_bias = i>0)) + + + def build(self, input_shapes): + input_shape = input_shapes[0] + + with tf.name_scope(self.name + "/" + str(0)): + self.feature_tranformation_dense[0].build(input_shape) + + for i in range(1, len(self.feature_tranformation_dense)): + with tf.name_scope(self.name + "/" + str(i)): + self.feature_tranformation_dense[i].build((input_shape[0], self.n_feature_transformation[i - 1])) + + super(TranslationInvariantMP, self).build(input_shapes) + + + def compute_output_shape(self, inputs_shapes): + fshape = inputs_shapes[0][-1] + return (None, sum(self.n_feature_transformation)) + + + def get_config(self): + config = {'n_feature_transformation': self.n_feature_transformation, + 'activation': self.activation, + 'mean': self.mean, + 'layer_norm': self.layer_norm, + 'sum_weight': self.sum_weight + } + base_config = super(TranslationInvariantMP, self).get_config() + return dict(list(base_config.items()) + 
list(config.items())) + + def _trf_loop(self, features, neighbour_indices, distancesq, K, first): + + if self.layer_norm: + features = layernorm(features) + prev_feat = features + if self.mean: + #add a 1 to the features for translation invariance later + if first: + ones = tf.ones_like(features[:,0:1]) + features = tf.concat([ones, features], axis=-1) + # Standard Message Passing + features = AccumulateKnn( + 10.*distancesq, + features, + neighbour_indices, + mean_and_max=False)[0] * K + + # this is only necessary for the first exchange, afterwards the features are already translation independent + if first: + minus_xi = features[:,0:1] + features = features[:,1:] + features -= prev_feat * minus_xi + if self.sum_weight: + wsum = tf.math.divide_no_nan(K, tf.reduce_sum( tf.exp(-10.*distancesq), axis=1, keepdims=True) + 1e-2)#large eps + features *= wsum + else: #max + nfeat = SelectWithDefault(neighbour_indices, features,-2.) + features = tf.reduce_max(tf.exp(-10.*distancesq) * nfeat - features[:,tf.newaxis,:], axis=1) * K + + return features/K + + + def create_output_features(self, x, neighbour_indices, distancesq): + allfeat = [] + features = x + K = tf.cast(tf.shape(neighbour_indices)[1], 'float32') + + for i in range(len(self.n_feature_transformation)): + features = self._trf_loop(features, neighbour_indices, distancesq, K, i==0) + t = self.feature_tranformation_dense[i] + features = t(features) #divide by K here again + allfeat.append(features) + + features = tf.concat(allfeat, axis=-1) + features = tf.reshape(features, [-1, sum(self.n_feature_transformation)]) + return features + + @tf.function + def call(self, inputs): + if len(inputs) == 3: + x, neighbor_indices, distancesq = inputs + elif len(inputs) == 2: + x, neighbor_indices = inputs + distancesq = tf.zeros_like(neighbor_indices, dtype='float32') + else: + raise ValueError(self.name+" was passed wrong inputs") + + return self.create_output_features(x, neighbor_indices, distancesq) + class MultiAttentionGravNetAdd(tf.keras.layers.Layer): def __init__(self, @@ -3605,7 +3712,7 @@ def __init__(self, n_feature_transformation, self.feature_tranformation_dense = [] for i in range(len(self.n_feature_transformation)): with tf.name_scope(self.name + "/5/" + str(i)): - self.feature_tranformation_dense.append(tf.keras.layers.Dense(self.n_feature_transformation[i], activation='relu')) # restrict variations a bit + self.feature_tranformation_dense.append(tf.keras.layers.Dense(self.n_feature_transformation[i], activation='elu')) # restrict variations a bit def build(self, input_shapes): input_shape = input_shapes[0] @@ -3700,6 +3807,7 @@ def get_config(self): base_config = super(DistanceWeightedMessagePassing, self).get_config() return dict(list(base_config.items()) + list(config.items())) + @tf.function def create_output_features(self, x, neighbour_indices, distancesq): allfeat = [] features = x diff --git a/modules/Layers.py b/modules/Layers.py index ce13ae2c..848c7cfe 100644 --- a/modules/Layers.py +++ b/modules/Layers.py @@ -16,12 +16,8 @@ from tensorflow.keras.layers import LeakyReLU global_layers_list['LeakyReLU'] = LeakyReLU - #base modules -from baseModules import PromptMetric -global_layers_list['PromptMetric'] = PromptMetric - from baseModules import LayerWithMetrics global_layers_list['LayerWithMetrics'] = LayerWithMetrics @@ -230,6 +226,9 @@ from GravNetLayersRagged import RaggedGravNet global_layers_list['RaggedGravNet']=RaggedGravNet +from GravNetLayersRagged import TranslationInvariantMP 
+global_layers_list['TranslationInvariantMP']=TranslationInvariantMP + from GravNetLayersRagged import SelfAttention global_layers_list['SelfAttention']=SelfAttention @@ -302,7 +301,7 @@ from LossLayers import LLValuePenalty,LLNotNoiseClassifier,CreateTruthSpectatorWeights, NormaliseTruthIdxs, LLGraphCondOCLoss -from LossLayers import LLLocalClusterCoordinates, LLClusterCoordinates,LLFillSpace, LLOCThresholds +from LossLayers import LLLocalClusterCoordinates,LLFractionRegressor, LLClusterCoordinates,LLFillSpace, LLOCThresholds from LossLayers import LossLayerBase, LLBasicObjectCondensation, LLFullObjectCondensation,LLPFCondensates,LLNeighbourhoodClassifier from LossLayers import LLFullObjectCondensationUncertainty, LLFullObjectCondensationID from LossLayers import LLExtendedObjectCondensation, LLExtendedObjectCondensation2 @@ -329,6 +328,9 @@ global_layers_list['LLPushTracks']=LLPushTracks global_layers_list['LLEnergySums']=LLEnergySums +global_layers_list['LLFractionRegressor']=LLFractionRegressor + + global_layers_list['LLOCThresholds']=LLOCThresholds global_layers_list['LLLocalEnergyConservation']=LLLocalEnergyConservation @@ -377,6 +379,17 @@ import tensorflow.keras.backend as K import tensorflow as tf +class DummyLayer(tf.keras.layers.Layer): + ''' + Just to make sure other layers are not optimised away + Inputs: + - list of tensors. First will be passed through, the other will be ignored + ''' + def call(self, inputs): + return inputs[0] + +global_layers_list['DummyLayer']=DummyLayer + class GroupSortActivation(tf.keras.layers.Layer): @@ -390,6 +403,27 @@ def call(self, inputs): global_layers_list['GroupSortActivation']=GroupSortActivation +class ElementWiseMultiply(tf.keras.layers.Layer): + + def __init__(self, multi_vector : list, **kwargs): + super(ElementWiseMultiply, self).__init__(**kwargs) + self.multi_vector = multi_vector + self.mult = tf.constant(self.multi_vector, dtype='float32') + + def get_config(self): + config = {'multi_vector': self.multi_vector} + base_config = super(ElementWiseMultiply, self).get_config() + return dict(list(base_config.items()) + list(config.items() )) + + def compute_output_shape(self, input_shapes): + return input_shapes + + def call(self, inputs): + return inputs * self.mult + +global_layers_list['ElementWiseMultiply']=ElementWiseMultiply + + def layernorm(x, return_norm=False): x = x - tf.reduce_mean(x,axis=-1, keepdims=True) norm = tf.reduce_sum(x**2, axis=-1,keepdims=True) @@ -790,6 +824,7 @@ def __init__(self, skip_non_finite=5, :param kwargs: For subclass Model """ + raise ValueError("RobustModel deprecated, please use simply tf.keras.Model.") super(RobustModel, self).__init__(*args, **kwargs) # if 'config' in kwargs: @@ -950,39 +985,59 @@ def __init__(self, """ Just forces dictionary output """ - + raise ValueError("DictModel deprecated, please use simply tf.keras.Model.") super(DictModel, self).__init__(inputs,outputs=outputs, *args, **kwargs) - -class RaggedDictModel(tf.keras.Model): - def __init__(self, - inputs, - outputs: dict, #force to be dict - *args, **kwargs): - """ - Just forces dictionary output - """ + +global_layers_list['DictModel']=DictModel + + + +class ReduceHits(Layer): + def __init__(self, mode, **kwargs): + super(ReduceHits, self).__init__(**kwargs) - super(RaggedDictModel, self).__init__(inputs,outputs=outputs, *args, **kwargs) - - def call(self, inputs, *args, **kwargs): - return super(RaggedDictModel, self).call(self.unpack_ragged(inputs), *args, **kwargs) - - def train_step(self, inputs, *args, **kwargs): - return 
super(RaggedDictModel, self).train_step(self.unpack_ragged(inputs), *args, **kwargs) - #super(RaggedDictModel, self).train_step(inputs, *args, **kwargs) - - def unpack_ragged(self, inputs): - output = [] - for i in inputs: - print("Type of i is", type(i)) - print("Hasattr", hasattr(i, "row_splits")) - if type(i) == tf.RaggedTensor: - print("Inside") - output.append((i.values, i.row_splts)) - else: - output.append(i) + assert mode == 'mean' or mode == 'max' or mode == 'sum' or mode == 'min' + self.mode=mode - return output + def call(self, inputs): + assert len(inputs) == 2 + x, rs = inputs + xr = tf.RaggedTensor.from_row_splits(x, rs) + if self.mode == 'mean': + return tf.reduce_mean(xr, axis=1) + elif self.mode == 'max': + return tf.reduce_max(xr, axis=1) + elif self.mode == 'sum': + return tf.reduce_sum(xr, axis=1) + elif self.mode == 'min': + return tf.reduce_min(xr, axis=1) + else: + raise ValueError('Unknown mode %s' % self.mode) -global_layers_list['DictModel']=DictModel + + def get_config(self): + config = {'mode': self.mode} + base_config = super(ReduceHits, self).get_config() + return dict(list(base_config.items()) + list(config.items() )) + +global_layers_list['ReduceHits']=ReduceHits + +class BroadcastMultiply(Layer): + def __init__(self, mode, **kwargs): + ''' + multiplies with broadcasting, e.g. + a = tf.constant([[1,2,3],[4,5,6]]) + b = tf.constant([[1],[2]]) + c = BroadcastMultiply()([a,b]) + + print(c) -> [[1,2,3],[8,10,12]] + ''' + super(BroadcastMultiply, self).__init__(**kwargs) + + def call(self, inputs): + assert len(inputs) == 2 + x, y = inputs + return x*y + +global_layers_list['BroadcastMultiply']=BroadcastMultiply \ No newline at end of file diff --git a/modules/LossLayers.py b/modules/LossLayers.py index d7149dbe..a89f4ffa 100644 --- a/modules/LossLayers.py +++ b/modules/LossLayers.py @@ -271,7 +271,7 @@ def call(self, inputs): if not self.return_lossval: self.add_loss(lossval) - self.add_prompt_metric(lossval, self.name+'_loss') + self.wandb_log({self.name+'_loss': lossval}) if self.return_lossval: return a, lossval else: @@ -331,6 +331,46 @@ def loss(self, inputs): +class LLFractionRegressor(LossLayerBase): + + def __init__(self, + mode : str = "binary", + **kwargs): + ''' + Takes as input: + - score + - truth_fraction + Returns: + - score + ''' + assert mode == "binary" or mode == "regression_bce" or mode == "regression_mse" + + super(LLFractionRegressor, self).__init__(**kwargs) + self.mode = mode + + def get_config(self): + config = {'mode': self.mode} + base_config = super(LLFractionRegressor, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def loss(self, inputs): + assert isinstance(inputs, list) and len(inputs) == 2 + score, truth_fraction = inputs + + b_truth_fraction = tf.where(truth_fraction > 0.5, 1., tf.zeros_like(truth_fraction)) + if self.mode == "binary": + return tf.reduce_mean(tf.keras.losses.binary_crossentropy(b_truth_fraction, score)) + elif self.mode == "regression_bce": + return tf.reduce_mean(tf.keras.losses.binary_crossentropy(truth_fraction, score)) + elif self.mode == "regression_mse": + return tf.reduce_mean(tf.keras.losses.mean_squared_error(truth_fraction, score)) + else: + raise ValueError("Unknown mode: {}".format(self.mode)) + + + + + class LLValuePenalty(LossLayerBase): def __init__(self, @@ -636,7 +676,7 @@ def loss(self, inputs): abs_mean_diff = tf.abs(tf.reduce_mean(reldiff)) - self.add_prompt_metric(abs_mean_diff, self.name + '_loss') + self.wandb_log({self.name + '_loss': abs_mean_diff}) 
return abs_mean_diff #print(reldiff, '\n', tf.reduce_mean(reldiff)) @@ -800,14 +840,11 @@ def loss(self, inputs): efficiency /= ow_sum efficiency = tf.reduce_mean(efficiency) - self.add_prompt_metric(purity, self.name+'_purity') - self.add_prompt_metric(efficiency, self.name+'_efficiency') - + self.wandb_log({self.name+'_purity':purity, + self.name+'_efficiency': efficiency}) loss = self.purity_weight * (1. - purity) + (1.-self.purity_weight)* tf.abs(1. - efficiency) - print(f'purity {purity} efficiency {efficiency} loss {loss}') - return loss @@ -2155,7 +2192,7 @@ def __init__(self, *, energy_loss_weight=1., from object_condensation import Basic_OC_per_sample, PushPull_OC_per_sample, PreCond_OC_per_sample from object_condensation import Hinge_OC_per_sample_damped, Hinge_OC_per_sample, Hinge_Manhatten_OC_per_sample - from object_condensation import Hinge_OC_per_sample_learnable_qmin, Hinge_OC_per_sample_learnable_qmin_betascale_position + from object_condensation import Dead_Zone_Hinge_OC_per_sample,Hinge_OC_per_sample_learnable_qmin, Hinge_OC_per_sample_learnable_qmin_betascale_position impl = Basic_OC_per_sample if implementation == 'pushpull': impl = PushPull_OC_per_sample @@ -2163,6 +2200,8 @@ def __init__(self, *, energy_loss_weight=1., impl = PreCond_OC_per_sample if implementation == 'hinge': impl = Hinge_OC_per_sample + if implementation == 'hinge_deadzone': + impl = Dead_Zone_Hinge_OC_per_sample if implementation == 'hinge_full_grad': # same as`hinge` impl = Hinge_OC_per_sample @@ -2457,13 +2496,6 @@ def loss(self, inputs): timing_loss = self.timing_loss_weight * self.calc_timing_loss(t_time, pred_time, pred_time_unc,t_rec_energy) classification_loss = self.classification_loss_weight * self.calc_classification_loss(t_pid, pred_id, t_is_unique, hasunique) - ##just for time metrics - tdiff = (t_time-pred_time) - tdiff -= tf.reduce_mean(tdiff,keepdims=True) - tstd = tf.math.reduce_std(tdiff) - self.add_prompt_metric(tstd,self.name+'_time_std') - self.add_prompt_metric(tf.reduce_mean(pred_time_unc),self.name+'_time_pred_std') - #end just for metrics # nan_energy = tf.reduce_any(tf.math.is_nan(energy_loss)) # nan_position = tf.reduce_any(tf.math.is_nan(position_loss)) @@ -2534,7 +2566,7 @@ def loss(self, inputs): # div_repulsion = self.div_repulsion, # dynamic_payload_scaling_onset=self.dynamic_payload_scaling_onset # ) - att, rep, noise, min_b, payload, exceed_beta, containment, contamination = self.oc_loss_object( + [att, rep, noise, min_b, payload, exceed_beta], mdict = self.oc_loss_object( beta=pred_beta, x=pred_ccoords, d=pred_distscale, @@ -2545,12 +2577,8 @@ def loss(self, inputs): rs=rowsplits, energies = rechit_energy) - self.add_prompt_metric(att+rep,self.name+'_dynamic_payload_scaling') - - if containment is not None: - self.add_prompt_metric(containment,self.name+'_containment') - self.add_prompt_metric(contamination,self.name+'_contamination') - + #log the OC metrics dict, if any + self.wandb_log(mdict) att *= self.potential_scaling rep *= self.potential_scaling * self.repulsion_scaling @@ -2595,20 +2623,16 @@ def loss(self, inputs): lossval = tf.reduce_mean(lossval)+bpush - self.add_prompt_metric(att,self.name+'_attractive_loss') - self.add_prompt_metric(rep,self.name+'_repulsive_loss') - self.add_prompt_metric(min_b,self.name+'_min_beta_loss') - self.add_prompt_metric(noise,self.name+'_noise_loss') - self.add_prompt_metric(energy_loss,self.name+'_energy_loss') - self.add_prompt_metric(energy_unc_loss,self.name+'_energy_unc_loss') - 
self.add_prompt_metric(pos_loss,self.name+'_position_loss') - self.add_prompt_metric(time_loss,self.name+'_time_loss') - self.add_prompt_metric(class_loss,self.name+'_class_loss') - self.add_prompt_metric(exceed_beta,self.name+'_exceed_beta_loss') - self.add_prompt_metric(bpush,self.name+'_beta_push_loss') - self.add_prompt_metric(ccdamp,self.name+'_cc_damp_loss') - - self.add_prompt_metric(tf.reduce_mean(pred_distscale),self.name+'_avg_dist') + self.wandb_log({self.name+'_attractive_loss' : att, + self.name+'_repulsive_loss': rep, + self.name+'_min_beta_loss': min_b, + self.name+'_noise_loss': noise, + self.name+'_energy_loss': energy_loss, + self.name+'_energy_unc_loss': energy_unc_loss, + self.name+'_position_loss': pos_loss, + self.name+'_time_loss': time_loss, + self.name+'_class_loss': class_loss + }) self.maybe_print_loss(lossval) @@ -2803,7 +2827,7 @@ def calc_energy_correction_factor_loss(self, t_dep_energies = tf.where(t_dep_energies / t_energy < 0.5, 0.5 * t_energy, t_dep_energies) epred = pred_energy * t_dep_energies - sigma = pred_uncertainty_high * t_dep_energies + 1.0 + sigma = tf.abs(pred_uncertainty_high * t_dep_energies) + 1.0 #abs is a safety measure # Uncertainty 'sigma' must minimize this term: # ln(2*pi*sigma^2) + (E_true - E-pred)^2/sigma^2 @@ -2814,9 +2838,9 @@ def calc_energy_correction_factor_loss(self, uncertainty_loss = tf.math.log(sigma**2) - matching_loss = tf.debugging.check_numerics(matching_loss, "matching_loss") - prediction_loss = tf.debugging.check_numerics(prediction_loss, "matching_loss") - uncertainty_loss = tf.debugging.check_numerics(uncertainty_loss, "matching_loss") + prediction_loss = tf.debugging.check_numerics(prediction_loss, "E: prediction_loss") + uncertainty_loss = tf.debugging.check_numerics(uncertainty_loss, "E: uncertainty_loss") + matching_loss = tf.debugging.check_numerics(matching_loss, "E: matching_loss") prediction_loss = tf.clip_by_value(prediction_loss, 0, 10) uncertainty_loss = tf.clip_by_value(uncertainty_loss, 0, 10) @@ -3283,11 +3307,12 @@ def loss(self, inputs): eloss, euncloss = e_andunc_loss[...,0:1], e_andunc_loss[...,1:2] euncloss /= 2. 
#same as above - self.add_prompt_metric(tf.reduce_mean(eweight * closs), self.name + '_class_loss') - self.add_prompt_metric(tf.reduce_mean(eweight * tloss), self.name + '_time_loss') - self.add_prompt_metric(tf.reduce_mean(eweight * ploss), self.name + '_pos_loss') - self.add_prompt_metric(tf.reduce_mean(eweight * eloss), self.name + '_momentum_loss') - self.add_prompt_metric(tf.reduce_mean(eweight * euncloss), self.name + '_momentum_unc_loss') + self.wandb_log({self.name + '_class_loss': tf.reduce_mean(eweight * closs), + self.name + '_time_loss': tf.reduce_mean(eweight * tloss), + self.name + '_pos_loss': tf.reduce_mean(eweight * ploss), + self.name + '_momentum_loss': tf.reduce_mean(eweight * eloss), + self.name + '_momentum_unc_loss': tf.reduce_mean(eweight * euncloss) + }) #this is as if there weren't any modifications pfc_t_energy = tf.gather_nd(t_energy, pf_idx).values @@ -3305,8 +3330,8 @@ def loss(self, inputs): offset = tf.reduce_mean( s_pfc_mom_corr - s_pfc_t_encorr ) var = tf.math.reduce_std(s_pfc_mom_corr - s_pfc_t_encorr - offset) - self.add_prompt_metric(offset, self.name + '_binned_en_offset_'+namestr) - self.add_prompt_metric(var, self.name + '_binned_en_std_'+namestr) + #self.add_prompt_metric(offset, self.name + '_binned_en_offset_'+namestr) + #self.add_prompt_metric(var, self.name + '_binned_en_std_'+namestr) s_pfc_energy = s_pfc_mom_corr * tf.ragged.boolean_mask(pfc_ensum, sel) s_pfc_t_energy = tf.ragged.boolean_mask(pfc_t_energy, sel) @@ -3314,8 +3339,8 @@ def loss(self, inputs): offset = tf.reduce_mean( (s_pfc_energy - s_pfc_t_energy)/s_pfc_t_energy ) var = tf.math.reduce_std((s_pfc_energy - s_pfc_t_energy)/s_pfc_t_energy - offset) - self.add_prompt_metric(offset, self.name + '_binned_t_en_offset_'+namestr) - self.add_prompt_metric(var, self.name + '_binned_t_en_std_'+namestr) + #self.add_prompt_metric(offset, self.name + '_binned_t_en_offset_'+namestr) + #self.add_prompt_metric(var, self.name + '_binned_t_en_std_'+namestr) allloss = eweight * (closs + tloss + ploss + eloss + euncloss) diff --git a/modules/Regularizers.py b/modules/Regularizers.py index ed0c6c1b..2c5aa1b1 100644 --- a/modules/Regularizers.py +++ b/modules/Regularizers.py @@ -94,8 +94,7 @@ def call(self, inputs): float(avneigh), 'penalty',float(loss)) - #self.add_prompt_metric(avdist, self.name+'_dist') - #self.add_prompt_metric(avneigh, self.name+'_Nneigh') + self.wandb_log({self.name+'_dist': avdist, self.name+'_Nneigh': avneigh}) self.add_loss(loss) return inputs @@ -145,9 +144,9 @@ def half_sided_penalty(x): if self.printout: print(self.name,'meanmax dist loss',float(loss)) - self.add_prompt_metric(meanmax, self.name+'_meanmax') - self.add_prompt_metric(maxmax, self.name+'_maxmax') - self.add_prompt_metric(loss, self.name+'_loss') + self.wandb_log({self.name+'_meanmax' : meanmax, + self.name+'_maxmax': maxmax, + self.name+'_loss': loss}) self.add_loss(loss) return inputs diff --git a/modules/baseModules.py b/modules/baseModules.py index 424467c6..30a08505 100644 --- a/modules/baseModules.py +++ b/modules/baseModules.py @@ -1,56 +1,2 @@ -print(__name__,">>> move to DeepJetCore soon") - -import tensorflow as tf - -#for metric layers -class PromptMetric(tf.keras.metrics.Mean): - def __init__(self, **kwargs): - super(PromptMetric, self).__init__(**kwargs) - - def update_state(self,*args,**kwargs): - self.reset_states()#reset and only take last state - super(PromptMetric, self).update_state(*args,**kwargs) - -class LayerWithMetrics(tf.keras.layers.Layer): - def __init__(self, - record_metrics=False, - 
_promptnames=None, **kwargs): - super(LayerWithMetrics, self).__init__(**kwargs) - self.prompt_metrics = {} - self.record_metrics = record_metrics - if _promptnames is not None: - for n in _promptnames: - if not n in self.prompt_metrics.keys(): - with tf.name_scope(self.name+"/sub/"+n): - self.prompt_metrics[n]=PromptMetric(name=n) - - def get_config(self): - config = {'_promptnames': [m[1].name for m in self.prompt_metrics.items()], - 'record_metrics': self.record_metrics} - base_config = super(LayerWithMetrics, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def add_prompt_metric(self,x,name): - if not self.record_metrics: - return - if not name in self.prompt_metrics.keys(): - with tf.name_scope(self.name+"/sub/"+name): - self.prompt_metrics[name]=PromptMetric(name=name) - self.add_metric(self.prompt_metrics[name](x)) - - -#class LayerWithMetrics(tf.keras.layers.Layer): -# def __init__(self, _promptnames=None, **kwargs): -# super(LayerWithMetrics, self).__init__(**kwargs) -# self.prompt_metrics={} -# -# def register_prompt_metric(self,name): -# if name in self.prompt_metrics.keys(): -# return -# with tf.name_scope(self.name+"/sub/"+name): -# self.prompt_metrics[name]=PromptMetric(name=name) -# -# def add_prompt_metric(self,x,name): -# if not name in self.prompt_metrics.keys(): -# raise ValueError("Metric with name "+str(name)+" not registred. Register in layers or model constructor.") -# self.add_metric(self.prompt_metrics[name](x)) +print(__name__,"deprecated, import directly from DeepJetCore.DJCLayers") +from DeepJetCore.DJCLayers import LayerWithMetrics \ No newline at end of file diff --git a/modules/bin_by_coordinates_op.py b/modules/bin_by_coordinates_op.py index 669e636a..68974375 100644 --- a/modules/bin_by_coordinates_op.py +++ b/modules/bin_by_coordinates_op.py @@ -4,13 +4,6 @@ _bin_by_coordinates = tf.load_op_library('bin_by_coordinates.so') -''' - .Input("coordinates: float") - .Input("row_splits: int32") - .Input("bin_width: float") - .Input("nbins: int32")//same in all dimensions - .Output("output: int32"); -''' def BinByCoordinates(coordinates, row_splits, bin_width=None, n_bins=None, calc_n_per_bin=True, pre_normalized=False, name=""): ''' @@ -39,7 +32,7 @@ def BinByCoordinates(coordinates, row_splits, bin_width=None, n_bins=None, calc_ - bin indices (the above) flattened - number of bins used per dimension (dim = dim(coordinates)) - bin width used (dim = 1) - - (opt) number of points per bin (dim = 1) + - (opt) number of points per bin (dim = 1) - this likely reduces the speed of the OP ''' diff --git a/modules/binned_select_knn_op.py b/modules/binned_select_knn_op.py index 0b6949b4..9fb00bb4 100644 --- a/modules/binned_select_knn_op.py +++ b/modules/binned_select_knn_op.py @@ -4,6 +4,9 @@ import globals as gl from oc_helper_ops import SelectWithDefault +from bin_by_coordinates_op import BinByCoordinates +from index_replacer_op import IndexReplacer + _binned_select_knn = tf.load_op_library('binned_select_knn.so') def _BinnedSelectKnn(K : int, coords, bin_idx, dim_bin_idx, bin_boundaries, n_bins, bin_width , tf_compatible=False, @@ -29,8 +32,8 @@ def _BinnedSelectKnn(K : int, coords, bin_idx, dim_bin_idx, bin_boundaries, n_b use_direction = use_direction ) - -def BinnedSelectKnn(K : int, coords, row_splits, direction = None, n_bins=None, max_bin_dims=3, tf_compatible=False, max_radius=None, name=""): +#@tf.function +def BinnedSelectKnn(K : int, coords, row_splits, direction = None, n_bins=None, max_bin_dims : int =3 , 
tf_compatible=False, max_radius=None, name=""): ''' max_radius is a dummy for now to make it a drop-in replacement @@ -43,21 +46,20 @@ def BinnedSelectKnn(K : int, coords, row_splits, direction = None, n_bins=None, - any other number: can be neighbour and have neighbours ''' - from bin_by_coordinates_op import BinByCoordinates - from index_replacer_op import IndexReplacer # the following number of bins seems a good~ish estimate for good performance # for homogenous point distributions but should be subject to more tests - elems_per_rs = 1 - if row_splits.shape[0] is not None: - elems_per_rs = row_splits[1] - #do checks - tf.assert_equal(row_splits[-1],coords.shape[0]) + elems_per_rs = tf.reduce_max(row_splits) / tf.shape(row_splits)[0] + elems_per_rs = tf.cast(elems_per_rs, dtype='int32')+1 + #if row_splits.shape[0] is not None: + # elems_per_rs = row_splits[1] + # #do checks + # tf.assert_equal(row_splits[-1],coords.shape[0]) - max_bin_dims = min([max_bin_dims, coords.shape[1]]) + max_bin_dims = tf.reduce_min([max_bin_dims, tf.shape(coords)[1]]) if n_bins is None: - n_bins = tf.math.pow(tf.cast(elems_per_rs,dtype='float32')/(K/32),1/max_bin_dims) + n_bins = tf.math.pow(tf.cast(elems_per_rs,dtype='float32')/(K/32), 1./tf.cast(max_bin_dims,dtype='float32' )) n_bins = tf.cast(n_bins,dtype='int32') n_bins = tf.where(n_bins<5,5,n_bins) n_bins = tf.where(n_bins>30,30,n_bins)#just a guess @@ -82,16 +84,19 @@ def BinnedSelectKnn(K : int, coords, row_splits, direction = None, n_bins=None, bin_boundaries = tf.concat([tf.zeros([1],dtype='int32'), nper],axis=0) #row_splits[0:1] # make it row split like bin_boundaries = tf.cumsum(bin_boundaries) + + #sanity check + tf.assert_equal(tf.reduce_max(bin_boundaries), tf.reduce_max(row_splits)) idx,dist = _BinnedSelectKnn(K, scoords, sbinning, sdbinning, bin_boundaries=bin_boundaries, n_bins=nb, bin_width=bin_width, tf_compatible=tf_compatible, direction = direction ) - if row_splits.shape[0] is None: - return idx, dist + #if row_splits.shape[0] is None: + # return idx, dist #sort back idx = IndexReplacer(idx,sorting) - dist = tf.scatter_nd(sorting[...,tf.newaxis], dist, dist.shape) - idx = tf.scatter_nd(sorting[...,tf.newaxis], idx, idx.shape) + dist = tf.scatter_nd(sorting[...,tf.newaxis], dist, tf.shape(dist)) + idx = tf.scatter_nd(sorting[...,tf.newaxis], idx, tf.shape(idx)) dist = tf.where(idx<0, 0., dist)#safety if not gl.knn_ops_use_tf_gradients: diff --git a/modules/callbacks.py b/modules/callbacks.py index 761d91f2..fc68a8f8 100644 --- a/modules/callbacks.py +++ b/modules/callbacks.py @@ -15,7 +15,7 @@ from plotting_tools import publish, shuffle_truth_colors from DebugLayers import _DebugPlotBase from DeepJetCore import TrainData -from DeepJetCore.dataPipeline import TrainDataGenerator +from djcdata.dataPipeline import TrainDataGenerator diff --git a/modules/compiled/bin_by_coordinates_kernel.cu.cc b/modules/compiled/bin_by_coordinates_kernel.cu.cc index 7eb4e117..75339f1b 100644 --- a/modules/compiled/bin_by_coordinates_kernel.cu.cc +++ b/modules/compiled/bin_by_coordinates_kernel.cu.cc @@ -98,7 +98,7 @@ static void calc( idx += rsidx * mul; if(idx>=n_total_bins){ - printf("global index larger than total bins\n");//DEBUG if you see this you're screwed + printf("\nERROR: BinByCoordinatesOpFunctor: global index larger than total bins\n");//DEBUG if you see this you're screwed return; } @@ -161,6 +161,8 @@ struct BinByCoordinatesOpFunctor { //just because access needs n_total_bins, calc_n_per_bin); + cudaDeviceSynchronize(); + } }; diff --git 
a/modules/compiled/binned_select_knn_kernel.cu.cc b/modules/compiled/binned_select_knn_kernel.cu.cc index 9db98826..0276875f 100644 --- a/modules/compiled/binned_select_knn_kernel.cu.cc +++ b/modules/compiled/binned_select_knn_kernel.cu.cc @@ -149,7 +149,7 @@ static void select_knn_kernel( int idx = stepper.step(); if(idx<0){//not valid if(!continue_search && !distance){//this should not happen - printf("stopping search for vtx %d at distance %d\n",i_v,distance); + printf("\nERROR: binned_select_knn.cu: stopping search for vtx %d at distance %d\n",i_v,distance); } break; @@ -158,13 +158,23 @@ static void select_knn_kernel( idx+=gbin_offset; if(idx>=n_bboundaries-1){ - printf("idx %d out of range, gb offset %d, distance %d, sb_flat_offset %d, nbb %d\n", idx, gbin_offset, distance, sb_flat_offset,n_bboundaries); + printf("\nERROR: binned_select_knn.cu: boundary issue: idx %d out of range, gb offset %d, distance %d, sb_flat_offset %d, nbb %d\n", idx, gbin_offset, distance, sb_flat_offset,n_bboundaries); continue; } int start_vertex = d_bin_boundaries[idx]; int end_vertex = d_bin_boundaries[idx+1]; + if(start_vertex == end_vertex){ //empty bin + continue_search=true; //correct? + continue; + } + + if(start_vertex>=n_vert || end_vertex>n_vert){ + printf("\nERROR: binned_select_knn.cu: start_vertex %d or end_vertex %d out of range %d \n", start_vertex, end_vertex, n_vert); + continue;//safe guard + } + for(size_t j_v=start_vertex;j_v { //just because access needs select_knn_kernel<5><<>>(d_coord, d_bin_idx,d_direction,d_dim_bin_idx,d_bin_boundaries,d_n_bins,d_bin_width, d_indices,d_dist,n_vert,n_neigh,n_coords,n_bin_dim,n_bboundaries,use_direction); + cudaDeviceSynchronize(); } }; diff --git a/modules/compiled/compare_knn_outputs_kernel.cu.cc b/modules/compiled/compare_knn_outputs_kernel.cu.cc index cc71489e..16b1d28e 100644 --- a/modules/compiled/compare_knn_outputs_kernel.cu.cc +++ b/modules/compiled/compare_knn_outputs_kernel.cu.cc @@ -52,6 +52,8 @@ struct CompareKnnOpFunctor{ int thread_per_block = 20; gpu::CompareKnnOpCudaKernel<<>>(nvertices,nneighbours,input1,input2,output); + + cudaDeviceSynchronize(); } }; diff --git a/modules/compiled/compare_knn_outputs_module.cc b/modules/compiled/compare_knn_outputs_module.cc index de4c2cc4..0f07d504 100644 --- a/modules/compiled/compare_knn_outputs_module.cc +++ b/modules/compiled/compare_knn_outputs_module.cc @@ -9,8 +9,4 @@ REGISTER_OP("CompareKnnOutputs") .Input("in_tensor1: int32") .Input("in_tensor2: int32") .Output("out_tensor: int32") - .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - c->set_output(0, c->input(0)); // requires that output with idx 0 - return Status::OK(); // should have the same shape as - }) // input with idx 0. 
; diff --git a/modules/compiled/index_replacer_kernel.cc b/modules/compiled/index_replacer_kernel.cc index 82bcd2fc..027e5e21 100644 --- a/modules/compiled/index_replacer_kernel.cc +++ b/modules/compiled/index_replacer_kernel.cc @@ -14,13 +14,13 @@ typedef Eigen::GpuDevice GPUDevice; namespace functor { -template -struct IndexReplacerOpFunctor { //just because access needs to be different for GPU and CPU +template +struct IndexReplacerOpFunctor { //just because access needs to be different for GPU and CPU void operator()( const CPUDevice &d, - const int * to_be_replaced, - const int * replacements, - int * replaced, + const dtype * to_be_replaced, + const dtype * replacements, + dtype * replaced, const int n_to_be_replaced, const int n_replacements @@ -63,12 +63,12 @@ class IndexReplacerOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, t_to_be_replaced.shape(), &out));//same shape - IndexReplacerOpFunctor() ( + IndexReplacerOpFunctor() ( context->eigen_device(), - t_to_be_replaced.flat().data(), - t_replacements.flat().data(), - out->flat().data(), + t_to_be_replaced.flat().data(), + t_replacements.flat().data(), + out->flat().data(), n_to_be_replaced, n_replacements ); @@ -82,7 +82,7 @@ class IndexReplacerOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("IndexReplacer").Device(DEVICE_CPU), IndexReplacerOp); #ifdef GOOGLE_CUDA -extern template struct IndexReplacerOpFunctor; +extern template struct IndexReplacerOpFunctor; REGISTER_KERNEL_BUILDER(Name("IndexReplacer").Device(DEVICE_GPU), IndexReplacerOp); #endif diff --git a/modules/compiled/index_replacer_kernel.cu.cc b/modules/compiled/index_replacer_kernel.cu.cc index 4ebc7593..5ae1afdb 100644 --- a/modules/compiled/index_replacer_kernel.cu.cc +++ b/modules/compiled/index_replacer_kernel.cu.cc @@ -17,9 +17,9 @@ typedef Eigen::GpuDevice GPUDevice; __global__ static void calc( - const int * to_be_replaced, - const int * replacements, - int * replaced, + const int32 * to_be_replaced, + const int32 * replacements, + int32 * replaced, const int n_to_be_replaced, const int n_replacements){ @@ -34,7 +34,8 @@ static void calc( return; } if(ridx>=n_replacements){ - printf("IndexReplacerOpFunctor: index out of range\n"); + replaced[i] = to_be_replaced[i]; //security measure but already screwed here + printf("IndexReplacerOpFunctor: Fatal error: index out of range %d of %d at %d of %d\n", ridx, n_replacements, i, n_to_be_replaced); return; } replaced[i] = replacements[ridx]; @@ -43,26 +44,28 @@ static void calc( -template -struct IndexReplacerOpFunctor { //just because access needs to be different for GPU and CPU +template +struct IndexReplacerOpFunctor { //just because access needs to be different for GPU and CPU void operator()( const GPUDevice &d, - const int * to_be_replaced, - const int * replacements, - int * replaced, + const dtype * to_be_replaced, + const dtype * replacements, + dtype * replaced, - const int n_to_be_replaced, - const int n_replacements + const dtype n_to_be_replaced, + const dtype n_replacements ){ - grid_and_block gb(n_to_be_replaced,1024); + grid_and_block gb(n_to_be_replaced,512); calc<<>>(to_be_replaced,replacements,replaced,n_to_be_replaced,n_replacements); + + cudaDeviceSynchronize(); } }; -template struct IndexReplacerOpFunctor; +template struct IndexReplacerOpFunctor; }//functor }//tensorflow diff --git a/modules/compiled/oc_helper_m_indices_kernel.cu.cc b/modules/compiled/oc_helper_m_indices_kernel.cu.cc index 5f0d1dec..eb303f6e 100644 --- 
a/modules/compiled/oc_helper_m_indices_kernel.cu.cc +++ b/modules/compiled/oc_helper_m_indices_kernel.cu.cc @@ -113,6 +113,8 @@ struct MIndicesOpFunctor { calc_m_not ); + cudaDeviceSynchronize(); + } }; diff --git a/modules/compiled/rs_offset_adder_kernel.cc b/modules/compiled/rs_offset_adder_kernel.cc new file mode 100644 index 00000000..5cb67add --- /dev/null +++ b/modules/compiled/rs_offset_adder_kernel.cc @@ -0,0 +1,98 @@ + + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/cc/ops/math_ops.h" +#include "helpers.h" +#include "rs_offset_adder_kernel.h" + +namespace tensorflow { +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +namespace functor { + + +template +struct RSOffsetAdderOpFunctor { //just because access needs to be different for GPU and CPU + void operator()( + const CPUDevice &d, + const int * t_dx, + const int * rs, + int * new_t_idx, + + const int n_vert, + const int n_rs + ){ + for(int i = 0; i < n_vert; i++){ + + //get the offset + int offset = 0; + for(int j = 0; j < n_rs; j++){ + if(i >= rs[j]){ + offset = rs[j]; + } + else{ + break; + } + } + if(t_dx[i] >= 0) + new_t_idx[i] = t_dx[i] + offset; + else + new_t_idx[i] = t_dx[i]; + } + } +}; + + + +template +class RSOffsetAdderOp : public OpKernel { +public: + explicit RSOffsetAdderOp(OpKernelConstruction *context) : OpKernel(context) { + + } + + + void Compute(OpKernelContext *context) override { + + const Tensor &t_t_idx = context->input(0); + const Tensor &t_rs = context->input(1); + + const int n_vert = t_t_idx.dim_size(0); + const int n_rs = t_rs.dim_size(0); + + Tensor *out = NULL; + OP_REQUIRES_OK(context, context->allocate_output(0, + t_t_idx.shape(), &out)); + + + + RSOffsetAdderOpFunctor() ( + context->eigen_device(), + + t_t_idx.flat().data(), + t_rs.flat().data(), + out->flat().data(), + n_vert, + n_rs + ); + + + + } + +}; + +REGISTER_KERNEL_BUILDER(Name("RSOffsetAdder").Device(DEVICE_CPU), RSOffsetAdderOp); + +#ifdef GOOGLE_CUDA +extern template struct RSOffsetAdderOpFunctor; +REGISTER_KERNEL_BUILDER(Name("RSOffsetAdder").Device(DEVICE_GPU), RSOffsetAdderOp); +#endif + +}//functor +}//tensorflow diff --git a/modules/compiled/rs_offset_adder_kernel.cu.cc b/modules/compiled/rs_offset_adder_kernel.cu.cc new file mode 100644 index 00000000..8bbc73a4 --- /dev/null +++ b/modules/compiled/rs_offset_adder_kernel.cu.cc @@ -0,0 +1,80 @@ + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU + +#include +#include +#include +#include "cuda_helpers.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +#include "rs_offset_adder_kernel.h" + +namespace tensorflow { +namespace functor { + +typedef Eigen::GpuDevice GPUDevice; + +__global__ +static void calc( + const int * t_dx, + const int * rs, + int * new_t_idx, + + const int n_vert, + const int n_rs){ + + int i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= n_vert) + return; + int offset = 0; + for (int j = 0; j < n_rs; j++){ + if (i >= rs[j]){ + offset = rs[j]; + } + else{ + break; + } + } + int current = t_dx[i]; + if (t_dx[i] >= 0) + new_t_idx[i] = current + offset; + else + new_t_idx[i] = current; +} + + + +template +struct RSOffsetAdderOpFunctor { //just because access needs to be different for GPU and CPU + void operator()( + const GPUDevice &d, + const int * t_dx, + const int * rs, + int * new_t_idx, + + const int n_vert, + const int n_rs + ){ + + + grid_and_block gb(n_vert,512); + + calc<<>>( + t_dx, + rs, + new_t_idx, + n_vert, + n_rs + ); + 
cudaDeviceSynchronize(); + } +}; + +template struct RSOffsetAdderOpFunctor; + +}//functor +}//tensorflow + +#endif //GOOGLE_CUDA + diff --git a/modules/compiled/rs_offset_adder_kernel.h b/modules/compiled/rs_offset_adder_kernel.h new file mode 100644 index 00000000..cc828e00 --- /dev/null +++ b/modules/compiled/rs_offset_adder_kernel.h @@ -0,0 +1,26 @@ + +#ifndef RS_OFFSET_ADDER_KERNEL_H +#define RS_OFFSET_ADDER_KERNEL_H + +namespace tensorflow { +namespace functor { + +template +struct RSOffsetAdderOpFunctor { + void operator()( + const Device &d, + const int * t_dx, + const int * rs, + int * new_t_idx, + + const int n_vert, + const int n_rs + ); + +}; + +} // namespace functor +} // namespace tensorflow + + +#endif //RS_OFFSET_ADDER_KERNEL_H diff --git a/modules/compiled/rs_offset_adder_module.cc b/modules/compiled/rs_offset_adder_module.cc new file mode 100644 index 00000000..4d362db1 --- /dev/null +++ b/modules/compiled/rs_offset_adder_module.cc @@ -0,0 +1,10 @@ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +using namespace tensorflow; + +REGISTER_OP("RSOffsetAdder") + .Input("idx: int32") + .Input("row_splits: int32") + .Output("output: int32"); diff --git a/modules/compiled/select_mod_knn_grad_kernel.cc b/modules/compiled/select_mod_knn_grad_kernel.cc index e6b4378c..99c24898 100644 --- a/modules/compiled/select_mod_knn_grad_kernel.cc +++ b/modules/compiled/select_mod_knn_grad_kernel.cc @@ -47,7 +47,6 @@ class SelectModKnnGradOp : public OpKernel { public: explicit SelectModKnnGradOp(OpKernelConstruction *context) : OpKernel(context) { - } void Compute(OpKernelContext *context) override { @@ -64,13 +63,12 @@ class SelectModKnnGradOp : public OpKernel { int n_coords = t_coord.dim_size(1); int n_neigh = t_distances.dim_size(1); - auto coorddimsok = t_coord_mod.dims() == 3 - && t_coord_mod.dim_size(0) == n_vert - && t_coord_mod.dim_size(1) == n_coords - && t_coord_mod.dim_size(2) == n_coords - ? tensorflow::Status(): tensorflow::Status(tensorflow::error::INVALID_ARGUMENT, - "Coordinate modifier tensor needs to have 3 dimensions (V x C x C)"); - OP_REQUIRES_OK(context,coorddimsok); + //auto coorddimsok = t_coord_mod.dims() == 3 + // && t_coord_mod.dim_size(0) == n_vert + // && t_coord_mod.dim_size(1) == n_coords + // && t_coord_mod.dim_size(2) == n_coords + // ? OkStatus() : errors::FailedPrecondition("Coordinate modifier tensor needs to have 3 dimensions (V x C x C)"); + //OP_REQUIRES_OK(context,coorddimsok); TensorShape outputShape; diff --git a/modules/compiled/select_mod_knn_grad_kernel.cu.cc b/modules/compiled/select_mod_knn_grad_kernel.cu.cc index d9b44419..a447f57b 100644 --- a/modules/compiled/select_mod_knn_grad_kernel.cu.cc +++ b/modules/compiled/select_mod_knn_grad_kernel.cu.cc @@ -247,7 +247,7 @@ struct SelectModKnnGradOpFunctor { n_neigh, n_coords); - + cudaDeviceSynchronize(); } }; diff --git a/modules/compiled/select_mod_knn_kernel.cc b/modules/compiled/select_mod_knn_kernel.cc index 447ccf07..20a578c0 100644 --- a/modules/compiled/select_mod_knn_kernel.cc +++ b/modules/compiled/select_mod_knn_kernel.cc @@ -282,13 +282,12 @@ class SelectModKnnOp : public OpKernel { int n_coords = d_coord_tensor.dim_size(1); int n_rs = d_rs_tensor.dim_size(0); - auto coorddimsok = d_coord_mod_tensor.dims() == 3 - && d_coord_mod_tensor.dim_size(0) == n_vert - && d_coord_mod_tensor.dim_size(1) == n_coords - && d_coord_mod_tensor.dim_size(2) == n_coords - ? 
tensorflow::Status(): tensorflow::Status(tensorflow::error::INVALID_ARGUMENT, - "Coordinate modifier tensor needs to have 3 dimensions (V x C x C)"); - OP_REQUIRES_OK(context,coorddimsok); + //auto coorddimsok = d_coord_mod_tensor.dims() == 3 + // && d_coord_mod_tensor.dim_size(0) == n_vert + // && d_coord_mod_tensor.dim_size(1) == n_coords + // && d_coord_mod_tensor.dim_size(2) == n_coords + // ? OkStatus() : errors::FailedPrecondition("Coordinate modifier tensor needs to have 3 dimensions (V x C x C)"); + //OP_REQUIRES_OK(context,coorddimsok); TensorShape outputShape; outputShape.AddDim(n_vert); diff --git a/modules/compiled/select_threshold_kernel.cu.cc b/modules/compiled/select_threshold_kernel.cu.cc index 077aca01..3643501e 100644 --- a/modules/compiled/select_threshold_kernel.cu.cc +++ b/modules/compiled/select_threshold_kernel.cu.cc @@ -197,7 +197,8 @@ struct CopyOutputSelectThresholdOpFunctor { grid_and_block gb(n_scatter_idxs, 512); copy_all_kernel<<>>(d_scatter_idxs,d_tmp_scatter_idxs,n_scatter_idxs); - + + cudaDeviceSynchronize(); } }; diff --git a/modules/compiled/tests/test_binned_select_knn.py b/modules/compiled/tests/test_binned_select_knn.py index e8cbc7ff..46e49a85 100644 --- a/modules/compiled/tests/test_binned_select_knn.py +++ b/modules/compiled/tests/test_binned_select_knn.py @@ -38,11 +38,17 @@ def createData(nvert,ncoords): nbins = None idx, dist = BinnedSelectKnn(test_neighbours, coords, row_splits, n_bins=nbins) + + + start = time.time() for _ in range(5): idx, dist = BinnedSelectKnn(test_neighbours, coords, row_splits, n_bins=nbins) end = time.time() print('took',(end-start)/5.,'s','for',nvert,'points on',nbins,'and nn',test_neighbours ) + + #exit() + continue idx, dist = SlicingKnn(test_neighbours, coords, row_splits, features_to_bin_on=[0,1], bin_width=(0.02,0.02)) start = time.time() @@ -51,6 +57,7 @@ def createData(nvert,ncoords): end = time.time() print('took',(end-start)/5.,'s','for',nvert,'and nn',test_neighbours,'points on SlicingKnn') + #exit() continue idx, dist = SelectKnn(test_neighbours, coords, row_splits) start = time.time() diff --git a/modules/datastructures/TrainData_NanoML.py b/modules/datastructures/TrainData_NanoML.py index 229d2c29..ad68a7fc 100644 --- a/modules/datastructures/TrainData_NanoML.py +++ b/modules/datastructures/TrainData_NanoML.py @@ -1,4 +1,5 @@ -from DeepJetCore.TrainData import TrainData, fileTimeOut +from DeepJetCore import TrainData +from djcdata.TrainData import fileTimeOut from DeepJetCore import SimpleArray import uproot3 as uproot import awkward as ak1 diff --git a/modules/datastructures/TrainData_PrepoolingNanoML.py b/modules/datastructures/TrainData_PrepoolingNanoML.py index 82173be2..b6a2a031 100644 --- a/modules/datastructures/TrainData_PrepoolingNanoML.py +++ b/modules/datastructures/TrainData_PrepoolingNanoML.py @@ -1,4 +1,4 @@ -from DeepJetCore.TrainData import TrainData +from DeepJetCore import TrainData from DeepJetCore import SimpleArray import pickle import numpy as np @@ -7,7 +7,7 @@ import gzip from datastructures.TrainData_NanoML import TrainData_NanoML -from DeepJetCore.dataPipeline import TrainDataGenerator +from djcdata.dataPipeline import TrainDataGenerator from DebugLayers import switch_off_debug_plots diff --git a/modules/datastructures/TrainData_PreselectionNanoML.py b/modules/datastructures/TrainData_PreselectionNanoML.py index bd508374..5369ac87 100644 --- a/modules/datastructures/TrainData_PreselectionNanoML.py +++ b/modules/datastructures/TrainData_PreselectionNanoML.py @@ -1,4 +1,4 @@ -from 
DeepJetCore.TrainData import TrainData +from DeepJetCore import TrainData from DeepJetCore import SimpleArray import pickle import numpy as np @@ -7,7 +7,7 @@ import gzip from datastructures.TrainData_NanoML import TrainData_NanoML -from DeepJetCore.dataPipeline import TrainDataGenerator +from djcdata.dataPipeline import TrainDataGenerator from DebugLayers import switch_off_debug_plots diff --git a/modules/datastructures/TrainData_TrackML.py b/modules/datastructures/TrainData_TrackML.py index 0eb52ccd..4792de28 100644 --- a/modules/datastructures/TrainData_TrackML.py +++ b/modules/datastructures/TrainData_TrackML.py @@ -1,4 +1,4 @@ -from DeepJetCore.TrainData import TrainData, fileTimeOut +from DeepJetCore import TrainData from DeepJetCore import SimpleArray import awkward0 as ak import pickle diff --git a/modules/datastructures/TrainData_crilin.py b/modules/datastructures/TrainData_crilin.py index d212099b..75e1e7b1 100644 --- a/modules/datastructures/TrainData_crilin.py +++ b/modules/datastructures/TrainData_crilin.py @@ -2,7 +2,7 @@ -from DeepJetCore.TrainData import TrainData, fileTimeOut +from DeepJetCore import TrainData from DeepJetCore import SimpleArray import numpy as np import uproot3 as uproot @@ -34,7 +34,7 @@ def branchToFlatArray(self, b, return_row_splits=False, dtype='float32'): def convertFromSourceFile(self, filename, weighterobjects, istraining, treename="converted_photons"): - fileTimeOut(filename, 10)#wait 10 seconds for file in case there are hiccups + #fileTimeOut(filename, 10)#wait 10 seconds for file in case there are hiccups tree = uproot.open(filename)[treename] ''' diff --git a/modules/datastructures/TrainData_fcc.py b/modules/datastructures/TrainData_fcc.py index 9cf0fdb0..9235ab15 100644 --- a/modules/datastructures/TrainData_fcc.py +++ b/modules/datastructures/TrainData_fcc.py @@ -1,8 +1,9 @@ -from DeepJetCore.TrainData import TrainData, fileTimeOut -from DeepJetCore import SimpleArray +from djcdata import TrainData +from djcdata.TrainData import fileTimeOut +from djcdata import SimpleArray import numpy as np import uproot3 as uproot import awkward as ak1 diff --git a/modules/datastructures/TrainData_ild.py b/modules/datastructures/TrainData_ild.py index 0480dca0..ff6981b9 100644 --- a/modules/datastructures/TrainData_ild.py +++ b/modules/datastructures/TrainData_ild.py @@ -2,7 +2,7 @@ -from DeepJetCore.TrainData import TrainData, fileTimeOut +from DeepJetCore import TrainData from DeepJetCore import SimpleArray import numpy as np import uproot3 as uproot @@ -86,7 +86,7 @@ def branchToFlatArray(self, b, returnRowSplits=False, selectmask=None, is3d=None def fileIsValid(self, filename): import ROOT try: - fileTimeOut(filename, 2) + #fileTimeOut(filename, 2) tree = uproot.open(filename)["SLCIOConverted"] f=ROOT.TFile.Open(filename) t=f.Get("SLCIOConverted") @@ -101,7 +101,7 @@ def fileIsValid(self, filename): def convertFromSourceFile(self, filename, weighterobjects, istraining, treename="SLCIOConverted"): - fileTimeOut(filename, 10)#10 seconds for eos to recover + #fileTimeOut(filename, 10)#10 seconds for eos to recover tree = uproot.open(filename)[treename] nevents = tree.numentries diff --git a/modules/object_condensation.py b/modules/object_condensation.py index 8c02e421..016f4b51 100644 --- a/modules/object_condensation.py +++ b/modules/object_condensation.py @@ -72,6 +72,8 @@ def __init__(self, spect_supp=None, #None means same as noise global_weight=False ): + + self.rep_range = -1. 
self.q_min = q_min self.s_b = s_b @@ -161,6 +163,7 @@ def set_input(self, def att_func(self,dsq_k_m): return tf.math.log(tf.math.exp(1.)*dsq_k_m/2. + 1.) + #@tf.function def V_att_k(self): ''' ''' @@ -197,33 +200,37 @@ def calc_dsq_rep(self): dsq = tf.reduce_sum(dsq**2, axis=-1, keepdims=True) #K x V x 1 return dsq + #@tf.function def V_rep_k(self): K = tf.reduce_sum(tf.ones_like(self.q_k)) N_notk = tf.reduce_sum(self.Mnot, axis=1) - #future remark: if this gets too large, one could use a kNN here + #future remark: if this gets too large, one could use a kNN/radiusNN here dsq = self.calc_dsq_rep() # nogradbeta = tf.stop_gradient(self.beta_k_m) #weight. tf.reduce_sum( tf.exp(-dsq) * d_v_e, , axis=1) / tf.reduce_sum( tf.exp(-dsq) ) sigma = self.weighted_d_k_m(dsq) #create gradients for all, but prefer k vertex - dsq = tf.math.divide_no_nan(dsq, sigma + 1e-4) #K x V x 1 V_rep = self.rep_func(dsq) * self.Mnot * tf.expand_dims(self.q_v,axis=0) #K x V x 1 + if self.rep_range > 0: + N_notk = tf.reduce_sum(tf.where(dsq < self.rep_range**2 , 1., tf.zeros_like(dsq)) * self.Mnot, axis=1) V_rep = self.q_k * tf.reduce_sum(V_rep, axis=1) #K x 1 if self.global_weight: N_full = tf.reduce_sum(tf.ones_like(self.beta_v)) V_rep = K * tf.math.divide_no_nan(V_rep, N_full+1e-3) #K x 1 - else: + elif True: #TEST DEBUG REMOVE AGAIN V_rep = tf.math.divide_no_nan(V_rep, N_notk+1e-3) #K x 1 + else: + V_rep/=100. return V_rep - + #@tf.function def Pll_k(self): tanhsqbeta = self.beta_v**2 #softer here @@ -241,6 +248,7 @@ def Pll_k(self): pw_k_sum )#K x P return pll_k + #@tf.function def Beta_pen_k(self): #use continuous max approximation through LSE eps = 1e-3 @@ -249,7 +257,8 @@ def Beta_pen_k(self): beta_pen += 1. - tf.clip_by_value(tf.reduce_sum(self.beta_k_m, axis=1), 0., 1) beta_pen = tf.debugging.check_numerics(beta_pen, "OC: beta pen") return beta_pen - + + #@tf.function def Noise_pen(self): nsupp_v = self.beta_v * self.isn_v @@ -272,7 +281,7 @@ def pll_weight_k(self, ow_k, vatt_k, vrep_k): return ow_k - + #@tf.function def add_to_terms(self, V_att, V_rep, @@ -306,10 +315,13 @@ def add_to_terms(self, return V_att, V_rep, Noise_pen, B_pen, pll, high_B_pen - + #@tf.function(reduce_retracing=True) def calc_metrics(self, energies): - return self._calc_containment(energies), self._calc_contamination(energies) + d,rel_metrics_radius = self._calc_containment(energies) + d.update(self._calc_contamination(energies, rel_metrics_radius)) + return d # metrics functions that can be called at the end, first calc containment, then contamination + #@tf.function def _calc_containment(self, energies): ''' energies as V x 1 @@ -321,34 +333,44 @@ def _calc_containment(self, energies): d_x_k = tf.reduce_sum(d_x_k, axis=2) + 10000. * tf.eye(d_x_k.shape[0],d_x_k.shape[1]) # K x K , add large identity d_m_x_k = tf.reduce_min(d_x_k, axis=1, keepdims=True)# K x 1 d_m_x_k = tf.sqrt(d_m_x_k)/self.d_k - self.rel_metrics_radius = tf.reduce_mean(d_m_x_k) / 2. # () + rel_metrics_radius = tf.reduce_mean(d_m_x_k) / 2. # () ##now metrics dxk = tf.reduce_sum( (tf.expand_dims(x_k_alpha, axis=1) - self.x_k_m)**2 , axis= -1) #K x V' - in_radius = self.rel_metrics_radius > tf.sqrt(dxk)/self.d_k #K x V' + in_radius = rel_metrics_radius > tf.sqrt(dxk)/self.d_k #K x V' in_radius = in_radius[...,tf.newaxis] energies_k_m = SelectWithDefault(self.Msel, energies, 0.) 
#K x V' x 1 - self.energies_k = tf.reduce_sum(energies_k_m, axis=1) #K x 1 + energies_k = tf.reduce_sum(energies_k_m, axis=1) #K x 1 in_radius_energy = tf.reduce_sum(tf.where( in_radius, energies_k_m, 0. ), axis=1) # K x 1 - in_radius_energy /= self.energies_k - return tf.reduce_mean(in_radius_energy) + in_radius_energy /= energies_k + + beta_contain = tf.reduce_sum(tf.where( in_radius, self.beta_k_m, 0. ), axis=1) # K x 1 + beta_contain /= tf.reduce_sum(self.beta_k_m , axis=1) + + return {'containment': tf.reduce_mean(in_radius_energy), + 'beta_containment': tf.reduce_mean(beta_contain)}, rel_metrics_radius - def _calc_contamination(self, energies): + #@tf.function + def _calc_contamination(self, energies, rel_metrics_radius): x_k_alpha = tf.gather_nd(self.x_k_m,self.alpha_k, batch_dims=1) dsq = tf.expand_dims(x_k_alpha, axis=1) - tf.expand_dims(self.x_v, axis=0) #K x V x C dsq = tf.reduce_sum(dsq**2, axis=-1) #K x V - in_radius = self.rel_metrics_radius > tf.sqrt(dsq) / self.d_k# K x V + in_radius = rel_metrics_radius > tf.sqrt(dsq) / self.d_k# K x V energies_k_v = tf.expand_dims(energies, axis=0) # K x V x 1 energies_ir_all_k_v = tf.where(in_radius[...,tf.newaxis], energies_k_v , 0.) energies_ir_not_k_v = tf.where(in_radius[...,tf.newaxis], self.Mnot * energies_k_v , 0.) rel_cont = tf.reduce_sum(energies_ir_not_k_v, axis=1)/tf.reduce_sum(energies_ir_all_k_v, axis=1) - return tf.reduce_mean(rel_cont) + + beta_cont = tf.reduce_sum(tf.where(in_radius[...,tf.newaxis], self.Mnot * self.beta_v[tf.newaxis,...], 0.), axis=1) + beta_cont /= tf.reduce_sum(tf.where(in_radius[...,tf.newaxis], self.beta_v[tf.newaxis,...], 0.), axis=1) + + return {'contamination': tf.reduce_mean(rel_cont), 'beta_contamination': tf.reduce_mean(beta_cont)} class Hinge_OC_per_sample_damped(Basic_OC_per_sample): @@ -378,6 +400,22 @@ class Hinge_OC_per_sample(Hinge_OC_per_sample_damped): def __init__(self, **kwargs): super(Hinge_OC_per_sample, self).__init__(**kwargs) self.condensation_damping = 0.0 # Don't stop any gradients + #self.rep_range = 2. + + + +class Dead_Zone_Hinge_OC_per_sample(Hinge_OC_per_sample): + ''' + This is the classic repulsive hinge loss plus a dead zone + ''' + def __init__(self, **kwargs): + super(Dead_Zone_Hinge_OC_per_sample, self).__init__(**kwargs) + self.condensation_damping = 0.0 # Don't stop any gradients + self.rep_range = 2. + + def att_func(self,dsq_k_m): + return 1. - tf.math.exp(-10. 
* dsq_k_m) + class Hinge_OC_per_sample_learnable_qmin(Hinge_OC_per_sample): @@ -620,7 +658,7 @@ def __init__(self, ): self.loss_impl=loss_impl(**kwargs) - + def __call__(self, beta, x, d, @@ -638,8 +676,7 @@ def __call__(self, beta, if rs.shape[0] is None or rs.shape[0] < 2: return tot_V_att, tot_V_rep, tot_Noise_pen, tot_B_pen, tot_pll,tot_too_much_B_pen batch_size = rs.shape[0] - 1 - - contai,contam = tf.constant(0,dtype='float32'),tf.constant(0,dtype='float32') + mdict = {} for b in tf.range(batch_size): @@ -656,16 +693,14 @@ def __call__(self, beta, tot_V_att, tot_V_rep, tot_Noise_pen, tot_B_pen, tot_pll,tot_too_much_B_pen = self.loss_impl.add_to_terms( tot_V_att, tot_V_rep, tot_Noise_pen, tot_B_pen, tot_pll,tot_too_much_B_pen ) - if energies is not None: #just last batch is fine - ca,cm = self.loss_impl.calc_metrics(energies[rs[b]:rs[b + 1]]) - contai += ca / float(batch_size) - contam += cm / float(batch_size) + + #just last row split is good enough for metric + if energies is not None and b == batch_size-1: + mdict = self.loss_impl.calc_metrics(energies[rs[b]:rs[b + 1]]) bs = tf.cast(batch_size, dtype='float32') + 1e-3 out = [a/bs for a in [tot_V_att, tot_V_rep, tot_Noise_pen, tot_B_pen, tot_pll,tot_too_much_B_pen]] - if energies is not None: - return out + [contai, contam] - return out, None, None + return out, mdict ################################################# diff --git a/modules/rs_offset_adder_op.py b/modules/rs_offset_adder_op.py new file mode 100644 index 00000000..2e6abe10 --- /dev/null +++ b/modules/rs_offset_adder_op.py @@ -0,0 +1,52 @@ + +import tensorflow as tf +from tensorflow.python.framework import ops + +_index_replacer = tf.load_op_library('rs_offset_adder.so') + +def RSOffsetAdder(idx, row_splits): + ''' + Adds offsets to indices in idx, and returns the result. + The offsets equal the row split offsets. Indices < 0 are ignored. + ''' + # checks if idx is dim None x 1, and rs is None, both are TF tensors + if ((idx.shape.ndims == 2 and idx.shape[1] == 1) or idx.shape.ndims < 2) and idx.shape.ndims == 1: + return _index_replacer.RSOffsetAdder(idx=idx, row_splits=row_splits) + else: + raise ValueError('idx must be a 1D tensor or a 2D tensor with shape None x 1') + +@ops.RegisterGradient("RSOffsetAdder") +def _RSOffsetAdderGrad(op, gradin): + return None,None + + + +''' +A small unit test in the same file should become standard practice for all custom ops. 
+'''
+def test(print_debug = False):
+    # a quick test in eager mode
+    import numpy as np
+    import tensorflow as tf
+    idx = tf.constant([0,-1,1,-1,4,5,-1,2,1,100], dtype=tf.int32)
+    rs = tf.constant([0,3,6,10], dtype=tf.int32)
+    idx = RSOffsetAdder(idx, rs)
+
+    # expected output: [ 0 -1 1 -1 7 8 -1 8 7 106]
+    if print_debug:
+        print(idx.numpy())
+
+    multiplier = 10000
+    #more rigorous test implementing the same in numpy with a loop over rowsplits and 10*multiplier random indices
+    idx = np.random.randint(-1, 100, 10*multiplier, dtype=np.int32)
+    rs = multiplier * np.array([0,3,6,10], dtype=np.int32)
+    kernel_idx = RSOffsetAdder(tf.constant(idx), tf.constant(rs))
+    np_idx = idx.copy()
+    for i in range(1, len(rs)+1):
+        if i == len(rs):
+            np_idx[rs[i-1]:] = np.where( np_idx[rs[i-1]:] >=0 , np_idx[rs[i-1]:] + rs[i-1], np_idx[rs[i-1]:])
+        else:
+            np_idx[rs[i-1]:rs[i]] = np.where( np_idx[rs[i-1]:rs[i]] >=0 , np_idx[rs[i-1]:rs[i]] + rs[i-1], np_idx[rs[i-1]:rs[i]])
+    return np.all(np_idx == kernel_idx.numpy())
+
+#print(test())
diff --git a/modules/select_knn_op.py b/modules/select_knn_op.py
index 39371a3e..7a5a517f 100644
--- a/modules/select_knn_op.py
+++ b/modules/select_knn_op.py
@@ -11,7 +11,7 @@ _sknn_op = tf.load_op_library('select_knn.so')
 
 def SelectKnn(K : int, coords, row_splits, masking_values=None, threshold=0.5, tf_compatible=False, max_radius=-1.,
-              mask_mode='none', mask_logic='xor'):
+              mask_mode='none', mask_logic='xor',n_bins=None):
     '''
     returns indices and distances**2 , gradient for distances is implemented!
diff --git a/modules/training_base_hgcal.py b/modules/training_base_hgcal.py
index 215e7132..06660a45 100644
--- a/modules/training_base_hgcal.py
+++ b/modules/training_base_hgcal.py
@@ -1,87 +1,396 @@
-from DeepJetCore.training.training_base import training_base
+#from DeepJetCore.training.training_base import training_base
+
+
+import concurrent.futures
+import numpy as np
+
+## to call it from the command line
+import sys
+import os
 from argparse import ArgumentParser
+import shutil
+from DeepJetCore import DataCollection
+import tensorflow.keras as keras
 import tensorflow as tf
+import copy
+from DeepJetCore.training.gpuTools import DJCSetGPUs
+from DeepJetCore.training.training_base import training_base as training_base_djc
+from DeepJetCore.modeltools import load_model
+import time
+from DebugLayers import switch_off_debug_plots
+from DeepJetCore.DJCLayers import LayerWithMetrics
+from tqdm import tqdm
 
-class HGCalTraining(training_base):
-    def __init__(self, *args,
-                 parser = None,
-                 **kwargs):
-        '''
-        Adds file logging
-        '''
-        #use the DJC training base option to pass a parser
-        if parser is None:
-            parser = ArgumentParser('Run the training')
-        parser.add_argument("--interactive", help="prints output to screen", default=False, action="store_true")
+
+#for multi-gpu we need to overwrite a few things here
+
+###
+#
+# this will become a cleaned-up version of DJC training_base at some point
+#
+###
+class training_base(object):
+
+    def __init__(
+            self, splittrainandtest=0.85,
+            useweights=False, testrun=False,
+            testrun_fraction=0.1,
+            resumeSilently=False,
+            renewtokens=False, #compat
+            collection_class=DataCollection,
+            parser=None,
+            recreate_silently=False
+    ):
+
+        scriptname=sys.argv[0]
+
+        if parser is None: parser = ArgumentParser('Run the training')
+        parser.add_argument('inputDataCollection')
+        parser.add_argument('outputDir')
+        #parser.add_argument('--modelMethod', help='Method to be used to instantiate model in derived training class', metavar='OPT', default=None)
+        
parser.add_argument("--gpu", help="select specific GPU", metavar="OPT", default="") + #parser.add_argument("--gpufraction", help="select memory fraction for GPU", type=float, metavar="OPT", default=-1) + #parser.add_argument("--submitbatch", help="submits the job to condor" , default=False, action="store_true") + #parser.add_argument("--walltime", help="sets the wall time for the batch job, format: 1d5h or 2d or 3h etc" , default='1d') + #parser.add_argument("--isbatchrun", help="is batch run", default=False, action="store_true") + parser.add_argument("--valdata", help="set validation dataset (optional)", default="") + parser.add_argument("--takeweights", help="Applies weights from the model given as relative or absolute path. Matches by names and skips layers that don't match.", default="") + + + args = parser.parse_args() + self.args = args + self.argstring = sys.argv + #sanity check + + + import matplotlib + #if no X11 use below + matplotlib.use('Agg') + DJCSetGPUs(args.gpu) - #no reason for a lot of validation samples usually - super().__init__(*args, resumeSilently=True,parser=parser,splittrainandtest=0.95,**kwargs) - if not self.args.interactive: - print('>>> redirecting the following stdout and stderr to logs in',self.outputDir) - import sys - sys.stdout = open(self.outputDir+'/stdout.txt', 'w') - sys.stderr = open(self.outputDir+'/stderr.txt', 'w') + self.ngpus=1 + + if len(args.gpu): + self.ngpus=len([i for i in args.gpu.split(',')]) + print('running on '+str(self.ngpus)+ ' gpus') + self.keras_inputs=[] + self.keras_inputsshapes=[] + self.keras_model=None + self.mgpu_keras_models = [] + self.keras_weight_model_path=args.takeweights + self.train_data=None + self.val_data=None + self.startlearningrate=None + self.optimizer=None + self.trainedepoches=0 + self.compiled=False + self.checkpointcounter=0 + self.callbacks=None + self.custom_optimizer=False + self.copied_script="" + self.gradients=[] + + self.inputData = os.path.abspath(args.inputDataCollection) \ + if ',' not in args.inputDataCollection else \ + [os.path.abspath(i) for i in args.inputDataCollection.split(',')] + self.outputDir=args.outputDir + # create output dir + + isNewTraining=True + if os.path.isdir(self.outputDir): + if not (resumeSilently or recreate_silently): + var = input('output dir exists. 
To recover a training, please type "yes"\n') + if not var == 'yes': + raise Exception('output directory must not exists yet') + isNewTraining=False + if recreate_silently: + isNewTraining=True + else: + os.mkdir(self.outputDir) + self.outputDir = os.path.abspath(self.outputDir) + self.outputDir+='/' + + if recreate_silently: + os.system('rm -rf '+ self.outputDir +'*') + + #copy configuration to output dir + try: + shutil.copyfile(scriptname,self.outputDir+os.path.basename(scriptname)) + except shutil.SameFileError: + pass + except BaseException as e: + raise e + + self.copied_script = self.outputDir+os.path.basename(scriptname) + + self.train_data = collection_class() + self.train_data.readFromFile(self.inputData) + self.train_data.useweights=useweights + + if len(args.valdata): + print('using validation data from ',args.valdata) + self.val_data = DataCollection(args.valdata) + + else: + if testrun: + if len(self.train_data)>1: + self.train_data.split(testrun_fraction) + + self.train_data.dataclass_instance=None #can't be pickled + self.val_data=copy.deepcopy(self.train_data) + + else: + self.val_data=self.train_data.split(splittrainandtest) - from config_saver import copyModules - copyModules(self.outputDir)#save the modules with indexing for overwrites - @tf.function - def compute_per_replica_loss(replica_data): - with tf.GradientTape() as tape: - logits = self.keras_model(replica_data) - primary_loss_value = loss_fn(replica_data, logits) - total_loss_value = primary_loss_value + tf.add_n(self.keras_model.losses) - grads = tape.gradient(total_loss_value, self.keras_model.trainable_variables) - optimizer.apply_gradients(zip(grads, self.keras_model.trainable_variables)) - return total_loss_value - - def to_ragged_tensor(self, data_list): - for e in data_list: - rt = tf.RaggedTensor.from_row_splits(values=e[0][0], row_splits=e[0][1].flatten()) - yield rt - def compileModel(self, **kwargs): - super().compileModel(is_eager=True, - loss=None, - **kwargs) + shapes = self.train_data.getNumpyFeatureShapes() + inputdtypes = self.train_data.getNumpyFeatureDTypes() + inputnames= self.train_data.getNumpyFeatureArrayNames() + for i in range(len(inputnames)): #in case they are not named + if inputnames[i]=="" or inputnames[i]=="_rowsplits": + inputnames[i]="input_"+str(i)+inputnames[i] + + + print("shapes", shapes) + print("inputdtypes", inputdtypes) + print("inputnames", inputnames) + + self.keras_inputs=[] + self.keras_inputsshapes=[] + + for s,dt,n in zip(shapes,inputdtypes,inputnames): + self.keras_inputs.append(keras.layers.Input(shape=s, dtype=dt, name=n)) + self.keras_inputsshapes.append(s) + + #bookkeeping + self.train_data.writeToFile(self.outputDir+'trainsamples.djcdc',abspath=True) + self.val_data.writeToFile(self.outputDir+'valsamples.djcdc',abspath=True) + + if not isNewTraining: + kfile = self.outputDir+'/KERAS_check_model_last.h5' + if not os.path.isfile(kfile): + kfile = self.outputDir+'/KERAS_check_model_last' #savedmodel format + if not os.path.isdir(kfile): + kfile='' + if len(kfile): + print('loading model',kfile) + self.loadModel(kfile) + self.trainedepoches=0 + if os.path.isfile(self.outputDir+'losses.log'): + for line in open(self.outputDir+'losses.log'): + valloss = line.split(' ')[1][:-1] + if not valloss == "None": + self.trainedepoches+=1 + else: + print('incomplete epochs, starting from the beginning but with pretrained model') + else: + print('no model found in existing output dir, starting training from scratch') + + def modelSet(self): + return (not 
self.keras_model==None) and not len(self.keras_weight_model_path) + + def syncModelWeights(self): + if len(self.mgpu_keras_models) < 2: + return + weights = self.mgpu_keras_models[0].get_weights() + for model in self.mgpu_keras_models[1:]: + model.set_weights(weights) + + def setModel(self,model,**modelargs): + if len(self.keras_inputs)<1: + raise Exception('setup data first') + + with tf.device('/GPU:0'): + self.keras_model=model(self.keras_inputs,**modelargs) + + if len(self.keras_weight_model_path): + from DeepJetCore.modeltools import apply_weights_where_possible, load_model + self.keras_model = apply_weights_where_possible(self.keras_model, + load_model(self.keras_weight_model_path)) + if not self.keras_model: + raise Exception('Setting model not successful') + + self.distributeModelToGPUs() + + def distributeModelToGPUs(self): + self.mgpu_keras_models = [self.keras_model] #zero model + if self.ngpus > 1: + print("distributing model to",self.ngpus,"GPUs") + for i in range(self.ngpus-1): + with tf.device(f'/GPU:{i+1}'): + self.mgpu_keras_models.append(tf.keras.models.clone_model(self.keras_model)) + #sync initial or loaded weights + self.syncModelWeights() + + #run debug layers etc just for one model + for i, m in enumerate(self.mgpu_keras_models): + if i: + switch_off_debug_plots(m) + # TBI: this does not work yet, needs wandb update + LayerWithMetrics.switch_off_metrics_layers(m) + + #run record_metrics only for one model + + def saveCheckPoint(self,addstring=''): + self.checkpointcounter=self.checkpointcounter+1 + self.saveModel("KERAS_model_checkpoint_"+str(self.checkpointcounter)+"_"+addstring) + + def _loadModel(self,filename): + keras_model=load_model(filename) + optimizer=keras_model.optimizer + return keras_model, optimizer + + def loadModel(self,filename): + self.keras_model, self.optimizer = self._loadModel(filename) + self.distributeModelToGPUs() + #distribute to gpus + self.compiled=True + + def setCustomOptimizer(self,optimizer): + self.optimizer = optimizer + self.custom_optimizer=True + + def compileModel(self, + learningrate, + clipnorm=None, + print_models=False, + metrics=None, + is_eager=False, + **compileargs): + + if not self.keras_model: + raise Exception('set model first') + + print('Model being compiled for '+str(self.ngpus)+' gpus') + + self.startlearningrate=learningrate + + if not self.custom_optimizer: + from tensorflow.keras.optimizers import Adam + if clipnorm: + self.optimizer = Adam(lr=self.startlearningrate,clipnorm=clipnorm) + else: + self.optimizer = Adam(lr=self.startlearningrate) + + def run_compile(model, device): + with tf.device(device): + model.compile(optimizer=self.optimizer,metrics=metrics,**compileargs) + if is_eager: + #call on one batch to fully build it + print('Model being called once for device '+str(device)) + model(self.train_data.getExampleFeatureBatch()) + + for i, m in enumerate(self.mgpu_keras_models): + run_compile(m, f'/GPU:{i}') + + if print_models: + print(self.mgpu_keras_models[0].summary()) + self.compiled=True + + + def saveModel(self,outfile): + self.keras_model.save(self.outputDir+outfile) + + + # add some of the multi-gpu initialisation here? 
+ def _initTraining(self, + nepochs, + batchsize, + use_sum_of_squares=False): + + + self.train_data.setBatchSize(batchsize) + self.val_data.setBatchSize(batchsize) + self.train_data.batch_uses_sum_of_squares=use_sum_of_squares + self.val_data.batch_uses_sum_of_squares=use_sum_of_squares + + self.train_data.setBatchSize(batchsize) + self.val_data.setBatchSize(batchsize) + + ## create multi-gpu models + + #now this is hgcal specific because of missing truth, think later how to do that better + def compute_gradients(self, model, data, i): + with tf.device(f'/GPU:{i}'): + with tf.GradientTape() as tape: + predictions = model(data, training=True) + loss = tf.add_n(model.losses) + return tape.gradient(loss, model.trainable_variables) + + def average_gradients(self): + all_gradients = self.gradients + # Average the gradients across GPUs + if len(all_gradients) < 2: + return all_gradients[0] + avg_grads = [] + for grad_list_tuple in zip(*all_gradients): + grads = [g for g in grad_list_tuple if g is not None] + avg_grads.append(tf.reduce_mean(grads, axis=0)) + return avg_grads + + + def trainstep_parallel(self, split_data, collect_gradients=1): + + if self.ngpus == 1: #simple + self.gradients += [self.compute_gradients(self.mgpu_keras_models[0], split_data[0], 0)] + + else: + batch_gradients = [] + with concurrent.futures.ThreadPoolExecutor(max_workers=self.ngpus) as executor: + futures = [executor.submit(self.compute_gradients, self.mgpu_keras_models[i], split_data[i], i) for i in range(self.ngpus)] + for future in concurrent.futures.as_completed(futures): + gradients = future.result() + batch_gradients.append(gradients) + self.gradients += batch_gradients + + # Average gradients across GPUs and collection steps + if collect_gradients * self.ngpus <= len(self.gradients): + avg_grads = self.average_gradients() + self.optimizer.apply_gradients(zip(avg_grads, self.mgpu_keras_models[0].trainable_variables)) + self.syncModelWeights() # weights synced + self.gradients = [] + + + def trainModel(self, nepochs, batchsize, - run_eagerly=True, - verbose=1, + run_eagerly=False, batchsize_use_sum_of_squares = False, - fake_truth=True,#extend the truth list with dummies. Useful when adding more prediction outputs than truth inputs - backup_after_batches=500, - checkperiod=1, + fake_truth=False,#extend the truth list with dummies. 
Useful when adding more prediction outputs than truth inputs stop_patience=-1, lr_factor=0.5, lr_patience=-1, lr_epsilon=0.003, lr_cooldown=6, lr_minimum=0.000001, + checkperiod=10, + backup_after_batches=-1, additional_plots=None, additional_callbacks=None, load_in_mem = False, max_files = -1, plot_batch_loss = False, + add_progbar = False, + verbose = 0, + collect_gradients = 1, #average N batches before update **trainargs): - self.keras_model.run_eagerly=run_eagerly + for m in self.mgpu_keras_models: + m.run_eagerly=run_eagerly # write only after the output classes have been added self._initTraining(nepochs,batchsize, batchsize_use_sum_of_squares) - try: #won't work for purely eager models - self.keras_model.save(self.outputDir+'KERAS_untrained_model') + try: #won't work for purely eager models unless called first - now fixed in model compile + self.saveModel('KERAS_untrained_model.h5') except: pass print('setting up callbacks') from DeepJetCore.training.DeepJet_callbacks import DeepJet_callbacks - minTokenLifetime = 5 - if not self.renewtokens: - minTokenLifetime = -1 self.callbacks=DeepJet_callbacks(self.keras_model, stop_patience=stop_patience, @@ -97,79 +406,175 @@ def trainModel(self, additional_plots=additional_plots, batch_loss = plot_batch_loss, print_summary_after_first_batch=run_eagerly, - minTokenLifetime = minTokenLifetime) + minTokenLifetime = -1) if additional_callbacks is not None: if not isinstance(additional_callbacks, list): additional_callbacks=[additional_callbacks] self.callbacks.callbacks.extend(additional_callbacks) + #create callbacks wrapper - print('starting training') - if load_in_mem: - print('make features') - X_train = self.train_data.getAllFeatures(nfiles=max_files) - X_test = self.val_data.getAllFeatures(nfiles=max_files) - print('make truth') - Y_train = self.train_data.getAllLabels(nfiles=max_files) - Y_test = self.val_data.getAllLabels(nfiles=max_files) - self.keras_model.fit(X_train, Y_train, batch_size=batchsize, epochs=nepochs, - callbacks=self.callbacks.callbacks, - validation_data=(X_test, Y_test), - max_queue_size=1, - use_multiprocessing=False, - workers=0, - **trainargs) - else: - - #prepare generator - - print("setting up generator... can take a while") - use_fake_truth=None - if fake_truth: - if isinstance(self.keras_model.output,dict): - use_fake_truth = [k for k in self.keras_model.output.keys()] - elif isinstance(self.keras_model.output,list): - use_fake_truth = len(self.keras_model.output) - - traingen = self.train_data.invokeGenerator(fake_truth = use_fake_truth) - valgen = self.val_data.invokeGenerator(fake_truth = use_fake_truth) + callbacks = tf.keras.callbacks.CallbackList( + self.callbacks.callbacks, + add_history=True, + model=self.keras_model, #only run them on the main model! + ) + if self.trainedepoches == 0: + callbacks.on_train_begin() + #prepare generator - while(self.trainedepoches < nepochs): - - #this can change from epoch to epoch - #calculate steps for this epoch - #feed info below - traingen.prepareNextEpoch() - valgen.prepareNextEpoch() - nbatches_train = traingen.getNBatches() #might have changed due to shuffeling - nbatches_val = valgen.getNBatches() + print("setting up generator... 
can take a while") + use_fake_truth=None + if fake_truth: + if isinstance(self.keras_model.output,dict): + use_fake_truth = [k for k in self.keras_model.output.keys()] + elif isinstance(self.keras_model.output,list): + use_fake_truth = len(self.keras_model.output) + + traingen = self.train_data.invokeGenerator(fake_truth = use_fake_truth) + valgen = self.val_data.invokeGenerator(fake_truth = use_fake_truth) + + while(self.trainedepoches < nepochs): + + self.gradients = [] #reset in case of accumulated gradients - print('>>>> epoch', self.trainedepoches,"/",nepochs) - print('training batches: ',nbatches_train) - print('validation batches: ',nbatches_val) + callbacks.on_epoch_begin(self.trainedepoches) + #this can change from epoch to epoch + #calculate steps for this epoch + #feed info below + traingen.prepareNextEpoch() + valgen.prepareNextEpoch() + nbatches_train = traingen.getNBatches() #might have changed due to shuffeling + nbatches_val = valgen.getNBatches() + + print('>>>> epoch', self.trainedepoches,"/",nepochs) + print('training batches: ',nbatches_train) + print('validation batches: ',nbatches_val) - data = self.to_ragged_tensor(traingen.feedNumpyData()) - #data = traingen.feedNumpyData() + nbatches_in = 0 + single_counter = 0 + + if add_progbar: + pbar = tqdm(total=nbatches_train + nbatches_val) + + while nbatches_in < nbatches_train: + + thisbatch = [] + while len(thisbatch) < self.ngpus and nbatches_in < nbatches_train: + #only 'feature' part matters for HGCAL + thisbatch.append(next(traingen.feedNumpyData())[0]) + nbatches_in += 1 + + if len(thisbatch) != self.ngpus: #last batch might not be enough + break + + callbacks.on_train_batch_begin(single_counter) + + self.trainstep_parallel(thisbatch, collect_gradients) - self.keras_model.fit(data, - steps_per_epoch=nbatches_train, - epochs=self.trainedepoches + 1, - initial_epoch=self.trainedepoches, - callbacks=self.callbacks.callbacks, - validation_data=valgen.feedNumpyData(), - validation_steps=nbatches_val, - max_queue_size=1, - use_multiprocessing=False, - workers=0, - **trainargs - ) - self.trainedepoches += 1 - traingen.shuffleFileList() - # + logs = { m.name: m.result() for m in self.keras_model.metrics } #only for main model + + callbacks.on_train_batch_end(single_counter, logs) + + single_counter += 1 + if add_progbar: + pbar.update(len(thisbatch)) + try: + callbacks.on_epoch_end(self.trainedepoches, logs) #use same logs here + except Exception as e: + print(e) + print('will continue training anyway') + + if add_progbar: + pbar.close() + self.trainedepoches += 1 + traingen.shuffleFileList() + # + + self.saveModel("KERAS_model.h5") + + #return self.keras_model, callbacks.history + + + + + def change_learning_rate(self, new_lr): + import tensorflow.keras.backend as K + K.set_value(self.keras_model.optimizer.lr, new_lr) - self.saveModel("KERAS_model.h5") + - return self.keras_model, self.callbacks.history +class HGCalTraining(training_base): + def __init__(self, *args, + **kwargs): + ''' + Adds file logging + ''' + #no reason for a lot of validation samples usually + super().__init__(*args, resumeSilently=True,splittrainandtest=0.95,**kwargs) + + from config_saver import copyModules + copyModules(self.outputDir)#save the modules with indexing for overwrites + + def compileModel(self, **kwargs): + super().compileModel(is_eager=True, + loss=None, + **kwargs) + def trainModel(self, + nepochs, + batchsize, + backup_after_batches=500, + checkperiod=1, + **kwargs): + ''' + Just implements some defaults + ''' + return 
super().trainModel(nepochs=nepochs, + batchsize=batchsize, + run_eagerly=True, + batchsize_use_sum_of_squares=False, + fake_truth=True, + backup_after_batches=backup_after_batches, + checkperiod=checkperiod, + **kwargs) + + + +class HGCalTraining_compat(training_base_djc): + def __init__(self, *args, + **kwargs): + ''' + Adds file logging + ''' + #no reason for a lot of validation samples usually + super().__init__(*args, resumeSilently=True,splittrainandtest=0.95,**kwargs) + + from config_saver import copyModules + copyModules(self.outputDir)#save the modules with indexing for overwrites + + def compileModel(self, **kwargs): + super().compileModel(is_eager=True, + loss=None, + **kwargs) + + def trainModel(self, + nepochs, + batchsize, + backup_after_batches=500, + checkperiod=1, + **kwargs): + ''' + Just implements some defaults + ''' + return super().trainModel(nepochs=nepochs, + batchsize=batchsize, + run_eagerly=True, + verbose=2, + batchsize_use_sum_of_squares=False, + fake_truth=True, + backup_after_batches=backup_after_batches, + checkperiod=checkperiod, + **kwargs) \ No newline at end of file diff --git a/scripts/fibatch_submit.py b/scripts/fibatch_submit.py index 8c447bfb..432aa1fe 100755 --- a/scripts/fibatch_submit.py +++ b/scripts/fibatch_submit.py @@ -24,7 +24,7 @@ sys.exit() # can be used by others on FI -DJCLOC='/mnt/ceph/users/jkieseler/containers/deepjetcore3_latest.sif' +DJCLOC='/mnt/ceph/users/jkieseler/containers/deepjetcore4_latest.sif' UEXT = str(uuid.uuid4()) WORKDIR=None @@ -34,11 +34,13 @@ TRIGGERED_NAME=False TRIGGERED_CONSTRAINT=False TRIGGERED_TIME=False +TRIGGERED_GPU=False NAME="batchscript" # constraint="a100" CONSTRAINT="a100-80gb" -CONSTRAINT="a100" +#CONSTRAINT="a100" TIME="7-0" +GPUS="4" for clo in sys.argv: @@ -50,6 +52,14 @@ TRIGGERED_NAME=False continue + if clo == '---g': + TRIGGERED_GPU = True + continue + if TRIGGERED_GPU: + GPUS = clo + TRIGGERED_GPU=False + continue + if clo == '---d': TRIGGERED=True continue @@ -99,7 +109,7 @@ bscript_temp=f'''#!/bin/bash -#SBATCH -p gpu --gres=gpu:1 --mincpus 4 -t 7-0 --constraint={CONSTRAINT} +#SBATCH -p gpu --gres=gpu:{GPUS} --mincpus 4 -t 7-0 --constraint={CONSTRAINT} nvidia-smi singularity run -B /mnt --nv {DJCLOC} /bin/bash runscript_{UEXT}.sh
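
For reference, the patched `BinnedSelectKnn` in `modules/binned_select_knn_op.py` now derives the per-dimension bin count from the average row-split length instead of relying on static shapes. A minimal numpy sketch of that heuristic follows (the constants 32, 5 and 30 mirror the patched code; the event size in the example is made up):

```
# numpy-only sketch of the bin-count heuristic used by the patched BinnedSelectKnn
import numpy as np

def estimate_n_bins(row_splits, n_coord_dims, K, max_bin_dims=3):
    """Rough per-dimension bin count aiming at ~K/32 points per bin."""
    # average number of points per row split, as estimated in the patch
    elems_per_rs = int(np.max(row_splits) / len(row_splits)) + 1
    dims = min(max_bin_dims, n_coord_dims)
    n_bins = (elems_per_rs / (K / 32.)) ** (1. / dims)
    return int(np.clip(int(n_bins), 5, 30))  # clamped to [5, 30] as in the patch

# example: one event with 20000 points, 3 binning coordinates, K=64 neighbours
print(estimate_n_bins(np.array([0, 20000]), n_coord_dims=3, K=64))
```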
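The new `Dead_Zone_Hinge_OC_per_sample` in `modules/object_condensation.py` swaps the default logarithmic attractive potential for a saturating one and, via `rep_range`, only counts vertices inside a fixed radius when normalising the repulsive term. A small numpy illustration of the two attractive shapes and of the range cut (values are arbitrary, purely illustrative):

```
# toy evaluation of the attractive potentials and the rep_range cut
import numpy as np

dsq = np.linspace(0., 9., 10)                 # squared distances
v_att_log = np.log(np.e * dsq / 2. + 1.)      # default att_func: log(e*d^2/2 + 1)
v_att_sat = 1. - np.exp(-10. * dsq)           # Dead_Zone_Hinge att_func: saturates at 1

rep_range = 2.                                # repulsion normalisation only counts dsq < rep_range**2
in_range = dsq < rep_range ** 2

for d2, vl, vs, r in zip(dsq, v_att_log, v_att_sat, in_range):
    print(f"dsq={d2:4.1f}  log-att={vl:5.2f}  saturating-att={vs:4.2f}  counted for repulsion={r}")
```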
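The new `RSOffsetAdder` op shifts per-row-split indices into the global (flat) index range and leaves negative (invalid) indices untouched. A pure-numpy reference of that behaviour, reproducing the expected output quoted in the unit test of `modules/rs_offset_adder_op.py` (a sketch only, not the compiled kernel):

```
# numpy reference for the behaviour of the RSOffsetAdder custom op
import numpy as np

def rs_offset_adder_ref(idx, row_splits):
    """Add each row's row-split offset to its non-negative indices."""
    out = idx.copy()
    for i in range(len(row_splits) - 1):
        lo, hi = row_splits[i], row_splits[i + 1]
        seg = out[lo:hi]
        out[lo:hi] = np.where(seg >= 0, seg + lo, seg)
    return out

idx = np.array([0, -1, 1, -1, 4, 5, -1, 2, 1, 100], dtype=np.int32)
rs  = np.array([0, 3, 6, 10], dtype=np.int32)
print(rs_offset_adder_ref(idx, rs))  # [  0  -1   1  -1   7   8  -1   8   7 106]
```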
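The rewritten `training_base` in `modules/training_base_hgcal.py` keeps one model replica per GPU, computes gradients per replica, averages them, applies the update on the main replica and then re-syncs the weights. A minimal single-process sketch of that averaging step (only `average_gradients` follows the patch; the toy model, data and optimizer are made up for illustration):

```
# sketch of per-replica gradient averaging as done in trainstep_parallel
import tensorflow as tf

def average_gradients(all_gradients):
    """Element-wise mean over per-replica gradient lists, skipping None entries."""
    if len(all_gradients) < 2:
        return all_gradients[0]
    avg = []
    for per_var in zip(*all_gradients):
        grads = [g for g in per_var if g is not None]
        avg.append(tf.reduce_mean(tf.stack(grads, axis=0), axis=0))
    return avg

# toy stand-in: the same model called on two "per-GPU" batches
model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
opt = tf.keras.optimizers.Adam(1e-3)
batches = [tf.random.normal((8, 4)), tf.random.normal((8, 4))]

replica_grads = []
for data in batches:
    with tf.GradientTape() as tape:
        loss = tf.reduce_mean(model(data) ** 2)  # stand-in for tf.add_n(model.losses)
    replica_grads.append(tape.gradient(loss, model.trainable_variables))

# apply the averaged gradients once, on the main replica's variables
opt.apply_gradients(zip(average_gradients(replica_grads), model.trainable_variables))
```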