diff --git a/README.md b/README.md index 64da8cf1..9f02975e 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,8 @@ HGCalML =============================================================================== Requirements - * DeepJetCore 3.X (``https://github.com/DL4Jets/DeepJetCore``) - * DeepJetCore 3.X container (or latest version in general) + * DeepJetCore 4.X (``https://github.com/DL4Jets/DeepJetCore``) + * DeepJetCore 4.X container (or latest version in general) For CERN (or any machine with cvmfs mounted), a script to start the latest container use this script: ``` @@ -23,7 +23,7 @@ sing=`which singularity` unset PATH cd -$sing run -B /eos -B /afs $gpuopt /cvmfs/unpacked.cern.ch/registry.hub.docker.com/cernml4reco/deepjetcore3:latest +$sing run -B /eos -B /afs $gpuopt /cvmfs/unpacked.cern.ch/registry.hub.docker.com/cernml4reco/deepjetcore4:latest ``` The package follows the structure and logic of all DeepJetCore subpackages (also the example in DeepJetCore). So as a fresh starting point, it can be a good idea to follow the DeepJetCore example first. diff --git a/Train/config_trainer.py b/Train/config_trainer.py index 74b96613..2adf003f 100644 --- a/Train/config_trainer.py +++ b/Train/config_trainer.py @@ -21,14 +21,14 @@ from Layers import MixWhere from Layers import RaggedGravNet from Layers import PlotCoordinates -from Layers import DistanceWeightedMessagePassing +from Layers import DistanceWeightedMessagePassing, AccumulateNeighbours from Layers import LLFillSpace from Layers import LLExtendedObjectCondensation -from Layers import DictModel,RaggedDictModel +from Layers import DictModel from Layers import RaggedGlobalExchange from Layers import SphereActivation from Layers import Multi -from Layers import ShiftDistance +from Layers import ShiftDistance, KNN from Layers import LLRegulariseGravNetSpace from Regularizers import AverageDistanceRegularizer from model_blocks import tiny_pc_pool, condition_input @@ -132,18 +132,21 @@ def config_model(Inputs, td, debug_outdir=None, plot_debug_every=2000): x = ScaledGooeyBatchNorm2( fluidity_decay=0.01, max_viscosity=0.999999, + learn=True, no_gaus=False)([x, is_track]) x = ScaledGooeyBatchNorm2( fluidity_decay=0.01, max_viscosity=0.999999, invert_condition=True, + learn=True, no_gaus=False)([x, is_track]) c_coords = prime_coords c_coords = ScaledGooeyBatchNorm2( name='batchnorm_ccoords', fluidity_decay=0.01, + learn=True, max_viscosity=0.999999)(c_coords) c_coords = PlotCoordinates( plot_every=plot_debug_every, @@ -156,14 +159,20 @@ def config_model(Inputs, td, debug_outdir=None, plot_debug_every=2000): x = Concatenate()([x, c_coords, is_track]) x = Dense(64, name='dense_pre_loop', activation=DENSE_ACTIVATION)(x) - allfeat = [] + allfeat = [c_coords] print("Available keys: ", pre_processed.keys()) + ## testing ## + + #nidx,dist = KNN(8, use_approximate_knn=True)([prime_coords, rs]) + #x = Concatenate()([x, dist]) + #x = Concatenate()([x, DistanceWeightedMessagePassing([16])([x,nidx,dist]) ]) + ########################################################################### ### Loop over GravNet Layers ############################################## ########################################################################### - gravnet_regs = [0.01, 0.01, 0.01] + gravnet_reg = 0.01 for i in range(GRAVNET_ITERATIONS): @@ -189,14 +198,18 @@ def config_model(Inputs, td, debug_outdir=None, plot_debug_every=2000): )([x, rs]) gndist = LLRegulariseGravNetSpace( - scale=gravnet_regs[i], + scale=gravnet_reg, record_metrics=False, 
name=f'regularise_gravnet_{i}')([gndist, prime_coords, gnnidx]) - x_rand = random_sampling_block( - xgn, rs, gncoords, gnnidx, gndist, is_track, - reduction=6, layer_norm=True, name=f"RSU_{i}") - x_rand = ScaledGooeyBatchNorm2(**BATCHNORM_OPTIONS)(x_rand) + x = DistanceWeightedMessagePassing( + [32,32,32,32,32,32,32], + activation='elu')([x,gnnidx,gndist]) + + #x_rand = random_sampling_block( + # xgn, rs, gncoords, gnnidx, gndist, is_track, + # reduction=6, layer_norm=True, name=f"RSU_{i}") + #x_rand = ScaledGooeyBatchNorm2(**BATCHNORM_OPTIONS)(x_rand) gndist = AverageDistanceRegularizer( strength=1e-3, @@ -214,8 +227,8 @@ def config_model(Inputs, td, debug_outdir=None, plot_debug_every=2000): # x_rand = ScalarMultiply(0.1)(x_rand) # gndist = ScalarMultiply(0.01)(gndist) # gncoords = ScalarMultiply(0.01)(gncoords) - x = Concatenate()([x_pre, xgn, x_rand, gndist, gncoords]) - x = Dense(d_shape, + x = Concatenate()([x, x_pre, xgn, gndist, gncoords]) + x = Dense(2*d_shape, name=f"dense_post_gravnet_1_iteration_{i}", activation=DENSE_ACTIVATION, kernel_regularizer=DENSE_REGULARIZER)(x) @@ -270,7 +283,7 @@ def config_model(Inputs, td, debug_outdir=None, plot_debug_every=2000): pred_beta = LLExtendedObjectCondensation(scale=1., use_energy_weights=True, - record_metrics=False, + record_metrics=True, print_loss=True, name="ExtendedOCLoss", implementation = loss_implementation, @@ -304,7 +317,7 @@ def config_model(Inputs, td, debug_outdir=None, plot_debug_every=2000): # 'no_noise_rs': pre_processed['no_noise_rs'], } - return RaggedDictModel(inputs=Inputs, outputs=model_outputs) + return DictModel(inputs=Inputs, outputs=model_outputs) #return DictModel(inputs=Inputs, outputs=model_outputs) diff --git a/Train/config_trainer_jk.py b/Train/config_trainer_jk.py new file mode 100644 index 00000000..58dc16e3 --- /dev/null +++ b/Train/config_trainer_jk.py @@ -0,0 +1,404 @@ +""" +Flexible training script that should be mostly configured with a yaml config file +""" + +import os +import sys +import yaml +import shutil +from argparse import ArgumentParser + +import tensorflow as tf +from tensorflow.keras.layers import Concatenate, Dense, Dropout + +from DeepJetCore.training.DeepJet_callbacks import simpleMetricsCallback +from DeepJetCore.DJCLayers import StopGradient, ScalarMultiply + +from DeepJetCore.wandb_interface import wandb_wrapper as wandb + +import training_base_hgcal +from Layers import ScaledGooeyBatchNorm2, DummyLayer +from Layers import MixWhere, ElementWiseMultiply +from Layers import RaggedGravNet +from Layers import PlotCoordinates +from Layers import DistanceWeightedMessagePassing, AccumulateNeighbours +from Layers import LLFillSpace +from Layers import LLExtendedObjectCondensation, TranslationInvariantMP + +from tensorflow.keras import Model + +from Layers import RaggedGlobalExchange +from Layers import SphereActivation +from Layers import Multi +from Layers import ShiftDistance, KNN, MessagePassing +from Layers import LLRegulariseGravNetSpace +from Regularizers import AverageDistanceRegularizer +from model_blocks import tiny_pc_pool, condition_input +from model_blocks import extent_coords_if_needed +from model_blocks import create_outputs +from model_tools import apply_weights_from_path +from model_blocks import random_sampling_unit, random_sampling_block, random_sampling_block2 +from noise_filter import noise_filter +from callbacks import plotClusteringDuringTraining +from callbacks import plotClusterSummary +from callbacks import NanSweeper, DebugPlotRunner + + 
+#################################################################################################### +### Load Arguments and trainer ##################################################################### +#################################################################################################### + +parser = ArgumentParser('training') +parser.add_argument('configFile') +parser.add_argument('--no_wandb', help="Don't use wandb", action='store_true') +parser.add_argument('--run_name', help="wandb run name", default='test') +parser.add_argument('--wandb_project', help="wandb project name", default="Autumn_2023") + +train = training_base_hgcal.HGCalTraining(parser=parser) #this will add all the other standard args -> train.args + +CONFIGFILE = train.args.configFile +print(f"Using config File: \n{CONFIGFILE}") + +with open(CONFIGFILE, 'r') as f: + config = yaml.load(f, Loader=yaml.FullLoader) + +N_CLUSTER_SPACE_COORDINATES = config['General']['n_cluster_space_coordinates'] +N_GRAVNET_SPACE_COORDINATES = config['General']['n_gravnet_space_coordinates'] +GRAVNET_ITERATIONS = len(config['General']['gravnet']) +LOSS_OPTIONS = config['LossOptions'] +BATCHNORM_OPTIONS = config['BatchNormOptions'] +DENSE_ACTIVATION = config['DenseOptions']['activation'] +DENSE_REGULARIZER = tf.keras.regularizers.l2(config['DenseOptions']['kernel_regularizer_rate']) +DROPOUT = config['DenseOptions']['dropout'] + +wandb_config = { + "loss_implementation" : config['General']['oc_implementation'], + "gravnet_iterations" : GRAVNET_ITERATIONS, + "gravnet_space_coordinates" : N_GRAVNET_SPACE_COORDINATES, + "cluster_space_coordinates" : N_CLUSTER_SPACE_COORDINATES, + "loss_energy_weight" : config['LossOptions']['energy_loss_weight'], + "loss_classification_weight" : config['LossOptions']['classification_loss_weight'], + "loss_qmin" : config['LossOptions']['q_min'], + "loss_use_average_cc_pos" : config['LossOptions']['use_average_cc_pos'], + "loss_too_much_beta_scale" : config['LossOptions']['too_much_beta_scale'], + "loss_beta_scale" : config['LossOptions']['beta_loss_scale'], + "batch_max_viscosity" : config['BatchNormOptions']['max_viscosity'], + "dense_activation" : config['DenseOptions']['activation'], + "dense_kernel_reg" : config['DenseOptions']['kernel_regularizer_rate'] , + "dense_dropout" : config['DenseOptions']['dropout'], +} + +for i in range(GRAVNET_ITERATIONS): + wandb_config[f"gravnet_{i}_neighbours"] =config['General']['gravnet'][i]['n'] +for i in range(len(config['Training'])): + wandb_config[f"train_{i}_lr"] = config['Training'][i]['learning_rate'] + wandb_config[f"train_{i}_epochs"] = config['Training'][i]['epochs'] + wandb_config[f"train_{i}_batchsize"] = config['Training'][i]['batch_size'] + if i == 1: + wandb_config[f"train_{i}+_max_visc"] = 0.999 + wandb_config[f"train_{i}+_fluidity_decay"] = 0.1 + +if not train.args.no_wandb: + wandb.init( + project=train.args.wandb_project, + config=wandb_config, + ) + wandb.save(sys.argv[0]) # Save python file + wandb.save(train.args.configFile) # Save config file +else: + wandb.active=False + + +############################################################################### +### Define Model ############################################################## +############################################################################### + + +def config_model(Inputs, td, debug_outdir=None, plot_debug_every=500): + """ + Function that defines the model to train + """ + + ########################################################################### + ### 
Pre-processing step ################################################### + ########################################################################### + + orig_input = td.interpretAllModelInputs(Inputs) + pre_processed = condition_input(orig_input, no_scaling=False, no_prime=False) + + prime_coords = pre_processed['prime_coords'] + is_track = pre_processed['is_track'] + rs = pre_processed['row_splits'] + energy = pre_processed['rechit_energy'] + t_idx = pre_processed['t_idx'] + x = pre_processed['features'] + + # preprocessed features here are already normalised! (no_scaling=False above) + # Embed hits and track features differently + x = ScaledGooeyBatchNorm2( + learn=True, + no_bias_gamma=True, + no_gaus=False)([x, is_track]) + + x = ScaledGooeyBatchNorm2( + invert_condition=True, + learn=True, + no_bias_gamma=True, + no_gaus=False)([x, is_track]) + + c_coords = prime_coords + c_coords = ScaledGooeyBatchNorm2( + name='batchnorm_ccoords', + no_bias_gamma=True, + learn=True + )(c_coords) + + c_coords = PlotCoordinates( + plot_every=plot_debug_every, + outdir=debug_outdir, + name='input_c_coords', + # publish = publishpath + )([c_coords, energy, t_idx, rs]) + + x = Concatenate()([x, c_coords, is_track]) + x = Dense(24, name='dense_pre_pre_loop', activation=DENSE_ACTIVATION)(x) + print("Available keys: ", pre_processed.keys()) + + ## create a pre-information exchange + layerwise_coords = ElementWiseMultiply([[1.,1.,1000.]])(prime_coords) + flat_coords = ElementWiseMultiply([[1.,1.,0.]])(prime_coords) + + lw_nidx, lw_dist = KNN(32)([layerwise_coords, rs]) + flat_nidx, flat_dist = KNN(32)([flat_coords, rs]) + + # a few simple MP in different directions + + for _ in range(1): + xt = TranslationInvariantMP([16,16])([x, lw_nidx]) + x = Concatenate()([xt,x]) + xt = TranslationInvariantMP([16,16])([x, flat_nidx]) + x = Concatenate()([x,xt]) + + c_coords = extent_coords_if_needed(prime_coords, x, N_CLUSTER_SPACE_COORDINATES) + x = Dense(64, name='dense_pre_loop', activation=DENSE_ACTIVATION)(x) + allfeat = [c_coords, x] + gncoords = c_coords + + ########################################################################### + ### Loop over GravNet Layers ############################################## + ########################################################################### + + gravnet_reg = 1e-4 + + for i in range(GRAVNET_ITERATIONS): + + d_shape = x.shape[1]//2 + + x = Dense(d_shape,activation=DENSE_ACTIVATION, + kernel_regularizer=DENSE_REGULARIZER)(x) + x = Dense(d_shape,activation=DENSE_ACTIVATION, + kernel_regularizer=DENSE_REGULARIZER)(x) + + x = ScaledGooeyBatchNorm2(**BATCHNORM_OPTIONS)(x) + x_pre = x + x = Concatenate()([gncoords, x]) + + x, gncoords, gnnidx, gndist = RaggedGravNet( + name = f"Gravnet_{i}", # 76929, 42625, 42625 + n_neighbours=config['General']['gravnet'][i]['n'], + n_dimensions=N_GRAVNET_SPACE_COORDINATES, + n_filters=d_shape, + n_propagate=2*d_shape, + coord_initialiser_noise=1e-2, + feature_activation=None, #classic gravnet + # sumwnorm=True, + )([x, rs]) + + gndist = LLRegulariseGravNetSpace( + scale=gravnet_reg, + record_metrics=False, + name=f'regularise_gravnet_{i}')([gndist, prime_coords, gnnidx]) + + x = Concatenate()([x, TranslationInvariantMP( + [32,32], + activation='elu')([x,gnnidx,gndist]) + ]) + + gncoords = PlotCoordinates( + plot_every=plot_debug_every, + outdir=debug_outdir, + name='gn_coords_'+str(i) + )([gncoords, energy, t_idx, rs]) + + x = DummyLayer()([x,gncoords,gndist])#just make sure any layer above is not optimised away + + # afew of those again + 
xt = TranslationInvariantMP([16,16])([x, lw_nidx]) + x = Concatenate()([x,xt]) + xt = TranslationInvariantMP([16,16])([x, flat_nidx]) + x = Concatenate()([x,xt]) + + x = Concatenate()([x, x_pre]) + x = Dense(2*d_shape, + name=f"dense_post_gravnet_1_iteration_{i}", + activation=DENSE_ACTIVATION, + kernel_regularizer=DENSE_REGULARIZER)(x) + x = Dense(d_shape, + name=f"dense_post_gravnet_2_iteration_{i}", + activation=DENSE_ACTIVATION, + kernel_regularizer=DENSE_REGULARIZER)(x) + + x = ScaledGooeyBatchNorm2( + name=f"batchnorm_loop1_iteration_{i}", + **BATCHNORM_OPTIONS)(x) + + allfeat.append(x) + + if len(allfeat) > 1: + x = Concatenate()(allfeat) + else: + x = allfeat[0] + + ########################################################################### + ### Create output of model and define loss ################################ + ########################################################################### + + + # afew of those again + xt = TranslationInvariantMP([32,16])([x, lw_nidx]) + x = Concatenate()([x,xt]) + xt = TranslationInvariantMP([32,16])([x, flat_nidx]) + x = Concatenate()([x,xt]) + + x = Dense(64, + name=f"dense_final_{1}", + activation=DENSE_ACTIVATION, + kernel_regularizer=DENSE_REGULARIZER)(x) + x = Dense(64, + name=f"dense_final_{2}", + activation=DENSE_ACTIVATION, + kernel_regularizer=DENSE_REGULARIZER)(x) + x = Dense(64, + name=f"dense_final_{3}", + activation=DENSE_ACTIVATION, + kernel_regularizer=DENSE_REGULARIZER)(x) + x = ScaledGooeyBatchNorm2( + name=f"batchnorm_final", + **BATCHNORM_OPTIONS)(x) + + pred_beta, pred_ccoords, pred_dist, \ + pred_energy_corr, pred_energy_low_quantile, pred_energy_high_quantile, \ + pred_pos, pred_time, pred_time_unc, pred_id = \ + create_outputs(x, n_ccoords=N_CLUSTER_SPACE_COORDINATES, fix_distance_scale=True) + + # pred_ccoords = LLFillSpace(maxhits=2000, runevery=5, scale=0.01)([pred_ccoords, rs, t_idx]) + + + pred_beta = LLExtendedObjectCondensation(scale=1., + use_energy_weights=True, + record_metrics=True, + print_loss=False, + name="ExtendedOCLoss", + implementation = config['General']['oc_implementation'], + **LOSS_OPTIONS)( + [pred_beta, pred_ccoords, pred_dist, pred_energy_corr, pred_energy_low_quantile, + pred_energy_high_quantile, pred_pos, pred_time, pred_time_unc, pred_id, energy, + pre_processed['t_idx'] , pre_processed['t_energy'] , pre_processed['t_pos'] , + pre_processed['t_time'] , pre_processed['t_pid'] , pre_processed['t_spectator_weight'], + pre_processed['t_fully_contained'], pre_processed['t_rec_energy'], + pre_processed['t_is_unique'], pre_processed['row_splits']]) + + pred_ccoords = PlotCoordinates( + plot_every=plot_debug_every, + outdir = debug_outdir, + name='condensation' + )([pred_ccoords, pred_beta, pre_processed['t_idx'], rs]) + + model_outputs = { + 'pred_beta': pred_beta, + 'pred_ccoords': pred_ccoords, + 'pred_energy_corr_factor': pred_energy_corr, + 'pred_energy_low_quantile': pred_energy_low_quantile, + 'pred_energy_high_quantile': pred_energy_high_quantile, + 'pred_pos': pred_pos, + 'pred_time': pred_time, + 'pred_id': pred_id, + 'pred_dist': pred_dist, + 'rechit_energy': energy, + 'row_splits': pre_processed['row_splits'], + # 'no_noise_sel': pre_processed['no_noise_sel'], + # 'no_noise_rs': pre_processed['no_noise_rs'], + } + + return Model(inputs=Inputs, outputs=model_outputs) + #return DictModel(inputs=Inputs, outputs=model_outputs) + + +############################################################################### +### Set up training ########################################################### 
+############################################################################### + + + +if not train.modelSet(): + train.setModel( + config_model, + td=train.train_data.dataclass(), + debug_outdir=train.outputDir+'/intplots', + ) + train.setCustomOptimizer(tf.keras.optimizers.Nadam(clipnorm=1.)) + train.compileModel(learningrate=1e-4) + train.keras_model.summary() + +############################################################################### +### Callbacks ################################################################# +############################################################################### + + +samplepath = train.val_data.getSamplePath(train.val_data.samples[0]) +PUBLISHPATH = "" +PUBLISHPATH += [d for d in train.outputDir.split('/') if len(d)][-1] +RECORD_FREQUENCY = 10 +PLOT_FREQUENCY = 40 + +cb = [NanSweeper()] #this takes a bit of time checking each batch but could be worth it + +cb += [ + plotClusterSummary( + outputfile=train.outputDir + "/clustering/", + samplefile=train.val_data.getSamplePath(train.val_data.samples[0]), + after_n_batches=500 + ) + ] + +############################################################################### +### Actual Training ########################################################### +############################################################################### + +shutil.copyfile(CONFIGFILE, os.path.join(sys.argv[3], "config.yaml")) + +N_TRAINING_STAGES = len(config['Training']) +for i in range(N_TRAINING_STAGES): + print(f"Starting training stage {i}") + learning_rate = config['Training'][i]['learning_rate'] + epochs = config['Training'][i]['epochs'] + batch_size = config['Training'][i]['batch_size'] + train.change_learning_rate(learning_rate) + print(f"Training for {epochs} epochs") + print(f"Learning rate set to {learning_rate}") + print(f"Batch size: {batch_size}") + + if i == 1: + # change batchnorm + for layer in train.keras_model.layers: + if 'batchnorm' in layer.name: + layer.max_viscosity = 0.999 + layer.fluidity_decay = 0.01 + print("Here we are") + train.trainModel( + nepochs=epochs, + batchsize=batch_size, + add_progbar=True, + additional_callbacks=cb, + collect_gradients = 1 + ) diff --git a/Train/configuration/rsu.yaml b/Train/configuration/rsu.yaml index 305b8832..954a78f1 100644 --- a/Train/configuration/rsu.yaml +++ b/Train/configuration/rsu.yaml @@ -1,44 +1,44 @@ General: oc_implementation: 'hinge' gravnet: - - n: 32 - - n: 128 + - n: 64 - n: 64 n_cluster_space_coordinates: 3 n_gravnet_space_coordinates: 3 DenseOptions: activation: "elu" - kernel_regularizer_rate: 0.001 + kernel_regularizer_rate: 0.00000000001 dropout: 0.000001 BatchNormOptions: max_viscosity: 0.5 + learn: True LossOptions: - energy_loss_weight: .00 - q_min: 5.0 - use_average_cc_pos: 0.99 - classification_loss_weight: 0.0 + energy_loss_weight: 0.00000000001 + q_min: 2.1 + use_average_cc_pos: 0.99999 + classification_loss_weight: 0.00000000001 too_much_beta_scale: 0.0 - position_loss_weight: 0.0 - timing_loss_weight: 0.0 + position_loss_weight: 0.00000000001 + timing_loss_weight: 0.00000000001 beta_loss_scale: 1.0 Training: - stage_1: batch_size: 120000 - learning_rate: 0.0003 - epochs: 1 + learning_rate: 0.0005 + epochs: 4 - stage_2: - batch_size: 120000 + batch_size: 400000 learning_rate: 0.0001 epochs: 10 - stage_3: - batch_size: 120000 - learning_rate: 0.00001 + batch_size: 400000 + learning_rate: 0.00003 epochs: 20 diff --git a/modules/DebugLayers.py b/modules/DebugLayers.py index 7b2b791e..29f71b9a 100644 --- a/modules/DebugLayers.py +++ 
b/modules/DebugLayers.py @@ -125,7 +125,7 @@ def __init__(self, if len(outdir) < 1: self.plot_every=0 self.outdir = outdir - self.counter=-1 + self.counter=0 if not os.path.isdir(os.path.dirname(self.outdir)): #could not be created self.outdir='' diff --git a/modules/GravNetLayersRagged.py b/modules/GravNetLayersRagged.py index 7e2189b1..cb957408 100644 --- a/modules/GravNetLayersRagged.py +++ b/modules/GravNetLayersRagged.py @@ -33,6 +33,16 @@ # outshape *= 2 # return tf.reshape(out, [-1, outshape]),midx +def layernorm(x, return_norm=False): + #x = x - tf.reduce_mean(x,axis=-1, keepdims=True) + norm = tf.reduce_sum(x**2, axis=-1,keepdims=True) + norm = tf.sqrt(norm+1e-6) + if return_norm: + x = tf.concat([x / norm * tf.sqrt(tf.cast(x.shape[-1],'float32')), norm], axis=-1) + else: + x = x / norm * tf.sqrt(tf.cast(x.shape[-1],'float32')) + return x + class RandomSampling(tf.keras.layers.Layer): """ @@ -1314,7 +1324,7 @@ def call(self, inputs, training=None): class ScaledGooeyBatchNorm2(tf.keras.layers.Layer): def __init__(self, - viscosity=0.01, + viscosity=1e-9,#start at almost zero fluidity_decay=1e-4, max_viscosity=0.99999, no_gaus = True, @@ -1322,6 +1332,8 @@ def __init__(self, invert_condition=False, _promptnames=None, #compatibility, does nothing record_metrics=False, #compatibility, does nothing + learn = False, + no_bias_gamma = False, **kwargs): ''' Input features (or [features, condition]), output: normed features @@ -1350,6 +1362,8 @@ def __init__(self, self.epsilon = epsilon self.no_gaus = no_gaus self.invert_condition = invert_condition + self.learn = learn + self.no_bias_gamma = no_bias_gamma def compute_output_shape(self, input_shapes): #return input_shapes[0] @@ -1363,7 +1377,9 @@ def get_config(self): 'max_viscosity': self.max_viscosity, 'epsilon': self.epsilon, 'no_gaus': self.no_gaus, - 'invert_condition': self.invert_condition + 'invert_condition': self.invert_condition, + 'learn': self.learn, + 'no_bias_gamma': self.no_bias_gamma } base_config = super(ScaledGooeyBatchNorm2, self).get_config() return dict(list(base_config.items()) + list(config.items())) @@ -1378,14 +1394,14 @@ def build(self, input_shapes): shape = (1,)+input_shapes[1:] self.bias = self.add_weight(name = 'bias',shape = shape, - initializer = 'zeros', trainable = self.trainable) + initializer = 'zeros', trainable = self.trainable and not self.no_bias_gamma) self.gamma = self.add_weight(name = 'gamma',shape = shape, - initializer = 'ones', trainable = self.trainable) - + initializer = 'ones', trainable = self.trainable and not self.no_bias_gamma) + self.mean = self.add_weight(name = 'mean',shape = shape, - initializer = 'zeros', trainable = False) + initializer = 'zeros', trainable = self.learn and self.trainable) self.den = self.add_weight(name = 'den',shape = shape, - initializer = 'ones', trainable = False) + initializer = 'ones', trainable = self.learn and self.trainable) self.viscosity = tf.Variable(initial_value=self.viscosity_init, name='viscosity', trainable=False,dtype='float32') @@ -1403,6 +1419,8 @@ def _calc_mean_and_protect(self, x, mask, default): return x def _calc_update(self, old, new, training, visc=None): + if not self.trainable: + return old if visc is None: visc = self.viscosity delta = new-old @@ -1410,6 +1428,8 @@ def _calc_update(self, old, new, training, visc=None): return tf.keras.backend.in_train_phase(update,old,training=training) def _update_viscosity(self, training): + if not self.trainable: + return if self.fluidity_decay > 0: newvisc = self.viscosity + (self.max_viscosity 
- self.viscosity)*self.fluidity_decay newvisc = tf.keras.backend.in_train_phase(newvisc,self.viscosity,training=training) @@ -1422,15 +1442,14 @@ def _calc_out(self, x_in, cond): out = (x_in - ngmean) / (tf.abs(ngden) + self.epsilon) out = out*self.gamma + self.bias - if self.invert_condition: - return tf.where(cond<=0.5, out, x_in) - else: - return tf.where(cond>0.5, out, x_in) + return tf.where(cond>0.5, out, x_in) def call(self, inputs, training=None): if isinstance(inputs,list): x_in, cond = inputs cond = tf.where(cond > 0.5, tf.ones_like(cond), 0.) #make sure it's ones and zeros + if self.invert_condition: + cond = 1.-cond else: x_in = inputs cond = tf.ones_like(x_in[...,0:1]) @@ -1441,7 +1460,7 @@ def call(self, inputs, training=None): #x = x_in #maybe don't stop the gradient? x_m = self._calc_mean_and_protect(x, cond, self.mean) - diff_to_mean = tf.abs(x - self.mean) #self.mean or x_m + diff_to_mean = tf.abs(x - x_m) #self.mean) #self.mean or x_m -> self.mean over-corrects if not self.no_gaus: diff_to_mean = diff_to_mean**2 @@ -1450,14 +1469,18 @@ def call(self, inputs, training=None): if not self.no_gaus: x_std = tf.sqrt(x_std + self.epsilon) - - update = self._calc_update(self.mean,x_m,training) - tf.keras.backend.update(self.mean, update) - - update = self._calc_update(self.den,x_std,training) - tf.keras.backend.update(self.den, update) - - self._update_viscosity(training) + #tf.print(self.name, 'p_loss',p_loss, tf.reduce_mean(self.mean), tf.reduce_mean(self.den)) + if self.learn: + p_loss = tf.reduce_mean( (self.mean-x_m)**2 + (self.den-x_std)**2 ) + self.add_loss(p_loss) + else: + update = self._calc_update(self.mean,x_m,training) + tf.keras.backend.update(self.mean, update) + + update = self._calc_update(self.den,x_std,training) + tf.keras.backend.update(self.den, update) + + self._update_viscosity(training) out = self._calc_out(x_in, cond) @@ -1482,103 +1505,82 @@ def __init__(self,**kwargs): class ProcessFeatures(tf.keras.layers.Layer): + def __init__(self, - newformat=True,#compat can be restored but default is new format + is_hgcal = False, **kwargs): - """ - Inputs are: - - Features - - Call will return: - - processed features - - will apply some simple fixed preprocessing to the standard TrainData_OC features - """ - - from datastructures import TrainData_NanoML - self.td=TrainData_NanoML() - self.newformat = newformat - super(ProcessFeatures, self).__init__(**kwargs) + super().__init__(**kwargs) + self.is_hgcal = is_hgcal + ''' + 'recHitEnergy': feat[:,0:1] , #recHitEnergy, + 'recHitEta' : feat[:,1:2] , #recHitEta , + 'recHitID' : feat[:,2:3] , #recHitID, #indicator if it is track or not + 'recHitTheta' : feat[:,3:4] , #recHitTheta , + 'recHitR' : feat[:,4:5] , #recHitR , + 'recHitX' : feat[:,5:6] , #recHitX , + 'recHitY' : feat[:,6:7] , #recHitY , + 'recHitZ' : feat[:,7:8] , #recHitZ , + 'recHitTime' : feat[:,8:9] , #recHitTime + 'recHitHitR' : feat[:,9:10] , + ''' + if self.is_hgcal: + self.mean = tf.constant([[ + 0.1, + 0., # 2.6024690e+00, #this is abs + 0., + 0., # 1.5362334e-01, #this is abs + 0., # 3.4008121e+02, #this is abs + 0., + 0., + 0., # 3.3786691e+02 #this is abs + 0., + 0.1]], dtype='float32') + else: + self.mean = tf.constant([[ + 0.1, + 2.6, #this is abs + 0., + 1.5e-01, #this is abs + 3.5e+02, #this is abs + 0., + 0., + 3.5e+02, + 0., + 0.1]], dtype='float32') + + self.std = tf.constant([[ + 2.4, + 3.4e-01, + 1., #ID set to 1, this will be track charge? 
+ 6.4e-02, + 1.5e+01, + 4.0e+01, + 3.9e+01, + 1.3e+01, + 1., + 4.2e-01]], dtype='float32') + + def get_config(self): - config = {'newformat': self.newformat} + config = {'is_hgcal': self.is_hgcal} base_config = super(ProcessFeatures, self).get_config() return dict(list(base_config.items()) + list(config.items())) - + def compute_output_shape(self, input_shape): return input_shape def call(self, inputs): - ''' - 'recHitEnergy', - 'recHitEta', - 'isTrack', - 'recHitTheta', - 'recHitR', - 'recHitX', - 'recHitY', - 'recHitZ', - 'recHitTime', - 'recHitHitR' - ''' - feat = None - fdict = self.td.createFeatureDict(inputs, False) - - if not self.newformat: - #please make sure in TrainData_OC that this is consistent - fdict['recHitR'] /= 100. - fdict['recHitX'] /= 100. - fdict['recHitY'] /= 100. - fdict['recHitZ'] = (tf.abs(fdict['recHitZ'])-400)/100. - fdict['recHitEta'] = tf.abs(fdict['recHitEta']) - fdict['recHitTheta'] = 2.*tf.math.atan(tf.exp(-fdict['recHitEta'])) - fdict['recHitTime'] = tf.nn.relu(fdict['recHitTime'])/10. #remove -1 default - allf = [] - for k in fdict: - allf.append(fdict[k]) - feat = tf.concat(allf,axis=-1) - - mean = tf.constant([[0.0740814656, 2.46156192, 0., 0.207392946, 3.55599976, 0.0609507263, - -0.00401970092, -0.515379727, 0.0874295086]]) - std = tf.constant([[0.299679846, 0.382687777, 1., 0.0841238275, 0.250777304, 0.485394388, - 0.518072903, 0.240222782, 0.194716245]]) - feat -= mean - feat /= std - - return feat - - else: #new std format - fdict['recHitEta'] = tf.abs(fdict['recHitEta']) - fdict['recHitZ'] = tf.abs(fdict['recHitZ']) - fdict['recHitTheta'] = 2.*tf.math.atan(tf.exp(-fdict['recHitEta'])) - allf = [] - for k in fdict: - allf.append(fdict[k]) - feat = tf.concat(allf,axis=-1) - - mean = tf.constant([[ 3.95022651e-02, 2.46088736e+00, - 0.00000000e+00, - 1.56797723e+00, 3.54114793e+02, - 0.,#x - 0.,#y - 3.47267313e+02, - -3.73342582e-01, - 7.61663214e-01]]) - std = tf.constant([[ 0.16587503, 0.36627547, 1. , - 1.39035478, 22.55941696, - 50., 50., - 21.75297722, - 1.89301789, - 0.14808707]]) - - feat -= mean - feat /= std - - return feat - - ##old format below + #hard coded normalisation for HGCAL (or similar) only! + feat = inputs + + feat -= self.mean + feat /= self.std + + return feat + @@ -2405,6 +2407,8 @@ def compute_output_shape(self, input_shapes): def raw_call(coordinates, row_splits, K, radius, use_approximate_knn, min_bins, tfdist, myself): nbins=None if use_approximate_knn: + if min_bins is None: + min_bins = 11 bin_width = radius # default value for SlicingKnn kernel idx,dist,nbins = SlicingKnn(K+1, coordinates, row_splits, features_to_bin_on = (0,1), @@ -3301,7 +3305,7 @@ def build(self, input_shapes): super(RaggedGravNet, self).build(input_shape) - + @tf.function def create_output_features(self, x, neighbour_indices, distancesq): allfeat = [] features = x @@ -3316,20 +3320,11 @@ def create_output_features(self, x, neighbour_indices, distancesq): features = tf.concat(allfeat + [x], axis=-1) return self.output_feature_transform(features) - def priv_call(self, inputs, training=None): - x = inputs[0] - row_splits = inputs[1] - tf.assert_equal(x.shape.ndims, 2) - tf.assert_equal(row_splits.shape.ndims, 1) - if row_splits.shape[0] is not None: - tf.assert_equal(row_splits[-1], x.shape[0]) - - x_coord = x - if len(inputs) == 3: - x_coord = tf.concat([inputs[2], x], axis=-1) - + #@tf.function(reduce_retracing=True) #don't know why this is being retraced so often.. 
+ def priv_call(self, x, row_splits, x_coord): + coordinates = self.input_spatial_transform(x_coord) - neighbour_indices, distancesq, sidx, sdist = self.compute_neighbours_and_distancesq(coordinates, row_splits, training) + neighbour_indices, distancesq, sidx, sdist = self.compute_neighbours_and_distancesq(coordinates, row_splits) neighbour_indices = tf.reshape(neighbour_indices, [-1, self.n_neighbours]) #for proper output shape for keras distancesq = tf.reshape(distancesq, [-1, self.n_neighbours]) @@ -3338,8 +3333,14 @@ def priv_call(self, inputs, training=None): neighbour_indices, distancesq = sidx, sdist return outfeats, coordinates, neighbour_indices, distancesq - def call(self, inputs, training): - return self.priv_call(inputs, training) + def call(self, inputs, training=None): + x = inputs[0] + row_splits = inputs[1] + x_coord = x + if len(inputs) == 3: + x_coord = tf.concat([inputs[2], x], axis=-1) + + return self.priv_call(x, row_splits, x_coord) def compute_output_shape(self, input_shapes): if self.return_self: @@ -3353,9 +3354,8 @@ def compute_output_shape(self, input_shapes): (input_shapes[0][0], self.n_neighbours),\ (input_shapes[0][0], self.n_neighbours) - - - def compute_neighbours_and_distancesq(self, coordinates, row_splits, training): + #@tf.function + def compute_neighbours_and_distancesq(self, coordinates, row_splits): idx,dist = BinnedSelectKnn(self.n_neighbours+1, coordinates, row_splits, max_radius= -1.0, tf_compatible=False, @@ -3367,9 +3367,7 @@ def compute_neighbours_and_distancesq(self, coordinates, row_splits, training): dist = tf.where(idx<0,0.,dist) - if self.return_self: - return idx[:, 1:], dist[:, 1:], idx, dist - return idx[:, 1:], dist[:, 1:], None, None + return idx[:, 1:], dist[:, 1:], idx, dist def collect_neighbours(self, features, neighbour_indices, distancesq): @@ -3415,6 +3413,115 @@ def call(self, input): att = 2.* self.att_dense(input) return input * att +class TranslationInvariantMP(tf.keras.layers.Layer): + def __init__(self, + n_feature_transformation, + activation='elu', + mean=True, + layer_norm = False, + sum_weight=False, + **kwargs): + super(TranslationInvariantMP, self).__init__(**kwargs) + + self.n_feature_transformation = n_feature_transformation + self.activation = activation + self.mean = mean + self.layer_norm = layer_norm + self.sum_weight = sum_weight + self.feature_tranformation_dense = [] + for i in range(len(self.n_feature_transformation)): + with tf.name_scope(self.name + "/" + str(i)): + self.feature_tranformation_dense.append( + tf.keras.layers.Dense(n_feature_transformation[i], activation=activation, use_bias = i>0)) + + + def build(self, input_shapes): + input_shape = input_shapes[0] + + with tf.name_scope(self.name + "/" + str(0)): + self.feature_tranformation_dense[0].build(input_shape) + + for i in range(1, len(self.feature_tranformation_dense)): + with tf.name_scope(self.name + "/" + str(i)): + self.feature_tranformation_dense[i].build((input_shape[0], self.n_feature_transformation[i - 1])) + + super(TranslationInvariantMP, self).build(input_shapes) + + + def compute_output_shape(self, inputs_shapes): + fshape = inputs_shapes[0][-1] + return (None, sum(self.n_feature_transformation)) + + + def get_config(self): + config = {'n_feature_transformation': self.n_feature_transformation, + 'activation': self.activation, + 'mean': self.mean, + 'layer_norm': self.layer_norm, + 'sum_weight': self.sum_weight + } + base_config = super(TranslationInvariantMP, self).get_config() + return dict(list(base_config.items()) + 
list(config.items())) + + def _trf_loop(self, features, neighbour_indices, distancesq, K, first): + + if self.layer_norm: + features = layernorm(features) + prev_feat = features + if self.mean: + #add a 1 to the features for translation invariance later + if first: + ones = tf.ones_like(features[:,0:1]) + features = tf.concat([ones, features], axis=-1) + # Standard Message Passing + features = AccumulateKnn( + 10.*distancesq, + features, + neighbour_indices, + mean_and_max=False)[0] * K + + # this is only necessary for the first exchange, afterwards the features are already translation independent + if first: + minus_xi = features[:,0:1] + features = features[:,1:] + features -= prev_feat * minus_xi + if self.sum_weight: + wsum = tf.math.divide_no_nan(K, tf.reduce_sum( tf.exp(-10.*distancesq), axis=1, keepdims=True) + 1e-2)#large eps + features *= wsum + else: #max + nfeat = SelectWithDefault(neighbour_indices, features,-2.) + features = tf.reduce_max(tf.exp(-10.*distancesq) * nfeat - features[:,tf.newaxis,:], axis=1) * K + + return features/K + + + def create_output_features(self, x, neighbour_indices, distancesq): + allfeat = [] + features = x + K = tf.cast(tf.shape(neighbour_indices)[1], 'float32') + + for i in range(len(self.n_feature_transformation)): + features = self._trf_loop(features, neighbour_indices, distancesq, K, i==0) + t = self.feature_tranformation_dense[i] + features = t(features) #divide by K here again + allfeat.append(features) + + features = tf.concat(allfeat, axis=-1) + features = tf.reshape(features, [-1, sum(self.n_feature_transformation)]) + return features + + @tf.function + def call(self, inputs): + if len(inputs) == 3: + x, neighbor_indices, distancesq = inputs + elif len(inputs) == 2: + x, neighbor_indices = inputs + distancesq = tf.zeros_like(neighbor_indices, dtype='float32') + else: + raise ValueError(self.name+" was passed wrong inputs") + + return self.create_output_features(x, neighbor_indices, distancesq) + class MultiAttentionGravNetAdd(tf.keras.layers.Layer): def __init__(self, @@ -3605,7 +3712,7 @@ def __init__(self, n_feature_transformation, self.feature_tranformation_dense = [] for i in range(len(self.n_feature_transformation)): with tf.name_scope(self.name + "/5/" + str(i)): - self.feature_tranformation_dense.append(tf.keras.layers.Dense(self.n_feature_transformation[i], activation='relu')) # restrict variations a bit + self.feature_tranformation_dense.append(tf.keras.layers.Dense(self.n_feature_transformation[i], activation='elu')) # restrict variations a bit def build(self, input_shapes): input_shape = input_shapes[0] @@ -3700,6 +3807,7 @@ def get_config(self): base_config = super(DistanceWeightedMessagePassing, self).get_config() return dict(list(base_config.items()) + list(config.items())) + @tf.function def create_output_features(self, x, neighbour_indices, distancesq): allfeat = [] features = x diff --git a/modules/Layers.py b/modules/Layers.py index ce13ae2c..848c7cfe 100644 --- a/modules/Layers.py +++ b/modules/Layers.py @@ -16,12 +16,8 @@ from tensorflow.keras.layers import LeakyReLU global_layers_list['LeakyReLU'] = LeakyReLU - #base modules -from baseModules import PromptMetric -global_layers_list['PromptMetric'] = PromptMetric - from baseModules import LayerWithMetrics global_layers_list['LayerWithMetrics'] = LayerWithMetrics @@ -230,6 +226,9 @@ from GravNetLayersRagged import RaggedGravNet global_layers_list['RaggedGravNet']=RaggedGravNet +from GravNetLayersRagged import TranslationInvariantMP 
+global_layers_list['TranslationInvariantMP']=TranslationInvariantMP + from GravNetLayersRagged import SelfAttention global_layers_list['SelfAttention']=SelfAttention @@ -302,7 +301,7 @@ from LossLayers import LLValuePenalty,LLNotNoiseClassifier,CreateTruthSpectatorWeights, NormaliseTruthIdxs, LLGraphCondOCLoss -from LossLayers import LLLocalClusterCoordinates, LLClusterCoordinates,LLFillSpace, LLOCThresholds +from LossLayers import LLLocalClusterCoordinates,LLFractionRegressor, LLClusterCoordinates,LLFillSpace, LLOCThresholds from LossLayers import LossLayerBase, LLBasicObjectCondensation, LLFullObjectCondensation,LLPFCondensates,LLNeighbourhoodClassifier from LossLayers import LLFullObjectCondensationUncertainty, LLFullObjectCondensationID from LossLayers import LLExtendedObjectCondensation, LLExtendedObjectCondensation2 @@ -329,6 +328,9 @@ global_layers_list['LLPushTracks']=LLPushTracks global_layers_list['LLEnergySums']=LLEnergySums +global_layers_list['LLFractionRegressor']=LLFractionRegressor + + global_layers_list['LLOCThresholds']=LLOCThresholds global_layers_list['LLLocalEnergyConservation']=LLLocalEnergyConservation @@ -377,6 +379,17 @@ import tensorflow.keras.backend as K import tensorflow as tf +class DummyLayer(tf.keras.layers.Layer): + ''' + Just to make sure other layers are not optimised away + Inputs: + - list of tensors. First will be passed through, the other will be ignored + ''' + def call(self, inputs): + return inputs[0] + +global_layers_list['DummyLayer']=DummyLayer + class GroupSortActivation(tf.keras.layers.Layer): @@ -390,6 +403,27 @@ def call(self, inputs): global_layers_list['GroupSortActivation']=GroupSortActivation +class ElementWiseMultiply(tf.keras.layers.Layer): + + def __init__(self, multi_vector : list, **kwargs): + super(ElementWiseMultiply, self).__init__(**kwargs) + self.multi_vector = multi_vector + self.mult = tf.constant(self.multi_vector, dtype='float32') + + def get_config(self): + config = {'multi_vector': self.multi_vector} + base_config = super(ElementWiseMultiply, self).get_config() + return dict(list(base_config.items()) + list(config.items() )) + + def compute_output_shape(self, input_shapes): + return input_shapes + + def call(self, inputs): + return inputs * self.mult + +global_layers_list['ElementWiseMultiply']=ElementWiseMultiply + + def layernorm(x, return_norm=False): x = x - tf.reduce_mean(x,axis=-1, keepdims=True) norm = tf.reduce_sum(x**2, axis=-1,keepdims=True) @@ -790,6 +824,7 @@ def __init__(self, skip_non_finite=5, :param kwargs: For subclass Model """ + raise ValueError("RobustModel deprecated, please use simply tf.keras.Model.") super(RobustModel, self).__init__(*args, **kwargs) # if 'config' in kwargs: @@ -950,39 +985,59 @@ def __init__(self, """ Just forces dictionary output """ - + raise ValueError("DictModel deprecated, please use simply tf.keras.Model.") super(DictModel, self).__init__(inputs,outputs=outputs, *args, **kwargs) - -class RaggedDictModel(tf.keras.Model): - def __init__(self, - inputs, - outputs: dict, #force to be dict - *args, **kwargs): - """ - Just forces dictionary output - """ + +global_layers_list['DictModel']=DictModel + + + +class ReduceHits(Layer): + def __init__(self, mode, **kwargs): + super(ReduceHits, self).__init__(**kwargs) - super(RaggedDictModel, self).__init__(inputs,outputs=outputs, *args, **kwargs) - - def call(self, inputs, *args, **kwargs): - return super(RaggedDictModel, self).call(self.unpack_ragged(inputs), *args, **kwargs) - - def train_step(self, inputs, *args, **kwargs): - return 
super(RaggedDictModel, self).train_step(self.unpack_ragged(inputs), *args, **kwargs) - #super(RaggedDictModel, self).train_step(inputs, *args, **kwargs) - - def unpack_ragged(self, inputs): - output = [] - for i in inputs: - print("Type of i is", type(i)) - print("Hasattr", hasattr(i, "row_splits")) - if type(i) == tf.RaggedTensor: - print("Inside") - output.append((i.values, i.row_splts)) - else: - output.append(i) + assert mode == 'mean' or mode == 'max' or mode == 'sum' or mode == 'min' + self.mode=mode - return output + def call(self, inputs): + assert len(inputs) == 2 + x, rs = inputs + xr = tf.RaggedTensor.from_row_splits(x, rs) + if self.mode == 'mean': + return tf.reduce_mean(xr, axis=1) + elif self.mode == 'max': + return tf.reduce_max(xr, axis=1) + elif self.mode == 'sum': + return tf.reduce_sum(xr, axis=1) + elif self.mode == 'min': + return tf.reduce_min(xr, axis=1) + else: + raise ValueError('Unknown mode %s' % self.mode) -global_layers_list['DictModel']=DictModel + + def get_config(self): + config = {'mode': self.mode} + base_config = super(ReduceHits, self).get_config() + return dict(list(base_config.items()) + list(config.items() )) + +global_layers_list['ReduceHits']=ReduceHits + +class BroadcastMultiply(Layer): + def __init__(self, mode, **kwargs): + ''' + multiplies with broadcasting, e.g. + a = tf.constant([[1,2,3],[4,5,6]]) + b = tf.constant([[1],[2]]) + c = BroadcastMultiply()([a,b]) + + print(c) -> [[1,2,3],[8,10,12]] + ''' + super(BroadcastMultiply, self).__init__(**kwargs) + + def call(self, inputs): + assert len(inputs) == 2 + x, y = inputs + return x*y + +global_layers_list['BroadcastMultiply']=BroadcastMultiply \ No newline at end of file diff --git a/modules/LossLayers.py b/modules/LossLayers.py index d7149dbe..a89f4ffa 100644 --- a/modules/LossLayers.py +++ b/modules/LossLayers.py @@ -271,7 +271,7 @@ def call(self, inputs): if not self.return_lossval: self.add_loss(lossval) - self.add_prompt_metric(lossval, self.name+'_loss') + self.wandb_log({self.name+'_loss': lossval}) if self.return_lossval: return a, lossval else: @@ -331,6 +331,46 @@ def loss(self, inputs): +class LLFractionRegressor(LossLayerBase): + + def __init__(self, + mode : str = "binary", + **kwargs): + ''' + Takes as input: + - score + - truth_fraction + Returns: + - score + ''' + assert mode == "binary" or mode == "regression_bce" or mode == "regression_mse" + + super(LLFractionRegressor, self).__init__(**kwargs) + self.mode = mode + + def get_config(self): + config = {'mode': self.mode} + base_config = super(LLFractionRegressor, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def loss(self, inputs): + assert isinstance(inputs, list) and len(inputs) == 2 + score, truth_fraction = inputs + + b_truth_fraction = tf.where(truth_fraction > 0.5, 1., tf.zeros_like(truth_fraction)) + if self.mode == "binary": + return tf.reduce_mean(tf.keras.losses.binary_crossentropy(b_truth_fraction, score)) + elif self.mode == "regression_bce": + return tf.reduce_mean(tf.keras.losses.binary_crossentropy(truth_fraction, score)) + elif self.mode == "regression_mse": + return tf.reduce_mean(tf.keras.losses.mean_squared_error(truth_fraction, score)) + else: + raise ValueError("Unknown mode: {}".format(self.mode)) + + + + + class LLValuePenalty(LossLayerBase): def __init__(self, @@ -636,7 +676,7 @@ def loss(self, inputs): abs_mean_diff = tf.abs(tf.reduce_mean(reldiff)) - self.add_prompt_metric(abs_mean_diff, self.name + '_loss') + self.wandb_log({self.name + '_loss': abs_mean_diff}) 
return abs_mean_diff #print(reldiff, '\n', tf.reduce_mean(reldiff)) @@ -800,14 +840,11 @@ def loss(self, inputs): efficiency /= ow_sum efficiency = tf.reduce_mean(efficiency) - self.add_prompt_metric(purity, self.name+'_purity') - self.add_prompt_metric(efficiency, self.name+'_efficiency') - + self.wandb_log({self.name+'_purity':purity, + self.name+'_efficiency': efficiency}) loss = self.purity_weight * (1. - purity) + (1.-self.purity_weight)* tf.abs(1. - efficiency) - print(f'purity {purity} efficiency {efficiency} loss {loss}') - return loss @@ -2155,7 +2192,7 @@ def __init__(self, *, energy_loss_weight=1., from object_condensation import Basic_OC_per_sample, PushPull_OC_per_sample, PreCond_OC_per_sample from object_condensation import Hinge_OC_per_sample_damped, Hinge_OC_per_sample, Hinge_Manhatten_OC_per_sample - from object_condensation import Hinge_OC_per_sample_learnable_qmin, Hinge_OC_per_sample_learnable_qmin_betascale_position + from object_condensation import Dead_Zone_Hinge_OC_per_sample,Hinge_OC_per_sample_learnable_qmin, Hinge_OC_per_sample_learnable_qmin_betascale_position impl = Basic_OC_per_sample if implementation == 'pushpull': impl = PushPull_OC_per_sample @@ -2163,6 +2200,8 @@ def __init__(self, *, energy_loss_weight=1., impl = PreCond_OC_per_sample if implementation == 'hinge': impl = Hinge_OC_per_sample + if implementation == 'hinge_deadzone': + impl = Dead_Zone_Hinge_OC_per_sample if implementation == 'hinge_full_grad': # same as`hinge` impl = Hinge_OC_per_sample @@ -2457,13 +2496,6 @@ def loss(self, inputs): timing_loss = self.timing_loss_weight * self.calc_timing_loss(t_time, pred_time, pred_time_unc,t_rec_energy) classification_loss = self.classification_loss_weight * self.calc_classification_loss(t_pid, pred_id, t_is_unique, hasunique) - ##just for time metrics - tdiff = (t_time-pred_time) - tdiff -= tf.reduce_mean(tdiff,keepdims=True) - tstd = tf.math.reduce_std(tdiff) - self.add_prompt_metric(tstd,self.name+'_time_std') - self.add_prompt_metric(tf.reduce_mean(pred_time_unc),self.name+'_time_pred_std') - #end just for metrics # nan_energy = tf.reduce_any(tf.math.is_nan(energy_loss)) # nan_position = tf.reduce_any(tf.math.is_nan(position_loss)) @@ -2534,7 +2566,7 @@ def loss(self, inputs): # div_repulsion = self.div_repulsion, # dynamic_payload_scaling_onset=self.dynamic_payload_scaling_onset # ) - att, rep, noise, min_b, payload, exceed_beta, containment, contamination = self.oc_loss_object( + [att, rep, noise, min_b, payload, exceed_beta], mdict = self.oc_loss_object( beta=pred_beta, x=pred_ccoords, d=pred_distscale, @@ -2545,12 +2577,8 @@ def loss(self, inputs): rs=rowsplits, energies = rechit_energy) - self.add_prompt_metric(att+rep,self.name+'_dynamic_payload_scaling') - - if containment is not None: - self.add_prompt_metric(containment,self.name+'_containment') - self.add_prompt_metric(contamination,self.name+'_contamination') - + #log the OC metrics dict, if any + self.wandb_log(mdict) att *= self.potential_scaling rep *= self.potential_scaling * self.repulsion_scaling @@ -2595,20 +2623,16 @@ def loss(self, inputs): lossval = tf.reduce_mean(lossval)+bpush - self.add_prompt_metric(att,self.name+'_attractive_loss') - self.add_prompt_metric(rep,self.name+'_repulsive_loss') - self.add_prompt_metric(min_b,self.name+'_min_beta_loss') - self.add_prompt_metric(noise,self.name+'_noise_loss') - self.add_prompt_metric(energy_loss,self.name+'_energy_loss') - self.add_prompt_metric(energy_unc_loss,self.name+'_energy_unc_loss') - 
self.add_prompt_metric(pos_loss,self.name+'_position_loss') - self.add_prompt_metric(time_loss,self.name+'_time_loss') - self.add_prompt_metric(class_loss,self.name+'_class_loss') - self.add_prompt_metric(exceed_beta,self.name+'_exceed_beta_loss') - self.add_prompt_metric(bpush,self.name+'_beta_push_loss') - self.add_prompt_metric(ccdamp,self.name+'_cc_damp_loss') - - self.add_prompt_metric(tf.reduce_mean(pred_distscale),self.name+'_avg_dist') + self.wandb_log({self.name+'_attractive_loss' : att, + self.name+'_repulsive_loss': rep, + self.name+'_min_beta_loss': min_b, + self.name+'_noise_loss': noise, + self.name+'_energy_loss': energy_loss, + self.name+'_energy_unc_loss': energy_unc_loss, + self.name+'_position_loss': pos_loss, + self.name+'_time_loss': time_loss, + self.name+'_class_loss': class_loss + }) self.maybe_print_loss(lossval) @@ -2803,7 +2827,7 @@ def calc_energy_correction_factor_loss(self, t_dep_energies = tf.where(t_dep_energies / t_energy < 0.5, 0.5 * t_energy, t_dep_energies) epred = pred_energy * t_dep_energies - sigma = pred_uncertainty_high * t_dep_energies + 1.0 + sigma = tf.abs(pred_uncertainty_high * t_dep_energies) + 1.0 #abs is a safety measure # Uncertainty 'sigma' must minimize this term: # ln(2*pi*sigma^2) + (E_true - E-pred)^2/sigma^2 @@ -2814,9 +2838,9 @@ def calc_energy_correction_factor_loss(self, uncertainty_loss = tf.math.log(sigma**2) - matching_loss = tf.debugging.check_numerics(matching_loss, "matching_loss") - prediction_loss = tf.debugging.check_numerics(prediction_loss, "matching_loss") - uncertainty_loss = tf.debugging.check_numerics(uncertainty_loss, "matching_loss") + prediction_loss = tf.debugging.check_numerics(prediction_loss, "E: prediction_loss") + uncertainty_loss = tf.debugging.check_numerics(uncertainty_loss, "E: uncertainty_loss") + matching_loss = tf.debugging.check_numerics(matching_loss, "E: matching_loss") prediction_loss = tf.clip_by_value(prediction_loss, 0, 10) uncertainty_loss = tf.clip_by_value(uncertainty_loss, 0, 10) @@ -3283,11 +3307,12 @@ def loss(self, inputs): eloss, euncloss = e_andunc_loss[...,0:1], e_andunc_loss[...,1:2] euncloss /= 2. 
#same as above - self.add_prompt_metric(tf.reduce_mean(eweight * closs), self.name + '_class_loss') - self.add_prompt_metric(tf.reduce_mean(eweight * tloss), self.name + '_time_loss') - self.add_prompt_metric(tf.reduce_mean(eweight * ploss), self.name + '_pos_loss') - self.add_prompt_metric(tf.reduce_mean(eweight * eloss), self.name + '_momentum_loss') - self.add_prompt_metric(tf.reduce_mean(eweight * euncloss), self.name + '_momentum_unc_loss') + self.wandb_log({self.name + '_class_loss': tf.reduce_mean(eweight * closs), + self.name + '_time_loss': tf.reduce_mean(eweight * tloss), + self.name + '_pos_loss': tf.reduce_mean(eweight * ploss), + self.name + '_momentum_loss': tf.reduce_mean(eweight * eloss), + self.name + '_momentum_unc_loss': tf.reduce_mean(eweight * euncloss) + }) #this is as if there weren't any modifications pfc_t_energy = tf.gather_nd(t_energy, pf_idx).values @@ -3305,8 +3330,8 @@ def loss(self, inputs): offset = tf.reduce_mean( s_pfc_mom_corr - s_pfc_t_encorr ) var = tf.math.reduce_std(s_pfc_mom_corr - s_pfc_t_encorr - offset) - self.add_prompt_metric(offset, self.name + '_binned_en_offset_'+namestr) - self.add_prompt_metric(var, self.name + '_binned_en_std_'+namestr) + #self.add_prompt_metric(offset, self.name + '_binned_en_offset_'+namestr) + #self.add_prompt_metric(var, self.name + '_binned_en_std_'+namestr) s_pfc_energy = s_pfc_mom_corr * tf.ragged.boolean_mask(pfc_ensum, sel) s_pfc_t_energy = tf.ragged.boolean_mask(pfc_t_energy, sel) @@ -3314,8 +3339,8 @@ def loss(self, inputs): offset = tf.reduce_mean( (s_pfc_energy - s_pfc_t_energy)/s_pfc_t_energy ) var = tf.math.reduce_std((s_pfc_energy - s_pfc_t_energy)/s_pfc_t_energy - offset) - self.add_prompt_metric(offset, self.name + '_binned_t_en_offset_'+namestr) - self.add_prompt_metric(var, self.name + '_binned_t_en_std_'+namestr) + #self.add_prompt_metric(offset, self.name + '_binned_t_en_offset_'+namestr) + #self.add_prompt_metric(var, self.name + '_binned_t_en_std_'+namestr) allloss = eweight * (closs + tloss + ploss + eloss + euncloss) diff --git a/modules/Regularizers.py b/modules/Regularizers.py index ed0c6c1b..2c5aa1b1 100644 --- a/modules/Regularizers.py +++ b/modules/Regularizers.py @@ -94,8 +94,7 @@ def call(self, inputs): float(avneigh), 'penalty',float(loss)) - #self.add_prompt_metric(avdist, self.name+'_dist') - #self.add_prompt_metric(avneigh, self.name+'_Nneigh') + self.wandb_log({self.name+'_dist': avdist, self.name+'_Nneigh': avneigh}) self.add_loss(loss) return inputs @@ -145,9 +144,9 @@ def half_sided_penalty(x): if self.printout: print(self.name,'meanmax dist loss',float(loss)) - self.add_prompt_metric(meanmax, self.name+'_meanmax') - self.add_prompt_metric(maxmax, self.name+'_maxmax') - self.add_prompt_metric(loss, self.name+'_loss') + self.wandb_log({self.name+'_meanmax' : meanmax, + self.name+'_maxmax': maxmax, + self.name+'_loss': loss}) self.add_loss(loss) return inputs diff --git a/modules/baseModules.py b/modules/baseModules.py index 424467c6..30a08505 100644 --- a/modules/baseModules.py +++ b/modules/baseModules.py @@ -1,56 +1,2 @@ -print(__name__,">>> move to DeepJetCore soon") - -import tensorflow as tf - -#for metric layers -class PromptMetric(tf.keras.metrics.Mean): - def __init__(self, **kwargs): - super(PromptMetric, self).__init__(**kwargs) - - def update_state(self,*args,**kwargs): - self.reset_states()#reset and only take last state - super(PromptMetric, self).update_state(*args,**kwargs) - -class LayerWithMetrics(tf.keras.layers.Layer): - def __init__(self, - record_metrics=False, - 
_promptnames=None, **kwargs): - super(LayerWithMetrics, self).__init__(**kwargs) - self.prompt_metrics = {} - self.record_metrics = record_metrics - if _promptnames is not None: - for n in _promptnames: - if not n in self.prompt_metrics.keys(): - with tf.name_scope(self.name+"/sub/"+n): - self.prompt_metrics[n]=PromptMetric(name=n) - - def get_config(self): - config = {'_promptnames': [m[1].name for m in self.prompt_metrics.items()], - 'record_metrics': self.record_metrics} - base_config = super(LayerWithMetrics, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - def add_prompt_metric(self,x,name): - if not self.record_metrics: - return - if not name in self.prompt_metrics.keys(): - with tf.name_scope(self.name+"/sub/"+name): - self.prompt_metrics[name]=PromptMetric(name=name) - self.add_metric(self.prompt_metrics[name](x)) - - -#class LayerWithMetrics(tf.keras.layers.Layer): -# def __init__(self, _promptnames=None, **kwargs): -# super(LayerWithMetrics, self).__init__(**kwargs) -# self.prompt_metrics={} -# -# def register_prompt_metric(self,name): -# if name in self.prompt_metrics.keys(): -# return -# with tf.name_scope(self.name+"/sub/"+name): -# self.prompt_metrics[name]=PromptMetric(name=name) -# -# def add_prompt_metric(self,x,name): -# if not name in self.prompt_metrics.keys(): -# raise ValueError("Metric with name "+str(name)+" not registred. Register in layers or model constructor.") -# self.add_metric(self.prompt_metrics[name](x)) +print(__name__,"deprecated, import directly from DeepJetCore.DJCLayers") +from DeepJetCore.DJCLayers import LayerWithMetrics \ No newline at end of file diff --git a/modules/bin_by_coordinates_op.py b/modules/bin_by_coordinates_op.py index 669e636a..68974375 100644 --- a/modules/bin_by_coordinates_op.py +++ b/modules/bin_by_coordinates_op.py @@ -4,13 +4,6 @@ _bin_by_coordinates = tf.load_op_library('bin_by_coordinates.so') -''' - .Input("coordinates: float") - .Input("row_splits: int32") - .Input("bin_width: float") - .Input("nbins: int32")//same in all dimensions - .Output("output: int32"); -''' def BinByCoordinates(coordinates, row_splits, bin_width=None, n_bins=None, calc_n_per_bin=True, pre_normalized=False, name=""): ''' @@ -39,7 +32,7 @@ def BinByCoordinates(coordinates, row_splits, bin_width=None, n_bins=None, calc_ - bin indices (the above) flattened - number of bins used per dimension (dim = dim(coordinates)) - bin width used (dim = 1) - - (opt) number of points per bin (dim = 1) + - (opt) number of points per bin (dim = 1) - this likely reduces the speed of the OP ''' diff --git a/modules/binned_select_knn_op.py b/modules/binned_select_knn_op.py index 0b6949b4..9fb00bb4 100644 --- a/modules/binned_select_knn_op.py +++ b/modules/binned_select_knn_op.py @@ -4,6 +4,9 @@ import globals as gl from oc_helper_ops import SelectWithDefault +from bin_by_coordinates_op import BinByCoordinates +from index_replacer_op import IndexReplacer + _binned_select_knn = tf.load_op_library('binned_select_knn.so') def _BinnedSelectKnn(K : int, coords, bin_idx, dim_bin_idx, bin_boundaries, n_bins, bin_width , tf_compatible=False, @@ -29,8 +32,8 @@ def _BinnedSelectKnn(K : int, coords, bin_idx, dim_bin_idx, bin_boundaries, n_b use_direction = use_direction ) - -def BinnedSelectKnn(K : int, coords, row_splits, direction = None, n_bins=None, max_bin_dims=3, tf_compatible=False, max_radius=None, name=""): +#@tf.function +def BinnedSelectKnn(K : int, coords, row_splits, direction = None, n_bins=None, max_bin_dims : int =3 , 
tf_compatible=False, max_radius=None, name=""): ''' max_radius is a dummy for now to make it a drop-in replacement @@ -43,21 +46,20 @@ def BinnedSelectKnn(K : int, coords, row_splits, direction = None, n_bins=None, - any other number: can be neighbour and have neighbours ''' - from bin_by_coordinates_op import BinByCoordinates - from index_replacer_op import IndexReplacer # the following number of bins seems a good~ish estimate for good performance # for homogenous point distributions but should be subject to more tests - elems_per_rs = 1 - if row_splits.shape[0] is not None: - elems_per_rs = row_splits[1] - #do checks - tf.assert_equal(row_splits[-1],coords.shape[0]) + elems_per_rs = tf.reduce_max(row_splits) / tf.shape(row_splits)[0] + elems_per_rs = tf.cast(elems_per_rs, dtype='int32')+1 + #if row_splits.shape[0] is not None: + # elems_per_rs = row_splits[1] + # #do checks + # tf.assert_equal(row_splits[-1],coords.shape[0]) - max_bin_dims = min([max_bin_dims, coords.shape[1]]) + max_bin_dims = tf.reduce_min([max_bin_dims, tf.shape(coords)[1]]) if n_bins is None: - n_bins = tf.math.pow(tf.cast(elems_per_rs,dtype='float32')/(K/32),1/max_bin_dims) + n_bins = tf.math.pow(tf.cast(elems_per_rs,dtype='float32')/(K/32), 1./tf.cast(max_bin_dims,dtype='float32' )) n_bins = tf.cast(n_bins,dtype='int32') n_bins = tf.where(n_bins<5,5,n_bins) n_bins = tf.where(n_bins>30,30,n_bins)#just a guess @@ -82,16 +84,19 @@ def BinnedSelectKnn(K : int, coords, row_splits, direction = None, n_bins=None, bin_boundaries = tf.concat([tf.zeros([1],dtype='int32'), nper],axis=0) #row_splits[0:1] # make it row split like bin_boundaries = tf.cumsum(bin_boundaries) + + #sanity check + tf.assert_equal(tf.reduce_max(bin_boundaries), tf.reduce_max(row_splits)) idx,dist = _BinnedSelectKnn(K, scoords, sbinning, sdbinning, bin_boundaries=bin_boundaries, n_bins=nb, bin_width=bin_width, tf_compatible=tf_compatible, direction = direction ) - if row_splits.shape[0] is None: - return idx, dist + #if row_splits.shape[0] is None: + # return idx, dist #sort back idx = IndexReplacer(idx,sorting) - dist = tf.scatter_nd(sorting[...,tf.newaxis], dist, dist.shape) - idx = tf.scatter_nd(sorting[...,tf.newaxis], idx, idx.shape) + dist = tf.scatter_nd(sorting[...,tf.newaxis], dist, tf.shape(dist)) + idx = tf.scatter_nd(sorting[...,tf.newaxis], idx, tf.shape(idx)) dist = tf.where(idx<0, 0., dist)#safety if not gl.knn_ops_use_tf_gradients: diff --git a/modules/callbacks.py b/modules/callbacks.py index 761d91f2..fc68a8f8 100644 --- a/modules/callbacks.py +++ b/modules/callbacks.py @@ -15,7 +15,7 @@ from plotting_tools import publish, shuffle_truth_colors from DebugLayers import _DebugPlotBase from DeepJetCore import TrainData -from DeepJetCore.dataPipeline import TrainDataGenerator +from djcdata.dataPipeline import TrainDataGenerator diff --git a/modules/compiled/bin_by_coordinates_kernel.cu.cc b/modules/compiled/bin_by_coordinates_kernel.cu.cc index 7eb4e117..75339f1b 100644 --- a/modules/compiled/bin_by_coordinates_kernel.cu.cc +++ b/modules/compiled/bin_by_coordinates_kernel.cu.cc @@ -98,7 +98,7 @@ static void calc( idx += rsidx * mul; if(idx>=n_total_bins){ - printf("global index larger than total bins\n");//DEBUG if you see this you're screwed + printf("\nERROR: BinByCoordinatesOpFunctor: global index larger than total bins\n");//DEBUG if you see this you're screwed return; } @@ -161,6 +161,8 @@ struct BinByCoordinatesOpFunctor { //just because access needs n_total_bins, calc_n_per_bin); + cudaDeviceSynchronize(); + } }; diff --git 
a/modules/compiled/binned_select_knn_kernel.cu.cc b/modules/compiled/binned_select_knn_kernel.cu.cc index 9db98826..0276875f 100644 --- a/modules/compiled/binned_select_knn_kernel.cu.cc +++ b/modules/compiled/binned_select_knn_kernel.cu.cc @@ -149,7 +149,7 @@ static void select_knn_kernel( int idx = stepper.step(); if(idx<0){//not valid if(!continue_search && !distance){//this should not happen - printf("stopping search for vtx %d at distance %d\n",i_v,distance); + printf("\nERROR: binned_select_knn.cu: stopping search for vtx %d at distance %d\n",i_v,distance); } break; @@ -158,13 +158,23 @@ static void select_knn_kernel( idx+=gbin_offset; if(idx>=n_bboundaries-1){ - printf("idx %d out of range, gb offset %d, distance %d, sb_flat_offset %d, nbb %d\n", idx, gbin_offset, distance, sb_flat_offset,n_bboundaries); + printf("\nERROR: binned_select_knn.cu: boundary issue: idx %d out of range, gb offset %d, distance %d, sb_flat_offset %d, nbb %d\n", idx, gbin_offset, distance, sb_flat_offset,n_bboundaries); continue; } int start_vertex = d_bin_boundaries[idx]; int end_vertex = d_bin_boundaries[idx+1]; + if(start_vertex == end_vertex){ //empty bin + continue_search=true; //correct? + continue; + } + + if(start_vertex>=n_vert || end_vertex>n_vert){ + printf("\nERROR: binned_select_knn.cu: start_vertex %d or end_vertex %d out of range %d \n", start_vertex, end_vertex, n_vert); + continue;//safe guard + } + for(size_t j_v=start_vertex;j_v { //just because access needs select_knn_kernel<5><<>>(d_coord, d_bin_idx,d_direction,d_dim_bin_idx,d_bin_boundaries,d_n_bins,d_bin_width, d_indices,d_dist,n_vert,n_neigh,n_coords,n_bin_dim,n_bboundaries,use_direction); + cudaDeviceSynchronize(); } }; diff --git a/modules/compiled/compare_knn_outputs_kernel.cu.cc b/modules/compiled/compare_knn_outputs_kernel.cu.cc index cc71489e..16b1d28e 100644 --- a/modules/compiled/compare_knn_outputs_kernel.cu.cc +++ b/modules/compiled/compare_knn_outputs_kernel.cu.cc @@ -52,6 +52,8 @@ struct CompareKnnOpFunctor{ int thread_per_block = 20; gpu::CompareKnnOpCudaKernel<<>>(nvertices,nneighbours,input1,input2,output); + + cudaDeviceSynchronize(); } }; diff --git a/modules/compiled/compare_knn_outputs_module.cc b/modules/compiled/compare_knn_outputs_module.cc index de4c2cc4..0f07d504 100644 --- a/modules/compiled/compare_knn_outputs_module.cc +++ b/modules/compiled/compare_knn_outputs_module.cc @@ -9,8 +9,4 @@ REGISTER_OP("CompareKnnOutputs") .Input("in_tensor1: int32") .Input("in_tensor2: int32") .Output("out_tensor: int32") - .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { - c->set_output(0, c->input(0)); // requires that output with idx 0 - return Status::OK(); // should have the same shape as - }) // input with idx 0. 
; diff --git a/modules/compiled/index_replacer_kernel.cc b/modules/compiled/index_replacer_kernel.cc index 82bcd2fc..027e5e21 100644 --- a/modules/compiled/index_replacer_kernel.cc +++ b/modules/compiled/index_replacer_kernel.cc @@ -14,13 +14,13 @@ typedef Eigen::GpuDevice GPUDevice; namespace functor { -template -struct IndexReplacerOpFunctor { //just because access needs to be different for GPU and CPU +template +struct IndexReplacerOpFunctor { //just because access needs to be different for GPU and CPU void operator()( const CPUDevice &d, - const int * to_be_replaced, - const int * replacements, - int * replaced, + const dtype * to_be_replaced, + const dtype * replacements, + dtype * replaced, const int n_to_be_replaced, const int n_replacements @@ -63,12 +63,12 @@ class IndexReplacerOp : public OpKernel { OP_REQUIRES_OK(context, context->allocate_output(0, t_to_be_replaced.shape(), &out));//same shape - IndexReplacerOpFunctor() ( + IndexReplacerOpFunctor() ( context->eigen_device(), - t_to_be_replaced.flat().data(), - t_replacements.flat().data(), - out->flat().data(), + t_to_be_replaced.flat().data(), + t_replacements.flat().data(), + out->flat().data(), n_to_be_replaced, n_replacements ); @@ -82,7 +82,7 @@ class IndexReplacerOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("IndexReplacer").Device(DEVICE_CPU), IndexReplacerOp); #ifdef GOOGLE_CUDA -extern template struct IndexReplacerOpFunctor; +extern template struct IndexReplacerOpFunctor; REGISTER_KERNEL_BUILDER(Name("IndexReplacer").Device(DEVICE_GPU), IndexReplacerOp); #endif diff --git a/modules/compiled/index_replacer_kernel.cu.cc b/modules/compiled/index_replacer_kernel.cu.cc index 4ebc7593..5ae1afdb 100644 --- a/modules/compiled/index_replacer_kernel.cu.cc +++ b/modules/compiled/index_replacer_kernel.cu.cc @@ -17,9 +17,9 @@ typedef Eigen::GpuDevice GPUDevice; __global__ static void calc( - const int * to_be_replaced, - const int * replacements, - int * replaced, + const int32 * to_be_replaced, + const int32 * replacements, + int32 * replaced, const int n_to_be_replaced, const int n_replacements){ @@ -34,7 +34,8 @@ static void calc( return; } if(ridx>=n_replacements){ - printf("IndexReplacerOpFunctor: index out of range\n"); + replaced[i] = to_be_replaced[i]; //security measure but already screwed here + printf("IndexReplacerOpFunctor: Fatal error: index out of range %d of %d at %d of %d\n", ridx, n_replacements, i, n_to_be_replaced); return; } replaced[i] = replacements[ridx]; @@ -43,26 +44,28 @@ static void calc( -template -struct IndexReplacerOpFunctor { //just because access needs to be different for GPU and CPU +template +struct IndexReplacerOpFunctor { //just because access needs to be different for GPU and CPU void operator()( const GPUDevice &d, - const int * to_be_replaced, - const int * replacements, - int * replaced, + const dtype * to_be_replaced, + const dtype * replacements, + dtype * replaced, - const int n_to_be_replaced, - const int n_replacements + const dtype n_to_be_replaced, + const dtype n_replacements ){ - grid_and_block gb(n_to_be_replaced,1024); + grid_and_block gb(n_to_be_replaced,512); calc<<>>(to_be_replaced,replacements,replaced,n_to_be_replaced,n_replacements); + + cudaDeviceSynchronize(); } }; -template struct IndexReplacerOpFunctor; +template struct IndexReplacerOpFunctor; }//functor }//tensorflow diff --git a/modules/compiled/oc_helper_m_indices_kernel.cu.cc b/modules/compiled/oc_helper_m_indices_kernel.cu.cc index 5f0d1dec..eb303f6e 100644 --- 
a/modules/compiled/oc_helper_m_indices_kernel.cu.cc +++ b/modules/compiled/oc_helper_m_indices_kernel.cu.cc @@ -113,6 +113,8 @@ struct MIndicesOpFunctor { calc_m_not ); + cudaDeviceSynchronize(); + } }; diff --git a/modules/compiled/rs_offset_adder_kernel.cc b/modules/compiled/rs_offset_adder_kernel.cc new file mode 100644 index 00000000..5cb67add --- /dev/null +++ b/modules/compiled/rs_offset_adder_kernel.cc @@ -0,0 +1,98 @@ + + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/cc/ops/math_ops.h" +#include "helpers.h" +#include "rs_offset_adder_kernel.h" + +namespace tensorflow { +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +namespace functor { + + +template +struct RSOffsetAdderOpFunctor { //just because access needs to be different for GPU and CPU + void operator()( + const CPUDevice &d, + const int * t_dx, + const int * rs, + int * new_t_idx, + + const int n_vert, + const int n_rs + ){ + for(int i = 0; i < n_vert; i++){ + + //get the offset + int offset = 0; + for(int j = 0; j < n_rs; j++){ + if(i >= rs[j]){ + offset = rs[j]; + } + else{ + break; + } + } + if(t_dx[i] >= 0) + new_t_idx[i] = t_dx[i] + offset; + else + new_t_idx[i] = t_dx[i]; + } + } +}; + + + +template +class RSOffsetAdderOp : public OpKernel { +public: + explicit RSOffsetAdderOp(OpKernelConstruction *context) : OpKernel(context) { + + } + + + void Compute(OpKernelContext *context) override { + + const Tensor &t_t_idx = context->input(0); + const Tensor &t_rs = context->input(1); + + const int n_vert = t_t_idx.dim_size(0); + const int n_rs = t_rs.dim_size(0); + + Tensor *out = NULL; + OP_REQUIRES_OK(context, context->allocate_output(0, + t_t_idx.shape(), &out)); + + + + RSOffsetAdderOpFunctor() ( + context->eigen_device(), + + t_t_idx.flat().data(), + t_rs.flat().data(), + out->flat().data(), + n_vert, + n_rs + ); + + + + } + +}; + +REGISTER_KERNEL_BUILDER(Name("RSOffsetAdder").Device(DEVICE_CPU), RSOffsetAdderOp); + +#ifdef GOOGLE_CUDA +extern template struct RSOffsetAdderOpFunctor; +REGISTER_KERNEL_BUILDER(Name("RSOffsetAdder").Device(DEVICE_GPU), RSOffsetAdderOp); +#endif + +}//functor +}//tensorflow diff --git a/modules/compiled/rs_offset_adder_kernel.cu.cc b/modules/compiled/rs_offset_adder_kernel.cu.cc new file mode 100644 index 00000000..8bbc73a4 --- /dev/null +++ b/modules/compiled/rs_offset_adder_kernel.cu.cc @@ -0,0 +1,80 @@ + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU + +#include +#include +#include +#include "cuda_helpers.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +#include "rs_offset_adder_kernel.h" + +namespace tensorflow { +namespace functor { + +typedef Eigen::GpuDevice GPUDevice; + +__global__ +static void calc( + const int * t_dx, + const int * rs, + int * new_t_idx, + + const int n_vert, + const int n_rs){ + + int i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= n_vert) + return; + int offset = 0; + for (int j = 0; j < n_rs; j++){ + if (i >= rs[j]){ + offset = rs[j]; + } + else{ + break; + } + } + int current = t_dx[i]; + if (t_dx[i] >= 0) + new_t_idx[i] = current + offset; + else + new_t_idx[i] = current; +} + + + +template +struct RSOffsetAdderOpFunctor { //just because access needs to be different for GPU and CPU + void operator()( + const GPUDevice &d, + const int * t_dx, + const int * rs, + int * new_t_idx, + + const int n_vert, + const int n_rs + ){ + + + grid_and_block gb(n_vert,512); + + calc<<>>( + t_dx, + rs, + new_t_idx, + n_vert, + n_rs + ); + 
cudaDeviceSynchronize(); + } +}; + +template struct RSOffsetAdderOpFunctor; + +}//functor +}//tensorflow + +#endif //GOOGLE_CUDA + diff --git a/modules/compiled/rs_offset_adder_kernel.h b/modules/compiled/rs_offset_adder_kernel.h new file mode 100644 index 00000000..cc828e00 --- /dev/null +++ b/modules/compiled/rs_offset_adder_kernel.h @@ -0,0 +1,26 @@ + +#ifndef RS_OFFSET_ADDER_KERNEL_H +#define RS_OFFSET_ADDER_KERNEL_H + +namespace tensorflow { +namespace functor { + +template +struct RSOffsetAdderOpFunctor { + void operator()( + const Device &d, + const int * t_dx, + const int * rs, + int * new_t_idx, + + const int n_vert, + const int n_rs + ); + +}; + +} // namespace functor +} // namespace tensorflow + + +#endif //RS_OFFSET_ADDER_KERNEL_H diff --git a/modules/compiled/rs_offset_adder_module.cc b/modules/compiled/rs_offset_adder_module.cc new file mode 100644 index 00000000..4d362db1 --- /dev/null +++ b/modules/compiled/rs_offset_adder_module.cc @@ -0,0 +1,10 @@ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +using namespace tensorflow; + +REGISTER_OP("RSOffsetAdder") + .Input("idx: int32") + .Input("row_splits: int32") + .Output("output: int32"); diff --git a/modules/compiled/select_mod_knn_grad_kernel.cc b/modules/compiled/select_mod_knn_grad_kernel.cc index e6b4378c..99c24898 100644 --- a/modules/compiled/select_mod_knn_grad_kernel.cc +++ b/modules/compiled/select_mod_knn_grad_kernel.cc @@ -47,7 +47,6 @@ class SelectModKnnGradOp : public OpKernel { public: explicit SelectModKnnGradOp(OpKernelConstruction *context) : OpKernel(context) { - } void Compute(OpKernelContext *context) override { @@ -64,13 +63,12 @@ class SelectModKnnGradOp : public OpKernel { int n_coords = t_coord.dim_size(1); int n_neigh = t_distances.dim_size(1); - auto coorddimsok = t_coord_mod.dims() == 3 - && t_coord_mod.dim_size(0) == n_vert - && t_coord_mod.dim_size(1) == n_coords - && t_coord_mod.dim_size(2) == n_coords - ? tensorflow::Status(): tensorflow::Status(tensorflow::error::INVALID_ARGUMENT, - "Coordinate modifier tensor needs to have 3 dimensions (V x C x C)"); - OP_REQUIRES_OK(context,coorddimsok); + //auto coorddimsok = t_coord_mod.dims() == 3 + // && t_coord_mod.dim_size(0) == n_vert + // && t_coord_mod.dim_size(1) == n_coords + // && t_coord_mod.dim_size(2) == n_coords + // ? OkStatus() : errors::FailedPrecondition("Coordinate modifier tensor needs to have 3 dimensions (V x C x C)"); + //OP_REQUIRES_OK(context,coorddimsok); TensorShape outputShape; diff --git a/modules/compiled/select_mod_knn_grad_kernel.cu.cc b/modules/compiled/select_mod_knn_grad_kernel.cu.cc index d9b44419..a447f57b 100644 --- a/modules/compiled/select_mod_knn_grad_kernel.cu.cc +++ b/modules/compiled/select_mod_knn_grad_kernel.cu.cc @@ -247,7 +247,7 @@ struct SelectModKnnGradOpFunctor { n_neigh, n_coords); - + cudaDeviceSynchronize(); } }; diff --git a/modules/compiled/select_mod_knn_kernel.cc b/modules/compiled/select_mod_knn_kernel.cc index 447ccf07..20a578c0 100644 --- a/modules/compiled/select_mod_knn_kernel.cc +++ b/modules/compiled/select_mod_knn_kernel.cc @@ -282,13 +282,12 @@ class SelectModKnnOp : public OpKernel { int n_coords = d_coord_tensor.dim_size(1); int n_rs = d_rs_tensor.dim_size(0); - auto coorddimsok = d_coord_mod_tensor.dims() == 3 - && d_coord_mod_tensor.dim_size(0) == n_vert - && d_coord_mod_tensor.dim_size(1) == n_coords - && d_coord_mod_tensor.dim_size(2) == n_coords - ? 
tensorflow::Status(): tensorflow::Status(tensorflow::error::INVALID_ARGUMENT, - "Coordinate modifier tensor needs to have 3 dimensions (V x C x C)"); - OP_REQUIRES_OK(context,coorddimsok); + //auto coorddimsok = d_coord_mod_tensor.dims() == 3 + // && d_coord_mod_tensor.dim_size(0) == n_vert + // && d_coord_mod_tensor.dim_size(1) == n_coords + // && d_coord_mod_tensor.dim_size(2) == n_coords + // ? OkStatus() : errors::FailedPrecondition("Coordinate modifier tensor needs to have 3 dimensions (V x C x C)"); + //OP_REQUIRES_OK(context,coorddimsok); TensorShape outputShape; outputShape.AddDim(n_vert); diff --git a/modules/compiled/select_threshold_kernel.cu.cc b/modules/compiled/select_threshold_kernel.cu.cc index 077aca01..3643501e 100644 --- a/modules/compiled/select_threshold_kernel.cu.cc +++ b/modules/compiled/select_threshold_kernel.cu.cc @@ -197,7 +197,8 @@ struct CopyOutputSelectThresholdOpFunctor { grid_and_block gb(n_scatter_idxs, 512); copy_all_kernel<<>>(d_scatter_idxs,d_tmp_scatter_idxs,n_scatter_idxs); - + + cudaDeviceSynchronize(); } }; diff --git a/modules/compiled/tests/test_binned_select_knn.py b/modules/compiled/tests/test_binned_select_knn.py index e8cbc7ff..46e49a85 100644 --- a/modules/compiled/tests/test_binned_select_knn.py +++ b/modules/compiled/tests/test_binned_select_knn.py @@ -38,11 +38,17 @@ def createData(nvert,ncoords): nbins = None idx, dist = BinnedSelectKnn(test_neighbours, coords, row_splits, n_bins=nbins) + + + start = time.time() for _ in range(5): idx, dist = BinnedSelectKnn(test_neighbours, coords, row_splits, n_bins=nbins) end = time.time() print('took',(end-start)/5.,'s','for',nvert,'points on',nbins,'and nn',test_neighbours ) + + #exit() + continue idx, dist = SlicingKnn(test_neighbours, coords, row_splits, features_to_bin_on=[0,1], bin_width=(0.02,0.02)) start = time.time() @@ -51,6 +57,7 @@ def createData(nvert,ncoords): end = time.time() print('took',(end-start)/5.,'s','for',nvert,'and nn',test_neighbours,'points on SlicingKnn') + #exit() continue idx, dist = SelectKnn(test_neighbours, coords, row_splits) start = time.time() diff --git a/modules/datastructures/TrainData_NanoML.py b/modules/datastructures/TrainData_NanoML.py index 229d2c29..ad68a7fc 100644 --- a/modules/datastructures/TrainData_NanoML.py +++ b/modules/datastructures/TrainData_NanoML.py @@ -1,4 +1,5 @@ -from DeepJetCore.TrainData import TrainData, fileTimeOut +from DeepJetCore import TrainData +from djcdata.TrainData import fileTimeOut from DeepJetCore import SimpleArray import uproot3 as uproot import awkward as ak1 diff --git a/modules/datastructures/TrainData_PrepoolingNanoML.py b/modules/datastructures/TrainData_PrepoolingNanoML.py index 82173be2..b6a2a031 100644 --- a/modules/datastructures/TrainData_PrepoolingNanoML.py +++ b/modules/datastructures/TrainData_PrepoolingNanoML.py @@ -1,4 +1,4 @@ -from DeepJetCore.TrainData import TrainData +from DeepJetCore import TrainData from DeepJetCore import SimpleArray import pickle import numpy as np @@ -7,7 +7,7 @@ import gzip from datastructures.TrainData_NanoML import TrainData_NanoML -from DeepJetCore.dataPipeline import TrainDataGenerator +from djcdata.dataPipeline import TrainDataGenerator from DebugLayers import switch_off_debug_plots diff --git a/modules/datastructures/TrainData_PreselectionNanoML.py b/modules/datastructures/TrainData_PreselectionNanoML.py index bd508374..5369ac87 100644 --- a/modules/datastructures/TrainData_PreselectionNanoML.py +++ b/modules/datastructures/TrainData_PreselectionNanoML.py @@ -1,4 +1,4 @@ -from 
DeepJetCore.TrainData import TrainData +from DeepJetCore import TrainData from DeepJetCore import SimpleArray import pickle import numpy as np @@ -7,7 +7,7 @@ import gzip from datastructures.TrainData_NanoML import TrainData_NanoML -from DeepJetCore.dataPipeline import TrainDataGenerator +from djcdata.dataPipeline import TrainDataGenerator from DebugLayers import switch_off_debug_plots diff --git a/modules/datastructures/TrainData_TrackML.py b/modules/datastructures/TrainData_TrackML.py index 0eb52ccd..4792de28 100644 --- a/modules/datastructures/TrainData_TrackML.py +++ b/modules/datastructures/TrainData_TrackML.py @@ -1,4 +1,4 @@ -from DeepJetCore.TrainData import TrainData, fileTimeOut +from DeepJetCore import TrainData from DeepJetCore import SimpleArray import awkward0 as ak import pickle diff --git a/modules/datastructures/TrainData_crilin.py b/modules/datastructures/TrainData_crilin.py index d212099b..75e1e7b1 100644 --- a/modules/datastructures/TrainData_crilin.py +++ b/modules/datastructures/TrainData_crilin.py @@ -2,7 +2,7 @@ -from DeepJetCore.TrainData import TrainData, fileTimeOut +from DeepJetCore import TrainData from DeepJetCore import SimpleArray import numpy as np import uproot3 as uproot @@ -34,7 +34,7 @@ def branchToFlatArray(self, b, return_row_splits=False, dtype='float32'): def convertFromSourceFile(self, filename, weighterobjects, istraining, treename="converted_photons"): - fileTimeOut(filename, 10)#wait 10 seconds for file in case there are hiccups + #fileTimeOut(filename, 10)#wait 10 seconds for file in case there are hiccups tree = uproot.open(filename)[treename] ''' diff --git a/modules/datastructures/TrainData_fcc.py b/modules/datastructures/TrainData_fcc.py index 9cf0fdb0..9235ab15 100644 --- a/modules/datastructures/TrainData_fcc.py +++ b/modules/datastructures/TrainData_fcc.py @@ -1,8 +1,9 @@ -from DeepJetCore.TrainData import TrainData, fileTimeOut -from DeepJetCore import SimpleArray +from djcdata import TrainData +from djcdata.TrainData import fileTimeOut +from djcdata import SimpleArray import numpy as np import uproot3 as uproot import awkward as ak1 diff --git a/modules/datastructures/TrainData_ild.py b/modules/datastructures/TrainData_ild.py index 0480dca0..ff6981b9 100644 --- a/modules/datastructures/TrainData_ild.py +++ b/modules/datastructures/TrainData_ild.py @@ -2,7 +2,7 @@ -from DeepJetCore.TrainData import TrainData, fileTimeOut +from DeepJetCore import TrainData from DeepJetCore import SimpleArray import numpy as np import uproot3 as uproot @@ -86,7 +86,7 @@ def branchToFlatArray(self, b, returnRowSplits=False, selectmask=None, is3d=None def fileIsValid(self, filename): import ROOT try: - fileTimeOut(filename, 2) + #fileTimeOut(filename, 2) tree = uproot.open(filename)["SLCIOConverted"] f=ROOT.TFile.Open(filename) t=f.Get("SLCIOConverted") @@ -101,7 +101,7 @@ def fileIsValid(self, filename): def convertFromSourceFile(self, filename, weighterobjects, istraining, treename="SLCIOConverted"): - fileTimeOut(filename, 10)#10 seconds for eos to recover + #fileTimeOut(filename, 10)#10 seconds for eos to recover tree = uproot.open(filename)[treename] nevents = tree.numentries diff --git a/modules/object_condensation.py b/modules/object_condensation.py index 8c02e421..016f4b51 100644 --- a/modules/object_condensation.py +++ b/modules/object_condensation.py @@ -72,6 +72,8 @@ def __init__(self, spect_supp=None, #None means same as noise global_weight=False ): + + self.rep_range = -1. 
self.q_min = q_min self.s_b = s_b @@ -161,6 +163,7 @@ def set_input(self, def att_func(self,dsq_k_m): return tf.math.log(tf.math.exp(1.)*dsq_k_m/2. + 1.) + #@tf.function def V_att_k(self): ''' ''' @@ -197,33 +200,37 @@ def calc_dsq_rep(self): dsq = tf.reduce_sum(dsq**2, axis=-1, keepdims=True) #K x V x 1 return dsq + #@tf.function def V_rep_k(self): K = tf.reduce_sum(tf.ones_like(self.q_k)) N_notk = tf.reduce_sum(self.Mnot, axis=1) - #future remark: if this gets too large, one could use a kNN here + #future remark: if this gets too large, one could use a kNN/radiusNN here dsq = self.calc_dsq_rep() # nogradbeta = tf.stop_gradient(self.beta_k_m) #weight. tf.reduce_sum( tf.exp(-dsq) * d_v_e, , axis=1) / tf.reduce_sum( tf.exp(-dsq) ) sigma = self.weighted_d_k_m(dsq) #create gradients for all, but prefer k vertex - dsq = tf.math.divide_no_nan(dsq, sigma + 1e-4) #K x V x 1 V_rep = self.rep_func(dsq) * self.Mnot * tf.expand_dims(self.q_v,axis=0) #K x V x 1 + if self.rep_range > 0: + N_notk = tf.reduce_sum(tf.where(dsq < self.rep_range**2 , 1., tf.zeros_like(dsq)) * self.Mnot, axis=1) V_rep = self.q_k * tf.reduce_sum(V_rep, axis=1) #K x 1 if self.global_weight: N_full = tf.reduce_sum(tf.ones_like(self.beta_v)) V_rep = K * tf.math.divide_no_nan(V_rep, N_full+1e-3) #K x 1 - else: + elif True: #TEST DEBUG REMOVE AGAIN V_rep = tf.math.divide_no_nan(V_rep, N_notk+1e-3) #K x 1 + else: + V_rep/=100. return V_rep - + #@tf.function def Pll_k(self): tanhsqbeta = self.beta_v**2 #softer here @@ -241,6 +248,7 @@ def Pll_k(self): pw_k_sum )#K x P return pll_k + #@tf.function def Beta_pen_k(self): #use continuous max approximation through LSE eps = 1e-3 @@ -249,7 +257,8 @@ def Beta_pen_k(self): beta_pen += 1. - tf.clip_by_value(tf.reduce_sum(self.beta_k_m, axis=1), 0., 1) beta_pen = tf.debugging.check_numerics(beta_pen, "OC: beta pen") return beta_pen - + + #@tf.function def Noise_pen(self): nsupp_v = self.beta_v * self.isn_v @@ -272,7 +281,7 @@ def pll_weight_k(self, ow_k, vatt_k, vrep_k): return ow_k - + #@tf.function def add_to_terms(self, V_att, V_rep, @@ -306,10 +315,13 @@ def add_to_terms(self, return V_att, V_rep, Noise_pen, B_pen, pll, high_B_pen - + #@tf.function(reduce_retracing=True) def calc_metrics(self, energies): - return self._calc_containment(energies), self._calc_contamination(energies) + d,rel_metrics_radius = self._calc_containment(energies) + d.update(self._calc_contamination(energies, rel_metrics_radius)) + return d # metrics functions that can be called at the end, first calc containment, then contamination + #@tf.function def _calc_containment(self, energies): ''' energies as V x 1 @@ -321,34 +333,44 @@ def _calc_containment(self, energies): d_x_k = tf.reduce_sum(d_x_k, axis=2) + 10000. * tf.eye(d_x_k.shape[0],d_x_k.shape[1]) # K x K , add large identity d_m_x_k = tf.reduce_min(d_x_k, axis=1, keepdims=True)# K x 1 d_m_x_k = tf.sqrt(d_m_x_k)/self.d_k - self.rel_metrics_radius = tf.reduce_mean(d_m_x_k) / 2. # () + rel_metrics_radius = tf.reduce_mean(d_m_x_k) / 2. # () ##now metrics dxk = tf.reduce_sum( (tf.expand_dims(x_k_alpha, axis=1) - self.x_k_m)**2 , axis= -1) #K x V' - in_radius = self.rel_metrics_radius > tf.sqrt(dxk)/self.d_k #K x V' + in_radius = rel_metrics_radius > tf.sqrt(dxk)/self.d_k #K x V' in_radius = in_radius[...,tf.newaxis] energies_k_m = SelectWithDefault(self.Msel, energies, 0.) 
#K x V' x 1 - self.energies_k = tf.reduce_sum(energies_k_m, axis=1) #K x 1 + energies_k = tf.reduce_sum(energies_k_m, axis=1) #K x 1 in_radius_energy = tf.reduce_sum(tf.where( in_radius, energies_k_m, 0. ), axis=1) # K x 1 - in_radius_energy /= self.energies_k - return tf.reduce_mean(in_radius_energy) + in_radius_energy /= energies_k + + beta_contain = tf.reduce_sum(tf.where( in_radius, self.beta_k_m, 0. ), axis=1) # K x 1 + beta_contain /= tf.reduce_sum(self.beta_k_m , axis=1) + + return {'containment': tf.reduce_mean(in_radius_energy), + 'beta_containment': tf.reduce_mean(beta_contain)}, rel_metrics_radius - def _calc_contamination(self, energies): + #@tf.function + def _calc_contamination(self, energies, rel_metrics_radius): x_k_alpha = tf.gather_nd(self.x_k_m,self.alpha_k, batch_dims=1) dsq = tf.expand_dims(x_k_alpha, axis=1) - tf.expand_dims(self.x_v, axis=0) #K x V x C dsq = tf.reduce_sum(dsq**2, axis=-1) #K x V - in_radius = self.rel_metrics_radius > tf.sqrt(dsq) / self.d_k# K x V + in_radius = rel_metrics_radius > tf.sqrt(dsq) / self.d_k# K x V energies_k_v = tf.expand_dims(energies, axis=0) # K x V x 1 energies_ir_all_k_v = tf.where(in_radius[...,tf.newaxis], energies_k_v , 0.) energies_ir_not_k_v = tf.where(in_radius[...,tf.newaxis], self.Mnot * energies_k_v , 0.) rel_cont = tf.reduce_sum(energies_ir_not_k_v, axis=1)/tf.reduce_sum(energies_ir_all_k_v, axis=1) - return tf.reduce_mean(rel_cont) + + beta_cont = tf.reduce_sum(tf.where(in_radius[...,tf.newaxis], self.Mnot * self.beta_v[tf.newaxis,...], 0.), axis=1) + beta_cont /= tf.reduce_sum(tf.where(in_radius[...,tf.newaxis], self.beta_v[tf.newaxis,...], 0.), axis=1) + + return {'contamination': tf.reduce_mean(rel_cont), 'beta_contamination': tf.reduce_mean(beta_cont)} class Hinge_OC_per_sample_damped(Basic_OC_per_sample): @@ -378,6 +400,22 @@ class Hinge_OC_per_sample(Hinge_OC_per_sample_damped): def __init__(self, **kwargs): super(Hinge_OC_per_sample, self).__init__(**kwargs) self.condensation_damping = 0.0 # Don't stop any gradients + #self.rep_range = 2. + + + +class Dead_Zone_Hinge_OC_per_sample(Hinge_OC_per_sample): + ''' + This is the classic repulsive hinge loss plus a dead zone + ''' + def __init__(self, **kwargs): + super(Dead_Zone_Hinge_OC_per_sample, self).__init__(**kwargs) + self.condensation_damping = 0.0 # Don't stop any gradients + self.rep_range = 2. + + def att_func(self,dsq_k_m): + return 1. - tf.math.exp(-10. 
* dsq_k_m) + class Hinge_OC_per_sample_learnable_qmin(Hinge_OC_per_sample): @@ -620,7 +658,7 @@ def __init__(self, ): self.loss_impl=loss_impl(**kwargs) - + def __call__(self, beta, x, d, @@ -638,8 +676,7 @@ def __call__(self, beta, if rs.shape[0] is None or rs.shape[0] < 2: return tot_V_att, tot_V_rep, tot_Noise_pen, tot_B_pen, tot_pll,tot_too_much_B_pen batch_size = rs.shape[0] - 1 - - contai,contam = tf.constant(0,dtype='float32'),tf.constant(0,dtype='float32') + mdict = {} for b in tf.range(batch_size): @@ -656,16 +693,14 @@ def __call__(self, beta, tot_V_att, tot_V_rep, tot_Noise_pen, tot_B_pen, tot_pll,tot_too_much_B_pen = self.loss_impl.add_to_terms( tot_V_att, tot_V_rep, tot_Noise_pen, tot_B_pen, tot_pll,tot_too_much_B_pen ) - if energies is not None: #just last batch is fine - ca,cm = self.loss_impl.calc_metrics(energies[rs[b]:rs[b + 1]]) - contai += ca / float(batch_size) - contam += cm / float(batch_size) + + #just last row split is good enough for metric + if energies is not None and b == batch_size-1: + mdict = self.loss_impl.calc_metrics(energies[rs[b]:rs[b + 1]]) bs = tf.cast(batch_size, dtype='float32') + 1e-3 out = [a/bs for a in [tot_V_att, tot_V_rep, tot_Noise_pen, tot_B_pen, tot_pll,tot_too_much_B_pen]] - if energies is not None: - return out + [contai, contam] - return out, None, None + return out, mdict ################################################# diff --git a/modules/rs_offset_adder_op.py b/modules/rs_offset_adder_op.py new file mode 100644 index 00000000..2e6abe10 --- /dev/null +++ b/modules/rs_offset_adder_op.py @@ -0,0 +1,52 @@ + +import tensorflow as tf +from tensorflow.python.framework import ops + +_index_replacer = tf.load_op_library('rs_offset_adder.so') + +def RSOffsetAdder(idx, row_splits): + ''' + Adds offsets to indices in idx, and returns the result. + The offsets equal the row split offsets. Indices < 0 are ignored. + ''' + # checks if idx is dim None x 1, and rs is None, both are TF tensors + if ((idx.shape.ndims == 2 and idx.shape[1] == 1) or idx.shape.ndims < 2) and idx.shape.ndims == 1: + return _index_replacer.RSOffsetAdder(idx=idx, row_splits=row_splits) + else: + raise ValueError('idx must be a 1D tensor or a 2D tensor with shape None x 1') + +@ops.RegisterGradient("RSOffsetAdder") +def _RSOffsetAdderGrad(op, gradin): + return None,None + + + +''' +A small unit test in the same file should become standard practice for all custom ops. 
+'''
+def test(print_debug = False):
+    # a quick test in eager mode
+    import numpy as np
+    import tensorflow as tf
+    idx = tf.constant([0,-1,1,-1,4,5,-1,2,1,100], dtype=tf.int32)
+    rs = tf.constant([0,3,6,10], dtype=tf.int32)
+    idx = RSOffsetAdder(idx, rs)
+
+    # expected output: [ 0 -1 1 -1 7 8 -1 8 7 106]
+    if print_debug:
+        print(idx.numpy())
+
+    multiplier = 10000
+    #more rigorous test implementing the same in numpy with a loop over rowsplits and 10*multiplier random indices
+    idx = np.random.randint(-1, 100, 10*multiplier, dtype=np.int32)
+    rs = multiplier * np.array([0,3,6,10], dtype=np.int32)
+    kernel_idx = RSOffsetAdder(tf.constant(idx), tf.constant(rs))
+    np_idx = idx.copy()
+    for i in range(1, len(rs)+1):
+        if i == len(rs):
+            np_idx[rs[i-1]:] = np.where( np_idx[rs[i-1]:] >=0 , np_idx[rs[i-1]:] + rs[i-1], np_idx[rs[i-1]:])
+        else:
+            np_idx[rs[i-1]:rs[i]] = np.where( np_idx[rs[i-1]:rs[i]] >=0 , np_idx[rs[i-1]:rs[i]] + rs[i-1], np_idx[rs[i-1]:rs[i]])
+    return np.all(np_idx == kernel_idx.numpy())
+
+#print(test())
diff --git a/modules/select_knn_op.py b/modules/select_knn_op.py
index 39371a3e..7a5a517f 100644
--- a/modules/select_knn_op.py
+++ b/modules/select_knn_op.py
@@ -11,7 +11,7 @@ _sknn_op = tf.load_op_library('select_knn.so')
 
 def SelectKnn(K : int, coords, row_splits, masking_values=None, threshold=0.5, tf_compatible=False, max_radius=-1.,
-              mask_mode='none', mask_logic='xor'):
+              mask_mode='none', mask_logic='xor',n_bins=None):
     '''
     returns indices and distances**2 , gradient for distances is implemented!
diff --git a/modules/training_base_hgcal.py b/modules/training_base_hgcal.py
index 215e7132..06660a45 100644
--- a/modules/training_base_hgcal.py
+++ b/modules/training_base_hgcal.py
@@ -1,87 +1,396 @@
-from DeepJetCore.training.training_base import training_base
+#from DeepJetCore.training.training_base import training_base
+
+
+import concurrent.futures
+import numpy as np
+
+## to call it from the command line
+import sys
+import os
 from argparse import ArgumentParser
+import shutil
+from DeepJetCore import DataCollection
+import tensorflow.keras as keras
 import tensorflow as tf
+import copy
+from DeepJetCore.training.gpuTools import DJCSetGPUs
+from DeepJetCore.training.training_base import training_base as training_base_djc
+from DeepJetCore.modeltools import load_model
+import time
+from DebugLayers import switch_off_debug_plots
+from DeepJetCore.DJCLayers import LayerWithMetrics
+from tqdm import tqdm
 
-class HGCalTraining(training_base):
-    def __init__(self, *args,
-                 parser = None,
-                 **kwargs):
-        '''
-        Adds file logging
-        '''
-        #use the DJC training base option to pass a parser
-        if parser is None:
-            parser = ArgumentParser('Run the training')
-        parser.add_argument("--interactive", help="prints output to screen", default=False, action="store_true")
+
+#for multi-gpu we need to overwrite a few things here
+
+###
+#
+# this will become a cleaned-up version of DJC training_base at some point
+#
+###
+class training_base(object):
+
+    def __init__(
+            self, splittrainandtest=0.85,
+            useweights=False, testrun=False,
+            testrun_fraction=0.1,
+            resumeSilently=False,
+            renewtokens=False, #compat
+            collection_class=DataCollection,
+            parser=None,
+            recreate_silently=False
+    ):
+
+        scriptname=sys.argv[0]
+
+        if parser is None: parser = ArgumentParser('Run the training')
+        parser.add_argument('inputDataCollection')
+        parser.add_argument('outputDir')
+        #parser.add_argument('--modelMethod', help='Method to be used to instantiate model in derived training class', metavar='OPT', default=None)
+        
parser.add_argument("--gpu", help="select specific GPU", metavar="OPT", default="") + #parser.add_argument("--gpufraction", help="select memory fraction for GPU", type=float, metavar="OPT", default=-1) + #parser.add_argument("--submitbatch", help="submits the job to condor" , default=False, action="store_true") + #parser.add_argument("--walltime", help="sets the wall time for the batch job, format: 1d5h or 2d or 3h etc" , default='1d') + #parser.add_argument("--isbatchrun", help="is batch run", default=False, action="store_true") + parser.add_argument("--valdata", help="set validation dataset (optional)", default="") + parser.add_argument("--takeweights", help="Applies weights from the model given as relative or absolute path. Matches by names and skips layers that don't match.", default="") + + + args = parser.parse_args() + self.args = args + self.argstring = sys.argv + #sanity check + + + import matplotlib + #if no X11 use below + matplotlib.use('Agg') + DJCSetGPUs(args.gpu) - #no reason for a lot of validation samples usually - super().__init__(*args, resumeSilently=True,parser=parser,splittrainandtest=0.95,**kwargs) - if not self.args.interactive: - print('>>> redirecting the following stdout and stderr to logs in',self.outputDir) - import sys - sys.stdout = open(self.outputDir+'/stdout.txt', 'w') - sys.stderr = open(self.outputDir+'/stderr.txt', 'w') + self.ngpus=1 + + if len(args.gpu): + self.ngpus=len([i for i in args.gpu.split(',')]) + print('running on '+str(self.ngpus)+ ' gpus') + self.keras_inputs=[] + self.keras_inputsshapes=[] + self.keras_model=None + self.mgpu_keras_models = [] + self.keras_weight_model_path=args.takeweights + self.train_data=None + self.val_data=None + self.startlearningrate=None + self.optimizer=None + self.trainedepoches=0 + self.compiled=False + self.checkpointcounter=0 + self.callbacks=None + self.custom_optimizer=False + self.copied_script="" + self.gradients=[] + + self.inputData = os.path.abspath(args.inputDataCollection) \ + if ',' not in args.inputDataCollection else \ + [os.path.abspath(i) for i in args.inputDataCollection.split(',')] + self.outputDir=args.outputDir + # create output dir + + isNewTraining=True + if os.path.isdir(self.outputDir): + if not (resumeSilently or recreate_silently): + var = input('output dir exists. 
To recover a training, please type "yes"\n') + if not var == 'yes': + raise Exception('output directory must not exists yet') + isNewTraining=False + if recreate_silently: + isNewTraining=True + else: + os.mkdir(self.outputDir) + self.outputDir = os.path.abspath(self.outputDir) + self.outputDir+='/' + + if recreate_silently: + os.system('rm -rf '+ self.outputDir +'*') + + #copy configuration to output dir + try: + shutil.copyfile(scriptname,self.outputDir+os.path.basename(scriptname)) + except shutil.SameFileError: + pass + except BaseException as e: + raise e + + self.copied_script = self.outputDir+os.path.basename(scriptname) + + self.train_data = collection_class() + self.train_data.readFromFile(self.inputData) + self.train_data.useweights=useweights + + if len(args.valdata): + print('using validation data from ',args.valdata) + self.val_data = DataCollection(args.valdata) + + else: + if testrun: + if len(self.train_data)>1: + self.train_data.split(testrun_fraction) + + self.train_data.dataclass_instance=None #can't be pickled + self.val_data=copy.deepcopy(self.train_data) + + else: + self.val_data=self.train_data.split(splittrainandtest) - from config_saver import copyModules - copyModules(self.outputDir)#save the modules with indexing for overwrites - @tf.function - def compute_per_replica_loss(replica_data): - with tf.GradientTape() as tape: - logits = self.keras_model(replica_data) - primary_loss_value = loss_fn(replica_data, logits) - total_loss_value = primary_loss_value + tf.add_n(self.keras_model.losses) - grads = tape.gradient(total_loss_value, self.keras_model.trainable_variables) - optimizer.apply_gradients(zip(grads, self.keras_model.trainable_variables)) - return total_loss_value - - def to_ragged_tensor(self, data_list): - for e in data_list: - rt = tf.RaggedTensor.from_row_splits(values=e[0][0], row_splits=e[0][1].flatten()) - yield rt - def compileModel(self, **kwargs): - super().compileModel(is_eager=True, - loss=None, - **kwargs) + shapes = self.train_data.getNumpyFeatureShapes() + inputdtypes = self.train_data.getNumpyFeatureDTypes() + inputnames= self.train_data.getNumpyFeatureArrayNames() + for i in range(len(inputnames)): #in case they are not named + if inputnames[i]=="" or inputnames[i]=="_rowsplits": + inputnames[i]="input_"+str(i)+inputnames[i] + + + print("shapes", shapes) + print("inputdtypes", inputdtypes) + print("inputnames", inputnames) + + self.keras_inputs=[] + self.keras_inputsshapes=[] + + for s,dt,n in zip(shapes,inputdtypes,inputnames): + self.keras_inputs.append(keras.layers.Input(shape=s, dtype=dt, name=n)) + self.keras_inputsshapes.append(s) + + #bookkeeping + self.train_data.writeToFile(self.outputDir+'trainsamples.djcdc',abspath=True) + self.val_data.writeToFile(self.outputDir+'valsamples.djcdc',abspath=True) + + if not isNewTraining: + kfile = self.outputDir+'/KERAS_check_model_last.h5' + if not os.path.isfile(kfile): + kfile = self.outputDir+'/KERAS_check_model_last' #savedmodel format + if not os.path.isdir(kfile): + kfile='' + if len(kfile): + print('loading model',kfile) + self.loadModel(kfile) + self.trainedepoches=0 + if os.path.isfile(self.outputDir+'losses.log'): + for line in open(self.outputDir+'losses.log'): + valloss = line.split(' ')[1][:-1] + if not valloss == "None": + self.trainedepoches+=1 + else: + print('incomplete epochs, starting from the beginning but with pretrained model') + else: + print('no model found in existing output dir, starting training from scratch') + + def modelSet(self): + return (not 
self.keras_model==None) and not len(self.keras_weight_model_path) + + def syncModelWeights(self): + if len(self.mgpu_keras_models) < 2: + return + weights = self.mgpu_keras_models[0].get_weights() + for model in self.mgpu_keras_models[1:]: + model.set_weights(weights) + + def setModel(self,model,**modelargs): + if len(self.keras_inputs)<1: + raise Exception('setup data first') + + with tf.device('/GPU:0'): + self.keras_model=model(self.keras_inputs,**modelargs) + + if len(self.keras_weight_model_path): + from DeepJetCore.modeltools import apply_weights_where_possible, load_model + self.keras_model = apply_weights_where_possible(self.keras_model, + load_model(self.keras_weight_model_path)) + if not self.keras_model: + raise Exception('Setting model not successful') + + self.distributeModelToGPUs() + + def distributeModelToGPUs(self): + self.mgpu_keras_models = [self.keras_model] #zero model + if self.ngpus > 1: + print("distributing model to",self.ngpus,"GPUs") + for i in range(self.ngpus-1): + with tf.device(f'/GPU:{i+1}'): + self.mgpu_keras_models.append(tf.keras.models.clone_model(self.keras_model)) + #sync initial or loaded weights + self.syncModelWeights() + + #run debug layers etc just for one model + for i, m in enumerate(self.mgpu_keras_models): + if i: + switch_off_debug_plots(m) + # TBI: this does not work yet, needs wandb update + LayerWithMetrics.switch_off_metrics_layers(m) + + #run record_metrics only for one model + + def saveCheckPoint(self,addstring=''): + self.checkpointcounter=self.checkpointcounter+1 + self.saveModel("KERAS_model_checkpoint_"+str(self.checkpointcounter)+"_"+addstring) + + def _loadModel(self,filename): + keras_model=load_model(filename) + optimizer=keras_model.optimizer + return keras_model, optimizer + + def loadModel(self,filename): + self.keras_model, self.optimizer = self._loadModel(filename) + self.distributeModelToGPUs() + #distribute to gpus + self.compiled=True + + def setCustomOptimizer(self,optimizer): + self.optimizer = optimizer + self.custom_optimizer=True + + def compileModel(self, + learningrate, + clipnorm=None, + print_models=False, + metrics=None, + is_eager=False, + **compileargs): + + if not self.keras_model: + raise Exception('set model first') + + print('Model being compiled for '+str(self.ngpus)+' gpus') + + self.startlearningrate=learningrate + + if not self.custom_optimizer: + from tensorflow.keras.optimizers import Adam + if clipnorm: + self.optimizer = Adam(lr=self.startlearningrate,clipnorm=clipnorm) + else: + self.optimizer = Adam(lr=self.startlearningrate) + + def run_compile(model, device): + with tf.device(device): + model.compile(optimizer=self.optimizer,metrics=metrics,**compileargs) + if is_eager: + #call on one batch to fully build it + print('Model being called once for device '+str(device)) + model(self.train_data.getExampleFeatureBatch()) + + for i, m in enumerate(self.mgpu_keras_models): + run_compile(m, f'/GPU:{i}') + + if print_models: + print(self.mgpu_keras_models[0].summary()) + self.compiled=True + + + def saveModel(self,outfile): + self.keras_model.save(self.outputDir+outfile) + + + # add some of the multi-gpu initialisation here? 
+ def _initTraining(self, + nepochs, + batchsize, + use_sum_of_squares=False): + + + self.train_data.setBatchSize(batchsize) + self.val_data.setBatchSize(batchsize) + self.train_data.batch_uses_sum_of_squares=use_sum_of_squares + self.val_data.batch_uses_sum_of_squares=use_sum_of_squares + + self.train_data.setBatchSize(batchsize) + self.val_data.setBatchSize(batchsize) + + ## create multi-gpu models + + #now this is hgcal specific because of missing truth, think later how to do that better + def compute_gradients(self, model, data, i): + with tf.device(f'/GPU:{i}'): + with tf.GradientTape() as tape: + predictions = model(data, training=True) + loss = tf.add_n(model.losses) + return tape.gradient(loss, model.trainable_variables) + + def average_gradients(self): + all_gradients = self.gradients + # Average the gradients across GPUs + if len(all_gradients) < 2: + return all_gradients[0] + avg_grads = [] + for grad_list_tuple in zip(*all_gradients): + grads = [g for g in grad_list_tuple if g is not None] + avg_grads.append(tf.reduce_mean(grads, axis=0)) + return avg_grads + + + def trainstep_parallel(self, split_data, collect_gradients=1): + + if self.ngpus == 1: #simple + self.gradients += [self.compute_gradients(self.mgpu_keras_models[0], split_data[0], 0)] + + else: + batch_gradients = [] + with concurrent.futures.ThreadPoolExecutor(max_workers=self.ngpus) as executor: + futures = [executor.submit(self.compute_gradients, self.mgpu_keras_models[i], split_data[i], i) for i in range(self.ngpus)] + for future in concurrent.futures.as_completed(futures): + gradients = future.result() + batch_gradients.append(gradients) + self.gradients += batch_gradients + + # Average gradients across GPUs and collection steps + if collect_gradients * self.ngpus <= len(self.gradients): + avg_grads = self.average_gradients() + self.optimizer.apply_gradients(zip(avg_grads, self.mgpu_keras_models[0].trainable_variables)) + self.syncModelWeights() # weights synced + self.gradients = [] + + + def trainModel(self, nepochs, batchsize, - run_eagerly=True, - verbose=1, + run_eagerly=False, batchsize_use_sum_of_squares = False, - fake_truth=True,#extend the truth list with dummies. Useful when adding more prediction outputs than truth inputs - backup_after_batches=500, - checkperiod=1, + fake_truth=False,#extend the truth list with dummies. 
Useful when adding more prediction outputs than truth inputs stop_patience=-1, lr_factor=0.5, lr_patience=-1, lr_epsilon=0.003, lr_cooldown=6, lr_minimum=0.000001, + checkperiod=10, + backup_after_batches=-1, additional_plots=None, additional_callbacks=None, load_in_mem = False, max_files = -1, plot_batch_loss = False, + add_progbar = False, + verbose = 0, + collect_gradients = 1, #average N batches before update **trainargs): - self.keras_model.run_eagerly=run_eagerly + for m in self.mgpu_keras_models: + m.run_eagerly=run_eagerly # write only after the output classes have been added self._initTraining(nepochs,batchsize, batchsize_use_sum_of_squares) - try: #won't work for purely eager models - self.keras_model.save(self.outputDir+'KERAS_untrained_model') + try: #won't work for purely eager models unless called first - now fixed in model compile + self.saveModel('KERAS_untrained_model.h5') except: pass print('setting up callbacks') from DeepJetCore.training.DeepJet_callbacks import DeepJet_callbacks - minTokenLifetime = 5 - if not self.renewtokens: - minTokenLifetime = -1 self.callbacks=DeepJet_callbacks(self.keras_model, stop_patience=stop_patience, @@ -97,79 +406,175 @@ def trainModel(self, additional_plots=additional_plots, batch_loss = plot_batch_loss, print_summary_after_first_batch=run_eagerly, - minTokenLifetime = minTokenLifetime) + minTokenLifetime = -1) if additional_callbacks is not None: if not isinstance(additional_callbacks, list): additional_callbacks=[additional_callbacks] self.callbacks.callbacks.extend(additional_callbacks) + #create callbacks wrapper - print('starting training') - if load_in_mem: - print('make features') - X_train = self.train_data.getAllFeatures(nfiles=max_files) - X_test = self.val_data.getAllFeatures(nfiles=max_files) - print('make truth') - Y_train = self.train_data.getAllLabels(nfiles=max_files) - Y_test = self.val_data.getAllLabels(nfiles=max_files) - self.keras_model.fit(X_train, Y_train, batch_size=batchsize, epochs=nepochs, - callbacks=self.callbacks.callbacks, - validation_data=(X_test, Y_test), - max_queue_size=1, - use_multiprocessing=False, - workers=0, - **trainargs) - else: - - #prepare generator - - print("setting up generator... can take a while") - use_fake_truth=None - if fake_truth: - if isinstance(self.keras_model.output,dict): - use_fake_truth = [k for k in self.keras_model.output.keys()] - elif isinstance(self.keras_model.output,list): - use_fake_truth = len(self.keras_model.output) - - traingen = self.train_data.invokeGenerator(fake_truth = use_fake_truth) - valgen = self.val_data.invokeGenerator(fake_truth = use_fake_truth) + callbacks = tf.keras.callbacks.CallbackList( + self.callbacks.callbacks, + add_history=True, + model=self.keras_model, #only run them on the main model! + ) + if self.trainedepoches == 0: + callbacks.on_train_begin() + #prepare generator - while(self.trainedepoches < nepochs): - - #this can change from epoch to epoch - #calculate steps for this epoch - #feed info below - traingen.prepareNextEpoch() - valgen.prepareNextEpoch() - nbatches_train = traingen.getNBatches() #might have changed due to shuffeling - nbatches_val = valgen.getNBatches() + print("setting up generator... 
can take a while") + use_fake_truth=None + if fake_truth: + if isinstance(self.keras_model.output,dict): + use_fake_truth = [k for k in self.keras_model.output.keys()] + elif isinstance(self.keras_model.output,list): + use_fake_truth = len(self.keras_model.output) + + traingen = self.train_data.invokeGenerator(fake_truth = use_fake_truth) + valgen = self.val_data.invokeGenerator(fake_truth = use_fake_truth) + + while(self.trainedepoches < nepochs): + + self.gradients = [] #reset in case of accumulated gradients - print('>>>> epoch', self.trainedepoches,"/",nepochs) - print('training batches: ',nbatches_train) - print('validation batches: ',nbatches_val) + callbacks.on_epoch_begin(self.trainedepoches) + #this can change from epoch to epoch + #calculate steps for this epoch + #feed info below + traingen.prepareNextEpoch() + valgen.prepareNextEpoch() + nbatches_train = traingen.getNBatches() #might have changed due to shuffeling + nbatches_val = valgen.getNBatches() + + print('>>>> epoch', self.trainedepoches,"/",nepochs) + print('training batches: ',nbatches_train) + print('validation batches: ',nbatches_val) - data = self.to_ragged_tensor(traingen.feedNumpyData()) - #data = traingen.feedNumpyData() + nbatches_in = 0 + single_counter = 0 + + if add_progbar: + pbar = tqdm(total=nbatches_train + nbatches_val) + + while nbatches_in < nbatches_train: + + thisbatch = [] + while len(thisbatch) < self.ngpus and nbatches_in < nbatches_train: + #only 'feature' part matters for HGCAL + thisbatch.append(next(traingen.feedNumpyData())[0]) + nbatches_in += 1 + + if len(thisbatch) != self.ngpus: #last batch might not be enough + break + + callbacks.on_train_batch_begin(single_counter) + + self.trainstep_parallel(thisbatch, collect_gradients) - self.keras_model.fit(data, - steps_per_epoch=nbatches_train, - epochs=self.trainedepoches + 1, - initial_epoch=self.trainedepoches, - callbacks=self.callbacks.callbacks, - validation_data=valgen.feedNumpyData(), - validation_steps=nbatches_val, - max_queue_size=1, - use_multiprocessing=False, - workers=0, - **trainargs - ) - self.trainedepoches += 1 - traingen.shuffleFileList() - # + logs = { m.name: m.result() for m in self.keras_model.metrics } #only for main model + + callbacks.on_train_batch_end(single_counter, logs) + + single_counter += 1 + if add_progbar: + pbar.update(len(thisbatch)) + try: + callbacks.on_epoch_end(self.trainedepoches, logs) #use same logs here + except Exception as e: + print(e) + print('will continue training anyway') + + if add_progbar: + pbar.close() + self.trainedepoches += 1 + traingen.shuffleFileList() + # + + self.saveModel("KERAS_model.h5") + + #return self.keras_model, callbacks.history + + + + + def change_learning_rate(self, new_lr): + import tensorflow.keras.backend as K + K.set_value(self.keras_model.optimizer.lr, new_lr) - self.saveModel("KERAS_model.h5") + - return self.keras_model, self.callbacks.history +class HGCalTraining(training_base): + def __init__(self, *args, + **kwargs): + ''' + Adds file logging + ''' + #no reason for a lot of validation samples usually + super().__init__(*args, resumeSilently=True,splittrainandtest=0.95,**kwargs) + + from config_saver import copyModules + copyModules(self.outputDir)#save the modules with indexing for overwrites + + def compileModel(self, **kwargs): + super().compileModel(is_eager=True, + loss=None, + **kwargs) + def trainModel(self, + nepochs, + batchsize, + backup_after_batches=500, + checkperiod=1, + **kwargs): + ''' + Just implements some defaults + ''' + return 
super().trainModel(nepochs=nepochs, + batchsize=batchsize, + run_eagerly=True, + batchsize_use_sum_of_squares=False, + fake_truth=True, + backup_after_batches=backup_after_batches, + checkperiod=checkperiod, + **kwargs) + + + +class HGCalTraining_compat(training_base_djc): + def __init__(self, *args, + **kwargs): + ''' + Adds file logging + ''' + #no reason for a lot of validation samples usually + super().__init__(*args, resumeSilently=True,splittrainandtest=0.95,**kwargs) + + from config_saver import copyModules + copyModules(self.outputDir)#save the modules with indexing for overwrites + + def compileModel(self, **kwargs): + super().compileModel(is_eager=True, + loss=None, + **kwargs) + + def trainModel(self, + nepochs, + batchsize, + backup_after_batches=500, + checkperiod=1, + **kwargs): + ''' + Just implements some defaults + ''' + return super().trainModel(nepochs=nepochs, + batchsize=batchsize, + run_eagerly=True, + verbose=2, + batchsize_use_sum_of_squares=False, + fake_truth=True, + backup_after_batches=backup_after_batches, + checkperiod=checkperiod, + **kwargs) \ No newline at end of file diff --git a/scripts/fibatch_submit.py b/scripts/fibatch_submit.py index 8c447bfb..432aa1fe 100755 --- a/scripts/fibatch_submit.py +++ b/scripts/fibatch_submit.py @@ -24,7 +24,7 @@ sys.exit() # can be used by others on FI -DJCLOC='/mnt/ceph/users/jkieseler/containers/deepjetcore3_latest.sif' +DJCLOC='/mnt/ceph/users/jkieseler/containers/deepjetcore4_latest.sif' UEXT = str(uuid.uuid4()) WORKDIR=None @@ -34,11 +34,13 @@ TRIGGERED_NAME=False TRIGGERED_CONSTRAINT=False TRIGGERED_TIME=False +TRIGGERED_GPU=False NAME="batchscript" # constraint="a100" CONSTRAINT="a100-80gb" -CONSTRAINT="a100" +#CONSTRAINT="a100" TIME="7-0" +GPUS="4" for clo in sys.argv: @@ -50,6 +52,14 @@ TRIGGERED_NAME=False continue + if clo == '---g': + TRIGGERED_GPU = True + continue + if TRIGGERED_GPU: + GPUS = clo + TRIGGERED_GPU=False + continue + if clo == '---d': TRIGGERED=True continue @@ -99,7 +109,7 @@ bscript_temp=f'''#!/bin/bash -#SBATCH -p gpu --gres=gpu:1 --mincpus 4 -t 7-0 --constraint={CONSTRAINT} +#SBATCH -p gpu --gres=gpu:{GPUS} --mincpus 4 -t 7-0 --constraint={CONSTRAINT} nvidia-smi singularity run -B /mnt --nv {DJCLOC} /bin/bash runscript_{UEXT}.sh
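
For reference, the patched `BinnedSelectKnn` in `modules/binned_select_knn_op.py` now derives the per-dimension bin count from the average row-split length instead of relying on static shapes. A minimal numpy sketch of that heuristic follows (the constants 32, 5 and 30 mirror the patched code; the event size in the example is made up):

```
# numpy-only sketch of the bin-count heuristic used by the patched BinnedSelectKnn
import numpy as np

def estimate_n_bins(row_splits, n_coord_dims, K, max_bin_dims=3):
    """Rough per-dimension bin count aiming at ~K/32 points per bin."""
    # average number of points per row split, as estimated in the patch
    elems_per_rs = int(np.max(row_splits) / len(row_splits)) + 1
    dims = min(max_bin_dims, n_coord_dims)
    n_bins = (elems_per_rs / (K / 32.)) ** (1. / dims)
    return int(np.clip(int(n_bins), 5, 30))  # clamped to [5, 30] as in the patch

# example: one event with 20000 points, 3 binning coordinates, K=64 neighbours
print(estimate_n_bins(np.array([0, 20000]), n_coord_dims=3, K=64))
```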
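The new `Dead_Zone_Hinge_OC_per_sample` in `modules/object_condensation.py` swaps the default logarithmic attractive potential for a saturating one and, via `rep_range`, only counts vertices inside a fixed radius when normalising the repulsive term. A small numpy illustration of the two attractive shapes and of the range cut (values are arbitrary, purely illustrative):

```
# toy evaluation of the attractive potentials and the rep_range cut
import numpy as np

dsq = np.linspace(0., 9., 10)                 # squared distances
v_att_log = np.log(np.e * dsq / 2. + 1.)      # default att_func: log(e*d^2/2 + 1)
v_att_sat = 1. - np.exp(-10. * dsq)           # Dead_Zone_Hinge att_func: saturates at 1

rep_range = 2.                                # repulsion normalisation only counts dsq < rep_range**2
in_range = dsq < rep_range ** 2

for d2, vl, vs, r in zip(dsq, v_att_log, v_att_sat, in_range):
    print(f"dsq={d2:4.1f}  log-att={vl:5.2f}  saturating-att={vs:4.2f}  counted for repulsion={r}")
```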
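The new `RSOffsetAdder` op shifts per-row-split indices into the global (flat) index range and leaves negative (invalid) indices untouched. A pure-numpy reference of that behaviour, reproducing the expected output quoted in the unit test of `modules/rs_offset_adder_op.py` (a sketch only, not the compiled kernel):

```
# numpy reference for the behaviour of the RSOffsetAdder custom op
import numpy as np

def rs_offset_adder_ref(idx, row_splits):
    """Add each row's row-split offset to its non-negative indices."""
    out = idx.copy()
    for i in range(len(row_splits) - 1):
        lo, hi = row_splits[i], row_splits[i + 1]
        seg = out[lo:hi]
        out[lo:hi] = np.where(seg >= 0, seg + lo, seg)
    return out

idx = np.array([0, -1, 1, -1, 4, 5, -1, 2, 1, 100], dtype=np.int32)
rs  = np.array([0, 3, 6, 10], dtype=np.int32)
print(rs_offset_adder_ref(idx, rs))  # [  0  -1   1  -1   7   8  -1   8   7 106]
```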
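The rewritten `training_base` in `modules/training_base_hgcal.py` keeps one model replica per GPU, computes gradients per replica, averages them, applies the update on the main replica and then re-syncs the weights. A minimal single-process sketch of that averaging step (only `average_gradients` follows the patch; the toy model, data and optimizer are made up for illustration):

```
# sketch of per-replica gradient averaging as done in trainstep_parallel
import tensorflow as tf

def average_gradients(all_gradients):
    """Element-wise mean over per-replica gradient lists, skipping None entries."""
    if len(all_gradients) < 2:
        return all_gradients[0]
    avg = []
    for per_var in zip(*all_gradients):
        grads = [g for g in per_var if g is not None]
        avg.append(tf.reduce_mean(tf.stack(grads, axis=0), axis=0))
    return avg

# toy stand-in: the same model called on two "per-GPU" batches
model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
opt = tf.keras.optimizers.Adam(1e-3)
batches = [tf.random.normal((8, 4)), tf.random.normal((8, 4))]

replica_grads = []
for data in batches:
    with tf.GradientTape() as tape:
        loss = tf.reduce_mean(model(data) ** 2)  # stand-in for tf.add_n(model.losses)
    replica_grads.append(tape.gradient(loss, model.trainable_variables))

# apply the averaged gradients once, on the main replica's variables
opt.apply_gradients(zip(average_gradients(replica_grads), model.trainable_variables))
```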