diff --git a/tutorials/tmva/TMVA_CNN_Classification.py b/tutorials/tmva/TMVA_CNN_Classification.py
new file mode 100644
index 0000000000000..7cea3ad2494d6
--- /dev/null
+++ b/tutorials/tmva/TMVA_CNN_Classification.py
@@ -0,0 +1,370 @@
+# TMVA Classification Example Using a Convolutional Neural Network
+
+# This is an example of using a CNN in TMVA. We do classification using a toy image data set that is generated when running the example macro.
+
+# The input images are signal and background 2D histograms filled from 2D Gaussians whose location (means in X and Y) differs event by event. The only difference between signal and background is the Gaussian width: the background width is a few percent larger than the signal width.
+
+# First you need to run TMVA_CNN_Classification.C to generate images_data_16x16.root.
+
+import ROOT
+from ROOT import TMVA
+import os
+from array import array
+import tensorflow
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.optimizers import Adam
+
+from tensorflow.keras.layers import Input, Dense, Dropout, Flatten, Conv2D, MaxPooling2D, Reshape, BatchNormalization
+
+# Setting up TMVA
+
+ROOT.TMVA.Tools.Instance()
+# TMVA requires initializing PyMVA in order to use the Python-based methods (here Keras). PyMVA is the interface for third-party MVA tools based on Python. It makes powerful external libraries easily accessible with a direct integration into the TMVA workflow. All PyMVA methods provide the same plug-and-play mechanisms as the native TMVA methods. Because the PyMVA base method inherits from the TMVA base method, all options of internal TMVA methods apply to PyMVA methods as well.
+# For PyMVA methods
+TMVA.PyMethodBase.PyInitialize()
+
+# Create an Output File and Declare Factory
+#
+# Create the Factory class. Later you can choose the methods whose performance you'd like to investigate.
+#
+# The factory is the major TMVA object you have to interact with. Here is the list of parameters you need to pass:
+#
+# - The first argument is the base of the name of all the output weight files in the directory weight/ that will be created with the method parameters
+#
+# - The second argument is the output file for the training results
+#
+# - The third argument is a string option defining some general configuration for the TMVA session. For example, all TMVA output can be suppressed by removing the "!" (not) in front of the "Silent" argument in the option string
+#
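+# For example, a completely silent session could use the option string (sketch):
+#   "!V:ROC:Silent:Color:!DrawProgressBar:AnalysisType=Classification"
+#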
+
+outputFile = ROOT.TFile.Open("CNN_ClassificationOutput.root", "RECREATE")
+
+factory = ROOT.TMVA.Factory(
+    "TMVA_CNN_Classification", outputFile,
+    "!V:ROC:!Silent:Color:AnalysisType=Classification:Transformations=None:!Correlations")
+
+# Define the options and the number of threads
+
+opt = [1, 1, 1, 1, 1]
+useTMVACNN = opt[0] if (len(opt) > 0) else False
+useKerasCNN = opt[1] if (len(opt) > 1) else False
+useTMVADNN = opt[2] if (len(opt) > 2) else False
+useTMVABDT = opt[3] if (len(opt) > 3) else False
+usePyTorchCNN = opt[4] if (len(opt) > 4) else False
+
+writeOutputFile = True
+
+num_threads = 0  # use default number of threads
+
+# enable implicit multi-threading
+if num_threads >= 0:
+    ROOT.EnableImplicitMT(num_threads)
+    if num_threads > 0:
+        ROOT.gSystem.Setenv("OMP_NUM_THREADS", str(num_threads))
+else:
+    ROOT.gSystem.Setenv("OMP_NUM_THREADS", "1")
+
+print("Running with nthreads = " + str(ROOT.GetThreadPoolSize()))
+
+# make sure Keras uses the TensorFlow backend
+ROOT.gSystem.Setenv("KERAS_BACKEND", "tensorflow")
+
+# Declare DataLoader(s)
+#
+# The next step is to declare the DataLoader class that deals with input variables
+#
+# Define the input variables that shall be used for the MVA training
+# note that you may also use variable expressions, which can be parsed by TTree::Draw( "expression" )
+#
+# In this case the input data consists of an image of 16x16 pixels. Each single pixel is a branch in a ROOT TTree
+
+loader = ROOT.TMVA.DataLoader("dataset")
+
+# Setup Dataset(s)
+# Define the input data file and set up the signal and background trees
+
+imgSize = 16 * 16
+inputFileName = "images_data_16x16.root"
+inputFile = ROOT.TFile.Open(inputFileName)
+if not inputFile:
+    ROOT.Error("TMVA_CNN_Classification", "Error opening input file %s - exit" % inputFileName)
+signalTree = inputFile.Get("sig_tree")
+backgroundTree = inputFile.Get("bkg_tree")
+
+signalTree.Print()
+
+nEventsSig = signalTree.GetEntries()
+nEventsBkg = backgroundTree.GetEntries()
+
+# global event weights per tree (see below for setting event-wise weights)
+signalWeight = 1.0
+backgroundWeight = 1.0
+
+# You can add an arbitrary number of signal or background trees
+loader.AddSignalTree(signalTree, signalWeight)
+loader.AddBackgroundTree(backgroundTree, backgroundWeight)
+
+## add event variables (image)
+## use the new method (available since ROOT 6.20) to add a variable array for all image data
+loader.AddVariablesArray("vars", imgSize, 'F')
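+
+# As a sketch, the pre-6.20 alternative would register each pixel as an
+# individual variable:
+#   for i in range(imgSize):
+#       loader.AddVariable("vars[%d]" % i, 'F')
+# AddVariablesArray does the equivalent in a single call.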
+
+# Set individual event weights (the variables must exist in the original TTree)
+# for signal    : loader.SetSignalWeightExpression("weight1*weight2")
+# for background: loader.SetBackgroundWeightExpression("weight1*weight2")
+# loader.SetBackgroundWeightExpression( "weight" )
+
+# Apply additional cuts on the signal and background samples (can be different)
+mycuts = ROOT.TCut("")  # for example: mycuts = ROOT.TCut("abs(var1)<0.5 && abs(var2-0.5)<1")
+mycutb = ROOT.TCut("")  # for example: mycutb = ROOT.TCut("abs(var1)<0.5")
+
+# Tell the factory how to use the training and testing events
+
+# If no numbers of events are given, half of the events in the tree are used
+# for training, and the other half for testing:
+#    loader.PrepareTrainingAndTestTree( mycut, "SplitMode=random:!V" )
+# It is also possible to specify the number of training and testing events;
+# note that we disable the computation of the correlation matrix of the input variables
+
+nTrainSig = int(0.8 * nEventsSig)
+nTrainBkg = int(0.8 * nEventsBkg)
+
+# build the string options for DataLoader::PrepareTrainingAndTestTree
+prepareOptions = "nTrain_Signal=" + str(nTrainSig) + ":nTrain_Background=" + str(nTrainBkg) + ":SplitMode=Random:SplitSeed=100:NormMode=NumEvents:!V:!CalcCorrelations"
+
+loader.PrepareTrainingAndTestTree(mycuts, mycutb, prepareOptions)
+
+# Booking Methods
+#
+# Here we book the TMVA methods. We book a Boosted Decision Tree method (BDT) first.
+
+# Boosted Decision Trees
+if useTMVABDT:
+    factory.BookMethod(loader, ROOT.TMVA.Types.kBDT, "BDT",
+                       "!V:NTrees=400:MinNodeSize=2.5%:MaxDepth=2:BoostType=AdaBoost:AdaBoostBeta=0.5:" +
+                       "UseBaggedBoost:BaggedSampleFraction=0.5:SeparationType=GiniIndex:nCuts=20")
+
+# ### Booking Deep Neural Network
+#
+# Here we define the option string for building the Deep Neural network model.
+#
+# #### 1. Define DNN layout
+#
+# The DNN configuration is defined using a string. Note that whitespaces between characters are not allowed.
+#
+# We define first the DNN layout:
+#
+# - **input layout** : this defines the input data format for the DNN as ``input depth | height | width``.
+#    In case of a dense layer as first layer the input layout should be ``1 | 1 | number of input variables`` (features)
+# - **batch layout** : this defines how the input batch is organized. It is related to the input layout but not the same.
+#    If the first layer is dense it should be ``1 | batch size | number of variables`` (features)
+#
+#    *(note the use of the character `|` as separator of input parameters for the DNN layout)*
+#
+# note that in case of only dense layers the input layout could be omitted, but it is required when defining more
+# complex architectures
+#
+# - **layer layout** string defining the layer architecture. The syntax is
+#    - layer type (e.g. DENSE, CONV, RNN)
+#    - layer parameters (e.g. number of units)
+#    - activation function (e.g. TANH, RELU, ...)
+#
+#    *the different layers are separated by the ``","``*
+#
+# #### 2. Define Training Strategy
+#
+# We define here the training strategy parameters for the DNN. The parameters are separated by the ``","`` separator.
+# One can then concatenate different training strategies with different parameters; the strategies are separated by
+# the ``"|"`` separator (see the sketch after this list).
+#
+# - Optimizer
+# - Learning rate
+# - Momentum (valid for SGD and RMSPROP)
+# - Regularization and Weight Decay
+# - Dropout
+# - Max number of epochs
+# - Convergence steps: if the test error does not decrease for this number of steps, the training stops
+# - Batch size (this value must be the same as the one specified in the batch layout)
+# - Test Repetitions (the interval at which the test error is computed)
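+#
+# As a sketch (hypothetical values), two training phases could be concatenated like this:
+#
+#     training1 = "Optimizer=ADAM,LearningRate=1e-3,MaxEpochs=20,BatchSize=100,ConvergenceSteps=5"
+#     training2 = "Optimizer=SGD,LearningRate=1e-4,Momentum=0.9,MaxEpochs=10,BatchSize=100,ConvergenceSteps=5"
+#     trainingStrategyString = "TrainingStrategy=" + training1 + "|" + training2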
+#
+# #### 3. Define general DNN options
+#
+# We define the general DNN options by concatenating in the final string the previously defined layout and training strategy.
+# Note we use the ``":"`` separator to separate the different higher-level options, as in the other TMVA methods.
+# In addition to the input layout, batch layout and training strategy we now add:
+#
+# - Type of Loss function (e.g. CROSSENTROPY)
+# - Weight Initialization (e.g. XAVIER, XAVIERUNIFORM, NORMAL)
+# - Variable Transformation
+# - Type of Architecture (e.g. CPU, GPU, Standard)
+#
+# We can then book the DL method using the built option string
+
+# Define the DNN layout
+if useTMVADNN:
+    layoutString = "Layout=DENSE|100|RELU,BNORM,DENSE|100|RELU,BNORM,DENSE|100|RELU,BNORM,DENSE|100|RELU,DENSE|1|LINEAR"
+
+    # Training strategies
+    # one can concatenate several training strings with different parameters (e.g. learning rates or regularization
+    # parameters); the training strings must be concatenated with the `|` delimiter
+    trainingString1 = "LearningRate=1e-3,Momentum=0.9,Repetitions=1," + "ConvergenceSteps=5,BatchSize=100,TestRepetitions=1," + "MaxEpochs=20,WeightDecay=1e-4,Regularization=None," + "Optimizer=ADAM,DropConfig=0.0+0.0+0.0+0."
+
+    trainingStrategyString = "TrainingStrategy="
+    trainingStrategyString += trainingString1  # + "|" + trainingString2 + ...
+
+    # Build now the full DNN option string
+    dnnOptions = "!H:V:ErrorStrategy=CROSSENTROPY:VarTransform=None:" + "WeightInitialization=XAVIER"
+    dnnOptions += ":" + layoutString
+    dnnOptions += ":" + trainingStrategyString
+
+    dnnMethodName = "TMVA_DNN_CPU"
+    dnnOptions += ":Architecture=CPU"
+
+    factory.BookMethod(loader, ROOT.TMVA.Types.kDL, dnnMethodName, dnnOptions)
+
+# ### Book Convolutional Neural Network in TMVA
+#
+# For building a CNN one needs to define
+#
+# - Input Layout : number of channels (in this case = 1) | image height | image width
+# - Batch Layout : batch size | number of channels | image size (= height * width)
+#
+# Then one adds convolutional layers and max-pooling layers.
+#
+# - For a convolutional layer the option string has to be:
+#    - CONV | number of units | filter height | filter width | stride height | stride width | padding height | padding width | activation function
+#
+#    - note that in this case we are using a 3x3 filter with padding=1 and stride=1, so the output dimension of the
+#      conv layer is equal to the input (see the worked check after this list)
+#
+#    - note that after the first convolutional layer we use a batch-normalization layer, which seems to help the
+#      convergence significantly
+#
+# - For the max-pooling layer:
+#    - MAXPOOL | pool height | pool width | stride height | stride width
+#
+# The RESHAPE layer is needed to flatten the output before the dense layer.
+#
+# Note that running the CNN requires CPU or GPU architecture support.
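+#
+# Worked check for the 3x3 convolution above:
+#   output size = (input - filter + 2*padding)/stride + 1 = (16 - 3 + 2*1)/1 + 1 = 16,
+# so the 16x16 spatial dimensions are preserved.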
+
+if useTMVACNN:
+    inputLayoutString = "InputLayout=1|16|16"
+
+    # Layer layout
+    layoutString = "Layout=CONV|10|3|3|1|1|1|1|RELU,BNORM,CONV|10|3|3|1|1|1|1|RELU,MAXPOOL|2|2|1|1," + "RESHAPE|FLAT,DENSE|100|RELU,DENSE|1|LINEAR"
+
+    # Training strategies
+    trainingString1 = "LearningRate=1e-3,Momentum=0.9,Repetitions=1," + "ConvergenceSteps=5,BatchSize=100,TestRepetitions=1," + "MaxEpochs=20,WeightDecay=1e-4,Regularization=None," + "Optimizer=ADAM,DropConfig=0.0+0.0+0.0+0.0"
+
+    trainingStrategyString = "TrainingStrategy="
+    trainingStrategyString += trainingString1  # + "|" + trainingString2 + "|" + trainingString3 for concatenating more training strings
+
+    # Build the full CNN option string
+    cnnOptions = "!H:V:ErrorStrategy=CROSSENTROPY:VarTransform=None:" + "WeightInitialization=XAVIER"
+    cnnOptions += ":" + inputLayoutString
+    cnnOptions += ":" + layoutString
+    cnnOptions += ":" + trainingStrategyString
+
+    ## New DL (CNN)
+    cnnMethodName = "TMVA_CNN_CPU"
+    cnnOptions += ":Architecture=CPU"
+
+    factory.BookMethod(loader, ROOT.TMVA.Types.kDL, cnnMethodName, cnnOptions)
+
+# Book Convolutional Neural Network in Keras using a generated model
+if useKerasCNN:
+    ROOT.Info("TMVA_CNN_Classification", "Building convolutional Keras model")
+    # build the model directly here:
+    # 3 conv2d layers + maxpool + dense
+
+    model = Sequential()
+    model.add(Reshape((16, 16, 1), input_shape=(256,)))
+    model.add(Conv2D(10, kernel_size=(3, 3), kernel_initializer='TruncatedNormal', activation='relu', padding='same'))
+    model.add(Conv2D(10, kernel_size=(3, 3), kernel_initializer='glorot_normal', activation='relu', padding='same'))
+    model.add(BatchNormalization())
+    model.add(Conv2D(10, kernel_size=(3, 3), kernel_initializer='glorot_normal', activation='relu', padding='same'))
+    model.add(BatchNormalization())
+    model.add(MaxPooling2D(pool_size=(2, 2), strides=(1, 1)))
+    model.add(Flatten())
+    model.add(Dense(256, activation='relu'))
+    model.add(Dense(2, activation='sigmoid'))
+    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
+    model.save('model_cnn.h5')
+    model.summary()
+
+    if ROOT.gSystem.AccessPathName("model_cnn.h5"):
+        ROOT.Warning("TMVA_CNN_Classification", "Error creating Keras model file - skip using Keras")
+    else:
+        # book the PyKeras method only if the Keras model could be created
+        ROOT.Info("TMVA_CNN_Classification", "Booking tf.Keras CNN model")
+        factory.BookMethod(loader, ROOT.TMVA.Types.kPyKeras, "PyKeras",
+                           "H:!V:VarTransform=None:FilenameModel=model_cnn.h5:" +
+                           "FilenameTrainedModel=trained_model_cnn.h5:NumEpochs=20:BatchSize=128")
+
+# Training All Methods
+
+# Here we train all the previously booked methods.
+
+factory.TrainAllMethods()
+
+# Test all methods
+# Now we test all methods using the test data set
+
+factory.TestAllMethods()
+
+# Evaluate all methods
+#
+# Here we evaluate all methods and compare their performances, computing efficiencies, ROC curves, etc., using both the training and testing data sets. Several histograms are produced which can be examined with TMVAGui or directly using the output file.
+
+factory.EvaluateAllMethods()
+
+# Plot ROC Curve
+# Here we retrieve the ROC curve and draw it.
+
+c1 = factory.GetROCCurve(loader)
+c1.Draw()
+
+# Close the Output File
+# Close the output file to save all output information (evaluation results of the methods); it can then be used by TMVAGui to display additional plots.
+
+outputFile.Close()
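+
+# For example, the saved output can be inspected interactively with the TMVA GUI
+# from the ROOT prompt:
+#   root -e 'TMVA::TMVAGui("CNN_ClassificationOutput.root")'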
diff --git a/tutorials/tmva/TMVA_Higgs_Classification.py b/tutorials/tmva/TMVA_Higgs_Classification.py
new file mode 100644
index 0000000000000..b0b617f404d9d
--- /dev/null
+++ b/tutorials/tmva/TMVA_Higgs_Classification.py
@@ -0,0 +1,292 @@
+# TMVA Higgs Classification in Python
+# In this example we perform Higgs classification using several of the native TMVA methods (likelihood-based discriminants, Fisher, BDT, MLP and the TMVA deep network).
+
+# Classification example of TMVA based on the public Higgs UCI dataset
+# The UCI data set is a public HIGGS data set, see http://archive.ics.uci.edu/ml/datasets/HIGGS
+# used in this paper: Baldi, P., P. Sadowski, and D. Whiteson. "Searching for Exotic Particles in High-energy Physics with Deep Learning." Nature Communications 5 (July 2, 2014).
+
+# You can either run TMVA_Higgs_Classification.C first to obtain Higgs_data.root, or let this script download the file automatically (see below).
+
+# ### Import the necessary modules
+#
+# We start by importing the modules required for the tutorial: ROOT and TMVA (Toolkit for Multivariate Data Analysis). If you want to know more about TMVA, you can refer to the documentation.
+
+import ROOT
+from ROOT import TMVA
+
+# options to control the used methods
+
+useLikelihood = True      # likelihood-based discriminant
+useLikelihoodKDE = False  # likelihood-based discriminant with KDE
+useFisher = True          # Fisher discriminant
+useMLP = False            # Multi-Layer Perceptron (old TMVA NN implementation)
+useBDT = True             # Boosted Decision Tree
+useDL = True              # TMVA Deep Learning (CPU or GPU)
+useKeras = True           # Keras deep learning
+
+# Setting up TMVA
+
+ROOT.TMVA.Tools.Instance()
+
+# Create an Output File and Declare Factory
+#
+# Create the Factory class. Later you can choose the methods whose performance you'd like to investigate.
+#
+# The factory is the major TMVA object you have to interact with. Here is the list of parameters you need to pass:
+#
+# - The first argument is the base of the name of all the output weight files in the directory weight/ that will be created with the method parameters
+#
+# - The second argument is the output file for the training results
+#
+# - The third argument is a string option defining some general configuration for the TMVA session. For example, all TMVA output can be suppressed by removing the "!" (not) in front of the "Silent" argument in the option string
+
+outputFile = ROOT.TFile.Open("Higgs_ClassificationOutput.root", "RECREATE")
+factory = ROOT.TMVA.Factory("TMVA_Higgs_Classification", outputFile,
+                            "!V:ROC:!Silent:Color:!DrawProgressBar:AnalysisType=Classification")
+
+# ### Define the input data file
+#
+# Open the input .root file if it exists; if it doesn't, download it from the CERN web server.
+
+inputFileName = "Higgs_data.root"
+inputFileLink = "http://root.cern.ch/files/" + inputFileName
+
+inputFile = None
+if not ROOT.gSystem.AccessPathName(inputFileName):
+    # file exists locally (note: AccessPathName returns False when the file exists)
+    inputFile = ROOT.TFile.Open(inputFileName)
+if not inputFile:
+    # download the file from the CERN web server
+    ROOT.Info("TMVA_Higgs_Classification", "Download Higgs_data.root file")
+    ROOT.TFile.SetCacheFileDir(".")
+    inputFile = ROOT.TFile.Open(inputFileLink, "CACHEREAD")
+    if not inputFile:
+        ROOT.Error("TMVA_Higgs_Classification", "Input file cannot be downloaded - exit")
+
+# Setting up the Signal and Background Trees
+#
+# Here we set up the training and testing trees.
+
+# --- Register the training and test trees
+
+signalTree = inputFile.Get("sig_tree")
+backgroundTree = inputFile.Get("bkg_tree")
+
+signalTree.Print()
+
+# Declare DataLoader(s)
+#
+# The next step is to declare the DataLoader class that deals with input variables
+#
+# Define the input variables that shall be used for the MVA training
+# note that you may also use variable expressions, which can be parsed by TTree::Draw( "expression" )
+
+loader = TMVA.DataLoader("dataset")
+
+loader.AddVariable("m_jj")
+loader.AddVariable("m_jjj")
+loader.AddVariable("m_lv")
+loader.AddVariable("m_jlv")
+loader.AddVariable("m_bb")
+loader.AddVariable("m_wbb")
+loader.AddVariable("m_wwbb")
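+
+# As a sketch, a derived quantity could also be added via an expression instead
+# of a plain branch name (hypothetical variable):
+#   loader.AddVariable("m_bb/m_wbb", 'F')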
+
+### We now set the input data trees in the TMVA DataLoader class
+
+# global event weights per tree (see below for setting event-wise weights)
+signalWeight = 1.0
+backgroundWeight = 1.0
+
+# You can add an arbitrary number of signal or background trees
+loader.AddSignalTree(signalTree, signalWeight)
+loader.AddBackgroundTree(backgroundTree, backgroundWeight)
+
+# Set individual event weights (the variables must exist in the original TTree)
+#
+# for signal    : loader.SetSignalWeightExpression("weight1*weight2")
+#
+# for background: loader.SetBackgroundWeightExpression("weight1*weight2")
+#
+# loader.SetBackgroundWeightExpression( "weight" )
+
+# Apply additional cuts on the signal and background samples (can be different)
+mycuts = ROOT.TCut("")  # for example: mycuts = ROOT.TCut("abs(var1)<0.5 && abs(var2-0.5)<1")
+mycutb = ROOT.TCut("")  # for example: mycutb = ROOT.TCut("abs(var1)<0.5")
+
+# Tell the factory how to use the training and testing events
+
+# If no numbers of events are given, half of the events in the tree are used
+# for training, and the other half for testing:
+#    loader.PrepareTrainingAndTestTree( mycut, "SplitMode=random:!V" )
+# To also specify the number of training events, use:
+
+loader.PrepareTrainingAndTestTree(mycuts, mycutb,
+                                  "nTrain_Signal=7000:nTrain_Background=7000:SplitMode=Random:NormMode=NumEvents:!V")
+
+# Booking Methods
+#
+# Here we book the TMVA methods. We book a standard likelihood method, optionally a likelihood based on KDE (Kernel Density Estimation), a Fisher discriminant, a BDT and a shallow neural network (MLP).
+
+# Likelihood ("naive Bayes estimator")
+if useLikelihood:
+    factory.BookMethod(loader, ROOT.TMVA.Types.kLikelihood, "Likelihood",
+                       "H:!V:TransformOutput:PDFInterpol=Spline2:NSmoothSig[0]=20:NSmoothBkg[0]=20:NSmoothBkg[1]=10:NSmooth=1:NAvEvtPerBin=50")
+
+# Use a kernel density estimator to approximate the PDFs
+if useLikelihoodKDE:
+    factory.BookMethod(loader, ROOT.TMVA.Types.kLikelihood, "LikelihoodKDE",
+                       "!H:!V:!TransformOutput:PDFInterpol=KDE:KDEtype=Gauss:KDEiter=Adaptive:KDEFineFactor=0.3:KDEborder=None:NAvEvtPerBin=50")
+
+# Fisher discriminant (same as LD)
+if useFisher:
+    factory.BookMethod(loader, ROOT.TMVA.Types.kFisher, "Fisher",
+                       "H:!V:Fisher:VarTransform=None:CreateMVAPdfs:PDFInterpolMVAPdf=Spline2:NbinsMVAPdf=50:NsmoothMVAPdf=10")
+
+# Boosted Decision Trees
+if useBDT:
+    factory.BookMethod(loader, ROOT.TMVA.Types.kBDT, "BDT",
+                       "!V:NTrees=200:MinNodeSize=2.5%:MaxDepth=2:BoostType=AdaBoost:AdaBoostBeta=0.5:UseBaggedBoost:BaggedSampleFraction=0.5:SeparationType=GiniIndex:nCuts=20")
+
+# Multi-Layer Perceptron (shallow neural network)
+if useMLP:
+    factory.BookMethod(loader, ROOT.TMVA.Types.kMLP, "MLP",
+                       "!H:!V:NeuronType=tanh:VarTransform=N:NCycles=100:HiddenLayers=N+5:TestRate=5:!UseRegulator")
+
+# ### Booking Deep Neural Network
+#
+# Here we define the option string for building the Deep Neural network model.
+#
+# #### 1. Define DNN layout
+#
+# The DNN configuration is defined using a string. Note that whitespaces between characters are not allowed.
+#
+# We define first the DNN layout:
+#
+# - **input layout** : this defines the input data format for the DNN as ``input depth | height | width``.
+#    In case of a dense layer as first layer the input layout should be ``1 | 1 | number of input variables`` (features)
+# - **batch layout** : this defines how the input batch is organized. It is related to the input layout but not the same.
+#    If the first layer is dense it should be ``1 | batch size | number of variables`` (features)
+#
+#    *(note the use of the character `|` as separator of input parameters for the DNN layout)*
+#
+# note that in case of only dense layers the input layout could be omitted, but it is required when defining more
+# complex architectures
+#
+# - **layer layout** string defining the layer architecture. The syntax is
+#    - layer type (e.g. DENSE, CONV, RNN)
+#    - layer parameters (e.g. number of units)
+#    - activation function (e.g. TANH, RELU, ...)
+#
+#    *the different layers are separated by the ``","``*
+#
+# #### 2. Define Training Strategy
+#
+# We define here the training strategy parameters for the DNN. The parameters are separated by the ``","`` separator.
+# One can then concatenate different training strategies with different parameters; the strategies are separated by
+# the ``"|"`` separator.
+#
+# - Optimizer
+# - Learning rate
+# - Momentum (valid for SGD and RMSPROP)
+# - Regularization and Weight Decay
+# - Dropout
+# - Max number of epochs
+# - Convergence steps: if the test error does not decrease for this number of steps, the training stops
+# - Batch size (this value must be the same as the one specified in the batch layout)
+# - Test Repetitions (the interval at which the test error is computed)
+#
+# #### 3. Define general DNN options
+#
+# We define the general DNN options by concatenating in the final string the previously defined layout and training strategy.
+# Note we use the ``":"`` separator to separate the different higher-level options, as in the other TMVA methods.
+# In addition to the input layout, batch layout and training strategy we now add:
+#
+# - Type of Loss function (e.g. CROSSENTROPY)
+# - Weight Initialization (e.g. XAVIER, XAVIERUNIFORM, NORMAL)
+# - Variable Transformation
+# - Type of Architecture (e.g. CPU, GPU, Standard)
+#
+# We can then book the DL method using the built option string
+
+## Here we book the new DNN of TMVA if we have support in ROOT. The GPU version is used if ROOT was built with GPU support.
+
+if useDL:
+    # Define the DNN layout
+    inputLayoutString = "InputLayout=1|1|7"
+    batchLayoutString = "BatchLayout=1|32|7"
+    layoutString = "Layout=DENSE|64|TANH,DENSE|64|TANH,DENSE|64|TANH,DENSE|64|TANH,DENSE|1|LINEAR"
+
+    # Define training strategies
+    # one can concatenate several training strategies
+    training1 = "Optimizer=ADAM,LearningRate=1e-3,Momentum=0.,Regularization=None,WeightDecay=1e-4,"
+    training1 += "DropConfig=0.+0.+0.+0.,MaxEpochs=30,ConvergenceSteps=10,BatchSize=32,TestRepetitions=1"
+    # training2 = ("LearningRate=1e-3,Momentum=0.9,"
+    #              "ConvergenceSteps=10,BatchSize=128,TestRepetitions=1,"
+    #              "MaxEpochs=20,WeightDecay=1e-4,Regularization=None,"
+    #              "Optimizer=SGD,DropConfig=0.0+0.0+0.0+0.")
+
+    trainingStrategyString = "TrainingStrategy="
+    trainingStrategyString += training1  # + "|" + training2
+
+    # General options
+    dnnOptions = "!H:V:ErrorStrategy=CROSSENTROPY:VarTransform=G:" + "WeightInitialization=XAVIER"
+
+    dnnOptions += ":" + inputLayoutString
+    dnnOptions += ":" + batchLayoutString
+    dnnOptions += ":" + layoutString
+    dnnOptions += ":" + trainingStrategyString
+
+    dnnMethodName = "DNN_CPU"
+    dnnOptions += ":Architecture=CPU"
+
+    # Book the method
+    factory.BookMethod(loader, ROOT.TMVA.Types.kDL, dnnMethodName, dnnOptions)
+
+# Training All Methods
+#
+# Here we train all the previously booked methods.
+
+factory.TrainAllMethods()
+
+# ### Test all methods
+#
+# Now we test all methods using the test data set
+
+factory.TestAllMethods()
+
+# Evaluate all methods
+#
+# Here we evaluate all methods and compare their performances, computing efficiencies, ROC curves, etc., using both the training and testing data sets. Several histograms are produced which can be examined with TMVAGui or directly using the output file.
+#
+factory.EvaluateAllMethods()
+
+# Plot ROC Curve
+# Here we retrieve the ROC curve and draw it.
+
+c1 = factory.GetROCCurve(loader)
+c1.Draw()
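+
+# GetROCCurve returns a TCanvas, so the plot can also be saved to an image file,
+# e.g. c1.SaveAs("ROC_Higgs_Classification.png")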
+
+# Close the Output File
+# Close the output file to save all output information (evaluation results of the methods); it can then be used by TMVAGui to display additional plots.
+
+outputFile.Close()
diff --git a/tutorials/tmva/TMVA_RNN_Classification.py b/tutorials/tmva/TMVA_RNN_Classification.py
new file mode 100644
index 0000000000000..d7f11ee555e98
--- /dev/null
+++ b/tutorials/tmva/TMVA_RNN_Classification.py
@@ -0,0 +1,319 @@
+
+# TMVA Classification Example Using a Recurrent Neural Network
+#
+# This is an example of using an RNN in TMVA. We do classification using a toy data set
+# containing a time series of ntime samples, each with dimension ninput.
+
+# First you need to run TMVA_RNN_Classification.C to generate time_data_t10_d30.root.
+
+# Import the necessary modules
+#
+# We start by importing the modules required for the tutorial: ROOT and TMVA (Toolkit for Multivariate Data Analysis). If you want to know more about TMVA, you can refer to the documentation.
+
+import ROOT
+from ROOT import TMVA
+
+import tensorflow
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.layers import LSTM, GRU, SimpleRNN
+from tensorflow.keras.layers import Input, Dense, Dropout, Flatten, Conv2D, MaxPooling2D, Reshape, BatchNormalization
+
+ninput = 30  # input dimension per time step
+
+ntime = 10   # number of time steps
+
+batchSize = 100
+maxepochs = 20
+
+use_type = 1  # selects the RNN type to use: 0 = RNN, 1 = LSTM, 2 = GRU
+
+nTotEvts = 10000  # total events to be generated for signal or background
+
+useKeras = True
+
+useTMVA_RNN = True
+useTMVA_DNN = True
+useTMVA_BDT = False
+
+rnn_types = ["RNN", "LSTM", "GRU"]
+use_rnn_type = [1, 1, 1]
+if 0 <= use_type < 3:
+    use_rnn_type = [0, 0, 0]
+    use_rnn_type[use_type] = 1
+
+archString = "CPU"
+writeOutputFile = True
+
+rnn_type = "RNN"
+
+# ### Setting up TMVA
+#
+# TMVA requires initializing PyMVA in order to use the Python-based methods (here Keras). PyMVA is the interface for third-party MVA tools based on Python. It makes powerful external libraries easily accessible with a direct integration into the TMVA workflow. All PyMVA methods provide the same plug-and-play mechanisms as the native TMVA methods. Because the PyMVA base method inherits from the TMVA base method, all options of internal TMVA methods apply to PyMVA methods as well.
+
+ROOT.TMVA.Tools.Instance()
+ROOT.TMVA.PyMethodBase.PyInitialize()
+
+# Define the input files and the number of threads
+
+num_threads = 0  # use by default all threads
+# enable implicit multi-threading
+if num_threads >= 0:
+    ROOT.EnableImplicitMT(num_threads)
+    if num_threads > 0:
+        ROOT.gSystem.Setenv("OMP_NUM_THREADS", str(num_threads))
+    else:
+        ROOT.gSystem.Setenv("OMP_NUM_THREADS", "1")
+
+print("Running with nthreads = " + str(ROOT.GetThreadPoolSize()) + "\n")
+
+inputFileName = "time_data_t10_d30.root"
+
+if ROOT.gSystem.AccessPathName(inputFileName):
+    ROOT.Error("TMVA_RNN_Classification", "Input file %s does not exist - run TMVA_RNN_Classification.C first" % inputFileName)
+
+inputFile = ROOT.TFile.Open(inputFileName)
+if not inputFile:
+    ROOT.Error("TMVA_RNN_Classification", "Error opening input file %s - exit" % inputFileName)
+
+# ### Create an Output File and Declare Factory
+#
+# Create the Factory class. Later you can choose the methods whose performance you'd like to investigate.
+#
+# The factory is the major TMVA object you have to interact with. Here is the list of parameters you need to pass:
+#
+# - The first argument is the base of the name of all the output weight files in the directory weight/ that will be created with the method parameters
+#
+# - The second argument is the output file for the training results
+#
+# - The third argument is a string option defining some general configuration for the TMVA session.
+#
+# For example, all TMVA output can be suppressed by removing the "!" (not) in front of the "Silent" argument in the option string
+
+print("--- RNNClassification : Using input file: " + inputFile.GetName() + "\n")
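+
+# The input file (produced by TMVA_RNN_Classification.C) stores each event as a
+# time series of ntime = 10 steps with ninput = 30 values per step, i.e. ten
+# branches vars_time0 ... vars_time9, each holding an array of 30 floats.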
+
+# Create a ROOT output file where TMVA will store ntuples, histograms, etc.
+outfileName = "data_RNN_" + archString + ".root"
+
+if writeOutputFile:
+    outputFile = ROOT.TFile.Open(outfileName, "RECREATE")
+
+# Creating the factory object
+factory = ROOT.TMVA.Factory("TMVAClassification", outputFile,
+                            "!V:!Silent:Color:DrawProgressBar:Transformations=None:!Correlations:" +
+                            "AnalysisType=Classification:ModelPersistence")
+
+# ### Declare DataLoader(s)
+#
+# The next step is to declare the DataLoader class that deals with input variables
+#
+# Define the input variables that shall be used for the MVA training
+# note that you may also use variable expressions, which can be parsed by TTree::Draw( "expression" )
+
+dataloader = TMVA.DataLoader("dataset")
+
+signalTree = inputFile.Get("sgn")
+background = inputFile.Get("bkg")
+
+signalTree.Print()
+nvar = ninput * ntime
+
+# add variables - use the new AddVariablesArray function
+for i in range(ntime):
+    varName = "vars_time" + str(i)
+    dataloader.AddVariablesArray(varName, ninput, 'F')
+
+dataloader.AddSignalTree(signalTree, 1.0)
+dataloader.AddBackgroundTree(background, 1.0)
+
+# check the given input
+datainfo = dataloader.GetDataSetInfo()
+vars = datainfo.GetListOfVariables()
+print("number of variables is " + str(vars.size()) + "\n")
+for v in vars:
+    print(str(v) + "\n")
+
+nTrainSig = int(0.8 * nTotEvts)
+nTrainBkg = int(0.8 * nTotEvts)
+
+# build the string options for DataLoader::PrepareTrainingAndTestTree
+prepareOptions = "nTrain_Signal=" + str(nTrainSig) + ":nTrain_Background=" + str(nTrainBkg) + ":SplitMode=Random:SplitSeed=100:NormMode=NumEvents:!V:!CalcCorrelations"
+
+# ### Tell the factory how to use the training and testing events
+
+# Apply additional cuts on the signal and background samples (can be different)
+mycuts = ROOT.TCut("")  ## for example: mycuts = ROOT.TCut("abs(var1)<0.5 && abs(var2-0.5)<1")
+mycutb = ROOT.TCut("")  ## for example: mycutb = ROOT.TCut("abs(var1)<0.5")
+
+dataloader.PrepareTrainingAndTestTree(mycuts, mycutb, prepareOptions)
+
+print("prepared DATA LOADER ")
+
+# ### Book TMVA recurrent models
+#
+# Book the different types of recurrent models in TMVA (SimpleRNN, LSTM or GRU)
+
+if useTMVA_RNN:
+    for i in range(3):
+        if not use_rnn_type[i]:
+            continue
+        rnn_type = rnn_types[i]
+
+        # define the input layout string for the RNN;
+        # the input data should be organized as: time steps x input dimension
+        inputLayoutString = "InputLayout=" + str(ntime) + "|" + str(ninput)
+
+        # Define the RNN layer layout:
+        # LayerType (RNN, LSTM or GRU) | number of units | number of inputs | time steps | remember state (typically 0) | return full output sequence (1)
+        rnnLayout = rnn_type + "|10|" + str(ninput) + "|" + str(ntime) + "|0|1"
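+        # e.g. with the default settings this expands to "LSTM|10|30|10|0|1":
+        # 10 output units, 30 inputs per time step, 10 time steps, no state
+        # memory between batches, and the full output sequence is returned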
+
+        # add after the RNN a RESHAPE layer (needed to flatten the output), a dense layer with 64 units and a final one.
+        # Note the last layer is linear because when using CROSSENTROPY a sigmoid is applied already
+        layoutString = "Layout=" + rnnLayout + ",RESHAPE|FLAT,DENSE|64|TANH,DENSE|1|LINEAR"
+
+        # Defining training strategies; different training strings can be concatenated, but here we use only one
+        trainingString1 = "LearningRate=1e-3,Momentum=0.0,Repetitions=1," + "ConvergenceSteps=5,BatchSize=" + str(batchSize) + ",TestRepetitions=1," + "WeightDecay=1e-2,Regularization=None,MaxEpochs=" + str(maxepochs) + "," + "Optimizer=ADAM,DropConfig=0.0+0.+0.+0."
+
+        trainingStrategyString = "TrainingStrategy="
+        trainingStrategyString += trainingString1  # + "|" + trainingString2
+
+        # Define the full RNN option string adding the final options for the whole network
+        rnnOptions = "!H:V:ErrorStrategy=CROSSENTROPY:VarTransform=None:" + "WeightInitialization=XAVIERUNIFORM:ValidationSize=0.2:RandomSeed=1234"
+        rnnOptions += ":" + inputLayoutString
+        rnnOptions += ":" + layoutString
+        rnnOptions += ":" + trainingStrategyString
+        rnnOptions += ":" + "Architecture=" + archString
+
+        rnnName = "TMVA_" + rnn_type
+        factory.BookMethod(dataloader, TMVA.Types.kDL, rnnName, rnnOptions)
+
+# Book the TMVA fully connected dense-layer model
+
+if useTMVA_DNN:
+    # Method DL with dense layers
+    inputLayoutString = "InputLayout=1|1|" + str(ntime * ninput)
+
+    layoutString = "Layout=DENSE|64|TANH,DENSE|64|TANH,DENSE|64|TANH,DENSE|1|LINEAR"
+    # Training strategies
+    trainingString1 = "LearningRate=1e-3,Momentum=0.0,Repetitions=1," + "ConvergenceSteps=10,BatchSize=256,TestRepetitions=1," + "WeightDecay=1e-4,Regularization=None,MaxEpochs=20," + "DropConfig=0.0+0.+0.+0.,Optimizer=ADAM"
+    trainingStrategyString = "TrainingStrategy="
+    trainingStrategyString += trainingString1  # + "|" + trainingString2
+
+    # General options
+    dnnOptions = "!H:V:ErrorStrategy=CROSSENTROPY:VarTransform=None:" + "WeightInitialization=XAVIER:RandomSeed=0"
+
+    dnnOptions += ":" + inputLayoutString
+    dnnOptions += ":" + layoutString
+    dnnOptions += ":" + trainingStrategyString
+    dnnOptions += ":" + "Architecture=" + archString
+
+    dnnName = "TMVA_DNN"
+    factory.BookMethod(dataloader, TMVA.Types.kDL, dnnName, dnnOptions)
+
+# Book Keras recurrent models
+#
+# Book the different types of recurrent models in Keras (SimpleRNN, LSTM or GRU)
+
+if useKeras:
+    for i in range(3):
+        if use_rnn_type[i]:
+            modelName = "model_" + rnn_types[i] + ".h5"
+            trainedModelName = "trained_model_" + rnn_types[i] + ".h5"
+
+            ROOT.Info("TMVA_RNN_Classification", "Building recurrent Keras model using a " + rnn_types[i] + " layer")
+            # build the model directly here:
+            # a recurrent layer + batch normalization + two dense layers
+
+            model = Sequential()
+            model.add(Reshape((10, 30), input_shape=(10 * 30,)))
+            # add the recurrent layer depending on the type; return the full output sequence
+            if rnn_types[i] == "LSTM":
+                model.add(LSTM(units=10, return_sequences=True))
+            elif rnn_types[i] == "GRU":
+                model.add(GRU(units=10, return_sequences=True))
+            else:
+                model.add(SimpleRNN(units=10, return_sequences=True))
+
+            model.add(BatchNormalization())
+            model.add(Flatten())  # needed when returning the full time output sequence
+            model.add(Dense(64, activation='tanh'))
+            model.add(Dense(2, activation='sigmoid'))
+            model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
+
+            model.save(modelName)
+            model.summary()
+
+            if ROOT.gSystem.AccessPathName(modelName):
+                ROOT.Warning("TMVA_RNN_Classification", "Error creating Keras recurrent model file - skip using Keras")
+                useKeras = False
+            else:
+                # book the PyKeras method only if the Keras model could be created
+                ROOT.Info("TMVA_RNN_Classification", "Booking Keras " + rnn_types[i] + " model")
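+                # Options below: "tf.keras" selects the Keras implementation bundled
+                # with TensorFlow, and GpuOptions=allow_growth=True keeps TensorFlow
+                # from allocating all GPU memory up front (only relevant on a GPU)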
+                factory.BookMethod(dataloader, TMVA.Types.kPyKeras, "PyKeras_" + rnn_types[i],
+                                   "!H:!V:VarTransform=None:FilenameModel=" + modelName + ":tf.keras:" +
+                                   "FilenameTrainedModel=" + trainedModelName + ":GpuOptions=allow_growth=True:" +
+                                   "NumEpochs=" + str(maxepochs) + ":BatchSize=" + str(batchSize))
+
+# Training All Methods
+#
+# Here we train all the previously booked methods.
+
+# Train all methods
+factory.TrainAllMethods()
+
+# Test all methods
+#
+# Now we test all methods using the test data set
+
+print("nthreads = " + str(ROOT.GetThreadPoolSize()) + "\n")
+
+# Evaluate all MVAs using the set of test events
+factory.TestAllMethods()
+
+# Evaluate all methods
+#
+# Here we evaluate all methods and compare their performances, computing efficiencies, ROC curves, etc., using both the training and testing data sets. Several histograms are produced which can be examined with TMVAGui or directly using the output file.
+
+# Evaluate and compare the performance of all configured MVAs
+factory.EvaluateAllMethods()
+
+# Plot ROC Curve
+# Here we retrieve the ROC curve and draw it.
+
+c1 = factory.GetROCCurve(dataloader)
+c1.Draw()
+
+# Close the Output File
+# Close the output file to save all output information (evaluation results of the methods); it can then be used by TMVAGui to display additional plots.
+
+if outputFile:
+    outputFile.Close()