
Commit 5d35aca

Initial commit of P3B2 to Release_01 branch.
1 parent 93501a5 commit 5d35aca

File tree

4 files changed, +372 -0 lines changed


Pilot3/P3B2/README.md

Lines changed: 61 additions & 0 deletions
## P3B2: RNN-LSTM: A Generative Model for Clinical Path Reports

**Overview**: Given a sample corpus of biomedical text such as clinical reports, build a deep learning network that can automatically generate synthetic text documents with valid clinical context.

**Relationship to core problem**: Labeled data is challenging to come by, particularly for patient data, since manual annotation is time consuming; hence, a core capability we intend to build is a “gold-standard” annotated dataset, generated by deep learning networks, for tuning our deep text comprehension applications.

**Expected Outcomes**: A generative RNN based on LSTMs that can effectively generate synthetic biomedical text with the desired clinical context.
### Benchmark Specs
#### Description of the Data
* Data source: Annotated pathology reports
* Input dimensions: 250,000-500,000 [characters], or 5,000-20,000 [bag of words], or 200-500 [bag of concepts]
* Output dimensions: Same as input
* Sample size: O(1,000)
* Notes on data balance and other issues: Standard NLP pre-processing is required, including (but not limited to) stemming, keyword extraction, text cleaning, and stop-word removal. Data balance is an issue, since the number of positive examples vs. controls is skewed.
#### Expected Outcomes
* A generative model for pathology reports
* Output range: N/A, since the outputs are actual text documents with known case descriptions/concepts
#### Evaluation Metrics
* Accuracy or loss function: Standard information-theoretic metrics such as the log-likelihood score, minimum description length score, and AIC/BIC, measuring how similar the generated documents are to actual ones (a minimal log-likelihood sketch is shown below)
* Expected performance of a naïve method: Latent Dirichlet allocation (LDA) models
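As a concrete illustration of such a metric, the sketch below computes the average negative log-likelihood, in bits per character, of a document under a trained baseline model. This is an illustrative sketch rather than part of the benchmark code: the function name `bits_per_char` is hypothetical, and it assumes the one-hot character encoding and blank-primed sliding window used in `keras_p3b2_baseline.py`.

```
import numpy as np

def bits_per_char(model, text, char_indices, maxlen):
    # average negative log2-likelihood per character of `text` under a
    # trained character-level model (lower is better); assumes ' ' is in
    # the vocabulary, matching the baseline's blank-primed window
    n_chars = len(char_indices)
    window = ' ' * maxlen
    total = 0.0
    for c in text:
        x = np.zeros((1, maxlen, n_chars))
        for t, ch in enumerate(window):
            x[0, t, char_indices[ch]] = 1.0
        preds = model.predict(x, verbose=0)[0]
        total += -np.log2(max(float(preds[char_indices[c]]), 1e-12))
        window = window[1:] + c
    return total / len(text)
```

Lower scores mean the model assigns higher probability to the document; comparing scores on actual vs. generated reports gives one of the similarity measures described above.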
#### Description of the Network
* Proposed network architecture: LSTM with at least one layer, with 256-character windows
* Number of layers: At least two hidden layers, with one input and one output sequence

A graphical representation of the architecture is shown here.

![CB-RNN Architecture](https://raw.githubusercontent.com/ECP-CANDLE/Benchmarks/master/Pilot3/P3B2/images/RNN1.png)
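For reference, the layer stack described above corresponds to the construction loop in `keras_p3b2_baseline.py`. The following condensed sketch (the helper name `build_char_lstm` is illustrative, with defaults taken from `p3b2_default_model.txt`) shows the topology: every LSTM layer except the last returns full sequences, and a dense softmax layer predicts the next character.

```
from keras.models import Sequential
from keras.layers import LSTM, Dense, Activation

def build_char_lstm(maxlen, n_chars, rnn_size=256, n_layers=1,
                    dropout=0.0, recurrent_dropout=0.0):
    # stacked character-level LSTM; all layers except the last one
    # return full sequences so the next layer sees a sequence input
    model = Sequential()
    for k in range(n_layers):
        ret_seq = k < n_layers - 1
        if k == 0:
            model.add(LSTM(rnn_size, input_shape=(maxlen, n_chars),
                           return_sequences=ret_seq,
                           dropout=dropout, recurrent_dropout=recurrent_dropout))
        else:
            model.add(LSTM(rnn_size, return_sequences=ret_seq,
                           dropout=dropout, recurrent_dropout=recurrent_dropout))
    model.add(Dense(n_chars))         # logits over the character vocabulary
    model.add(Activation('softmax'))  # next-character distribution
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    return model
```

With the default configuration (`n_layers = 1`, `rnn_size = 256`), this reduces to a single LSTM layer followed by a softmax over the character vocabulary, matching the example JSON checkpoint shown later in this README.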
#### Annotated Keras Code
Data loader, preprocessing, basic training and cross-validation, prediction and evaluation on test data.
### Running the baseline implementation
The data file provided here is a compressed pickle file (.tgz extension). Before running the code, use:
```
cd P3B2
tar -xzf data.pkl.tgz
```
to unpack the archive. Note that the training data is provided as a single pickle file. The code is documented with enough information to reproduce these files.
After uncompressing the data file, you can run:
```
python keras_p3b2_baseline.py
```
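Assuming the standard CANDLE argument handling, the hyperparameters declared in `p3b2.py` and `p3b2_default_model.txt` can typically be overridden on the command line, for example:
```
python keras_p3b2_baseline.py --epochs 5 --rnn_size 128 --temperature 0.8
```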
The original data from the pathology reports cannot be made available online. Hence, we have pre-processed the reports so that example training/testing sets can be generated. Contact [email protected] for more information on generating additional training and testing data. A generic data loader that generates training and testing sets will be provided in the near future.

### Example output
#### Checkpointing and model saving
At each iteration of the training process, the model is saved both as an HDF5 (.h5) file and as a JSON file. An example model (in JSON format) is shown below.
```
{"class_name": "Sequential", "keras_version": "1.1.0", "config": [{"class_name": "LSTM", "config": {"inner_activation": "hard_sigmoid", "trainable": true, "inner_init": "orthogonal", "output_dim": 256, "unroll": false, "consume_less": "cpu", "init": "glorot_uniform", "dropout_U": 0.0, "input_dtype": "float32", "batch_input_shape": [null, 20, 99], "input_length": null, "dropout_W": 0.0, "activation": "tanh", "stateful": false, "b_regularizer": null, "U_regularizer": null, "name": "lstm_1", "go_backwards": false, "input_dim": 99, "return_sequences": false, "W_regularizer": null, "forget_bias_init": "one"}}, {"class_name": "Dense", "config": {"W_constraint": null, "b_constraint": null, "name": "dense_1", "activity_regularizer": null, "trainable": true, "init": "glorot_uniform", "bias": true, "input_dim": null, "b_regularizer": null, "W_regularizer": null, "activation": "linear", "output_dim": 99}}, {"class_name": "Activation", "config": {"activation": "softmax", "trainable": true, "name": "activation_1"}}]}
```
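A checkpoint can be restored by pairing the JSON topology with the matching HDF5 weights. Below is a minimal sketch; the file names are placeholders for the `model_<epoch>_<loss>.json`/`.h5` pairs the baseline writes.
```
from keras.models import model_from_json

# restore the topology from JSON, then load the matching weights
with open('model_1_2.345678.json') as f:   # placeholder file name
    model = model_from_json(f.read())
model.load_weights('model_1_2.345678.h5')  # placeholder file name

# recompile before further training or evaluation
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
```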

#### Sample text generated
The model generates text files that are stored as `example_<epoch>_<text-number>.txt` within a separate folder. An example output may look like this:
```
----- Generating with seed: "Diagnosis"
DiagnosisWZing Pathology Laboratory is certified under this report. **NAME[M. SSS dessDing Adientation of the tissue is submitted in the same container labeled with the patient's name and designated 'subcarinal lymph node is submitted in toto in cassette A1. B. Received in formalin labeled "right lower outer quadrant; A11-A10 - slice 16 with a cell block and submitted in cassette A1. B. Received fresh for
```

Pilot3/P3B2/p3b2.py

Lines changed: 53 additions & 0 deletions
from __future__ import print_function

import os
import sys
import argparse

file_path = os.path.dirname(os.path.realpath(__file__))
lib_path2 = os.path.abspath(os.path.join(file_path, '..', '..', 'common'))
sys.path.append(lib_path2)

import candle_keras as candle

additional_definitions = [
    {'name': 'rnn_size',
     'action': 'store',
     'type': int,
     'help': 'size of LSTM internal state'},
    {'name': 'n_layers',
     'action': 'store',
     'type': int,
     'help': 'number of layers in the LSTM'},
    {'name': 'do_sample',
     'type': candle.str2bool,
     'help': 'generate synthesized text'},
    {'name': 'temperature',
     'action': 'store',
     'type': float,
     'help': 'variability of text synthesis'},
    {'name': 'primetext',
     'action': 'store',
     'help': 'seed string for text synthesis'},
    {'name': 'length',
     'action': 'store',
     'type': int,
     'help': 'length of synthesized text'},
]

required = ['train_data', 'rnn_size', 'epochs', 'n_layers',
            'learning_rate', 'drop', 'recurrent_dropout',
            'temperature', 'primetext', 'length']


class BenchmarkP3B2(candle.Benchmark):

    def set_locals(self):
        """Functionality to set variables specific for the benchmark
        - required: set of required parameters for the benchmark.
        - additional_definitions: list of dictionaries describing the additional parameters for the benchmark.
        """
        if required is not None:
            self.required = set(required)
        if additional_definitions is not None:
            self.additional_definitions = additional_definitions

Pilot3/P3B2/keras_p3b2_baseline.py

Lines changed: 240 additions & 0 deletions
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.optimizers import RMSprop
import numpy as np
import os

import datetime
import pickle

import argparse
import sys

import p3b2 as bmk
import candle_keras as candle

def initialize_parameters():

    # Build benchmark object
    p3b2Bmk = bmk.BenchmarkP3B2(bmk.file_path, 'p3b2_default_model.txt', 'keras',
        prog='p3b2_baseline', desc='Generative RNN-LSTM for clinical pathology reports - Pilot 3 Benchmark 2')

    # Initialize parameters
    gParameters = candle.initialize_parameters(p3b2Bmk)
    #bmk.logger.info('Params: {}'.format(gParameters))

    return gParameters

class LossHistory( keras.callbacks.Callback ):
    def on_train_begin( self, logs= {} ):
        self.losses = []

    def on_batch_end( self, batch, logs= {} ):
        self.losses.append( logs.get( 'loss' ) )


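# temperature rescales the log-probabilities inside sample() before they are
# renormalized: values below 1.0 concentrate probability mass on the most
# likely characters, values above 1.0 flatten the distribution toward
# uniform, and 1.0 leaves the model's distribution unchanged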
def sample( preds, temperature= 1.0 ):
    # helper function to sample an index from a probability array
    preds = np.asarray( preds ).astype( 'float64' )
    preds = np.log( preds ) / temperature
    exp_preds = np.exp( preds )
    preds = exp_preds / np.sum( exp_preds )
    probas = np.random.multinomial( 1, preds, 1 )
    return np.argmax( probas )


def run(gParameters, data_path):

    kerasDefaults = candle.keras_default_config()

    rnn_size = gParameters['rnn_size']
    n_layers = gParameters['n_layers']
    learning_rate = gParameters['learning_rate']
    dropout = gParameters['drop']
    recurrent_dropout = gParameters['recurrent_dropout']
    n_epochs = gParameters['epochs']
    data_train = data_path + '/data.pkl'
    verbose = gParameters['verbose']
    savedir = gParameters['output_dir']
    do_sample = gParameters['do_sample']
    temperature = gParameters['temperature']
    primetext = gParameters['primetext']
    length = gParameters['length']

    # load data from pickle
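    # the pickle stream holds, in order: the class labels, the character
    # vocabulary, the char->index and index->char maps, the window length
    # (maxlen), the sampling stride (step), and the integer-encoded
    # input/target arrays (X_ind, y_ind)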
    f = open( data_train, 'rb' )

    if ( sys.version_info > ( 3, 0 ) ):
        classes = pickle.load( f, encoding= 'latin1' )
        chars = pickle.load( f, encoding= 'latin1' )
        char_indices = pickle.load( f, encoding= 'latin1' )
        indices_char = pickle.load( f, encoding= 'latin1' )

        maxlen = pickle.load( f, encoding= 'latin1' )
        step = pickle.load( f, encoding= 'latin1' )

        X_ind = pickle.load( f, encoding= 'latin1' )
        y_ind = pickle.load( f, encoding= 'latin1' )
    else:
        classes = pickle.load( f )
        chars = pickle.load( f )
        char_indices = pickle.load( f )
        indices_char = pickle.load( f )

        maxlen = pickle.load( f )
        step = pickle.load( f )

        X_ind = pickle.load( f )
        y_ind = pickle.load( f )

    f.close()

    [ s1, s2 ] = X_ind.shape
    print( X_ind.shape )
    print( y_ind.shape )
    print( maxlen )
    print( len( chars ) )

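    # expand the integer-encoded sequences into one-hot tensors: X[i] is a
    # (maxlen x vocabulary-size) window of characters, and y[i] is the
    # one-hot encoding of the character that follows that window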
    X = np.zeros( ( s1, s2, len( chars ) ), dtype=np.bool_ )
    y = np.zeros( ( s1, len( chars ) ), dtype=np.bool_ )

    for i in range( s1 ):
        for t in range( s2 ):
            X[ i, t, X_ind[ i, t ] ] = 1
        y[ i, y_ind[ i ] ] = 1

    # build the model: a (possibly stacked) LSTM
    if verbose:
        print( 'Build model...' )

    model = Sequential()

    # for rnn_size in rnn_sizes:
    for k in range( n_layers ):
        if k < n_layers - 1:
            ret_seq = True
        else:
            ret_seq = False

        if k == 0:
            model.add( LSTM( rnn_size, input_shape= ( maxlen, len( chars ) ), return_sequences= ret_seq,
                dropout= dropout, recurrent_dropout= recurrent_dropout ) )
        else:
            model.add( LSTM( rnn_size, dropout= dropout, recurrent_dropout= recurrent_dropout, return_sequences= ret_seq ) )

    model.add( Dense( len( chars ) ) )
    model.add( Activation( gParameters['activation'] ) )

    optimizer = candle.build_optimizer(gParameters['optimizer'],
                                       gParameters['learning_rate'],
                                       kerasDefaults)

    model.compile( loss= gParameters['loss'], optimizer= optimizer )

    if verbose:
        model.summary()

    for iteration in range( 1, n_epochs + 1 ):
        if verbose:
            print()
            print('-' * 50)
            print('Iteration', iteration)

        history = LossHistory()
        model.fit( X, y, batch_size= 100, epochs= 1, callbacks= [ history ] )

        loss = history.losses[ -1 ]
        if verbose:
            print( loss )

        dirname = savedir
        if len( dirname ) > 0 and not dirname.endswith( '/' ):
            dirname = dirname + '/'

        if not os.path.exists( dirname ):
            os.makedirs( dirname )

        # serialize model to JSON
        model_json = model.to_json()
        with open( dirname + "/model_" + str( iteration ) + "_" + "{:f}".format( loss ) + ".json", "w" ) as json_file:
            json_file.write( model_json )

        # serialize weights to HDF5
        model.save_weights( dirname + "/model_" + str( iteration ) + "_" + "{:f}".format( loss ) + ".h5" )

        if verbose:
            print( "Checkpoint saved." )

        if do_sample:
            outtext = open( dirname + "/example_" + str( iteration ) + "_" + "{:f}".format( loss ) + ".txt", "w", encoding= 'utf-8' )

            diversity = temperature

            outtext.write('----- diversity:' + str( diversity ) + "\n" )

            generated = ''
            seedstr = primetext

            outtext.write('----- Generating with seed: "' + seedstr + '"' + "\n" )

            sentence = " " * maxlen

            # class_index = 0
            generated += sentence
            outtext.write( generated )

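            # prime the network on the seed string one character at a time;
            # predictions made during priming are discarded, and only the
            # sliding window (sentence) and the output text advance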
            for c in seedstr:
                sentence = sentence[1:] + c
                x = np.zeros( ( 1, maxlen, len( chars ) ) )
                for t, char in enumerate(sentence):
                    x[ 0, t, char_indices[ char ] ] = 1.

                preds = model.predict( x, verbose= verbose )[ 0 ]
                next_index = sample( preds, diversity )
                next_char = indices_char[ next_index ]

                generated += c

                outtext.write( c )

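            # generate `length` characters: one-hot encode the current window,
            # sample the next character at the chosen temperature, then slide
            # the window forward by one character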
            for i in range( length ):
                x = np.zeros( ( 1, maxlen, len( chars ) ) )
                for t, char in enumerate( sentence ):
                    x[ 0, t, char_indices[ char ] ] = 1.

                preds = model.predict( x, verbose= verbose )[ 0 ]
                next_index = sample( preds, diversity )
                next_char = indices_char[ next_index ]

                generated += next_char
                sentence = sentence[ 1 : ] + next_char

            if (sys.version_info > (3, 0)):
                outtext.write( generated + '\n' )
            else:
                outtext.write( generated.decode('utf-8') + '\n' )

            outtext.close()


if __name__ == "__main__":

    gParameters = initialize_parameters()

    origin = gParameters['data_url']
    train_data = gParameters['train_data']
    data_loc = candle.fetch_file(origin+train_data, untar=True, md5_hash=None, subdir='Pilot3')

    print( 'Data downloaded and stored at: ' + data_loc )
    data_path = os.path.dirname(data_loc)
    print( data_path )

    run(gParameters, data_path)

Pilot3/P3B2/p3b2_default_model.txt

Lines changed: 18 additions & 0 deletions
[Global_Params]
data_url = 'http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P3B2/'
train_data = 'P3B2_data.tgz'
model_name = 'p3b2'
rnn_size = 256
epochs = 10
n_layers = 1
learning_rate = 0.01
drop = 0.0
recurrent_dropout = 0.0
loss = 'categorical_crossentropy'
activation = 'softmax'
optimizer = 'rmsprop'
temperature = 1.0
primetext = 'Diagnosis'
length = 1000
do_sample = True
verbose = True
