From 481f5768cfc8c603fb0b3cedfaf6514f0ae228af Mon Sep 17 00:00:00 2001 From: Chris Cummins Date: Sat, 29 Aug 2020 01:26:55 +0100 Subject: [PATCH 1/2] Add a keras_preprocessing package dependency. github.com/ChrisCummins/ProGraML/issues/124 --- requirements.txt | 1 + third_party/py/keras_preprocessing/BUILD | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) create mode 100644 third_party/py/keras_preprocessing/BUILD diff --git a/requirements.txt b/requirements.txt index 3baff297e..122ac582c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,7 @@ decorator==4.3.0 gast==0.2.2. # Dependency of tensorflow. GPUtil==1.4.0 Keras==2.3.1 +keras_preprocessing >= 1.1.1, < 1.2 kiwisolver==1.0.1 # Needed by matplotlib. joblib>=0.16.0 # Needed by scikit-learn matplotlib==2.2.0rc1 diff --git a/third_party/py/keras_preprocessing/BUILD b/third_party/py/keras_preprocessing/BUILD new file mode 100644 index 000000000..d9fe238bd --- /dev/null +++ b/third_party/py/keras_preprocessing/BUILD @@ -0,0 +1,16 @@ +# A wrapper around pip package to pull in undeclared dependencies. + +load("@programl_requirements//:requirements.bzl", "requirement") + +package(default_visibility = ["//visibility:public"]) + +licenses(["notice"]) # MIT + +py_library( + name = "keras_preprocessing", + srcs = ["//third_party/py:empty.py"], + deps = [ + requirement("keras_preprocessing"), + "//third_party/py/numpy", + ], +) From 29771429a5063a78c29348d2c0c8bf24e1ce4ab0 Mon Sep 17 00:00:00 2001 From: Chris Cummins Date: Sat, 29 Aug 2020 01:26:55 +0100 Subject: [PATCH 2/2] WIP: Rewrite LSTM in PyTorch. github.com/ChrisCummins/ProGraML/issues/124 --- programl/models/lstm/BUILD | 2 +- programl/models/lstm/lstm.py | 282 ++++++++++--------- programl/models/lstm/lstm_batch.py | 4 +- programl/task/dataflow/BUILD | 13 + programl/task/dataflow/lstm_batch_builder.py | 33 ++- programl/task/dataflow/train_lstm.py | 5 +- 6 files changed, 184 insertions(+), 155 deletions(-) diff --git a/programl/models/lstm/BUILD b/programl/models/lstm/BUILD index 7c9c47677..b1b643a16 100644 --- a/programl/models/lstm/BUILD +++ b/programl/models/lstm/BUILD @@ -27,7 +27,7 @@ py_library( "//programl/proto:epoch_py", "//third_party/py/labm8", "//third_party/py/numpy", - "//third_party/py/tensorflow", + "//third_party/py/torch", ], ) diff --git a/programl/models/lstm/lstm.py b/programl/models/lstm/lstm.py index 95688e15d..8b894cad9 100644 --- a/programl/models/lstm/lstm.py +++ b/programl/models/lstm/lstm.py @@ -19,12 +19,15 @@ from typing import Any, Dict, List import numpy as np -import tensorflow as tf +import torch from labm8.py import app from labm8.py.progress import NullContext, ProgressContext +from torch import nn, optim from programl.models.batch_data import BatchData from programl.models.batch_results import BatchResults +from programl.models.ggnn.loss import Loss +from programl.models.ggnn.node_embeddings import NodeEmbeddings from programl.models.lstm.lstm_batch import LstmBatchData from programl.models.model import Model from programl.proto import epoch_pb2 @@ -57,19 +60,24 @@ "The value used for the positive class in the 1-hot selector embedding " "vectors. Has no effect when selector embeddings are not used.", ) -app.DEFINE_boolean( - "cudnn_lstm", - True, - "If set, use CuDNNLSTM implementation when a GPU is available. Else use " - "default Keras implementation. Note that the two implementations are " - "incompatible - a model saved using one LSTM type cannot be restored using " - "the other LSTM type.", -) app.DEFINE_float("learning_rate", 0.001, "The mode learning rate.") app.DEFINE_boolean( "trainable_embeddings", True, "Whether the embeddings are trainable." ) +# Embeddings options. +app.DEFINE_string( + "text_embedding_type", + "random", + "The type of node embeddings to use. One of " + "{constant_zero, constant_random, random}.", +) +app.DEFINE_integer( + "text_embedding_dimensionality", + 32, + "The dimensionality of node text embeddings.", +) + class Lstm(Model): """An LSTM model for node-level classification.""" @@ -78,106 +86,56 @@ def __init__( self, vocabulary: Dict[str, int], node_y_dimensionality: int, + graph_y_dimensionality: int, + graph_x_dimensionality: int, + use_selector_embeddings: bool, test_only: bool = False, name: str = "lstm", ): """Constructor.""" - super(Lstm, self).__init__( - test_only=test_only, vocabulary=vocabulary, name=name - ) + super().__init__(test_only=test_only, vocabulary=vocabulary, name=name) self.vocabulary = vocabulary self.node_y_dimensionality = node_y_dimensionality + self.graph_y_dimensionality = graph_y_dimensionality + self.graph_x_dimensionality = graph_x_dimensionality + self.node_selector_dimensionality = 2 if use_selector_embeddings else 0 # Flag values. self.batch_size = FLAGS.batch_size self.padded_sequence_length = FLAGS.padded_sequence_length - # Reset any previous Tensorflow session. This is required when running - # consecutive LSTM models in the same process. - tf.compat.v1.keras.backend.clear_session() - - @staticmethod - def MakeLstmLayer(*args, **kwargs): - """Construct an LSTM layer. - - If a GPU is available and --cudnn_lstm, this will use NVIDIA's fast - CuDNNLSTM implementation. Else it will use Keras' builtin LSTM, which is - much slower but works on CPU. - """ - if FLAGS.cudnn_lstm and tf.compat.v1.test.is_gpu_available(): - return tf.compat.v1.keras.layers.CuDNNLSTM(*args, **kwargs) - else: - return tf.compat.v1.keras.layers.LSTM(*args, **kwargs, implementation=1) - - def CreateKerasModel(self) -> tf.compat.v1.keras.Model: - """Construct the tensorflow computation graph.""" - vocab_ids = tf.compat.v1.keras.layers.Input( - batch_shape=( - self.batch_size, - self.padded_sequence_length, + self.model = LstmModel( + node_embeddings=NodeEmbeddings( + node_embeddings_type=FLAGS.text_embedding_type, + use_selector_embeddings=self.node_selector_dimensionality, + selector_embedding_value=FLAGS.selector_embedding_value, + embedding_shape=( + # Add one to the vocabulary size to account for the out-of-vocab token. + len(vocabulary) + 1, + FLAGS.text_embedding_dimensionality, + ), ), - dtype="int32", - name="sequence_in", - ) - embeddings = tf.compat.v1.keras.layers.Embedding( - input_dim=len(self.vocabulary) + 2, - input_length=self.padded_sequence_length, - output_dim=FLAGS.hidden_size, - name="embedding", - trainable=FLAGS.trainable_embeddings, - )(vocab_ids) - - selector_vectors = tf.compat.v1.keras.layers.Input( - batch_shape=(self.batch_size, self.padded_sequence_length, 2), - dtype="float32", - name="selector_vectors", - ) - - lang_model_input = tf.compat.v1.keras.layers.Concatenate( - axis=2, name="embeddings_and_selector_vectorss" - )( - [embeddings, selector_vectors], - ) - - # Recurrent layers. - lang_model = self.MakeLstmLayer( - FLAGS.hidden_size, return_sequences=True, name="lstm_1" - )(lang_model_input) - lang_model = self.MakeLstmLayer( - FLAGS.hidden_size, - return_sequences=True, - return_state=False, - name="lstm_2", - )(lang_model) - - # Dense layers. - for i in range(1, FLAGS.hidden_dense_layer_count + 1): - lang_model = tf.compat.v1.keras.layers.Dense( - FLAGS.hidden_size, - activation="relu", - name=f"dense_{i}", - )(lang_model) - node_out = tf.compat.v1.keras.layers.Dense( - self.node_y_dimensionality, - activation="sigmoid", - name="node_out", - )(lang_model) - - model = tf.compat.v1.keras.Model( - inputs=[vocab_ids, selector_vectors], - outputs=[node_out], - ) - model.compile( - optimizer=tf.compat.v1.keras.optimizers.Adam( - learning_rate=FLAGS.learning_rate + loss=Loss( + num_classes=self.node_y_dimensionality, + has_aux_input=self.has_aux_input, + intermediate_loss_weight=None, # NOTE(cec): Intentionally broken. + class_prevalence_weighting=False, ), - metrics=["accuracy"], - loss=["categorical_crossentropy"], - loss_weights=[1.0], + padded_sequence_length=self.padded_sequence_length, + learning_rate=FLAGS.learning_rate, + test_only=test_only, + hidden_size=FLAGS.hidden_size, + hidden_dense_layer_count=FLAGS.hidden_dense_layer_count, ) - return model + @property + def num_classes(self) -> int: + return self.node_y_dimensionality or self.graph_y_dimensionality + + @property + def has_aux_input(self) -> bool: + return self.graph_x_dimensionality > 0 def CreateModelData(self, test_only: bool) -> None: """Initialize an LSTM model. This is called during Initialize().""" @@ -209,24 +167,43 @@ def RunBatch( self.batch_size, self.padded_sequence_length, ), model_data.encoded_sequences.shape - assert model_data.selector_vectors.shape == ( + assert model_data.selector_ids.shape == ( self.batch_size, self.padded_sequence_length, - 2, - ), model_data.selector_vectors.shape + ), model_data.selector_ids.shape x = [model_data.encoded_sequences, model_data.selector_vectors] y = [model_data.node_labels] if epoch_type == epoch_pb2.TRAIN: - loss, *_ = self.model.train_on_batch(x, y) + if not self.model.training: + self.model.train() + targets, logits = self.model( + model_data.encoded_sequences, + model_data.selector_ids, + model_data.node_labels, + ) else: - loss = None + if self.model.training: + self.model.eval() + self.model.opt.zero_grad() + # Inference only, don't trace the computation graph. + with torch.no_grad(): + targets, logits = self.model( + model_data.encoded_sequences, + model_data.selector_ids, + model_data.node_labels, + ) + + loss = self.model.loss((logits, None), targets) - padded_predictions = self.model.predict_on_batch(x) + if epoch_type == epoch_pb2.TRAIN: + loss.backward() + self.model.opt.step() + self.model.opt.zero_grad() # Reshape the outputs. - predictions = self.ReshapePaddedModelOutput(batch_data, padded_predictions) + predictions = self.ReshapePaddedModelOutput(batch_data, outputs) # Flatten the targets and predictions lists so that we can compare them. # Shape (batch_node_count, node_y_dimensionality). @@ -234,9 +211,10 @@ def RunBatch( predictions = np.concatenate(predictions) return BatchResults.Create( - targets=targets, - predictions=predictions, - loss=loss, + targets=model_data.node_labels, + predictions=logits.detach().cpu().numpy(), + learning_rate=self.model.learning_rate, + loss=loss.item(), ) def ReshapePaddedModelOutput( @@ -282,36 +260,74 @@ def ReshapePaddedModelOutput( def GetModelData(self) -> Any: """Get the model state.""" - # According to https://keras.io/getting-started/faq/, it is not recommended - # to pickle a Keras model. So as a workaround, I use Keras's saving - # mechanism to store the weights, and pickle that. - with tempfile.TemporaryDirectory(prefix="lstm_pickle_") as d: - path = pathlib.Path(d) / "weights.h5" - self.model.save(path) - with open(path, "rb") as f: - model_data = f.read() - return model_data + return { + "model_state_dict": self.model.state_dict(), + "optimizer_state_dict": self.model.opt.state_dict(), + "scheduler_state_dict": self.model.scheduler.state_dict(), + } def LoadModelData(self, data_to_load: Any) -> None: """Restore the model state.""" - # Load the weights from a file generated by ModelDataToSave(). - with tempfile.TemporaryDirectory(prefix="lstm_pickle_") as d: - path = pathlib.Path(d) / "weights.h5" - with open(path, "wb") as f: - f.write(data_to_load) - - # The default TF graph is finalized in Initialize(), so we must - # first reset the session and create a new graph. - tf.compat.v1.reset_default_graph() - SetAllowedGrowthOnKerasSession() - - self.model = tf.compat.v1.keras.models.load_model(path) - - -def SetAllowedGrowthOnKerasSession(): - """Allow growth on GPU for Keras.""" - config = tf.compat.v1.ConfigProto() - config.gpu_options.allow_growth = True - session = tf.compat.v1.Session(config=config) - tf.compat.v1.keras.backend.set_session(session) - return session + self.model.load_state_dict(data_to_load["model_state_dict"]) + # only restore opt if needed. opt should be None o/w. + if not self.test_only: + self.model.opt.load_state_dict(data_to_load["optimizer_state_dict"]) + self.model.scheduler.load_state_dict(data_to_load["scheduler_state_dict"]) + + +class LstmModel(nn.Module): + def __init__( + self, + node_embeddings: NodeEmbeddings, + loss: Loss, + padded_sequence_length: int, + test_only: bool, + learning_rate: float, + hidden_size: int, + hidden_dense_layer_count: int, # TODO(cec): Implement. + ): + super().__init__() + self.node_embeddings = node_embeddings + self.loss = loss + self.padded_sequence_length = padded_sequence_length + self.learning_rate = learning_rate + self.hidden_size = hidden_size + self.learning_rate = learning_rate + + self.lstm = nn.LSTM( + self.node_embeddings.embedding_dimensionality + 2, + self.hidden_size, + ) + self.hidden2label = nn.Linear(self.hidden_size, 2) + + if test_only: + self.opt = None + self.eval() + else: + self.opt = optim.AdamW(self.parameters(), lr=self.learning_rate) + + def forward( + self, + encoded_sequences, + selector_ids, + node_labels, + ): + print("SHAPES", encoded_sequences.shape, selector_ids.shape, node_labels.shape) + + encoded_sequences = torch.tensor(encoded_sequences, dtype=torch.long) + selector_ids = torch.tensor(selector_ids, dtype=torch.long) + node_labels = torch.tensor(node_labels, dtype=torch.long) + + # Embed and concatenate sequences and selector vectors. + embeddings = self.node_embeddings(encoded_sequences, selector_ids) + + lstm_out, _ = self.lstm( + embeddings.view(self.padded_sequence_length, len(encoded_sequences), -1) + ) + print(lstm_out.shape) + + label_space = self.hidden2label(lstm_out.view(self.padded_sequence_length, -1)) + logits = F.log_softmax(label_space, dim=2) + + targets = node_labels + return logits, targets diff --git a/programl/models/lstm/lstm_batch.py b/programl/models/lstm/lstm_batch.py index bcac7f619..f9bf3a417 100644 --- a/programl/models/lstm/lstm_batch.py +++ b/programl/models/lstm/lstm_batch.py @@ -31,8 +31,8 @@ class LstmBatchData(NamedTuple): # Shape (batch_size, padded_sequence_length, 1), dtype np.int32 encoded_sequences: np.array - # Shape (batch_size, padded_sequence_length, 2), dtype np.int32 - selector_vectors: np.array + # Shape (batch_size, padded_sequence_length, 1), dtype np.int32 + selector_ids: np.array # Shape (batch_size, padded_sequence_length, node_y_dimensionality), # dtype np.float32 node_labels: np.array diff --git a/programl/task/dataflow/BUILD b/programl/task/dataflow/BUILD index 8ad5bbb00..749c50485 100644 --- a/programl/task/dataflow/BUILD +++ b/programl/task/dataflow/BUILD @@ -117,6 +117,7 @@ py_library( "//programl/models:base_graph_loader", "//programl/models:batch_data", "//programl/models/lstm:lstm_batch", + "//third_party/py/keras_preprocessing", "//third_party/py/labm8", "//third_party/py/numpy", ], @@ -178,6 +179,18 @@ py_binary( ], ) +py_test( + name = "train_lstm_test", + srcs = ["train_lstm_test.py"], + data = [ + "//programl/test/data:reachability_dataflow_dataset", + ], + deps = [ + ":train_lstm", + "//third_party/py/labm8", + ], +) + py_library( name = "vocabulary", srcs = ["vocabulary.py"], diff --git a/programl/task/dataflow/lstm_batch_builder.py b/programl/task/dataflow/lstm_batch_builder.py index 26a807af1..34c495bd1 100644 --- a/programl/task/dataflow/lstm_batch_builder.py +++ b/programl/task/dataflow/lstm_batch_builder.py @@ -17,7 +17,7 @@ from typing import Dict, Optional import numpy as np -import tensorflow as tf +from keras_preprocessing.sequence import pad_sequences from labm8.py import app from programl.graph.format.py import graph_serializer @@ -50,12 +50,12 @@ def __init__( # Mutable state. self.graph_node_sizes = [] self.vocab_ids = [] - self.selector_vectors = [] + self.selector_ids = [] self.targets = [] # Padding values. self._vocab_id_pad = len(self.vocabulary) + 1 - self._selector_vector_pad = np.zeros((0, 2), dtype=np.int32) + self._selector_id_pad = 0 self._node_label_pad = np.zeros((0, self.node_y_dimensionality), dtype=np.int32) # Call super-constructor last since it starts the worker thread. @@ -74,14 +74,16 @@ def _Build(self) -> BatchData: self.vocab_ids += [ np.array([self._vocab_id_pad], dtype=np.int32) ] * pad_count - self.selector_vectors += [self._selector_vector_pad] * pad_count + self.selector_ids += [ + np.array([self._selector_id_pad], dtype=np.int32) + ] * pad_count self.targets += [self._node_label_pad] * pad_count batch = BatchData( graph_count=len(self.graph_node_sizes), model_data=LstmBatchData( graph_node_sizes=np.array(self.graph_node_sizes, dtype=np.int32), - encoded_sequences=tf.compat.v1.keras.preprocessing.sequence.pad_sequences( + encoded_sequences=pad_sequences( self.vocab_ids, maxlen=self.padded_sequence_length, dtype="int32", @@ -89,15 +91,15 @@ def _Build(self) -> BatchData: truncating="post", value=self._vocab_id_pad, ), - selector_vectors=tf.compat.v1.keras.preprocessing.sequence.pad_sequences( - self.selector_vectors, + selector_ids=pad_sequences( + self.selector_ids, maxlen=self.padded_sequence_length, - dtype="float32", + dtype="int32", padding="pre", truncating="post", - value=np.zeros(2, dtype=np.float32), + value=self._selector_id_pad, ), - node_labels=tf.compat.v1.keras.preprocessing.sequence.pad_sequences( + node_labels=pad_sequences( self.targets, maxlen=self.padded_sequence_length, dtype="float32", @@ -113,7 +115,7 @@ def _Build(self) -> BatchData: # Reset mutable state. self.graph_node_sizes = [] self.vocab_ids = [] - self.selector_vectors = [] + self.selector_ids = [] self.targets = [] return batch @@ -139,7 +141,7 @@ def OnItem(self, item) -> Optional[BatchData]: ) for n in node_list ] - selector_values = np.array( + selector_ids = np.array( [ features.node_features.feature_list["data_flow_root_node"] .feature[n] @@ -148,10 +150,7 @@ def OnItem(self, item) -> Optional[BatchData]: ], dtype=np.int32, ) - selector_vectors = np.zeros((selector_values.size, 2), dtype=np.float32) - selector_vectors[ - np.arange(selector_values.size), selector_values - ] = FLAGS.selector_embedding_value + # TODO: FLAGS.selector_embedding_value targets = np.array( [ features.node_features.feature_list["data_flow_value"] @@ -171,7 +170,7 @@ def OnItem(self, item) -> Optional[BatchData]: self.graph_node_sizes.append(len(node_list)) self.vocab_ids.append(vocab_ids) - self.selector_vectors.append(selector_vectors) + self.selector_ids.append(selector_ids) self.targets.append(targets_1hot) if len(self.graph_node_sizes) >= self.batch_size: diff --git a/programl/task/dataflow/train_lstm.py b/programl/task/dataflow/train_lstm.py index 6f3ee3d5b..55c891dea 100644 --- a/programl/task/dataflow/train_lstm.py +++ b/programl/task/dataflow/train_lstm.py @@ -114,6 +114,9 @@ def TrainDataflowLSTM( vocabulary=vocab, test_only=False, node_y_dimensionality=2, + graph_y_dimensionality=0, + graph_x_dimensionality=0, + use_selector_embeddings=True, ) if restore_from: @@ -132,8 +135,6 @@ def TrainDataflowLSTM( model.Initialize() start_epoch_step, start_graph_cumsum = 1, 0 - model.model.summary() - # Create training batches and split into epochs. epochs = EpochBatchIterator( MakeBatchBuilder(