182 changes: 118 additions & 64 deletions examples/nlp/masked_language_modeling.py
@@ -1,11 +1,15 @@
"""
Title: End-to-end Masked Language Modeling with BERT
Author: [Ankur Singh](https://twitter.com/ankur310794)
Date created: 2020/09/18
Last modified: 2024/03/15
Description: Implement a Masked Language Model (MLM) with BERT and fine-tune it on the IMDB Reviews dataset.
Accelerator: GPU
Converted to Keras 3 by: [Sitam Meur](https://github.com/sitamgithub-MSIT)
# End-to-end Masked Language Modeling with BERT

**Author:** [Ankur Singh](https://twitter.com/ankur310794)<br>
**Date created:** 2020/09/18<br>
**Last modified:** 2024/05/05<br>
**Description:** Implement a Masked Language Model (MLM) with BERT and fine-tune it on
the IMDB Reviews dataset.<br>
**Accelerator:** GPU<br>
**Converted to Keras 3 by:** [Sitam Meur](https://github.com/sitamgithub-MSIT)<br>
**Converted to Keras 3 Backend-Agnostic by:** [Mrutyunjay Biswal](https://twitter.com/LearnStochastic)<br>
"""

"""
@@ -32,35 +36,43 @@
train it with the masked language modeling task,
and then fine-tune this model on a sentiment classification task.

We will use the Keras `TextVectorization` and `MultiHeadAttention` layers
We will use the Keras `TextVectorization` and `MultiHeadAttention` layers, and the
`PositionEmbedding` layer from `keras-nlp`,
to create a BERT Transformer-Encoder network architecture.

Note: This example should be run with `tf-nightly`.
Note: This example is backend-agnostic, i.e. you can set the Keras backend to `"tensorflow"`,
`"torch"`, or `"jax"` as shown in the code below, and it will run with no other code changes.
"""

"""
## Setup

Install `tf-nightly` via `pip install tf-nightly`.
"""

# install keras 3.x and keras-nlp
# !pip install --upgrade keras keras-nlp

import os

# set backend ["tensorflow", "jax", "torch"]
os.environ["KERAS_BACKEND"] = "tensorflow"
**Contributor:** If it works with all backends, you can just remove this line.

import keras_nlp
import keras
import tensorflow as tf
from keras import layers
from keras.layers import TextVectorization
from dataclasses import dataclass
import pandas as pd
import numpy as np
import glob

import re
import glob
import numpy as np
import pandas as pd
from pathlib import Path
from pprint import pprint
from dataclasses import dataclass

import keras
from keras import ops
from keras import layers

import keras_nlp
import tensorflow as tf

"""
## Set-up Configuration
## Configuration
"""


@@ -74,19 +86,43 @@ class Config:
NUM_HEAD = 8 # used in bert model
FF_DIM = 128 # used in bert model
NUM_LAYERS = 1
NUM_EPOCHS = 1
STEPS_PER_EPOCH = 2


config = Config()

"""
## Load the data
## Download the Data: IMDB Movie Review Sentiment Classification

Download the IMDB data and load it into a Pandas DataFrame.
"""

fpath = keras.utils.get_file(
origin="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
)
dirpath = Path(fpath).parent.absolute()
_ = os.system(f"tar -xf {fpath} -C {dirpath}")

We will first download the IMDB data and load it into a Pandas dataframe.
"""
The `aclImdb` folder contains `train` and `test` subfolders:
"""

_ = os.system(f"ls {dirpath}/aclImdb")
_ = os.system(f"ls {dirpath}/aclImdb/train")
_ = os.system(f"ls {dirpath}/aclImdb/test")

"""
We are only interested in the `pos` and `neg` subfolders, so let's delete the rest:
"""

_ = os.system(f"rm -r {dirpath}/aclImdb/train/unsup")
_ = os.system(f"rm -r {dirpath}/aclImdb/train/*.feat")
_ = os.system(f"rm -r {dirpath}/aclImdb/train/*.txt")
_ = os.system(f"rm -r {dirpath}/aclImdb/test/*.feat")
_ = os.system(f"rm -r {dirpath}/aclImdb/test/*.txt")

"""shell
curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
tar -xf aclImdb_v1.tar.gz
"""
Let's read the dataset from the text files to a DataFrame.
"""


@@ -100,9 +136,10 @@ def get_text_list_from_files(files):


def get_data_from_text_files(folder_name):
pos_files = glob.glob("aclImdb/" + folder_name + "/pos/*.txt")

pos_files = glob.glob(f"{dirpath}/aclImdb/" + folder_name + "/pos/*.txt")
pos_texts = get_text_list_from_files(pos_files)
neg_files = glob.glob("aclImdb/" + folder_name + "/neg/*.txt")
neg_files = glob.glob(f"{dirpath}/aclImdb/" + folder_name + "/neg/*.txt")
neg_texts = get_text_list_from_files(neg_files)
df = pd.DataFrame(
{
@@ -117,15 +154,17 @@ def get_data_from_text_files(folder_name):
train_df = get_data_from_text_files("train")
test_df = get_data_from_text_files("test")

all_data = train_df.append(test_df)
all_data = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)
assert len(all_data) != 0, "all_data is empty"

"""
## Dataset preparation

We will use the `TextVectorization` layer to vectorize the text into integer token ids.
It transforms a batch of strings into either
a sequence of token indices (one sample = 1D array of integer token indices, in order)
or a dense representation (one sample = 1D array of float values encoding an unordered set of tokens).
or a dense representation (one sample = 1D array of float values encoding an unordered
set of tokens).
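
As a quick illustration of the `"int"` output mode, here is a toy sketch (hypothetical corpus and sizes, not part of this diff):

```python
import keras
from keras import layers

# A tiny hypothetical corpus, just to show the layer's behavior.
toy_texts = ["the movie was great", "the plot was thin"]
toy_vectorizer = layers.TextVectorization(
    max_tokens=50, output_mode="int", output_sequence_length=6
)
toy_vectorizer.adapt(toy_texts)
# Each sample becomes a 1D array of integer token indices, in order.
print(toy_vectorizer(["the movie was thin"]))
```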

Below, we define 3 preprocessing functions.

@@ -156,7 +195,7 @@ def get_vectorize_layer(texts, vocab_size, max_seq, special_tokens=["[MASK]"]):
Returns:
layers.Layer: the adapted `TextVectorization` Keras layer
"""
vectorize_layer = TextVectorization(
vectorize_layer = layers.TextVectorization(
max_tokens=vocab_size,
output_mode="int",
standardize=custom_standardization,
Expand All @@ -179,14 +218,15 @@ def get_vectorize_layer(texts, vocab_size, max_seq, special_tokens=["[MASK]"]):
)

# Get mask token id for masked language model
mask_token_id = vectorize_layer(["[mask]"]).numpy()[0][0]
mask_token_id = ops.convert_to_numpy(vectorize_layer(["[mask]"])[0][0])


def encode(texts):
encoded_texts = vectorize_layer(texts)
return encoded_texts.numpy()
return ops.convert_to_numpy(encoded_texts)


# TODO: make this backend-agnostic
def get_masked_input_and_labels(encoded_texts):
# 15% BERT masking
inp_mask = np.random.rand(*encoded_texts.shape) < 0.15
@@ -213,7 +253,7 @@ def get_masked_input_and_labels(encoded_texts):
)

# Prepare sample_weights to pass to .fit() method
sample_weights = np.ones(labels.shape)
sample_weights = np.ones(labels.shape, dtype="float32")
sample_weights[labels == -1] = 0

# y_labels will be the same as encoded_texts, i.e. the input tokens
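
For reference, the scheme above follows BERT: roughly 15% of token positions are selected, most of those are replaced by the `[mask]` id, unselected positions get a label of -1, and the -1 labels are zeroed out of the loss via `sample_weights`. A standalone toy sketch of that core rule (hypothetical ids, simplified relative to the full function):

```python
import numpy as np

rng = np.random.default_rng(42)
encoded = np.array([[5, 12, 7, 3, 9, 2]])  # toy token ids
MASK_ID = 99  # hypothetical [mask] token id

inp_mask = rng.random(encoded.shape) < 0.15  # select ~15% of positions
labels = -1 * np.ones(encoded.shape, dtype=int)
labels[inp_mask] = encoded[inp_mask]  # keep true ids only at selected spots

masked = np.copy(encoded)
# Replace 90% of the selected positions with the mask id (the rest stay as-is).
masked[inp_mask & (rng.random(encoded.shape) < 0.90)] = MASK_ID

sample_weights = np.ones(labels.shape, dtype="float32")
sample_weights[labels == -1] = 0.0  # unselected positions contribute no loss
```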
@@ -264,18 +304,18 @@ def get_masked_input_and_labels(encoded_texts):
"""


def bert_module(query, key, value, i):
def bert_module(query, key, value, layer_num):
# Multi headed self-attention
attention_output = layers.MultiHeadAttention(
num_heads=config.NUM_HEAD,
key_dim=config.EMBED_DIM // config.NUM_HEAD,
name="encoder_{}_multiheadattention".format(i),
name=f"encoder_{layer_num}_multiheadattention",
)(query, key, value)
attention_output = layers.Dropout(0.1, name="encoder_{}_att_dropout".format(i))(
attention_output = layers.Dropout(0.1, name=f"encoder_{layer_num}_att_dropout")(
attention_output
)
attention_output = layers.LayerNormalization(
epsilon=1e-6, name="encoder_{}_att_layernormalization".format(i)
epsilon=1e-6, name=f"encoder_{layer_num}_att_layernormalization"
)(query + attention_output)

# Feed-forward layer
Expand All @@ -284,14 +324,14 @@ def bert_module(query, key, value, i):
layers.Dense(config.FF_DIM, activation="relu"),
layers.Dense(config.EMBED_DIM),
],
name="encoder_{}_ffn".format(i),
name=f"encoder_{layer_num}_ffn",
)
ffn_output = ffn(attention_output)
ffn_output = layers.Dropout(0.1, name="encoder_{}_ffn_dropout".format(i))(
ffn_output = layers.Dropout(0.1, name=f"encoder_{layer_num}_ffn_dropout")(
ffn_output
)
sequence_output = layers.LayerNormalization(
epsilon=1e-6, name="encoder_{}_ffn_layernormalization".format(i)
epsilon=1e-6, name=f"encoder_{layer_num}_ffn_layernormalization"
)(attention_output + ffn_output)
return sequence_output
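
A quick shape sanity check for the module (a sketch, assuming the `config` and imports defined above):

```python
# Random embeddings standing in for real inputs.
x = keras.random.normal((2, config.MAX_LEN, config.EMBED_DIM))
y = bert_module(x, x, x, layer_num=0)  # self-attention: query = key = value
print(y.shape)  # expected: (2, config.MAX_LEN, config.EMBED_DIM)
```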

@@ -336,37 +376,48 @@ def metrics(self):


def create_masked_language_bert_model():
**Author:** Here, shall I subclass the `MaskedLanguageModel` instead, and get rid of the tf custom training step?

**Contributor:** I believe you should be able to remove the `train_step()` and instead implement the `compute_loss()` method. If you write the `compute_loss()` method using only `keras.ops` functions, it will work with all backends.

**Author:** Given that no additional logic is required here, and we are using `SparseCategoricalCrossentropy` as the loss function, does it require an override for `compute_loss()`?
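
(For reference, a backend-agnostic `compute_loss()` override along the lines of this suggestion could look like the sketch below; the exact signature may vary across Keras 3 versions.)

```python
import keras
from keras import ops


class MaskedLanguageModel(keras.Model):
    def compute_loss(self, x=None, y=None, y_pred=None, sample_weight=None):
        # Per-token crossentropy written with keras.ops, so it runs on the
        # TensorFlow, JAX, and PyTorch backends alike.
        loss = keras.losses.sparse_categorical_crossentropy(y, y_pred)
        if sample_weight is not None:
            loss = loss * sample_weight
            return ops.sum(loss) / (ops.sum(sample_weight) + 1e-7)
        return ops.mean(loss)
```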

inputs = layers.Input((config.MAX_LEN,), dtype="int64")
inputs = layers.Input((config.MAX_LEN,), dtype="int32")

word_embeddings = layers.Embedding(
config.VOCAB_SIZE, config.EMBED_DIM, name="word_embedding"
)(inputs)

position_embeddings = keras_nlp.layers.PositionEmbedding(
sequence_length=config.MAX_LEN
)(word_embeddings)

embeddings = word_embeddings + position_embeddings

encoder_output = embeddings
for i in range(config.NUM_LAYERS):
encoder_output = bert_module(encoder_output, encoder_output, encoder_output, i)
for layer in range(config.NUM_LAYERS):
encoder_output = bert_module(
encoder_output, encoder_output, encoder_output, layer
)

mlm_output = layers.Dense(config.VOCAB_SIZE, name="mlm_cls", activation="softmax")(
encoder_output
)
mlm_model = MaskedLanguageModel(inputs, mlm_output, name="masked_bert_model")
mlm_model = keras.Model(inputs, mlm_output, name="masked_bert_model")

optimizer = keras.optimizers.Adam(learning_rate=config.LR)
mlm_model.compile(optimizer=optimizer)
mlm_model.compile(optimizer=optimizer, loss=loss_fn)
return mlm_model


bert_masked_model = create_masked_language_bert_model()
bert_masked_model.summary()

"""
## Define a Callback to Generate Masked Token Predictions
"""

id2token = dict(enumerate(vectorize_layer.get_vocabulary()))
token2id = {y: x for x, y in id2token.items()}


class MaskedTextGenerator(keras.callbacks.Callback):
def __init__(self, sample_tokens, top_k=5):
self.sample_tokens = sample_tokens
self.sample_tokens = ops.convert_to_numpy(sample_tokens)
self.k = top_k

def decode(self, tokens):
@@ -378,20 +429,20 @@ def convert_ids_to_tokens(self, id):
def on_epoch_end(self, epoch, logs=None):
prediction = self.model.predict(self.sample_tokens)

masked_index = np.where(self.sample_tokens == mask_token_id)
masked_index = masked_index[1]
masked_index = ops.where(self.sample_tokens == mask_token_id)
masked_index = ops.convert_to_numpy(masked_index[1])[0]
mask_prediction = prediction[0][masked_index]

top_indices = mask_prediction[0].argsort()[-self.k :][::-1]
values = mask_prediction[0][top_indices]
top_indices = mask_prediction.argsort()[-self.k :][::-1]
values = mask_prediction[top_indices]

for i in range(len(top_indices)):
p = top_indices[i]
v = values[i]
tokens = np.copy(sample_tokens[0])
tokens[masked_index[0]] = p
tokens = np.copy(self.sample_tokens[0])
tokens[masked_index] = p
result = {
"input_text": self.decode(sample_tokens[0].numpy()),
"input_text": self.decode(self.sample_tokens[0]),
"prediction": self.decode(tokens),
"probability": v,
"predicted mask token": self.convert_ids_to_tokens(p),
@@ -400,25 +451,28 @@ def on_epoch_end(self, epoch, logs=None):


sample_tokens = vectorize_layer(["I have watched this [mask] and it was awesome"])
generator_callback = MaskedTextGenerator(sample_tokens.numpy())

bert_masked_model = create_masked_language_bert_model()
bert_masked_model.summary()
generator_callback = MaskedTextGenerator(sample_tokens)

"""
## Train and Save
"""

bert_masked_model.fit(mlm_ds, epochs=5, callbacks=[generator_callback])
bert_masked_model.fit(
mlm_ds,
epochs=config.NUM_EPOCHS,
steps_per_epoch=config.STEPS_PER_EPOCH,
callbacks=[generator_callback],
)
bert_masked_model.save("bert_mlm_imdb.keras")

"""
## Fine-tune a sentiment classification model

We will fine-tune our self-supervised model on a downstream task of sentiment classification.
To do this, let's create a classifier by adding a pooling layer and a `Dense` layer on top of the
We will fine-tune our self-supervised model on a downstream task of sentiment
classification.
To do this, let's create a classifier by adding a pooling layer and a `Dense` layer on
top of the
pretrained BERT features.

"""

# Load pretrained bert model
@@ -465,7 +519,7 @@ def create_classifier_bert_model():
)
classifer_model.fit(
train_classifier_ds,
epochs=5,
epochs=config.NUM_EPOCHS,
validation_data=test_classifier_ds,
)

@@ -481,7 +535,7 @@ def create_classifier_bert_model():


def get_end_to_end(model):
inputs_string = keras.Input(shape=(1,), dtype="string")
inputs_string = layers.Input(shape=(1,), dtype="string")
indices = vectorize_layer(inputs_string)
outputs = model(indices)
end_to_end_model = keras.Model(inputs_string, outputs, name="end_to_end_model")
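
Usage is then as simple as calling the wrapped model on raw strings, reusing the `classifer_model` trained above (a sketch, assuming `get_end_to_end` returns the compiled model):

```python
end_to_end_model = get_end_to_end(classifer_model)
probs = end_to_end_model.predict(
    ["this movie was terrible", "loved every minute of it"]
)
```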