-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrnn_nlp.py
More file actions
138 lines (106 loc) · 4.87 KB
/
rnn_nlp.py
File metadata and controls
138 lines (106 loc) · 4.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 2 19:31:49 2020
@author: Berylroll
Using the Tensorflow tutorials and twint to download matthew mercer's tweets and
train a 3 layer RNN on them to produce Matt Style Tweets
"""
import tensorflow as tf
import numpy as np
import os
#tf.enable_eager_execution()
def split_input_target(chunk):
    """Split a sequence into an (input, target) pair shifted by one step.

    The input is the chunk minus its last element; the target is the chunk
    minus its first element, so the model learns next-character prediction.
    """
    return chunk[:-1], chunk[1:]
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the character model: Embedding -> stateful GRU -> Dense logits.

    The Dense layer outputs raw logits over the vocabulary (no softmax);
    the loss function is expected to use from_logits=True.
    """
    layers = tf.keras.layers
    model = tf.keras.Sequential([
        layers.Embedding(vocab_size, embedding_dim,
                         batch_input_shape=[batch_size, None]),
        layers.GRU(rnn_units,
                   return_sequences=True,
                   stateful=True,
                   recurrent_initializer='glorot_uniform'),
        layers.Dense(vocab_size),
    ])
    return model
def loss(labels, logits):
    """Sparse categorical cross-entropy computed on raw (unnormalized) logits."""
    return tf.keras.losses.sparse_categorical_crossentropy(
        labels,
        logits,
        from_logits=True,
    )
def generate_text(model, start_string, text, num_generate=100, temperature=1, returntype='none'):
    """Generate characters from a trained model, seeded with start_string.

    Parameters:
        model: trained Keras model with batch size 1 (stateful GRU).
        start_string: seed text; every character must occur in `text`.
        text: the training corpus, used only to rebuild the char<->id vocab.
        num_generate: number of characters to sample.
        temperature: logit divisor; <1 is more conservative, >1 more random.
        returntype: 'str'  -> seed + generated as one string,
                    'list' -> seed + generated as a list of characters,
                    else   -> generated characters only, as a string.

    Returns: see `returntype` above.
    """
    # Rebuild the same vocabulary mapping used at training time.
    vocab = sorted(set(text))
    char2idx = {u: i for i, u in enumerate(vocab)}
    idx2char = np.array(vocab)

    # Encode the seed and add a batch dimension of 1.
    input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)
    text_generated = []

    # Clear the stateful GRU's carried-over state before sampling.
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Drop the batch dimension: (1, seq, vocab) -> (seq, vocab).
        predictions = tf.squeeze(predictions, 0)
        # Sample from the temperature-scaled categorical distribution,
        # taking the prediction for the last time step.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
        # Feed the sampled character back in; the GRU state carries context.
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx2char[predicted_id])

    if returntype == 'str':
        return start_string + ''.join(text_generated)
    elif returntype == 'list':
        # BUG FIX: the original did `start_string + text_generated`
        # (str + list), which raises TypeError. Return a character list.
        return list(start_string) + text_generated
    else:
        return ''.join(text_generated)
def run_rnn_nlp(text, suffix='suffix', EPOCHS=3):
    """Train the character RNN on `text` and return an inference model.

    Trains with batch size 4, checkpointing weights each epoch under
    ./training_checkpoints_<suffix>, then rebuilds the model with batch
    size 1 and loads the latest checkpoint for generation.

    Parameters:
        text: training corpus as a single string.
        suffix: tag appended to the checkpoint directory name.
        EPOCHS: number of training epochs.

    Returns: a batch-size-1 Keras model with trained weights loaded.
    """
    # Build the vocabulary and encode the corpus as integer ids.
    vocab = sorted(set(text))
    char2idx = {u: i for i, u in enumerate(vocab)}
    text_2_int = np.array([char2idx[c] for c in text])

    # Slice the corpus into (input, target) training sequences.
    seq_length = 100
    dataset = (tf.data.Dataset.from_tensor_slices(text_2_int)
               .batch(seq_length + 1, drop_remainder=True)
               .map(split_input_target))

    # Shuffle and batch; repeat() so steps_per_epoch can exceed one pass.
    BATCH_SIZE = 4
    BUFFER_SIZE = 10000
    dataset = (dataset.shuffle(BUFFER_SIZE)
               .batch(BATCH_SIZE, drop_remainder=True)
               .repeat())

    # Build and compile the training model.
    vocab_size_s = len(vocab)
    embedding_dim = 256
    rnn_units = 1024
    model = build_model(vocab_size=vocab_size_s,
                        embedding_dim=embedding_dim,
                        rnn_units=rnn_units,
                        batch_size=BATCH_SIZE)
    model.compile(optimizer='adam', loss=loss)

    # Checkpoint weights once per epoch.
    checkpoint_dir = './training_checkpoints_' + suffix
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_prefix, save_weights_only=True)

    # Roughly one pass over the corpus per epoch (seq_length=100 chars/step).
    steps = int(len(text) / (100 * BATCH_SIZE) - 1)
    model.fit(dataset, epochs=EPOCHS,
              callbacks=[checkpoint_callback],
              steps_per_epoch=steps)

    # Rebuild with batch size 1 for generation and load the trained weights.
    # expect_partial() silences warnings about optimizer state not restored.
    model = build_model(vocab_size_s, embedding_dim, rnn_units, batch_size=1)
    model.load_weights(tf.train.latest_checkpoint(checkpoint_dir)).expect_partial()
    model.build(tf.TensorShape([1, None]))
    # BUG FIX: the original fell off the end and returned None despite its
    # header comment promising to "return the model".
    return model
def prod_text_from_file(text, suffix='suffix', start_string='', length=100):
    """Rebuild the model from its latest checkpoint and generate text.

    Parameters:
        text: corpus used to reconstruct the vocabulary (must match training).
        suffix: tag identifying the checkpoint directory.
        start_string: seed text for generation.
        length: number of characters to generate.

    Returns: seed + generated characters as a single string.
    """
    # Model hyperparameters must match those used during training.
    embedding_dim = 256
    rnn_units = 1024
    vocab_size = len(set(text))

    # Restore the most recent checkpointed weights into a batch-size-1 model.
    checkpoint_dir = './training_checkpoints_' + suffix
    model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
    model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
    model.build(tf.TensorShape([1, None]))

    return generate_text(model, start_string, text, length, 1, 'str')