-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrnn_nlp.py
More file actions
138 lines (106 loc) · 4.87 KB
/
rnn_nlp.py
File metadata and controls
138 lines (106 loc) · 4.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 2 19:31:49 2020
@author: Berylroll
Using the Tensorflow tutorials and twint to download matthew mercer's tweets and
train a 3 layer RNN on them to produce Matt Style Tweets
"""
import tensorflow as tf
import numpy as np
import os
#tf.enable_eager_execution()
def split_input_target(chunk):
    """Split a sequence into an (input, target) pair shifted by one step.

    The input is the chunk minus its last element; the target is the chunk
    minus its first element, so the model learns next-character prediction.
    """
    return chunk[:-1], chunk[1:]
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the character model: Embedding -> stateful GRU -> Dense logits.

    The Dense layer outputs raw logits over the vocabulary (no softmax);
    the loss function is expected to use from_logits=True.
    """
    layers = tf.keras.layers
    model = tf.keras.Sequential([
        layers.Embedding(vocab_size, embedding_dim,
                         batch_input_shape=[batch_size, None]),
        layers.GRU(rnn_units,
                   return_sequences=True,
                   stateful=True,
                   recurrent_initializer='glorot_uniform'),
        layers.Dense(vocab_size),
    ])
    return model
def loss(labels, logits):
    """Sparse categorical cross-entropy computed on raw (unnormalized) logits."""
    return tf.keras.losses.sparse_categorical_crossentropy(
        labels,
        logits,
        from_logits=True,
    )
def generate_text(model, start_string, text, num_generate=100, temperature=1, returntype='none'):
    """Generate characters from a trained model, seeded with start_string.

    Parameters:
        model: trained Keras model with batch size 1 (stateful GRU).
        start_string: seed text; every character must occur in `text`.
        text: the training corpus, used only to rebuild the char<->id vocab.
        num_generate: number of characters to sample.
        temperature: logit divisor; <1 is more conservative, >1 more random.
        returntype: 'str'  -> seed + generated as one string,
                    'list' -> seed + generated as a list of characters,
                    else   -> generated characters only, as a string.

    Returns: see `returntype` above.
    """
    # Rebuild the same vocabulary mapping used at training time.
    vocab = sorted(set(text))
    char2idx = {u: i for i, u in enumerate(vocab)}
    idx2char = np.array(vocab)

    # Encode the seed and add a batch dimension of 1.
    input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)
    text_generated = []

    # Clear the stateful GRU's carried-over state before sampling.
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Drop the batch dimension: (1, seq, vocab) -> (seq, vocab).
        predictions = tf.squeeze(predictions, 0)
        # Sample from the temperature-scaled categorical distribution,
        # taking the prediction for the last time step.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
        # Feed the sampled character back in; the GRU state carries context.
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx2char[predicted_id])

    if returntype == 'str':
        return start_string + ''.join(text_generated)
    elif returntype == 'list':
        # BUG FIX: the original did `start_string + text_generated`
        # (str + list), which raises TypeError. Return a character list.
        return list(start_string) + text_generated
    else:
        return ''.join(text_generated)
def run_rnn_nlp(text, suffix='suffix', EPOCHS=3):
    """Train the character RNN on `text` and return an inference model.

    Trains with batch size 4, checkpointing weights each epoch under
    ./training_checkpoints_<suffix>, then rebuilds the model with batch
    size 1 and loads the latest checkpoint for generation.

    Parameters:
        text: training corpus as a single string.
        suffix: tag appended to the checkpoint directory name.
        EPOCHS: number of training epochs.

    Returns: a batch-size-1 Keras model with trained weights loaded.
    """
    # Build the vocabulary and encode the corpus as integer ids.
    vocab = sorted(set(text))
    char2idx = {u: i for i, u in enumerate(vocab)}
    text_2_int = np.array([char2idx[c] for c in text])

    # Slice the corpus into (input, target) training sequences.
    seq_length = 100
    dataset = (tf.data.Dataset.from_tensor_slices(text_2_int)
               .batch(seq_length + 1, drop_remainder=True)
               .map(split_input_target))

    # Shuffle and batch; repeat() so steps_per_epoch can exceed one pass.
    BATCH_SIZE = 4
    BUFFER_SIZE = 10000
    dataset = (dataset.shuffle(BUFFER_SIZE)
               .batch(BATCH_SIZE, drop_remainder=True)
               .repeat())

    # Build and compile the training model.
    vocab_size_s = len(vocab)
    embedding_dim = 256
    rnn_units = 1024
    model = build_model(vocab_size=vocab_size_s,
                        embedding_dim=embedding_dim,
                        rnn_units=rnn_units,
                        batch_size=BATCH_SIZE)
    model.compile(optimizer='adam', loss=loss)

    # Checkpoint weights once per epoch.
    checkpoint_dir = './training_checkpoints_' + suffix
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_prefix, save_weights_only=True)

    # Roughly one pass over the corpus per epoch (seq_length=100 chars/step).
    steps = int(len(text) / (100 * BATCH_SIZE) - 1)
    model.fit(dataset, epochs=EPOCHS,
              callbacks=[checkpoint_callback],
              steps_per_epoch=steps)

    # Rebuild with batch size 1 for generation and load the trained weights.
    # expect_partial() silences warnings about optimizer state not restored.
    model = build_model(vocab_size_s, embedding_dim, rnn_units, batch_size=1)
    model.load_weights(tf.train.latest_checkpoint(checkpoint_dir)).expect_partial()
    model.build(tf.TensorShape([1, None]))
    # BUG FIX: the original fell off the end and returned None despite its
    # header comment promising to "return the model".
    return model
def prod_text_from_file(text, suffix='suffix', start_string='', length=100):
    """Rebuild the model from its latest checkpoint and generate text.

    Parameters:
        text: corpus used to reconstruct the vocabulary (must match training).
        suffix: tag identifying the checkpoint directory.
        start_string: seed text for generation.
        length: number of characters to generate.

    Returns: seed + generated characters as a single string.
    """
    # Model hyperparameters must match those used during training.
    embedding_dim = 256
    rnn_units = 1024
    vocab_size = len(set(text))

    # Restore the most recent checkpointed weights into a batch-size-1 model.
    checkpoint_dir = './training_checkpoints_' + suffix
    model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
    model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
    model.build(tf.TensorShape([1, None]))

    return generate_text(model, start_string, text, length, 1, 'str')