"""
char-rnn.py
Character-level RNN built with reference to @karpathy's gist: https://gist.github.com/karpathy/d4dee566867f8291f086
"""
import numpy as np
np.random.seed(1)
# data I/O
with open('datasets/shakespearean.txt') as f:
    data = f.read()
chars = list(set(data))  # list of all unique characters in the data
data_size, vocab_size = len(data), len(chars)
print(f'{data_size} chars in data; {vocab_size} unique chars in vocab')
index_to_char = {i: ch for i, ch in enumerate(chars)}  # dict mapping each index to its character
char_to_index = {ch: i for i, ch in enumerate(chars)}  # and the reverse, character to index
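# Quick round-trip check (an illustrative addition, not in the original): encoding a
# character to its index and decoding it back should recover the original character.
assert index_to_char[char_to_index[data[0]]] == data[0]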
# Initialising hyperparams
hidden_units = 120
seq_length = 25
alpha = 0.03
# Initialising params
Wxh = np.random.randn(hidden_units, vocab_size) * 0.01  # input to hidden; the 0.01 scale keeps pre-tanh activations small, avoiding early saturation (and overflow in the softmax's exp)
Whh = np.random.randn(hidden_units, hidden_units) * 0.01  # hidden to hidden (left to right through time)
Why = np.random.randn(vocab_size, hidden_units) * 0.01  # hidden to output
bh = np.zeros((hidden_units, 1))
by = np.zeros((vocab_size, 1))
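# Shape sanity check (an illustrative addition, not in the original): one step maps a
# (vocab_size, 1) one-hot input through Wxh and Whh to a (hidden_units, 1) hidden
# state, then through Why to a (vocab_size, 1) logit vector.
assert Wxh.shape == (hidden_units, vocab_size)
assert Whh.shape == (hidden_units, hidden_units)
assert Why.shape == (vocab_size, hidden_units)
assert bh.shape == (hidden_units, 1) and by.shape == (vocab_size, 1)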
def computeModel(inputs, targets, h_prev):
    """
    Arguments
    inputs - a list of integers corresponding to the sequence of input characters
    targets - a list of integers corresponding to the sequence of target (correct) characters
    h_prev - the initial hidden state, i.e. the final hidden state of the previous sequence
    Returns
    The loss, the gradients, and the most recent hidden state (rightmost)
    """
    x_state, h_state, y_state, prob_state = {}, {}, {}, {}  # track the input, hidden, output, and probability states at each timestep
    loss = 0
    h_state[-1] = np.copy(h_prev)  # copy the last hidden state from the previous sequence of inputs, as the new initial hidden state
    # Forward pass
    for t, char_index in enumerate(inputs):
        # initialise a one-hot vector for the given character
        x_state[t] = np.zeros((vocab_size, 1))
        x_state[t][char_index] = 1
        h_state[t] = np.tanh(np.dot(Wxh, x_state[t]) + np.dot(Whh, h_state[t - 1]) + bh)  # computing hidden state (post-activation)
        y_state[t] = np.dot(Why, h_state[t]) + by
        prob_state[t] = np.exp(y_state[t]) / np.sum(np.exp(y_state[t]))  # softmax classification
        loss += -np.log(prob_state[t][targets[t], 0])  # cross-entropy loss on the target character
    # Backward pass
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dh_next = np.zeros_like(h_state[0])  # gradient flowing in from the hidden state one step to the right, required in backprop
    for t in reversed(range(len(inputs))):
        dy = np.copy(prob_state[t])
        dy[targets[t]] -= 1  # backprop into y: the softmax + cross-entropy gradient is (p - 1) at the target index. See derivation
        dWhy += np.dot(dy, h_state[t].T)
        dby += dy
        dh = np.dot(Why.T, dy) + dh_next  # add the gradient from the hidden state to the right, since it also contributes
        dhraw = (1 - h_state[t] * h_state[t]) * dh  # hraw = Wxh * x<t> + Whh * h<t-1> + bh, i.e. right before the tanh activation
        dbh += dhraw
        dWxh += np.dot(dhraw, x_state[t].T)
        dWhh += np.dot(dhraw, h_state[t - 1].T)
        dh_next = np.dot(Whh.T, dhraw)  # gradient passed left to h<t-1>, which is the rightmost hidden state of the next (earlier) step
    for d in [dWxh, dWhh, dWhy, dbh, dby]:  # gradient clipping to prevent exploding gradients
        np.clip(d, -5, 5, out=d)
    return loss, dWxh, dWhh, dWhy, dbh, dby, h_state[len(inputs) - 1]  # the final hidden state is used as the initial h_state for the next sequence of letters
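# Numerical gradient check (a sketch added here, not part of the original gist):
# compare the analytic gradients from computeModel against centered finite
# differences at a few randomly chosen parameter entries. Note that the clipping
# inside computeModel can mask very large analytic gradients, so treat this as a
# rough sanity check; call it manually on a short sequence if needed.
def gradCheck(inputs, targets, h_prev, num_checks=5, eps=1e-5):
    _, dWxh, dWhh, dWhy, dbh, dby, _ = computeModel(inputs, targets, h_prev)
    for param, dparam, name in zip([Wxh, Whh, Why, bh, by],
                                   [dWxh, dWhh, dWhy, dbh, dby],
                                   ['Wxh', 'Whh', 'Why', 'bh', 'by']):
        for _ in range(num_checks):
            ix = int(np.random.randint(param.size))
            old = param.flat[ix]
            param.flat[ix] = old + eps
            loss_plus = computeModel(inputs, targets, h_prev)[0]
            param.flat[ix] = old - eps
            loss_minus = computeModel(inputs, targets, h_prev)[0]
            param.flat[ix] = old  # restore the original value
            grad_numerical = (loss_plus - loss_minus) / (2 * eps)
            grad_analytic = dparam.flat[ix]
            denom = abs(grad_numerical) + abs(grad_analytic)
            rel_error = abs(grad_analytic - grad_numerical) / denom if denom > 0 else 0.0
            print(f'{name}: analytic {grad_analytic:e}, numerical {grad_numerical:e}, relative error {rel_error:e}')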
def sample(h, seed_index, n):
    """
    Arguments
    h - most recent hidden state
    seed_index - the index of the seed character
    n - the number of characters to sample
    Returns
    A list of integers representing the selected character indices
    """
    character_indices = []  # stores all the characters selected by the model
    x = np.zeros((vocab_size, 1))  # instantiate the one-hot vector of the first seed character
    x[seed_index] = 1
    for t in range(n):
        # single forward pass to sample the next character, given the previous
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
        y = np.dot(Why, h) + by
        prob = np.exp(y) / np.sum(np.exp(y))
        ix = np.random.choice(range(vocab_size), p=prob.ravel())
        x = np.zeros((vocab_size, 1))  # instantiate the next character's one-hot vector
        x[ix] = 1
        character_indices.append(ix)  # save the character selected at timestep t
    return character_indices
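# Illustrative usage (an addition, not in the original gist): sampling with the
# untrained weights and a zero hidden state yields near-uniform gibberish, which
# makes a useful baseline to compare the in-training samples against.
untrained = sample(np.zeros((hidden_units, 1)), char_to_index[data[0]], 50)
print('Untrained sample: ', ''.join(index_to_char[ix] for ix in untrained))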
pointer, n = 0, 0  # the current index into the dataset (marks the start of the next subsequence) and the number of iterations
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)  # memory of squared grads, used in the Adagrad update
mbh, mby = np.zeros_like(bh), np.zeros_like(by)
smooth_loss = -np.log(1.0 / vocab_size) * seq_length  # loss at iteration 0: uniform predictions over the vocab for seq_length steps
while n <= 50000:
    if (pointer + seq_length >= len(data)) or (n == 0):
        h_prev = np.zeros((hidden_units, 1))  # clear the RNN memory
        pointer = 0  # go back to the start of the data
    # obtain inputs and targets (the targets are the inputs shifted one character to the right)
    inputs = [char_to_index[ch] for ch in data[pointer: pointer + seq_length]]
    targets = [char_to_index[ch] for ch in data[pointer + 1: pointer + seq_length + 1]]
    if n % 1000 == 0:
        # sample the model
        sampled_indices = sample(h_prev, inputs[0], 200)
        text = ''.join(index_to_char[ix] for ix in sampled_indices)
        print(f'Sampled text:\n{text}\n')
    loss, dWxh, dWhh, dWhy, dbh, dby, h_prev = computeModel(inputs, targets, h_prev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001  # exponentially weighted average of the loss to reduce volatility
    if n % 100 == 0:
        print(f'At iteration {n}, loss of {smooth_loss}.')
    # Adagrad update
    for param, grad, mem in zip([Wxh, Whh, Why, bh, by], [dWxh, dWhh, dWhy, dbh, dby], [mWxh, mWhh, mWhy, mbh, mby]):
        mem += grad * grad
        param += -alpha * grad / np.sqrt(mem + 1e-8)
    if n % 50000 == 0 and n != 0:
        # simple check on the grads; this tends to show vanishing gradients on many parameters, especially dWxh
        print('Grads: ', dWxh, dWhh, dWhy)
    n += 1
    pointer += seq_length
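# Final sample after training (an addition, not in the original gist): draw a longer
# sequence from the trained parameters, seeded with the first character of the
# dataset and the most recent hidden state carried out of the training loop.
final_text = ''.join(index_to_char[ix] for ix in sample(h_prev, char_to_index[data[0]], 500))
print('Final sample:\n' + final_text)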