-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain.py
More file actions
122 lines (97 loc) · 3.82 KB
/
train.py
File metadata and controls
122 lines (97 loc) · 3.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
sys.path.append("./model")
import torch
from create_test_dataset import CreateTestDataset
from model.transformer import Transformer
from torch import nn
from loss.masked_NLL import masked_NLLLoss
from metric.masked_accuracy import masked_accuracy
# =============================================================================
# hyperparameters and dataset sizes
# =============================================================================
# Sequence and vocabulary sizes.
max_len = 30  # forwarded to Transformer as ``max_len``
ntoken = 15   # first Transformer argument; the dataset uses ``ntoken - 1`` as max_num

# Transformer hyperparameters (passed to the constructor in this order).
Nx = 1        # NOTE(review): presumably the number of stacked layers -- confirm
d_model = 120
num_heads = 1
d_ff = 240

# Dataset sizes are whole multiples of the batch size, so every batch is full.
batch_size = 64
N_train = batch_size * 700
N_test = batch_size * 300
# =============================================================================
# check available computing device
# =============================================================================
# Pick the first available backend in priority order: CUDA, then Apple MPS,
# and finally fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")
# =============================================================================
# define model and load it to computing device
# =============================================================================
# Build the network from the hyperparameters above, then move it to the
# selected computing device.
model = Transformer(
    ntoken,
    Nx,
    d_model,
    num_heads,
    max_len=max_len,
    d_ff=d_ff,
)
model = model.to(device)
# =============================================================================
# define dataset and get train, test dataloader
# =============================================================================
# Synthetic dataset generator; it hands back one dataloader for training and
# one for evaluation.
c = CreateTestDataset(
    N_train,
    N_test,
    10,  # NOTE(review): presumably the source sequence length -- confirm
    max_num=ntoken - 1,
    batch_size=batch_size,
)
train_dataloader, test_dataloader = c.generate_torch_dataloader()
# =============================================================================
# training loop
# =============================================================================
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one training epoch over ``dataloader`` using teacher forcing.

    For each batch the target ``y`` is split into a decoder input (every
    position except the last) and an expected output (every position except
    the first). The model predicts the expected output from the source ``X``
    and the decoder input, and the optimizer takes one step on the loss.

    Args:
        dataloader: Iterable of ``(X, y)`` batches that also exposes a sized
            ``dataset`` attribute (used only for progress reporting).
        model: Module called as ``model(X, y_input)``.
        loss_fn: Callable ``loss_fn(pred, y_output)`` returning a scalar
            tensor that supports ``backward()``.
        optimizer: Optimizer that updates the parameters of ``model``.

    Returns:
        None. The loss is printed every 100 batches.
    """
    size = len(dataloader.dataset)
    # test() leaves the model in eval mode at the end of every epoch, so
    # training mode must be restored here; otherwise every epoch after the
    # first would train with dropout/normalisation layers in inference mode.
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Shift the target: the decoder sees tokens [0, T-1) and has to
        # predict tokens [1, T).
        y_input = y[:, :-1]
        y_output = y[:, 1:]

        # Compute prediction and loss.
        pred = model(X, y_input)
        # Swap the last two axes before the loss.
        # NOTE(review): presumably (batch, seq, class) -> (batch, class, seq),
        # the layout masked_NLLLoss expects -- confirm against the loss.
        pred = pred.permute(0, 2, 1)
        loss = loss_fn(pred, y_output)

        # Backpropagation.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            # ``current`` is the number of samples seen so far, taking the
            # size of the current batch as the batch size.
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
def test(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` and print accuracy and mean loss.

    The model is switched to eval mode and gradients are disabled. The loss
    is averaged over the number of batches; the accumulated result of
    ``masked_accuracy`` is divided by the number of samples in the dataset.
    NOTE(review): that normalisation is only right if ``masked_accuracy``
    returns a per-batch count of correct samples -- confirm.

    Args:
        dataloader: Iterable of ``(X, y)`` batches with a sized ``dataset``.
        model: Module called as ``model(X, y_input)``.
        loss_fn: Callable ``loss_fn(pred, y_output)`` returning a scalar tensor.
    """
    n_samples = len(dataloader.dataset)
    n_batches = len(dataloader)
    model.eval()

    test_loss = 0
    correct = 0
    with torch.no_grad():
        for source, target in dataloader:
            source = source.to(device)
            target = target.to(device)
            # Same target shift as in training: feed all tokens but the last,
            # score against all tokens but the first.
            decoder_in, expected = target[:, :-1], target[:, 1:]
            scores = model(source, decoder_in).permute(0, 2, 1)
            test_loss += loss_fn(scores, expected).item()
            correct += masked_accuracy(scores, expected)

    test_loss /= n_batches
    correct /= n_samples
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
# Training configuration: number of passes over the data, the objective, and
# plain SGD over all model parameters.
epochs = 10
loss_fn = masked_NLLLoss
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Each epoch is one full training pass followed by an evaluation pass.
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)
print("Done!")
# =============================================================================
# sample prediction
# =============================================================================
# Predict the token that follows the decoder prefix ``y`` for one
# hand-written source sequence ``x``; no gradients are needed here.
with torch.no_grad():
    x = torch.tensor(
        [[1, 14, 5, 9, 7, 14, 3, 7, 3, 2]],
        dtype=torch.int64,
        device=device,
    )
    y = torch.tensor([[1, 14]], dtype=torch.int64, device=device)
    pred = model(x, y)
    # Highest-scoring index along axis 2, taken at the last decoder position.
    next_token = pred.argmax(dim=2)[:, -1]
    print(next_token)