-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain.py
More file actions
146 lines (115 loc) · 4.71 KB
/
train.py
File metadata and controls
146 lines (115 loc) · 4.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import math
import os
import torch
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from transformers import AdamW, get_scheduler
from data_loader import get_data_loaders
from model_config import create_model
# Configuration
NUM_EPOCHS = 3  # full passes over the training loader
BATCH_SIZE = 32  # NOTE(review): not referenced in the visible part of this file
MAX_LEN = 512  # forwarded to get_data_loaders()
GRADIENT_ACCUMULATION_STEPS = 4  # micro-batches accumulated per optimizer update
LEARNING_RATE = 5e-5
SAVE_PATH = "./code_generator_model"  # parent directory of the per-epoch checkpoints
LOG_DIR = "./tensorboard_logs"  # TensorBoard event-file directory

# Check for GPU/Device: prefer CUDA, then Apple MPS, and fall back to the CPU.
# The hasattr() guard keeps this importable on torch builds without an MPS backend.
if torch.cuda.is_available():
    DEVICE = "cuda"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"
def calculate_perplexity(loss):
    """
    Convert a loss value into perplexity, i.e. ``exp(loss)``.

    Any loss that is not strictly below 300 is reported as infinity
    instead of being exponentiated.
    """
    if loss < 300:
        return math.exp(loss)
    return float("inf")
def train_one_epoch(epoch, model, train_loader, optimizer, scheduler, gradient_accumulation_steps, writer):
    """
    Train the model for one epoch using gradient accumulation.

    Args:
        epoch: Zero-based epoch index; used in the progress-bar label and for
            TensorBoard step numbering.
        model: Callable as ``model(input_ids=..., attention_mask=...,
            labels=...)`` returning an object with a ``.loss`` tensor.
        train_loader: Sized iterable of batches indexable by ``"input_ids"``
            and ``"attention_mask"`` (and, optionally, ``"labels"``).
        optimizer: Optimizer, stepped once per accumulation window.
        scheduler: Learning-rate scheduler, stepped once per optimizer update.
        gradient_accumulation_steps: Number of micro-batches whose gradients
            are accumulated before each optimizer update.
        writer: Object exposing ``add_scalar(tag, value, step)``.

    Returns:
        Tuple ``(avg_loss, perplexity)``; ``avg_loss`` is the mean *unscaled*
        per-batch loss over the epoch.
    """
    model.train()
    num_batches = len(train_loader)
    total_loss = 0.0  # sum of unscaled batch losses over the epoch
    window_loss = 0.0  # sum of unscaled batch losses in the current accumulation window
    window_batches = 0  # micro-batches seen in the current accumulation window

    for step, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch + 1} Training")):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        # Validation reads batch["labels"]; use the same targets here when the
        # batch carries them so both losses measure the same objective.
        # Fall back to the inputs themselves otherwise.
        label_key = "labels" if "labels" in batch else "input_ids"
        labels = batch[label_key].to(DEVICE)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        # Track the unscaled loss for reporting; only the tensor that is
        # back-propagated is divided by the accumulation factor.
        batch_loss = outputs.loss.item()
        total_loss += batch_loss
        window_loss += batch_loss
        window_batches += 1

        # Backward pass (a short trailing window is still scaled by the full
        # accumulation factor)
        (outputs.loss / gradient_accumulation_steps).backward()

        # Update once per full window, and also on the final batch so that a
        # trailing partial window is not dropped.
        is_last_batch = (step + 1) == num_batches
        if (step + 1) % gradient_accumulation_steps == 0 or is_last_batch:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            # Adjust learning rate right after the optimizer update.
            scheduler.step()
            optimizer.zero_grad()

            # Log the mean loss of this window only.
            writer.add_scalar("Loss/Train Step", window_loss / window_batches, epoch * num_batches + step)
            window_loss = 0.0
            window_batches = 0

    # Calculate epoch metrics
    avg_loss = total_loss / num_batches
    perplexity = calculate_perplexity(avg_loss)

    # Log epoch metrics
    writer.add_scalar("Loss/Train Epoch", avg_loss, epoch + 1)
    writer.add_scalar("Perplexity/Train Epoch", perplexity, epoch + 1)
    return avg_loss, perplexity
def validate_one_epoch(epoch, model, val_loader, writer):
    """
    Run one evaluation pass over ``val_loader`` with gradients disabled.

    Puts the model in eval mode, averages the per-batch loss over
    ``len(val_loader)``, logs the average loss and its perplexity under step
    ``epoch + 1``, and returns ``(avg_val_loss, perplexity)``.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch + 1} Validation"):
            # Forward pass on the configured device
            result = model(
                input_ids=batch["input_ids"].to(DEVICE),
                attention_mask=batch["attention_mask"].to(DEVICE),
                labels=batch["labels"].to(DEVICE),
            )
            batch_losses.append(result.loss.item())

    # Validation metrics
    avg_val_loss = sum(batch_losses) / len(val_loader)
    perplexity = calculate_perplexity(avg_val_loss)

    # Log both scalars against the one-based epoch number
    scalars = (
        ("Loss/Validation", avg_val_loss),
        ("Perplexity/Validation", perplexity),
    )
    for tag, value in scalars:
        writer.add_scalar(tag, value, epoch + 1)

    return avg_val_loss, perplexity
def save_model(epoch, model, tokenizer, save_path):
    """
    Write the model and tokenizer for this epoch to their own sub-directory.

    The directory is ``<save_path>/epoch_<epoch + 1>`` and is created if it
    does not exist; both objects are saved via ``save_pretrained``.
    """
    checkpoint_dir = os.path.join(save_path, f"epoch_{epoch + 1}")
    os.makedirs(checkpoint_dir, exist_ok=True)
    # Model first, then tokenizer, into the same directory.
    for artifact in (model, tokenizer):
        artifact.save_pretrained(checkpoint_dir)
def main():
    """
    Build the model, data loaders, optimizer and scheduler, then run training.

    For each of ``NUM_EPOCHS`` epochs: train, validate, print both sets of
    metrics, and save a checkpoint under ``SAVE_PATH``. Metrics are written
    to TensorBoard under ``LOG_DIR``.
    """
    # Create model, tokenizer, optimizer, and scheduler
    model, tokenizer = create_model()  # Your custom model creation logic
    model.to(DEVICE)
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    # The third value returned by get_data_loaders() is not used here.
    train_loader, val_loader, _ = get_data_loaders(tokenizer, MAX_LEN)

    # Size the schedule in optimizer updates, not micro-batches: with gradient
    # accumulation there is one update per window (plus one for a trailing
    # partial window), so counting batches would leave the linear decay
    # unfinished at the end of training.
    # NOTE(review): assumes train_one_epoch advances the scheduler once per
    # optimizer update — confirm.
    updates_per_epoch = math.ceil(len(train_loader) / GRADIENT_ACCUMULATION_STEPS)
    num_training_steps = updates_per_epoch * NUM_EPOCHS
    scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    # Initialize TensorBoard writer
    writer = SummaryWriter(log_dir=LOG_DIR)
    try:
        for epoch in range(NUM_EPOCHS):
            print(f"\n===== Epoch {epoch + 1}/{NUM_EPOCHS} =====")

            # Training
            train_loss, train_perplexity = train_one_epoch(
                epoch, model, train_loader, optimizer, scheduler, GRADIENT_ACCUMULATION_STEPS, writer
            )
            print(f"Training Loss: {train_loss:.4f}, Training Perplexity: {train_perplexity:.4f}")

            # Validation
            val_loss, val_perplexity = validate_one_epoch(epoch, model, val_loader, writer)
            print(f"Validation Loss: {val_loss:.4f}, Validation Perplexity: {val_perplexity:.4f}")

            # Save model
            save_model(epoch, model, tokenizer, SAVE_PATH)
    finally:
        # Close the writer even if an epoch raises, so pending events are not lost.
        writer.close()

    print("Training Complete!")


# Run training only when executed as a script, not when imported.
if __name__ == "__main__":
    main()