Skip to content

Commit 8ef7b8f

Browse files
committed
Merge pull request #18 from lucasb-eyer/kaggle-otto
Kaggle otto minor revamp.
2 parents 33f7fa4 + d56a997 commit 8ef7b8f

File tree

3 files changed

+22
-20
lines changed

3 files changed

+22
-20
lines changed

examples/Kaggle-Otto/run.py

Lines changed: 11 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -2,17 +2,17 @@
22
import pandas as pd
33
import beacon8 as bb8
44
import beacon8.optimizers as optim
5+
from os.path import dirname, join as pjoin
56
from sklearn.preprocessing import LabelEncoder
67
from sklearn.cross_validation import train_test_split
7-
from train import *
8-
from test import *
8+
from train import train
9+
from test import validate
910

1011

1112
def load_train_data():
12-
train_data = pd.read_csv('./data/train.csv')
13+
train_data = pd.read_csv(pjoin(dirname(__file__), 'data', 'train.csv'))
1314
labels = train_data.target.values
14-
labels_encoder = LabelEncoder()
15-
labels = labels_encoder.fit_transform(labels)
15+
labels = LabelEncoder().fit_transform(labels)
1616
train_data = train_data.drop('id', axis=1)
1717
train_data = train_data.drop('target', axis=1)
1818
return train_data.as_matrix(), labels
@@ -48,20 +48,20 @@ def nnet():
4848

4949
train_data_x, train_data_y = load_train_data()
5050

51-
train_data_x, test_data_x, train_data_y, test_data_y = train_test_split(train_data_x, train_data_y, train_size=0.85)
51+
train_data_x, valid_data_x, train_data_y, valid_data_y = train_test_split(train_data_x, train_data_y, train_size=0.85)
5252
model = nnet()
5353

5454
criterion = bb8.ClassNLLCriterion()
5555

5656
optimiser = optim.Momentum(lr=0.01, momentum=0.9)
5757

58-
for epoch in range(1000):
58+
for epoch in range(1, 1001):
5959
model.training()
60-
if epoch > 100 and epoch % 100 == 0:
60+
if epoch % 100 == 0:
6161
optimiser.hyperparams['lr'] /= 10
62-
train(train_data_x, train_data_y, model, optimiser, criterion, epoch, 100)
63-
train(train_data_x, train_data_y, model, optimiser, criterion, epoch, 100, 'stat')
62+
train(train_data_x, train_data_y, model, optimiser, criterion, epoch, 100, 'train')
63+
train(train_data_x, train_data_y, model, optimiser, criterion, epoch, 100, 'stats')
6464

6565
model.evaluate()
66-
validate(test_data_x, test_data_y, model, epoch, 100)
66+
validate(valid_data_x, valid_data_y, model, epoch, 100)
6767

examples/Kaggle-Otto/test.py

Lines changed: 4 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,7 @@
11
import numpy as np
22
import theano as _th
3-
from sklearn.metrics import log_loss
4-
from kaggle_utils import *
53

4+
from kaggle_utils import multiclass_log_loss
65
from examples.utils import make_progressbar
76

87
def validate(dataset_x, dataset_y, model, epoch, batch_size):
@@ -11,7 +10,7 @@ def validate(dataset_x, dataset_y, model, epoch, batch_size):
1110

1211
mini_batch_input = np.empty(shape=(batch_size, 93), dtype=_th.config.floatX)
1312
mini_batch_targets = np.empty(shape=(batch_size, ), dtype=_th.config.floatX)
14-
accuracy = 0.
13+
logloss = 0.
1514

1615
for j in range((dataset_x.shape[0] + batch_size - 1) // batch_size):
1716
progress.update(j * batch_size)
@@ -26,7 +25,7 @@ def validate(dataset_x, dataset_y, model, epoch, batch_size):
2625
mini_batch_prediction.resize((dataset_x.shape[0] - j * batch_size, 9))
2726
mini_batch_targets.resize((dataset_x.shape[0] - j * batch_size, ))
2827

29-
accuracy = accuracy + multiclass_log_loss(mini_batch_targets, mini_batch_prediction, normalize=False)
28+
logloss += multiclass_log_loss(mini_batch_targets, mini_batch_prediction, normalize=False)
3029

3130
progress.finish()
32-
print("Epoch #" + str(epoch) + ", Logloss: " + str(float(accuracy) / dataset_x.shape[0]))
31+
print("Epoch #{}, Logloss: {:.5f}".format(epoch, logloss/dataset_x.shape[0]))

examples/Kaggle-Otto/train.py

Lines changed: 7 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -3,8 +3,9 @@
33

44
from examples.utils import make_progressbar
55

6-
def train(dataset_x, dataset_y, model, optimiser, criterion, epoch, batch_size, mode=None):
7-
progress = make_progressbar('Training epoch #{}'.format(epoch), len(dataset_x))
6+
7+
def train(dataset_x, dataset_y, model, optimiser, criterion, epoch, batch_size, mode='train'):
8+
progress = make_progressbar('Training ({}) epoch #{}'.format(mode, epoch), len(dataset_x))
89
progress.start()
910

1011
shuffle = np.random.permutation(len(dataset_x))
@@ -17,12 +18,14 @@ def train(dataset_x, dataset_y, model, optimiser, criterion, epoch, batch_size,
1718
mini_batch_input[k] = dataset_x[shuffle[j * batch_size + k]]
1819
mini_batch_targets[k] = dataset_y[shuffle[j * batch_size + k]]
1920

20-
if mode is None:
21+
if mode == 'train':
2122
model.zero_grad_parameters()
2223
model.accumulate_gradients(mini_batch_input, mini_batch_targets, criterion)
2324
optimiser.update_parameters(model)
24-
else:
25+
elif mode == 'stats':
2526
model.accumulate_statistics(mini_batch_input)
27+
else:
28+
assert False, "Mode should be either 'train' or 'stats'"
2629

2730
progress.update((j+1) * batch_size)
2831

0 commit comments

Comments
 (0)