This repository was archived by the owner on Sep 27, 2019. It is now read-only.

Commit 7aa41ae

saatviksapavlo authored and committed
Forecasting Code Cleanup and Model Addition (#1422)
* Model Hierarchy/Cleanup/More Fwd Declarations
* Model Additions: Linear/KernelReg + Additional cleanup
* formatting
* Thorough ModelUtil testing
* Baseforecast Model changes
* Add Fit/Predict methods
* Addressed Review comments
* Adding Ensembler
* Ensemble model completed with testing
* Formatting and minor fix
* Attempt to re-enable TF
* CMake bug fix
* Disable TF for now
* Addressed Review + Enable TF to check it passes on other platforms
* Redisable TF
* Early stop default setup
1 parent 2406b76 commit 7aa41ae

28 files changed: 1872 additions, 397 deletions

src/brain/util/eigen_util.cpp

Lines changed: 98 additions & 16 deletions
@@ -11,16 +11,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "brain/util/eigen_util.h"
+#include <random>
 
-namespace peloton{
-namespace brain{
+namespace peloton {
+namespace brain {
 
-/**
- * Fn to convert matrix_t type to matrix_r
- * @param mat: matrix_t matrix
- * @return the converted matrix_r matrix
- */
-matrix_eig EigenUtil::MatrixTToEigenMat(matrix_t &mat) {
+matrix_eig EigenUtil::ToEigenMat(const matrix_t &mat) {
   std::vector<float> mat_flat;
 
   uint rows = 0;

@@ -34,12 +30,7 @@ matrix_eig EigenUtil::MatrixTToEigenMat(matrix_t &mat) {
   return Eigen::Map<matrix_eig>(mat_flat.data(), rows, mat_flat.size() / rows);
 }
 
-/**
- * Fn to convert matrix_r type to matrix_t.
- * @param mat: mratrix_r matrix
- * @return the converted matrix_t matrix
- */
-matrix_t EigenUtil::EigenMatToMatrixT(matrix_eig &mat) {
+matrix_t EigenUtil::ToMatrixT(const matrix_eig &mat) {
   matrix_t out_mat;
   out_mat.resize(mat.rows());
   auto data = mat.data();

@@ -52,9 +43,100 @@ matrix_t EigenUtil::EigenMatToMatrixT(matrix_eig &mat) {
   return out_mat;
 }
 
-std::vector<float> EigenUtil::FlattenMatrix(const matrix_eig &mat) {
+vector_eig EigenUtil::ToEigenVec(const vector_t &mat) {
+  return vector_eig::Map(mat.data(), mat.size());
+}
+
+vector_t EigenUtil::ToVectorT(const vector_eig &mat) {
+  return vector_t(mat.data(), mat.data() + mat.size());
+}
+
+matrix_eig EigenUtil::VStack(const std::vector<matrix_eig> &mat_vec) {
+  PELOTON_ASSERT(!mat_vec.empty());
+  if (mat_vec.size() == 1) {
+    return mat_vec[0];
+  }
+  long num_cols = mat_vec[0].cols();
+  size_t num_rows = 0;
+  for (size_t mat_idx = 0; mat_idx < mat_vec.size(); ++mat_idx) {
+    PELOTON_ASSERT(mat_vec[mat_idx].cols() == num_cols);
+    num_rows += mat_vec[mat_idx].rows();
+  }
+  matrix_eig vstacked_mat(num_rows, num_cols);
+  size_t row_offset = 0;
+  for (size_t mat_idx = 0; mat_idx < mat_vec.size(); ++mat_idx) {
+    long cur_rows = mat_vec[mat_idx].rows();
+    vstacked_mat.middleRows(row_offset, cur_rows) = mat_vec[mat_idx];
+    row_offset += cur_rows;
+  }
+  return vstacked_mat;
+}
+
+matrix_eig EigenUtil::PairwiseEuclideanDist(const matrix_eig &m1,
+                                            const matrix_eig &m2) {
+  matrix_eig m_dist(m1.rows(), m2.rows());
+  for (int i = 0; i < m1.rows(); i++) {
+    for (int j = 0; j < m2.rows(); j++) {
+      m_dist(i, j) = (m1.row(i) - m2.row(j)).norm();
+    }
+  }
+  return m_dist;
+}
+
+vector_t EigenUtil::Flatten(const matrix_eig &mat) {
   return std::vector<float>(mat.data(), mat.data() + mat.size());
 }
 
+vector_t EigenUtil::Flatten(const std::vector<matrix_eig> &mat_vect) {
+  std::vector<float> flattened_mat;
+  for (auto &mat : mat_vect) {
+    flattened_mat.insert(flattened_mat.end(), mat.data(),
+                         mat.data() + mat.size());
+  }
+  return flattened_mat;
 }
-}
+
+vector_t EigenUtil::Flatten(const matrix_t &mat) {
+  vector_t flattened_mat;
+  for (auto &mat_row : mat) {
+    flattened_mat.insert(flattened_mat.end(), mat_row.begin(), mat_row.end());
+  }
+  return flattened_mat;
+}
+
+matrix_eig EigenUtil::GaussianNoise(size_t rows, size_t cols, float mean,
+                                    float stdev) {
+  std::default_random_engine generator;
+  std::normal_distribution<> distribution{mean, stdev};
+  auto gaussian_sampler = [&](UNUSED_ATTRIBUTE float dummy) {
+    return distribution(generator);
+  };
+  return matrix_eig::NullaryExpr(rows, cols, gaussian_sampler);
+}
+
+vector_eig EigenUtil::StandardDeviation(const matrix_eig &mat, uint8_t axis) {
+  if (axis == 0) {
+    matrix_eig sqdiff_mat =
+        (mat.rowwise() - mat.colwise().mean()).array().square();
+    vector_eig var_mat = sqdiff_mat.colwise().mean();
+    return var_mat.cwiseSqrt();
+  } else {
+    throw "Not Implemented";
+  }
+}
+
+float EigenUtil::StandardDeviation(const matrix_eig &mat) {
+  matrix_eig sqdiff_mat = (mat.array() - mat.mean()).array().square();
+  float var = sqdiff_mat.mean();
+  return std::sqrt(var);
+}
+
+matrix_eig EigenUtil::PadTop(const matrix_eig &mat, float pad_value,
+                             int num_rows) {
+  int num_cols = mat.cols();
+  matrix_eig pad_mat = matrix_eig::Ones(num_rows, num_cols) * pad_value;
+  return EigenUtil::VStack({pad_mat, mat});
+}
+
+} // namespace brain
+} // namespace peloton
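
To make the new EigenUtil surface easier to follow, here is a short usage sketch. It is illustrative only and not part of the commit: the EigenUtilSketch function is hypothetical, and it assumes the aliases declared in brain/util/eigen_util.h, where matrix_eig/vector_eig are row-major float Eigen types and matrix_t/vector_t are nested std::vector<float> containers.

// Hypothetical usage sketch of the new EigenUtil helpers (not part of this
// commit). Assumes the type aliases from brain/util/eigen_util.h.
#include "brain/util/eigen_util.h"

using namespace peloton::brain;

matrix_eig EigenUtilSketch() {
  // Convert a nested std::vector into an Eigen matrix (and back if needed).
  matrix_t raw = {{1.0f, 2.0f}, {3.0f, 4.0f}};
  matrix_eig m = EigenUtil::ToEigenMat(raw);

  // Stack copies vertically; every input must have the same column count.
  matrix_eig stacked = EigenUtil::VStack({m, m});  // 4 x 2

  // Pad two rows of zeros on top, e.g. to left-pad short sequences.
  matrix_eig padded = EigenUtil::PadTop(stacked, /*pad_value=*/0.0f,
                                        /*num_rows=*/2);

  // Column-wise standard deviation (axis == 0); other axes are not implemented.
  vector_eig col_std = EigenUtil::StandardDeviation(padded, /*axis=*/0);

  // Perturb the matrix with zero-mean Gaussian noise, e.g. for testing.
  return padded + EigenUtil::GaussianNoise(padded.rows(), padded.cols(),
                                           /*mean=*/0.0f,
                                           /*stdev=*/col_std.mean());
}

Note that, per the diff above, StandardDeviation with a non-zero axis throws, and VStack asserts that every input matrix has the same column count.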

src/brain/util/model_util.cpp

Lines changed: 185 additions & 0 deletions
@@ -0,0 +1,185 @@
//===----------------------------------------------------------------------===//
//
// Peloton
//
// model_util.cpp
//
// Identification: src/brain/util/model_util.cpp
//
// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
//
//===----------------------------------------------------------------------===//

#include "brain/util/model_util.h"
#include "brain/workload/base_tf.h"

namespace peloton {
namespace brain {
float ModelUtil::MeanSqError(const matrix_eig &ytrue, const matrix_eig &ypred) {
  PELOTON_ASSERT(ytrue.rows() == ypred.rows() && ytrue.cols() == ypred.cols());
  return (ytrue - ypred).array().square().mean();
}

void ModelUtil::GetBatch(const BaseForecastModel &model, const matrix_eig &mat,
                         size_t batch_offset, size_t bsz,
                         std::vector<matrix_eig> &data,
                         std::vector<matrix_eig> &target, bool time_major) {
  if (time_major) {
    size_t samples_per_batch = mat.rows() / bsz;
    size_t seq_len = std::min<size_t>(
        model.GetBPTT(), samples_per_batch - model.GetHorizon() - batch_offset);
    // bsz vector of <seq_len, feat_len> = (bsz, seq_len, feat_len)
    for (size_t seq_idx = 0; seq_idx < bsz; seq_idx++) {
      size_t seqblock_start = seq_idx * samples_per_batch;
      size_t seq_offset = seqblock_start + batch_offset;
      // train mat[row_idx:row_idx + seq_len, :)
      matrix_eig data_batch = mat.block(seq_offset, 0, seq_len, mat.cols());
      // target mat[row_idx + horizon_: row_idx + seq_len + horizon_ , :]
      matrix_eig target_batch =
          mat.block(seq_offset + model.GetHorizon(), 0, seq_len, mat.cols());
      // Push batches into containers
      data.push_back(data_batch);
      target.push_back(target_batch);
    }
  } else {
    size_t seq_len = model.GetBPTT();
    // bsz vector of <seq_len, feat_len> = (bsz, seq_len, feat_len)
    for (size_t seq_idx = 0; seq_idx < bsz; seq_idx++) {
      size_t seq_start = seq_idx * seq_len + batch_offset;
      matrix_eig data_batch = mat.block(seq_start, 0, seq_len, mat.cols());
      matrix_eig target_batch =
          mat.block(seq_start + model.GetHorizon(), 0, seq_len, mat.cols());
      data.push_back(data_batch);
      target.push_back(target_batch);
    }
  }
}

void ModelUtil::GetBatches(const BaseForecastModel &model,
                           const matrix_eig &mat, size_t batch_size,
                           std::vector<std::vector<matrix_eig>> &data,
                           std::vector<std::vector<matrix_eig>> &target,
                           bool time_major) {
  if (time_major) {
    // Obtain relevant metadata
    int max_allowed_bsz = mat.rows() / (model.GetHorizon() + model.GetBPTT());
    int min_allowed_bsz = 1;
    int bsz =
        std::max(min_allowed_bsz, std::min<int>(batch_size, max_allowed_bsz));
    int samples_per_input = mat.rows() / bsz;
    int num_feats = mat.cols();

    // Trim the data for equal sized inputs per batch
    matrix_eig mat_adjusted =
        mat.block(0, 0, samples_per_input * bsz, num_feats);

    for (int batch_offset = 0;
         batch_offset < samples_per_input - model.GetHorizon();
         batch_offset += model.GetBPTT()) {
      std::vector<matrix_eig> data_batch_eig, target_batch_eig;
      ModelUtil::GetBatch(model, mat_adjusted, batch_offset, bsz,
                          data_batch_eig, target_batch_eig);
      data.push_back(data_batch_eig);
      target.push_back(target_batch_eig);
    }
  } else {
    int max_rows_in = mat.rows() - model.GetHorizon();
    int num_samples = max_rows_in / model.GetBPTT();
    // Obtain batch size
    int max_allowed_bsz = num_samples;
    int min_allowed_bsz = 1;
    int bsz =
        std::max(min_allowed_bsz, std::min<int>(batch_size, max_allowed_bsz));
    int samples_per_batch = bsz * model.GetBPTT();
    int samples_per_seq = model.GetBPTT();
    int batch_offset = 0;
    for (batch_offset = 0; batch_offset < max_rows_in - samples_per_batch;
         batch_offset += samples_per_batch) {
      std::vector<matrix_eig> data_batch_eig, target_batch_eig;
      ModelUtil::GetBatch(model, mat, batch_offset, bsz, data_batch_eig,
                          target_batch_eig, time_major);
      data.push_back(data_batch_eig);
      target.push_back(target_batch_eig);
    }
    int rem_bsz = (max_rows_in - batch_offset) / samples_per_seq;
    if (rem_bsz > 0) {
      std::vector<matrix_eig> data_batch_eig, target_batch_eig;
      ModelUtil::GetBatch(model, mat, batch_offset, rem_bsz, data_batch_eig,
                          target_batch_eig, time_major);
      data.push_back(data_batch_eig);
      target.push_back(target_batch_eig);
    }
  }
}

void ModelUtil::GetBatches(const BaseForecastModel &model,
                           const matrix_eig &mat, size_t batch_size,
                           std::vector<std::vector<matrix_eig>> &data_batches) {
  int num_seq = mat.rows() / model.GetBPTT();
  // Obtain batch size
  int max_allowed_bsz = num_seq;
  int min_allowed_bsz = 1;
  int bsz =
      std::max(min_allowed_bsz, std::min<int>(batch_size, max_allowed_bsz));
  int samples_per_batch = bsz * model.GetBPTT();
  int samples_per_seq = model.GetBPTT();
  int batch_offset = 0;
  for (batch_offset = 0; batch_offset < mat.rows() - samples_per_batch;
       batch_offset += samples_per_batch) {
    std::vector<matrix_eig> data_batch;
    for (int seq_idx = 0; seq_idx < bsz; seq_idx++) {
      int seq_offset = batch_offset + seq_idx * samples_per_seq;
      data_batch.push_back(mat.middleRows(seq_offset, samples_per_seq));
    }
    data_batches.push_back(data_batch);
  }
  // Push remaining samples into smaller batch
  int rem_bsz = (mat.rows() - batch_offset) / samples_per_seq;
  if (rem_bsz > 0) {
    std::vector<matrix_eig> data_batch;
    for (int seq_idx = 0; seq_idx < rem_bsz; seq_idx++) {
      int seq_offset = batch_offset + seq_idx * samples_per_seq;
      data_batch.push_back(mat.middleRows(seq_offset, samples_per_seq));
    }
    data_batches.push_back(data_batch);
  }
  batch_offset += rem_bsz * samples_per_seq;
  int rem_seq_len = mat.rows() - batch_offset;
  // Push anything further remaining into a single batch of size < BPTT
  data_batches.push_back({mat.bottomRows(rem_seq_len)});
}

void ModelUtil::FeatureLabelSplit(const BaseForecastModel &model,
                                  const matrix_eig &data, matrix_eig &X,
                                  matrix_eig &y) {
  size_t offset_train = data.rows() - model.GetHorizon();
  X = data.topRows(offset_train);
  size_t offset_label = model.GetBPTT() + model.GetHorizon() - 1;
  y = data.middleRows(offset_label, data.rows() - offset_label);
}

void ModelUtil::GenerateFeatureMatrix(const BaseForecastModel &model,
                                      const matrix_eig &data,
                                      matrix_eig &processed_features) {
  size_t timesteps = data.rows();
  std::vector<matrix_eig> feat_matrix;
  for (size_t idx = 0; idx <= timesteps - model.GetBPTT(); idx++) {
    feat_matrix.push_back(EigenUtil::ToEigenMat(
        {EigenUtil::Flatten(data.middleRows(idx, model.GetBPTT()))}));
  }
  processed_features = EigenUtil::VStack(feat_matrix);
}

bool ModelUtil::EarlyStop(vector_t val_losses, size_t patience, float delta) {
  // Check for edge cases
  PELOTON_ASSERT(patience > 1);
  PELOTON_ASSERT(delta > 0);
  if (val_losses.size() < patience) return false;
  float cur_loss = val_losses[val_losses.size() - 1];
  float pat_loss = val_losses[val_losses.size() - patience];
  // Loss should have at least dropped by delta at this point
  return (pat_loss - cur_loss) < delta;
}

} // namespace brain
} // namespace peloton
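
The ModelUtil helpers above are meant to be driven by the forecasting models added elsewhere in this commit. The sketch below is a hypothetical driver, not code from the patch: BaseForecastModel, GetBPTT()/GetHorizon(), FeatureLabelSplit, GenerateFeatureMatrix, MeanSqError, and EarlyStop come from this commit, while ForecastSketch, the 100-epoch loop, and the placeholder prediction are assumptions made only for illustration.

// Hypothetical training-loop sketch around the new ModelUtil helpers
// (not part of this commit).
#include "brain/util/eigen_util.h"
#include "brain/util/model_util.h"
#include "brain/workload/base_tf.h"

using namespace peloton::brain;

void ForecastSketch(BaseForecastModel &model, const matrix_eig &workload) {
  // Split the raw workload matrix into features X (all rows except the last
  // horizon rows) and labels y (rows offset by BPTT + horizon - 1).
  matrix_eig X, y;
  ModelUtil::FeatureLabelSplit(model, workload, X, y);

  // Roll a BPTT-length window over X and flatten each window into one feature
  // row, the layout the linear/kernel-regression models consume.
  matrix_eig features;
  ModelUtil::GenerateFeatureMatrix(model, X, features);

  // Track validation loss per epoch and stop once it has not improved by at
  // least delta over the last `patience` epochs.
  vector_t val_losses;
  for (int epoch = 0; epoch < 100; epoch++) {
    // ... model.Fit(...) / model.Predict(...) would go here ...
    matrix_eig y_pred = y;  // placeholder prediction
    val_losses.push_back(ModelUtil::MeanSqError(y, y_pred));
    if (ModelUtil::EarlyStop(val_losses, /*patience=*/10, /*delta=*/0.01f)) {
      break;
    }
  }
}

As the code shows, EarlyStop compares the current loss against the loss recorded patience epochs earlier and reports true once the drop is smaller than delta, so it needs at least patience recorded losses before it can fire.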

0 commit comments
