//===----------------------------------------------------------------------===//
//
//                         Peloton
//
// model_util.cpp
//
// Identification: src/brain/util/model_util.cpp
//
// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
//
//===----------------------------------------------------------------------===//

#include "brain/util/model_util.h"
#include "brain/workload/base_tf.h"

16
+ namespace peloton {
17
+ namespace brain {
18
+ float ModelUtil::MeanSqError (const matrix_eig &ytrue, const matrix_eig &ypred) {
19
+ PELOTON_ASSERT (ytrue.rows () == ypred.rows () && ytrue.cols () == ypred.cols ());
20
+ return (ytrue - ypred).array ().square ().mean ();
21
+ }
22
+
23
+ void ModelUtil::GetBatch (const BaseForecastModel &model, const matrix_eig &mat,
24
+ size_t batch_offset, size_t bsz,
25
+ std::vector<matrix_eig> &data,
26
+ std::vector<matrix_eig> &target, bool time_major) {
27
+ if (time_major) {
28
+ size_t samples_per_batch = mat.rows () / bsz;
29
+ size_t seq_len = std::min<size_t >(
30
+ model.GetBPTT (), samples_per_batch - model.GetHorizon () - batch_offset);
31
+ // bsz vector of <seq_len, feat_len> = (bsz, seq_len, feat_len)
32
+ for (size_t seq_idx = 0 ; seq_idx < bsz; seq_idx++) {
33
+ size_t seqblock_start = seq_idx * samples_per_batch;
34
+ size_t seq_offset = seqblock_start + batch_offset;
35
+ // train mat[row_idx:row_idx + seq_len, :)
36
+ matrix_eig data_batch = mat.block (seq_offset, 0 , seq_len, mat.cols ());
37
+ // target mat[row_idx + horizon_: row_idx + seq_len + horizon_ , :]
38
+ matrix_eig target_batch =
39
+ mat.block (seq_offset + model.GetHorizon (), 0 , seq_len, mat.cols ());
40
+ // Push batches into containers
41
+ data.push_back (data_batch);
42
+ target.push_back (target_batch);
43
+ }
44
+ } else {
45
+ size_t seq_len = model.GetBPTT ();
46
+ // bsz vector of <seq_len, feat_len> = (bsz, seq_len, feat_len)
47
+ for (size_t seq_idx = 0 ; seq_idx < bsz; seq_idx++) {
48
+ size_t seq_start = seq_idx * seq_len + batch_offset;
49
+ matrix_eig data_batch = mat.block (seq_start, 0 , seq_len, mat.cols ());
50
+ matrix_eig target_batch =
51
+ mat.block (seq_start + model.GetHorizon (), 0 , seq_len, mat.cols ());
52
+ data.push_back (data_batch);
53
+ target.push_back (target_batch);
54
+ }
55
+ }
56
+ }
57
+
58
+ void ModelUtil::GetBatches (const BaseForecastModel &model,
59
+ const matrix_eig &mat, size_t batch_size,
60
+ std::vector<std::vector<matrix_eig>> &data,
61
+ std::vector<std::vector<matrix_eig>> &target,
62
+ bool time_major) {
63
+ if (time_major) {
64
+ // Obtain relevant metadata
65
+ int max_allowed_bsz = mat.rows () / (model.GetHorizon () + model.GetBPTT ());
66
+ int min_allowed_bsz = 1 ;
67
+ int bsz =
68
+ std::max (min_allowed_bsz, std::min<int >(batch_size, max_allowed_bsz));
69
+ int samples_per_input = mat.rows () / bsz;
70
+ int num_feats = mat.cols ();
71
+
72
+ // Trim the data for equal sized inputs per batch
73
+ matrix_eig mat_adjusted =
74
+ mat.block (0 , 0 , samples_per_input * bsz, num_feats);
75
+
76
+ for (int batch_offset = 0 ;
77
+ batch_offset < samples_per_input - model.GetHorizon ();
78
+ batch_offset += model.GetBPTT ()) {
79
+ std::vector<matrix_eig> data_batch_eig, target_batch_eig;
80
+ ModelUtil::GetBatch (model, mat_adjusted, batch_offset, bsz,
81
+ data_batch_eig, target_batch_eig);
82
+ data.push_back (data_batch_eig);
83
+ target.push_back (target_batch_eig);
84
+ }
85
+ } else {
86
+ int max_rows_in = mat.rows () - model.GetHorizon ();
87
+ int num_samples = max_rows_in / model.GetBPTT ();
88
+ // Obtain batch size
89
+ int max_allowed_bsz = num_samples;
90
+ int min_allowed_bsz = 1 ;
91
+ int bsz =
92
+ std::max (min_allowed_bsz, std::min<int >(batch_size, max_allowed_bsz));
93
+ int samples_per_batch = bsz * model.GetBPTT ();
94
+ int samples_per_seq = model.GetBPTT ();
95
+ int batch_offset = 0 ;
96
+ for (batch_offset = 0 ; batch_offset < max_rows_in - samples_per_batch;
97
+ batch_offset += samples_per_batch) {
98
+ std::vector<matrix_eig> data_batch_eig, target_batch_eig;
99
+ ModelUtil::GetBatch (model, mat, batch_offset, bsz, data_batch_eig,
100
+ target_batch_eig, time_major);
101
+ data.push_back (data_batch_eig);
102
+ target.push_back (target_batch_eig);
103
+ }
104
+ int rem_bsz = (max_rows_in - batch_offset) / samples_per_seq;
105
+ if (rem_bsz > 0 ) {
106
+ std::vector<matrix_eig> data_batch_eig, target_batch_eig;
107
+ ModelUtil::GetBatch (model, mat, batch_offset, rem_bsz, data_batch_eig,
108
+ target_batch_eig, time_major);
109
+ data.push_back (data_batch_eig);
110
+ target.push_back (target_batch_eig);
111
+ }
112
+ }
113
+ }
114
+
115
+ void ModelUtil::GetBatches (const BaseForecastModel &model,
116
+ const matrix_eig &mat, size_t batch_size,
117
+ std::vector<std::vector<matrix_eig>> &data_batches) {
118
+ int num_seq = mat.rows () / model.GetBPTT ();
119
+ // Obtain batch size
120
+ int max_allowed_bsz = num_seq;
121
+ int min_allowed_bsz = 1 ;
122
+ int bsz =
123
+ std::max (min_allowed_bsz, std::min<int >(batch_size, max_allowed_bsz));
124
+ int samples_per_batch = bsz * model.GetBPTT ();
125
+ int samples_per_seq = model.GetBPTT ();
126
+ int batch_offset = 0 ;
127
+ for (batch_offset = 0 ; batch_offset < mat.rows () - samples_per_batch;
128
+ batch_offset += samples_per_batch) {
129
+ std::vector<matrix_eig> data_batch;
130
+ for (int seq_idx = 0 ; seq_idx < bsz; seq_idx++) {
131
+ int seq_offset = batch_offset + seq_idx * samples_per_seq;
132
+ data_batch.push_back (mat.middleRows (seq_offset, samples_per_seq));
133
+ }
134
+ data_batches.push_back (data_batch);
135
+ }
136
+ // Push remaining samples into smaller batch
137
+ int rem_bsz = (mat.rows () - batch_offset) / samples_per_seq;
138
+ if (rem_bsz > 0 ) {
139
+ std::vector<matrix_eig> data_batch;
140
+ for (int seq_idx = 0 ; seq_idx < rem_bsz; seq_idx++) {
141
+ int seq_offset = batch_offset + seq_idx * samples_per_seq;
142
+ data_batch.push_back (mat.middleRows (seq_offset, samples_per_seq));
143
+ }
144
+ data_batches.push_back (data_batch);
145
+ }
146
+ batch_offset += rem_bsz * samples_per_seq;
147
+ int rem_seq_len = mat.rows () - batch_offset;
148
+ // Push anything further remaining into a single batch of size < BPTT
149
+ data_batches.push_back ({mat.bottomRows (rem_seq_len)});
150
+ }
151
+
152
+ void ModelUtil::FeatureLabelSplit (const BaseForecastModel &model,
153
+ const matrix_eig &data, matrix_eig &X,
154
+ matrix_eig &y) {
155
+ size_t offset_train = data.rows () - model.GetHorizon ();
156
+ X = data.topRows (offset_train);
157
+ size_t offset_label = model.GetBPTT () + model.GetHorizon () - 1 ;
158
+ y = data.middleRows (offset_label, data.rows () - offset_label);
159
+ }
160
+
161
+ void ModelUtil::GenerateFeatureMatrix (const BaseForecastModel &model,
162
+ const matrix_eig &data,
163
+ matrix_eig &processed_features) {
164
+ size_t timesteps = data.rows ();
165
+ std::vector<matrix_eig> feat_matrix;
166
+ for (size_t idx = 0 ; idx <= timesteps - model.GetBPTT (); idx++) {
167
+ feat_matrix.push_back (EigenUtil::ToEigenMat (
168
+ {EigenUtil::Flatten (data.middleRows (idx, model.GetBPTT ()))}));
169
+ }
170
+ processed_features = EigenUtil::VStack (feat_matrix);
171
+ }
172
+
173
+ bool ModelUtil::EarlyStop (vector_t val_losses, size_t patience, float delta) {
174
+ // Check for edge cases
175
+ PELOTON_ASSERT (patience > 1 );
176
+ PELOTON_ASSERT (delta > 0 );
177
+ if (val_losses.size () < patience) return false ;
178
+ float cur_loss = val_losses[val_losses.size () - 1 ];
179
+ float pat_loss = val_losses[val_losses.size () - patience];
180
+ // Loss should have at least dropped by delta at this point
181
+ return (pat_loss - cur_loss) < delta;
182
+ }
183
+
184
+ } // namespace brain
185
+ } // namespace peloton