Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .ci/linux-steps.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ steps:
displayName: 'Build models'

# Run CTests.
- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -R UtilsTest
- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test .
displayName: 'Run tests via ctest'

# Publish test results to Azure Pipelines
Expand Down
2 changes: 1 addition & 1 deletion .ci/macos-steps.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ steps:
displayName: 'Build models'

# Run CTests.
- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -R UtilsTest
- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test .
displayName: 'Run tests via ctest'

# Publish test results to Azure Pipelines
Expand Down
2 changes: 1 addition & 1 deletion .ci/windows-steps.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ steps:
# Run tests via ctest.
- bash: |
cd build/tests
CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test -C Release -R UtilsTest
CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test -C Release .
displayName: 'Run tests via ctest'

# Publish test results to Azure Pipelines
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,8 @@ xcode*
.idea
cmake-build-*
*.csv
*.tar
*.zip
*.tar.gz
.travis/configs.hpp
Testing/*
36 changes: 28 additions & 8 deletions dataloader/dataloader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,14 +63,14 @@ class DataLoader
*
* @param datasetPath Path or name of dataset.
* @param shuffle whether or not to shuffle the data.
* @param ratio Ratio for train-test split.
* @param testRatio Ratio of dataset to be used for validation set.
* @param useScaler Use feature scaler for pre-processing the dataset.
* @param augmentation Adds augmentation to training data only.
* @param augmentationProbability Probability of applying augmentation on dataset.
*/
DataLoader(const std::string& dataset,
const bool shuffle,
const double ratio = 0.75,
const double testRatio = 0.25,
const bool useScaler = true,
const std::vector<std::string> augmentation =
std::vector<std::string>(),
Expand All @@ -85,7 +85,7 @@ class DataLoader
 * Note: This option sets augmentation to NULL, sets the ratio to 1, and
 * the scaler will be used only to transform the test data.
* @param shuffle Boolean to determine whether or not to shuffle the data.
* @param ratio Ratio for train-test split.
* @param testRatio Ratio of dataset to be used for validation set.
* @param useScaler Fits the scaler on training data and transforms dataset.
* @param dropHeader Drops the first row from CSV.
* @param startInputFeatures First Index which will be fed into the model as input.
Expand All @@ -106,7 +106,7 @@ class DataLoader
void LoadCSV(const std::string& datasetPath,
const bool loadTrainData = true,
const bool shuffle = true,
const double ratio = 0.75,
const double testRatio = 0.25,
const bool useScaler = false,
const bool dropHeader = false,
const int startInputFeatures = -1,
Expand Down Expand Up @@ -179,11 +179,30 @@ class DataLoader
*/
void DownloadDataset(const std::string& dataset)
{
if (datasetMap[dataset].zipFile && (!Utils::PathExists(
datasetMap[dataset].trainPath) ||
!Utils::PathExists(datasetMap[dataset].testPath)))
{
Utils::DownloadFile(datasetMap[dataset].datasetURL,
datasetMap[dataset].datasetPath, dataset + "_training_data.",
false, false, datasetMap[dataset].serverName,
datasetMap[dataset].zipFile);

if (!Utils::CompareCRC32(datasetMap[dataset].datasetPath,
datasetMap[dataset].datasetHash))
{
mlpack::Log::Fatal << "Corrupted Data for " << dataset <<
" downloaded." << std::endl;
}

return;
}

if (!Utils::PathExists(datasetMap[dataset].trainPath))
{
Utils::DownloadFile(datasetMap[dataset].trainDownloadUrl,
Utils::DownloadFile(datasetMap[dataset].trainDownloadURL,
datasetMap[dataset].trainPath, dataset + "_training_data.",
false);
false, false, datasetMap[dataset].serverName);

if (!Utils::CompareCRC32(datasetMap[dataset].trainPath,
datasetMap[dataset].trainHash))
Expand All @@ -192,11 +211,12 @@ class DataLoader
dataset << " downloaded." << std::endl;
}
}

if (!Utils::PathExists(datasetMap[dataset].testPath))
{
Utils::DownloadFile(datasetMap[dataset].trainDownloadUrl,
Utils::DownloadFile(datasetMap[dataset].trainDownloadURL,
datasetMap[dataset].testPath, dataset + "_testing_data.",
false);
false, false, datasetMap[dataset].serverName);

if (!Utils::CompareCRC32(datasetMap[dataset].testPath,
datasetMap[dataset].testHash))
Expand Down
32 changes: 16 additions & 16 deletions dataloader/dataloader_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ template<
DatasetX, DatasetY, ScalerType
>::DataLoader(const std::string& dataset,
const bool shuffle,
const double ratio,
const double testRatio,
const bool useScaler,
const std::vector<std::string> augmentation,
const double augmentationProbability)
Expand All @@ -49,14 +49,14 @@ template<

if (datasetMap[dataset].loadCSV)
{
LoadCSV(datasetMap[dataset].trainPath, true, shuffle, ratio, useScaler,
datasetMap[dataset].dropHeader,
LoadCSV(datasetMap[dataset].trainPath, true, shuffle, testRatio,
useScaler, datasetMap[dataset].dropHeader,
datasetMap[dataset].startTrainingInputFeatures,
datasetMap[dataset].endTrainingInputFeatures,
datasetMap[dataset].endTrainingPredictionFeatures,
datasetMap[dataset].endTrainingPredictionFeatures);

LoadCSV(datasetMap[dataset].testPath, false, false, useScaler,
LoadCSV(datasetMap[dataset].testPath, false, false, testRatio, useScaler,
datasetMap[dataset].dropHeader,
datasetMap[dataset].startTestingInputFeatures,
datasetMap[dataset].endTestingInputFeatures);
Expand Down Expand Up @@ -85,7 +85,7 @@ template<
>::LoadCSV(const std::string& datasetPath,
const bool loadTrainData,
const bool shuffle,
const double ratio,
const double testRatio,
const bool useScaler,
const bool dropHeader,
const int startInputFeatures,
Expand All @@ -104,14 +104,7 @@ template<
if (loadTrainData)
{
arma::mat trainDataset, validDataset;
data::Split(dataset, trainDataset, validDataset, ratio, shuffle);

if (useScaler)
{
scaler.Fit(trainDataset);
scaler.Transform(trainDataset, trainDataset);
scaler.Transform(validDataset, validDataset);
}
data::Split(dataset, trainDataset, validDataset, testRatio, shuffle);

trainFeatures = trainDataset.rows(WrapIndex(startInputFeatures,
trainDataset.n_rows), WrapIndex(endInputFeatures,
Expand All @@ -125,10 +118,16 @@ template<
validDataset.n_rows), WrapIndex(endInputFeatures,
validDataset.n_rows));

validLabels = trainDataset.rows(WrapIndex(startPredictionFeatures,
validLabels = validDataset.rows(WrapIndex(startPredictionFeatures,
validDataset.n_rows), WrapIndex(endPredictionFeatures,
validDataset.n_rows));

if (useScaler)
{
scaler.Fit(trainFeatures);
scaler.Transform(trainFeatures, trainFeatures);
scaler.Transform(validFeatures, validFeatures);
}
// TODO : Add support for augmentation here.
mlpack::Log::Info << "Training Dataset Loaded." << std::endl;
}
Expand All @@ -139,8 +138,9 @@ template<
scaler.Transform(dataset, dataset);
}

testFeatures = dataset.submat(WrapIndex(startInputFeatures, dataset.n_rows),
0, WrapIndex(endInputFeatures, dataset.n_rows), dataset.n_cols - 1);
testFeatures = dataset.rows(WrapIndex(startInputFeatures, dataset.n_rows),
WrapIndex(endInputFeatures, dataset.n_rows));

mlpack::Log::Info << "Testing Dataset Loaded." << std::endl;
}
}
Expand Down
130 changes: 115 additions & 15 deletions dataloader/datasets.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,47 @@ template<
>
struct DatasetDetails
{
//! Locally stored name of dataset used for identification
//! during dataloader call.
std::string datasetName;
std::string trainDownloadUrl;
std::string testDownloadUrl;

//! Locally stored URL for downloading training data.
std::string trainDownloadURL;

//! Locally stored URL for downloading testing data.
std::string testDownloadURL;

//! CRC-32 checksum for training data file.
std::string trainHash;

//! CRC-32 checksum for testing data file.
std::string testHash;

//! Locally stored boolean to determine if dataset is of CSV or similar
//! format.
bool loadCSV;

//! Locally stored path to file / directory for training data.
std::string trainPath;

//! Locally stored path to file / directory for testing data.
std::string testPath;

//! Locally held boolean to determine whether dataset will be in zip format.
bool zipFile;

//! Locally stored URL for downloading dataset.
std::string datasetURL;

//! Locally stored CRC-32 checksum for the dataset.
std::string datasetHash;

//! Locally stored path for saving the archived / zip dataset.
std::string datasetPath;

//! Locally stored server name for download file.
std::string serverName;

// Pre-Process functor.
std::function<void(DatasetX&, DatasetY&,
DatasetX&, DatasetY&, DatasetX&)> PreProcess;
Expand All @@ -61,13 +93,18 @@ struct DatasetDetails
// Default constructor.
DatasetDetails() :
datasetName(""),
trainDownloadUrl(""),
testDownloadUrl(""),
trainDownloadURL(""),
testDownloadURL(""),
trainHash(""),
testHash(""),
loadCSV(false),
trainPath(""),
testPath(""),
zipFile(false),
datasetURL(""),
datasetPath(""),
datasetHash(""),
serverName("www.mlpack.org"),
startTrainingInputFeatures(0),
endTrainingInputFeatures(0),
startTrainingPredictionFeatures(0),
Expand All @@ -77,23 +114,85 @@ struct DatasetDetails
dropHeader(false)
{/* Nothing to do here. */}

// Constructor for initializing object.
/**
 * Constructor for initializing object for separate
 * train and test download URLs.
*
* @param datasetName Name of dataset used for identification during
* dataloader call.
* @param trainDownloadURL URL for downloading training data.
* @param testDownloadURL URL for downloading testing data.
* @param trainHash CRC-32 checksum for training data.
* @param testHash CRC-32 checksum for testing data.
* @param loadCSV Determines if the format of dataset is similar to CSV.
* @param trainPath Path for training dataset.
* @param testPath Path for testing dataset.
*/
DatasetDetails(const std::string& datasetName,
const std::string& trainDownloadUrl,
const std::string& testDownloadUrl,
const std::string& trainDownloadURL,
const std::string& testDownloadURL,
const std::string& trainHash,
const std::string& testHash,
const bool loadCSV,
const std::string& trainPath,
const std::string& testPath) :
datasetName(datasetName),
trainDownloadUrl(trainDownloadUrl),
testDownloadUrl(testDownloadUrl),
trainDownloadURL(trainDownloadURL),
testDownloadURL(testDownloadURL),
trainHash(trainHash),
testHash(testHash),
loadCSV(loadCSV),
trainPath(trainPath),
testPath(testPath),
zipFile(false),
datasetURL(""),
datasetHash(""),
serverName("www.mlpack.org"),
startTrainingInputFeatures(0),
endTrainingInputFeatures(0),
startTrainingPredictionFeatures(0),
endTrainingPredictionFeatures(0),
startTestingInputFeatures(0),
endTestingInputFeatures(0),
dropHeader(false)
{
// Nothing to do here.
}

/**
* Constructor for initializing paths for zip files.
*
* @param datasetName Name of dataset used for identification during
* dataloader call.
* @param zipFile Boolean to determine if dataset is stored in zip format.
* @param datasetURL URL for downloading dataset.
* @param datasetPath Path where the dataset will be downloaded.
* @param datasetHash CRC-32 checksum for dataset.
* @param loadCSV Determines if the format of dataset is similar to CSV.
* @param trainPath Path for training dataset.
* @param testPath Path for testing dataset.
*/
DatasetDetails(const std::string& datasetName,
const bool zipFile,
const std::string& datasetURL,
const std::string& datasetPath,
const std::string& datasetHash,
const bool loadCSV,
const std::string& trainPath,
const std::string& testPath) :
datasetName(datasetName),
zipFile(zipFile),
datasetURL(datasetURL),
datasetHash(datasetHash),
datasetPath(datasetPath),
loadCSV(loadCSV),
trainPath(trainPath),
testPath(testPath),
trainDownloadURL(""),
testDownloadURL(""),
trainHash(""),
testHash(""),
serverName("www.mlpack.org"),
startTrainingInputFeatures(0),
endTrainingInputFeatures(0),
startTrainingPredictionFeatures(0),
Expand All @@ -119,17 +218,18 @@ template<
class Datasets
{
public:
//! Get details of MNIST Dataset.
const static DatasetDetails<DatasetX, DatasetY> MNIST()
{
DatasetDetails<DatasetX, DatasetY> mnistDetails(
"mnist",
"/datasets/mnist_train.csv",
"/datasets/mnist_test.csv",
"772495e3",
"8bcdb7e1",
true,
"./../data/mnist_train.csv",
"./../data/mnist_test.csv");
"/datasets/mnist.tar.gz",
"./../data/mnist.tar.gz",
"33470ca3",
true,
"./../data/mnist-dataset/mnist_train.csv",
"./../data/mnist-dataset/mnist_test.csv");

// Set the Pre-Processor Function.
mnistDetails.PreProcess = PreProcessor<DatasetX, DatasetY>::MNIST;
Expand Down
Loading