Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .ci/linux-steps.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ steps:
displayName: 'Build models'

# Run CTests.
- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -R UtilsTest
- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test .
displayName: 'Run tests via ctest'

# Publish test results to Azure Pipelines
Expand Down
2 changes: 1 addition & 1 deletion .ci/macos-steps.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ steps:
displayName: 'Build models'

# Run CTests.
- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -R UtilsTest
- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test .
displayName: 'Run tests via ctest'

# Publish test results to Azure Pipelines
Expand Down
2 changes: 1 addition & 1 deletion .ci/windows-steps.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ steps:
# Run tests via ctest.
- bash: |
cd build/tests
CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test -C Release -R UtilsTest
CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test -C Release .
displayName: 'Run tests via ctest'

# Publish test results to Azure Pipelines
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,8 @@ xcode*
.idea
cmake-build-*
*.csv
*.tar
*.zip
*.tar.gz
.travis/configs.hpp
Testing/*
36 changes: 28 additions & 8 deletions dataloader/dataloader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,14 +63,14 @@ class DataLoader
*
* @param datasetPath Path or name of dataset.
* @param shuffle whether or not to shuffle the data.
* @param ratio Ratio for train-test split.
* @param testRatio Ratio of dataset to be used for validation set.
* @param useScaler Use feature scaler for pre-processing the dataset.
* @param augmentation Adds augmentation to training data only.
* @param augmentationProbability Probability of applying augmentation on dataset.
*/
DataLoader(const std::string& dataset,
const bool shuffle,
const double ratio = 0.75,
const double testRatio = 0.25,
const bool useScaler = true,
const std::vector<std::string> augmentation =
std::vector<std::string>(),
Expand All @@ -85,7 +85,7 @@ class DataLoader
 * Note: This option sets augmentation to NULL, sets the ratio to 1, and
 * the scaler will be used only to transform the test data.
* @param shuffle Boolean to determine whether or not to shuffle the data.
* @param ratio Ratio for train-test split.
* @param testRatio Ratio of dataset to be used for validation set.
* @param useScaler Fits the scaler on training data and transforms dataset.
* @param dropHeader Drops the first row from CSV.
* @param startInputFeatures First Index which will be fed into the model as input.
Expand All @@ -106,7 +106,7 @@ class DataLoader
void LoadCSV(const std::string& datasetPath,
const bool loadTrainData = true,
const bool shuffle = true,
const double ratio = 0.75,
const double testRatio = 0.25,
const bool useScaler = false,
const bool dropHeader = false,
const int startInputFeatures = -1,
Expand Down Expand Up @@ -179,11 +179,30 @@ class DataLoader
*/
void DownloadDataset(const std::string& dataset)
{
if (datasetMap[dataset].zipFile && (!Utils::PathExists(
datasetMap[dataset].trainPath) ||
!Utils::PathExists(datasetMap[dataset].testPath)))
{
Utils::DownloadFile(datasetMap[dataset].datasetURL,
datasetMap[dataset].datasetPath, dataset + "_training_data.",
false, false, datasetMap[dataset].serverName,
datasetMap[dataset].zipFile);

if (!Utils::CompareCRC32(datasetMap[dataset].datasetPath,
datasetMap[dataset].datasetHash))
{
mlpack::Log::Fatal << "Corrupted Data for " << dataset <<
" downloaded." << std::endl;
}

return;
}

if (!Utils::PathExists(datasetMap[dataset].trainPath))
{
Utils::DownloadFile(datasetMap[dataset].trainDownloadUrl,
Utils::DownloadFile(datasetMap[dataset].trainDownloadURL,
datasetMap[dataset].trainPath, dataset + "_training_data.",
false);
false, false, datasetMap[dataset].serverName);

if (!Utils::CompareCRC32(datasetMap[dataset].trainPath,
datasetMap[dataset].trainHash))
Expand All @@ -192,11 +211,12 @@ class DataLoader
dataset << " downloaded." << std::endl;
}
}

if (!Utils::PathExists(datasetMap[dataset].testPath))
{
Utils::DownloadFile(datasetMap[dataset].trainDownloadUrl,
Utils::DownloadFile(datasetMap[dataset].trainDownloadURL,
datasetMap[dataset].testPath, dataset + "_testing_data.",
false);
false, false, datasetMap[dataset].serverName);

if (!Utils::CompareCRC32(datasetMap[dataset].testPath,
datasetMap[dataset].testHash))
Expand Down
32 changes: 16 additions & 16 deletions dataloader/dataloader_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ template<
DatasetX, DatasetY, ScalerType
>::DataLoader(const std::string& dataset,
const bool shuffle,
const double ratio,
const double testRatio,
const bool useScaler,
const std::vector<std::string> augmentation,
const double augmentationProbability)
Expand All @@ -49,14 +49,14 @@ template<

if (datasetMap[dataset].loadCSV)
{
LoadCSV(datasetMap[dataset].trainPath, true, shuffle, ratio, useScaler,
datasetMap[dataset].dropHeader,
LoadCSV(datasetMap[dataset].trainPath, true, shuffle, testRatio,
useScaler, datasetMap[dataset].dropHeader,
datasetMap[dataset].startTrainingInputFeatures,
datasetMap[dataset].endTrainingInputFeatures,
datasetMap[dataset].endTrainingPredictionFeatures,
datasetMap[dataset].endTrainingPredictionFeatures);

LoadCSV(datasetMap[dataset].testPath, false, false, useScaler,
LoadCSV(datasetMap[dataset].testPath, false, false, testRatio, useScaler,
datasetMap[dataset].dropHeader,
datasetMap[dataset].startTestingInputFeatures,
datasetMap[dataset].endTestingInputFeatures);
Expand Down Expand Up @@ -85,7 +85,7 @@ template<
>::LoadCSV(const std::string& datasetPath,
const bool loadTrainData,
const bool shuffle,
const double ratio,
const double testRatio,
const bool useScaler,
const bool dropHeader,
const int startInputFeatures,
Expand All @@ -104,14 +104,7 @@ template<
if (loadTrainData)
{
arma::mat trainDataset, validDataset;
data::Split(dataset, trainDataset, validDataset, ratio, shuffle);

if (useScaler)
{
scaler.Fit(trainDataset);
scaler.Transform(trainDataset, trainDataset);
scaler.Transform(validDataset, validDataset);
}
data::Split(dataset, trainDataset, validDataset, testRatio, shuffle);

trainFeatures = trainDataset.rows(WrapIndex(startInputFeatures,
trainDataset.n_rows), WrapIndex(endInputFeatures,
Expand All @@ -125,10 +118,16 @@ template<
validDataset.n_rows), WrapIndex(endInputFeatures,
validDataset.n_rows));

validLabels = trainDataset.rows(WrapIndex(startPredictionFeatures,
validLabels = validDataset.rows(WrapIndex(startPredictionFeatures,
validDataset.n_rows), WrapIndex(endPredictionFeatures,
validDataset.n_rows));

if (useScaler)
{
scaler.Fit(trainFeatures);
scaler.Transform(trainFeatures, trainFeatures);
scaler.Transform(validFeatures, validFeatures);
}
// TODO : Add support for augmentation here.
mlpack::Log::Info << "Training Dataset Loaded." << std::endl;
}
Expand All @@ -139,8 +138,9 @@ template<
scaler.Transform(dataset, dataset);
}

testFeatures = dataset.submat(WrapIndex(startInputFeatures, dataset.n_rows),
0, WrapIndex(endInputFeatures, dataset.n_rows), dataset.n_cols - 1);
testFeatures = dataset.rows(WrapIndex(startInputFeatures, dataset.n_rows),
WrapIndex(endInputFeatures, dataset.n_rows));

mlpack::Log::Info << "Testing Dataset Loaded." << std::endl;
}
}
Expand Down
130 changes: 115 additions & 15 deletions dataloader/datasets.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,47 @@ template<
>
struct DatasetDetails
{
//! Locally stored name of dataset used for identification
//! during dataloader call.
std::string datasetName;
std::string trainDownloadUrl;
std::string testDownloadUrl;

//! Locally stored URL for downloading training data.
std::string trainDownloadURL;

//! Locally stored URL for downloading testing data.
std::string testDownloadURL;

//! CRC-32 checksum for training data file.
std::string trainHash;

//! CRC-32 checksum for testing data file.
std::string testHash;

//! Locally stored boolean to determine if dataset is of CSV or similar
//! format.
bool loadCSV;

//! Locally stored path to file / directory for training data.
std::string trainPath;

//! Locally stored path to file / directory for testing data.
std::string testPath;

//! Locally held boolean to determine whether dataset will be in zip format.
bool zipFile;

//! Locally stored URL for downloading dataset.
std::string datasetURL;

//! Locally stored CRC-32 checksum for the dataset.
std::string datasetHash;

//! Locally stored path for saving the archived / zip dataset.
std::string datasetPath;

//! Locally stored server name for download file.
std::string serverName;

// Pre-Process functor.
std::function<void(DatasetX&, DatasetY&,
DatasetX&, DatasetY&, DatasetX&)> PreProcess;
Expand All @@ -61,13 +93,18 @@ struct DatasetDetails
// Default constructor.
DatasetDetails() :
datasetName(""),
trainDownloadUrl(""),
testDownloadUrl(""),
trainDownloadURL(""),
testDownloadURL(""),
trainHash(""),
testHash(""),
loadCSV(false),
trainPath(""),
testPath(""),
zipFile(false),
datasetURL(""),
datasetPath(""),
datasetHash(""),
serverName("www.mlpack.org"),
startTrainingInputFeatures(0),
endTrainingInputFeatures(0),
startTrainingPredictionFeatures(0),
Expand All @@ -77,23 +114,85 @@ struct DatasetDetails
dropHeader(false)
{/* Nothing to do here. */}

// Constructor for initializing object.
/**
 * Constructor for initializing object for separate
 * train and test download URLs.
*
* @param datasetName Name of dataset used for identification during
* dataloader call.
* @param trainDownloadURL URL for downloading training data.
* @param testDownloadURL URL for downloading testing data.
* @param trainHash CRC-32 checksum for training data.
* @param testHash CRC-32 checksum for testing data.
* @param loadCSV Determines if the format of dataset is similar to CSV.
* @param trainPath Path for training dataset.
* @param testPath Path for testing dataset.
*/
DatasetDetails(const std::string& datasetName,
const std::string& trainDownloadUrl,
const std::string& testDownloadUrl,
const std::string& trainDownloadURL,
const std::string& testDownloadURL,
const std::string& trainHash,
const std::string& testHash,
const bool loadCSV,
const std::string& trainPath,
const std::string& testPath) :
datasetName(datasetName),
trainDownloadUrl(trainDownloadUrl),
testDownloadUrl(testDownloadUrl),
trainDownloadURL(trainDownloadURL),
testDownloadURL(testDownloadURL),
trainHash(trainHash),
testHash(testHash),
loadCSV(loadCSV),
trainPath(trainPath),
testPath(testPath),
zipFile(false),
datasetURL(""),
datasetHash(""),
serverName("www.mlpack.org"),
startTrainingInputFeatures(0),
endTrainingInputFeatures(0),
startTrainingPredictionFeatures(0),
endTrainingPredictionFeatures(0),
startTestingInputFeatures(0),
endTestingInputFeatures(0),
dropHeader(false)
{
// Nothing to do here.
}

/**
* Constructor for initializing paths for zip files.
*
* @param datasetName Name of dataset used for identification during
* dataloader call.
* @param zipFile Boolean to determine if dataset is stored in zip format.
* @param datasetURL URL for downloading dataset.
* @param datasetPath Path where the dataset will be downloaded.
* @param datasetHash CRC-32 checksum for dataset.
* @param loadCSV Determines if the format of dataset is similar to CSV.
* @param trainPath Path for training dataset.
* @param testPath Path for testing dataset.
*/
DatasetDetails(const std::string& datasetName,
const bool zipFile,
const std::string& datasetURL,
const std::string& datasetPath,
const std::string& datasetHash,
const bool loadCSV,
const std::string& trainPath,
const std::string& testPath) :
datasetName(datasetName),
zipFile(zipFile),
datasetURL(datasetURL),
datasetHash(datasetHash),
datasetPath(datasetPath),
loadCSV(loadCSV),
trainPath(trainPath),
testPath(testPath),
trainDownloadURL(""),
testDownloadURL(""),
trainHash(""),
testHash(""),
serverName("www.mlpack.org"),
startTrainingInputFeatures(0),
endTrainingInputFeatures(0),
startTrainingPredictionFeatures(0),
Expand All @@ -119,17 +218,18 @@ template<
class Datasets
{
public:
//! Get details of MNIST Dataset.
const static DatasetDetails<DatasetX, DatasetY> MNIST()
{
DatasetDetails<DatasetX, DatasetY> mnistDetails(
"mnist",
"/datasets/mnist_train.csv",
"/datasets/mnist_test.csv",
"772495e3",
"8bcdb7e1",
true,
"./../data/mnist_train.csv",
"./../data/mnist_test.csv");
"/datasets/mnist.tar.gz",
"./../data/mnist.tar.gz",
"33470ca3",
true,
"./../data/mnist-dataset/mnist_train.csv",
"./../data/mnist-dataset/mnist_test.csv");

// Set the Pre-Processor Function.
mnistDetails.PreProcess = PreProcessor<DatasetX, DatasetY>::MNIST;
Expand Down
Loading