Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .ci/linux-steps.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ steps:
displayName: 'Build models'

# Run CTests.
- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -R UtilsTest
- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test .
displayName: 'Run tests via ctest'

# Publish test results to Azure Pipelines
Expand Down
2 changes: 1 addition & 1 deletion .ci/macos-steps.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ steps:
displayName: 'Build models'

# Run CTests.
- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -R UtilsTest
- script: cd build/tests/ && sudo CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test .
displayName: 'Run tests via ctest'

# Publish test results to Azure Pipelines
Expand Down
2 changes: 1 addition & 1 deletion .ci/windows-steps.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ steps:
# Run tests via ctest.
- bash: |
cd build/tests
CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test -C Release -R UtilsTest
CTEST_OUTPUT_ON_FAILURE=1 ctest -T Test -C Release .
displayName: 'Run tests via ctest'

# Publish test results to Azure Pipelines
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,8 @@ xcode*
.idea
cmake-build-*
*.csv
*.tar
*.zip
*.tar.gz
.travis/configs.hpp
Testing/*
28 changes: 24 additions & 4 deletions dataloader/dataloader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,11 +179,30 @@ class DataLoader
*/
void DownloadDataset(const std::string& dataset)
{
if (datasetMap[dataset].zipFile && (!Utils::PathExists(
datasetMap[dataset].trainPath) ||
!Utils::PathExists(datasetMap[dataset].testPath)))
{
Utils::DownloadFile(datasetMap[dataset].datasetURL,
datasetMap[dataset].datasetPath, dataset + "_training_data.",
false, false, datasetMap[dataset].serverName,
datasetMap[dataset].zipFile);

if (!Utils::CompareCRC32(datasetMap[dataset].datasetPath,
datasetMap[dataset].datasetHash))
{
mlpack::Log::Fatal << "Corrupted Data for " << dataset <<
" downloaded." << std::endl;
}

return;
}

if (!Utils::PathExists(datasetMap[dataset].trainPath))
{
Utils::DownloadFile(datasetMap[dataset].trainDownloadUrl,
Utils::DownloadFile(datasetMap[dataset].trainDownloadURL,
datasetMap[dataset].trainPath, dataset + "_training_data.",
false);
false, false, datasetMap[dataset].serverName);

if (!Utils::CompareCRC32(datasetMap[dataset].trainPath,
datasetMap[dataset].trainHash))
Expand All @@ -192,11 +211,12 @@ class DataLoader
dataset << " downloaded." << std::endl;
}
}

if (!Utils::PathExists(datasetMap[dataset].testPath))
{
Utils::DownloadFile(datasetMap[dataset].trainDownloadUrl,
Utils::DownloadFile(datasetMap[dataset].trainDownloadURL,
datasetMap[dataset].testPath, dataset + "_testing_data.",
false);
false, false, datasetMap[dataset].serverName);

if (!Utils::CompareCRC32(datasetMap[dataset].testPath,
datasetMap[dataset].testHash))
Expand Down
22 changes: 11 additions & 11 deletions dataloader/dataloader_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ template<
datasetMap[dataset].endTrainingPredictionFeatures,
datasetMap[dataset].endTrainingPredictionFeatures);

LoadCSV(datasetMap[dataset].testPath, false, false, useScaler,
LoadCSV(datasetMap[dataset].testPath, false, false, ratio, useScaler,
datasetMap[dataset].dropHeader,
datasetMap[dataset].startTestingInputFeatures,
datasetMap[dataset].endTestingInputFeatures);
Expand Down Expand Up @@ -106,13 +106,6 @@ template<
arma::mat trainDataset, validDataset;
data::Split(dataset, trainDataset, validDataset, ratio, shuffle);

if (useScaler)
{
scaler.Fit(trainDataset);
scaler.Transform(trainDataset, trainDataset);
scaler.Transform(validDataset, validDataset);
}

trainFeatures = trainDataset.rows(WrapIndex(startInputFeatures,
trainDataset.n_rows), WrapIndex(endInputFeatures,
trainDataset.n_rows));
Expand All @@ -125,10 +118,16 @@ template<
validDataset.n_rows), WrapIndex(endInputFeatures,
validDataset.n_rows));

validLabels = trainDataset.rows(WrapIndex(startPredictionFeatures,
validLabels = validDataset.rows(WrapIndex(startPredictionFeatures,
validDataset.n_rows), WrapIndex(endPredictionFeatures,
validDataset.n_rows));

if (useScaler)
{
scaler.Fit(trainFeatures);
scaler.Transform(trainFeatures, trainFeatures);
scaler.Transform(validFeatures, validFeatures);
}
// TODO : Add support for augmentation here.
mlpack::Log::Info << "Training Dataset Loaded." << std::endl;
}
Expand All @@ -139,8 +138,9 @@ template<
scaler.Transform(dataset, dataset);
}

testFeatures = dataset.submat(WrapIndex(startInputFeatures, dataset.n_rows),
0, WrapIndex(endInputFeatures, dataset.n_rows), dataset.n_cols - 1);
testFeatures = dataset.rows(WrapIndex(startInputFeatures, dataset.n_rows),
WrapIndex(endInputFeatures, dataset.n_rows));

mlpack::Log::Info << "Testing Dataset Loaded." << std::endl;
}
}
Expand Down
130 changes: 115 additions & 15 deletions dataloader/datasets.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,47 @@ template<
>
struct DatasetDetails
{
//! Locally stored name of dataset used for identification
//! during dataloader call.
std::string datasetName;
std::string trainDownloadUrl;
std::string testDownloadUrl;

//! Locally stored URL for downloading training data.
std::string trainDownloadURL;

//! Locally stored URL for downloading testing data.
std::string testDownloadURL;

//! CRC-32 checksum for training data file.
std::string trainHash;

//! CRC-32 checksum for testing data file.
std::string testHash;

//! Locally stored boolean to determine if dataset is of CSV or similar
//! format.
bool loadCSV;

//! Locally stored path to file / directory for training data.
std::string trainPath;

//! Locally stored path to file / directory for testing data.
std::string testPath;

//! Locally held boolean to determine whether dataset will be in zip format.
bool zipFile;

//! Locally stored URL for downloading dataset.
std::string datasetURL;

//! Locally stored CRC-32 checksum for the dataset.
std::string datasetHash;

//! Locally stored path for saving the archived / zip dataset.
std::string datasetPath;

//! Locally stored server name for download file.
std::string serverName;

// Pre-Process functor.
std::function<void(DatasetX&, DatasetY&,
DatasetX&, DatasetY&, DatasetX&)> PreProcess;
Expand All @@ -61,13 +93,18 @@ struct DatasetDetails
// Default constructor.
DatasetDetails() :
datasetName(""),
trainDownloadUrl(""),
testDownloadUrl(""),
trainDownloadURL(""),
testDownloadURL(""),
trainHash(""),
testHash(""),
loadCSV(false),
trainPath(""),
testPath(""),
zipFile(false),
datasetURL(""),
datasetPath(""),
datasetHash(""),
serverName("www.mlpack.org"),
startTrainingInputFeatures(0),
endTrainingInputFeatures(0),
startTrainingPredictionFeatures(0),
Expand All @@ -77,23 +114,85 @@ struct DatasetDetails
dropHeader(false)
{/* Nothing to do here. */}

// Constructor for initializing object.
/**
* Constructor for initializing object for separate
* train and test download URLs.
*
* @param datasetName Name of dataset used for identification during
* dataloader call.
* @param trainDownloadURL URL for downloading training data.
* @param testDownloadURL URL for downloading testing data.
* @param trainHash CRC-32 checksum for training data.
* @param testHash CRC-32 checksum for testing data.
* @param loadCSV Determines if the format of dataset is similar to CSV.
* @param trainPath Path for training dataset.
* @param testPath Path for testing dataset.
*/
DatasetDetails(const std::string& datasetName,
const std::string& trainDownloadUrl,
const std::string& testDownloadUrl,
const std::string& trainDownloadURL,
const std::string& testDownloadURL,
const std::string& trainHash,
const std::string& testHash,
const bool loadCSV,
const std::string& trainPath,
const std::string& testPath) :
datasetName(datasetName),
trainDownloadUrl(trainDownloadUrl),
testDownloadUrl(testDownloadUrl),
trainDownloadURL(trainDownloadURL),
testDownloadURL(testDownloadURL),
trainHash(trainHash),
testHash(testHash),
loadCSV(loadCSV),
trainPath(trainPath),
testPath(testPath),
zipFile(false),
datasetURL(""),
datasetHash(""),
serverName("www.mlpack.org"),
startTrainingInputFeatures(0),
endTrainingInputFeatures(0),
startTrainingPredictionFeatures(0),
endTrainingPredictionFeatures(0),
startTestingInputFeatures(0),
endTestingInputFeatures(0),
dropHeader(false)
{
// Nothing to do here.
}

/**
* Constructor for initializing paths for zip files.
*
* @param datasetName Name of dataset used for identification during
* dataloader call.
* @param zipFile Boolean to determine if dataset is stored in zip format.
* @param datasetURL URL for downloading dataset.
* @param datasetPath Path where the dataset will be downloaded.
* @param datasetHash CRC-32 checksum for dataset.
* @param loadCSV Determines if the format of dataset is similar to CSV.
* @param trainPath Path for training dataset.
* @param testPath Path for testing dataset.
*/
DatasetDetails(const std::string& datasetName,
const bool zipFile,
const std::string& datasetURL,
const std::string& datasetPath,
const std::string& datasetHash,
const bool loadCSV,
const std::string& trainPath,
const std::string& testPath) :
datasetName(datasetName),
zipFile(zipFile),
datasetURL(datasetURL),
datasetHash(datasetHash),
datasetPath(datasetPath),
loadCSV(loadCSV),
trainPath(trainPath),
testPath(testPath),
trainDownloadURL(""),
testDownloadURL(""),
trainHash(""),
testHash(""),
serverName("www.mlpack.org"),
startTrainingInputFeatures(0),
endTrainingInputFeatures(0),
startTrainingPredictionFeatures(0),
Expand All @@ -119,17 +218,18 @@ template<
class Datasets
{
public:
//! Get details of MNIST Dataset.
const static DatasetDetails<DatasetX, DatasetY> MNIST()
{
DatasetDetails<DatasetX, DatasetY> mnistDetails(
"mnist",
"/datasets/mnist_train.csv",
"/datasets/mnist_test.csv",
"772495e3",
"8bcdb7e1",
true,
"./../data/mnist_train.csv",
"./../data/mnist_test.csv");
"/datasets/mnist.tar.gz",
"./../data/mnist.tar.gz",
"33470ca3",
true,
"./../data/mnist-dataset/mnist_train.csv",
"./../data/mnist-dataset/mnist_test.csv");

// Set the Pre-Processor Function.
mnistDetails.PreProcess = PreProcessor<DatasetX, DatasetY>::MNIST;
Expand Down
27 changes: 23 additions & 4 deletions tests/dataloader_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ BOOST_AUTO_TEST_CASE(CSVDataLoaderTest)
// Check for training dataset using tuples.
BOOST_REQUIRE_EQUAL(std::get<0>(irisDataloader.TrainSet()).n_cols, 75);
BOOST_REQUIRE_EQUAL(std::get<0>(irisDataloader.TrainSet()).n_rows, 4);

Utils::RemoveFile("./../data/iris.csv");
}

/**
Expand All @@ -58,11 +60,28 @@ BOOST_AUTO_TEST_CASE(CSVDataLoaderTest)
BOOST_AUTO_TEST_CASE(MNISTDataLoaderTest)
{
DataLoader<> dataloader("mnist", true, 0.80);

// Check for correct dimensions.
BOOST_REQUIRE_EQUAL(dataloader.TrainFeatures().n_cols, 784);
BOOST_REQUIRE_EQUAL(dataloader.TestFeatures().n_cols, 784);
BOOST_REQUIRE_EQUAL(dataloader.ValidFeatures().n_cols, 784);
BOOST_REQUIRE_EQUAL(dataloader.TrainFeatures().n_rows, 33600);
BOOST_REQUIRE_EQUAL(dataloader.TrainFeatures().n_rows, 784);
BOOST_REQUIRE_EQUAL(dataloader.TestFeatures().n_rows, 784);
BOOST_REQUIRE_EQUAL(dataloader.ValidFeatures().n_rows, 784);


Copy link
Member

@KimSangYeon-DGU KimSangYeon-DGU May 31, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you use 1 newline and add a comment about what we're checking in this block?

And, perhaps, I missed something... can you let me know why the data were distributed like below?
Train : 8,400
Valid: 33,600
Test: 28,000

Copy link
Member Author

@kartikdutt18 kartikdutt18 May 31, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the same split as mlpack::data::split. Here 0.8 means 80% testing and 20% training data, since mlpack takes in the test ratio. I thought it would be best not to invert it. Let me know if I should do that or keep it the same as mlpack. Here is the link for the data file.

Copy link
Member

@KimSangYeon-DGU KimSangYeon-DGU May 31, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, then it would be better to rename the parameter ratio to testRatio in the DataLoader class for clarity :)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, I'll make the change.

Copy link
Member

@KimSangYeon-DGU KimSangYeon-DGU May 31, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, kindly update the parameter description as well in here

Copy link
Member

@KimSangYeon-DGU KimSangYeon-DGU May 31, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, ok. I thought it's from the same file :)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should I use validRatio instead? I think that makes even more sense.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice, that makes sense to me as well.

Copy link
Member Author

@kartikdutt18 kartikdutt18 May 31, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, making the changes.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. :)

BOOST_REQUIRE_EQUAL(dataloader.TrainFeatures().n_cols, 8400);
BOOST_REQUIRE_EQUAL(dataloader.ValidFeatures().n_cols, 33600);
BOOST_REQUIRE_EQUAL(dataloader.TestFeatures().n_cols, 28000);

BOOST_REQUIRE_EQUAL(std::get<0>(dataloader.TrainSet()).n_cols, 8400);
BOOST_REQUIRE_EQUAL(std::get<1>(dataloader.TrainSet()).n_rows, 1);
BOOST_REQUIRE_EQUAL(std::get<0>(dataloader.ValidSet()).n_cols, 33600);
BOOST_REQUIRE_EQUAL(std::get<1>(dataloader.TrainSet()).n_rows, 1);

// Clean up.
Utils::RemoveFile("./../data/mnist-dataset/mnist_all.csv");
Utils::RemoveFile("./../data/mnist-dataset/mnist_all_centroids.csv");
Utils::RemoveFile("./../data/mnist-dataset/mnist_train.csv");
Utils::RemoveFile("./../data/mnist-dataset/mnist_test.csv");
Utils::RemoveFile("./../data/mnist.tar.gz");
}

BOOST_AUTO_TEST_SUITE_END();
17 changes: 17 additions & 0 deletions tests/utils_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,4 +74,21 @@ BOOST_AUTO_TEST_CASE(RemoveFileTest)
BOOST_REQUIRE_EQUAL(Utils::PathExists("./../data/file.txt"), 0);
}

/**
 * Check that the files packed inside a downloaded archive are present on disk
 * once the download call returns.
 *
 * The test requests /datasets/USCensus1990.tar.gz and stores it as
 * ./../data/USCensus1990.tar.gz, then requires that both CSV files expected
 * from the archive exist, and finally removes everything it created.
 *
 * NOTE(review): the trailing `true, "./../data/"` arguments presumably switch
 * on archive extraction and name the extraction directory; confirm against the
 * Utils::DownloadFile signature.  The test needs network access to run.
 */
BOOST_AUTO_TEST_CASE(ExtractFilesTest)
{
  Utils::DownloadFile("/datasets/USCensus1990.tar.gz",
      "./../data/USCensus1990.tar.gz", "", false, true,
      "www.mlpack.org", true, "./../data/");

  // Both CSV files from the archive must exist after the download call.
  BOOST_REQUIRE(Utils::PathExists("./../data/USCensus1990.csv"));
  BOOST_REQUIRE(Utils::PathExists("./../data/USCensus1990_centroids.csv"));

  // Clean up the extracted files and the archive itself so later tests start
  // from the same on-disk state.
  Utils::RemoveFile("./../data/USCensus1990.csv");
  Utils::RemoveFile("./../data/USCensus1990_centroids.csv");
  Utils::RemoveFile("./../data/USCensus1990.tar.gz");
}

BOOST_AUTO_TEST_SUITE_END();
Loading