Remove async toy datasets (#227)

gyrdym · web-flow · commit 1ced4c02a598 · 2022-05-05T00:32:56.000+03:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,8 @@
 # Changelog
 
+## 16.11.4
+- Synchronous `getPimaIndiansDiabetesDataFrame` and `getIrisDataFrame` used instead of async `loadPimaIndiansDiabetesDataset` and `loadIrisDataset`
+
 ## 16.11.3
 - Toy datasets from `ml_dataframe` package used
 
diff --git a/README.md b/README.md
@@ -121,7 +121,7 @@ We have 2 options here:
 
 - Download the dataset from [Pima Indians Diabetes Database](https://www.kaggle.com/uciml/pima-indians-diabetes-database).
 
-- Or we may simply use [loadPimaIndiansDiabetesDataset](https://pub.dev/documentation/ml_dataframe/latest/ml_dataframe/loadPimaIndiansDiabetesDataset.html) function
+- Or we may simply use [getPimaIndiansDiabetesDataFrame](https://pub.dev/documentation/ml_dataframe/latest/ml_dataframe/getPimaIndiansDiabetesDataFrame.html) function
 from [ml_dataframe](https://pub.dev/packages/ml_dataframe) package. The function returns a ready to use [DataFrame](https://pub.dev/documentation/ml_dataframe/latest/ml_dataframe/DataFrame-class.html) instance
 filled with `Pima Indians Diabetes Database` data.
 
@@ -342,7 +342,7 @@ import 'package:ml_preprocessing/ml_preprocessing.dart';
 
 void main() async {
   // Another option - to use a toy dataset:
-  // final samples = await loadPimaIndiansDiabetesDataset();
+  // final samples = getPimaIndiansDiabetesDataFrame();
   final samples = await fromCsv('datasets/pima_indians_diabetes_database.csv', headerExists: true);
   final targetColumnName = 'Outcome';
   final splits = splitData(samples, [0.7]);
@@ -387,7 +387,7 @@ import 'package:ml_preprocessing/ml_preprocessing.dart';
 void main() async {
   final rawCsvContent = await rootBundle.loadString('assets/datasets/pima_indians_diabetes_database.csv');
   // Another option - to use a toy dataset:
-  // final samples = await loadPimaIndiansDiabetesDataset();
+  // final samples = getPimaIndiansDiabetesDataFrame();
   final samples = DataFrame.fromRawCsv(rawCsvContent);
   final targetColumnName = 'Outcome';
   final splits = splitData(samples, [0.7]);
@@ -599,7 +599,7 @@ void main() async {
 Let's try to classify data from a well-known [Iris](https://www.kaggle.com/datasets/uciml/iris) dataset using a non-linear algorithm - [decision trees](https://en.wikipedia.org/wiki/Decision_tree)
 
 First, you need to download the data and place it in a proper place in your file system. To do so you should follow the
-instructions which are given in the [Logistic regression](#logistic-regression) section. Or you may use [loadIrisDataset](https://pub.dev/documentation/ml_dataframe/latest/ml_dataframe/loadIrisDataset.html)
+instructions which are given in the [Logistic regression](#logistic-regression) section. Or you may use [getIrisDataFrame](https://pub.dev/documentation/ml_dataframe/latest/ml_dataframe/getIrisDataFrame.html)
 function that returns ready to use [DataFrame](https://pub.dev/documentation/ml_dataframe/latest/ml_dataframe/DataFrame-class.html) instance filled with `Iris`dataset. 
 
 After loading the data, it's needed to preprocess it. We should drop the `Id` column since the column doesn't make sense. 
@@ -612,7 +612,7 @@ import 'package:ml_dataframe/ml_dataframe.dart';
 import 'package:ml_preprocessing/ml_preprocessing.dart';
 
 void main() async {
-    final samples = (await loadIrisDataset())
+    final samples = getIrisDataFrame()
       .shuffle()
       .dropSeries(seriesNames: ['Id']);
     
@@ -675,14 +675,14 @@ efficient to retrieve data.
 Let's retrieve some data points through a kd-tree built on the [Iris](https://www.kaggle.com/datasets/uciml/iris) dataset.
 
 First, we need to prepare the data. To do so, it's needed to load the dataset. For this purpose, we may use 
-[loadIrisDataset](https://pub.dev/documentation/ml_dataframe/latest/ml_dataframe/loadIrisDataset.html) function from [ml_dataframe](https://pub.dev/packages/ml_dataframe). The function returns prefilled with the Iris data DataFrame instance:
+[getIrisDataFrame](https://pub.dev/documentation/ml_dataframe/latest/ml_dataframe/getIrisDataFrame.html) function from [ml_dataframe](https://pub.dev/packages/ml_dataframe). The function returns a DataFrame instance prefilled with the Iris data:
 
 ```dart
 import 'package:ml_algo/ml_algo.dart';
 import 'package:ml_dataframe/ml_dataframe.dart';
 
-void main() async {
-  final originalData = await loadIrisDataset();
+void main() {
+  final originalData = getIrisDataFrame();
 }
 ```
 
@@ -693,8 +693,8 @@ drop these columns:
 import 'package:ml_algo/ml_algo.dart';
 import 'package:ml_dataframe/ml_dataframe.dart';
 
-void main() async {
-  final originalData = await loadIrisDataset();
+void main() {
+  final originalData = getIrisDataFrame();
   final data = originalData.dropSeries(names: ['Id', 'Species']);
 }
 ```
@@ -705,8 +705,8 @@ Next, we can build the tree:
 import 'package:ml_algo/ml_algo.dart';
 import 'package:ml_dataframe/ml_dataframe.dart';
 
-void main() async {
-  final originalData = await loadIrisDataset();
+void main() {
+  final originalData = getIrisDataFrame();
   final data = originalData.dropSeries(names: ['Id', 'Species']);
   final tree = KDTree(data);
 }
@@ -719,8 +719,8 @@ import 'package:ml_algo/ml_algo.dart';
 import 'package:ml_dataframe/ml_dataframe.dart';
 import 'package:ml_linalg/vector.dart';
 
-void main() async {
-  final originalData = await loadIrisDataset();
+void main() {
+  final originalData = getIrisDataFrame();
   final data = originalData.dropSeries(names: ['Id', 'Species']);
   final tree = KDTree(data);
   final neighbourCount = 5;
@@ -742,8 +742,8 @@ The nearest point has an index 75 in the original data. Let's check a record at
 ```dart
 import 'package:ml_dataframe/ml_dataframe.dart';
 
-void main() async {
-  final originalData = await loadIrisDataset();
+void main() {
+  final originalData = getIrisDataFrame();
  
   print(originalData.rows.elementAt(75));
 }
@@ -784,8 +784,8 @@ import 'dart:io';
 import 'package:ml_algo/ml_algo.dart';
 import 'package:ml_dataframe/ml_dataframe.dart';
 
-void main() async {
-  final originalData = await loadIrisDataset();
+void main() {
+  final originalData = getIrisDataFrame();
   final data = originalData.dropSeries(names: ['Id', 'Species']);
   final tree = KDTree(data);
  
diff --git a/e2e/decision_tree_classifier/decision_tree_classifier_test.dart b/e2e/decision_tree_classifier/decision_tree_classifier_test.dart
@@ -5,8 +5,8 @@ import 'package:ml_linalg/vector.dart';
 import 'package:ml_preprocessing/ml_preprocessing.dart';
 import 'package:test/test.dart';
 
-Future<Vector> evaluateClassifier(MetricType metric, DType dtype) async {
-  final samples = (await loadIrisDataset()).shuffle().dropSeries(names: ['Id']);
+Future<Vector> evaluateClassifier(MetricType metric, DType dtype) {
+  final samples = getIrisDataFrame().shuffle().dropSeries(names: ['Id']);
   final pipeline = Pipeline(samples, [
     toIntegerLabels(
       columnNames: ['Species'],
diff --git a/e2e/kd_tree/kd_tree_test.dart b/e2e/kd_tree/kd_tree_test.dart
@@ -6,9 +6,8 @@ import 'package:test/test.dart';
 
 void main() async {
   group('KDTree', () {
-    test('should return correct list of neighbours, dtype=DType.float32',
-        () async {
-      final originalData = await loadIrisDataset();
+    test('should return correct list of neighbours, dtype=DType.float32', () {
+      final originalData = getIrisDataFrame();
       final data = originalData.dropSeries(names: ['Id', 'Species']);
       final tree = KDTree(data);
       final neighbours = tree.query(Vector.fromList([6.5, 3.01, 4.5, 1.5]), 5);
@@ -18,9 +17,8 @@ void main() async {
           '((Index: 75, Distance: 0.17349341930302867), (Index: 51, Distance: 0.21470911402365767), (Index: 65, Distance: 0.26095956499211426), (Index: 86, Distance: 0.29681616124778537), (Index: 56, Distance: 0.4172527193942372))');
     });
 
-    test('should return correct list of neighbours, dtype=DType.float64',
-        () async {
-      final originalData = await loadIrisDataset();
+    test('should return correct list of neighbours, dtype=DType.float64', () {
+      final originalData = getIrisDataFrame();
       final data = originalData.dropSeries(names: ['Id', 'Species']);
       final tree = KDTree(data, dtype: DType.float64);
       final neighbours = tree.query(
diff --git a/e2e/knn_classifier/knn_classifier_test.dart b/e2e/knn_classifier/knn_classifier_test.dart
@@ -5,8 +5,8 @@ import 'package:ml_linalg/vector.dart';
 import 'package:ml_preprocessing/ml_preprocessing.dart';
 import 'package:test/test.dart';
 
-Future<Vector> evaluateKnnClassifier(MetricType metric, DType dtype) async {
-  final samples = (await loadIrisDataset()).shuffle().dropSeries(names: ['Id']);
+Future<Vector> evaluateKnnClassifier(MetricType metric, DType dtype) {
+  final samples = getIrisDataFrame().shuffle().dropSeries(names: ['Id']);
   final targetName = 'Species';
   final pipeline = Pipeline(samples, [
     toIntegerLabels(
diff --git a/e2e/logistic_regressor/logistic_regressor_test.dart b/e2e/logistic_regressor/logistic_regressor_test.dart
@@ -4,8 +4,8 @@ import 'package:ml_linalg/dtype.dart';
 import 'package:ml_linalg/vector.dart';
 import 'package:test/test.dart';
 
-Future<Vector> evaluateLogisticRegressor(MetricType metric, DType dtype) async {
-  final samples = (await loadPimaIndiansDiabetesDataset()).shuffle();
+Future<Vector> evaluateLogisticRegressor(MetricType metric, DType dtype) {
+  final samples = getPimaIndiansDiabetesDataFrame().shuffle();
   final numberOfFolds = 5;
   final targetNames = ['Outcome'];
   final validator = CrossValidator.kFold(
diff --git a/e2e/softmax_regressor/softmax_regressor_test.dart b/e2e/softmax_regressor/softmax_regressor_test.dart
@@ -5,9 +5,8 @@ import 'package:ml_linalg/vector.dart';
 import 'package:ml_preprocessing/ml_preprocessing.dart';
 import 'package:test/test.dart';
 
-Future<Vector> evaluateSoftmaxRegressor(
-    MetricType metricType, DType dtype) async {
-  final samples = (await loadIrisDataset()).shuffle().dropSeries(names: ['Id']);
+Future<Vector> evaluateSoftmaxRegressor(MetricType metricType, DType dtype) {
+  final samples = getIrisDataFrame().shuffle().dropSeries(names: ['Id']);
   final pipeline = Pipeline(samples, [
     toOneHotLabels(
       columnNames: ['Species'],
diff --git a/pubspec.yaml b/pubspec.yaml
@@ -1,6 +1,6 @@
 name: ml_algo
 description: Machine learning algorithms, Machine learning models performance evaluation functionality
-version: 16.11.3
+version: 16.11.4
 homepage: https://github.com/gyrdym/ml_algo
 
 environment:
@@ -10,7 +10,7 @@ dependencies:
   collection: ^1.16.0
   injector: ^2.0.0
   json_annotation: ^4.0.0
-  ml_dataframe: ^1.4.2
+  ml_dataframe: ^1.5.0
   ml_linalg: ^13.7.0
   ml_preprocessing: ^7.0.2
   quiver: ^3.0.0