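Benchmarks are done: switch the LightGBM model-builder config and script from XGBoost-style option names to LightGBM ones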

igor_rukhovich · igor_rukhovich · commit 01a5c602d160 · 2020-10-01T01:26:26.000+03:00
diff --git a/configs/cpu_lgbm_gbt_config.json b/configs/cpu_lgbm_gbt_config.json
@@ -4,8 +4,7 @@
         "lib": ["modelbuilders"],
         "data-format": ["pandas"],
         "data-order": ["F"],
-        "dtype": ["float32"],
-        "count-dmatrix": [""]
+        "dtype": ["float32"]
     },
     "cases": [
         {
@@ -22,8 +21,7 @@
                 }
             ],
             "n-estimators": [100],
-            "objective": ["reg:squarederror"],
-            "tree-method": ["hist"],
+            "objective": ["regression"],
             "max-depth": [8],
             "scale-pos-weight": [2],
             "learning-rate": [0.1],
@@ -56,8 +54,7 @@
             "max-depth": [8],
             "max-leaves": [256],
             "n-estimators": [1000],
-            "objective": ["binary:logistic"],
-            "tree-method": ["hist"]
+            "objective": ["binary"]
         },
         {
             "algorithm": "lgbm_mb",
@@ -82,8 +79,7 @@
             "max-depth": [8],
             "max-leaves": [256],
             "n-estimators": [1000],
-            "objective": ["binary:logistic"],
-            "tree-method": ["hist"]
+            "objective": ["binary"]
         },
         {
             "algorithm": "lgbm_mb",
@@ -103,11 +99,10 @@
             "subsample": [1],
             "reg-lambda":  [2],
             "min-child-weight": [1],
-            "min-split-loss": [0.1],
+            "min-split-gain": [0.1],
             "max-depth": [8],
             "n-estimators": [200],
-            "objective": ["multi:softprob"],
-            "tree-method": ["hist"]
+            "objective": ["multiclass"]
         }
     ]
 }
diff --git a/modelbuilders/bench.py b/modelbuilders/bench.py
@@ -136,31 +136,13 @@ def load_data(params, generated_data=[], add_dtype=False, label_2d=False,
             if param_vars[file_arg].name.endswith('.npy'):
                 data = np.load(param_vars[file_arg].name)
             else:
-                data = read_csv(param_vars[file_arg].name, params)
+                data = read_csv(param_vars[file_arg].name)
             full_data[element] = convert_data(
                 data,
                 int_dtype if 'y' in element and int_label else params.dtype,
                 params.data_order, params.data_format
             )
-        # generate and convert data if it's marked and path isn't specified
-        if full_data[element] is None and element in generated_data:
-            full_data[element] = convert_data(
-                np.random.rand(*params.shape),
-                int_dtype if 'y' in element and int_label else params.dtype,
-                params.data_order, params.data_format)
-        # convert existing labels from 1- to 2-dimensional
-        # if it's forced and possible
-        if full_data[element] is not None and 'y' in element and label_2d and hasattr(full_data[element], 'reshape'):
-            full_data[element] = full_data[element].reshape(
-                (full_data[element].shape[0], 1))
-        # add dtype property to data if it's needed and doesn't exist
-        if full_data[element] is not None and add_dtype and not hasattr(full_data[element], 'dtype'):
-            if hasattr(full_data[element], 'values'):
-                full_data[element].dtype = full_data[element].values.dtype
-            elif hasattr(full_data[element], 'dtypes'):
-                full_data[element].dtype = full_data[element].dtypes[0].type
-
-    params.dtype = get_dtype(full_data['X_train'])
+
     # add size to parameters which is need for some cases
     if not hasattr(params, 'size'):
         params.size = size_str(full_data['X_train'].shape)
@@ -363,7 +345,7 @@ def print_output(library, algorithm, stages, columns, params, functions,
         print(json.dumps(output, indent=4))
 
 
-def read_csv(filename, params):
+def read_csv(filename):
     from string import ascii_lowercase, ascii_uppercase
 
     # find out header existance
@@ -377,9 +359,9 @@ def read_csv(filename, params):
     # try to read csv with pandas and fall back to numpy reader if failed
     try:
         import pandas as pd
-        data = pd.read_csv(filename, header=header, dtype=params.dtype).values
+        data = pd.read_csv(filename, header=header, dtype=np.float32).values
     except ImportError:
-        data = np.genfromtxt(filename, delimiter=',', dtype=params.dtype,
+        data = np.genfromtxt(filename, delimiter=',', dtype=np.float32,
                              skip_header=0 if header is None else 1)
 
     if data.ndim == 2:
diff --git a/modelbuilders/lgbm_mb.py b/modelbuilders/lgbm_mb.py
@@ -29,18 +29,17 @@
                     help='Maximum delta step we allow each leaf output to be')
 parser.add_argument('--max-depth', type=int, default=6,
                     help='Maximum depth of a tree')
-parser.add_argument('--max-leaves', type=int, default=0,
+parser.add_argument('--max-leaves', type=int, default=31,
                     help='Maximum number of nodes to be added')
 parser.add_argument('--min-child-weight', type=float, default=1,
                     help='Minimum sum of instance weight needed in a child')
-parser.add_argument('--min-split-loss', '--gamma', type=float, default=0,
+parser.add_argument('--min-split-gain', '--gamma', type=float, default=0,
                     help='Minimum loss reduction required to make'
                          ' partition on a leaf node')
 parser.add_argument('--n-estimators', type=int, default=100,
                     help='Number of gradient boosted trees')
 parser.add_argument('--objective', type=str, required=True,
-                    choices=('reg:squarederror', 'binary:logistic',
-                             'multi:softmax', 'multi:softprob'),
+                    choices=('regression', 'binary', 'multiclass'),
                     help='Control a balance of positive and negative weights')
 parser.add_argument('--reg-alpha', type=float, default=0,
                     help='L1 regularization term on weights')
@@ -50,31 +49,24 @@
                     help='Controls a balance of positive and negative weights')
 parser.add_argument('--subsample', type=float, default=1,
                     help='Subsample ratio of the training instances')
-parser.add_argument('--tree-method', type=str, required=True,
-                    help='The tree construction algorithm used in XGBoost')
 
 params = parse_args(parser)
 
 X_train, X_test, y_train, y_test = load_data(params)
 
 lgbm_params = {
-    'booster': 'gbtree',
-    'verbosity': 0,
+    'verbosity': -1,
     'learning_rate': params.learning_rate,
-    'min_split_loss': params.min_split_loss,
+    'min_split_gain': params.min_split_gain,
     'max_depth': params.max_depth,
     'min_child_weight': params.min_child_weight,
     'max_delta_step': params.max_delta_step,
     'subsample': params.subsample,
-    'sampling_method': 'uniform',
     'colsample_bytree': params.colsample_bytree,
-    'colsample_bylevel': 1,
     'colsample_bynode': 1,
     'reg_lambda': params.reg_lambda,
     'reg_alpha': params.reg_alpha,
-    'tree_method': params.tree_method,
     'scale_pos_weight': params.scale_pos_weight,
-    'grow_policy': params.grow_policy,
     'max_leaves': params.max_leaves,
     'max_bin': params.max_bin,
     'objective': params.objective,
@@ -113,10 +105,10 @@
 t_train, model_lgbm = measure_function_time(lgbm.train, lgbm_params,  lgbm_train, params=params,
                         num_boost_round=params.n_estimators, valid_sets=lgbm_train,
                         verbose_eval=False)
-y_train_pred = model_lgbm.predict(lgbm_train)
+y_train_pred = model_lgbm.predict(X_train)
 train_metric = metric_func(y_train, y_train_pred)
 
-t_lgbm_pred, y_test_pred = measure_function_time(model_lgbm.predict, lgbm_test, params=params)
+t_lgbm_pred, y_test_pred = measure_function_time(model_lgbm.predict, X_test, params=params)
 test_metric_xgb = metric_func(y_test, y_test_pred)
 
 t_trans, model_daal = measure_function_time(daal4py.get_gbt_model_from_lightgbm, model_lgbm, params=params)
@@ -138,4 +130,4 @@
                 'lgbm_predict', 'lgbm_to_daal', 'daal_compute'],
              times=[t_creat_train, t_creat_test, t_train, t_lgbm_pred, t_trans, t_daal_pred],
              accuracy_type=metric_name, accuracies=[0, 0, train_metric, test_metric_xgb, 0, test_metric_daal],
-             data=[X_train, X_test, X_train, X_test, X_train, X_test])
+             data=[X_train, X_test, X_train, X_test, X_train, X_test])
diff --git a/modelbuilders/xgb_mb.py b/modelbuilders/xgb_mb.py
@@ -6,6 +6,7 @@
 import daal4py
 import numpy as np
 from os import environ
+from sys import stderr
 from timeit import default_timer as timer
 from typing import Tuple
 import xgboost as xgb
diff --git a/runner.py b/runner.py
@@ -289,6 +289,7 @@ class GenerationArgs:
                         try:
                             json_result['results'].extend(json.loads(stdout))
                         except json.JSONDecodeError:
+                            print("UNABLE TO PARSE, ", stdout)
                             pass
                     elif args.output_format == 'csv':
                         csv_result += stdout + '\n'