from __future__ import division
from __future__ import print_function

- import argparse
import os
- import sys

+ # pylint: disable=g-bad-import-order
+ import numpy as np
from absl import app as absl_app
from absl import flags
- import numpy as np  # pylint: disable=wrong-import-order
- import tensorflow as tf  # pylint: disable=wrong-import-order
+ import tensorflow as tf
+ # pylint: enable=g-bad-import-order

from official.utils.flags import core as flags_core
from official.utils.flags._conventions import help_wrap
+ from official.utils.logs import logger

+ NPZ_FILE = "HIGGS.csv.gz.npz"  # numpy compressed file containing "data" array
- NPZ_FILE = 'HIGGS.csv.gz.npz'  # numpy compressed file containing 'data' array
-
-
- def define_train_higgs_flags():
-   """Add tree related flags as well as training/eval configuration."""
-   flags_core.define_base(stop_threshold=False, batch_size=False, num_gpu=False)
-   flags.adopt_module_key_flags(flags_core)
-
-   flags.DEFINE_integer(
-       name='train_start', default=0,
-       help=help_wrap('Start index of train examples within the data.'))
-   flags.DEFINE_integer(
-       name='train_count', default=1000000,
-       help=help_wrap('Number of train examples within the data.'))
-   flags.DEFINE_integer(
-       name='eval_start', default=10000000,
-       help=help_wrap('Start index of eval examples within the data.'))
-   flags.DEFINE_integer(
-       name='eval_count', default=1000000,
-       help=help_wrap('Number of eval examples within the data.'))
-
-   flags.DEFINE_integer(
-       'n_trees', default=100, help=help_wrap('Number of trees to build.'))
-   flags.DEFINE_integer(
-       'max_depth', default=6, help=help_wrap('Maximum depths of each tree.'))
-   flags.DEFINE_float(
-       'learning_rate', default=0.1,
-       help=help_wrap('Maximum depths of each tree.'))
-
-   flags_core.set_defaults(data_dir='/tmp/higgs_data',
-                           model_dir='/tmp/higgs_model')


+ def read_higgs_data(data_dir, train_start, train_count, eval_start, eval_count):
+   """Reads higgs data from csv and returns train and eval data.
+
+   Args:
+     data_dir: A string, the directory of the higgs dataset.
+     train_start: An integer, the start index of train examples within the data.
+     train_count: An integer, the number of train examples within the data.
+     eval_start: An integer, the start index of eval examples within the data.
+     eval_count: An integer, the number of eval examples within the data.
- def read_higgs_data(data_dir, train_start, train_count, eval_start, eval_count):
-   """Reads higgs data from csv and returns train and eval data."""
+
+   Returns:
+     Numpy arrays of train data and eval data.
+   """
  npz_filename = os.path.join(data_dir, NPZ_FILE)
  try:
    # gfile allows numpy to read data from network data sources as well.
-     with tf.gfile.Open(npz_filename, 'rb') as npz_file:
+     with tf.gfile.Open(npz_filename, "rb") as npz_file:
      with np.load(npz_file) as npz:
-         data = npz['data']
+         data = npz["data"]
  except Exception as e:
    raise RuntimeError(
-         'Error loading data; use data_download.py to prepare the data:\n{}: {}'
+         "Error loading data; use data_download.py to prepare the data:\n{}: {}"
        .format(type(e).__name__, e))
  return (data[train_start:train_start+train_count],
          data[eval_start:eval_start+eval_count])
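
For context, read_higgs_data() expects the compressed NumPy file that data_download.py produces. A minimal sketch of that preparation step (hypothetical code, not part of this change; it assumes the raw HIGGS.csv.gz holds the label in column 0 followed by 28 float features):

import gzip

import numpy as np

def csv_to_npz(csv_path="HIGGS.csv.gz", npz_path="HIGGS.csv.gz.npz"):
  # Parse the gzipped CSV into a single float32 array and store it under
  # the "data" key, which is exactly what read_higgs_data() looks up.
  with gzip.open(csv_path, "rt") as f:
    data = np.loadtxt(f, delimiter=",", dtype=np.float32)
  np.savez_compressed(npz_path, data=data)
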
@@ -105,18 +85,18 @@ def make_inputs_from_np_arrays(features_np, label_np):
  as a single tensor. Don't use batch.

  Args:
-     features_np: a numpy ndarray (shape=[batch_size, num_features]) for
+     features_np: A numpy ndarray (shape=[batch_size, num_features]) for
      float32 features.
-     label_np: a numpy ndarray (shape=[batch_size, 1]) for labels.
+     label_np: A numpy ndarray (shape=[batch_size, 1]) for labels.

  Returns:
-     input_fn: a function returning a Dataset of feature dict and label.
-     feature_column: a list of tf.feature_column.BucketizedColumn.
+     input_fn: A function returning a Dataset of feature dict and label.
+     feature_column: A list of tf.feature_column.BucketizedColumn.
  """
  num_features = features_np.shape[1]
  features_np_list = np.split(features_np, num_features, axis=1)
  # 1-based feature names.
-   feature_names = ['feature_%02d' % (i + 1) for i in range(num_features)]
+   feature_names = ["feature_%02d" % (i + 1) for i in range(num_features)]

  # Create source feature_columns and bucketized_columns.
  def get_bucket_boundaries(feature):
@@ -155,16 +135,16 @@ def make_eval_inputs_from_np_arrays(features_np, label_np):
  num_features = features_np.shape[1]
  features_np_list = np.split(features_np, num_features, axis=1)
  # 1-based feature names.
-   feature_names = ['feature_%02d' % (i + 1) for i in range(num_features)]
+   feature_names = ["feature_%02d" % (i + 1) for i in range(num_features)]

  def input_fn():
    features = {
        feature_name: tf.constant(features_np_list[i])
        for i, feature_name in enumerate(feature_names)
    }
-     return tf.data.Dataset.zip(
-         (tf.data.Dataset.from_tensor_slices(features),
-          tf.data.Dataset.from_tensor_slices(label_np),)).batch(1000)
+     return tf.data.Dataset.zip((
+         tf.data.Dataset.from_tensor_slices(features),
+         tf.data.Dataset.from_tensor_slices(label_np),)).batch(1000)

  return input_fn
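
The reflowed Dataset.zip(...) call above changes formatting only, not behavior: zipping a dict-of-features dataset with a label dataset yields (feature_dict, label) pairs. A self-contained sketch of the same pattern (hypothetical shapes; TF 1.x API as used in this file):

import numpy as np
import tensorflow as tf

features = {"feature_01": tf.constant(np.zeros((5000, 1), dtype=np.float32))}
labels = np.zeros((5000, 1), dtype=np.float32)

# Each element of the zipped dataset is a (feature_dict, label) tuple;
# .batch(1000) then groups 1000 such pairs per evaluation step.
dataset = tf.data.Dataset.zip((
    tf.data.Dataset.from_tensor_slices(features),
    tf.data.Dataset.from_tensor_slices(labels))).batch(1000)
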
@@ -175,22 +155,37 @@ def train_boosted_trees(flags_obj):
  Args:
    flags_obj: An object containing parsed flag values.
  """
-
  # Clean up the model directory if present.
  if tf.gfile.Exists(flags_obj.model_dir):
    tf.gfile.DeleteRecursively(flags_obj.model_dir)
-   print('## data loading..')
+   tf.logging.info("## Data loading...")
  train_data, eval_data = read_higgs_data(
      flags_obj.data_dir, flags_obj.train_start, flags_obj.train_count,
      flags_obj.eval_start, flags_obj.eval_count)
-   print('## data loaded; train: {}{}, eval: {}{}'.format(
+   tf.logging.info("## Data loaded; train: {}{}, eval: {}{}".format(
      train_data.dtype, train_data.shape, eval_data.dtype, eval_data.shape))
-   # data consists of one label column and 28 feature columns following.
+
+   # Data consists of one label column followed by 28 feature columns.
  train_input_fn, feature_columns = make_inputs_from_np_arrays(
      features_np=train_data[:, 1:], label_np=train_data[:, 0:1])
  eval_input_fn = make_eval_inputs_from_np_arrays(
      features_np=eval_data[:, 1:], label_np=eval_data[:, 0:1])
-   print('## features prepared. training starts..')
+   tf.logging.info("## Features prepared. Training starts...")
+
+   # Create a benchmark logger to log info about the training and metric values.
+   run_params = {
+       "train_start": flags_obj.train_start,
+       "train_count": flags_obj.train_count,
+       "eval_start": flags_obj.eval_start,
+       "eval_count": flags_obj.eval_count,
+       "n_trees": flags_obj.n_trees,
+       "max_depth": flags_obj.max_depth,
+   }
+   benchmark_logger = logger.config_benchmark_logger(flags_obj)
+   benchmark_logger.log_run_info(
+       model_name="boosted_trees",
+       dataset_name="higgs",
+       run_params=run_params)

  # Though BoostedTreesClassifier is under tf.estimator, faster in-memory
  # training is yet provided as a contrib library.
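
For reference, the contrib in-memory trainer this comment refers to is tf.contrib.estimator.boosted_trees_classifier_train_in_memory, whose call continues in the next hunk. A rough sketch of a standalone invocation (the exact keyword set here is an assumption; the values shown are this file's flag defaults):

import tensorflow as tf

# In-memory training feeds the whole dataset at once, avoiding the
# per-step input pipeline that BoostedTreesClassifier.train() would use.
classifier = tf.contrib.estimator.boosted_trees_classifier_train_in_memory(
    train_input_fn,
    feature_columns,
    model_dir="/tmp/higgs_model",
    n_trees=100,
    max_depth=6,
    learning_rate=0.1)
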
@@ -203,7 +198,9 @@ def train_boosted_trees(flags_obj):
      learning_rate=flags_obj.learning_rate)

  # Evaluation.
-   eval_result = classifier.evaluate(eval_input_fn)
+   eval_results = classifier.evaluate(eval_input_fn)
+   # Benchmark the evaluation results
+   benchmark_logger.log_evaluation_result(eval_results)

  # Exporting the savedmodel.
  if flags_obj.export_dir is not None:
@@ -216,7 +213,37 @@ def main(_):
  train_boosted_trees(flags.FLAGS)


- if __name__ == '__main__':
+ def define_train_higgs_flags():
+   """Add tree related flags as well as training/eval configuration."""
+   flags_core.define_base(stop_threshold=False, batch_size=False, num_gpu=False)
+   flags.adopt_module_key_flags(flags_core)
+
+   flags.DEFINE_integer(
+       name="train_start", default=0,
+       help=help_wrap("Start index of train examples within the data."))
+   flags.DEFINE_integer(
+       name="train_count", default=1000000,
+       help=help_wrap("Number of train examples within the data."))
+   flags.DEFINE_integer(
+       name="eval_start", default=10000000,
+       help=help_wrap("Start index of eval examples within the data."))
+   flags.DEFINE_integer(
+       name="eval_count", default=1000000,
+       help=help_wrap("Number of eval examples within the data."))
+
+   flags.DEFINE_integer(
+       "n_trees", default=100, help=help_wrap("Number of trees to build."))
+   flags.DEFINE_integer(
+       "max_depth", default=6, help=help_wrap("Maximum depth of each tree."))
+   flags.DEFINE_float(
+       "learning_rate", default=0.1,
+       help=help_wrap("The learning rate."))
+
+   flags_core.set_defaults(data_dir="/tmp/higgs_data",
+                           model_dir="/tmp/higgs_model")
+
+
+ if __name__ == "__main__":
  # Training progress and eval results are shown as logging.INFO; so enables it.
  tf.logging.set_verbosity(tf.logging.INFO)
  define_train_higgs_flags()
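
With the flags and defaults defined above, a typical run (assuming data_download.py has already populated /tmp/higgs_data) would look like python train_higgs.py --n_trees=100 --max_depth=6 --learning_rate=0.1.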