@@ -11,7 +11,25 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""VGG16 benchmark in Fluid"""
+"""VGG16 benchmark in Fluid
+
+# Single trainer, single PS on a single machine.
+VGG_SRC="${CODE_DIR}/vgg16_fluid.py"
+export TRAINING_ROLE=PSERVER
+export TRAINERS=1
+export POD_IP=127.0.0.1
+export PADDLE_INIT_PORT=6174
+MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 &
+sleep 10  # Wait for the PS to start.
+export TRAINING_ROLE=TRAINER
+MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU &
+
+# To run multiple trainers on a single machine, set TRAINERS=2 and
+# launch two trainers:
+# CUDA_VISIBLE_DEVICES=4 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=0 &
+# CUDA_VISIBLE_DEVICES=5 MKL_NUM_THREADS=1 python -u ${VGG_SRC} --local 0 --ps_host=127.0.0.1:6174 --trainer_hosts=127.0.0.1:6174 --device=GPU --task_index=1 &
+"""
+
 from __future__ import print_function
 
 import sys
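A note on the launch snippet added to the docstring above: the same script is started twice, and TRAINING_ROLE selects whether a process becomes the parameter server or a trainer. Below is a minimal sketch of how a script like this can derive its cluster configuration from those environment variables; the helper name and defaults are illustrative, not the benchmark's actual code.

```python
import os

def cluster_config_from_env():
    # Illustrative helper, not part of vgg16_fluid.py. Role is selected
    # per process: "PSERVER" or "TRAINER" (see the exports above).
    role = os.getenv("TRAINING_ROLE", "TRAINER")
    # Total number of trainers participating in the job.
    num_trainers = int(os.getenv("TRAINERS", "1"))
    # The parameter server listens on POD_IP:PADDLE_INIT_PORT, which is
    # what --ps_host and --trainer_hosts point at (127.0.0.1:6174 here).
    endpoint = "%s:%s" % (os.getenv("POD_IP", "127.0.0.1"),
                          os.getenv("PADDLE_INIT_PORT", "6174"))
    return role, num_trainers, endpoint
```

With `--local 0`, the PS process stays up serving parameter updates (hence the trailing `&` and the `sleep 10` before trainers start), and each trainer identifies itself to the job via `--task_index`.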
@@ -200,18 +218,19 @@ def train_loop(exe, trainer_prog):
                 num_samples += len(data)
                 train_pass_acc.add(value=acc, weight=b_size)
                 print(
-                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
-                    % (pass_id, iters, loss, acc,
-                       len(data) / (time.time() - ts))
+                    "Task:%d Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
+                    "Speed = %.2f img/s " % (args.task_index, pass_id, iters,
+                                             loss, acc,
+                                             len(data) / (time.time() - ts))
                 )  # The accuracy is the accumulation of batches, but not the current batch.
 
             pass_elapsed = time.time() - start_time
             pass_train_acc = train_pass_acc.eval()
             pass_test_acc = test(exe)
-            print(
-                "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
-                % (pass_id, num_samples / pass_elapsed, pass_train_acc,
-                   pass_test_acc))
+            print("Task:%d Pass = %d, Training performance = %f imgs/s, "
+                  "Train accuracy = %f, Test accuracy = %f\n" %
+                  (args.task_index, pass_id, num_samples / pass_elapsed,
+                   pass_train_acc, pass_test_acc))
 
     if args.local:
         # Parameter initialization
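The accuracy bookkeeping in the hunk above follows a weighted-accumulator pattern: `train_pass_acc.add(value=acc, weight=b_size)` folds in one batch, and `train_pass_acc.eval()` returns the pass-level mean, which is why the first print reports accumulated rather than per-batch accuracy. A minimal stand-in with the same semantics (the class below is an illustrative sketch, not Fluid's implementation):

```python
class WeightedAverage(object):
    """Accumulates a per-batch metric weighted by batch size."""

    def __init__(self):
        self.numerator = 0.0
        self.denominator = 0.0

    def add(self, value, weight):
        # One batch: `value` is the batch metric (e.g. accuracy),
        # `weight` the number of samples in that batch.
        self.numerator += float(value) * weight
        self.denominator += weight

    def eval(self):
        # Weighted mean over everything added so far (one pass here).
        return self.numerator / self.denominator if self.denominator else 0.0
```

Throughput in the second print is computed the same straightforward way: `num_samples / pass_elapsed`, i.e. all images processed in the pass divided by the pass's wall-clock time.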