
Commit e4446c1

Add num_workers flag to es_trainer (#473)
This patch adds a num_workers flag to es_trainer, matching the behavior of the train_locally script, which matters for some internal distributed-training scripts. Deriving the worker count from the number of perturbations ignores the underlying hardware entirely, and the hardware is what should actually determine that count. The existing logic also failed to account for antithetic sampling doubling the number of models evaluated per iteration.
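
For intuition, here is a minimal sketch of the sizing mismatch described above, using hypothetical numbers (none of the values below come from the patch):

# With antithetic sampling, each perturbation epsilon is evaluated in both
# directions, theta + sigma*epsilon and theta - sigma*epsilon, so the
# per-iteration workload is twice the perturbation count.
total_num_perturbations = 100                        # illustrative value
old_worker_count = total_num_perturbations           # old heuristic: 100
models_per_iteration = 2 * total_num_perturbations   # actual load: 200

# The old count tracks only half the real workload, and neither number
# says anything about how many workers the available hardware can sustain.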
1 parent f86feef commit e4446c1


compiler_opt/es/es_trainer_lib.py

Lines changed: 3 additions & 1 deletion
@@ -51,6 +51,8 @@
                                     to the data collection requests.")
 _TRAIN_CORPORA = flags.DEFINE_string("train_corpora", "",
                                      "List of paths to training corpora")
+_NUM_WORKERS = flags.DEFINE_integer("num_workers", 100,
+                                    "The number of workers to create.")


 @gin.constants_from_enum(module="es_trainer_lib")
@@ -216,7 +218,7 @@ def train(additional_compilation_flags=(),

   with worker_manager_class(
       worker_class,
-      count=learner_config.total_num_perturbations,
+      count=_NUM_WORKERS.value,
       worker_kwargs=dict(gin_config=gin.operative_config_str())) as pool:
     learner.set_baseline(pool)
     for _ in range(learner_config.total_steps):
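
As a quick sanity check, here is a self-contained sketch (assuming only absl, which the file already uses; the argv list is illustrative) showing how the new flag parses and overrides its default of 100:

from absl import flags

FLAGS = flags.FLAGS

# Same definition the patch adds to es_trainer_lib.py.
_NUM_WORKERS = flags.DEFINE_integer("num_workers", 100,
                                    "The number of workers to create.")

# Parse an argv-style list; the first element is the program name.
FLAGS(["es_trainer", "--num_workers=32"])
assert _NUM_WORKERS.value == 32  # worker count is now set by the user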
