
Commit bf79697

working, can now launch from cli
1 parent fd3ddcd commit bf79697

File tree

2 files changed: 41 additions & 23 deletions

dist_run.py

Lines changed: 10 additions & 6 deletions
@@ -20,14 +20,14 @@
 from torch.distributed.pipelining import PipelineStage, ScheduleGPipe
 from torchchat.cli.builder import _initialize_tokenizer, TokenizerArgs
 
-from torchchat.distributed.logging_utils import SingletonLogger
-
 # TODO - these are not distributed specific, consider moving to new package
 from torchchat.distributed.checkpoint_utils import (
     get_hf_config_file,
     load_weights_from_hf_format,
     load_weights_from_torchchat_format,
 )
+
+from torchchat.distributed.logging_utils import SingletonLogger
 from torchchat.distributed.utils import (
     bytes_to_readable,
     Color as color,

@@ -153,7 +153,9 @@ def _load_model_weights(
         # This format stands for:
         # single binary file, OR
         # multiple binary files without index files.
-        load_weights_from_torchchat_format(stage_module, distribution, device, model_config)
+        load_weights_from_torchchat_format(
+            stage_module, distribution, device, model_config
+        )
     else:
         raise ValueError(f"Unknown checkpoint format: {chpt_from}")

@@ -304,7 +306,7 @@ def _cleanup():
 
 
 def main(args):
-    model_name = args.model_name
+    model_name = "llama3" # args.model_name
     pp_degree = args.pp
 
     rank, world_size = _init_distributed()

@@ -590,12 +592,14 @@ def get_example_ins_outs(seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]:
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument(
+    """parser.add_argument(
        "model_name",
        type=str,
+        default="llama3",
        help="Name of the model to load",
-        choices=NAME_TO_DISTRIBUTION_AND_DTYPE.keys(),
+        # choices=NAME_TO_DISTRIBUTION_AND_DTYPE.keys(),
    )
+    """
    parser.add_argument("--pp", type=int, default=1, help="Pipeline parallel degree")
    parser.add_argument(
        "--ntokens",

torchchat/cli/builder.py

Lines changed: 31 additions & 17 deletions
@@ -510,23 +510,29 @@ def _load_model(builder_args: BuilderArgs) -> Model:
     return model.eval()
 
 
-@record
-def run_main(local_rank):
-    # Add the directory containing the train file to sys.path
-    train_file_path = Path(__file__).parent.parent.parent / "dist_run.py"
-    print(f"******* {train_file_path=}")
-    sys.path.insert(0, os.path.dirname(os.path.abspath(train_file_path)))
+import importlib.util
+import subprocess
+
 
-    # Set environment variables for distributed training
-    os.environ["LOCAL_RANK"] = str(local_rank)
-    os.environ["RANK"] = str(
-        local_rank # + kwargs.get("node_rank", 0) * num_processes_per_node
+def run_script(script_path, *args):
+    # Construct the command to run the script
+    cmd = [sys.executable, script_path] + list(args)
+
+    # Run the script as a subprocess
+    process = subprocess.Popen(
+        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True
     )
-    os.environ["WORLD_SIZE"] = str(4 * 1) # num_nodes)
 
-    # Execute the train file
-    with open(train_file_path, "rb") as file:
-        exec(compile(file.read(), train_file_path, "exec"))
+    # Stream the output in real-time
+    for line in process.stdout:
+        print(line, end="")
+    for line in process.stderr:
+        print(line, end="", file=sys.stderr)
+
+    # Wait for the process to complete and get the return code
+    return_code = process.wait()
+    if return_code != 0:
+        raise subprocess.CalledProcessError(return_code, cmd)
 
 
 def _launch_distributed_inference(builder_args: BuilderArgs) -> None:
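
One caveat in the new `run_script`: it drains `process.stdout` to EOF before reading `process.stderr`, so a child that fills the stderr pipe buffer while the parent is still blocked on stdout will stall both processes. A minimal variant that avoids this (same signature; it merges stderr into stdout rather than reading the pipes in sequence):

    import subprocess
    import sys

    def run_script(script_path, *args):
        cmd = [sys.executable, script_path] + list(args)
        # stderr=STDOUT folds both streams into one pipe, so neither
        # can fill up while the other is being drained.
        with subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
        ) as process:
            for line in process.stdout:
                print(line, end="")
        # The context manager has already waited, so returncode is set.
        if process.returncode != 0:
            raise subprocess.CalledProcessError(process.returncode, cmd)

The trade-off is losing the stdout/stderr distinction in the parent; keeping the streams separate safely would need threads or `selectors`.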
@@ -546,12 +552,20 @@ def _launch_distributed_inference(builder_args: BuilderArgs) -> None:
         monitor_interval=1,
     )
 
-    train_file_path = Path(__file__).parent / "distributed" / "dist_run.py"
+    train_file_path = Path(__file__).parent.parent.parent / "dist_run.py"
+    print(f"train_file_path: {train_file_path}")
+    # import argparse
+
+    # parser2 = argparse.ArgumentParser()
+
+    # args = parser2.parse_args()
+    args = []
+    print(f"args: {args}")
 
     elastic_launch(
         config=lc,
-        entrypoint=run_main,
-    )(train_file_path)
+        entrypoint=run_script,
+    )(train_file_path, *args)
     print(
         f"Done launching distributed inference on **4 ** {builder_args.num_gpus} GPUs."
     )
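
For context on the `entrypoint` change: `elastic_launch(config=lc, entrypoint=fn)(*args)` spawns `nproc_per_node` local workers and calls `fn(*args)` in each, with `RANK`, `LOCAL_RANK`, and `WORLD_SIZE` already exported into each worker's environment; that is why the hand-set `os.environ[...]` lines in the deleted `run_main` are no longer needed. A minimal self-contained sketch (the config values here are illustrative, not the ones `_launch_distributed_inference` actually builds):

    import os

    from torch.distributed.launcher.api import LaunchConfig, elastic_launch

    def worker(tag: str) -> None:
        # torchelastic exports these for every spawned process.
        print(tag, os.environ["RANK"], os.environ["LOCAL_RANK"], os.environ["WORLD_SIZE"])

    if __name__ == "__main__":
        lc = LaunchConfig(
            min_nodes=1,
            max_nodes=1,
            nproc_per_node=4,  # illustrative; builder.py sizes this from num_gpus
            rdzv_backend="c10d",
            rdzv_endpoint="localhost:29500",
            monitor_interval=1,
        )
        elastic_launch(config=lc, entrypoint=worker)("hello")

In the commit, each per-rank invocation of `run_script` then starts a fresh `python dist_run.py` subprocess, which inherits those environment variables, so the script can initialize distributed state without any hand-wired ranks.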
