GoogleCloudPlatform · ManfeiBai · Nov 15, 2024 · Nov 15, 2024 · Nov 15, 2024 · Nov 15, 2024
@@ -195,6 +195,34 @@ def model_install_cmds(output_file=None) -> str:
       *install_torch_xla2_dependency,
   )
 
+def run_simple_model_test(
+    model_name: str = "",
+    test_version: VERSION = VERSION.NIGHTLY,
+) -> Tuple[str, ...]:
+  """Return the set-up shell commands for a simple model test.
+
+  The commands pin setuptools, install system and Python packages, install
+  the torch / torchvision / torchaudio and torch_xla[tpu] builds selected by
+  `test_version`, and clone the pytorch/benchmark, pytorch/pytorch and
+  pytorch/xla repositories into the home directory.
+
+  Args:
+    model_name: Name of the model under test. Currently unused by this
+      function; accepted so the call in `get_torchbench_tpu_config`, which
+      passes `(model_name, test_version)`, matches the signature.
+    test_version: Version selector handed to `get_version_mapping` to resolve
+      the package specs, index URL, wheel URL and repo branch arguments.
+
+  Returns:
+    A tuple of shell command strings, in the order they are listed below.
+  """
+  version_mapping = get_version_mapping(test_version)
+  return (
+      "pip3 install -U 'setuptools>=70.0.0,<71.0.0'",
+      "sudo apt-get -y update",
+      "sudo apt install -y libopenblas-base",
+      "sudo apt install -y libsndfile-dev",
+      "sudo apt-get install libgl1 -y",
+      "pip3 install --user numpy pandas",
+      (
+          f"pip3 install --user --pre {version_mapping.TORCH.value} {version_mapping.TORCHVISION.value} {version_mapping.TORCHAUDIO.value} --index-url {version_mapping.TORCH_INDEX_CPU_URL.value}"
+      ),
+      (
+          f"pip3 install --user 'torch_xla[tpu] @{version_mapping.TORCH_XLA_TPU_WHEEL.value}' -f https://storage.googleapis.com/libtpu-releases/index.html"
+      ),
+      "pip3 install --user psutil",
+      # TODO: change to hf code path.
+      "cd; git clone https://github.com/pytorch/benchmark.git",
+      # f"cd benchmark && {model_install_cmds()}",
+      f"cd; git clone {version_mapping.TORCH_REPO_BRANCH.value} https://github.com/pytorch/pytorch.git",
+      f"cd; git clone {version_mapping.TORCH_XLA_REPO_BRANCH.value} https://github.com/pytorch/xla.git",
+      # *install_torch_xla2_dependency,
+  )
 
 def get_torchbench_tpu_config(
     tpu_version: resource.TpuVersion,
@@ -211,13 +239,18 @@ def get_torchbench_tpu_config(
     test_version: VERSION = VERSION.NIGHTLY,
     model_name: str = "",
     extraFlags: str = "",
+    simple_model_test: bool = False,
 ):
   job_gcp_config = gcp_config.GCPConfig(
       project_name=project.value,
       zone=tpu_zone.value,
       dataset_name=metric_config.DatasetOption.BENCHMARK_DATASET,
   )
 
+  if simple_model_test:
+    run_simple_model_test(model_name, test_version)
+  # else: ...
+
   set_up_cmds = set_up_torchbench_tpu(
       model_name, test_version, use_xla2=use_xla2
   )

@@ -24,16 +24,63 @@
 SCHEDULED_TIME = "0 11 * * *" if composer_env.is_prod_env() else None
 
 
+# @task_group(prefix_group_id=False)
+# def llama():
+#   llama_3_train_trillium = task.run_queued_resource_test(
+#       test_config.JSonnetTpuVmTest.from_pytorch(
+#           "pt-nightly-llama3-train-func-v6e-4-1vm",
+#           network=V5_NETWORKS,
+#           subnetwork=V6E_SUBNETWORKS,
+#       ),
+#       US_CENTRAL2_B_TPU_PROD_ENV,
+#   )
+
 with models.DAG(
     dag_id="pytorchxla-torchbench",
     schedule=SCHEDULED_TIME,
     tags=["pytorchxla", "nightly", "torchbench"],
     start_date=datetime.datetime(2024, 1, 1),
     catchup=False,
 ) as dag:
+  # llama()
+
   model = "all" if composer_env.is_prod_env() else "BERT_pytorch"
   torchbench_extra_flags = [f"--filter={model}"]
 
+  # LLaMA3 on V6E:
+  config.get_torchbench_tpu_config(
+      tpu_version=resource.TpuVersion.TRILLIUM,
+      tpu_cores=8,
+      project=resource.Project.CLOUD_ML_BENCHMARKING,
+      tpu_zone=resource.Zone.US_CENTRAL2_B,
+      runtime_version=resource.RuntimeVersion.V2_ALPHA_TPUV6,
+      network=resource.BM_NETWORKS,
+      subnetwork=resource.V4_BM_SUBNETWORKS,
+      time_out_in_min=1600,
+      model_name="llama3",
+      reserved=False,
+      preemptible=False,
+      extraFlags=" ".join(torchbench_extra_flags),
+      simple_model_test=True,
+  )
+
+  # SD2 on V6E:
+  config.get_torchbench_tpu_config(
+      tpu_version=resource.TpuVersion.TRILLIUM,
+      tpu_cores=8,
+      project=resource.Project.CLOUD_ML_BENCHMARKING,
+      tpu_zone=resource.Zone.US_CENTRAL2_B,
+      runtime_version=resource.RuntimeVersion.V2_ALPHA_TPUV6,
+      network=resource.BM_NETWORKS,
+      subnetwork=resource.V4_BM_SUBNETWORKS,
+      time_out_in_min=1600,
+      model_name="sd2",
+      reserved=False,
+      preemptible=False,
+      extraFlags=" ".join(torchbench_extra_flags),
+      simple_model_test=True,
+  )
+
   # Running on V4-8:
   config.get_torchbench_tpu_config(
       tpu_version=resource.TpuVersion.V4,