Skip to content

Commit 129966e

Browse files
authored
[nightly][ci] Logging to scuba when running in oss (#427)
1 parent dd0efa6 commit 129966e

File tree

7 files changed

+249
-172
lines changed

7 files changed

+249
-172
lines changed

.ci/upload/scribe.py

Lines changed: 18 additions & 114 deletions
Original file line numberDiff line numberDiff line change
@@ -5,131 +5,35 @@
55
import argparse
66
import json
77
import os
8-
import time
8+
import sys
9+
from os.path import abspath, exists
910

10-
from collections import defaultdict
1111

12-
import requests
12+
def setup_tritonbench_cwd():
13+
original_dir = abspath(os.getcwd())
1314

14-
CATEGORY_NAME = "perfpipe_pytorch_user_benchmarks"
15-
BENCHMARK_SCHEMA = {
16-
"int": ["time"],
17-
"normal": [
18-
"benchmark_date",
19-
"unix_user",
20-
"submission_group_id",
21-
"cuda_version",
22-
"device",
23-
"conda_env",
24-
"pytorch_commit",
25-
"triton_commit",
26-
"tritonbench_commit",
27-
"triton_branch",
28-
"pytorch_branch",
29-
"tritonbench_branch",
30-
"triton_commit_time",
31-
"pytorch_commit_time",
32-
"tritonbench_commit_time",
33-
"github_action",
34-
"github_actor",
35-
"github_base_ref",
36-
"github_ref",
37-
"github_ref_protected",
38-
"github_repository",
39-
"github_run_attempt",
40-
"github_run_id",
41-
"github_run_number",
42-
"github_workflow",
43-
"github_workflow_ref",
44-
"github_workflow_sha",
45-
"job_name",
46-
"runner_arch",
47-
"runner_name",
48-
"runner_type",
49-
"runner_os",
50-
"metric_id",
51-
],
52-
"float": ["metric_value"],
53-
}
15+
for tritonbench_dir in (
16+
".",
17+
"../../tritonbench",
18+
):
19+
if exists(tritonbench_dir):
20+
break
5421

22+
if exists(tritonbench_dir):
23+
tritonbench_dir = abspath(tritonbench_dir)
24+
os.chdir(tritonbench_dir)
25+
sys.path.append(tritonbench_dir)
26+
return original_dir
5527

56-
class ScribeUploader:
57-
def __init__(self, category, schema):
58-
self.category = category
59-
self.schema = schema
60-
61-
def _format_message(self, field_dict):
62-
assert "time" in field_dict, "Missing required Scribe field 'time'"
63-
message = defaultdict(dict)
64-
for field, value in field_dict.items():
65-
field = field.lower()
66-
if value is None:
67-
continue
68-
if field in self.schema["normal"]:
69-
message["normal"][field] = str(value)
70-
elif field in self.schema["int"]:
71-
message["int"][field] = int(value)
72-
elif field in self.schema["float"]:
73-
try:
74-
message["float"][field] = float(value)
75-
except ValueError:
76-
# If value error (e.g., "CUDA OOM"), override the field value to 0.0
77-
message["float"][field] = 0.0
78-
else:
79-
raise ValueError(
80-
"Field {} is not currently used, "
81-
"be intentional about adding new fields to schema".format(field)
82-
)
83-
return message
84-
85-
def _upload(self, messages: list):
86-
access_token = os.environ.get("TRITONBENCH_SCRIBE_GRAPHQL_ACCESS_TOKEN")
87-
if not access_token:
88-
raise ValueError("Can't find access token from environment variable")
89-
url = "https://graph.facebook.com/scribe_logs"
90-
r = requests.post(
91-
url,
92-
data={
93-
"access_token": access_token,
94-
"logs": json.dumps(
95-
[
96-
{
97-
"category": self.category,
98-
"message": json.dumps(message),
99-
"line_escape": False,
100-
}
101-
for message in messages
102-
]
103-
),
104-
},
105-
)
106-
print(r.text)
107-
r.raise_for_status()
108-
109-
def post_benchmark_results(self, bm_data):
110-
messages = []
111-
base_message = {
112-
"time": int(time.time()),
113-
}
114-
base_message.update(bm_data["env"])
115-
base_message.update(bm_data["github"])
116-
base_message["submission_group_id"] = f"tritonbench.{bm_data['name']}"
117-
base_message["unix_user"] = "tritonbench_ci"
118-
for metric in bm_data["metrics"]:
119-
msg = base_message.copy()
120-
msg["metric_id"] = metric
121-
msg["metric_value"] = bm_data["metrics"][metric]
122-
formatted_msg = self._format_message(msg)
123-
messages.append(formatted_msg)
124-
self._upload(messages)
12528

29+
setup_tritonbench_cwd()
30+
from tritonbench.utils.scuba_utils import log_benchmark
12631

12732
if __name__ == "__main__":
12833
parser = argparse.ArgumentParser()
12934
parser.add_argument(
13035
"--json", required=True, type=argparse.FileType("r"), help="Userbenchmark json"
13136
)
13237
args = parser.parse_args()
133-
uploader = ScribeUploader(category=CATEGORY_NAME, schema=BENCHMARK_SCHEMA)
13438
benchmark_data = json.load(args.json)
135-
uploader.post_benchmark_results(benchmark_data)
39+
log_benchmark(benchmark_data)

benchmarks/nightly/run.py

Lines changed: 29 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""
22
Tritonbench nightly run, dashboard: https://hud.pytorch.org/tritonbench/commit_view
3+
Run all operators in nightly/autogen.yaml.
34
Requires the operator to support the speedup metric.
45
"""
56

@@ -37,57 +38,6 @@ def setup_tritonbench_cwd():
3738
return original_dir
3839

3940

40-
def reduce(run_timestamp, output_dir, output_files, args):
41-
"""aggregate all op benchmark csvs into json file"""
42-
from tritonbench.utils.gpu_utils import get_nvidia_gpu_states, has_nvidia_smi
43-
from tritonbench.utils.path_utils import REPO_PATH
44-
from tritonbench.utils.run_utils import get_github_env, get_run_env
45-
46-
repo_locs = {
47-
"tritonbench": REPO_PATH,
48-
}
49-
if args.ci and "TRITONBENCH_TRITON_REPO_PATH" in os.environ:
50-
repo_locs["triton"] = os.environ.get("TRITONBENCH_TRITON_REPO_PATH", None)
51-
repo_locs["pytorch"] = os.environ.get("TRITONBENCH_PYTORCH_REPO_PATH", None)
52-
aggregated_obj = {
53-
"name": "nightly",
54-
"env": get_run_env(run_timestamp, repo_locs),
55-
"metrics": {},
56-
}
57-
if has_nvidia_smi():
58-
aggregated_obj.update(
59-
{
60-
"nvidia_gpu_states": get_nvidia_gpu_states(),
61-
}
62-
)
63-
64-
# Collecting GitHub environment variables when running in CI environment
65-
if args.ci:
66-
aggregated_obj["github"] = get_github_env()
67-
68-
for result_json_file in output_files:
69-
logger.info(f"Loading output file: {result_json_file}.")
70-
result_json_filename = Path(result_json_file).stem
71-
if (
72-
not os.path.exists(result_json_file)
73-
or os.path.getsize(result_json_file) == 0
74-
):
75-
aggregated_obj["metrics"][f"tritonbench_{result_json_filename}-pass"] = 0
76-
continue
77-
# TODO: check if all inputs pass
78-
aggregated_obj["metrics"][f"tritonbench_{result_json_filename}-pass"] = 1
79-
with open(
80-
result_json_file,
81-
"r",
82-
) as fp:
83-
result_obj = json.load(fp)
84-
aggregated_obj["metrics"].update(result_obj)
85-
result_json_path = os.path.join(output_dir, "result.json")
86-
with open(result_json_path, "w") as fp:
87-
json.dump(aggregated_obj, fp, indent=4)
88-
return result_json_path
89-
90-
9141
def get_operator_benchmarks() -> Dict[str, Any]:
9242
def _load_benchmarks(config_path: str) -> Dict[str, Any]:
9343
out = {}
@@ -111,12 +61,17 @@ def _load_benchmarks(config_path: str) -> Dict[str, Any]:
11161

11262
def run():
11363
parser = argparse.ArgumentParser()
64+
parser.add_argument("--name", default="nightly", help="Benchmark name.")
11465
parser.add_argument(
11566
"--ci", action="store_true", help="Running in GitHub Actions CI mode."
11667
)
68+
parser.add_argument(
69+
"--log-scuba", action="store_true", help="Upload results to Scuba."
70+
)
11771
args = parser.parse_args()
11872
setup_tritonbench_cwd()
11973
from tritonbench.utils.run_utils import run_in_task, setup_output_dir
74+
from tritonbench.utils.scuba_utils import decorate_benchmark_data, log_benchmark
12075

12176
run_timestamp, output_dir = setup_output_dir("nightly")
12277
# Run each operator
@@ -127,10 +82,32 @@ def run():
12782
output_file = output_dir.joinpath(f"{op_bench}.json")
12883
op_args.extend(["--output-json", str(output_file.absolute())])
12984
run_in_task(op=op_name, op_args=op_args, benchmark_name=op_bench)
85+
# write pass or fail to result json
86+
# todo: check every input shape has passed
87+
output_file_name = Path(output_file).stem
88+
if not os.path.exists(output_file) or os.path.getsize(output_file) == 0:
89+
logger.warning(f"[nightly] Failed to run {output_file_name}.")
90+
with open(output_file, "w") as f:
91+
json.dump({f"tritonbench_{output_file_name}-pass": 0}, f)
92+
else:
93+
with open(output_file, "r") as f:
94+
obj = json.load(f)
95+
obj[f"tritonbench_{output_file_name}-pass"] = 1
96+
with open(output_file, "w") as f:
97+
json.dump(obj, f, indent=4)
13098
output_files.append(output_file)
13199
# Reduce all operator CSV outputs to a single output json
132-
result_json_file = reduce(run_timestamp, output_dir, output_files, args)
100+
benchmark_data = [json.load(open(f, "r")) for f in output_files]
101+
aggregated_obj = decorate_benchmark_data(
102+
args.name, run_timestamp, args.ci, benchmark_data
103+
)
104+
result_json_file = os.path.join(output_dir, "result.json")
105+
with open(result_json_file, "w") as fp:
106+
json.dump(aggregated_obj, fp, indent=4)
133107
logger.info(f"[nightly] logging result json file to {result_json_file}.")
108+
if args.log_scuba:
109+
log_benchmark(aggregated_obj)
110+
logger.info(f"[nightly] logging results to scuba.")
134111

135112

136113
if __name__ == "__main__":

run.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,10 @@
77

88
import argparse
99
import os
10-
import shlex
1110
import sys
12-
from typing import List, Tuple
11+
import time
12+
from datetime import datetime
13+
from typing import List
1314

1415
from tritonbench.operator_loader import get_op_loader_bench_cls_by_name, is_loader_op
1516

@@ -35,6 +36,7 @@
3536

3637

3738
def _run(args: argparse.Namespace, extra_args: List[str]) -> BenchmarkOperatorResult:
39+
run_timestamp = datetime.fromtimestamp(time.time()).strftime("%Y%m%d%H%M%S")
3840
if is_loader_op(args.op):
3941
Opbench = get_op_loader_bench_cls_by_name(args.op)
4042
else:
@@ -72,6 +74,13 @@ def _run(args: argparse.Namespace, extra_args: List[str]) -> BenchmarkOperatorRe
7274
if "triton_type" in args:
7375
kwargs["triton_type"] = args.triton_type
7476
log_benchmark(**kwargs)
77+
# Log benchmark output to scuba even if not in fbcode
78+
if args.log_scuba and not is_fbcode():
79+
from tritonbench.utils.scuba_utils import log_benchmark
80+
81+
log_benchmark(
82+
benchmark_data=None, run_timestamp=run_timestamp, opbench=opbench
83+
)
7584

7685
if args.plot:
7786
try:

tritonbench/utils/git_utils.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import os
22
import subprocess
33
from datetime import datetime
4-
from typing import Optional
54

65

76
def get_branch(repo: str, commit: str) -> str:
@@ -11,6 +10,8 @@ def get_branch(repo: str, commit: str) -> str:
1110
If a commit does not belong to any branch, return "unknown"
1211
If a commit belongs to many branches, return the very first branch.
1312
"""
13+
if repo == "unknown":
14+
return "unknown"
1415
assert os.path.exists(repo), f"{repo} path does not exist."
1516
cmd = ["git", "branch", "-a", "--contains", commit, "--no-color"]
1617
branch_names = subprocess.check_output(cmd, cwd=repo).decode().strip().splitlines()
@@ -27,6 +28,8 @@ def get_commit_time(repo: str, commit: str) -> str:
2728
commit: hash of a commit
2829
If a commit does not exist, return "unknown"
2930
"""
31+
if repo == "unknown":
32+
return "unknown"
3033
assert os.path.exists(repo), f"{repo} path does not exist."
3134
git_date_cmd = ["git", "show", "--no-patch", "--format=%ci", commit]
3235
git_date = subprocess.check_output(git_date_cmd, cwd=repo).decode().strip()
@@ -40,6 +43,8 @@ def get_commit_time(repo: str, commit: str) -> str:
4043
def get_current_hash(repo: str) -> str:
4144
"""Get the HEAD hash of a git repo.
4245
repo: local git repo path"""
46+
if repo == "unknown":
47+
return "unknown"
4348
cmd = ["git", "rev-parse", "--verify", "HEAD"]
4449
output = subprocess.check_output(cmd, cwd=repo).decode().strip()
4550
return output

tritonbench/utils/parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -288,9 +288,9 @@ def get_parser(args=None):
288288
help="Configuration B for A/B testing. Specify operator-specific arguments as a string. "
289289
"Example: '--side-b \"--dynamic\"'",
290290
)
291+
parser.add_argument("--log-scuba", action="store_true", help="Log to scuba.")
291292

292293
if is_fbcode():
293-
parser.add_argument("--log-scuba", action="store_true", help="Log to scuba.")
294294
parser.add_argument(
295295
"--production-shapes",
296296
action="store_true",

tritonbench/utils/run_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,9 @@ def get_run_env(
5757
run_env["pytorch_commit"] = torch.version.git_version
5858
# we assume Tritonbench CI will properly set Triton commit hash in env
5959
run_env["triton_commit"] = os.environ.get(
60-
"TRITONBENCH_TRITON_MAIN_COMMIT", "unknown"
60+
"TRITONBENCH_TRITON_COMMIT_HASH", get_current_hash(repo_locs["triton"])
6161
)
62-
run_env["tritonbench_commit"] = get_current_hash(REPO_PATH)
62+
run_env["tritonbench_commit"] = get_current_hash(repo_locs["tritonbench"])
6363
for repo in ["triton", "pytorch", "tritonbench"]:
6464
repo_loc = repo_locs.get(repo, None)
6565
if not run_env[f"{repo}_commit"] == "unknown" and repo_loc:

0 commit comments

Comments (0)