Migrate LLM dashboards to v3 (#6057)

huydhn · web-flow · commit 28ccdef959c8 · 2024-12-13T15:28:17.000-08:00
This migrates the queries from `oss_ci_benchmark_v2` to `oss_ci_benchmark_v3`. This will unblock the following updates: * Add delegation backend support for the [ExecuTorch dashboard](https://hud.pytorch.org/benchmark/llms?repoName=pytorch%2Fexecutorch) instead of bundling it together with the model name, e.g. `edsr coreml_all` (cc @guangy10) * Add the LLM AO dashboard, whose data is only available in `oss_ci_benchmark_v3` (cc @jerryzh168) Some minor fixes that go with this: * Change the parameters to `benchmarks` and `models`. These are clearer than `filenames` and `names` * Remove the `getJobId` param. It originated in the TorchInductor query, where the job ID is a string. In `oss_ci_benchmark_v3`, the job ID is a UInt64, so there is no saving there. ### Testing * https://torchci-git-fork-huydhn-migrate-llm-dashboard-v3-fbopensource.vercel.app/benchmark/llms?repoName=pytorch%2Fpytorch * https://torchci-git-fork-huydhn-migrate-llm-dashboard-v3-fbopensource.vercel.app/benchmark/llms?repoName=pytorch%2Fexecutorch
diff --git a/torchci/clickhouse_queries/oss_ci_benchmark_branches/params.json b/torchci/clickhouse_queries/oss_ci_benchmark_branches/params.json
@@ -2,8 +2,8 @@
   "deviceArch": "String",
   "dtypes": "Array(String)",
   "excludedMetrics": "Array(String)",
-  "filenames": "Array(String)",
-  "names": "Array(String)",
+  "benchmarks": "Array(String)",
+  "models": "Array(String)",
   "repo": "String",
   "startTime": "DateTime64(3)",
   "stopTime": "DateTime64(3)"
diff --git a/torchci/clickhouse_queries/oss_ci_benchmark_branches/query.sql b/torchci/clickhouse_queries/oss_ci_benchmark_branches/query.sql
@@ -1,47 +1,65 @@
 -- This query is used to get the list of branches and commits used by different
 -- OSS CI benchmark experiments. This powers HUD benchmarks dashboards
+WITH benchmarks AS ( -- rows from oss_ci_benchmark_v3 after the parameter filters, with device/arch resolved
+    SELECT
+        o.head_branch AS head_branch,
+        o.head_sha AS head_sha,
+        o.workflow_id AS id,
+        IF( -- device: extra_info['device'] when runners is empty, otherwise the first runner's name
+            empty(o.runners),
+            tupleElement(o.benchmark, 'extra_info') [ 'device' ],
+            tupleElement(o.runners [ 1 ], 'name')
+        ) AS device,
+        IF( -- arch: extra_info['arch'] when runners is empty, otherwise the first runner's type
+            empty(o.runners),
+            tupleElement(o.benchmark, 'extra_info') [ 'arch' ],
+            tupleElement(o.runners [ 1 ], 'type')
+        ) AS arch,
+        toStartOfDay(fromUnixTimestamp(o.timestamp)) AS event_time -- record time truncated to the start of its day
+    FROM
+        benchmark.oss_ci_benchmark_v3 o
+    WHERE
+        o.timestamp >= toUnixTimestamp({startTime: DateTime64(3) }) -- compared as Unix seconds; window is half-open [startTime, stopTime)
+        AND o.timestamp < toUnixTimestamp({stopTime: DateTime64(3) })
+        AND o.repo = {repo: String }
+        AND ( -- for each array parameter below, an empty array disables that filter
+            has({benchmarks: Array(String) }, o.benchmark.name)
+            OR empty({benchmarks: Array(String) })
+        )
+        AND (
+            has({models: Array(String) }, o.model.name)
+            OR empty({models: Array(String) })
+        )
+        AND (
+            has({dtypes: Array(String) }, o.benchmark.dtype)
+            OR empty({dtypes: Array(String) })
+        )
+        AND ( -- drop rows whose metric name is listed in excludedMetrics
+            NOT has({excludedMetrics: Array(String) }, o.metric.name)
+            OR empty({excludedMetrics: Array(String) })
+        )
+        AND notEmpty(o.metric.name)
+        AND notEmpty(o.benchmark.dtype)
+)
 SELECT
-    DISTINCT w.head_branch AS head_branch,
-    w.head_sha,
-    w.id,
-    toStartOfDay(fromUnixTimestamp64Milli(o.timestamp)) AS event_time,
-    o.filename
+    DISTINCT replaceOne(head_branch, 'refs/heads/', '') AS head_branch, -- strip the first 'refs/heads/' occurrence from the branch name
+    head_sha,
+    id,
+    event_time
 FROM
-    benchmark.oss_ci_benchmark_v2 o
-    LEFT JOIN default .workflow_run w FINAL ON o.workflow_id = w.id
+    benchmarks
 WHERE
-    o.timestamp >= toUnixTimestamp64Milli({startTime: DateTime64(3) })
-    AND o.timestamp < toUnixTimestamp64Milli({stopTime: DateTime64(3) })
-    AND (
-        has({filenames: Array(String) }, o.filename)
-        OR empty({filenames: Array(String) })
-    )
-    AND (
-        has({names: Array(String) }, o.name)
-        OR empty({names: Array(String) })
-    )
     -- NB: DEVICE (ARCH) is the display format used by HUD when grouping together these two fields
-    AND (
+    (
         CONCAT(
-            o.device,
+            device,
             ' (',
-            IF(empty(o.arch), 'NVIDIA A100-SXM4-40GB', o.arch),
+            IF(empty(arch), 'NVIDIA A100-SXM4-40GB', arch), -- an empty arch is matched as 'NVIDIA A100-SXM4-40GB'
             ')'
         ) = {deviceArch: String }
         OR {deviceArch: String } = ''
     )
-    AND (
-        has({dtypes: Array(String) }, o.dtype)
-        OR empty({dtypes: Array(String) })
-    )
-    AND (
-        NOT has({excludedMetrics: Array(String) }, o.metric)
-        OR empty({excludedMetrics: Array(String) })
-    )
-    AND notEmpty(o.metric)
-    AND w.html_url LIKE CONCAT('%', {repo: String }, '%')
-    AND notEmpty(o.dtype)
-    AND notEmpty(o.device)
+    AND notEmpty(device) -- skip rows for which no device value was resolved
 ORDER BY
-    w.head_branch,
+    head_branch,
     event_time DESC
diff --git a/torchci/clickhouse_queries/oss_ci_benchmark_llms/params.json b/torchci/clickhouse_queries/oss_ci_benchmark_llms/params.json
@@ -4,10 +4,9 @@
   "deviceArch": "String",
   "dtypes": "Array(String)",
   "excludedMetrics": "Array(String)",
-  "filenames": "Array(String)",
-  "getJobId": "Bool",
+  "benchmarks": "Array(String)",
   "granularity": "String",
-  "names": "Array(String)",
+  "models": "Array(String)",
   "repo": "String",
   "startTime": "DateTime64(3)",
   "stopTime": "DateTime64(3)"
diff --git a/torchci/clickhouse_queries/oss_ci_benchmark_llms/query.sql b/torchci/clickhouse_queries/oss_ci_benchmark_llms/query.sql
@@ -1,66 +1,93 @@
 --- This query is used to get the LLMs benchmark results from different experiments. It
 --- queries the TPS and memory bandwidth for each model / quantization combos. This powers
 --- the LLMs benchmark dashboard
+WITH benchmarks AS ( -- rows from oss_ci_benchmark_v3 after the parameter filters, with device/arch resolved
+    SELECT
+        replaceOne(o.head_branch, 'refs/heads/', '') AS head_branch, -- strip the first 'refs/heads/' occurrence from the branch name
+        o.workflow_id AS workflow_id,
+        o.job_id AS job_id,
+        o.model.name AS model,
+        o.model.backend AS backend,
+        o.metric.name AS metric,
+        floor(arrayAvg(o.metric.benchmark_values), 2) AS actual, -- mean of benchmark_values, rounded down to 2 decimal places
+        floor(toFloat64(o.metric.target_value), 2) AS target, -- target_value rounded down to 2 decimal places
+        o.benchmark.dtype AS dtype,
+        IF( -- device: extra_info['device'] when runners is empty, otherwise the first runner's name
+            empty(o.runners),
+            tupleElement(o.benchmark, 'extra_info') [ 'device' ],
+            tupleElement(o.runners [ 1 ], 'name')
+        ) AS device,
+        IF( -- arch: extra_info['arch'] when runners is empty, otherwise the first runner's type
+            empty(o.runners),
+            tupleElement(o.benchmark, 'extra_info') [ 'arch' ],
+            tupleElement(o.runners [ 1 ], 'type')
+        ) AS arch,
+        DATE_TRUNC( -- bucket the record time by the caller-supplied granularity unit
+            {granularity: String },
+            fromUnixTimestamp(o.timestamp)
+        ) AS granularity_bucket
+    FROM
+        benchmark.oss_ci_benchmark_v3 o
+    WHERE
+        o.timestamp >= toUnixTimestamp({startTime: DateTime64(3) }) -- compared as Unix seconds; window is half-open [startTime, stopTime)
+        AND o.timestamp < toUnixTimestamp({stopTime: DateTime64(3) })
+        AND o.repo = {repo: String }
+        AND ( -- for each array parameter below, an empty array disables that filter
+            has({commits: Array(String) }, o.head_sha)
+            OR empty({commits: Array(String) })
+        )
+        AND (
+            has({benchmarks: Array(String) }, o.benchmark.name)
+            OR empty({benchmarks: Array(String) })
+        )
+        AND (
+            has({models: Array(String) }, o.model.name)
+            OR empty({models: Array(String) })
+        )
+        AND (
+            has({dtypes: Array(String) }, o.benchmark.dtype)
+            OR empty({dtypes: Array(String) })
+        )
+        AND ( -- drop rows whose metric name is listed in excludedMetrics
+            NOT has({excludedMetrics: Array(String) }, o.metric.name)
+            OR empty({excludedMetrics: Array(String) })
+        )
+        AND notEmpty(o.metric.name)
+        AND notEmpty(o.benchmark.dtype)
+)
 SELECT
-    DISTINCT o.workflow_id AS workflow_id,
-    -- As the JSON response is pretty big, only return the field if it's needed
-    IF({getJobId: Bool}, o.job_id, '') AS job_id,
-    o.name,
-    o.metric,
-    floor(toFloat64(o.actual), 2) AS actual,
-    floor(toFloat64(o.target), 2) AS target,
-    DATE_TRUNC(
-        {granularity: String },
-        fromUnixTimestamp64Milli(o.timestamp)
-    ) AS granularity_bucket,
-    o.dtype,
-    o.device,
-    -- NB: Default to NVIDIA A100-SXM4-40GB for old records without arch column
-    IF(empty(o.arch), 'NVIDIA A100-SXM4-40GB', o.arch) as arch
+    DISTINCT workflow_id,
+    job_id,
+    CONCAT(model, ' ', backend) AS name, -- NOTE(review): ends with a trailing space whenever backend is empty; confirm consumers expect that
+    metric,
+    actual,
+    target,
+    dtype,
+    device,
+    arch,
+    granularity_bucket
 FROM
-    benchmark.oss_ci_benchmark_v2 o
-    LEFT JOIN default .workflow_run w FINAL ON o.workflow_id = w.id
+    benchmarks
 WHERE
-    o.timestamp >= toUnixTimestamp64Milli({startTime: DateTime64(3) })
-    AND o.timestamp < toUnixTimestamp64Milli({stopTime: DateTime64(3) })
-    AND (
-        has({branches: Array(String) }, w.head_branch)
-        OR empty({branches: Array(String) })
-    )
-    AND (
-        has({commits: Array(String) }, w.head_sha)
-        OR empty({commits: Array(String) })
-    )
-    AND (
-        has({filenames: Array(String) }, o.filename)
-        OR empty({filenames: Array(String) })
+    (
+        has({models: Array(String) }, CONCAT(model, ' ', backend)) -- NOTE(review): the CTE already matches models against o.model.name alone, so a non-empty models list must satisfy both checks; confirm this is intended
+        OR empty({models: Array(String) })
     )
     AND (
-        has({names: Array(String) }, o.name)
-        OR empty({names: Array(String) })
+        has({branches: Array(String) }, head_branch) -- compared against the branch name with 'refs/heads/' already stripped in the CTE
+        OR empty({branches: Array(String) })
     )
     -- NB: DEVICE (ARCH) is the display format used by HUD when grouping together these two fields
     AND (
         CONCAT(
-            o.device,
+            device,
             ' (',
-            IF(empty(o.arch), 'NVIDIA A100-SXM4-40GB', o.arch),
+            IF(empty(arch), 'NVIDIA A100-SXM4-40GB', arch), -- an empty arch is matched as 'NVIDIA A100-SXM4-40GB'
             ')'
         ) = {deviceArch: String }
         OR {deviceArch: String } = ''
     )
-    AND (
-        has({dtypes: Array(String) }, o.dtype)
-        OR empty({dtypes: Array(String) })
-    )
-    AND (
-        NOT has({excludedMetrics: Array(String) }, o.metric)
-        OR empty({excludedMetrics: Array(String) })
-    )
-    AND notEmpty(o.metric)
-    AND notEmpty(o.dtype)
-    AND notEmpty(o.device)
-    AND w.html_url LIKE CONCAT('%', {repo: String }, '%')
+    AND notEmpty(device) -- skip rows for which no device value was resolved
 ORDER BY
     granularity_bucket DESC,
     workflow_id DESC,
diff --git a/torchci/clickhouse_queries/oss_ci_benchmark_names/params.json b/torchci/clickhouse_queries/oss_ci_benchmark_names/params.json
@@ -2,8 +2,8 @@
   "deviceArch": "String",
   "dtypes": "Array(String)",
   "excludedMetrics": "Array(String)",
-  "filenames": "Array(String)",
-  "names": "Array(String)",
+  "benchmarks": "Array(String)",
+  "models": "Array(String)",
   "repo": "String",
   "startTime": "DateTime64(3)",
   "stopTime": "DateTime64(3)"
diff --git a/torchci/clickhouse_queries/oss_ci_benchmark_names/query.sql b/torchci/clickhouse_queries/oss_ci_benchmark_names/query.sql
@@ -1,51 +1,70 @@
 --- This query is used by HUD benchmarks dashboards to get the list of experiment names
+WITH benchmarks AS ( -- rows from oss_ci_benchmark_v3 after the parameter filters, with device/arch resolved
+    SELECT
+        o.benchmark.name AS benchmark,
+        o.model.name AS model,
+        o.model.backend AS backend,
+        o.metric.name AS metric,
+        o.benchmark.dtype AS dtype,
+        IF( -- device: extra_info['device'] when runners is empty, otherwise the first runner's name
+            empty(o.runners),
+            tupleElement(o.benchmark, 'extra_info') [ 'device' ],
+            tupleElement(o.runners [ 1 ], 'name')
+        ) AS device,
+        IF( -- arch: extra_info['arch'] when runners is empty, otherwise the first runner's type
+            empty(o.runners),
+            tupleElement(o.benchmark, 'extra_info') [ 'arch' ],
+            tupleElement(o.runners [ 1 ], 'type')
+        ) AS arch
+    FROM
+        benchmark.oss_ci_benchmark_v3 o
+    WHERE
+        o.timestamp >= toUnixTimestamp({startTime: DateTime64(3) }) -- compared as Unix seconds; window is half-open [startTime, stopTime)
+        AND o.timestamp < toUnixTimestamp({stopTime: DateTime64(3) })
+        AND o.repo = {repo: String }
+        AND ( -- for each array parameter below, an empty array disables that filter
+            has({benchmarks: Array(String) }, o.benchmark.name)
+            OR empty({benchmarks: Array(String) })
+        )
+        AND (
+            has({models: Array(String) }, o.model.name)
+            OR empty({models: Array(String) })
+        )
+        AND (
+            has({dtypes: Array(String) }, o.benchmark.dtype)
+            OR empty({dtypes: Array(String) })
+        )
+        AND ( -- drop rows whose metric name is listed in excludedMetrics
+            NOT has({excludedMetrics: Array(String) }, o.metric.name)
+            OR empty({excludedMetrics: Array(String) })
+        )
+        AND notEmpty(o.metric.name)
+        AND notEmpty(o.benchmark.dtype)
+)
 SELECT
-    DISTINCT o.filename AS filename,
-    o.name,
-    o.metric,
-    o.dtype,
-    o.device,
-    -- NB: Default to NVIDIA A100-SXM4-40GB for old records without arch column
-    IF(empty(o.arch), 'NVIDIA A100-SXM4-40GB', o.arch) AS arch
+    DISTINCT benchmark,
+    CONCAT(model, ' ', backend) AS name, -- NOTE(review): ends with a trailing space whenever backend is empty; confirm consumers expect that
+    metric,
+    dtype,
+    device,
+    arch
 FROM
-    benchmark.oss_ci_benchmark_v2 o
-    LEFT JOIN default .workflow_run w FINAL ON o.workflow_id = w.id
+    benchmarks
 WHERE
-    o.timestamp >= toUnixTimestamp64Milli({startTime: DateTime64(3) })
-    AND o.timestamp < toUnixTimestamp64Milli({stopTime: DateTime64(3) })
-    AND (
-        has({filenames: Array(String) }, o.filename)
-        OR empty({filenames: Array(String) })
-    )
-    AND (
-        has({names: Array(String) }, o.name)
-        OR empty({names: Array(String) })
-    )
     -- NB: DEVICE (ARCH) is the display format used by HUD when grouping together these two fields
-    AND (
+    (
         CONCAT(
-            o.device,
+            device,
             ' (',
-            IF(empty(o.arch), 'NVIDIA A100-SXM4-40GB', o.arch),
+            IF(empty(arch), 'NVIDIA A100-SXM4-40GB', arch), -- an empty arch is matched as 'NVIDIA A100-SXM4-40GB'
             ')'
         ) = {deviceArch: String }
         OR {deviceArch: String } = ''
     )
-    AND (
-        has({dtypes: Array(String) }, o.dtype)
-        OR empty({dtypes: Array(String) })
-    )
-    AND (
-        NOT has({excludedMetrics: Array(String) }, o.metric)
-        OR empty({excludedMetrics: Array(String) })
-    )
-    AND notEmpty(o.metric)
-    AND w.html_url LIKE CONCAT('%', {repo: String }, '%')
-    AND notEmpty(o.dtype)
-    AND notEmpty(o.device)
+    AND notEmpty(device) -- skip rows for which no device value was resolved
 ORDER BY
-    o.filename,
-    o.name,
-    o.metric,
-    o.dtype,
-    o.device
+    benchmark,
+    name,
+    metric,
+    dtype,
+    device
diff --git a/torchci/components/benchmark/llms/common.tsx b/torchci/components/benchmark/llms/common.tsx
@@ -2,8 +2,8 @@ import { BranchAndCommit } from "lib/types";
 
 export const REPOS = ["pytorch/pytorch", "pytorch/executorch"];
 export const REPO_TO_BENCHMARKS: { [k: string]: string[] } = {
-  "pytorch/pytorch": ["gpt_fast_benchmark"],
-  "pytorch/executorch": ["android-perf", "apple-perf"],
+  "pytorch/pytorch": ["PyTorch gpt-fast benchmark"], // NOTE(review): presumably must equal benchmark.name values stored in oss_ci_benchmark_v3 — confirm
+  "pytorch/executorch": ["ExecuTorch"],
 };
 export const EXCLUDED_METRICS: string[] = ["load_status"];
 export const DEFAULT_MODEL_NAME = "All Models";
diff --git a/torchci/lib/benchmark/llmUtils.ts b/torchci/lib/benchmark/llmUtils.ts
diff --git a/torchci/pages/benchmark/llms.tsx b/torchci/pages/benchmark/llms.tsx