Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions torchci/clickhouse_queries/vllm/ci_reliability/params.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"params": {
"granularity": "String",
"repo": "String",
"pipelineName": "String",
"startTime": "DateTime64(3)",
"stopTime": "DateTime64(3)"
},
"tests": [
{
"granularity": "day",
"repo": "https://github.com/vllm-project/vllm.git",
"pipelineName": "CI",
"startTime": "2025-09-26T00:00:00.000",
"stopTime": "2025-10-03T00:00:00.000"
}
]
}
66 changes: 66 additions & 0 deletions torchci/clickhouse_queries/vllm/ci_reliability/query.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
-- vLLM CI reliability metrics
-- Computes CI success rate, failure rate over time for Buildkite builds
-- Daily breakdown of build states (passed, failed, canceled)
-- Overall success rate and job-level reliability

WITH builds AS (
SELECT
tupleElement(pipeline, 'repository') AS repository,
tupleElement(pipeline, 'name') AS pipeline_name,
toUInt32(tupleElement(build, 'number')) AS build_number,
tupleElement(build, 'started_at') AS build_started_at,
tupleElement(build, 'finished_at') AS build_finished_at,
tupleElement(build, 'state') AS build_state,
formatDateTime(
DATE_TRUNC(
{granularity: String },
tupleElement(build, 'started_at')
),
'%Y-%m-%d'
) AS bucket
FROM vllm.vllm_buildkite_jobs
WHERE
tupleElement(pipeline, 'repository') = {repo: String }
AND tupleElement(pipeline, 'name') = {pipelineName: String }
AND tupleElement(build, 'started_at') IS NOT NULL
AND tupleElement(build, 'started_at') >= {startTime: DateTime64(3) }
AND tupleElement(build, 'started_at') < {stopTime: DateTime64(3) }
GROUP BY
repository,
pipeline_name,
build_number,
build_started_at,
build_finished_at,
build_state,
bucket
),

daily_stats AS (
SELECT
bucket,
countIf(lowerUTF8(build_state) IN ('passed', 'finished', 'success'))
AS passed_count,
countIf(lowerUTF8(build_state) = 'failed') AS failed_count,
countIf(lowerUTF8(build_state) IN ('canceled', 'cancelled'))
AS canceled_count,
passed_count + failed_count + canceled_count AS total_count,
passed_count + failed_count AS non_canceled_count,
if(
non_canceled_count > 0,
round(passed_count / non_canceled_count, 4),
NULL
) AS success_rate
FROM builds
GROUP BY bucket
)

SELECT
bucket AS granularity_bucket,
passed_count,
failed_count,
canceled_count,
total_count,
non_canceled_count,
success_rate
FROM daily_stats
ORDER BY granularity_bucket ASC
16 changes: 16 additions & 0 deletions torchci/clickhouse_queries/vllm/ci_run_duration/params.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"params": {
"repo": "String",
"pipelineName": "String",
"startTime": "DateTime64(3)",
"stopTime": "DateTime64(3)"
},
"tests": [
{
"repo": "vllm-project/vllm",
"pipelineName": "CI",
"startTime": "2025-09-26T00:00:00.000",
"stopTime": "2025-10-03T00:00:00.000"
}
]
}
32 changes: 32 additions & 0 deletions torchci/clickhouse_queries/vllm/ci_run_duration/query.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
-- vLLM CI run durations (Buildkite builds)
-- Lists per-build durations based on build.started_at and build.finished_at

WITH b AS (
SELECT
tupleElement(pipeline, 'repository') AS repository,
tupleElement(pipeline, 'name') AS pipeline_name,
toUInt32(tupleElement(build, 'number')) AS build_number,
tupleElement(build, 'started_at') AS build_started_at,
tupleElement(build, 'finished_at') AS build_finished_at,
tupleElement(build, 'state') AS build_state
FROM vllm.vllm_buildkite_jobs
WHERE
tupleElement(pipeline, 'repository') = {repo: String }
AND tupleElement(pipeline, 'name') = {pipelineName: String }
AND tupleElement(build, 'started_at') IS NOT NULL
AND tupleElement(build, 'finished_at') IS NOT NULL
AND tupleElement(build, 'started_at') >= {startTime: DateTime64(3) }
AND tupleElement(build, 'started_at') < {stopTime: DateTime64(3) }
)

SELECT
pipeline_name,
build_number,
max(build_started_at) AS started_at,
max(build_finished_at) AS finished_at,
any(build_state) AS build_state,
dateDiff('second', started_at, finished_at) AS duration_seconds,
round(duration_seconds / 3600.0, 3) AS duration_hours
FROM b
GROUP BY pipeline_name, build_number
ORDER BY started_at ASC
18 changes: 18 additions & 0 deletions torchci/clickhouse_queries/vllm/job_reliability/params.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"params": {
"repo": "String",
"pipelineName": "String",
"startTime": "DateTime64(3)",
"stopTime": "DateTime64(3)",
"minRuns": "UInt32"
},
"tests": [
{
"repo": "https://github.com/vllm-project/vllm.git",
"pipelineName": "CI",
"startTime": "2025-09-26T00:00:00.000",
"stopTime": "2025-10-03T00:00:00.000",
"minRuns": 3
}
]
}
56 changes: 56 additions & 0 deletions torchci/clickhouse_queries/vllm/job_reliability/query.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
-- vLLM per-job reliability metrics
-- Computes success rate for each individual job in the CI pipeline
-- Shows which jobs are most/least reliable

WITH jobs AS (
SELECT
tupleElement(pipeline, 'repository') AS repository,
tupleElement(pipeline, 'name') AS pipeline_name,
toUInt32(tupleElement(build, 'number')) AS build_number,
tupleElement(job, 'name') AS job_name,
tupleElement(job, 'state') AS job_state,
tupleElement(job, 'soft_failed') AS soft_failed,
tupleElement(job, 'finished_at') AS job_finished_at
FROM vllm.vllm_buildkite_jobs
WHERE
tupleElement(pipeline, 'repository') = {repo: String }
AND tupleElement(pipeline, 'name') = {pipelineName: String }
AND tupleElement(job, 'finished_at') IS NOT NULL
AND tupleElement(job, 'finished_at') >= {startTime: DateTime64(3) }
AND tupleElement(job, 'finished_at') < {stopTime: DateTime64(3) }
-- Exclude soft-failed jobs from reliability calculation
AND tupleElement(job, 'soft_failed') = 'false'
),

job_stats AS (
SELECT
job_name,
countIf(lowerUTF8(job_state) IN ('passed', 'finished', 'success'))
AS passed_count,
countIf(lowerUTF8(job_state) = 'failed') AS failed_count,
countIf(lowerUTF8(job_state) IN ('canceled', 'cancelled'))
AS canceled_count,
passed_count + failed_count + canceled_count AS total_count,
passed_count + failed_count AS non_canceled_count,
if(
non_canceled_count > 0,
round(passed_count / non_canceled_count, 4),
NULL
) AS success_rate
FROM jobs
GROUP BY job_name
HAVING non_canceled_count >= {minRuns: UInt32}
)

SELECT
job_name,
passed_count,
failed_count,
canceled_count,
total_count,
non_canceled_count,
success_rate
FROM job_stats
ORDER BY
success_rate ASC,
non_canceled_count DESC
19 changes: 17 additions & 2 deletions torchci/clickhouse_queries/vllm/merges_percentage/query.sql
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ manual_merged_prs AS (
manual_merged_prs_with_failures AS (
SELECT
bucket,
count(number) AS manual_merged_with_failures_count
count(DISTINCT number) AS manual_merged_with_failures_count
FROM
merged_prs
LEFT JOIN latest_buildkite_jobs ON toString(merged_prs.number) = latest_buildkite_jobs.number
Expand All @@ -118,6 +118,19 @@ manual_merged_prs_with_failures AS (
GROUP BY
bucket
),
manual_merged_prs_pending AS (
SELECT
bucket,
count(DISTINCT number) AS manual_merged_pending_count
FROM
merged_prs
LEFT JOIN latest_buildkite_jobs ON toString(merged_prs.number) = latest_buildkite_jobs.number
WHERE
tupleElement(auto_merge, 'merge_method') = ''
AND job_state IN ('running', 'pending', 'scheduled')
GROUP BY
bucket
),
auto_merged_prs AS (
SELECT
bucket,
Expand All @@ -137,14 +150,16 @@ results AS (
abandon_count,
auto_merged_count,
manual_merged_count,
manual_merged_with_failures_count
manual_merged_with_failures_count,
manual_merged_pending_count
FROM
total_prs
LEFT JOIN open_prs ON total_prs.bucket = open_prs.bucket
LEFT JOIN abandon_prs ON total_prs.bucket = abandon_prs.bucket
LEFT JOIN auto_merged_prs ON total_prs.bucket = auto_merged_prs.bucket
LEFT JOIN manual_merged_prs ON total_prs.bucket = manual_merged_prs.bucket
LEFT JOIN manual_merged_prs_with_failures ON total_prs.bucket = manual_merged_prs_with_failures.bucket
LEFT JOIN manual_merged_prs_pending ON total_prs.bucket = manual_merged_prs_pending.bucket
)
SELECT
*
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"params": {
"repo": "String",
"startTime": "DateTime64(3)",
"stopTime": "DateTime64(3)"
},
"tests": [
{
"repo": "vllm-project/vllm",
"startTime": "2025-09-22T00:00:00.000",
"stopTime": "2025-09-29T00:00:00.000"
}
]
}
Loading