MaterializeInc
diff --git a/‎ci/mkpipeline.py
Lines changed: 60 additions & 101 deletions b/‎ci/mkpipeline.py
Lines changed: 60 additions & 101 deletions
diff --git a/‎ci/nightly/pipeline.template.yml
Lines changed: 2 additions & 3 deletions b/‎ci/nightly/pipeline.template.yml
Lines changed: 2 additions & 3 deletions
diff --git a/‎ci/test/lint-main/checks/check-pipeline.sh
Lines changed: 43 additions & 0 deletions b/‎ci/test/lint-main/checks/check-pipeline.sh
Lines changed: 43 additions & 0 deletions
diff --git a/‎misc/python/materialize/checks/all_checks/sink.py
Lines changed: 3 additions & 0 deletions b/‎misc/python/materialize/checks/all_checks/sink.py
Lines changed: 3 additions & 0 deletions
diff --git a/‎src/persist-client/src/internal/state.rs
Lines changed: 1 addition & 0 deletions b/‎src/persist-client/src/internal/state.rs
Lines changed: 1 addition & 0 deletions
diff --git a/‎test/pubsub-disruption/mzcompose.py
Lines changed: 3 additions & 0 deletions b/‎test/pubsub-disruption/mzcompose.py
Lines changed: 3 additions & 0 deletions
diff --git a/‎test/sqllogictest/materialized_views.slt
Lines changed: 1 addition & 1 deletion b/‎test/sqllogictest/materialized_views.slt
Lines changed: 1 addition & 1 deletion
@@ -82,6 +82,7 @@ def main() -> int:
 so it is executed.""",
     )
 
+    parser.add_argument("--dry-run", action="store_true")
     parser.add_argument("--coverage", action="store_true")
     parser.add_argument(
         "--sanitizer",
@@ -193,6 +194,7 @@ def fetch_hashes() -> None:
                 args.bazel_remote_cache,
                 bazel_lto,
             )
+    truncate_skip_length(pipeline)
     handle_sanitizer_skip(pipeline, args.sanitizer)
     increase_agents_timeouts(pipeline, args.sanitizer, args.coverage)
     prioritize_pipeline(pipeline, args.priority)
@@ -221,9 +223,10 @@ def fetch_hashes() -> None:
 
     print("--- Uploading new pipeline:")
     print(yaml.dump(pipeline))
-    spawn.runv(
-        ["buildkite-agent", "pipeline", "upload"], stdin=yaml.dump(pipeline).encode()
-    )
+    cmd = ["buildkite-agent", "pipeline", "upload"]
+    if args.dry_run:
+        cmd.append("--dry-run")
+    spawn.runv(cmd, stdin=yaml.dump(pipeline).encode())
 
     return 0
 
@@ -264,56 +267,48 @@ def prioritize_pipeline(pipeline: Any, priority: int) -> None:
     if build_author == "Dependabot":
         priority -= 40
 
-    def visit(config: Any) -> None:
+    for step in steps(pipeline):
+        if "trigger" in step or "wait" in step or "group" in step:
+            # Trigger, Wait and Group steps do not allow priorities.
+            continue
         # Increase priority for larger Hetzner-based tests so that they get
         # preferential treatment on the agents which also accept smaller jobs.
         agent_priority = 0
-        if "agents" in config:
-            agent = config["agents"].get("queue", None)
+        if "agents" in step:
+            agent = step["agents"].get("queue", None)
             if agent == "hetzner-aarch64-8cpu-16gb":
                 agent_priority = 1
             if agent == "hetzner-aarch64-16cpu-32gb":
                 agent_priority = 2
-        config["priority"] = config.get("priority", 0) + priority + agent_priority
+        step["priority"] = step.get("priority", 0) + priority + agent_priority
 
-    for config in pipeline["steps"]:
-        if "trigger" in config or "wait" in config:
-            # Trigger and Wait steps do not allow priorities.
-            continue
-        if "group" in config:
-            for inner_config in config.get("steps", []):
-                visit(inner_config)
-            continue
-        visit(config)
+
+def truncate_skip_length(pipeline: Any, max_length: int = 70) -> None:
+    """Shorten over-long string ``skip`` reasons in place.
+
+    For every step yielded by ``steps(pipeline)`` whose ``skip`` value is a
+    string longer than ``max_length`` characters, the value is replaced by
+    its first ``max_length`` characters. Steps without a ``skip`` key and
+    steps whose ``skip`` is not a string (for example ``skip: true``) are
+    left untouched.
+
+    Args:
+        pipeline: The parsed pipeline; its steps are modified in place.
+        max_length: Maximum number of characters kept in a skip reason.
+            The default of 70 is the limit Buildkite documents for a skip
+            reason string.
+    """
+    for step in steps(pipeline):
+        skip = step.get("skip")
+        # Only strings are truncated: slicing any other value whose string
+        # form happens to be long would either raise (mappings) or leave the
+        # value just as long as before (lists).
+        if isinstance(skip, str) and len(skip) > max_length:
+            step["skip"] = skip[:max_length]
 
 
 def handle_sanitizer_skip(pipeline: Any, sanitizer: Sanitizer) -> None:
     if sanitizer != Sanitizer.none:
         pipeline.setdefault("env", {})["CI_SANITIZER"] = sanitizer.value
 
-        def visit(step: dict[str, Any]) -> None:
+        for step in steps(pipeline):
             if step.get("sanitizer") == "skip":
                 step["skip"] = True
 
     else:
 
-        def visit(step: dict[str, Any]) -> None:
+        for step in steps(pipeline):
             if step.get("sanitizer") == "only":
                 step["skip"] = True
 
-    for step in pipeline["steps"]:
-        visit(step)
-        if "group" in step:
-            for inner_step in step.get("steps", []):
-                visit(inner_step)
-
 
 def increase_agents_timeouts(
     pipeline: Any, sanitizer: Sanitizer, coverage: bool
 ) -> None:
     if sanitizer != Sanitizer.none or os.getenv("CI_SYSTEM_PARAMETERS", "") == "random":
-
-        def visit(step: dict[str, Any]) -> None:
+        for step in steps(pipeline):
             # Most sanitizer runs, as well as random permutations of system
             # parameters, are slower and need more memory. The default system
             # parameters in CI are chosen to be efficient for execution, while
@@ -359,13 +354,6 @@ def visit(step: dict[str, Any]) -> None:
                     agent = "hetzner-x86-64-dedi-48cpu-192gb"
                 step["agents"] = {"queue": agent}
 
-        for step in pipeline["steps"]:
-            visit(step)
-            # Groups can't be nested, so handle them explicitly here instead of recursing
-            if "group" in step:
-                for inner_step in step.get("steps", []):
-                    visit(inner_step)
-
     if coverage:
         pipeline["env"]["CI_COVERAGE_ENABLED"] = 1
 
@@ -487,92 +475,81 @@ def switch_jobs_to_aws(pipeline: Any, priority: int) -> None:
 
     print(f"Queues stuck in Hetzner, switching to AWS or another arch: {stuck}")
 
-    def visit(config: Any) -> None:
-        if "agents" not in config:
-            return
+    for step in steps(pipeline):
+        # Trigger, Wait and Group steps don't have agents
+        if "trigger" in step or "wait" in step or "group" in step:
+            continue
 
-        agent = config["agents"].get("queue", None)
+        if "agents" not in step:
+            continue
+
+        agent = step["agents"].get("queue", None)
         if not agent in stuck:
-            return
+            continue
 
         if agent == "hetzner-aarch64-2cpu-4gb":
             if "hetzner-x86-64-2cpu-4gb" not in stuck:
-                config["agents"]["queue"] = "hetzner-x86-64-2cpu-4gb"
-                if config.get("depends_on") == "build-aarch64":
-                    config["depends_on"] = "build-x86_64"
+                step["agents"]["queue"] = "hetzner-x86-64-2cpu-4gb"
+                if step.get("depends_on") == "build-aarch64":
+                    step["depends_on"] = "build-x86_64"
             else:
-                config["agents"]["queue"] = "linux-aarch64"
+                step["agents"]["queue"] = "linux-aarch64"
         elif agent == "hetzner-aarch64-4cpu-8gb":
             if "hetzner-x86-64-4cpu-8gb" not in stuck:
-                config["agents"]["queue"] = "hetzner-x86-64-4cpu-8gb"
-                if config.get("depends_on") == "build-aarch64":
-                    config["depends_on"] = "build-x86_64"
+                step["agents"]["queue"] = "hetzner-x86-64-4cpu-8gb"
+                if step.get("depends_on") == "build-aarch64":
+                    step["depends_on"] = "build-x86_64"
             else:
-                config["agents"]["queue"] = "linux-aarch64"
+                step["agents"]["queue"] = "linux-aarch64"
         elif agent == "hetzner-aarch64-8cpu-16gb":
             if "hetzner-x86-64-8cpu-16gb" not in stuck:
-                config["agents"]["queue"] = "hetzner-x86-64-8cpu-16gb"
-                if config.get("depends_on") == "build-aarch64":
-                    config["depends_on"] = "build-x86_64"
+                step["agents"]["queue"] = "hetzner-x86-64-8cpu-16gb"
+                if step.get("depends_on") == "build-aarch64":
+                    step["depends_on"] = "build-x86_64"
             else:
-                config["agents"]["queue"] = "linux-aarch64-medium"
+                step["agents"]["queue"] = "linux-aarch64-medium"
 
         elif agent == "hetzner-aarch64-16cpu-32gb":
             if "hetzner-x86-64-16cpu-32gb" not in stuck:
-                config["agents"]["queue"] = "hetzner-x86-64-16cpu-32gb"
-                if config.get("depends_on") == "build-aarch64":
-                    config["depends_on"] = "build-x86_64"
+                step["agents"]["queue"] = "hetzner-x86-64-16cpu-32gb"
+                if step.get("depends_on") == "build-aarch64":
+                    step["depends_on"] = "build-x86_64"
             else:
-                config["agents"]["queue"] = "linux-aarch64-medium"
+                step["agents"]["queue"] = "linux-aarch64-medium"
 
         elif agent in ("hetzner-x86-64-4cpu-8gb", "hetzner-x86-64-2cpu-4gb"):
-            config["agents"]["queue"] = "linux-x86_64"
+            step["agents"]["queue"] = "linux-x86_64"
         elif agent in ("hetzner-x86-64-8cpu-16gb", "hetzner-x86-64-16cpu-32gb"):
-            config["agents"]["queue"] = "linux-x86_64-medium"
+            step["agents"]["queue"] = "linux-x86_64-medium"
         elif agent == "hetzner-x86-64-dedi-2cpu-8gb":
-            config["agents"]["queue"] = "linux-x86_64"
+            step["agents"]["queue"] = "linux-x86_64"
         elif agent == "hetzner-x86-64-dedi-4cpu-16gb":
-            config["agents"]["queue"] = "linux-x86_64-medium"
+            step["agents"]["queue"] = "linux-x86_64-medium"
         elif agent in (
             "hetzner-x86-64-dedi-8cpu-32gb",
             "hetzner-x86-64-dedi-16cpu-64gb",
         ):
-            config["agents"]["queue"] = "linux-x86_64-large"
+            step["agents"]["queue"] = "linux-x86_64-large"
         elif agent in (
             "hetzner-x86-64-dedi-32cpu-128gb",
             "hetzner-x86-64-dedi-48cpu-192gb",
         ):
-            config["agents"]["queue"] = "builder-linux-x86_64"
-
-    for config in pipeline["steps"]:
-        if "trigger" in config or "wait" in config:
-            # Trigger and Wait steps don't have agents
-            continue
-        if "group" in config:
-            for inner_config in config.get("steps", []):
-                visit(inner_config)
-            continue
-        visit(config)
+            step["agents"]["queue"] = "builder-linux-x86_64"
 
 
 def permit_rerunning_successful_steps(pipeline: Any) -> None:
-    def visit(step: Any) -> None:
+    for step in steps(pipeline):
+        if "trigger" in step or "wait" in step or "group" in step or "block" in step:
+            continue
         step.setdefault("retry", {}).setdefault("manual", {}).setdefault(
             "permit_on_passed", True
         )
 
-    for config in pipeline["steps"]:
-        if "trigger" in config or "wait" in config or "block" in config:
-            continue
-        if "group" in config:
-            for inner_config in config.get("steps", []):
-                visit(inner_config)
-            continue
-        visit(config)
-
 
 def set_retry_on_agent_lost(pipeline: Any) -> None:
-    def visit(step: Any) -> None:
+    for step in steps(pipeline):
+        if "trigger" in step or "wait" in step or "group" in step or "block" in step:
+            continue
         step.setdefault("retry", {}).setdefault("automatic", []).extend(
             [
                 {
@@ -591,15 +568,6 @@ def visit(step: Any) -> None:
             ]
         )
 
-    for config in pipeline["steps"]:
-        if "trigger" in config or "wait" in config or "block" in config:
-            continue
-        if "group" in config:
-            for inner_config in config.get("steps", []):
-                visit(inner_config)
-            continue
-        visit(config)
-
 
 def set_default_agents_queue(pipeline: Any) -> None:
     for step in steps(pipeline):
@@ -614,19 +582,10 @@ def set_default_agents_queue(pipeline: Any) -> None:
 
 
 def set_parallelism_name(pipeline: Any) -> None:
-    def visit(step: Any) -> None:
+    for step in steps(pipeline):
         if step.get("parallelism", 1) > 1:
             step["label"] += " %N"
 
-    for config in pipeline["steps"]:
-        if "trigger" in config or "wait" in config or "block" in config:
-            continue
-        if "group" in config:
-            for inner_config in config.get("steps", []):
-                visit(inner_config)
-            continue
-        visit(config)
-
 
 def check_depends_on(pipeline: Any, pipeline_name: str) -> None:
     if pipeline_name not in ("test", "nightly", "release-qualification"):
@@ -639,7 +598,7 @@ def check_depends_on(pipeline: Any, pipeline_name: str) -> None:
         # has completed, without waiting for block or wait steps unless those
         # are also explicit dependencies.
         if step.get("id") in ("analyze", "deploy", "coverage-pr-analyze"):
-            return
+            continue
 
         if (
             "depends_on" not in step
 
@@ -425,8 +425,7 @@ steps:
         depends_on: build-aarch64
         timeout_in_minutes: 150
         parallelism: 8
-        # disabled by default
-        skip: true
+        skip: "disabled by default"
         agents:
           queue: hetzner-aarch64-8cpu-16gb
         plugins:
@@ -1073,7 +1072,7 @@ steps:
               # Uses .td-file based parallelism instead
               args: [-m=long, test/cloudtest/test_upgrade.py, --no-test-parallelism]
         sanitizer: skip
-        skip: "TODO(def-): Reenable in one version when labels are fixed in old version"
+        skip: "TODO(def-) Reenable in one version when labels are fixed in old version"
 
   - group: "K8s node recovery cloudtest"
     key: k8s-node-recovery
 
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+
+# Copyright Materialize, Inc. and contributors. All rights reserved.
+#
+# Use of this software is governed by the Business Source License
+# included in the LICENSE file at the root of this repository.
+#
+# As of the Change Date specified in that file, in accordance with
+# the Business Source License, use of this software will be governed
+# by the Apache License, Version 2.0.
+#
+# check-pipeline.sh: Sanity check for pipelines
+#
+# Runs `ci.mkpipeline <name> --dry-run` for every directory below ci/ that
+# contains a pipeline.template.yml (ci/test/pipeline.template.yml excluded),
+# all in parallel, and then waits for each run. Does nothing outside of CI.
+
+set -euo pipefail
+
+# All paths below are relative to the directory four levels above this script.
+cd "$(dirname "$0")/../../../.."
+
+. misc/shlib/shlib.bash
+
+# Treat an unset or empty CI variable as "not running in CI".
+: "${CI:=0}"
+
+if ! is_truthy "$CI"; then
+    # Requires buildkite agent-access-token, which won't be available locally
+    exit
+fi
+
+# NOTE(review): presumably cleared so that settings of the surrounding CI job
+# do not influence the pipelines generated below -- confirm.
+unset CI_TEST_IDS
+unset CI_TEST_SELECTION
+unset CI_SANITIZER
+unset CI_COVERAGE_ENABLED
+unset CI_WAITING_FOR_BUILD
+
+# Start one background dry-run per pipeline and remember its PID. The pipeline
+# name is the second path component of the template's directory (ci/<name>/...).
+pids=()
+for pipeline in $(find ci -name "pipeline.template.yml" -not -path "ci/test/pipeline.template.yml" -exec dirname {} \; | cut -d/ -f2); do
+    bin/pyactivate -m ci.mkpipeline "$pipeline" --dry-run &
+    pids+=($!)
+done
+
+# Wait for every background run. `try` and `try_status_report` are not defined
+# in this script; presumably they come from shlib.bash and record failures so
+# that all runs are awaited before the overall status is reported -- confirm.
+for pid in "${pids[@]}"; do
+    try wait "$pid"
+done
+
+try_status_report
@@ -709,6 +709,9 @@ def validate(self) -> Testdrive:
 
                 # We check the contents of the sink topics by re-ingesting them.
 
+                # Still needs to sleep some before the topic exists
+                $ sleep-is-probably-flaky-i-have-justified-my-need-with-a-comment duration="5s"
+
                 > CREATE SOURCE sink_view_comments1_src
                   FROM KAFKA CONNECTION kafka_conn (TOPIC 'sink-sink-comments1')
                 > CREATE TABLE sink_view_comments1 FROM SOURCE sink_view_comments1_src (REFERENCE "sink-sink-comments1")
 
@@ -4372,6 +4372,7 @@ pub(crate) mod tests {
     /// This golden will have to be updated each time we change State, but
     /// that's a feature, not a bug.
     #[mz_ore::test]
+    #[cfg_attr(miri, ignore)] // too slow
     fn state_inspect_serde_json() {
         const STATE_SERDE_JSON: &str = include_str!("state_serde.json");
         let mut runner = proptest::test_runner::TestRunner::deterministic();
 
@@ -136,6 +136,8 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None:
             input=SCHEMA
             + dedent(
                 """
+                $ set-sql-timeout duration=120s
+
                 > UPDATE t1 SET f2 = 3;
                 $ kafka-ingest format=avro key-format=avro topic=pubsub-disruption schema=${schema} key-schema=${keyschema} start-iteration=1 repeat=1000000
                 {"f1": ${kafka-ingest.iteration}} {"f2": 3}
@@ -167,6 +169,7 @@ def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None:
             input=SCHEMA
             + dedent(
                 """
+                $ set-sql-timeout duration=120s
                 > UPDATE t1 SET f2 = 4;
                 $ kafka-ingest format=avro key-format=avro topic=pubsub-disruption schema=${schema} key-schema=${keyschema} start-iteration=1 repeat=1000000
                 {"f1": ${kafka-ingest.iteration}} {"f2": 4}
 
@@ -1568,7 +1568,7 @@ c_schedule_5  manual  NULL
 c_schedule_hydration_time_estimate  on-refresh  00:16:35
 
 statement ok
-SELECT mz_unsafe.mz_sleep(4);
+SELECT mz_unsafe.mz_sleep(8);
 
 query TTTTBT rowsort
 SELECT DISTINCT