Add a completeness check for samples to CI.

Xreki · Xreki · commit b2f86ade5df6 · 2025-09-28T12:06:31.000+08:00
diff --git a/.github/workflows/Codestyle-Check.yml b/.github/workflows/Codestyle-Check.yml
@@ -55,3 +55,17 @@ jobs:
           set +e
           bash -x tools/codestyle/pre_commit.sh;EXCODE=$?
           exit $EXCODE
+
+      - name: Check samples
+        if: steps.check-bypass.outputs.can-skip != 'true'
+        run: |
+          set +e
+          python3.10 tools/check_samples.py;EXCODE=$?
+          exit $EXCODE
+
+      - name: Count samples
+        if: steps.check-bypass.outputs.can-skip != 'true'
+        run: |
+          set +e
+          python3.10 tools/count_sample.py;EXCODE=$?
+          exit $EXCODE
diff --git a/samples/transformers-auto-model/Qwen1.5-0.5B/graph_net.json b/samples/transformers-auto-model/Qwen1.5-0.5B/graph_net.json
@@ -4,5 +4,5 @@
     "num_nodes_required": 1,
     "dynamic": false,
     "model_name": "Qwen/Qwen1.5-0.5B",
-    "heuristic_tag": "unknown"
-}
+    "heuristic_tag": "nlp"
+}
diff --git a/samples/transformers-auto-model/joeddav_xlm-roberta-large-xnli/graph_net.json b/samples/transformers-auto-model/joeddav_xlm-roberta-large-xnli/graph_net.json
@@ -0,0 +1,7 @@
+{
+    "framework": "torch",
+    "num_devices_required": 1,
+    "num_nodes_required": 1,
+    "source": "huggingface_hub",
+    "heuristic_tag": "nlp"
+}
diff --git a/samples/transformers-auto-model/joeddav_xlm-roberta-large-xnli/input_meta.py b/samples/transformers-auto-model/joeddav_xlm-roberta-large-xnli/input_meta.py
diff --git a/tools/check_samples.py b/tools/check_samples.py
@@ -0,0 +1,88 @@
+import os
+
+
+def check_completeness(samples_dir):
+    """Print which samples under ``samples_dir`` lack required files.
+
+    A sample is any walked directory that contains ``model.py`` and whose path
+    does not contain ``shape_patches_``. Returns True only if none is missing
+    ``graph_hash.txt``, ``graph_net.json``, ``input_meta.py`` or ``weight_meta.py``.
+    """
+
+    def lacks(sample_dir, filename):
+        return not os.path.exists(os.path.join(sample_dir, filename))
+
+    no_hash, no_json, no_meta = [], [], []
+    for sample_dir, _subdirs, filenames in os.walk(samples_dir):
+        # Skip paths containing "shape_patches_" and directories without model.py.
+        if "shape_patches_" in sample_dir or "model.py" not in filenames:
+            continue
+        if lacks(sample_dir, "graph_hash.txt"):
+            no_hash.append(sample_dir)
+        if lacks(sample_dir, "graph_net.json"):
+            no_json.append(sample_dir)
+        if lacks(sample_dir, "input_meta.py") or lacks(sample_dir, "weight_meta.py"):
+            no_meta.append(sample_dir)
+
+    # One (headline, offenders) pair per report section, in output order.
+    sections = [
+        (f"1. {len(no_hash)} samples missing graph_hash.txt", no_hash),
+        (f"2. {len(no_json)} samples missing graph_net.json", no_json),
+        (f"3. {len(no_meta)} samples missing input_meta.py or weight_meta.py", no_meta),
+    ]
+    print(f"Check completeness result for {samples_dir}:")
+    for headline, offenders in sections:
+        print(headline)
+        for path in offenders:
+            print(f"  - {path}")
+    print()
+    return not (no_hash or no_json or no_meta)
+
+
+def check_redandancy(samples_dir):
+    """Group samples under ``samples_dir`` by graph hash and print duplicates.
+
+    Returns ``(has_duplicates, graph_hash2model_paths)``; each hash maps to the
+    sorted list of sample directories whose ``graph_hash.txt`` holds that hash.
+    """
+    graph_hash2model_paths = {}
+    for model_path, _dirs, files in os.walk(samples_dir):
+        if "graph_hash.txt" in files:
+            # Close the file promptly; strip so stray whitespace cannot hide a match.
+            with open(os.path.join(model_path, "graph_hash.txt")) as f:
+                graph_hash = f.read().strip()
+            graph_hash2model_paths.setdefault(graph_hash, []).append(model_path)
+
+    print(f"Totally {len(graph_hash2model_paths)} unique samples under {samples_dir}.")
+    for graph_hash, model_paths in graph_hash2model_paths.items():
+        model_paths.sort()  # in place, so the printed and returned order agree
+        if len(model_paths) > 1:
+            print(f"Redundant models detected for grap_hash {graph_hash}:")
+            for model_path in model_paths:
+                print(f"    {model_path}")
+    has_duplicates = any(len(p) > 1 for p in graph_hash2model_paths.values())
+    return has_duplicates, graph_hash2model_paths
+
+
+def main():
+    """Check every sample tree for completeness, then for duplicated hashes.
+
+    Raises AssertionError explicitly (so it survives ``python -O``) on failure.
+    """
+    root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    samples_dirs = [
+        os.path.join(root_dir, dirname) for dirname in ("samples", "paddle_samples")
+    ]
+
+    # Materialize the results first: chaining with `and` short-circuits and
+    # would skip reporting the later trees once an earlier one has failed.
+    if not all([check_completeness(d) for d in samples_dirs]):
+        raise AssertionError("Please fix the incompleted samples!")
+
+    # Only the duplicate flag is needed here; the hash mapping is discarded.
+    if any([check_redandancy(d)[0] for d in samples_dirs]):
+        raise AssertionError("Please remove the redundant samples!")
+
+
+if __name__ == "__main__":  # run the checks only when executed as a script
+    main()
diff --git a/tools/ci/check_validate.sh b/tools/ci/check_validate.sh
@@ -41,7 +41,6 @@ function prepare_torch_env() {
     pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 > /dev/null
     [ $? -ne 0 ] && LOG "[FATAL] Install torch2.9.0 failed!" && exit -1
   else
-    python ${GRAPH_NET_EXTRACT_WORKSPACE}/tools/count_sample.py
     LOG "[INFO] This pull request doesn't change any torch samples, skip the CI."
   fi
 }
@@ -62,7 +61,6 @@ function prepare_paddle_env() {
     [ $? -ne 0 ] && LOG "[FATAL] Install paddlepaddle-develop failed!" && exit -1
     python -c "import paddle; print('[PaddlePaddle Commit]', paddle.version.commit)"
   else
-    python ${GRAPH_NET_EXTRACT_WORKSPACE}/tools/count_sample.py
     LOG "[INFO] This pull request doesn't change any paddle samples, skip the CI."
   fi
 }
diff --git a/tools/count_sample.py b/tools/count_sample.py
@@ -25,7 +25,7 @@
                     with open(os.path.join(root, "graph_net.json"), "r") as f:
                         data = json.load(f)
                         model_name = data.get("model_name", None)
-                    if model_name is not None:
+                    if model_name is not None and model_name != "NO_VALID_MATCH_FOUND":
                         if model_name not in model_names_set:
                             model_names_set.add(model_name)
                             graph_net_count += 1