diff --git a/samples/transformers-auto-model/Qwen1.5-0.5B/graph_net.json b/samples/transformers-auto-model/Qwen1.5-0.5B/graph_net.json
index 1dc8f04d6..040a66cde 100644
--- a/samples/transformers-auto-model/Qwen1.5-0.5B/graph_net.json
+++ b/samples/transformers-auto-model/Qwen1.5-0.5B/graph_net.json
@@ -4,5 +4,5 @@
     "num_nodes_required": 1,
     "dynamic": false,
     "model_name": "Qwen/Qwen1.5-0.5B",
-    "heuristic_tag": "unknown"
-}
\ No newline at end of file
+    "heuristic_tag": "nlp"
+}
diff --git a/samples/transformers-auto-model/joeddav_xlm-roberta-large-xnli/graph_net.json b/samples/transformers-auto-model/joeddav_xlm-roberta-large-xnli/graph_net.json
new file mode 100644
index 000000000..bb6c06e2f
--- /dev/null
+++ b/samples/transformers-auto-model/joeddav_xlm-roberta-large-xnli/graph_net.json
@@ -0,0 +1,7 @@
+{
+    "framework": "torch",
+    "num_devices_required": 1,
+    "num_nodes_required": 1,
+    "source": "huggingface_hub",
+    "heuristic_tag": "nlp"
+}
diff --git a/samples/transformers-auto-model/joeddav_xlm-roberta-large-xnli/input_meta.py b/samples/transformers-auto-model/joeddav_xlm-roberta-large-xnli/input_meta.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tools/check_and_count_samples.py b/tools/check_and_count_samples.py
new file mode 100644
index 000000000..f4ab5cc1c
--- /dev/null
+++ b/tools/check_and_count_samples.py
@@ -0,0 +1,131 @@
+import os
+import json
+
+
+def check_completeness(samples_dir):
+    samples_missing_hash = []
+    samples_missing_json = []
+    samples_missing_meta = []
+    for root, dirs, files in os.walk(samples_dir):
+        if "shape_patches_" not in root and "model.py" in files:
+            model_path = root
+            if not os.path.exists(os.path.join(model_path, "graph_hash.txt")):
+                samples_missing_hash.append(model_path)
+            if not os.path.exists(os.path.join(model_path, "graph_net.json")):
+                samples_missing_json.append(model_path)
+            if not os.path.exists(
+                os.path.join(model_path, "input_meta.py")
+            ) or not os.path.exists(os.path.join(model_path, "weight_meta.py")):
+                samples_missing_meta.append(model_path)
+
+    all_samples_complete = (
+        len(samples_missing_hash) == 0
+        and len(samples_missing_json) == 0
+        and len(samples_missing_meta) == 0
+    )
+
+    if not all_samples_complete:
+        print(f"Completeness check result for {samples_dir}:")
+        print(f"1. {len(samples_missing_hash)} samples missing graph_hash.txt")
+        for model_path in samples_missing_hash:
+            print(f"  - {model_path}")
+
+        print(f"2. {len(samples_missing_json)} samples missing graph_net.json")
+        for model_path in samples_missing_json:
+            print(f"  - {model_path}")
+
+        print(
+            f"3. {len(samples_missing_meta)} samples missing input_meta.py or weight_meta.py"
+        )
+        for model_path in samples_missing_meta:
+            print(f"  - {model_path}")
+        print()
+
+    return all_samples_complete
+
+
+def check_redundancy(samples_dir):
+    graph_hash2model_paths = {}
+    for root, dirs, files in os.walk(samples_dir):
+        if "graph_hash.txt" in files:
+            model_path = root
+            graph_hash_path = os.path.join(model_path, "graph_hash.txt")
+            graph_hash = open(graph_hash_path).read()
+            if graph_hash not in graph_hash2model_paths:
+                graph_hash2model_paths[graph_hash] = [model_path]
+            else:
+                graph_hash2model_paths[graph_hash].append(model_path)
+
+    has_duplicates = False
+    print(f"Found {len(graph_hash2model_paths)} unique graphs under {samples_dir}.")
+    for graph_hash, model_paths in graph_hash2model_paths.items():
+        graph_hash2model_paths[graph_hash] = sorted(model_paths)
+        if len(model_paths) > 1:
+            has_duplicates = True
+            print(f"Redundant models detected for graph_hash {graph_hash}:")
+            for model_path in model_paths:
+                print(f"  {model_path}")
+    return has_duplicates, graph_hash2model_paths
+
+
+def count_samples(samples_dir, framework):
+    model_sources = os.listdir(samples_dir)
+
+    graph_net_count = 0
+    graph_net_dict = {}
+    model_names_set = set()
+    for source in model_sources:
+        source_dir = os.path.join(samples_dir, source)
+        if os.path.isdir(source_dir):
+            graph_net_dict[source] = 0
+            for root, dirs, files in os.walk(source_dir):
+                if "graph_net.json" in files:
+                    with open(os.path.join(root, "graph_net.json"), "r") as f:
+                        data = json.load(f)
+                    model_name = data.get("model_name", None)
+                    if model_name is not None and model_name != "NO_VALID_MATCH_FOUND":
+                        if model_name not in model_names_set:
+                            model_names_set.add(model_name)
+                            graph_net_count += 1
+                            graph_net_dict[source] += 1
+                    else:
+                        graph_net_count += 1
+                        graph_net_dict[source] += 1
+
+    print(f"Number of {framework} samples: {graph_net_count}")
+    for name, number in graph_net_dict.items():
+        print(f"- {name:24}: {number}")
+    print()
+
+
+def main():
+    filename = os.path.abspath(__file__)
+    root_dir = os.path.dirname(os.path.dirname(filename))
+
+    framework2dirname = {
+        "torch": "samples",
+        "paddle": "paddle_samples",
+    }
+
+    all_samples_complete = True
+    for samples_dirname in framework2dirname.values():
+        samples_dir = os.path.join(root_dir, samples_dirname)
+        # Run the check first so later directories are still reported on failure.
+        all_samples_complete = check_completeness(samples_dir) and all_samples_complete
+    assert all_samples_complete, "Please fix the incomplete samples!"
+
+    has_any_duplicates = False
+    for samples_dirname in framework2dirname.values():
+        samples_dir = os.path.join(root_dir, samples_dirname)
+        has_duplicates, graph_hash2model_paths = check_redundancy(samples_dir)
+        has_any_duplicates = has_any_duplicates or has_duplicates
+        print()
+    assert not has_any_duplicates, "Please remove the redundant samples!"
+
+    for framework, dirname in framework2dirname.items():
+        samples_dir = os.path.join(root_dir, dirname)
+        count_samples(samples_dir, framework)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/ci/check_validate.sh b/tools/ci/check_validate.sh
index f443cd9ea..64e45119a 100644
--- a/tools/ci/check_validate.sh
+++ b/tools/ci/check_validate.sh
@@ -41,7 +41,6 @@ function prepare_torch_env() {
         pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126 > /dev/null
         [ $? -ne 0 ] && LOG "[FATAL] Install torch2.9.0 failed!" && exit -1
    else
-        python ${GRAPH_NET_EXTRACT_WORKSPACE}/tools/count_sample.py
         LOG "[INFO] This pull request doesn't change any torch samples, skip the CI."
     fi
 }
@@ -62,7 +61,6 @@ function prepare_paddle_env() {
         [ $? -ne 0 ] && LOG "[FATAL] Install paddlepaddle-develop failed!" && exit -1
         python -c "import paddle; print('[PaddlePaddle Commit]', paddle.version.commit)"
     else
-        python ${GRAPH_NET_EXTRACT_WORKSPACE}/tools/count_sample.py
         LOG "[INFO] This pull request doesn't change any paddle samples, skip the CI."
     fi
 }
@@ -165,7 +163,8 @@ function main() {
     check_validation_info=$(check_paddle_validation)
     check_validation_code=$?
     summary_problems $check_validation_code "$check_validation_info"
-    python ${GRAPH_NET_EXTRACT_WORKSPACE}/tools/count_sample.py
+    python ${GRAPH_NET_EXTRACT_WORKSPACE}/tools/check_and_count_samples.py >&2
+    [ $? -ne 0 ] && LOG "[FATAL] Check completeness or redundancy failed!" && exit -1
     LOG "[INFO] check_validation run success and no error!"
 }
 
diff --git a/tools/count_sample.py b/tools/count_sample.py
deleted file mode 100644
index 3addb4d5a..000000000
--- a/tools/count_sample.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import os
-import json
-
-
-filename = os.path.abspath(__file__)
-root_dir = os.path.dirname(os.path.dirname(filename))
-framework2dirname = {
-    "torch": "samples",
-    "paddle": "paddle_samples",
-}
-
-for framework in ["torch", "paddle"]:
-    samples_dir = os.path.join(root_dir, framework2dirname[framework])
-    model_categories = os.listdir(samples_dir)
-
-    graph_net_count = 0
-    graph_net_dict = {}
-    model_names_set = set()
-    for category in model_categories:
-        category_dir = os.path.join(samples_dir, category)
-        if os.path.isdir(category_dir):
-            graph_net_dict[category] = 0
-            for root, dirs, files in os.walk(category_dir):
-                if "graph_net.json" in files:
-                    with open(os.path.join(root, "graph_net.json"), "r") as f:
-                        data = json.load(f)
-                    model_name = data.get("model_name", None)
-                    if model_name is not None:
-                        if model_name not in model_names_set:
-                            model_names_set.add(model_name)
-                            graph_net_count += 1
-                            graph_net_dict[category] += 1
-                    else:
-                        graph_net_count += 1
-                        graph_net_dict[category] += 1
-
-    print(f"Number of {framework} samples: {graph_net_count}")
-    for name, number in graph_net_dict.items():
-        print(f"- {name:24}: {number}")
-    print()