Skip to content

Commit 9c02d0a

Browse files
roll-away and lixinqi authored
get fusible subgraph test (#431)
* 1119 * 1120 * 1120.2 * model_path * remove unnecessary files and pre-committed * remove unnecessary files and pre-committed * 1121 remove unnecessary files * modify rev version * modify rev version * modify rev version * accuracy issues targeted * test script and modify feature * return set[str] * add logfile for test * filter can get the number of kernels in naive_graph_decomposer * post extract process feature * remove unnecessary code blocks and variables * modify the way of counting kernels used * modify the way of counting kernels used * modify script, rename files and variables * add failure protection and log output when removing directories * add a script to check fusability of a given model * add a script to check if a given model is fully fusable * add a script to check if a given model is fully fusable * a script to check if a given model is fully fusable * add a script to check if a given model is fully fusionable * add a script to find fully fusionable subgraph * find the biggest fully fusionable subgraph * get fusible subgraph test * modify get fully fusible subgraph * improve fully_fusible_subgraph_extractor.py efficiency * backup code * Improve efficiency of test/fully_fusible_subgraph_extractor_test.sh * fully_fusible_subgraph_extractor test * delete empty file * modify input file path --------- Co-authored-by: Xinqi Li <[email protected]>
1 parent de2ba85 commit 9c02d0a

15 files changed

+535
-199
lines changed
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
samples/timm/crossvit_small_240.in1k
2+
samples/timm/poolformerv2_s12.sail_in1k
3+
samples/timm/regnety_080.pycls_in1k
4+
samples/timm/dla46x_c.in1k
5+
samples/timm/mobilenetv1_100.ra4_e3600_r224_in1k
6+
samples/timm/efficientnetv2_rw_s.ra2_in1k
7+
samples/timm/vit_base_patch16_rope_ape_224.naver_in1k
8+
samples/timm/fastvit_t8.apple_dist_in1k
9+
samples/timm/test_byobnet.r160_in1k
10+
samples/timm/mambaout_base.in1k
11+
samples/timm/davit_small
12+
samples/timm/resnet61q.ra2_in1k
13+
samples/timm/coat_tiny
14+
samples/timm/regnetx_004.pycls_in1k
15+
samples/timm/convnextv2_large.fcmae
16+
samples/timm/regnety_640.seer
17+
samples/timm/repvit_m1_1.dist_300e_in1k
18+
samples/timm/tinynet_d.in1k
19+
samples/timm/resnetrs270.tf_in1k
20+
samples/timm/cait_m48_448
21+
samples/timm/legacy_seresnet50.in1k
22+
samples/timm/tinynet_a.in1k
23+
samples/timm/convnext_small.fb_in1k
24+
samples/timm/vit_huge_patch14_clip_quickgelu_224.dfn5b
25+
samples/timm/dpn131.mx_in1k
26+
samples/timm/convnextv2_large.fcmae_ft_in1k
27+
samples/timm/convnextv2_small
28+
samples/timm/repvit_m1.dist_in1k
29+
samples/timm/cs3darknet_s
30+
samples/timm/resnet50d.a1_in1k
31+
samples/timm/dm_nfnet_f6
32+
samples/timm/coatnet_1_rw_224
33+
samples/timm/lcnet_050.ra2_in1k
34+
samples/timm/efficientnet_em.ra2_in1k
35+
samples/timm/dpn48b
36+
samples/timm/semnasnet_075.rmsp_in1k
37+
samples/timm/skresnet34.ra_in1k
38+
samples/timm/crossvit_15_dagger_240.in1k
39+
samples/timm/mnasnet_100.rmsp_in1k
40+
samples/timm/mobilenetv3_rw.rmsp_in1k
41+
samples/timm/xception65p.ra3_in1k
42+
samples/timm/coatnet_0_rw_224
43+
samples/timm/eca_nfnet_l3
44+
samples/timm/deit3_base_patch16_224.fb_in1k
45+
samples/timm/mambaout_base_short_rw.sw_e500_in1k
46+
samples/timm/mobilenetv4_conv_small.e1200_r224_in1k
47+
samples/timm/xception71.tf_in1k
48+
samples/timm/dla60.in1k
49+
samples/timm/repghostnet_130.in1k
50+
samples/timm/mambaout_base_plus_rw.sw_e150_in12k
51+
samples/timm/poolformerv2_s36.sail_in1k
52+
samples/timm/deit3_huge_patch14_224.fb_in1k
53+
samples/timm/vit_base_patch32_clip_224.datacompxl
54+
samples/timm/poolformer_m48.sail_in1k
55+
samples/timm/regnety_006.pycls_in1k
56+
samples/timm/starnet_s4.in1k
57+
samples/timm/poolformer_m36.sail_in1k
58+
samples/timm/vit_huge_patch14_gap_224.in1k_ijepa
59+
samples/timm/efficientnet_b3.ra2_in1k
60+
samples/timm/mobilenetv3_large_150d.ra4_e3600_r256_in1k
61+
samples/timm/hgnetv2_b0.ssld_stage1_in22k_in1k
62+
samples/timm/convnextv2_huge.fcmae
63+
samples/timm/davit_huge
64+
samples/timm/regnetx_004_tv.tv2_in1k
65+
samples/timm/dla34.in1k
66+
samples/timm/convnext_xlarge.fb_in22k
67+
samples/timm/resmlp_12_224.fb_dino
68+
samples/timm/fasternet_t1.in1k
69+
samples/timm/resnetblur50.bt_in1k
70+
samples/timm/res2net50d.in1k
71+
samples/timm/vit_base_patch32_224.augreg_in1k
72+
samples/timm/mambaout_base_wide_rw.sw_e500_in1k
73+
samples/timm/vgg19_bn.tv_in1k
74+
samples/timm/vit_small_patch16_rope_ape_224.naver_in1k
75+
samples/timm/hardcorenas_b.miil_green_in1k
76+
samples/timm/vgg16.tv_in1k
77+
samples/timm/xception41p.ra3_in1k
78+
samples/timm/efficientnet_lite0.ra_in1k
79+
samples/timm/regnetv_064.ra3_in1k
80+
samples/timm/regnety_320.pycls_in1k
81+
samples/timm/convnext_pico.d1_in1k
82+
samples/timm/repvit_m1_0.dist_300e_in1k
83+
samples/timm/resnet50c.gluon_in1k
84+
samples/timm/mobileone_s4.apple_in1k
85+
samples/timm/ghostnet_100.in1k
86+
samples/timm/deit_base_distilled_patch16_384
87+
samples/timm/dpn68b.mx_in1k
88+
samples/timm/dla60_res2next
89+
samples/timm/resnet101d.gluon_in1k
90+
samples/timm/eva02_large_patch14_clip_224.merged2b
91+
samples/timm/fasternet_m.in1k
92+
samples/timm/mobilenetv2_110d.ra_in1k
93+
samples/timm/regnetx_064.pycls_in1k
94+
samples/timm/cspresnet50.ra_in1k
95+
samples/timm/resmlp_24_224.fb_dino
96+
samples/timm/mobileone_s3.apple_in1k
97+
samples/timm/mobileone_s2.apple_in1k
98+
samples/timm/res2net101d
99+
samples/timm/hardcorenas_f.miil_green_in1k
100+
samples/timm/hrnet_w18_ssld.paddle_in1k
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
#samples/timm/crossvit_small_240.in1k
2+
#samples/timm/poolformerv2_s12.sail_in1k
3+
#samples/timm/regnety_080.pycls_in1k
4+
#samples/timm/dla46x_c.in1k
5+
#samples/timm/mobilenetv1_100.ra4_e3600_r224_in1k
6+
samples/timm/efficientnetv2_rw_s.ra2_in1k
7+
#samples/timm/vit_base_patch16_rope_ape_224.naver_in1k
8+
#samples/timm/fastvit_t8.apple_dist_in1k
9+
#samples/timm/test_byobnet.r160_in1k
10+
#samples/timm/mambaout_base.in1k
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#!/bin/bash
# Run the fully-fusible-subgraph extractor over a list of sample models.

# Locate the installed graph_net package so paths resolve from any cwd.
GRAPH_NET_ROOT=$(python3 -c "import graph_net; import os; print(
os.path.dirname(graph_net.__file__))")

# Input model path (used only by the commented single-model command below).
MODEL_NAME=resnet18
MODEL_PATH_IN_SAMPLES=/timm/$MODEL_NAME
# Full sample list:
# INPUT_MODEL_LIST=$GRAPH_NET_ROOT/test/dev_model_list/get_fusible_subgraph_sample_list.txt
INPUT_MODEL_LIST=$GRAPH_NET_ROOT/test/dev_model_list/small_sample_list_for_get_fusible_subgraph.txt

OUTPUT_DIR="/tmp/find_fully_fusible_output"

# Handler configuration, passed base64-encoded on the command line.
config_json_str=$(cat <<EOF
{
  "handler_path": "$GRAPH_NET_ROOT/torch/fully_fusible_subgraph_extractor.py",
  "handler_class_name":"FullyFusibleSubgraphExtractor",
  "handler_config": {
    "resume": false,
    "model_path_prefix": "$GRAPH_NET_ROOT/../",
    "output_dir": "$OUTPUT_DIR",
    "nn_module_fully_fusible_decorator_path": "$GRAPH_NET_ROOT/torch/count_kernels_util.py",
    "nn_module_fully_fusible_decorator_class_name": "TorchSubModuleFullyFusibleDecorator",
    "max_step": 3,
    "min_step": 2,
    "max_nodes": 4
  }
}
EOF
)
# Quote the JSON so its whitespace survives word splitting and no
# glob expansion can occur before it reaches base64.
CONFIG=$(echo "$config_json_str" | base64 -w 0)

# Single-model variant:
# python3 -m graph_net.model_path_handler --model-path "$GRAPH_NET_ROOT/../samples/$MODEL_PATH_IN_SAMPLES" --handler-config="$CONFIG"
python3 -m graph_net.model_path_handler --model-path-list "$INPUT_MODEL_LIST" --handler-config="$CONFIG"
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#!/bin/bash
# Check whether a single sample model is fully fusionable.

# Locate the installed graph_net package so paths resolve from any cwd.
GRAPH_NET_ROOT=$(python3 -c "import graph_net; import os; print(
os.path.dirname(graph_net.__file__))")

# Input model path under the samples/ directory.
MODEL_NAME=resnet18d.ra2_in1k
MODEL_PATH_IN_SAMPLES=/timm/$MODEL_NAME

# Checker configuration, passed base64-encoded on the command line.
checker_config_json_str=$(cat <<EOF
{
  "post_extract_process_config": {
    "post_extract_process_path":"$GRAPH_NET_ROOT/torch/count_kernels_util.py",
    "post_extract_process_class_name": "GraphFullyFusionable"
  }
}
EOF
)
# Quote the JSON so its whitespace survives word splitting and no
# glob expansion can occur before it reaches base64.
CHECKER_CONFIG=$(echo "$checker_config_json_str" | base64 -w 0)

python3 -m graph_net.torch.check_if_a_given_model_is_fully_fusionable --model-path "$GRAPH_NET_ROOT/../samples/$MODEL_PATH_IN_SAMPLES" --checker-config="$CHECKER_CONFIG"

graph_net/test/naive_decomposer_and_post_extract_process_test.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ decorator_config_json_str=$(cat <<EOF
1919
"group_head_and_tail": true,
2020
"filter_path":"$GRAPH_NET_ROOT/torch/naive_subgraph_filter.py",
2121
"filter_config": {},
22-
"post_extract_process_path":"$GRAPH_NET_ROOT/torch/post_extract_process_count_kernels.py",
22+
"post_extract_process_path":"$GRAPH_NET_ROOT/torch/count_kernels_util.py",
2323
"post_extract_process_class_name": "GraphFullyFusible"
2424
}
2525
}

graph_net/test/torch_extractor_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def forward(self, x):
7676
start_node_idx=0,
7777
end_node_idx=2,
7878
submodule_hook=submodule_hook,
79-
# group_head_and_tail=False,
79+
group_head_and_tail=True,
8080
)
8181
folded_output = folded(inp)
8282

graph_net/torch/constraint_util.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
import sys
22
import os
33
import graph_net
4+
import logging
5+
6+
logger = logging.getLogger(__name__)
47

58

69
class NaiveDataInputPredicator:
Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
import traceback
2+
from graph_net.torch import utils
3+
import importlib.util
4+
import torch
5+
import sys
6+
from typing import Type
7+
from torch.profiler import profile, record_function, ProfilerActivity
8+
9+
from graph_net.torch.graph_fusibility_status import (
10+
GraphFusibilityStatus,
11+
GraphFusibility,
12+
)
13+
14+
15+
class TorchSubModuleFullyFusibleDecorator:
    """Factory hook that wraps a submodule in a fusibility-probing module.

    Calling an instance with a submodule returns a
    ``TorchNNModuleFullyFusiblePredicator`` around it; the submodule index
    is accepted to satisfy the hook signature but is not used.
    """

    def __init__(self, config):
        # Kept for hook-interface compatibility; not consulted when wrapping.
        self.config = config

    def __call__(self, module, sub_module_idx):
        """Return a fusibility predicator wrapping *module*."""
        return TorchNNModuleFullyFusiblePredicator(module)
21+
22+
23+
class TorchNNModuleFullyFusiblePredicator(torch.nn.Module):
24+
def __init__(self, module):
25+
super().__init__()
26+
self.module = module
27+
28+
def forward(self, *inputs):
29+
try:
30+
compiled_model = torch.compile(self.module)
31+
except Exception:
32+
raise GraphFusibilityStatus(GraphFusibility.kNotFullyFusible)
33+
ret_tensors, compiled_num_of_kernels = count_kernels(compiled_model, inputs)
34+
if compiled_num_of_kernels == 1:
35+
raise GraphFusibilityStatus(GraphFusibility.kFullyFusible)
36+
else:
37+
raise GraphFusibilityStatus(GraphFusibility.kNotFullyFusible)
38+
return ret_tensors
39+
40+
41+
class ThrowExitStatusIfGraphFullyFusible:
    """Load a saved graph model and report its fusibility via exceptions.

    ``__call__`` always raises ``GraphFusibilityStatus``: ``kFullyFusible``
    when the compiled model executes as a single kernel launch, and
    ``kNotFullyFusible`` on any failure (missing path, eager run error,
    compile error, or more than one kernel launch).
    """

    def __init__(self, config):
        self.config = config

    def __call__(self, model_path=None):
        # Clear cached compilation state so kernel counts are not polluted
        # by earlier torch.compile runs in this process.
        torch._dynamo.reset()
        if model_path is None:
            raise GraphFusibilityStatus(GraphFusibility.kNotFullyFusible)
        # Load the serialized GraphModule class from <model_path>/model.py.
        model_class = load_class_from_file(
            f"{model_path}/model.py", class_name="GraphModule"
        )
        assert model_class is not None
        model = model_class()

        # Build dummy weights/inputs from the recorded tensor metadata.
        inputs_params = utils.load_converted_from_text(f"{model_path}")
        params = inputs_params["weight_info"]
        state_dict = {k: utils.get_dummy_tensor(v) for k, v in params.items()}

        # The model must run eagerly before fusibility can be judged.
        try:
            model(**state_dict)
        except Exception:
            raise GraphFusibilityStatus(GraphFusibility.kNotFullyFusible)
        # It must also be compilable.
        try:
            compiled_model = torch.compile(model)
        except Exception:
            raise GraphFusibilityStatus(GraphFusibility.kNotFullyFusible)
        _, compiled_num_of_kernels = count_kernels(compiled_model, state_dict)
        # A single launch means the whole graph fused into one kernel.
        if compiled_num_of_kernels == 1:
            raise GraphFusibilityStatus(GraphFusibility.kFullyFusible)
        else:
            raise GraphFusibilityStatus(GraphFusibility.kNotFullyFusible)
79+
80+
81+
class GraphFullyFusible:
    """Post-extract processor that turns a fusibility verdict into an exit code.

    Exit status 0 means the graph at *model_path* is fully fusible; 1 means
    it is not, or that checking it failed with an unexpected error.
    """

    def __init__(self, config):
        self.predicator = ThrowExitStatusIfGraphFullyFusible(config)

    def __call__(self, model_path=None):
        try:
            self.predicator(model_path)
        except GraphFusibilityStatus as status:
            verdict = status.graph_fusibility
            if verdict == GraphFusibility.kFullyFusible:
                exit_code = 0
            elif verdict == GraphFusibility.kNotFullyFusible:
                exit_code = 1
            else:
                # Unknown verdict values are a programming error, not a result.
                raise NotImplementedError(f"{status.graph_fusibility=}")
            sys.exit(exit_code)
        except Exception:
            # Unexpected failure: surface the traceback, report "not fusible".
            traceback.print_exc()
            sys.exit(1)
98+
99+
100+
def load_class_from_file(file_path: str, class_name: str) -> Type[torch.nn.Module]:
    """Execute *file_path* as an anonymous module and fetch *class_name*.

    Returns the class object, or ``None`` when the executed module defines
    no attribute with that name. The module is executed once and is not
    registered in ``sys.modules``.
    """
    module_spec = importlib.util.spec_from_file_location("unnamed", file_path)
    loaded_module = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(loaded_module)
    return getattr(loaded_module, class_name, None)
106+
107+
108+
def count_kernels(model, sample_inputs) -> tuple:
    """Run one inference under the profiler and count kernel launches.

    Args:
        model: A callable torch model (eager or ``torch.compile``-d).
        sample_inputs: Forward inputs — a dict of keyword arguments or a
            list/tuple of positional arguments.

    Returns:
        tuple: ``(forward outputs, launch count)``. The count sums the
        ``cuLaunchKernel`` (driver API) and ``cudaLaunchKernel`` (runtime
        API) profiler events; without CUDA activity it is 0.

    Raises:
        NotImplementedError: If *sample_inputs* is neither a dict nor a
            list/tuple.

    Behavior:
        - Runs the model once inside a PyTorch profiler context.
        - Sums the counts of the kernel-launch events, which correspond to
          the number of CUDA kernel launches.
    """
    model.eval()
    # Use the PyTorch profiler to observe the single inference pass.
    with profile(
        activities=[ProfilerActivity.CUDA, ProfilerActivity.CPU],
        record_shapes=True,
    ) as prof:
        with record_function("model_inference"):
            if isinstance(sample_inputs, dict):
                ret_tensors = model(**sample_inputs)
            elif isinstance(sample_inputs, (list, tuple)):
                ret_tensors = model(*sample_inputs)
            else:
                raise NotImplementedError(f"{type(sample_inputs)=}")

    # Kernel launches appear as driver-level (cuLaunchKernel) or
    # runtime-level (cudaLaunchKernel) API events; count both.
    total_count = sum(
        e.count
        for e in prof.key_averages()
        if e.key in ("cuLaunchKernel", "cudaLaunchKernel")
    )
    return ret_tensors, total_count

graph_net/torch/decompose_util.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ def fold_range_to_submodule(
181181
end_node_idx: int,
182182
submodule_hook=None,
183183
submodule_name="extracted_submodule",
184-
group_head_and_tail=True,
184+
group_head_and_tail=False,
185185
):
186186
return convert_to_submodules_graph(
187187
gm,
@@ -249,7 +249,9 @@ def get_args_node(arg):
249249
yield arg.stop
250250
yield arg.step
251251
else:
252-
assert isinstance(arg, (int, bool, float, str, type(None))), f"{type(arg)=}"
252+
assert isinstance(
253+
arg, (int, bool, float, str, type(...), type(None))
254+
), f"{type(arg)=}"
253255

254256
def get_args_node_and_self_node(node):
255257
for arg in node.args:

0 commit comments

Comments
 (0)