
Commit b5cde72

tianleiwu authored and guschmue committed
Dynamo export and improve benchmark script for SAM2 encoder (#23887)
### Description

* Add dynamo export for the SAM2 image encoder.
* Verify the fp32 onnx model with the CPU EP (to avoid error messages from the TRT EP).
* Update the benchmark script:
  - output ORT profiling
  - output torch compiled code and a unique kernel name for each compiled kernel
  - add an option for nightly package installation
  - uninstall existing ort packages before installing

The node metadata of the dynamo-exported model can help map nodes in the onnx model back to the pytorch modeling script. Graph optimization is not yet done on the dynamo-exported model, so the export is experimental for now.

### Motivation and Context

To support profiling of torch-compiled CUDA kernels.
1 parent 19716b1 commit b5cde72

File tree

8 files changed: +315 −135 lines

onnxruntime/python/tools/transformers/models/sam2/README.md

Lines changed: 21 additions & 10 deletions
````diff
@@ -96,8 +96,7 @@ We can create a conda environment then run GPU benchmark like the following:
 conda create -n sam2_gpu python=3.11 -y
 conda activate sam2_gpu
 install_dir=$HOME
-profiling=true
-bash benchmark_sam2.sh $install_dir gpu $profiling
+bash benchmark_sam2.sh $install_dir gpu
 ```
 
 or create a new conda environment for CPU benchmark:
````
````diff
@@ -107,16 +106,28 @@ conda activate sam2_cpu
 bash benchmark_sam2.sh $HOME cpu
 ```
 
-The first parameter is a directory to clone git repositories or install CUDA/cuDNN for benchmark.
-The second parameter can be either "gpu" or "cpu", which indicates the device to run benchmark.
-The third parameter is optional. Value "true" will enable profiling after running benchmarking on GPU.
+The usage of the script is as follows:
+```
+bash benchmark_sam2.sh <install_dir> <cpu_or_gpu> [profiling] [benchmarking] [nightly] [dynamo]
+```
+
+| Parameter | Default | Description |
+|-----------|---------|-------------|
+| install_dir | $HOME | a directory to clone git repositories or install CUDA/cuDNN for benchmark |
+| cpu_or_gpu | gpu | the device to run benchmark; the value can be either "gpu" or "cpu" |
+| profiling | false | whether to run GPU profiling |
+| benchmarking | true | whether to run benchmark |
+| nightly | false | install the onnxruntime nightly package instead of the official release |
+| dynamo | false | whether to export the image encoder using dynamo |
 
-The script will automatically install required packages in current conda environment, download checkpoints, export onnx,
-and run demo, benchmark and optionally run profiling.
+The dynamo export is experimental since graph optimization still needs extra work for this model.
 
-* The performance test result is in sam2_gpu.csv or sam2_cpu.csv, which can be loaded into Excel.
-* The demo output is sam2_demo_fp16_gpu.png or sam2_demo_fp32_cpu.png.
-* The profiling results are in *.nsys-rep or *.json files in current directory. Use Nvidia NSight System to view the *.nsys-rep file.
+Output files:
+* sam2_cpu_[timestamp].csv or sam2_gpu_[timestamp].csv has benchmark results. Use Excel to load the file to view it.
+* onnxruntime_image_[encoder|decoder].json has ONNX Runtime profiling results. Use `chrome://tracing` in Chrome browser to view it.
+* torch_image_[encoder|decoder].json has PyTorch profiling results. Use `chrome://tracing` in Chrome browser to view it.
+* sam2_fp16_profile_image_[encoder|decoder]_[ort|torch]_gpu.[nsys-rep|sqlite] has NVTX profiling. Use Nvidia NSight System to view it.
+* torch_image_encoder_compiled_code.txt has the compiled kernel code from PyTorch.
 
 ## Limitations
 - The exported image_decoder model does not support batch mode for now.
````
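The six positional parameters in the usage line above rely on bash default expansion. A minimal sketch of that defaulting pattern (variable names mirror the README table; this is an illustration, not the actual `benchmark_sam2.sh`):

```shell
# Sketch of positional-parameter defaults matching the README table.
# Run with no arguments, every parameter falls back to its documented default.
install_dir="${1:-$HOME}"
cpu_or_gpu="${2:-gpu}"
profiling="${3:-false}"
benchmarking="${4:-true}"
nightly="${5:-false}"
dynamo="${6:-false}"
echo "device=$cpu_or_gpu profiling=$profiling benchmarking=$benchmarking nightly=$nightly dynamo=$dynamo"
```

So `bash benchmark_sam2.sh $HOME gpu true` would enable profiling while keeping the remaining parameters at their defaults.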

onnxruntime/python/tools/transformers/models/sam2/benchmark_sam2.py

Lines changed: 14 additions & 1 deletion
```diff
@@ -46,6 +46,7 @@ def __init__(
         prefer_nhwc: bool = False,
         warm_up: int = 5,
         enable_nvtx_profile: bool = False,
+        enable_ort_profile: bool = False,
         enable_torch_profile: bool = False,
         repeats: int = 1000,
         verbose: bool = False,
```
```diff
@@ -74,6 +75,7 @@ def __init__(
         self.prefer_nhwc = prefer_nhwc
         self.warm_up = warm_up
         self.enable_nvtx_profile = enable_nvtx_profile
+        self.enable_ort_profile = enable_ort_profile
         self.enable_torch_profile = enable_torch_profile
         self.repeats = repeats
         self.verbose = verbose
```
```diff
@@ -317,6 +319,7 @@ def run_test(
         repeats=args.repeats,
         warm_up=args.warm_up,
         enable_nvtx_profile=args.enable_nvtx_profile,
+        enable_ort_profile=args.enable_ort_profile,
         enable_torch_profile=args.enable_torch_profile,
         torch_compile_mode=args.torch_compile_mode,
         verbose=False,
```
@@ -325,7 +328,7 @@ def run_test(
325328
if args.engine == "ort":
326329
sess_options = SessionOptions()
327330
sess_options.intra_op_num_threads = args.intra_op_num_threads
328-
if config.enable_nvtx_profile:
331+
if config.enable_ort_profile:
329332
sess_options.enable_profiling = True
330333
sess_options.log_severity_level = 4
331334
sess_options.log_verbosity_level = 0
@@ -349,6 +352,8 @@ def run_test(
349352
with nvtx.annotate("one_run"):
350353
_ = session.infer(input_dict)
351354
cudart.cudaProfilerStop()
355+
356+
if config.enable_ort_profile:
352357
session.ort_session.end_profiling()
353358

354359
if repeats == 0:
@@ -554,6 +559,14 @@ def _parse_arguments():
554559
help="Enable nvtx profiling. It will add an extra run for profiling before performance test.",
555560
)
556561

562+
parser.add_argument(
563+
"--enable_ort_profile",
564+
required=False,
565+
default=False,
566+
action="store_true",
567+
help="Enable ORT profiling.",
568+
)
569+
557570
parser.add_argument(
558571
"--enable_torch_profile",
559572
required=False,
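The new flag uses the standard argparse `store_true` pattern. A small standalone sketch of how such a flag behaves (the flag name is copied from the diff; the bare parser is illustrative):

```python
import argparse

# Minimal illustration of the store_true pattern used for --enable_ort_profile:
# the option is False by default and becomes True when the flag is present.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--enable_ort_profile",
    required=False,
    default=False,
    action="store_true",
    help="Enable ORT profiling.",
)

off = parser.parse_args([])
on = parser.parse_args(["--enable_ort_profile"])
print(off.enable_ort_profile, on.enable_ort_profile)  # False True
```

Note that with `action="store_true"`, the `default=False` is redundant but harmless; argparse already defaults such flags to False.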
