
Commit 77d819c

Align usage of RECORD_FUNCTION for xetla (#2371)
Closes #2373

Motivation: per the note in `record_function.h` (`NOTE: passing the inputs incurs an additional overhead`), for a fairer comparison we should not pass the inputs to `RECORD_FUNCTION`: https://github.com/Stonepia/pytorch/blob/4de58719fbb5c681305622bd0e22997c9ece52b0/aten/src/ATen/record_function.h#L110

```c++
/**
 * RecordFunctionCallback represents a pair of callbacks to be used with
 * RecordFunction, members:
 *   start, end - the callbacks to run when entering and exiting the scope;
 *     optionally, the start callback may return an ObserverContext which will
 *     be passed to the end callback, use appropriate constructor accordingly.
 *   needs_inputs - whether the callbacks need the inputs passed from the
 *     observed function/range; NOTE: passing the inputs incurs an additional
 *     overhead;
 *   sampling_probability - if not 1.0, then the callback is probabilistically
 *     sampled to run; NOTE: start and end callbacks always run as a pair and
 *     are sampled together;
 *   scopes - types of scopes to execute the callbacks on (see RecordScope);
 *     passing empty set means the callbacks will be executed for all possible
 *     scope types;
 *   should_run - optional function that returns whether this callback should
 *     run; overwrites the effect of setting sampling_probability
 */
```

CI: https://github.com/intel/intel-xpu-backend-for-triton/actions/runs/11071744714/job/30765931643

Signed-off-by: Anatoly Myachev <[email protected]>
1 parent 427f3c4 commit 77d819c

File tree

1 file changed: +4 −5 lines


benchmarks/xetla_kernel/python_main.cpp

Lines changed: 4 additions & 5 deletions
```diff
@@ -42,7 +42,7 @@ at::Tensor softmax(const at::Tensor &input, const at::Tensor &output,
   CHECK_INPUT(input);
   CHECK_INPUT(output);
 #ifdef USE_IPEX
-  RECORD_FUNCTION("xetla softmax", {input});
+  RECORD_FUNCTION("xetla softmax", {});
 #endif
 
   auto queue = get_current_sycl_queue();
@@ -62,7 +62,7 @@ at::Tensor bf16_gemm(const at::Tensor &a, const at::Tensor &b,
   CHECK_INPUT(c);
   CHECK_INPUT(acc);
 #ifdef USE_IPEX
-  RECORD_FUNCTION("xetla gemm", {a, b, c, acc});
+  RECORD_FUNCTION("xetla gemm", {});
 #endif
 
   auto queue = get_current_sycl_queue();
@@ -82,7 +82,7 @@ at::Tensor bf16_stream_k_gemm(const at::Tensor &a, const at::Tensor &b,
   CHECK_INPUT(c);
   CHECK_INPUT(acc);
 #ifdef USE_IPEX
-  RECORD_FUNCTION("xetla stream_k_gemm", {a, b, c, acc});
+  RECORD_FUNCTION("xetla stream_k_gemm", {});
 #endif
 
   auto queue = get_current_sycl_queue();
@@ -119,8 +119,7 @@ void flash_attn(const at::Tensor &q, const at::Tensor &k, const at::Tensor &v,
   CHECK_INPUT(m);
   CHECK_INPUT(l);
 #ifdef USE_IPEX
-  RECORD_FUNCTION("xetla fa",
-                  {num_batches, num_heads, head_size, num_queries, num_keys});
+  RECORD_FUNCTION("xetla fa", {});
 #endif
 
   auto queue = get_current_sycl_queue();
```

0 commit comments
