@@ -525,7 +525,7 @@ index e78cf68244ee..79cb9d102bdd 100644
if __name__ == "__main__":
from torch._inductor.test_case import run_tests
diff --git a/test/inductor/test_flex_decoding.py b/test/inductor/test_flex_decoding.py
- index 5af7b284f757..667915c69507 100644
+ index b5ec59dc291c..777892a0ce2d 100644
--- a/test/inductor/test_flex_decoding.py
+++ b/test/inductor/test_flex_decoding.py
@@ -27,6 +27,7 @@
@@ -598,7 +598,7 @@ index 5af7b284f757..667915c69507 100644
requires_grad=True,
)
q, k, v, backward_grad = make_q(), make_kv(), make_kv(), make_q()
- @@ -998,12 +1007,12 @@ def mask_mod(b, h, q, kv):
+ @@ -999,12 +1008,12 @@ def mask_mod(b, h, q, kv):

@supported_platform
@unittest.skipIf(SKIP_UT_ON_CPU, "Skip on CPU as not supported")
@@ -615,7 +615,7 @@ index 5af7b284f757..667915c69507 100644

def score_mod(score, b, h, q, kv):
return score + offset_kv[kv] + offset_q[q]
- @@ -1011,8 +1020,14 @@ def score_mod(score, b, h, q, kv):
+ @@ -1012,8 +1021,14 @@ def score_mod(score, b, h, q, kv):
def mask_mod(b, h, q, kv):
return kv >= q + offset_tensor

@@ -632,7 +632,7 @@ index 5af7b284f757..667915c69507 100644

@supported_platform
@common_utils.parametrize("dtype", test_dtypes_fast)
- @@ -1677,19 +1692,19 @@ def mask_mod(b, h, q, kv):
+ @@ -1679,19 +1694,19 @@ def mask_mod(b, h, q, kv):
@unittest.skipIf(SKIP_UT_ON_CPU, "Skip on CPU as not supported")
@common_utils.parametrize("dtype", test_dtypes)
@common_utils.parametrize("score_mod", [_identity, _causal])
@@ -655,7 +655,7 @@ index 5af7b284f757..667915c69507 100644
requires_grad=True,
)
q, k, v = make_q(), make_kv(), make_kv()
- @@ -1729,19 +1744,19 @@ def eager_sdpa_hop(q, k, v, score_mod):
+ @@ -1731,19 +1746,19 @@ def eager_sdpa_hop(q, k, v, score_mod):

@supported_platform
@unittest.skipIf(SKIP_UT_ON_CPU, "Skip on CPU as not supported")
@@ -678,7 +678,7 @@ index 5af7b284f757..667915c69507 100644
requires_grad=True,
)

- @@ -1993,7 +2008,9 @@ def causal_mask(b, h, q, kv):
+ @@ -1995,7 +2010,9 @@ def causal_mask(b, h, q, kv):
self._check_equal(golden_outs, ref_outs, paged_out, fudge_factor, "Out")

@@ -689,11 +689,18 @@ index 5af7b284f757..667915c69507 100644

if __name__ == "__main__":
from torch._inductor.test_case import run_tests
- diff --git a/torch/_inductor/kernel/flex_attention.py b/torch/_inductor/kernel/flex_attention.py
- index e471332afe71..ced92fae6229 100644
- --- a/torch/_inductor/kernel/flex_attention.py
- +++ b/torch/_inductor/kernel/flex_attention.py
- @@ -1445,7 +1445,9 @@ def flex_attention(
+ diff --git a/third_party/xpu.txt b/third_party/xpu.txt
+ index f3cfe7166aa7..d13f6ae35d03 100644
+ --- a/third_party/xpu.txt
+ +++ b/third_party/xpu.txt
+ @@ -1 +1 @@
+ - 3a9419c8bb6a98dd3e3cd473c36691fb4abeae40
+ + 3f07dd52aac2e466c3c3efc15f88118f21428272
+ diff --git a/torch/_inductor/kernel/flex/flex_attention.py b/torch/_inductor/kernel/flex/flex_attention.py
+ index 0553fd06755d..d094a48627fb 100644
+ --- a/torch/_inductor/kernel/flex/flex_attention.py
+ +++ b/torch/_inductor/kernel/flex/flex_attention.py
+ @@ -531,7 +531,9 @@ def flex_attention(

dtype = query.get_dtype()
head_dim = V.graph.sizevars.guard_int(query.get_size()[-1])
@@ -704,7 +711,7 @@ index e471332afe71..ced92fae6229 100644

# Mark SPARSE_KV_BLOCK_SIZE & SPARSE_Q_BLOCK_SIZE as static shapes and add guards.
SPARSE_KV_BLOCK_SIZE = V.graph.sizevars.guard_int(SPARSE_KV_BLOCK_SIZE)
- @@ -2567,7 +2569,9 @@ def flex_attention_backward(*args, **kwargs):
+ @@ -1653,7 +1655,9 @@ def flex_attention_backward(*args, **kwargs):

dtype = query.get_dtype()
head_dim = V.graph.sizevars.guard_int(query.get_size()[-1])
@@ -715,11 +722,11 @@ index e471332afe71..ced92fae6229 100644

# Default config for warp specialization
num_consumer_groups, num_buffers_warp_spec = 0, 0
- diff --git a/torch/_inductor/kernel/flex_decoding.py b/torch/_inductor/kernel/flex_decoding.py
- index 7e0aef981856..628bfc6419be 100644
- --- a/torch/_inductor/kernel/flex_decoding.py
- +++ b/torch/_inductor/kernel/flex_decoding.py
- @@ -310,7 +310,10 @@ def flex_decoding_grid(batch_size, kv_heads, gqa_group_size, n_keys, d_model, me
+ diff --git a/torch/_inductor/kernel/flex/flex_decoding.py b/torch/_inductor/kernel/flex/flex_decoding.py
+ index 83c6b59cec96..e89981286ed8 100644
+ --- a/torch/_inductor/kernel/flex/flex_decoding.py
+ +++ b/torch/_inductor/kernel/flex/flex_decoding.py
+ @@ -354,7 +354,10 @@ def flex_decoding_grid(batch_size, kv_heads, gqa_group_size, n_keys, d_model, me


def get_split_k(B: int, H: int, Mk: int) -> int:
@@ -731,7 +738,7 @@ index 7e0aef981856..628bfc6419be 100644
bh = max(B * H, 1) # NOTE: Handle B*h=0 case
assert isinstance(bh, (int, sympy.Integer)), "B and H must be concrete integers"
split_k = num_SM // bh * 2 # Each SM should at least get one block.
- @@ -415,7 +418,9 @@ def create_flex_decoding_kernel(*args, **kwargs):
+ @@ -458,7 +461,9 @@ def create_flex_decoding_kernel(*args, **kwargs):
choices: list[Any] = []
dtype = key.get_dtype()
head_dim = V.graph.sizevars.guard_int(key.get_size()[-1])
@@ -742,24 +749,31 @@ index 7e0aef981856..628bfc6419be 100644

# TODO: fix autotuning.

- @@ -462,7 +467,7 @@ def create_flex_decoding_kernel(*args, **kwargs):
+ @@ -505,7 +510,7 @@ def create_flex_decoding_kernel(*args, **kwargs):
)
* gqa_shared_heads
),
- 16,
- + float('-inf') if torch.xpu.is_available() else 16,
+ + 1 if torch.xpu.is_available() else 16,
)
),
)
diff --git a/torch/_inductor/template_heuristics.py b/torch/_inductor/template_heuristics.py
- index 65a6851192a0..3a53f0eed52e 100644
+ index eec1d055ddf7..f7a5aefb5cd1 100644
--- a/torch/_inductor/template_heuristics.py
+++ b/torch/_inductor/template_heuristics.py
- @@ -1201,3 +1201,87 @@ class XPUConfigHeuristic(BaseConfigHeuristic):
- """
+ @@ -3,6 +3,7 @@
+ import dataclasses
+ import itertools
+ import math
+ + import os
+ from functools import partial
+ from threading import Lock
+ from typing import Any, Callable, Optional, TYPE_CHECKING
+ @@ -1203,6 +1204,97 @@ class XPUConfigHeuristic(BaseConfigHeuristic):
Placeholder child class for XPU specific overrides.
"""
- +
+
+ def __init__(self) -> None:
+ super().__init__()
+
@@ -804,6 +818,9 @@ index 65a6851192a0..3a53f0eed52e 100644
+
+ def get_flex_attn_bwd_configs(self, head_dim: int, dtype: Any) -> list[FlexConfig]:
+ flex_attn_bwd_configs: list[FlexConfig] = []
+ + TRITON_LESS_FLEX_ATTN_BWD_CONFIGS = os.getenv(
+ + "TRITON_LESS_FLEX_ATTN_BWD_CONFIGS", "0"
+ + ).lower() in {"true", "1", "t", "y", "yes", "on"}
+
+ if config.max_autotune:
+ if config.max_autotune_flex_search_space == "EXHAUSTIVE":
@@ -825,6 +842,10 @@ index 65a6851192a0..3a53f0eed52e 100644
+ if default_config not in flex_attn_bwd_configs:
+ flex_attn_bwd_configs.append(default_config)
+
+ + if TRITON_LESS_FLEX_ATTN_BWD_CONFIGS:
+ + flex_attn_bwd_configs = list(
+ + filter(lambda c: c.num_stages == 1, flex_attn_bwd_configs)
+ + )
+ return flex_attn_bwd_configs
+
+ def get_flex_decode_configs(
@@ -843,8 +864,12 @@ index 65a6851192a0..3a53f0eed52e 100644
+ flex_decode_configs.append(default_config)
+
+ return flex_decode_configs
+ +
+
+ class MTIAConfigHeuristic(BaseConfigHeuristic):
+ """
diff --git a/torch/_ops.py b/torch/_ops.py
- index fecfebaeaa53..8fac24a8579c 100644
+ index 83a5dc0e57a5..b351aa17dfa7 100644
--- a/torch/_ops.py
+++ b/torch/_ops.py
@@ -267,6 +267,7 @@ def resolve_key(op: OperatorBase, k: DispatchKey): # type: ignore[valid-type]