2 changes: 1 addition & 1 deletion .github/workflows/autogen.yml
@@ -84,7 +84,7 @@ jobs:
- name: Regenerate autogen files
run: |
rm tinygrad/runtime/autogen/metal.py tinygrad/runtime/autogen/iokit.py tinygrad/runtime/autogen/corefoundation.py
LIBCLANG_PATH=/opt/homebrew/opt/llvm@20/lib/libclang.dylib python3 -c "from tinygrad.runtime.autogen import metal, iokit, corefoundation"
python3 -c "from tinygrad.runtime.autogen import metal, iokit, corefoundation"
- name: Check for differences
run: |
if ! git diff --quiet; then
2 changes: 0 additions & 2 deletions .github/workflows/test.yml
@@ -792,8 +792,6 @@ jobs:
ocelot: 'true'
llvm: 'true'
- name: Run unit tests
env:
LIBCLANG_PATH: '/opt/homebrew/opt/llvm@20/lib/libclang.dylib'
run: METAL=1 python -m pytest -n=auto test/unit/ --durations=20
- name: Run ONNX
run: METAL=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
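Note on the two workflow changes above: the explicit LIBCLANG_PATH override is no longer needed because the macOS dylib path is now baked into the generated bindings (see the tinygrad/runtime/autogen changes later in this diff). A minimal sketch of the selection the regenerated code performs, with the values copied from this PR:

```python
# sketch: the path selection now generated into the bindings (OSX here
# stands in for tinygrad.helpers.OSX; True assumes a macOS runner)
OSX = True
libclang = '/opt/homebrew/opt/llvm@20/lib/libclang.dylib' if OSX else ['clang-20', 'clang']
print(libclang)
```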
@@ -0,0 +1,32 @@
#!/usr/bin/env bash

export PYTHONPATH="."
export DEV=${DEV:-AMD}
export IGNORE_OOB=1
export REWRITE_STACK_LIMIT=5000000 HCQDEV_WAIT_TIMEOUT_MS=240000

export DEBUG=${DEBUG:-2}
export FLASH_ATTENTION=${FLASH_ATTENTION:-1}
export ALL2ALL=${ALL2ALL:-1}

export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
export DP=8 BS=8 EVAL_BS=8 GRADIENT_ACC_STEPS=1
export GBS=$((BS * GRADIENT_ACC_STEPS))

export MODEL="llama3"
export BASEDIR="/raid/datasets/c4-8b/"
export SMALL=1
export LLAMA3_SIZE=${LLAMA3_SIZE:-"8B"}
export EVAL_TARGET=3.3 EVAL_FREQ=12288
export LR="4e-4" END_LR="4e-5" WARMUP_SAMPLES=256 MAX_STEPS=1200000
export WARMUP_STEPS=$((WARMUP_SAMPLES / GBS))
export SAMPLES=$((MAX_STEPS * GBS))

export SEED=5760

export JITBEAM=3
export BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5

export FAKEDATA=1 BENCHMARK=10 LLAMA_LAYERS=2

python3 examples/mlperf/model_train.py
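For reference, the derived quantities in the script work out as follows under its defaults (a Python sketch mirroring the shell arithmetic):

```python
# worked arithmetic for the derived env vars, using the script's defaults
BS, GRADIENT_ACC_STEPS = 8, 1
GBS = BS * GRADIENT_ACC_STEPS             # global batch size: 8
WARMUP_SAMPLES, MAX_STEPS = 256, 1_200_000
WARMUP_STEPS = WARMUP_SAMPLES // GBS      # 256 // 8 = 32 warmup steps
SAMPLES = MAX_STEPS * GBS                 # 9,600,000 samples total
print(GBS, WARMUP_STEPS, SAMPLES)
```

The final FAKEDATA=1 BENCHMARK=10 LLAMA_LAYERS=2 line suggests this is a short benchmarking configuration on a truncated model rather than a full training run.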
10 changes: 8 additions & 2 deletions test/test_dtype.py
@@ -18,7 +18,7 @@
settings.load_profile("my_profile")

def get_available_cast_dtypes(dtype: DType) -> List[DType]:
if not is_dtype_supported(dtype): return []
if not is_dtype_supported(dtype) and dtype not in (dtypes.long, dtypes.ulong): return []
# don't cast internal dtypes
return [v for k, v in DTYPES_DICT.items() if v != dtype and is_dtype_supported(v) and not k.startswith("_")]

@@ -333,8 +333,14 @@ def test_uint16_to_int8_overflow(self):
class TestInt32DType(TestDType): DTYPE = dtypes.int32
class TestUint32DType(TestDType): DTYPE = dtypes.uint32

class TestInt64DType(TestDType): DTYPE = dtypes.int64
class TestInt64DType(TestDType):
DTYPE = dtypes.int64
@classmethod
def setUpClass(cls): cls.DATA = rand_for_dtype(cls.DTYPE, 10)

class TestUint64DType(TestDType):
@classmethod
def setUpClass(cls): cls.DATA = rand_for_dtype(cls.DTYPE, 10)
DTYPE = dtypes.uint64
def test_uint64_load(self):
assert Tensor(2**64 - 1, dtype=dtypes.uint64).numpy() == 2**64 - 1
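The setUpClass overrides above regenerate the shared test data with rand_for_dtype for the 64-bit classes. A plausible motivation (an assumption on my part): data that round-trips through float64 cannot represent the full 64-bit integer range, which is exactly the boundary test_uint64_load checks:

```python
import numpy as np
# 2**64 - 1 is the max uint64 value; float64 cannot hold it exactly
print(np.iinfo(np.uint64).max == 2**64 - 1)   # True
print(float(2**64 - 1) == 2**64)              # True: it rounds up to 2**64
```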
4 changes: 0 additions & 4 deletions test/test_dtype_alu.py
@@ -165,7 +165,6 @@ def test_uint16(self, a, b, op): universal_test(a, b, dtypes.uint16, op)
@given(ht.uint32, ht.uint32, strat.sampled_from(integer_binary_operations))
def test_uint32(self, a, b, op): universal_test(a, b, dtypes.uint32, op)

@unittest.skipUnless(is_dtype_supported(dtypes.uint64), f"no uint64 on {Device.DEFAULT}")
@given(ht.uint64, ht.uint64, strat.sampled_from(integer_binary_operations))
def test_uint64(self, a, b, op): universal_test(a, b, dtypes.uint64, op)

@@ -178,7 +177,6 @@ def test_int16(self, a, b, op): universal_test(a, b, dtypes.int16, op)
@given(ht.int32, ht.int32, strat.sampled_from(integer_binary_operations))
def test_int32(self, a, b, op): universal_test(a, b, dtypes.int32, op)

@unittest.skipUnless(is_dtype_supported(dtypes.int64), f"no int64 on {Device.DEFAULT}")
@given(ht.int64, ht.int64, strat.sampled_from(integer_binary_operations))
def test_int64(self, a, b, op): universal_test(a, b, dtypes.int64, op)

@@ -193,7 +191,6 @@ def test_uint16_unary(self, a, op): universal_test_unary(a, dtypes.uint16, op)
@given(ht.uint32, strat.sampled_from(integer_unary_operations))
def test_uint32_unary(self, a, op): universal_test_unary(a, dtypes.uint32, op)

@unittest.skipUnless(is_dtype_supported(dtypes.uint64), f"no uint64 on {Device.DEFAULT}")
@given(ht.uint64, strat.sampled_from(integer_unary_operations))
def test_uint64_unary(self, a, op): universal_test_unary(a, dtypes.uint64, op)

@@ -206,7 +203,6 @@ def test_int16_unary(self, a, op): universal_test_unary(a, dtypes.int16, op)
@given(ht.int32, strat.sampled_from(integer_unary_operations))
def test_int32_unary(self, a, op): universal_test_unary(a, dtypes.int32, op)

@unittest.skipUnless(is_dtype_supported(dtypes.int64), f"no int64 on {Device.DEFAULT}")
@given(ht.int64, strat.sampled_from(integer_unary_operations))
def test_int64_unary(self, a, op): universal_test_unary(a, dtypes.int64, op)

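With the skipUnless guards removed, the int64/uint64 hypothesis tests now run on every backend rather than only where is_dtype_supported said so. A quick sanity sketch of the kind of arithmetic these tests exercise (not the file's universal_test helper itself):

```python
from tinygrad import Tensor, dtypes
# values above 2**32 need true 64-bit arithmetic in the generated kernel
r = (Tensor([2**40], dtype=dtypes.int64) + Tensor([1], dtype=dtypes.int64)).item()
assert r == 2**40 + 1
```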
5 changes: 2 additions & 3 deletions test/test_edgecases.py
@@ -26,7 +26,7 @@
import numpy as np
import torch
from tinygrad import Tensor, dtypes, nn
from tinygrad.device import Device, is_dtype_supported
from tinygrad.device import Device
from tinygrad.helpers import getenv
from tinygrad.renderer.nir import NIRRenderer

@@ -207,8 +207,7 @@ class TestUOpValidationIssue(unittest.TestCase):
# these fail with UOp verification error.
# we want more of these with diverse errors!

@unittest.skipIf((not is_dtype_supported(dtypes.long)) or MOCKGPU or isinstance(Device[Device.DEFAULT].renderer, NIRRenderer),
"hangs gpuocelot, NIR cannot render")
@unittest.skipIf(MOCKGPU or isinstance(Device[Device.DEFAULT].renderer, NIRRenderer), "hangs gpuocelot, NIR cannot render")
def test_tensor_index_overflow(self):
val = Tensor([1])
big = val.expand(2**31 + 3)
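The simplified guard drops the is_dtype_supported(dtypes.long) check, so test_tensor_index_overflow now runs wherever the renderer can handle it. A minimal sketch of the scenario (the visible hunk is truncated, so the final access here is an illustrative assumption):

```python
from tinygrad import Tensor
val = Tensor([1])
big = val.expand(2**31 + 3)     # more elements than a signed int32 can index
# an access near the end would force 64-bit index arithmetic (hypothetical)
print(big[2**31 + 1].item())    # expected: 1
```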
26 changes: 10 additions & 16 deletions test/unit/test_assign.py
@@ -415,10 +415,8 @@ def test_permuted_assignment_masked_view_not_contiguous(self):

# TODO: is there a way to sneak in a permute such that it returns the wrong answer?

# NOTE: overlapping shrink assign tests are WIP, behavior depends on backend/thread ordering
@unittest.skip("WIP: not a stable test, relies on undefined behavior")
def test_overlapping_shrink_assignment_forward(self):
# Forward shift: read index > write index in overlap - works by thread ordering luck
# Forward shift: read index > write index in overlap
N = 100000
shift = 1000
a = Tensor.arange(N).float().contiguous().realize()
@@ -427,11 +425,8 @@ def test_overlapping_shrink_assignment_forward(self):
with Context(NOOPT=1): a[0:N-shift].assign(a[shift:N]).realize()
np.testing.assert_allclose(a.numpy(), expected)

@unittest.skip("WIP: not a stable test, relies on undefined behavior")
@unittest.expectedFailure
def test_overlapping_shrink_assignment_reverse(self):
# Reverse shift: write index > read index in overlap - race condition!
# This fails because find_permutes excludes SHRINK from hazard detection
# Reverse shift: write index > read index in overlap
N = 100000
shift = 1000
a = Tensor.arange(N).float().contiguous().realize()
@@ -440,15 +435,14 @@ def test_overlapping_shrink_assignment_reverse(self):
with Context(NOOPT=1): a[shift:N].assign(a[0:N-shift]).realize()
np.testing.assert_allclose(a.numpy(), expected)

@unittest.skip("WIP: not a stable test, relies on undefined behavior")
def test_overlapping_shrink_assignment_reverse_with_contiguous(self):
# Adding .contiguous() forces a copy, fixing the race
N = 100000
shift = 1000
a = Tensor.arange(N).float().contiguous().realize()
expected = np.arange(N, dtype=np.float32)
expected[shift:] = expected[:N-shift].copy()
with Context(NOOPT=1): a[shift:N].assign(a[0:N-shift].contiguous()).realize()
def test_nonoverlapping_shrink_assignment(self):
# TODO: non-overlapping shrinks don't actually need contiguous, could be 1 kernel with smarter range analysis
a = Tensor.arange(100).float().contiguous().realize()
expected = np.arange(100, dtype=np.float32)
expected[0:10] = expected[50:60].copy()
kc = GlobalCounters.kernel_count
a[0:10].assign(a[50:60]).realize()
assert GlobalCounters.kernel_count - kc == 2, "currently conservative, forces contiguous"
np.testing.assert_allclose(a.numpy(), expected)

@unittest.skipUnless(is_dtype_supported(dtypes.half), "need half")
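Context for the rewrite above: the overlapping-shrink tests were unstable because their outcome depends on backend thread ordering, so they are gone, and the surviving non-overlapping test pins today's conservative two-kernel behavior instead. A numpy sketch of why overlapping windows are a hazard:

```python
import numpy as np
a = np.arange(10)
a[1:10] = a[0:9]    # numpy resolves the overlap, yielding the shifted array
print(a)            # [0 0 1 2 3 4 5 6 7 8]
# an in-place kernel doing a[i+1] = a[i] for ascending i would instead
# propagate a[0] into every slot -- the race the deleted tests relied on
```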
2 changes: 1 addition & 1 deletion test/unit/test_indexing.py
@@ -339,7 +339,7 @@ def test_index_put_accumulate_duplicate_indices(self):
numpy_testing_assert_equal_helper(output, input_list)
'''

@unittest.skipUnless(is_dtype_supported(dtypes.long), f"long dtype not supported on {Device.DEFAULT}")
@unittest.skipIf(Device.DEFAULT == "WEBGPU", "WEBGPU doesn't support long indexing: #13624")
def test_index_ind_dtype(self):
x = Tensor.randn(4, 4)
# ind_long = torch.randint(4, (4,), dtype=torch.long)
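The change above narrows a blanket dtype-support skip to a targeted WEBGPU skip, since long (int64) tensors are the usual index dtype for gathers. An illustrative sketch of what the test exercises (names here are mine, not the test body):

```python
from tinygrad import Tensor, dtypes
x = Tensor.randn(4, 4)
ind = Tensor([0, 2, 1, 3], dtype=dtypes.int64)  # "long" index tensor
print(x[ind].shape)                             # (4, 4): rows gathered by ind
```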
18 changes: 18 additions & 0 deletions test/unit/test_uop_vmin_vmax.py
@@ -49,6 +49,24 @@ def test_vmin_vmax_and_with_variable(self):
self.assertEqual(uop.vmin, 0)
self.assertEqual(uop.vmax, 20) # should be 0

def test_vmin_vmax_and_with_negative_variable(self):
# when mask doesn't have sign bit set, result is always non-negative
x = UOp.variable('x', -100, 100, dtypes.int32)
# 511 = 0x1FF, doesn't have sign bit set for int32
uop = x & 511
self.assertEqual(uop.vmin, 0)
self.assertEqual(uop.vmax, 511)

# 0x7FFFFFFF is max positive int32, doesn't have sign bit
uop = x & 0x7FFFFFFF
self.assertEqual(uop.vmin, 0)
self.assertEqual(uop.vmax, 0x7FFFFFFF)

# negative mask: x & -1 could be anything since -1 has all bits set
uop = x & -1
self.assertEqual(uop.vmin, dtypes.min(dtypes.int32))
self.assertEqual(uop.vmax, dtypes.max(dtypes.int32))

def test_vmin_vmax_multiplication_with_variable(self):
# vmin and vmax for multiplication with a variable
x = UOp.variable('x', -3, 4)
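The new test encodes a two's-complement fact: AND-ing with a non-negative mask m clears the sign bit, so x & m always falls in [0, m] regardless of the sign of x, while a negative mask such as -1 constrains nothing. Worked values matching the assertions:

```python
# worked examples of the bounds the new test asserts
print(-100 & 511)           # 412: inside [0, 511] even though x is negative
print(-100 & 0x7FFFFFFF)    # 2147483548: sign bit cleared, still non-negative
print(-100 & -1)            # -100: a negative mask gives no useful bound
```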
2 changes: 1 addition & 1 deletion tinygrad/codegen/__init__.py
@@ -95,7 +95,7 @@ def full_rewrite_to_sink(sink:UOp, ren:Renderer|None=None, optimize:bool=True) -

# decompositions
supported_ops = tuple(ren.code_for_op.keys())
pm_decomp = symbolic_simple+get_late_rewrite_patterns(supported_ops, TRANSCENDENTAL>=2)
pm_decomp = symbolic_simple+get_late_rewrite_patterns(supported_ops, ren.device, TRANSCENDENTAL>=2)
sink = graph_rewrite(sink, pm_decomp, ctx=ren.device, name="decompositions")

# final rules for the renderer (without sym)
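The signature change threads the renderer's device into the late rewrite patterns, so decompositions can now vary per backend and not only per supported-op set. A toy sketch of the idea (illustrative only, not tinygrad's actual get_late_rewrite_patterns):

```python
# hedged toy: device-aware decomposition selection
def late_patterns_sketch(supported_ops: tuple[str, ...], device: str, transcendental: bool) -> list[str]:
    rules = []
    if "EXP2" not in supported_ops: rules.append("lower exp2 via polynomial")
    if transcendental: rules.append("force transcendental decompositions")
    if device == "PTX": rules.append("backend-specific integer lowering")  # assumed example
    return rules

print(late_patterns_sketch(("ADD", "MUL"), "PTX", False))
```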
5 changes: 3 additions & 2 deletions tinygrad/runtime/autogen/__init__.py
@@ -10,6 +10,7 @@

llvm_lib = (r"'C:\\Program Files\\LLVM\\bin\\LLVM-C.dll' if WIN else '/opt/homebrew/opt/llvm@20/lib/libLLVM.dylib' if OSX else " +
repr(['LLVM'] + [f'LLVM-{i}' for i in reversed(range(14, 21+1))]))
clang_lib = "'/opt/homebrew/opt/llvm@20/lib/libclang.dylib' if OSX else ['clang-20', 'clang']"

webgpu_lib = "os.path.join(sysconfig.get_paths()['purelib'], 'pydawn', 'lib', 'libwebgpu_dawn.dll') if WIN else 'webgpu_dawn'"
nv_lib_path = "[f'/{pre}/cuda/targets/{sysconfig.get_config_vars().get(\"MULTIARCH\", \"\").rsplit(\"-\", 1)[0]}/lib' for pre in ['opt', 'usr/local']]"
@@ -135,9 +136,9 @@ def __getattr__(nm):
tarball="https://gitlab.freedesktop.org/mesa/mesa/-/archive/mesa-25.2.7/mesa-25.2.7.tar.gz",
prolog=["import gzip, base64"], epilog=lambda path: [system(f"{root}/extra/mesa/lvp_nir_options.sh {path}")])
case "libclang":
return load("libclang", "['clang-20', 'clang']",
return load("libclang", clang_lib,
lambda: [f"{system('llvm-config-20 --includedir')}/clang-c/{s}.h" for s in ["Index", "CXString", "CXSourceLocation", "CXFile"]],
args=lambda: system("llvm-config-20 --cflags").split())
prolog=["from tinygrad.helpers import OSX"], args=lambda: system("llvm-config-20 --cflags").split())
case "metal":
return load("metal", "'Metal'", [f"{macossdk}/System/Library/Frameworks/Metal.framework/Headers/MTL{s}.h" for s in
["ComputeCommandEncoder", "ComputePipeline", "CommandQueue", "Device", "IndirectCommandBuffer", "Resource", "CommandEncoder"]],
3 changes: 2 additions & 1 deletion tinygrad/runtime/autogen/libclang.py
@@ -4,7 +4,8 @@
from typing import Annotated, Literal, TypeAlias
from tinygrad.runtime.support.c import _IO, _IOW, _IOR, _IOWR
from tinygrad.runtime.support import c
dll = c.DLL('libclang', ['clang-20', 'clang'])
from tinygrad.helpers import OSX
dll = c.DLL('libclang', '/opt/homebrew/opt/llvm@20/lib/libclang.dylib' if OSX else ['clang-20', 'clang'])
CXIndex: TypeAlias = ctypes.c_void_p
class struct_CXTargetInfoImpl(ctypes.Structure): pass
CXTargetInfo: TypeAlias = c.POINTER[struct_CXTargetInfoImpl]
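For readers unfamiliar with the c.DLL helper, the second argument is either a concrete dylib path (the macOS case) or a list of library names to try in order. A hedged sketch of that resolution pattern using plain ctypes (not tinygrad's actual implementation):

```python
import ctypes, ctypes.util

def load_first(candidates):
    # accept a single path or a list of names, like the c.DLL call above
    for name in ([candidates] if isinstance(candidates, str) else candidates):
        path = name if "/" in name else ctypes.util.find_library(name)
        if path is None: continue
        try: return ctypes.CDLL(path)
        except OSError: pass
    raise OSError(f"none of {candidates} could be loaded")
```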
4 changes: 2 additions & 2 deletions tinygrad/schedule/rangeify.py
@@ -28,8 +28,8 @@
# 0. do some cleanup rewrites, mostly copied from the old stuff

def fix_assign_hazard(dest:UOp, src:UOp, assign:UOp):
# PERMUTE and FLIP reorder indices, causing read/write races when src and dest are the same buffer
unsafe = {Ops.PERMUTE, Ops.FLIP}
# PERMUTE and FLIP reorder indices, SHRINK can have overlapping regions when dest is also shrunk
unsafe = {Ops.PERMUTE, Ops.FLIP} | ({Ops.SHRINK} if dest.op_in_backward_slice_with_self(Ops.SHRINK) else set())
if not (hazards:=[s for s in src.toposort(gate=lambda s:s.op not in ALWAYS_CONTIGUOUS) if s.op in unsafe]): return
for h in hazards:
if any(s is dest.base for s in h.toposort(gate=lambda s:s.op not in ALWAYS_CONTIGUOUS-{Ops.BUFFER})):
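The rangeify fix adds SHRINK to the unsafe set when the assign destination itself comes from a SHRINK, which is exactly the overlapping-window race exercised in the test_assign.py changes above. A numpy illustration of what forcing the source contiguous buys:

```python
import numpy as np
a = np.arange(6)
src, dst = a[0:5], a[1:6]   # overlapping SHRINKs of one buffer
safe = src.copy()           # forcing contiguous = read everything first...
dst[:] = safe               # ...then write, so no read sees a fresh write
print(a)                    # [0 0 1 2 3 4]
```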