2 changes: 1 addition & 1 deletion .github/workflows/autogen.yml
@@ -84,7 +84,7 @@ jobs:
- name: Regenerate autogen files
run: |
rm tinygrad/runtime/autogen/metal.py tinygrad/runtime/autogen/iokit.py tinygrad/runtime/autogen/corefoundation.py
LIBCLANG_PATH=/opt/homebrew/opt/llvm@20/lib/libclang.dylib python3 -c "from tinygrad.runtime.autogen import metal, iokit, corefoundation"
python3 -c "from tinygrad.runtime.autogen import metal, iokit, corefoundation"
- name: Check for differences
run: |
if ! git diff --quiet; then
2 changes: 0 additions & 2 deletions .github/workflows/test.yml
@@ -792,8 +792,6 @@ jobs:
ocelot: 'true'
llvm: 'true'
- name: Run unit tests
env:
LIBCLANG_PATH: '/opt/homebrew/opt/llvm@20/lib/libclang.dylib'
run: METAL=1 python -m pytest -n=auto test/unit/ --durations=20
- name: Run ONNX
run: METAL=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
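Note on the two workflow changes above: the explicit LIBCLANG_PATH override is no longer needed because the macOS dylib path is now baked into the generated bindings (see the tinygrad/runtime/autogen changes later in this diff). A minimal sketch of the selection the regenerated code performs, with the values copied from this PR:

```python
# sketch: the path selection now generated into the bindings (OSX here
# stands in for tinygrad.helpers.OSX; True assumes a macOS runner)
OSX = True
libclang = '/opt/homebrew/opt/llvm@20/lib/libclang.dylib' if OSX else ['clang-20', 'clang']
print(libclang)
```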
@@ -0,0 +1,32 @@
#!/usr/bin/env bash

export PYTHONPATH="."
export DEV=${DEV:-AMD}
export IGNORE_OOB=1
export REWRITE_STACK_LIMIT=5000000 HCQDEV_WAIT_TIMEOUT_MS=240000

export DEBUG=${DEBUG:-2}
export FLASH_ATTENTION=${FLASH_ATTENTION:-1}
export ALL2ALL=${ALL2ALL:-1}

export DEFAULT_FLOAT="bfloat16" OPTIM_DTYPE="bfloat16"
export DP=8 BS=8 EVAL_BS=8 GRADIENT_ACC_STEPS=1
export GBS=$((BS * GRADIENT_ACC_STEPS))

export MODEL="llama3"
export BASEDIR="/raid/datasets/c4-8b/"
export SMALL=1
export LLAMA3_SIZE=${LLAMA3_SIZE:-"8B"}
export EVAL_TARGET=3.3 EVAL_FREQ=12288
export LR="4e-4" END_LR="4e-5" WARMUP_SAMPLES=256 MAX_STEPS=1200000
export WARMUP_STEPS=$((WARMUP_SAMPLES / GBS))
export SAMPLES=$((MAX_STEPS * GBS))

export SEED=5760

export JITBEAM=3
export BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5

export FAKEDATA=1 BENCHMARK=10 LLAMA_LAYERS=2

python3 examples/mlperf/model_train.py
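For reference, the derived quantities in the script work out as follows under its defaults (a Python sketch mirroring the shell arithmetic):

```python
# worked arithmetic for the derived env vars, using the script's defaults
BS, GRADIENT_ACC_STEPS = 8, 1
GBS = BS * GRADIENT_ACC_STEPS             # global batch size: 8
WARMUP_SAMPLES, MAX_STEPS = 256, 1_200_000
WARMUP_STEPS = WARMUP_SAMPLES // GBS      # 256 // 8 = 32 warmup steps
SAMPLES = MAX_STEPS * GBS                 # 9,600,000 samples total
print(GBS, WARMUP_STEPS, SAMPLES)
```

The final FAKEDATA=1 BENCHMARK=10 LLAMA_LAYERS=2 line suggests this is a short benchmarking configuration on a truncated model rather than a full training run.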
10 changes: 8 additions & 2 deletions test/test_dtype.py
@@ -18,7 +18,7 @@
settings.load_profile("my_profile")

def get_available_cast_dtypes(dtype: DType) -> List[DType]:
if not is_dtype_supported(dtype): return []
if not is_dtype_supported(dtype) and dtype not in (dtypes.long, dtypes.ulong): return []
# don't cast internal dtypes
return [v for k, v in DTYPES_DICT.items() if v != dtype and is_dtype_supported(v) and not k.startswith("_")]

@@ -333,8 +333,14 @@ def test_uint16_to_int8_overflow(self):
class TestInt32DType(TestDType): DTYPE = dtypes.int32
class TestUint32DType(TestDType): DTYPE = dtypes.uint32

class TestInt64DType(TestDType): DTYPE = dtypes.int64
class TestInt64DType(TestDType):
DTYPE = dtypes.int64
@classmethod
def setUpClass(cls): cls.DATA = rand_for_dtype(cls.DTYPE, 10)

class TestUint64DType(TestDType):
@classmethod
def setUpClass(cls): cls.DATA = rand_for_dtype(cls.DTYPE, 10)
DTYPE = dtypes.uint64
def test_uint64_load(self):
assert Tensor(2**64 - 1, dtype=dtypes.uint64).numpy() == 2**64 - 1
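The setUpClass overrides above regenerate the shared test data with rand_for_dtype for the 64-bit classes. A plausible motivation (an assumption on my part): data that round-trips through float64 cannot represent the full 64-bit integer range, which is exactly the boundary test_uint64_load checks:

```python
import numpy as np
# 2**64 - 1 is the max uint64 value; float64 cannot hold it exactly
print(np.iinfo(np.uint64).max == 2**64 - 1)   # True
print(float(2**64 - 1) == 2**64)              # True: it rounds up to 2**64
```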
4 changes: 0 additions & 4 deletions test/test_dtype_alu.py
@@ -165,7 +165,6 @@ def test_uint16(self, a, b, op): universal_test(a, b, dtypes.uint16, op)
@given(ht.uint32, ht.uint32, strat.sampled_from(integer_binary_operations))
def test_uint32(self, a, b, op): universal_test(a, b, dtypes.uint32, op)

@unittest.skipUnless(is_dtype_supported(dtypes.uint64), f"no uint64 on {Device.DEFAULT}")
@given(ht.uint64, ht.uint64, strat.sampled_from(integer_binary_operations))
def test_uint64(self, a, b, op): universal_test(a, b, dtypes.uint64, op)

@@ -178,7 +177,6 @@ def test_int16(self, a, b, op): universal_test(a, b, dtypes.int16, op)
@given(ht.int32, ht.int32, strat.sampled_from(integer_binary_operations))
def test_int32(self, a, b, op): universal_test(a, b, dtypes.int32, op)

@unittest.skipUnless(is_dtype_supported(dtypes.int64), f"no int64 on {Device.DEFAULT}")
@given(ht.int64, ht.int64, strat.sampled_from(integer_binary_operations))
def test_int64(self, a, b, op): universal_test(a, b, dtypes.int64, op)

@@ -193,7 +191,6 @@ def test_uint16_unary(self, a, op): universal_test_unary(a, dtypes.uint16, op)
@given(ht.uint32, strat.sampled_from(integer_unary_operations))
def test_uint32_unary(self, a, op): universal_test_unary(a, dtypes.uint32, op)

@unittest.skipUnless(is_dtype_supported(dtypes.uint64), f"no uint64 on {Device.DEFAULT}")
@given(ht.uint64, strat.sampled_from(integer_unary_operations))
def test_uint64_unary(self, a, op): universal_test_unary(a, dtypes.uint64, op)

@@ -206,7 +203,6 @@ def test_int16_unary(self, a, op): universal_test_unary(a, dtypes.int16, op)
@given(ht.int32, strat.sampled_from(integer_unary_operations))
def test_int32_unary(self, a, op): universal_test_unary(a, dtypes.int32, op)

@unittest.skipUnless(is_dtype_supported(dtypes.int64), f"no int64 on {Device.DEFAULT}")
@given(ht.int64, strat.sampled_from(integer_unary_operations))
def test_int64_unary(self, a, op): universal_test_unary(a, dtypes.int64, op)

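With the skipUnless guards removed, the int64/uint64 hypothesis tests now run on every backend rather than only where is_dtype_supported said so. A quick sanity sketch of the kind of arithmetic these tests exercise (not the file's universal_test helper itself):

```python
from tinygrad import Tensor, dtypes
# values above 2**32 need true 64-bit arithmetic in the generated kernel
r = (Tensor([2**40], dtype=dtypes.int64) + Tensor([1], dtype=dtypes.int64)).item()
assert r == 2**40 + 1
```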
5 changes: 2 additions & 3 deletions test/test_edgecases.py
@@ -26,7 +26,7 @@
import numpy as np
import torch
from tinygrad import Tensor, dtypes, nn
from tinygrad.device import Device, is_dtype_supported
from tinygrad.device import Device
from tinygrad.helpers import getenv
from tinygrad.renderer.nir import NIRRenderer

@@ -207,8 +207,7 @@ class TestUOpValidationIssue(unittest.TestCase):
# these fail with UOp verification error.
# we want more of these with diverse errors!

@unittest.skipIf((not is_dtype_supported(dtypes.long)) or MOCKGPU or isinstance(Device[Device.DEFAULT].renderer, NIRRenderer),
"hangs gpuocelot, NIR cannot render")
@unittest.skipIf(MOCKGPU or isinstance(Device[Device.DEFAULT].renderer, NIRRenderer), "hangs gpuocelot, NIR cannot render")
def test_tensor_index_overflow(self):
val = Tensor([1])
big = val.expand(2**31 + 3)
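The simplified guard drops the is_dtype_supported(dtypes.long) check, so test_tensor_index_overflow now runs wherever the renderer can handle it. A minimal sketch of the scenario (the visible hunk is truncated, so the final access here is an illustrative assumption):

```python
from tinygrad import Tensor
val = Tensor([1])
big = val.expand(2**31 + 3)     # more elements than a signed int32 can index
# an access near the end would force 64-bit index arithmetic (hypothetical)
print(big[2**31 + 1].item())    # expected: 1
```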
26 changes: 10 additions & 16 deletions test/unit/test_assign.py
@@ -415,10 +415,8 @@ def test_permuted_assignment_masked_view_not_contiguous(self):

# TODO: is there a way to sneak in a permute such that it returns the wrong answer?

# NOTE: overlapping shrink assign tests are WIP, behavior depends on backend/thread ordering
@unittest.skip("WIP: not a stable test, relies on undefined behavior")
def test_overlapping_shrink_assignment_forward(self):
# Forward shift: read index > write index in overlap - works by thread ordering luck
# Forward shift: read index > write index in overlap
N = 100000
shift = 1000
a = Tensor.arange(N).float().contiguous().realize()
@@ -427,11 +425,8 @@ def test_overlapping_shrink_assignment_forward(self):
with Context(NOOPT=1): a[0:N-shift].assign(a[shift:N]).realize()
np.testing.assert_allclose(a.numpy(), expected)

@unittest.skip("WIP: not a stable test, relies on undefined behavior")
@unittest.expectedFailure
def test_overlapping_shrink_assignment_reverse(self):
# Reverse shift: write index > read index in overlap - race condition!
# This fails because find_permutes excludes SHRINK from hazard detection
# Reverse shift: write index > read index in overlap
N = 100000
shift = 1000
a = Tensor.arange(N).float().contiguous().realize()
@@ -440,15 +435,14 @@ def test_overlapping_shrink_assignment_reverse(self):
with Context(NOOPT=1): a[shift:N].assign(a[0:N-shift]).realize()
np.testing.assert_allclose(a.numpy(), expected)

@unittest.skip("WIP: not a stable test, relies on undefined behavior")
def test_overlapping_shrink_assignment_reverse_with_contiguous(self):
# Adding .contiguous() forces a copy, fixing the race
N = 100000
shift = 1000
a = Tensor.arange(N).float().contiguous().realize()
expected = np.arange(N, dtype=np.float32)
expected[shift:] = expected[:N-shift].copy()
with Context(NOOPT=1): a[shift:N].assign(a[0:N-shift].contiguous()).realize()
def test_nonoverlapping_shrink_assignment(self):
# TODO: non-overlapping shrinks don't actually need contiguous, could be 1 kernel with smarter range analysis
a = Tensor.arange(100).float().contiguous().realize()
expected = np.arange(100, dtype=np.float32)
expected[0:10] = expected[50:60].copy()
kc = GlobalCounters.kernel_count
a[0:10].assign(a[50:60]).realize()
assert GlobalCounters.kernel_count - kc == 2, "currently conservative, forces contiguous"
np.testing.assert_allclose(a.numpy(), expected)

@unittest.skipUnless(is_dtype_supported(dtypes.half), "need half")
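Context for the rewrite above: the overlapping-shrink tests were unstable because their outcome depends on backend thread ordering, so they are gone, and the surviving non-overlapping test pins today's conservative two-kernel behavior instead. A numpy sketch of why overlapping windows are a hazard:

```python
import numpy as np
a = np.arange(10)
a[1:10] = a[0:9]    # numpy resolves the overlap, yielding the shifted array
print(a)            # [0 0 1 2 3 4 5 6 7 8]
# an in-place kernel doing a[i+1] = a[i] for ascending i would instead
# propagate a[0] into every slot -- the race the deleted tests relied on
```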
2 changes: 1 addition & 1 deletion test/unit/test_indexing.py
@@ -339,7 +339,7 @@ def test_index_put_accumulate_duplicate_indices(self):
numpy_testing_assert_equal_helper(output, input_list)
'''

@unittest.skipUnless(is_dtype_supported(dtypes.long), f"long dtype not supported on {Device.DEFAULT}")
@unittest.skipIf(Device.DEFAULT == "WEBGPU", "WEBGPU doesn't support long indexing: #13624")
def test_index_ind_dtype(self):
x = Tensor.randn(4, 4)
# ind_long = torch.randint(4, (4,), dtype=torch.long)
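The change above narrows a blanket dtype-support skip to a targeted WEBGPU skip, since long (int64) tensors are the usual index dtype for gathers. An illustrative sketch of what the test exercises (names here are mine, not the test body):

```python
from tinygrad import Tensor, dtypes
x = Tensor.randn(4, 4)
ind = Tensor([0, 2, 1, 3], dtype=dtypes.int64)  # "long" index tensor
print(x[ind].shape)                             # (4, 4): rows gathered by ind
```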
18 changes: 18 additions & 0 deletions test/unit/test_uop_vmin_vmax.py
@@ -49,6 +49,24 @@ def test_vmin_vmax_and_with_variable(self):
self.assertEqual(uop.vmin, 0)
self.assertEqual(uop.vmax, 20) # should be 0

def test_vmin_vmax_and_with_negative_variable(self):
# when mask doesn't have sign bit set, result is always non-negative
x = UOp.variable('x', -100, 100, dtypes.int32)
# 511 = 0x1FF, doesn't have sign bit set for int32
uop = x & 511
self.assertEqual(uop.vmin, 0)
self.assertEqual(uop.vmax, 511)

# 0x7FFFFFFF is max positive int32, doesn't have sign bit
uop = x & 0x7FFFFFFF
self.assertEqual(uop.vmin, 0)
self.assertEqual(uop.vmax, 0x7FFFFFFF)

# negative mask: x & -1 could be anything since -1 has all bits set
uop = x & -1
self.assertEqual(uop.vmin, dtypes.min(dtypes.int32))
self.assertEqual(uop.vmax, dtypes.max(dtypes.int32))

def test_vmin_vmax_multiplication_with_variable(self):
# vmin and vmax for multiplication with a variable
x = UOp.variable('x', -3, 4)
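The new test encodes a two's-complement fact: AND-ing with a non-negative mask m clears the sign bit, so x & m always falls in [0, m] regardless of the sign of x, while a negative mask such as -1 constrains nothing. Worked values matching the assertions:

```python
# worked examples of the bounds the new test asserts
print(-100 & 511)           # 412: inside [0, 511] even though x is negative
print(-100 & 0x7FFFFFFF)    # 2147483548: sign bit cleared, still non-negative
print(-100 & -1)            # -100: a negative mask gives no useful bound
```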
2 changes: 1 addition & 1 deletion tinygrad/codegen/__init__.py
@@ -95,7 +95,7 @@ def full_rewrite_to_sink(sink:UOp, ren:Renderer|None=None, optimize:bool=True) -

# decompositions
supported_ops = tuple(ren.code_for_op.keys())
pm_decomp = symbolic_simple+get_late_rewrite_patterns(supported_ops, TRANSCENDENTAL>=2)
pm_decomp = symbolic_simple+get_late_rewrite_patterns(supported_ops, ren.device, TRANSCENDENTAL>=2)
sink = graph_rewrite(sink, pm_decomp, ctx=ren.device, name="decompositions")

# final rules for the renderer (without sym)
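The signature change threads the renderer's device into the late rewrite patterns, so decompositions can now vary per backend and not only per supported-op set. A toy sketch of the idea (illustrative only, not tinygrad's actual get_late_rewrite_patterns):

```python
# hedged toy: device-aware decomposition selection
def late_patterns_sketch(supported_ops: tuple[str, ...], device: str, transcendental: bool) -> list[str]:
    rules = []
    if "EXP2" not in supported_ops: rules.append("lower exp2 via polynomial")
    if transcendental: rules.append("force transcendental decompositions")
    if device == "PTX": rules.append("backend-specific integer lowering")  # assumed example
    return rules

print(late_patterns_sketch(("ADD", "MUL"), "PTX", False))
```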
5 changes: 3 additions & 2 deletions tinygrad/runtime/autogen/__init__.py
@@ -10,6 +10,7 @@

llvm_lib = (r"'C:\\Program Files\\LLVM\\bin\\LLVM-C.dll' if WIN else '/opt/homebrew/opt/llvm@20/lib/libLLVM.dylib' if OSX else " +
repr(['LLVM'] + [f'LLVM-{i}' for i in reversed(range(14, 21+1))]))
clang_lib = "'/opt/homebrew/opt/llvm@20/lib/libclang.dylib' if OSX else ['clang-20', 'clang']"

webgpu_lib = "os.path.join(sysconfig.get_paths()['purelib'], 'pydawn', 'lib', 'libwebgpu_dawn.dll') if WIN else 'webgpu_dawn'"
nv_lib_path = "[f'/{pre}/cuda/targets/{sysconfig.get_config_vars().get(\"MULTIARCH\", \"\").rsplit(\"-\", 1)[0]}/lib' for pre in ['opt', 'usr/local']]"
@@ -135,9 +136,9 @@ def __getattr__(nm):
tarball="https://gitlab.freedesktop.org/mesa/mesa/-/archive/mesa-25.2.7/mesa-25.2.7.tar.gz",
prolog=["import gzip, base64"], epilog=lambda path: [system(f"{root}/extra/mesa/lvp_nir_options.sh {path}")])
case "libclang":
return load("libclang", "['clang-20', 'clang']",
return load("libclang", clang_lib,
lambda: [f"{system('llvm-config-20 --includedir')}/clang-c/{s}.h" for s in ["Index", "CXString", "CXSourceLocation", "CXFile"]],
args=lambda: system("llvm-config-20 --cflags").split())
prolog=["from tinygrad.helpers import OSX"], args=lambda: system("llvm-config-20 --cflags").split())
case "metal":
return load("metal", "'Metal'", [f"{macossdk}/System/Library/Frameworks/Metal.framework/Headers/MTL{s}.h" for s in
["ComputeCommandEncoder", "ComputePipeline", "CommandQueue", "Device", "IndirectCommandBuffer", "Resource", "CommandEncoder"]],
3 changes: 2 additions & 1 deletion tinygrad/runtime/autogen/libclang.py
@@ -4,7 +4,8 @@
from typing import Annotated, Literal, TypeAlias
from tinygrad.runtime.support.c import _IO, _IOW, _IOR, _IOWR
from tinygrad.runtime.support import c
dll = c.DLL('libclang', ['clang-20', 'clang'])
from tinygrad.helpers import OSX
dll = c.DLL('libclang', '/opt/homebrew/opt/llvm@20/lib/libclang.dylib' if OSX else ['clang-20', 'clang'])
CXIndex: TypeAlias = ctypes.c_void_p
class struct_CXTargetInfoImpl(ctypes.Structure): pass
CXTargetInfo: TypeAlias = c.POINTER[struct_CXTargetInfoImpl]
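For readers unfamiliar with the c.DLL helper, the second argument is either a concrete dylib path (the macOS case) or a list of library names to try in order. A hedged sketch of that resolution pattern using plain ctypes (not tinygrad's actual implementation):

```python
import ctypes, ctypes.util

def load_first(candidates):
    # accept a single path or a list of names, like the c.DLL call above
    for name in ([candidates] if isinstance(candidates, str) else candidates):
        path = name if "/" in name else ctypes.util.find_library(name)
        if path is None: continue
        try: return ctypes.CDLL(path)
        except OSError: pass
    raise OSError(f"none of {candidates} could be loaded")
```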
4 changes: 2 additions & 2 deletions tinygrad/schedule/rangeify.py
@@ -28,8 +28,8 @@
# 0. do some cleanup rewrites, mostly copied from the old stuff

def fix_assign_hazard(dest:UOp, src:UOp, assign:UOp):
# PERMUTE and FLIP reorder indices, causing read/write races when src and dest are the same buffer
unsafe = {Ops.PERMUTE, Ops.FLIP}
# PERMUTE and FLIP reorder indices, SHRINK can have overlapping regions when dest is also shrunk
unsafe = {Ops.PERMUTE, Ops.FLIP} | ({Ops.SHRINK} if dest.op_in_backward_slice_with_self(Ops.SHRINK) else set())
if not (hazards:=[s for s in src.toposort(gate=lambda s:s.op not in ALWAYS_CONTIGUOUS) if s.op in unsafe]): return
for h in hazards:
if any(s is dest.base for s in h.toposort(gate=lambda s:s.op not in ALWAYS_CONTIGUOUS-{Ops.BUFFER})):
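The rangeify fix adds SHRINK to the unsafe set when the assign destination itself comes from a SHRINK, which is exactly the overlapping-window race exercised in the test_assign.py changes above. A numpy illustration of what forcing the source contiguous buys:

```python
import numpy as np
a = np.arange(6)
src, dst = a[0:5], a[1:6]   # overlapping SHRINKs of one buffer
safe = src.copy()           # forcing contiguous = read everything first...
dst[:] = safe               # ...then write, so no read sees a fresh write
print(a)                    # [0 0 1 2 3 4]
```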