@@ -97,3 +97,94 @@ def fn(x: torch.Tensor, s: hl.constexpr, *, _launcher=_default_launcher):
     _BLOCK_SIZE_1 = 16
     _launcher(_fn_kernel, (triton.cdiv(b, _BLOCK_SIZE_0) * triton.cdiv(16, _BLOCK_SIZE_1),), x, out, out.stride(0), out.stride(1), x.stride(0), b, _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
     return out
+
+--- assertExpectedJournal(TestConstExpr.test_string_literal_arg)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _fn_kernel(x, out, x_size_0, x_size_1, out_stride_0, out_stride_1, x_stride_0, x_stride_1, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr):
+    num_blocks_0 = tl.cdiv(x_size_0, _BLOCK_SIZE_0)
+    pid_0 = tl.program_id(0) % num_blocks_0
+    pid_1 = tl.program_id(0) // num_blocks_0
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < x_size_0
+    offset_1 = pid_1 * _BLOCK_SIZE_1
+    indices_1 = (offset_1 + tl.arange(0, _BLOCK_SIZE_1)).to(tl.int32)
+    mask_1 = indices_1 < x_size_1
+    load = tl.load(x + (indices_0[:, None] * x_stride_0 + indices_1[None, :] * x_stride_1), mask_0[:, None] & mask_1[None, :], other=0)
+    v_0 = 1.0
+    v_1 = load + v_0
+    tl.store(out + (indices_0[:, None] * out_stride_0 + indices_1[None, :] * out_stride_1), v_1, mask_0[:, None] & mask_1[None, :])
+
+def fn(x: torch.Tensor, mode: str, *, _launcher=_default_launcher):
+    out = torch.empty_like(x)
+    _BLOCK_SIZE_0 = 32
+    _BLOCK_SIZE_1 = 32
+    _launcher(_fn_kernel, (triton.cdiv(x.size(0), _BLOCK_SIZE_0) * triton.cdiv(x.size(1), _BLOCK_SIZE_1),), x, out, x.size(0), x.size(1), out.stride(0), out.stride(1), x.stride(0), x.stride(1), _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
+    return out
+
+--- assertExpectedJournal(TestConstExpr.test_string_literal_arg)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _fn_kernel(x, out, x_size_0, x_size_1, out_stride_0, out_stride_1, x_stride_0, x_stride_1, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr):
+    num_blocks_0 = tl.cdiv(x_size_0, _BLOCK_SIZE_0)
+    pid_0 = tl.program_id(0) % num_blocks_0
+    pid_1 = tl.program_id(0) // num_blocks_0
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < x_size_0
+    offset_1 = pid_1 * _BLOCK_SIZE_1
+    indices_1 = (offset_1 + tl.arange(0, _BLOCK_SIZE_1)).to(tl.int32)
+    mask_1 = indices_1 < x_size_1
+    load = tl.load(x + (indices_0[:, None] * x_stride_0 + indices_1[None, :] * x_stride_1), mask_0[:, None] & mask_1[None, :], other=0)
+    v_0 = 2.0
+    v_1 = load * v_0
+    tl.store(out + (indices_0[:, None] * out_stride_0 + indices_1[None, :] * out_stride_1), v_1, mask_0[:, None] & mask_1[None, :])
+
+def fn(x: torch.Tensor, mode: str, *, _launcher=_default_launcher):
+    out = torch.empty_like(x)
+    _BLOCK_SIZE_0 = 32
+    _BLOCK_SIZE_1 = 32
+    _launcher(_fn_kernel, (triton.cdiv(x.size(0), _BLOCK_SIZE_0) * triton.cdiv(x.size(1), _BLOCK_SIZE_1),), x, out, x.size(0), x.size(1), out.stride(0), out.stride(1), x.stride(0), x.stride(1), _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
+    return out
+
+--- assertExpectedJournal(TestConstExpr.test_string_literal_arg)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _fn_kernel(x, out, x_size_0, x_size_1, out_stride_0, out_stride_1, x_stride_0, x_stride_1, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr):
+    num_blocks_0 = tl.cdiv(x_size_0, _BLOCK_SIZE_0)
+    pid_0 = tl.program_id(0) % num_blocks_0
+    pid_1 = tl.program_id(0) // num_blocks_0
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < x_size_0
+    offset_1 = pid_1 * _BLOCK_SIZE_1
+    indices_1 = (offset_1 + tl.arange(0, _BLOCK_SIZE_1)).to(tl.int32)
+    mask_1 = indices_1 < x_size_1
+    load = tl.load(x + (indices_0[:, None] * x_stride_0 + indices_1[None, :] * x_stride_1), mask_0[:, None] & mask_1[None, :], other=0)
+    tl.store(out + (indices_0[:, None] * out_stride_0 + indices_1[None, :] * out_stride_1), load, mask_0[:, None] & mask_1[None, :])
+
+def fn(x: torch.Tensor, mode: str, *, _launcher=_default_launcher):
+    out = torch.empty_like(x)
+    _BLOCK_SIZE_0 = 32
+    _BLOCK_SIZE_1 = 32
+    _launcher(_fn_kernel, (triton.cdiv(x.size(0), _BLOCK_SIZE_0) * triton.cdiv(x.size(1), _BLOCK_SIZE_1),), x, out, x.size(0), x.size(1), out.stride(0), out.stride(1), x.stride(0), x.stride(1), _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)
+    return out
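
Note on why this diff records three variants of the same journal entry: Helion specializes a kernel once per distinct string literal passed for `mode`, so the journal captures one generated Triton kernel per mode value (add 1.0, multiply by 2.0, and a plain copy). Below is a minimal sketch of the kind of Helion kernel that could produce these entries; the actual test source is not part of this diff, so the function name, the mode strings, and the tiling style are assumptions.

    import torch
    import helion
    import helion.language as hl

    # Hypothetical kernel under test (not taken from this diff). Helion treats
    # the `mode` string argument as a compile-time constant, so the Python
    # branch below is resolved at compile time and each mode value yields its
    # own specialized Triton kernel, matching the three journal entries above.
    @helion.kernel()
    def fn(x: torch.Tensor, mode: str) -> torch.Tensor:
        out = torch.empty_like(x)
        for tile in hl.tile(x.size()):
            if mode == "add":      # first entry: v_1 = load + 1.0
                out[tile] = x[tile] + 1.0
            elif mode == "mul":    # second entry: v_1 = load * 2.0
                out[tile] = x[tile] * 2.0
            else:                  # third entry: plain copy
                out[tile] = x[tile]
        return out

Because the branch is folded away during compilation, none of the generated kernels contains a runtime string comparison; only the surviving arithmetic appears in the emitted Triton code.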