Add option for backends to run vectorization passes (#716)

gbaraldi · web-flow · commit 51367980a7b6 · 2025-10-14T11:46:10.000+02:00
Vectorization is disabled by default (the generic `can_vectorize` returns `false`) and is enabled only for the AMDGPU (GCN), CUDA (PTX), and native targets.
diff --git a/src/gcn.jl b/src/gcn.jl
@@ -124,3 +124,5 @@ function emit_trap!(job::CompilerJob{GCNCompilerTarget}, builder, mod, inst)
     end
     call!(builder, trap_ft, trap)
 end
+
+can_vectorize(job::CompilerJob{GCNCompilerTarget}) = true
diff --git a/src/interface.jl b/src/interface.jl
@@ -222,6 +222,9 @@ end
 # Has the runtime available and does not require special handling
 uses_julia_runtime(@nospecialize(job::CompilerJob)) = false
 
+# Is it legal to run vectorization passes on this target
+can_vectorize(@nospecialize(job::CompilerJob)) = false
+
 # Should emit PTLS lookup that can be relocated
 dump_native(@nospecialize(job::CompilerJob)) = false
 
diff --git a/src/native.jl b/src/native.jl
@@ -35,3 +35,4 @@ end
 
 runtime_slug(job::CompilerJob{NativeCompilerTarget}) = "native_$(job.config.target.cpu)-$(hash(job.config.target.features))$(job.config.target.jlruntime ? "-jlrt" : "")"
 uses_julia_runtime(job::CompilerJob{NativeCompilerTarget}) = job.config.target.jlruntime
+can_vectorize(job::CompilerJob{NativeCompilerTarget}) = true
diff --git a/src/optim.jl b/src/optim.jl
@@ -31,10 +31,7 @@ function buildNewPMPipeline!(mpm, @nospecialize(job::CompilerJob), opt_level)
     add!(mpm, NewPMFunctionPassManager()) do fpm
         buildLoopOptimizerPipeline(fpm, job, opt_level)
         buildScalarOptimizerPipeline(fpm, job, opt_level)
-        if uses_julia_runtime(job) && opt_level >= 2
-            # XXX: we disable vectorization, as this generally isn't useful for GPU targets
-            #      and actually causes issues with some back-end compilers (like Metal).
-            # TODO: Make this not dependent on `uses_julia_runtime` (likely CPU), but it's own control
+        if (can_vectorize(job)) && opt_level >= 2
             buildVectorPipeline(fpm, job, opt_level)
         end
         if isdebug(:optim)
diff --git a/src/ptx.jl b/src/ptx.jl
@@ -78,6 +78,8 @@ have_fma(@nospecialize(target::PTXCompilerTarget), T::Type) = true
 
 dwarf_version(target::PTXCompilerTarget) = Int32(2) # Cuda only supports dwarfv2
 
+can_vectorize(job::CompilerJob{PTXCompilerTarget}) = true
+
 ## job
 
 function Base.show(io::IO, @nospecialize(job::CompilerJob{PTXCompilerTarget}))

Original file line number	Diff line number	Diff line change
`@@ -35,3 +35,4 @@ end`
`35`	`35`
`36`	`36`	`runtime_slug(job::CompilerJob{NativeCompilerTarget}) = "native_$(job.config.target.cpu)-$(hash(job.config.target.features))$(job.config.target.jlruntime ? "-jlrt" : "")"`
`37`	`37`	`uses_julia_runtime(job::CompilerJob{NativeCompilerTarget}) = job.config.target.jlruntime`
	`38`	`+can_vectorize(job::CompilerJob{NativeCompilerTarget}) = true`