Merge pull request #230 from JuliaGPU/tb/spirv

maleadt · web-flow · commit 0cfb52f53f04 · 2021-08-04T11:10:49.000+02:00
More fixes for the SPIR-V backend.
diff --git a/src/optim.jl b/src/optim.jl
@@ -1,46 +1,166 @@
 # LLVM IR optimization
 
-function optimize!(@nospecialize(job::CompilerJob), mod::LLVM.Module)
-    tm = llvm_machine(job.target)
+# Register target information with the pass manager `pm`: library info for
+# `triple` and transform info for `tm` (the machine the caller builds from the
+# job's target).
+function addTargetPasses!(pm, tm, triple)
+    add_library_info!(pm, triple)
+    add_transform_info!(pm, tm)
+end
+
+# Based on Julia's optimization pipeline, minus the SLP and loop vectorizers.
+#
+# Adds optimization passes to `pm` depending on `opt_level`: levels below 2 get
+# a short pipeline and return early; level 2 and up get the full pipeline, with
+# `basic_alias_analysis!` only added at level 3 and up.
+function addOptimizationPasses!(pm, opt_level=2)
+    # compare with using Julia's optimization pipeline directly:
+    #ccall(:jl_add_optimization_passes, Cvoid,
+    #      (LLVM.API.LLVMPassManagerRef, Cint, Cint),
+    #      pm, opt_level, #=lower_intrinsics=# 0)
+    #return
+
+    constant_merge!(pm)
+
+    if opt_level < 2
+        cfgsimplification!(pm)
+        if opt_level == 1
+            scalar_repl_aggregates!(pm)
+            instruction_combining!(pm)
+            early_cse!(pm)
+            # maybe add GVN?
+            # also try GVNHoist and GVNSink
+        end
+        mem_cpy_opt!(pm)
+        always_inliner!(pm) # Respect always_inline
+        lower_simdloop!(pm) # Annotate loop marked with "loopinfo" as LLVM parallel loop
+        return
+    end
 
-    function initialize!(pm)
-        add_library_info!(pm, triple(mod))
-        add_transform_info!(pm, tm)
+    propagate_julia_addrsp!(pm)
+    scoped_no_alias_aa!(pm)
+    type_based_alias_analysis!(pm)
+    if opt_level >= 3
+        basic_alias_analysis!(pm)
     end
+    cfgsimplification!(pm)
+    dce!(pm)
+    scalar_repl_aggregates!(pm)
+
+    #mem_cpy_opt!(pm)
+
+    always_inliner!(pm) # Respect always_inline
+
+    # Running `memcpyopt` between this and `sroa` seems to give `sroa` a hard
+    # time merging the `alloca` for the unboxed data and the `alloca` created by
+    # the `alloc_opt` pass.
+
+    alloc_opt!(pm)
+    # consider AggressiveInstCombinePass at optlevel > 2
+    instruction_combining!(pm)
+    cfgsimplification!(pm)
+    scalar_repl_aggregates!(pm)
+    instruction_simplify!(pm)
+    jump_threading!(pm)
+    correlated_value_propagation!(pm)
+
+    reassociate!(pm)
+
+    early_cse!(pm)
+
+    # Load forwarding above can expose allocations that aren't actually used;
+    # remove those before optimizing loops.
+    alloc_opt!(pm)
+    loop_rotate!(pm)
+    # moving IndVarSimplify here prevented removing the loop in perf_sumcartesian(10:-1:1)
+    loop_idiom!(pm)
+
+    # LoopRotate strips metadata from terminator, so run LowerSIMD afterwards
+    lower_simdloop!(pm) # Annotate loop marked with "loopinfo" as LLVM parallel loop
+    licm!(pm)
+    julia_licm!(pm)
+    loop_unswitch!(pm)
+    licm!(pm)
+    julia_licm!(pm)
+    # Subsequent passes not stripping metadata from terminator
+    instruction_simplify!(pm)
+    ind_var_simplify!(pm)
+    loop_deletion!(pm)
+    loop_unroll!(pm) # TODO: in Julia createSimpleLoopUnroll
+
+    # Run our own SROA on heap objects before LLVM's
+    alloc_opt!(pm)
+    # Re-run SROA after loop-unrolling (useful for small loops that operate
+    # over the structure of an aggregate)
+    scalar_repl_aggregates!(pm)
+    # might not be necessary:
+    instruction_simplify!(pm)
+
+    gvn!(pm)
+    mem_cpy_opt!(pm)
+    sccp!(pm)
+
+    # Run instcombine after redundancy elimination to exploit opportunities
+    # opened up by them.
+    # This needs to be InstCombine instead of InstSimplify to allow
+    # loops over Union-typed arrays to vectorize.
+    instruction_combining!(pm)
+    jump_threading!(pm)
+    correlated_value_propagation!(pm)
+    dead_store_elimination!(pm)
+
+    # More dead allocation (store) deletion before loop optimization
+    # consider removing this:
+    alloc_opt!(pm)
+    # see if all of the constant folding has exposed more loops
+    # to simplification and deletion
+    # this helps significantly with cleaning up iteration
+    cfgsimplification!(pm)  # See note above, don't hoist instructions before LV
+    loop_deletion!(pm)
+    instruction_combining!(pm)
+    loop_vectorize!(pm) # NOTE(review): the header says the loop vectorizer is left out, yet it is added here; confirm which is intended
+    loop_load_elimination!(pm)
+    # Cleanup after LV pass
+    cfgsimplification!(pm)
+    # TODO: aggressive CFG simplification options
+
+    aggressive_dce!(pm)
+end
+
+function optimize!(@nospecialize(job::CompilerJob), mod::LLVM.Module)
+    triple = llvm_triple(job.target)
+    tm = llvm_machine(job.target)
 
     global current_job
     current_job = job
 
-    # Julia-specific optimizations
-    #
-    # NOTE: we need to use multiple distinct pass managers to force pass ordering;
-    #       intrinsics should never get lowered before Julia has optimized them.
-
     ModulePassManager() do pm
-        initialize!(pm)
-        ccall(:jl_add_optimization_passes, Cvoid,
-                (LLVM.API.LLVMPassManagerRef, Cint, Cint),
-                pm, Base.JLOptions().opt_level, #=lower_intrinsics=# 0)
+        addTargetPasses!(pm, tm, triple)
+        addOptimizationPasses!(pm)
         run!(pm, mod)
     end
 
+    # NOTE: we need to use multiple distinct pass managers to force pass ordering;
+    #       intrinsics should never get lowered before Julia has optimized them.
+    # XXX: why doesn't the barrier noop pass work here?
+
+    # lower intrinsics
     ModulePassManager() do pm
-        initialize!(pm)
+        addTargetPasses!(pm, tm, triple)
 
-        # lower intrinsics
         add!(pm, FunctionPass("LowerGCFrame", lower_gc_frame!))
-        aggressive_dce!(pm) # remove dead uses of ptls
+
+        # remove dead uses of ptls
+        aggressive_dce!(pm)
         add!(pm, ModulePass("LowerPTLS", lower_ptls!))
 
         # the Julia GC lowering pass also has some clean-up that is required
         late_lower_gc_frame!(pm)
 
+        remove_ni!(pm)
         remove_julia_addrspaces!(pm)
 
         # Julia's operand bundles confuse the inliner, so repeat here now they are gone.
         # FIXME: we should fix the inliner so that inlined code gets optimized early-on
         always_inliner!(pm)
 
+        # some of Julia's optimization passes happen _after_ lowering intrinsics
+        combine_mul_add!(pm)
+        div_rem_pairs!(pm)
+
         run!(pm, mod)
     end
 
@@ -56,7 +176,7 @@ function optimize!(@nospecialize(job::CompilerJob), mod::LLVM.Module)
     #
     # these might not always be safe, as Julia's IR metadata isn't designed for IPO.
     ModulePassManager() do pm
-        initialize!(pm)
+        addTargetPasses!(pm, tm, triple)
 
         dead_arg_elimination!(pm)   # parent doesn't use return value --> ret void
 
diff --git a/src/spirv.jl b/src/spirv.jl
@@ -57,48 +57,73 @@ function finish_module!(job::CompilerJob{SPIRVCompilerTarget}, mod::LLVM.Module)
     # (OpKill is only available in fragment execution mode)
     ModulePassManager() do pm
         add!(pm, ModulePass("RemoveTrap", rm_trap!))
+        add!(pm, ModulePass("RemoveFreeze", rm_freeze!))
         run!(pm, mod)
     end
 end
 
 @unlocked function mcgen(job::CompilerJob{SPIRVCompilerTarget}, mod::LLVM.Module,
                          format=LLVM.API.LLVMAssemblyFile)
-    # write the bitcode to a temporary file (the SPIRV Translator library doesn't have a C API)
-    mktemp() do input, input_io
-        write(input_io, mod)
-        flush(input_io)
+    # Translate `mod` to SPIR-V using the external `llvm-spirv` translator and
+    # return the result: the raw contents of the translated file when `format` is
+    # `LLVM.API.LLVMObjectFile`, otherwise its `spirv-dis` disassembly as a `String`.
+    # The temporary files are only removed on success, so that they can be attached
+    # to a bug report when one of the tools fails.
+    # NOTE: debug info is stripped from `mod` in place.
+
+    # The SPIRV Tools don't handle Julia's debug info, rejecting DW_LANG_Julia...
+    strip_debuginfo!(mod)
+
+    # translate to SPIR-V
+    input = tempname(cleanup=false) * ".bc"
+    translated = tempname(cleanup=false) * ".spv"
+    write(input, mod)
+    SPIRV_LLVM_Translator_jll.llvm_spirv() do translator
+        proc = run(ignorestatus(`$translator --spirv-debug-info-version=ocl-100 -o $translated $input`))
+        if !success(proc)
+            error("""Failed to translate LLVM code to SPIR-V.
+                     If you think this is a bug, please file an issue and attach $(input).""")
+        end
+    end
 
-        # compile to SPIR-V
-        mktemp() do output, output_io
-            SPIRV_LLVM_Translator_jll.llvm_spirv() do translator
-                cmd = `$translator`
-                if format == LLVM.API.LLVMAssemblyFile
-                    cmd = `$cmd -spirv-text`
-                end
-                cmd = `$cmd --spirv-debug-info-version=ocl-100 -o $output $input`
-                run(cmd)
+    # validate
+    # XXX: parameterize this on the `validate` driver argument
+    # XXX: our code currently doesn't pass the validator
+    if Base.JLOptions().debug_level >= 2 && false
+        SPIRV_Tools_jll.spirv_val() do validator
+            proc = run(ignorestatus(`$validator $translated`))
+            if !success(proc)
+                error("""Failed to validate generated SPIR-V.
+                         If you think this is a bug, please file an issue and attach $(input) and $(translated).""")
             end
+        end
+    end
 
-            # read back the file
-            if format == LLVM.API.LLVMAssemblyFile
-                read(output_io, String)
-            else
-                read(output_io)
+    # optimize
+    # XXX: parameterize this on the `optimize` driver argument
+    # XXX: the optimizer segfaults on some of our code
+    optimized = tempname(cleanup=false) * ".spv"
+    if false
+        SPIRV_Tools_jll.spirv_opt() do optimizer
+            proc = run(ignorestatus(`$optimizer -O --skip-validation $translated -o $optimized`))
+            if !success(proc)
+                error("""Failed to optimize generated SPIR-V.
+                         If you think this is a bug, please file an issue and attach $(input) and $(translated).""")
             end
         end
     end
+
+    output = if format == LLVM.API.LLVMObjectFile
+        read(translated)
+    else
+        # disassemble
+        # NOTE: the optimizer above is disabled, so the `optimized` file is never
+        #       written; disassemble the translated module, which is also what the
+        #       object-file branch returns.
+        SPIRV_Tools_jll.spirv_dis() do disassembler
+            read(`$disassembler $translated`, String)
+        end
+    end
+
+    rm(input)
+    rm(translated)
+    #rm(optimized)
+
+    return output
 end
 
 # reimplementation that uses `spirv-dis`, giving much more pleasant output
 function code_native(io::IO, job::CompilerJob{SPIRVCompilerTarget}; raw::Bool=false, dump_module::Bool=false)
-    if raw
-        # The SPIRV Tools don't handle Julia's debug info, rejecting DW_LANG_Julia...
-        # so just return what LLVM gives us in that case (which is also more faithful).
-        asm, _ = codegen(:asm, job; strip=false, only_entry=!dump_module, validate=false)
-        print(io, asm)
-        return
-    end
-
     obj, _ = codegen(:obj, job; strip=!raw, only_entry=!dump_module, validate=false)
     mktemp() do input_path, input_io
         write(input_io, obj)
@@ -146,6 +171,27 @@ function rm_trap!(mod::LLVM.Module)
     return changed
 end
 
+# strip `freeze` instructions, forwarding the frozen operand to every user
+# (KhronosGroup/SPIRV-LLVM-Translator#1140)
+function rm_freeze!(mod::LLVM.Module)
+    job = current_job::CompilerJob
+    changed = false
+    @timeit_debug to "remove freeze" begin
+
+    for f in functions(mod)
+        for bb in blocks(f)
+            for inst in instructions(bb)
+                isa(inst, LLVM.FreezeInst) || continue
+
+                # the frozen value is the instruction's first operand
+                frozen = first(operands(inst))
+                replace_uses!(inst, frozen)
+
+                # nothing may refer to the freeze anymore before it is erased
+                @compiler_assert isempty(uses(inst)) job
+                unsafe_delete!(bb, inst)
+                changed = true
+            end
+        end
+    end
+
+    end
+    return changed
+end
+
 # wrap byval pointers in a single-value struct
 function wrap_byval(@nospecialize(job::CompilerJob), mod::LLVM.Module, entry_f::LLVM.Function)
     ctx = context(mod)