|
1 | 1 | # LLVM IR optimization |
2 | 2 |
|
3 | | -function optimize!(@nospecialize(job::CompilerJob), mod::LLVM.Module; opt_level=2) |
4 | | - optimize_newpm!(job, mod; opt_level) |
5 | | - # TODO: clean up |
6 | | - return |
7 | | -end |
8 | | - |
9 | | - |
10 | | -## new pm |
11 | | - |
12 | | -function optimize_newpm!(@nospecialize(job::CompilerJob), mod::LLVM.Module; opt_level) |
| 3 | +function optimize!(@nospecialize(job::CompilerJob), mod::LLVM.Module; opt_level=1) |
13 | 4 | tm = llvm_machine(job.config.target) |
14 | 5 |
|
15 | 6 | global current_job |
@@ -292,279 +283,6 @@ function buildCleanupPipeline(mpm, @nospecialize(job::CompilerJob), opt_level) |
292 | 283 | end |
293 | 284 |
|
294 | 285 |
|
295 | | -## legacy pm |
296 | | - |
297 | | -function optimize_legacypm!(@nospecialize(job::CompilerJob), mod::LLVM.Module; opt_level) |
298 | | - triple = llvm_triple(job.config.target) |
299 | | - tm = llvm_machine(job.config.target) |
300 | | - |
301 | | - global current_job |
302 | | - current_job = job |
303 | | - |
304 | | - @dispose pm=ModulePassManager() begin |
305 | | - addTargetPasses!(pm, tm, triple) |
306 | | - addOptimizationPasses!(pm, opt_level) |
307 | | - run!(pm, mod) |
308 | | - end |
309 | | - |
310 | | - # NOTE: we need to use multiple distinct pass managers to force pass ordering; |
311 | | - # intrinsics should never get lowered before Julia has optimized them. |
312 | | - # XXX: why doesn't the barrier noop pass work here? |
313 | | - |
314 | | - # lower intrinsics |
315 | | - @dispose pm=ModulePassManager() begin |
316 | | - addTargetPasses!(pm, tm, triple) |
317 | | - |
318 | | - if !uses_julia_runtime(job) |
319 | | - lower_gc_frame!(pm) |
320 | | - end |
321 | | - |
322 | | - if job.config.kernel |
323 | | - # GC lowering is the last pass that may introduce calls to the runtime library, |
324 | | - # and thus additional uses of the kernel state intrinsic. |
325 | | - # TODO: now that all kernel state-related passes are being run here, merge some? |
326 | | - add_kernel_state!(pm) |
327 | | - lower_kernel_state!(pm) |
328 | | - cleanup_kernel_state!(pm) |
329 | | - end |
330 | | - |
331 | | - if !uses_julia_runtime(job) |
332 | | - # remove dead uses of ptls |
333 | | - aggressive_dce!(pm) |
334 | | - lower_ptls!(pm) |
335 | | - end |
336 | | - |
337 | | - if uses_julia_runtime(job) |
338 | | - lower_exc_handlers!(pm) |
339 | | - end |
340 | | - # the Julia GC lowering pass also has some clean-up that is required |
341 | | - late_lower_gc_frame!(pm) |
342 | | - if uses_julia_runtime(job) |
343 | | - final_lower_gc!(pm) |
344 | | - end |
345 | | - |
346 | | - remove_ni!(pm) |
347 | | - remove_julia_addrspaces!(pm) |
348 | | - |
349 | | - if uses_julia_runtime(job) |
350 | | - # We need these two passes and the instcombine below |
351 | | - # after GC lowering to let LLVM do some constant propagation on the tags |
352 | | - # and remove some unnecessary write barrier checks. |
353 | | - gvn!(pm) |
354 | | - sccp!(pm) |
355 | | - # Remove dead use of ptls |
356 | | - dce!(pm) |
357 | | - LLVM.Interop.lower_ptls!(pm, dump_native(job)) |
358 | | - instruction_combining!(pm) |
359 | | - # Clean up write barrier and ptls lowering |
360 | | - cfgsimplification!(pm) |
361 | | - end |
362 | | - |
363 | | - # Julia's operand bundles confuse the inliner, so repeat here now that they are gone. |
364 | | - # FIXME: we should fix the inliner so that inlined code gets optimized early-on |
365 | | - always_inliner!(pm) |
366 | | - |
367 | | - # some of Julia's optimization passes happen _after_ lowering intrinsics |
368 | | - combine_mul_add!(pm) |
369 | | - div_rem_pairs!(pm) |
370 | | - |
371 | | - run!(pm, mod) |
372 | | - end |
373 | | - |
374 | | - # target-specific optimizations |
375 | | - optimize_module!(job, mod) |
376 | | - |
377 | | - # we compile a module containing the entire call graph, |
378 | | - # so perform some interprocedural optimizations. |
379 | | - # |
380 | | - # for some reason, these passes need to be distinct from the regular optimization chain, |
381 | | - # or certain values (such as the constant arrays used to populate llvm.compiler.used as |
382 | | - # part of the LateLowerGCFrame pass) aren't collected properly. |
383 | | - # |
384 | | - # these might not always be safe, as Julia's IR metadata isn't designed for IPO. |
385 | | - @dispose pm=ModulePassManager() begin |
386 | | - addTargetPasses!(pm, tm, triple) |
387 | | - |
388 | | - # simplify function calls that don't use the returned value |
389 | | - dead_arg_elimination!(pm) |
390 | | - |
391 | | - run!(pm, mod) |
392 | | - end |
393 | | - |
394 | | - return |
395 | | -end |
396 | | - |
397 | | -function addTargetPasses!(pm, tm, triple) |
398 | | - add_library_info!(pm, triple) |
399 | | - add_transform_info!(pm, tm) |
400 | | -end |
401 | | - |
402 | | -# Based on Julia's optimization pipeline, minus the SLP and loop vectorizers. |
403 | | -function addOptimizationPasses!(pm, opt_level) |
404 | | - # compare with using Julia's optimization pipeline directly: |
405 | | - #ccall(:jl_add_optimization_passes, Cvoid, |
406 | | - # (LLVM.API.LLVMPassManagerRef, Cint, Cint), |
407 | | - # pm, opt_level, #=lower_intrinsics=# 0) |
408 | | - #return |
409 | | - |
410 | | - # NOTE: LLVM 12 disabled the hoisting of common instructions |
411 | | - # before loop vectorization (https://reviews.llvm.org/D84108). |
412 | | - # |
413 | | - # This is re-enabled with calls to cfg_simplify here, |
414 | | - # to merge allocations and sometimes eliminate them, |
415 | | - # since AllocOpt does not handle PhiNodes. |
416 | | - # Enable this instruction hoisting because of this and Union benchmarks. |
417 | | - |
418 | | - constant_merge!(pm) |
419 | | - |
420 | | - if opt_level < 2 |
421 | | - cpu_features!(pm) |
422 | | - if opt_level == 1 |
423 | | - instruction_simplify!(pm) |
424 | | - end |
425 | | - if LLVM.version() >= v"12" |
426 | | - cfgsimplification!(pm; hoist_common_insts=true) |
427 | | - else |
428 | | - cfgsimplification!(pm) |
429 | | - end |
430 | | - if opt_level == 1 |
431 | | - scalar_repl_aggregates!(pm) |
432 | | - instruction_combining!(pm) |
433 | | - early_cse!(pm) |
434 | | - # maybe add GVN? |
435 | | - # also try GVNHoist and GVNSink |
436 | | - end |
437 | | - mem_cpy_opt!(pm) |
438 | | - always_inliner!(pm) # Respect always_inline |
439 | | - lower_simdloop!(pm) # Annotate loop marked with "loopinfo" as LLVM parallel loop |
440 | | - return |
441 | | - end |
442 | | - |
443 | | - propagate_julia_addrsp!(pm) |
444 | | - scoped_no_alias_aa!(pm) |
445 | | - type_based_alias_analysis!(pm) |
446 | | - if opt_level >= 3 |
447 | | - basic_alias_analysis!(pm) |
448 | | - end |
449 | | - if LLVM.version() >= v"12" |
450 | | - cfgsimplification!(pm; hoist_common_insts=true) |
451 | | - else |
452 | | - cfgsimplification!(pm) |
453 | | - end |
454 | | - dce!(pm) |
455 | | - scalar_repl_aggregates!(pm) |
456 | | - |
457 | | - #mem_cpy_opt!(pm) |
458 | | - |
459 | | - always_inliner!(pm) # Respect always_inline |
460 | | - |
461 | | - # Running `memcpyopt` between this and `sroa` seems to give `sroa` a hard |
462 | | - # time merging the `alloca` for the unboxed data and the `alloca` created by |
463 | | - # the `alloc_opt` pass. |
464 | | - |
465 | | - alloc_opt!(pm) |
466 | | - # consider AggressiveInstCombinePass at optlevel > 2 |
467 | | - instruction_combining!(pm) |
468 | | - if LLVM.version() >= v"12" |
469 | | - cfgsimplification!(pm; hoist_common_insts=true) |
470 | | - else |
471 | | - cfgsimplification!(pm) |
472 | | - end |
473 | | - cpu_features!(pm) |
474 | | - scalar_repl_aggregates!(pm) |
475 | | - # SROA can duplicate PHI nodes which can block LowerSIMD |
476 | | - instruction_combining!(pm) |
477 | | - jump_threading!(pm) |
478 | | - correlated_value_propagation!(pm) |
479 | | - |
480 | | - reassociate!(pm) |
481 | | - |
482 | | - early_cse!(pm) |
483 | | - |
484 | | - # Load forwarding above can expose allocations that aren't actually used |
485 | | - # remove those before optimizing loops. |
486 | | - alloc_opt!(pm) |
487 | | - loop_rotate!(pm) |
488 | | - # moving IndVarSimplify here prevented removing the loop in perf_sumcartesian(10:-1:1) |
489 | | - |
490 | | - # LoopRotate strips metadata from terminator, so run LowerSIMD afterwards |
491 | | - lower_simdloop!(pm) # Annotate loop marked with "loopinfo" as LLVM parallel loop |
492 | | - licm!(pm) |
493 | | - julia_licm!(pm) |
494 | | - if LLVM.version() >= v"15" |
495 | | - simple_loop_unswitch_legacy!(pm) |
496 | | - else |
497 | | - # XXX: simple loop unswitch is available on older versions of LLVM too, |
498 | | - # but using this pass instead of the old one breaks Metal.jl. |
499 | | - loop_unswitch!(pm) |
500 | | - end |
501 | | - licm!(pm) |
502 | | - julia_licm!(pm) |
503 | | - inductive_range_check_elimination!(pm) |
504 | | - # Subsequent passes not stripping metadata from terminator |
505 | | - instruction_simplify!(pm) |
506 | | - loop_idiom!(pm) |
507 | | - ind_var_simplify!(pm) |
508 | | - loop_deletion!(pm) |
509 | | - loop_unroll!(pm) # TODO: in Julia createSimpleLoopUnroll |
510 | | - |
511 | | - # Run our own SROA on heap objects before LLVM's |
512 | | - alloc_opt!(pm) |
513 | | - # Re-run SROA after loop-unrolling (useful for small loops that operate |
514 | | - # over the structure of an aggregate) |
515 | | - scalar_repl_aggregates!(pm) |
516 | | - # might not be necessary: |
517 | | - instruction_simplify!(pm) |
518 | | - |
519 | | - gvn!(pm) |
520 | | - mem_cpy_opt!(pm) |
521 | | - sccp!(pm) |
522 | | - |
523 | | - # These next two passes must come before IRCE to eliminate the bounds check in #43308 |
524 | | - correlated_value_propagation!(pm) |
525 | | - dce!(pm) |
526 | | - |
527 | | - inductive_range_check_elimination!(pm) # Must come between the two GVN passes |
528 | | - |
529 | | - # Run instcombine after redundancy elimination to exploit opportunities |
530 | | - # opened up by them. |
531 | | - # This needs to be InstCombine instead of InstSimplify to allow |
532 | | - # loops over Union-typed arrays to vectorize. |
533 | | - instruction_combining!(pm) |
534 | | - jump_threading!(pm) |
535 | | - if opt_level >= 3 |
536 | | - gvn!(pm) # Must come after JumpThreading and before LoopVectorize |
537 | | - end |
538 | | - dead_store_elimination!(pm) |
539 | | - |
540 | | - # More dead allocation (store) deletion before loop optimization |
541 | | - # consider removing this: |
542 | | - alloc_opt!(pm) |
543 | | - # see if all of the constant folding has exposed more loops |
544 | | - # to simplification and deletion |
545 | | - # this helps significantly with cleaning up iteration |
546 | | - cfgsimplification!(pm) # See note above, don't hoist instructions before LV |
547 | | - loop_deletion!(pm) |
548 | | - instruction_combining!(pm) |
549 | | - loop_vectorize!(pm) |
550 | | - loop_load_elimination!(pm) |
551 | | - # Cleanup after LV pass |
552 | | - instruction_combining!(pm) |
553 | | - if LLVM.version() >= v"12" |
554 | | - cfgsimplification!(pm; # Aggressive CFG simplification |
555 | | - forward_switch_cond_to_phi=true, |
556 | | - convert_switch_to_lookup_table=true, |
557 | | - need_canonical_loop=true, |
558 | | - hoist_common_insts=true, |
559 | | - #sink_common_insts=true # FIXME: Causes assertion in llvm-late-lowering |
560 | | - ) |
561 | | - else |
562 | | - cfgsimplification!(pm) |
563 | | - end |
564 | | - |
565 | | - aggressive_dce!(pm) |
566 | | -end |
567 | | - |
568 | 286 |
|
569 | 287 | ## custom passes |
570 | 288 |
|
|
0 commit comments