Skip to content

Commit 40425c5

Browse files
AndreasPKMarge Bot
authored andcommitted
Aarch64 NCG: Use encoded immediates for literals.
Try to generate instr x2, <imm> instead of mov x1, lit instr x2, x1 When possible. This get's rid if quite a few redundant mov instructions. I believe this causes a metric decrease for LargeRecords as we reduce register pressure. ------------------------- Metric Decrease: LargeRecord -------------------------
1 parent 0bfc890 commit 40425c5

File tree

4 files changed

+174
-26
lines changed

4 files changed

+174
-26
lines changed

compiler/GHC/CmmToAsm/AArch64/CodeGen.hs

Lines changed: 154 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -372,6 +372,97 @@ getSomeReg expr = do
372372
Fixed rep reg code ->
373373
return (reg, rep, code)
374374

375+
{- Note [Aarch64 immediates]
376+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
377+
Aarch64 with it's fixed width instruction encoding uses leftover space for
378+
immediates.
379+
If you want the full rundown consult the arch reference document:
380+
"Arm® Architecture Reference Manual" - "C3.4 Data processing - immediate"
381+
382+
The gist of it is that different instructions allow for different immediate encodings.
383+
The ones we care about for better code generation are:
384+
385+
* Simple but potentially repeated bit-patterns for logic instructions.
386+
* 16bit numbers shifted by multiples of 16.
387+
* 12 bit numbers optionally shifted by 12 bits.
388+
389+
It might seem like the ISA allows for 64bit immediates but this isn't the case.
390+
Rather there are some instruction aliases which allow for large unencoded immediates
391+
which will then be transalted to one of the immediate encodings implicitly.
392+
393+
For example mov x1, #0x10000 is allowed but will be assembled to movz x1, #0x1, lsl #16
394+
-}
395+
396+
-- | Move (wide immediate)
397+
-- Allows for 16bit immediate which can be shifted by 0/16/32/48 bits.
398+
-- Used with MOVZ,MOVN, MOVK
399+
-- See Note [Aarch64 immediates]
400+
getMovWideImm :: Integer -> Width -> Maybe Operand
401+
getMovWideImm n w
402+
-- TODO: Handle sign extension/negatives
403+
| n <= 0
404+
= Nothing
405+
-- Fits in 16 bits
406+
| sized_n < 2^(16 :: Int)
407+
= Just $ OpImm (ImmInteger truncated)
408+
409+
-- 0x0000 0000 xxxx 0000
410+
| trailing_zeros >= 16 && sized_n < 2^(32 :: Int)
411+
= Just $ OpImmShift (ImmInteger $ truncated `shiftR` 16) SLSL 16
412+
413+
-- 0x 0000 xxxx 0000 0000
414+
| trailing_zeros >= 32 && sized_n < 2^(48 :: Int)
415+
= Just $ OpImmShift (ImmInteger $ truncated `shiftR` 32) SLSL 32
416+
417+
-- 0x xxxx 0000 0000 0000
418+
| trailing_zeros >= 48
419+
= Just $ OpImmShift (ImmInteger $ truncated `shiftR` 48) SLSL 48
420+
421+
| otherwise
422+
= Nothing
423+
where
424+
truncated = narrowU w n
425+
sized_n = fromIntegral truncated :: Word64
426+
trailing_zeros = countTrailingZeros sized_n
427+
428+
-- | Arithmetic(immediate)
429+
-- Allows for 12bit immediates which can be shifted by 0 or 12 bits.
430+
-- Used with ADD, ADDS, SUB, SUBS, CMP, CMN
431+
-- See Note [Aarch64 immediates]
432+
getArithImm :: Integer -> Width -> Maybe Operand
433+
getArithImm n w
434+
-- TODO: Handle sign extension
435+
| n <= 0
436+
= Nothing
437+
-- Fits in 16 bits
438+
-- Fits in 12 bits
439+
| sized_n < 2^(12::Int)
440+
= Just $ OpImm (ImmInteger truncated)
441+
442+
-- 12 bits shifted by 12 places.
443+
| trailing_zeros >= 12 && sized_n < 2^(24::Int)
444+
= Just $ OpImmShift (ImmInteger $ truncated `shiftR` 12) SLSL 12
445+
446+
| otherwise
447+
= Nothing
448+
where
449+
sized_n = fromIntegral truncated :: Word64
450+
truncated = narrowU w n
451+
trailing_zeros = countTrailingZeros sized_n
452+
453+
-- | Logical (immediate)
454+
-- Allows encoding of some repeated bitpatterns
455+
-- Used with AND, ANDS, EOR, ORR, TST
456+
-- and their aliases which includes at least MOV (bitmask immediate)
457+
-- See Note [Aarch64 immediates]
458+
getBitmaskImm :: Integer -> Width -> Maybe Operand
459+
getBitmaskImm n w
460+
| isAArch64Bitmask truncated = Just $ OpImm (ImmInteger truncated)
461+
| otherwise = Nothing
462+
where
463+
truncated = narrowU w n
464+
465+
375466
-- TODO OPT: we might be able give getRegister
376467
-- a hint, what kind of register we want.
377468
getFloatReg :: HasCallStack => CmmExpr -> NatM (Reg, Format, InstrBlock)
@@ -494,8 +585,14 @@ getRegister' config plat expr
494585
CmmLit lit
495586
-> case lit of
496587

497-
-- TODO handle CmmInt 0 specially, use wzr or xzr.
498-
588+
-- Use wzr xzr for CmmInt 0 if the width matches up, otherwise do a move.
589+
-- TODO: Reenable after https://gitlab.haskell.org/ghc/ghc/-/issues/23632 is fixed.
590+
-- CmmInt 0 W32 -> do
591+
-- let format = intFormat W32
592+
-- return (Fixed format reg_zero (unitOL $ (COMMENT ((text . show $ expr))) ))
593+
-- CmmInt 0 W64 -> do
594+
-- let format = intFormat W64
595+
-- return (Fixed format reg_zero (unitOL $ (COMMENT ((text . show $ expr))) ))
499596
CmmInt i W8 | i >= 0 -> do
500597
return (Any (intFormat W8) (\dst -> unitOL $ annExpr expr (MOV (OpReg W8 dst) (OpImm (ImmInteger (narrowU W8 i))))))
501598
CmmInt i W16 | i >= 0 -> do
@@ -510,8 +607,13 @@ getRegister' config plat expr
510607
-- Those need the upper bits set. We'd either have to explicitly sign
511608
-- or figure out something smarter. Lowered to
512609
-- `MOV dst XZR`
610+
CmmInt i w | i >= 0
611+
, Just imm_op <- getMovWideImm i w -> do
612+
return (Any (intFormat w) (\dst -> unitOL $ annExpr expr (MOVZ (OpReg w dst) imm_op)))
613+
513614
CmmInt i w | isNbitEncodeable 16 i, i >= 0 -> do
514615
return (Any (intFormat w) (\dst -> unitOL $ annExpr expr (MOV (OpReg W16 dst) (OpImm (ImmInteger i)))))
616+
515617
CmmInt i w | isNbitEncodeable 32 i, i >= 0 -> do
516618
let half0 = fromIntegral (fromIntegral i :: Word16)
517619
half1 = fromIntegral (fromIntegral (i `shiftR` 16) :: Word16)
@@ -586,7 +688,6 @@ getRegister' config plat expr
586688
(op, imm_code) <- litToImm' lit
587689
let rep = cmmLitType plat lit
588690
format = cmmTypeFormat rep
589-
-- width = typeWidth rep
590691
return (Any format (\dst -> imm_code `snocOL` LDR format (OpReg (formatToWidth format) dst) op))
591692

592693
CmmLabelOff lbl off -> do
@@ -791,17 +892,51 @@ getRegister' config plat expr
791892
-- withTempFloatReg w op = OpReg w <$> getNewRegNat (floatFormat w) >>= op
792893

793894
-- A "plain" operation.
794-
bitOp w op = do
895+
bitOpImm w op encode_imm = do
795896
-- compute x<m> <- x
796897
-- compute x<o> <- y
797898
-- <OP> x<n>, x<m>, x<o>
798899
(reg_x, format_x, code_x) <- getSomeReg x
799-
(reg_y, format_y, code_y) <- getSomeReg y
800-
massertPpr (isIntFormat format_x == isIntFormat format_y) $ text "bitOp: incompatible"
900+
(op_y, format_y, code_y) <- case y of
901+
CmmLit (CmmInt n w)
902+
| Just imm_operand_y <- encode_imm n w
903+
-> return (imm_operand_y, intFormat w, nilOL)
904+
_ -> do
905+
(reg_y, format_y, code_y) <- getSomeReg y
906+
return (OpReg w reg_y, format_y, code_y)
907+
massertPpr (isIntFormat format_x == isIntFormat format_y) $ text "bitOpImm: incompatible"
801908
return $ Any (intFormat w) (\dst ->
802909
code_x `appOL`
803910
code_y `appOL`
804-
op (OpReg w dst) (OpReg w reg_x) (OpReg w reg_y))
911+
op (OpReg w dst) (OpReg w reg_x) op_y)
912+
913+
-- A (potentially signed) integer operation.
914+
-- In the case of 8- and 16-bit signed arithmetic we must first
915+
-- sign-extend both arguments to 32-bits.
916+
-- See Note [Signed arithmetic on AArch64].
917+
intOpImm :: Bool -> Width -> (Operand -> Operand -> Operand -> OrdList Instr) -> (Integer -> Width -> Maybe Operand) -> NatM (Register)
918+
intOpImm {- is signed -} True w op _encode_imm = intOp True w op
919+
intOpImm False w op encode_imm = do
920+
-- compute x<m> <- x
921+
-- compute x<o> <- y
922+
-- <OP> x<n>, x<m>, x<o>
923+
(reg_x, format_x, code_x) <- getSomeReg x
924+
(op_y, format_y, code_y) <- case y of
925+
CmmLit (CmmInt n w)
926+
| Just imm_operand_y <- encode_imm n w
927+
-> return (imm_operand_y, intFormat w, nilOL)
928+
_ -> do
929+
(reg_y, format_y, code_y) <- getSomeReg y
930+
return (OpReg w reg_y, format_y, code_y)
931+
massertPpr (isIntFormat format_x && isIntFormat format_y) $ text "intOp: non-int"
932+
-- This is the width of the registers on which the operation
933+
-- should be performed.
934+
let w' = opRegWidth w
935+
return $ Any (intFormat w) $ \dst ->
936+
code_x `appOL`
937+
code_y `appOL`
938+
op (OpReg w' dst) (OpReg w' reg_x) (op_y) `appOL`
939+
truncateReg w' w dst -- truncate back to the operand's original width
805940

806941
-- A (potentially signed) integer operation.
807942
-- In the case of 8- and 16-bit signed arithmetic we must first
@@ -847,9 +982,9 @@ getRegister' config plat expr
847982
case op of
848983
-- Integer operations
849984
-- Add/Sub should only be Integer Options.
850-
MO_Add w -> intOp False w (\d x y -> unitOL $ annExpr expr (ADD d x y))
985+
MO_Add w -> intOpImm False w (\d x y -> unitOL $ annExpr expr (ADD d x y)) getArithImm
851986
-- TODO: Handle sub-word case
852-
MO_Sub w -> intOp False w (\d x y -> unitOL $ annExpr expr (SUB d x y))
987+
MO_Sub w -> intOpImm False w (\d x y -> unitOL $ annExpr expr (SUB d x y)) getArithImm
853988

854989
-- Note [CSET]
855990
-- ~~~~~~~~~~~
@@ -891,8 +1026,8 @@ getRegister' config plat expr
8911026

8921027
-- N.B. We needn't sign-extend sub-word size (in)equality comparisons
8931028
-- since we don't care about ordering.
894-
MO_Eq w -> bitOp w (\d x y -> toOL [ CMP x y, CSET d EQ ])
895-
MO_Ne w -> bitOp w (\d x y -> toOL [ CMP x y, CSET d NE ])
1029+
MO_Eq w -> bitOpImm w (\d x y -> toOL [ CMP x y, CSET d EQ ]) getArithImm
1030+
MO_Ne w -> bitOpImm w (\d x y -> toOL [ CMP x y, CSET d NE ]) getArithImm
8961031

8971032
-- Signed multiply/divide
8981033
MO_Mul w -> intOp True w (\d x y -> unitOL $ MUL d x y)
@@ -921,10 +1056,10 @@ getRegister' config plat expr
9211056
MO_S_Lt w -> intOp True w (\d x y -> toOL [ CMP x y, CSET d SLT ])
9221057

9231058
-- Unsigned comparisons
924-
MO_U_Ge w -> intOp False w (\d x y -> toOL [ CMP x y, CSET d UGE ])
925-
MO_U_Le w -> intOp False w (\d x y -> toOL [ CMP x y, CSET d ULE ])
926-
MO_U_Gt w -> intOp False w (\d x y -> toOL [ CMP x y, CSET d UGT ])
927-
MO_U_Lt w -> intOp False w (\d x y -> toOL [ CMP x y, CSET d ULT ])
1059+
MO_U_Ge w -> intOpImm False w (\d x y -> toOL [ CMP x y, CSET d UGE ]) getArithImm
1060+
MO_U_Le w -> intOpImm False w (\d x y -> toOL [ CMP x y, CSET d ULE ]) getArithImm
1061+
MO_U_Gt w -> intOpImm False w (\d x y -> toOL [ CMP x y, CSET d UGT ]) getArithImm
1062+
MO_U_Lt w -> intOpImm False w (\d x y -> toOL [ CMP x y, CSET d ULT ]) getArithImm
9281063

9291064
-- Floating point arithmetic
9301065
MO_F_Add w -> floatOp w (\d x y -> unitOL $ ADD d x y)
@@ -947,9 +1082,9 @@ getRegister' config plat expr
9471082
MO_F_Lt w -> floatCond w (\d x y -> toOL [ CMP x y, CSET d OLT ]) -- x < y <=> y >= x
9481083

9491084
-- Bitwise operations
950-
MO_And w -> bitOp w (\d x y -> unitOL $ AND d x y)
951-
MO_Or w -> bitOp w (\d x y -> unitOL $ ORR d x y)
952-
MO_Xor w -> bitOp w (\d x y -> unitOL $ EOR d x y)
1085+
MO_And w -> bitOpImm w (\d x y -> unitOL $ AND d x y) getBitmaskImm
1086+
MO_Or w -> bitOpImm w (\d x y -> unitOL $ ORR d x y) getBitmaskImm
1087+
MO_Xor w -> bitOpImm w (\d x y -> unitOL $ EOR d x y) getBitmaskImm
9531088
MO_Shl w -> intOp False w (\d x y -> unitOL $ LSL d x y)
9541089
MO_U_Shr w -> intOp False w (\d x y -> unitOL $ LSR d x y)
9551090
MO_S_Shr w -> intOp True w (\d x y -> unitOL $ ASR d x y)
@@ -999,7 +1134,7 @@ getRegister' config plat expr
9991134

10001135
where
10011136
isNbitEncodeable :: Int -> Integer -> Bool
1002-
isNbitEncodeable n i = let shift = n - 1 in (-1 `shiftL` shift) <= i && i < (1 `shiftL` shift)
1137+
isNbitEncodeable n_bits i = let shift = n_bits - 1 in (-1 `shiftL` shift) <= i && i < (1 `shiftL` shift)
10031138

10041139
-- N.B. MUL does not set the overflow flag.
10051140
do_mul_may_oflo :: Width -> CmmExpr -> CmmExpr -> NatM Register

compiler/GHC/CmmToAsm/AArch64/Instr.hs

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ regUsageOfInstr platform instr = case instr of
110110
LSR dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst)
111111
MOV dst src -> usage (regOp src, regOp dst)
112112
MOVK dst src -> usage (regOp src, regOp dst)
113+
MOVZ dst src -> usage (regOp src, regOp dst)
113114
MVN dst src -> usage (regOp src, regOp dst)
114115
ORR dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst)
115116
ROR dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst)
@@ -251,6 +252,7 @@ patchRegsOfInstr instr env = case instr of
251252
LSR o1 o2 o3 -> LSR (patchOp o1) (patchOp o2) (patchOp o3)
252253
MOV o1 o2 -> MOV (patchOp o1) (patchOp o2)
253254
MOVK o1 o2 -> MOVK (patchOp o1) (patchOp o2)
255+
MOVZ o1 o2 -> MOVZ (patchOp o1) (patchOp o2)
254256
MVN o1 o2 -> MVN (patchOp o1) (patchOp o2)
255257
ORR o1 o2 o3 -> ORR (patchOp o1) (patchOp o2) (patchOp o3)
256258
ROR o1 o2 o3 -> ROR (patchOp o1) (patchOp o2) (patchOp o3)
@@ -381,9 +383,8 @@ mkSpillInstr config reg delta slot =
381383
where
382384
a .&~. b = a .&. (complement b)
383385

384-
fmt = case reg of
385-
RegReal (RealRegSingle n) | n < 32 -> II64
386-
_ -> FF64
386+
fmt = fmtOfRealReg (case reg of { RegReal r -> r; _ -> panic "Expected real reg"})
387+
387388
mkIp0SpillAddr imm = ANN (text "Spill: IP0 <- SP + " <> int imm) $ ADD ip0 sp (OpImm (ImmInt imm))
388389
mkStrSp imm = ANN (text "Spill@" <> int (off - delta)) $ STR fmt (OpReg W64 reg) (OpAddr (AddrRegImm (regSingle 31) (ImmInt imm)))
389390
mkStrIp0 imm = ANN (text "Spill@" <> int (off - delta)) $ STR fmt (OpReg W64 reg) (OpAddr (AddrRegImm (regSingle 16) (ImmInt imm)))
@@ -408,9 +409,7 @@ mkLoadInstr config reg delta slot =
408409
where
409410
a .&~. b = a .&. (complement b)
410411

411-
fmt = case reg of
412-
RegReal (RealRegSingle n) | n < 32 -> II64
413-
_ -> FF64
412+
fmt = fmtOfRealReg (case reg of { RegReal r -> r; _ -> panic "Expected real reg"})
414413

415414
mkIp0SpillAddr imm = ANN (text "Reload: IP0 <- SP + " <> int imm) $ ADD ip0 sp (OpImm (ImmInt imm))
416415
mkLdrSp imm = ANN (text "Reload@" <> int (off - delta)) $ LDR fmt (OpReg W64 reg) (OpAddr (AddrRegImm (regSingle 31) (ImmInt imm)))
@@ -619,7 +618,7 @@ data Instr
619618
| MOV Operand Operand -- rd = rn or rd = #i
620619
| MOVK Operand Operand
621620
-- | MOVN Operand Operand
622-
-- | MOVZ Operand Operand
621+
| MOVZ Operand Operand
623622
| MVN Operand Operand -- rd = ~rn
624623
| ORN Operand Operand Operand -- rd = rn | ~op2
625624
| ORR Operand Operand Operand -- rd = rn | op2
@@ -708,6 +707,7 @@ instrCon i =
708707
LSR{} -> "LSR"
709708
MOV{} -> "MOV"
710709
MOVK{} -> "MOVK"
710+
MOVZ{} -> "MOVZ"
711711
MVN{} -> "MVN"
712712
ORN{} -> "ORN"
713713
ORR{} -> "ORR"
@@ -782,6 +782,9 @@ wzr = OpReg W32 (RegReal (RealRegSingle (-1)))
782782
sp = OpReg W64 (RegReal (RealRegSingle 31))
783783
ip0 = OpReg W64 (RegReal (RealRegSingle 16))
784784

785+
reg_zero :: Reg
786+
reg_zero = RegReal (RealRegSingle (-1))
787+
785788
_x :: Int -> Operand
786789
_x i = OpReg W64 (RegReal (RealRegSingle i))
787790
x0, x1, x2, x3, x4, x5, x6, x7 :: Operand

compiler/GHC/CmmToAsm/AArch64/Ppr.hs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,7 @@ pprInstr platform instr = case instr of
417417
| isFloatOp o1 || isFloatOp o2 -> op2 (text "\tfmov") o1 o2
418418
| otherwise -> op2 (text "\tmov") o1 o2
419419
MOVK o1 o2 -> op2 (text "\tmovk") o1 o2
420+
MOVZ o1 o2 -> op2 (text "\tmovz") o1 o2
420421
MVN o1 o2 -> op2 (text "\tmvn") o1 o2
421422
ORN o1 o2 o3 -> op3 (text "\torn") o1 o2 o3
422423
ORR o1 o2 o3 -> op3 (text "\torr") o1 o2 o3

compiler/GHC/CmmToAsm/AArch64/Regs.hs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ litToImm (CmmInt i w) = ImmInteger (narrowS w i)
7777
-- narrow to the width: a CmmInt might be out of
7878
-- range, but we assume that ImmInteger only contains
7979
-- in-range values. A signed value should be fine here.
80+
-- AK: We do call this with out of range values, however
81+
-- it just truncates as we would expect.
8082
litToImm (CmmFloat f W32) = ImmFloat f
8183
litToImm (CmmFloat f W64) = ImmDouble f
8284
litToImm (CmmLabel l) = ImmCLbl l
@@ -147,6 +149,13 @@ classOfRealReg (RealRegSingle i)
147149
| i < 32 = RcInteger
148150
| otherwise = RcDouble
149151

152+
fmtOfRealReg :: RealReg -> Format
153+
fmtOfRealReg real_reg =
154+
case classOfRealReg real_reg of
155+
RcInteger -> II64
156+
RcDouble -> FF64
157+
RcFloat -> panic "No float regs on arm"
158+
150159
regDotColor :: RealReg -> SDoc
151160
regDotColor reg
152161
= case classOfRealReg reg of

0 commit comments

Comments
 (0)