Resolving issues #20645 and #26109

recursion-ninja · bgamari · commit 090c800e38ee · 2025-08-18T10:10:15.000-04:00
Correctly sign extending and casting smaller bit width types for LLVM operations: - bitReverse8# - bitReverse16# - bitReverse32# - byteSwap16# - byteSwap32# - pdep8# - pdep16# - pext8# - pext16# (cherry picked from commit 706d33e)
diff --git a/compiler/GHC/Builtin/primops.txt.pp b/compiler/GHC/Builtin/primops.txt.pp
@@ -148,6 +148,7 @@
    vector           = []
    deprecated_msg   = {}      -- A non-empty message indicates deprecation
    div_like         = False   -- Second argument expected to be non zero - used for tests
+   defined_bits     = Nothing -- The number of bits the operation is defined for (if not all bits)
 
 -- Note [When do out-of-line primops go in primops.txt.pp]
 -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -1065,19 +1066,24 @@
 
 primop   BSwap16Op   "byteSwap16#"   GenPrimOp   Word# -> Word#
     {Swap bytes in the lower 16 bits of a word. The higher bytes are undefined. }
+    with defined_bits = 16
 primop   BSwap32Op   "byteSwap32#"   GenPrimOp   Word# -> Word#
     {Swap bytes in the lower 32 bits of a word. The higher bytes are undefined. }
+    with defined_bits = 32
 primop   BSwap64Op   "byteSwap64#"   GenPrimOp   Word64# -> Word64#
     {Swap bytes in a 64 bits of a word.}
 primop   BSwapOp     "byteSwap#"     GenPrimOp   Word# -> Word#
     {Swap bytes in a word.}
 
 primop   BRev8Op    "bitReverse8#"   GenPrimOp   Word# -> Word#
     {Reverse the order of the bits in a 8-bit word.}
+    with defined_bits = 8
 primop   BRev16Op   "bitReverse16#"   GenPrimOp   Word# -> Word#
     {Reverse the order of the bits in a 16-bit word.}
+    with defined_bits = 16
 primop   BRev32Op   "bitReverse32#"   GenPrimOp   Word# -> Word#
     {Reverse the order of the bits in a 32-bit word.}
+    with defined_bits = 32
 primop   BRev64Op   "bitReverse64#"   GenPrimOp   Word64# -> Word64#
     {Reverse the order of the bits in a 64-bit word.}
 primop   BRevOp     "bitReverse#"     GenPrimOp   Word# -> Word#
diff --git a/compiler/GHC/CmmToLlvm/CodeGen.hs b/compiler/GHC/CmmToLlvm/CodeGen.hs
@@ -230,23 +230,22 @@ genCall t@(PrimTarget (MO_Prefetch_Data localityInt)) [] args
     statement $ Expr $ Call StdCall fptr (argVars' ++ argSuffix) []
   | otherwise = panic $ "prefetch locality level integer must be between 0 and 3, given: " ++ (show localityInt)
 
--- Handle PopCnt, Clz, Ctz, and BSwap that need to only convert arg
--- and return types
-genCall t@(PrimTarget (MO_PopCnt w)) dsts args =
-    genCallSimpleCast w t dsts args
-
-genCall t@(PrimTarget (MO_Pdep w)) dsts args =
-    genCallSimpleCast2 w t dsts args
-genCall t@(PrimTarget (MO_Pext w)) dsts args =
-    genCallSimpleCast2 w t dsts args
-genCall t@(PrimTarget (MO_Clz w)) dsts args =
-    genCallSimpleCast w t dsts args
-genCall t@(PrimTarget (MO_Ctz w)) dsts args =
-    genCallSimpleCast w t dsts args
-genCall t@(PrimTarget (MO_BSwap w)) dsts args =
-    genCallSimpleCast w t dsts args
-genCall t@(PrimTarget (MO_BRev w)) dsts args =
-    genCallSimpleCast w t dsts args
+-- Handle Clz, Ctz, BRev, BSwap, Pdep, Pext, and PopCnt that need to only
+-- convert arg and return types
+genCall (PrimTarget op@(MO_Clz w)) [dst] args =
+    genCallSimpleCast w op dst args
+genCall (PrimTarget op@(MO_Ctz w)) [dst] args =
+    genCallSimpleCast w op dst args
+genCall (PrimTarget op@(MO_BRev w)) [dst] args =
+    genCallSimpleCast w op dst args
+genCall (PrimTarget op@(MO_BSwap w)) [dst] args =
+    genCallSimpleCast w op dst args
+genCall (PrimTarget op@(MO_Pdep w)) [dst] args =
+    genCallSimpleCast w op dst args
+genCall (PrimTarget op@(MO_Pext w)) [dst] args =
+    genCallSimpleCast w op dst args
+genCall (PrimTarget op@(MO_PopCnt w)) [dst] args =
+    genCallSimpleCast w op dst args
 
 genCall (PrimTarget (MO_AtomicRMW width amop)) [dst] [addr, n] = runStmtsDecls $ do
     addrVar <- exprToVarW addr
@@ -640,63 +639,28 @@ genCallExtract _ _ _ _ =
 -- since GHC only really has i32 and i64 types and things like Word8 are backed
 -- by an i32 and just present a logical i8 range. So we must handle conversions
 -- from i32 to i8 explicitly as LLVM is strict about types.
-genCallSimpleCast :: Width -> ForeignTarget -> [CmmFormal] -> [CmmActual]
-              -> LlvmM StmtData
-genCallSimpleCast w t@(PrimTarget op) [dst] args = do
-    let width = widthToLlvmInt w
-        dstTy = cmmToLlvmType $ localRegType dst
-
-    fname                       <- cmmPrimOpFunctions op
-    (fptr, _, top3)             <- getInstrinct fname width [width]
-
-    (dstV, _dst_ty)             <- getCmmReg (CmmLocal dst)
-
-    let (_, arg_hints) = foreignTargetHints t
-    let args_hints = zip args arg_hints
-    (argsV, stmts2, top2)       <- arg_vars args_hints ([], nilOL, [])
-    (argsV', stmts4)            <- castVars Signed $ zip argsV [width]
-    (retV, s1)                  <- doExpr width $ Call StdCall fptr argsV' []
-    (retVs', stmts5)            <- castVars (cmmPrimOpRetValSignage op) [(retV,dstTy)]
-    let retV'                    = singletonPanic "genCallSimpleCast" retVs'
-    let s2                       = Store retV' dstV Nothing []
-
-    let stmts = stmts2 `appOL` stmts4 `snocOL`
-                s1 `appOL` stmts5 `snocOL` s2
-    return (stmts, top2 ++ top3)
-genCallSimpleCast _ _ dsts _ =
-    panic ("genCallSimpleCast: " ++ show (length dsts) ++ " dsts")
-
--- Handle simple function call that only need simple type casting, of the form:
---   truncate arg >>= \a -> call(a) >>= zext
---
--- since GHC only really has i32 and i64 types and things like Word8 are backed
--- by an i32 and just present a logical i8 range. So we must handle conversions
--- from i32 to i8 explicitly as LLVM is strict about types.
-genCallSimpleCast2 :: Width -> ForeignTarget -> [CmmFormal] -> [CmmActual]
-              -> LlvmM StmtData
-genCallSimpleCast2 w t@(PrimTarget op) [dst] args = do
-    let width = widthToLlvmInt w
-        dstTy = cmmToLlvmType $ localRegType dst
-
-    fname                       <- cmmPrimOpFunctions op
-    (fptr, _, top3)             <- getInstrinct fname width (const width <$> args)
-
-    (dstV, _dst_ty)             <- getCmmReg (CmmLocal dst)
-
-    let (_, arg_hints) = foreignTargetHints t
-    let args_hints = zip args arg_hints
-    (argsV, stmts2, top2)       <- arg_vars args_hints ([], nilOL, [])
-    (argsV', stmts4)            <- castVars Signed $ zip argsV (const width <$> argsV)
-    (retV, s1)                  <- doExpr width $ Call StdCall fptr argsV' []
-    (retVs', stmts5)             <- castVars (cmmPrimOpRetValSignage op) [(retV,dstTy)]
-    let retV'                    = singletonPanic "genCallSimpleCast2" retVs'
-    let s2                       = Store retV' dstV Nothing []
+genCallSimpleCast :: Width -> CallishMachOp -> CmmFormal -> [CmmActual]
+                  -> LlvmM StmtData
+genCallSimpleCast specW op dst args = do
+    let width   = widthToLlvmInt specW
+        argsW   = const width <$> args
+        dstType = cmmToLlvmType $ localRegType dst
+        signage = cmmPrimOpRetValSignage op
+
+    fname                 <- cmmPrimOpFunctions op
+    (fptr, _, top3)       <- getInstrinct fname width argsW
+    (dstV, _dst_ty)       <- getCmmReg (CmmLocal dst)
+    let (_, arg_hints)     = foreignTargetHints $ PrimTarget op
+    let args_hints         = zip args arg_hints
+    (argsV, stmts2, top2) <- arg_vars args_hints ([], nilOL, [])
+    (argsV', stmts4)      <- castVars signage $ zip argsV argsW
+    (retV, s1)            <- doExpr width $ Call StdCall fptr argsV' []
+    (retV', stmts5)       <- castVar signage retV dstType
+    let s2                 = Store retV' dstV Nothing []
 
     let stmts = stmts2 `appOL` stmts4 `snocOL`
-                s1 `appOL` stmts5 `snocOL` s2
+                s1 `snocOL` stmts5 `snocOL` s2
     return (stmts, top2 ++ top3)
-genCallSimpleCast2 _ _ dsts _ =
-    panic ("genCallSimpleCast2: " ++ show (length dsts) ++ " dsts")
 
 -- | Create a function pointer from a target.
 getFunPtrW :: (LMString -> LlvmType) -> ForeignTarget
@@ -811,11 +775,47 @@ castVar signage v t | getVarType v == t
             Signed      -> LM_Sext
             Unsigned    -> LM_Zext
 
-
 cmmPrimOpRetValSignage :: CallishMachOp -> Signage
 cmmPrimOpRetValSignage mop = case mop of
-    MO_Pdep _   -> Unsigned
-    MO_Pext _   -> Unsigned
+    -- Some bit-wise operations /must/ always treat the input and output values
+    -- as 'Unsigned' in order to return the expected result values when pre/post-
+    -- operation bit-width truncation and/or extension occur. For example,
+    -- consider the Bit-Reverse operation:
+    --
+    -- If the result of a Bit-Reverse is treated as signed,
+    -- an positive input can result in an negative output, i.e.:
+    --
+    --   identity(0x03) = 0x03 = 00000011
+    --   breverse(0x03) = 0xC0 = 11000000
+    --
+    -- Now if an extension is performed after the operation to
+    -- promote a smaller bit-width value into a larger bit-width
+    -- type, it is expected that the /bit-wise/ operations will
+    -- not be treated /numerically/ as signed.
+    --
+    -- To illustrate the difference, consider how a signed extension
+    -- for the type i16 to i32 differs for out values above:
+    --   ext_zeroed(i32, breverse(0x03)) = 0x00C0 = 0000000011000000
+    --   ext_signed(i32, breverse(0x03)) = 0xFFC0 = 1111111111000000
+    --
+    -- Here we can see that the former output is the expected result
+    -- of a bit-wise operation which needs to be promoted to a larger
+    -- bit-width type. The latter output is not desirable when we must
+    -- constraining a value into a range of i16 within an i32 type.
+    --
+    -- Hence we always treat the "signage" as unsigned for Bit-Reverse!
+    --
+    -- The same reasoning applied to Bit-Reverse above applies to the other
+    -- bit-wise operations; do not sign extend a possibly negated number!
+    MO_BRev   _ -> Unsigned
+    MO_BSwap  _ -> Unsigned
+    MO_Clz    _ -> Unsigned
+    MO_Ctz    _ -> Unsigned
+    MO_Pdep   _ -> Unsigned
+    MO_Pext   _ -> Unsigned
+    MO_PopCnt _ -> Unsigned
+
+    -- All other cases, default to preserving the numeric sign when extending.
     _           -> Signed
 
 -- | Decide what C function to use to implement a CallishMachOp
diff --git a/libraries/ghc-internal/cbits/pdep.c b/libraries/ghc-internal/cbits/pdep.c
@@ -24,20 +24,23 @@ hs_pdep64(StgWord64 src, StgWord64 mask)
   return result;
 }
 
+// When dealing with values of bit-width shorter than uint64_t, ensure to
+// cast the return value to correctly truncate the undefined upper bits.
+// This is *VERY* important when GHC is using the LLVM backend!
 StgWord
 hs_pdep32(StgWord src, StgWord mask)
 {
-  return hs_pdep64(src, mask);
+  return (StgWord) ((StgWord32) hs_pdep64(src, mask));
 }
 
 StgWord
 hs_pdep16(StgWord src, StgWord mask)
 {
-  return hs_pdep64(src, mask);
+  return (StgWord) ((StgWord16) hs_pdep64(src, mask));
 }
 
 StgWord
 hs_pdep8(StgWord src, StgWord mask)
 {
-  return hs_pdep64(src, mask);
+  return (StgWord) ((StgWord8) hs_pdep64(src, mask));
 }
diff --git a/libraries/ghc-internal/cbits/pext.c b/libraries/ghc-internal/cbits/pext.c
@@ -1,13 +1,13 @@
 #include "Rts.h"
 #include "MachDeps.h"
 
-StgWord64
-hs_pext64(StgWord64 src, StgWord64 mask)
+static StgWord64
+hs_pext(const unsigned char bit_width, const StgWord64 src, const StgWord64 mask)
 {
   uint64_t result = 0;
   int offset = 0;
 
-  for (int bit = 0; bit != sizeof(uint64_t) * 8; ++bit) {
+  for (int bit = 0; bit != bit_width; ++bit) {
     const uint64_t src_bit = (src >> bit) & 1;
     const uint64_t mask_bit = (mask >> bit) & 1;
 
@@ -20,20 +20,29 @@ hs_pext64(StgWord64 src, StgWord64 mask)
   return result;
 }
 
+StgWord64
+hs_pext64(const StgWord64 src, const StgWord64 mask)
+{
+  return hs_pext(64, src, mask);
+}
+
+// When dealing with values of bit-width shorter than uint64_t, ensure to
+// cast the return value to correctly truncate the undefined upper bits.
+// This is *VERY* important when GHC is using the LLVM backend!
 StgWord
-hs_pext32(StgWord src, StgWord mask)
+hs_pext32(const StgWord src, const StgWord mask)
 {
-  return hs_pext64(src, mask);
+  return (StgWord) ((StgWord32) hs_pext(32, src, mask));
 }
 
 StgWord
-hs_pext16(StgWord src, StgWord mask)
+hs_pext16(const StgWord src, const StgWord mask)
 {
-  return hs_pext64(src, mask);
+  return (StgWord) ((StgWord16) hs_pext(16, src, mask));
 }
 
 StgWord
-hs_pext8(StgWord src, StgWord mask)
+hs_pext8(const StgWord src, const StgWord mask)
 {
-  return hs_pext64(src, mask);
+  return (StgWord) ((StgWord8) hs_pext(8, src, mask));
 }
diff --git a/testsuite/tests/llvm/should_run/T20645.hs b/testsuite/tests/llvm/should_run/T20645.hs
@@ -0,0 +1,18 @@
+-- Minimal reproducer for https://gitlab.haskell.org/ghc/ghc/-/issues/20645
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE ExtendedLiterals #-}
+import GHC.Exts
+import GHC.Word
+import Numeric (showHex)
+
+opaqueInt8# :: Int8# -> Int8#
+opaqueInt8# x = x
+{-# OPAQUE opaqueInt8# #-}
+
+main :: IO ()
+main = let !x = opaqueInt8# 109#Int8
+           !y = opaqueInt8#   1#Int8
+       in putStrLn $ flip showHex "" (W# ( pext8#
+              (word8ToWord# (int8ToWord8# (0#Int8 `subInt8#` x     )))
+              (word8ToWord# (int8ToWord8# (y      `subInt8#` 4#Int8)))
+          ))
diff --git a/testsuite/tests/llvm/should_run/T20645.stdout b/testsuite/tests/llvm/should_run/T20645.stdout
@@ -0,0 +1 @@
+49
diff --git a/testsuite/tests/llvm/should_run/all.T b/testsuite/tests/llvm/should_run/all.T
@@ -17,3 +17,4 @@ test('T22487', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex)], compile_a
 test('T22033', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex)], compile_and_run, [''])
 test('T25730', [req_c, unless(arch('x86_64'), skip), normalise_errmsg_fun(ignore_llvm_and_vortex)], compile_and_run, ['T25730C.c'])
   # T25730C.c contains Intel instrinsics, so only run this test on x86
+test('T20645', [normal, normalise_errmsg_fun(ignore_llvm_and_vortex), when(have_llvm(), extra_ways(["optllvm"]))], compile_and_run, [''])
diff --git a/testsuite/tests/numeric/should_run/foundation.hs b/testsuite/tests/numeric/should_run/foundation.hs
@@ -24,6 +24,7 @@ module Main
     ( main
     ) where
 
+import Data.Bits (Bits((.&.), bit))
 import Data.Word
 import Data.Int
 import GHC.Natural
@@ -408,6 +409,33 @@ instance TestPrimop (Word# -> Int# -> Word#) where
   testPrimop s l r = Property s $ \(uWord -> a1) (uInt -> a2) -> (wWord (l a1 a2)) === wWord (r a1 a2)
   -}
 
+-- | A special data-type for representing functions where,
+-- since only some number of the lower bits are defined,
+-- testing for strict equality in the undefined upper bits is not appropriate!
+-- Without using this data-type, false-positive failures will be reported
+-- when the undefined bit regions do not match, even though the equality of bits
+-- in this undefined region has no bearing on correctness.
+data LowerBitsAreDefined =
+    LowerBitsAreDefined
+    { definedLowerWidth :: Word
+    -- ^ The (strictly-non-negative) number of least-significant bits
+    -- for which the attached function is defined.
+    , undefinedBehavior :: (Word# -> Word#)
+    -- ^ Function with undefined behavior for some of its most significant bits.
+    }
+
+instance TestPrimop LowerBitsAreDefined where
+  testPrimop s l r = Property s $ \ (uWord#-> x0) ->
+    let -- Create a mask to unset all bits in the undefined area,
+        -- leaving set bits only in the area of defined behavior.
+        -- Since the upper bits are undefined,
+        -- if the function defines behavior for the lower N bits,
+        -- then /only/ the lower N bits are preserved,
+        -- and the upper WORDSIZE - N bits are discarded.
+        mask = bit (fromEnum (definedLowerWidth r)) - 1
+        valL = wWord# (undefinedBehavior l x0) .&. mask
+        valR = wWord# (undefinedBehavior r x0) .&. mask
+    in  valL === valR
 
 twoNonZero :: (a -> a -> b) -> a -> NonZero a -> b
 twoNonZero f x (NonZero y) = f x y
@@ -655,13 +683,13 @@ testPrimops = Group "primop"
   , testPrimop "ctz32#" Primop.ctz32# Wrapper.ctz32#
   , testPrimop "ctz64#" Primop.ctz64# Wrapper.ctz64#
   , testPrimop "ctz#" Primop.ctz# Wrapper.ctz#
-  , testPrimop "byteSwap16#" Primop.byteSwap16# Wrapper.byteSwap16#
-  , testPrimop "byteSwap32#" Primop.byteSwap32# Wrapper.byteSwap32#
+  , testPrimop "byteSwap16#" (16 `LowerBitsAreDefined` Primop.byteSwap16#) (16 `LowerBitsAreDefined` Wrapper.byteSwap16#)
+  , testPrimop "byteSwap32#" (32 `LowerBitsAreDefined` Primop.byteSwap32#) (32 `LowerBitsAreDefined` Wrapper.byteSwap32#)
   , testPrimop "byteSwap64#" Primop.byteSwap64# Wrapper.byteSwap64#
   , testPrimop "byteSwap#" Primop.byteSwap# Wrapper.byteSwap#
-  , testPrimop "bitReverse8#" Primop.bitReverse8# Wrapper.bitReverse8#
-  , testPrimop "bitReverse16#" Primop.bitReverse16# Wrapper.bitReverse16#
-  , testPrimop "bitReverse32#" Primop.bitReverse32# Wrapper.bitReverse32#
+  , testPrimop "bitReverse8#" (8 `LowerBitsAreDefined` Primop.bitReverse8#) (8 `LowerBitsAreDefined` Wrapper.bitReverse8#)
+  , testPrimop "bitReverse16#" (16 `LowerBitsAreDefined` Primop.bitReverse16#) (16 `LowerBitsAreDefined` Wrapper.bitReverse16#)
+  , testPrimop "bitReverse32#" (32 `LowerBitsAreDefined` Primop.bitReverse32#) (32 `LowerBitsAreDefined` Wrapper.bitReverse32#)
   , testPrimop "bitReverse64#" Primop.bitReverse64# Wrapper.bitReverse64#
   , testPrimop "bitReverse#" Primop.bitReverse# Wrapper.bitReverse#
   , testPrimop "narrow8Int#" Primop.narrow8Int# Wrapper.narrow8Int#
diff --git a/utils/genprimopcode/Lexer.x b/utils/genprimopcode/Lexer.x
@@ -56,6 +56,7 @@ words :-
     <0>         "CanFail"           { mkT TCanFail }
     <0>         "ThrowsException"   { mkT TThrowsException }
     <0>         "ReadWriteEffect"   { mkT TReadWriteEffect }
+    <0>         "defined_bits"      { mkT TDefinedBits }
     <0>         "can_fail_warning"  { mkT TCanFailWarnFlag }
     <0>         "DoNotWarnCanFail"  { mkT TDoNotWarnCanFail }
     <0>         "WarnIfEffectIsCanFail" { mkT TWarnIfEffectIsCanFail }
diff --git a/utils/genprimopcode/Main.hs b/utils/genprimopcode/Main.hs
diff --git a/utils/genprimopcode/Parser.y b/utils/genprimopcode/Parser.y
diff --git a/utils/genprimopcode/ParserM.hs b/utils/genprimopcode/ParserM.hs
diff --git a/utils/genprimopcode/Syntax.hs b/utils/genprimopcode/Syntax.hs

Original file line number	Diff line number	Diff line change
`@@ -24,20 +24,23 @@ hs_pdep64(StgWord64 src, StgWord64 mask)`
`24`	`24`	`return result;`
`25`	`25`	`}`
`26`	`26`
	`27`	`+// When dealing with values of bit-width shorter than uint64_t, ensure to`
	`28`	`+// cast the return value to correctly truncate the undefined upper bits.`
	`29`	`+// This is VERY important when GHC is using the LLVM backend!`
`27`	`30`	`StgWord`
`28`	`31`	`hs_pdep32(StgWord src, StgWord mask)`
`29`	`32`	`{`
`30`		`- return hs_pdep64(src, mask);`
	`33`	`+ return (StgWord) ((StgWord32) hs_pdep64(src, mask));`
`31`	`34`	`}`
`32`	`35`
`33`	`36`	`StgWord`
`34`	`37`	`hs_pdep16(StgWord src, StgWord mask)`
`35`	`38`	`{`
`36`		`- return hs_pdep64(src, mask);`
	`39`	`+ return (StgWord) ((StgWord16) hs_pdep64(src, mask));`
`37`	`40`	`}`
`38`	`41`
`39`	`42`	`StgWord`
`40`	`43`	`hs_pdep8(StgWord src, StgWord mask)`
`41`	`44`	`{`
`42`		`- return hs_pdep64(src, mask);`
	`45`	`+ return (StgWord) ((StgWord8) hs_pdep64(src, mask));`
`43`	`46`	`}`