From fd4bf2314604dbcc5cefa58e6849d2e88da9f522 Mon Sep 17 00:00:00 2001 From: Harendra Kumar Date: Fri, 8 May 2026 16:56:41 +0530 Subject: [PATCH 01/16] Update the traceEventIO example --- examples/traceEventIO.hs | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/examples/traceEventIO.hs b/examples/traceEventIO.hs index c31afc8..697c221 100644 --- a/examples/traceEventIO.hs +++ b/examples/traceEventIO.hs @@ -2,13 +2,13 @@ import Control.Monad.IO.Class (MonadIO(..)) import Debug.Trace (traceEventIO) -import Foreign.C.Types -import System.Posix.Signals +import Foreign.C.Types ( CUInt(..) ) +import GHC.Conc (myThreadId, labelThread) +import System.Posix.Signals ( blockSignals, fullSignalSet ) foreign import ccall unsafe "unistd.h sleep" c_sleep :: CUInt -> IO CUInt - {-# INLINE withTracingFlow #-} withTracingFlow :: MonadIO m => String -> m a -> m a withTracingFlow tag action = do @@ -17,6 +17,9 @@ withTracingFlow tag action = do liftIO $ traceEventIO ("END:" ++ tag) pure res +emptyBlock :: IO () +emptyBlock = return () + sleepBlock :: IO () sleepBlock = do -- So that signals do not interrupt the sleep @@ -24,19 +27,19 @@ sleepBlock = do _ <- c_sleep 10 return () -{- -sleepWithEvents :: IO () -sleepWithEvents = do - traceEventIO "before sleep" - sleepBlock - traceEventIO "after sleep" --} - -emptyBlock :: IO () -emptyBlock = return () +{-# INLINE printSumLoop #-} +printSumLoop :: Int -> Int -> Int -> IO () +printSumLoop _ _ 0 = print "All Done!" +printSumLoop chunksOf from times = do + withTracingFlow "SUM" $ print $ sum [from..(from + chunksOf)] + printSumLoop chunksOf (from + chunksOf) (times - 1) main :: IO () main = do + tid <- myThreadId + labelThread tid "main-thread" + withTracingFlow "EMPTY" emptyBlock withTracingFlow "SLEEP" sleepBlock - return () + withTracingFlow "LOOP" $ do + printSumLoop 10000 1 100 From b4129096efd45c118f2d4fe374aa06a3c813f15f Mon Sep 17 00:00:00 2001 From: Harendra Kumar Date: Fri, 8 May 2026 17:40:51 +0530 Subject: [PATCH 02/16] Modularize the main function --- hperf/Main.hs | 128 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 89 insertions(+), 39 deletions(-) diff --git a/hperf/Main.hs b/hperf/Main.hs index ffd41ae..078ac6d 100644 --- a/hperf/Main.hs +++ b/hperf/Main.hs @@ -405,10 +405,10 @@ printAllCounters maxLines concurrent statsRaw tidMap ctrs w = do Nothing -> "-" -- | Combine stats from all windows with the same name but different thread-id -flattenStats :: +foldWindowThreads :: [((Word32, String, Counter), [(String, Int)])] -> IO [((Word32, String, Counter), [(String, Int)])] -flattenStats statsRaw = do +foldWindowThreads statsRaw = do let renameWindow w = let (_, r) = span (/= ':') w in if null r then w else '0':r @@ -441,9 +441,9 @@ configParser = Config <> help "Path to the GHC eventlog file to analyse" ) <*> switch - ( long "flatten-windows" + ( long "fold-window-threads" <> short 'f' - <> help "Collapse window-level stats across threads" + <> help "Combine stats for windows with the same name across all threads" ) <*> option auto ( long "max-lines" @@ -463,22 +463,18 @@ optsInfo = info (configParser <**> helper) ) ------------------------------------------------------------------------------- --- Entry point +-- Report ------------------------------------------------------------------------------- --- XXX Add two different commands "hperf eventlog" and "hperf metrics", one for --- eventlog analysis and the other for metrics collected by other methods. --- --- XXX Are the events for a particular thread guaranteed to come in order. What --- if a thread logged events to a particular capability buffer and then got --- scheduled on another capability before its eventlog could be flushed from --- the previous capability? -main :: IO () -main = do - Config { configFile = path - , configFlattenWindows = flattenWindows - , configMaxLines = maxLines - } <- execParser optsInfo +loadStats :: + FilePath + -> Bool + -> IO + ( [((Word32, String, Counter), [(String, Int)])] + , [((Word32, String, Counter), [(String, Int)])] + , Map Word32 (Maybe String) + ) +loadStats path mergeThreads = do let stream = File.readChunks (Path.fromString_ path) (kv, rest) <- parseLogHeader $ StreamK.fromStream stream @@ -493,7 +489,7 @@ main = do -- putStrLn $ show tidMap let -- statsRaw :: [(tid, window tag, counter), [(stat name, value)]] - statsRaw = + rawStats = -- TODO: get the sorting field from Config/CLI -- List.sortOn (getStatField "tid") -- TODO: get the threshold from Config/CLI @@ -505,18 +501,44 @@ main = do -- XXX Take a window argument from config/CLI and rename only specific -- windows or all windows depending on that. - statsFlattened <- - (if flattenWindows then flattenStats else return) statsRaw - - let windowCounterList = - List.nub - -- XXX Control this by config - $ filter (\(w,_) -> not (":foreign" `List.isSuffixOf` w)) - $ filter (\(_,c) -> c `notElem` windowLevelCounters) - $ map (\(_, window, counter) -> (window, counter)) - $ map fst statsFlattened - mapM_ checkLabel (Map.toList tidMap) + foldedStats <- + (if mergeThreads then foldWindowThreads else return) rawStats + return (rawStats, foldedStats, tidMap) + where + + toEither (CounterEvent tid tag ctr loc val) = + Left ((tid, tag, ctr), (loc, fromIntegral val)) + toEither (LabelEvent tid label) = Right (tid, label) + +getWindowCounterList :: + [((Word32, String, Counter), [(String, Int)])] -> [(String, Counter)] +getWindowCounterList foldedStats = + List.nub + -- XXX Control this by config + $ filter (\(w,_) -> not (":foreign" `List.isSuffixOf` w)) + $ filter (\(_,c) -> c `notElem` windowLevelCounters) + $ map (\(_, window, counter) -> (window, counter)) + $ map fst foldedStats + +validateLabels :: Map Word32 (Maybe String) -> IO () +validateLabels tidMap = mapM_ checkLabel (Map.toList tidMap) + + where + + checkLabel (tid, Nothing) = + error $ "Duplicate non-matching label events for thread: " ++ show tid + checkLabel _ = pure () + +printSummarySection :: + Int + -> Bool + -> [((Word32, String, Counter), [(String, Int)])] + -> [((Word32, String, Counter), [(String, Int)])] + -> Map Word32 (Maybe String) + -> [(String, Counter)] + -> IO () +printSummarySection maxLines foldWindowStats statsRaw statsFlattened tidMap windowCounterList = do putStrLn "--------------------------------------------------" putStrLn "Summary Stats" putStrLn "--------------------------------------------------" @@ -531,24 +553,52 @@ main = do let f w = printAllCounters - maxLines flattenWindows (getStats w) (fmap fromJust tidMap) ctrs w + maxLines foldWindowStats (getStats w) (fmap fromJust tidMap) ctrs w in mapM_ f wins +printDetailedSection :: + Int + -> [((Word32, String, Counter), [(String, Int)])] + -> [((Word32, String, Counter), [(String, Int)])] + -> Map Word32 (Maybe String) + -> [(String, Counter)] + -> IO () +printDetailedSection maxLines rawStats foldedStats tidMap windowCounterList = do putStrLn "--------------------------------------------------" putStrLn "Detailed Stats" putStrLn "--------------------------------------------------" putStrLn "" - + -- hack - currently we do not compute avg and stddev in flattened + let getStats w = if w == "default" then rawStats else foldedStats -- For each (window, counter) list all threads let f (w,c) = printWindowCounter maxLines (getStats w) (fmap fromJust tidMap) (w,c) in mapM_ f windowCounterList - where +------------------------------------------------------------------------------- +-- Entry point +------------------------------------------------------------------------------- - toEither (CounterEvent tid tag ctr loc val) = - Left ((tid, tag, ctr), (loc, fromIntegral val)) - toEither (LabelEvent tid label) = Right (tid, label) +-- XXX Add two different commands "hperf eventlog" and "hperf metrics", one for +-- eventlog analysis and the other for metrics collected by other methods. +-- +-- XXX Are the events for a particular thread guaranteed to come in order. What +-- if a thread logged events to a particular capability buffer and then got +-- scheduled on another capability before its eventlog could be flushed from +-- the previous capability? +main :: IO () +main = do + Config { configFile = path + , configFlattenWindows = flattenWindows + , configMaxLines = maxLines + } <- execParser optsInfo - checkLabel (tid,Nothing) = - error $ "Duplicate non-matching label events for thread: " ++ show tid - checkLabel _ = pure () + (statsRaw, statsFlattened, tidMap) <- loadStats path flattenWindows + + let windowCounterList = getWindowCounterList statsFlattened + validateLabels tidMap + + printSummarySection + maxLines + flattenWindows statsRaw statsFlattened tidMap windowCounterList + printDetailedSection + maxLines statsRaw statsFlattened tidMap windowCounterList From 92a96fb7bbbb52e90fd8561238688f389e88a007 Mon Sep 17 00:00:00 2001 From: Harendra Kumar Date: Fri, 8 May 2026 18:16:58 +0530 Subject: [PATCH 03/16] By default print summary stats only Add an option to print detailed stats --- hperf/Main.hs | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/hperf/Main.hs b/hperf/Main.hs index 078ac6d..69a91f7 100644 --- a/hperf/Main.hs +++ b/hperf/Main.hs @@ -432,6 +432,7 @@ data Config = Config { configFile :: FilePath , configFlattenWindows :: Bool , configMaxLines :: Int + , configDetailed :: Bool } configParser :: Parser Config @@ -453,6 +454,11 @@ configParser = Config <> showDefault <> help "Maximum number of thread rows to print per table" ) + <*> switch + ( long "detailed" + <> short 'd' + <> help "Print detailed per-counter stats instead of summary" + ) optsInfo :: ParserInfo Config optsInfo = info (configParser <**> helper) @@ -539,11 +545,6 @@ printSummarySection :: -> [(String, Counter)] -> IO () printSummarySection maxLines foldWindowStats statsRaw statsFlattened tidMap windowCounterList = do - putStrLn "--------------------------------------------------" - putStrLn "Summary Stats" - putStrLn "--------------------------------------------------" - putStrLn "" - -- TODO: filter the counters to be printed based on Config/CLI -- TODO: filter the windows or threads to be printed let ctrs = List.nub $ fmap snd windowCounterList @@ -564,10 +565,8 @@ printDetailedSection :: -> [(String, Counter)] -> IO () printDetailedSection maxLines rawStats foldedStats tidMap windowCounterList = do - putStrLn "--------------------------------------------------" - putStrLn "Detailed Stats" - putStrLn "--------------------------------------------------" - putStrLn "" + -- XXX TODO need to print summary info as well in this. + -- hack - currently we do not compute avg and stddev in flattened let getStats w = if w == "default" then rawStats else foldedStats -- For each (window, counter) list all threads @@ -590,6 +589,7 @@ main = do Config { configFile = path , configFlattenWindows = flattenWindows , configMaxLines = maxLines + , configDetailed = detailed } <- execParser optsInfo (statsRaw, statsFlattened, tidMap) <- loadStats path flattenWindows @@ -597,8 +597,9 @@ main = do let windowCounterList = getWindowCounterList statsFlattened validateLabels tidMap - printSummarySection - maxLines - flattenWindows statsRaw statsFlattened tidMap windowCounterList - printDetailedSection - maxLines statsRaw statsFlattened tidMap windowCounterList + if detailed + then printDetailedSection + maxLines statsRaw statsFlattened tidMap windowCounterList + else printSummarySection + maxLines + flattenWindows statsRaw statsFlattened tidMap windowCounterList From d9d02d2118105bae957324452590ad938f1151df Mon Sep 17 00:00:00 2001 From: Harendra Kumar Date: Sat, 9 May 2026 02:40:47 +0530 Subject: [PATCH 04/16] Rename functions names, change help for better clarity --- hperf/Main.hs | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/hperf/Main.hs b/hperf/Main.hs index 69a91f7..586060f 100644 --- a/hperf/Main.hs +++ b/hperf/Main.hs @@ -198,13 +198,13 @@ printTable rows = do getStatField :: String -> (k, [(String, Int)]) -> Maybe Int getStatField x kv = List.lookup x $ snd kv -printWindowCounter :: +showCounterDetailsForWindow :: Int -> [((Word32, String, Counter), [(String, Int)])] -> Map Word32 String -> (String, Counter) -> IO () -printWindowCounter maxLines statsRaw tidMap (w, ctr) = do +showCounterDetailsForWindow maxLines statsRaw tidMap (w, ctr) = do if w == "default" then putStrLn $ "Global thread wise stats for [" ++ show ctr ++ "]" @@ -275,7 +275,7 @@ windowLevelCounters = -- which could be extremely large. Also, we will be able to report online, in -- real time. We will need a Map of windows, which will store a Map of tids -- which will store a list or Map of counters. -printAllCounters :: +showAllCountersForWindow :: Int -> Bool -> [((Word32, String, Counter), [(String, Int)])] @@ -283,7 +283,7 @@ printAllCounters :: -> [Counter] -> String -> IO () -printAllCounters maxLines concurrent statsRaw tidMap ctrs w = do +showAllCountersForWindow maxLines concurrent statsRaw tidMap ctrs w = do let windowTotals :: [((Word32, Counter), Int)] windowTotals = fmap toTotal $ filter selectWindow statsRaw @@ -294,7 +294,7 @@ printAllCounters maxLines concurrent statsRaw tidMap ctrs w = do (fmap selectCounter ctrs1) if null ctrs1 - then putStrLn "printAllCounters: no counters to print" + then putStrLn "showAllCountersForWindow: no counters to print" else do -- Each tid must have all the counters present and in the same order. r <- Stream.fold Fold.the $ Stream.fromList tidList @@ -442,9 +442,9 @@ configParser = Config <> help "Path to the GHC eventlog file to analyse" ) <*> switch - ( long "fold-window-threads" + ( long "fold-threads" <> short 'f' - <> help "Combine stats for windows with the same name across all threads" + <> help "Instead of per thread windows, show all threads combined per window" ) <*> option auto ( long "max-lines" @@ -454,10 +454,12 @@ configParser = Config <> showDefault <> help "Maximum number of thread rows to print per table" ) + -- By default prints one table per window consisting of all counters + -- summary for that window. <*> switch ( long "detailed" <> short 'd' - <> help "Print detailed per-counter stats instead of summary" + <> help "Print details for each counter for each window" ) optsInfo :: ParserInfo Config @@ -536,7 +538,7 @@ validateLabels tidMap = mapM_ checkLabel (Map.toList tidMap) error $ "Duplicate non-matching label events for thread: " ++ show tid checkLabel _ = pure () -printSummarySection :: +showAllCountersPerWindow :: Int -> Bool -> [((Word32, String, Counter), [(String, Int)])] @@ -544,7 +546,7 @@ printSummarySection :: -> Map Word32 (Maybe String) -> [(String, Counter)] -> IO () -printSummarySection maxLines foldWindowStats statsRaw statsFlattened tidMap windowCounterList = do +showAllCountersPerWindow maxLines foldWindowStats statsRaw statsFlattened tidMap windowCounterList = do -- TODO: filter the counters to be printed based on Config/CLI -- TODO: filter the windows or threads to be printed let ctrs = List.nub $ fmap snd windowCounterList @@ -553,24 +555,24 @@ printSummarySection maxLines foldWindowStats statsRaw statsFlattened tidMap wind getStats w = if w == "default" then statsRaw else statsFlattened let f w = - printAllCounters + showAllCountersForWindow maxLines foldWindowStats (getStats w) (fmap fromJust tidMap) ctrs w in mapM_ f wins -printDetailedSection :: +showOneCounterPerWindow :: Int -> [((Word32, String, Counter), [(String, Int)])] -> [((Word32, String, Counter), [(String, Int)])] -> Map Word32 (Maybe String) -> [(String, Counter)] -> IO () -printDetailedSection maxLines rawStats foldedStats tidMap windowCounterList = do +showOneCounterPerWindow maxLines rawStats foldedStats tidMap windowCounterList = do -- XXX TODO need to print summary info as well in this. -- hack - currently we do not compute avg and stddev in flattened let getStats w = if w == "default" then rawStats else foldedStats -- For each (window, counter) list all threads - let f (w,c) = printWindowCounter maxLines (getStats w) (fmap fromJust tidMap) (w,c) + let f (w,c) = showCounterDetailsForWindow maxLines (getStats w) (fmap fromJust tidMap) (w,c) in mapM_ f windowCounterList ------------------------------------------------------------------------------- @@ -598,8 +600,8 @@ main = do validateLabels tidMap if detailed - then printDetailedSection + then showOneCounterPerWindow maxLines statsRaw statsFlattened tidMap windowCounterList - else printSummarySection + else showAllCountersPerWindow maxLines flattenWindows statsRaw statsFlattened tidMap windowCounterList From a0a237a6bb0324a5f22662f16e3860bf76ce7663 Mon Sep 17 00:00:00 2001 From: Harendra Kumar Date: Sat, 9 May 2026 03:18:33 +0530 Subject: [PATCH 05/16] Print windows and counters --- hperf/Main.hs | 45 ++++++++++++++++++++++++++++--------- lib/Perf/Eventlog/Parser.hs | 2 +- 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/hperf/Main.hs b/hperf/Main.hs index 586060f..82e7c4d 100644 --- a/hperf/Main.hs +++ b/hperf/Main.hs @@ -433,6 +433,8 @@ data Config = Config , configFlattenWindows :: Bool , configMaxLines :: Int , configDetailed :: Bool + , configListCounters :: Bool + , configListWindows :: Bool } configParser :: Parser Config @@ -461,6 +463,16 @@ configParser = Config <> short 'd' <> help "Print details for each counter for each window" ) + <*> switch + ( long "list-counters" + <> short 'c' + <> help "List all available counters and exit" + ) + <*> switch + ( long "list-windows" + <> short 'w' + <> help "List all windows found in the eventlog file and exit" + ) optsInfo :: ParserInfo Config optsInfo = info (configParser <**> helper) @@ -592,16 +604,27 @@ main = do , configFlattenWindows = flattenWindows , configMaxLines = maxLines , configDetailed = detailed + , configListCounters = listCounters + , configListWindows = listWindows } <- execParser optsInfo - (statsRaw, statsFlattened, tidMap) <- loadStats path flattenWindows - - let windowCounterList = getWindowCounterList statsFlattened - validateLabels tidMap - - if detailed - then showOneCounterPerWindow - maxLines statsRaw statsFlattened tidMap windowCounterList - else showAllCountersPerWindow - maxLines - flattenWindows statsRaw statsFlattened tidMap windowCounterList + when listCounters $ do + putStrLn "Supported counters:" + mapM_ (putStrLn . (" " ++) . show) [minBound..maxBound :: Counter] + + when listWindows $ do + (_, statsFlattened, _) <- loadStats path flattenWindows + let wins = List.nub $ "default" : fmap fst (getWindowCounterList statsFlattened) + putStrLn "Available windows:" + mapM_ (putStrLn . (" " ++)) wins + + when (not listCounters && not listWindows) $ do + (statsRaw, statsFlattened, tidMap) <- loadStats path flattenWindows + let windowCounterList = getWindowCounterList statsFlattened + validateLabels tidMap + if detailed + then showOneCounterPerWindow + maxLines statsRaw statsFlattened tidMap windowCounterList + else showAllCountersPerWindow + maxLines + flattenWindows statsRaw statsFlattened tidMap windowCounterList diff --git a/lib/Perf/Eventlog/Parser.hs b/lib/Perf/Eventlog/Parser.hs index 2488465..60bfbcb 100644 --- a/lib/Perf/Eventlog/Parser.hs +++ b/lib/Perf/Eventlog/Parser.hs @@ -238,7 +238,7 @@ data Counter = | ProcessUserCPUTime | ProcessSystemCPUTime - deriving (Show, Eq, Ord) + deriving (Show, Eq, Ord, Enum, Bounded) -- data Location = Enter | Exit | Resume | Suspend deriving Show data Location = Resume | Suspend | Exit | Purge deriving Show From 6e585bd06aa7ebc70975d5dc22c465385b642f63 Mon Sep 17 00:00:00 2001 From: Harendra Kumar Date: Sat, 9 May 2026 04:08:48 +0530 Subject: [PATCH 06/16] Factor out the stat loading part --- hperf/Main.hs | 49 ++++++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/hperf/Main.hs b/hperf/Main.hs index 82e7c4d..bf91e17 100644 --- a/hperf/Main.hs +++ b/hperf/Main.hs @@ -488,27 +488,43 @@ optsInfo = info (configParser <**> helper) loadStats :: FilePath + -> IO + -- ( Map (tid, window tag, counter) (Maybe [(stat name, value)]) + ( Map (Word32, String, Counter) (Maybe [(String, Int)]) + -- , Map tid (Maybe thread-name)) + , Map Word32 (Maybe String) + ) +loadStats path = do + let chunks = File.readChunks (Path.fromString_ path) + (kv, rest) <- parseLogHeader $ StreamK.fromStream chunks + -- putStrLn $ show kv + eventChunks <- parseDataHeader rest + let tagged = fmap tagEither $ generateEvents kv eventChunks + toLabels = Fold.kvToMap Fold.the + collector = Fold.partition toStats toLabels + Stream.fold collector tagged + + where + + tagEither (CounterEvent tid tag ctr loc val) = + Left ((tid, tag, ctr), (loc, fromIntegral val)) + tagEither (LabelEvent tid label) = Right (tid, label) + +getAllStats :: + FilePath -> Bool -> IO ( [((Word32, String, Counter), [(String, Int)])] , [((Word32, String, Counter), [(String, Int)])] , Map Word32 (Maybe String) ) -loadStats path mergeThreads = do - let stream = File.readChunks (Path.fromString_ path) - - (kv, rest) <- parseLogHeader $ StreamK.fromStream stream - -- putStrLn $ show kv - events <- parseDataHeader rest - (statsMap, tidMap) <- - Stream.fold - (Fold.partition toStats (Fold.kvToMap Fold.the)) - (fmap toEither $ generateEvents kv events) +getAllStats path mergeThreads = do + (statsMap, tidMap) <- loadStats path -- statsMap :: Map (tid, window tag, counter) (Maybe [(stat name, value)]) -- putStrLn $ ppShow r -- putStrLn $ show tidMap let - -- statsRaw :: [(tid, window tag, counter), [(stat name, value)]] + -- rawStats :: [(tid, window tag, counter), [(stat name, value)]] rawStats = -- TODO: get the sorting field from Config/CLI -- List.sortOn (getStatField "tid") @@ -525,12 +541,6 @@ loadStats path mergeThreads = do (if mergeThreads then foldWindowThreads else return) rawStats return (rawStats, foldedStats, tidMap) - where - - toEither (CounterEvent tid tag ctr loc val) = - Left ((tid, tag, ctr), (loc, fromIntegral val)) - toEither (LabelEvent tid label) = Right (tid, label) - getWindowCounterList :: [((Word32, String, Counter), [(String, Int)])] -> [(String, Counter)] getWindowCounterList foldedStats = @@ -613,13 +623,14 @@ main = do mapM_ (putStrLn . (" " ++) . show) [minBound..maxBound :: Counter] when listWindows $ do - (_, statsFlattened, _) <- loadStats path flattenWindows + (_, statsFlattened, _) <- getAllStats path flattenWindows let wins = List.nub $ "default" : fmap fst (getWindowCounterList statsFlattened) putStrLn "Available windows:" mapM_ (putStrLn . (" " ++)) wins when (not listCounters && not listWindows) $ do - (statsRaw, statsFlattened, tidMap) <- loadStats path flattenWindows + -- XXX flattened stats are needed only in detailed view + (statsRaw, statsFlattened, tidMap) <- getAllStats path flattenWindows let windowCounterList = getWindowCounterList statsFlattened validateLabels tidMap if detailed From 174527f5db2e90f7de7567cedd63270d4c03d5dd Mon Sep 17 00:00:00 2001 From: Harendra Kumar Date: Sat, 9 May 2026 05:23:14 +0530 Subject: [PATCH 07/16] Refactor loading of Stats --- hperf/Main.hs | 67 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 42 insertions(+), 25 deletions(-) diff --git a/hperf/Main.hs b/hperf/Main.hs index bf91e17..28ce850 100644 --- a/hperf/Main.hs +++ b/hperf/Main.hs @@ -486,6 +486,20 @@ optsInfo = info (configParser <**> helper) -- Report ------------------------------------------------------------------------------- +postProcess :: + -- ((tid, window-tag, counter), [(stat-name, value)]) + -- XXX Instead of Maybe here can we use an empty list? + [ ((Word32, String, Counter), Maybe [(String, Int)]) ] + -> [ ((Word32, String, Counter), [(String, Int)]) ] +postProcess = + -- TODO: get the sorting field from Config/CLI + -- List.sortOn (getStatField "tid") + -- TODO: get the threshold from Config/CLI + -- $ filter (\x -> fromJust (getStatField "total" x) > 0) + fmap (\(k, v) -> (k, filter (\(k1,_) -> k1 /= "latest") v)) + . fmap (\(k, v) -> (k, fromJust v)) + . filter (\(_, v) -> isJust v) + loadStats :: FilePath -> IO @@ -510,36 +524,41 @@ loadStats path = do Left ((tid, tag, ctr), (loc, fromIntegral val)) tagEither (LabelEvent tid label) = Right (tid, label) -getAllStats :: +getStatMapTidMap :: FilePath - -> Bool -> IO ( [((Word32, String, Counter), [(String, Int)])] - , [((Word32, String, Counter), [(String, Int)])] , Map Word32 (Maybe String) ) -getAllStats path mergeThreads = do +getStatMapTidMap path = do (statsMap, tidMap) <- loadStats path -- statsMap :: Map (tid, window tag, counter) (Maybe [(stat name, value)]) -- putStrLn $ ppShow r -- putStrLn $ show tidMap - let - -- rawStats :: [(tid, window tag, counter), [(stat name, value)]] - rawStats = - -- TODO: get the sorting field from Config/CLI - -- List.sortOn (getStatField "tid") - -- TODO: get the threshold from Config/CLI - -- $ filter (\x -> fromJust (getStatField "total" x) > 0) - map (\(k, v) -> (k, filter (\(k1,_) -> k1 /= "latest") v)) - $ map (\(k, v) -> (k, fromJust v)) - $ filter (\(_, v) -> isJust v) - $ Map.toList statsMap + -- rawStats :: [(tid, window tag, counter), [(stat name, value)]] + let statsMap1 = postProcess $ Map.toList statsMap + return (statsMap1, tidMap) +getAllStats :: + Bool + -> FilePath + -> IO + ( [((Word32, String, Counter), [(String, Int)])] + , Map Word32 (Maybe String) + , [((Word32, String, Counter), [(String, Int)])] + , [(String, Counter)]) +getAllStats mergeThreads path = do + (statMap, tidMap) <- getStatMapTidMap path -- XXX Take a window argument from config/CLI and rename only specific -- windows or all windows depending on that. + -- XXX we should rather use the mergeThreads flag whereever we need this. + -- this makes understanding code difficult. foldedStats <- - (if mergeThreads then foldWindowThreads else return) rawStats - return (rawStats, foldedStats, tidMap) + if mergeThreads + then foldWindowThreads statMap + else return statMap + let windowCounterList = getWindowCounterList foldedStats + return (statMap, tidMap, foldedStats, windowCounterList) getWindowCounterList :: [((Word32, String, Counter), [(String, Int)])] -> [(String, Counter)] @@ -611,7 +630,7 @@ showOneCounterPerWindow maxLines rawStats foldedStats tidMap windowCounterList = main :: IO () main = do Config { configFile = path - , configFlattenWindows = flattenWindows + , configFlattenWindows = mergeThreads , configMaxLines = maxLines , configDetailed = detailed , configListCounters = listCounters @@ -623,19 +642,17 @@ main = do mapM_ (putStrLn . (" " ++) . show) [minBound..maxBound :: Counter] when listWindows $ do - (_, statsFlattened, _) <- getAllStats path flattenWindows - let wins = List.nub $ "default" : fmap fst (getWindowCounterList statsFlattened) + (_, _, _, windowCounterList) <- getAllStats mergeThreads path + let wins = List.nub $ "default" : fmap fst windowCounterList putStrLn "Available windows:" mapM_ (putStrLn . (" " ++)) wins when (not listCounters && not listWindows) $ do - -- XXX flattened stats are needed only in detailed view - (statsRaw, statsFlattened, tidMap) <- getAllStats path flattenWindows - let windowCounterList = getWindowCounterList statsFlattened + (statsMap, tidMap, foldedStats, windowCounterList) <- getAllStats mergeThreads path validateLabels tidMap if detailed then showOneCounterPerWindow - maxLines statsRaw statsFlattened tidMap windowCounterList + maxLines statsMap foldedStats tidMap windowCounterList else showAllCountersPerWindow maxLines - flattenWindows statsRaw statsFlattened tidMap windowCounterList + mergeThreads statsMap foldedStats tidMap windowCounterList From 7a8347afc7c3cf5ebf92883902002f6c780f62ca Mon Sep 17 00:00:00 2001 From: Harendra Kumar Date: Sat, 9 May 2026 05:53:57 +0530 Subject: [PATCH 08/16] Organize CLI using subcommands --- hperf/Main.hs | 113 +++++++++++++++++++++++++++++--------------------- 1 file changed, 65 insertions(+), 48 deletions(-) diff --git a/hperf/Main.hs b/hperf/Main.hs index 28ce850..65561bb 100644 --- a/hperf/Main.hs +++ b/hperf/Main.hs @@ -428,17 +428,37 @@ foldWindowThreads statsRaw = do -- CLI ------------------------------------------------------------------------------- -data Config = Config - { configFile :: FilePath - , configFlattenWindows :: Bool - , configMaxLines :: Int - , configDetailed :: Bool - , configListCounters :: Bool - , configListWindows :: Bool +data ListSubCmd + = ListCounters + | ListWindows FilePath + +data AnalyseConfig = AnalyseConfig + { analyseFile :: FilePath + , analyseFoldThreads :: Bool + , analyseMaxLines :: Int + , analyseDetailed :: Bool } -configParser :: Parser Config -configParser = Config +data Command + = CmdList ListSubCmd + | CmdAnalyse AnalyseConfig + +listSubCmdParser :: Parser ListSubCmd +listSubCmdParser = subparser + ( command "counters" + (info (pure ListCounters) + (progDesc "List all available performance counters")) + <> command "windows" + (info + (ListWindows <$> argument str + ( metavar "EVENTLOG-FILE" + <> help "Path to the GHC eventlog file" + )) + (progDesc "List all windows found in the eventlog file")) + ) + +analyseConfigParser :: Parser AnalyseConfig +analyseConfigParser = AnalyseConfig <$> argument str ( metavar "EVENTLOG-FILE" <> help "Path to the GHC eventlog file to analyse" @@ -463,19 +483,19 @@ configParser = Config <> short 'd' <> help "Print details for each counter for each window" ) - <*> switch - ( long "list-counters" - <> short 'c' - <> help "List all available counters and exit" - ) - <*> switch - ( long "list-windows" - <> short 'w' - <> help "List all windows found in the eventlog file and exit" - ) -optsInfo :: ParserInfo Config -optsInfo = info (configParser <**> helper) +commandParser :: Parser Command +commandParser = subparser + ( command "list" + (info (CmdList <$> listSubCmdParser) + (progDesc "List available counters or windows")) + <> command "analyse" + (info (CmdAnalyse <$> analyseConfigParser) + (progDesc "Analyse a GHC eventlog file")) + ) + +optsInfo :: ParserInfo Command +optsInfo = info (commandParser <**> helper) ( fullDesc <> progDesc ("Analyse CPU cost, heap allocations, and Linux perf event " ++ "counters for Haskell threads and user-defined code windows.") @@ -629,30 +649,27 @@ showOneCounterPerWindow maxLines rawStats foldedStats tidMap windowCounterList = -- the previous capability? main :: IO () main = do - Config { configFile = path - , configFlattenWindows = mergeThreads - , configMaxLines = maxLines - , configDetailed = detailed - , configListCounters = listCounters - , configListWindows = listWindows - } <- execParser optsInfo - - when listCounters $ do - putStrLn "Supported counters:" - mapM_ (putStrLn . (" " ++) . show) [minBound..maxBound :: Counter] - - when listWindows $ do - (_, _, _, windowCounterList) <- getAllStats mergeThreads path - let wins = List.nub $ "default" : fmap fst windowCounterList - putStrLn "Available windows:" - mapM_ (putStrLn . (" " ++)) wins - - when (not listCounters && not listWindows) $ do - (statsMap, tidMap, foldedStats, windowCounterList) <- getAllStats mergeThreads path - validateLabels tidMap - if detailed - then showOneCounterPerWindow - maxLines statsMap foldedStats tidMap windowCounterList - else showAllCountersPerWindow - maxLines - mergeThreads statsMap foldedStats tidMap windowCounterList + cmd <- execParser optsInfo + case cmd of + CmdList ListCounters -> do + putStrLn "Supported counters:" + mapM_ (putStrLn . (" " ++) . show) [minBound..maxBound :: Counter] + CmdList (ListWindows path) -> do + (_, _, _, windowCounterList) <- getAllStats False path + let wins = List.nub $ "default" : fmap fst windowCounterList + putStrLn "Available windows:" + mapM_ (putStrLn . (" " ++)) wins + CmdAnalyse AnalyseConfig + { analyseFile = path + , analyseFoldThreads = mergeThreads + , analyseMaxLines = maxLines + , analyseDetailed = detailed + } -> do + (statsMap, tidMap, foldedStats, windowCounterList) <- getAllStats mergeThreads path + validateLabels tidMap + if detailed + then showOneCounterPerWindow + maxLines statsMap foldedStats tidMap windowCounterList + else showAllCountersPerWindow + maxLines + mergeThreads statsMap foldedStats tidMap windowCounterList From 51f08f884684d41fb4d83527e8b2e9f84971b5b0 Mon Sep 17 00:00:00 2001 From: Harendra Kumar Date: Sat, 9 May 2026 05:57:44 +0530 Subject: [PATCH 09/16] Add a command to list all threads --- hperf/Main.hs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/hperf/Main.hs b/hperf/Main.hs index 65561bb..f8629ca 100644 --- a/hperf/Main.hs +++ b/hperf/Main.hs @@ -431,6 +431,7 @@ foldWindowThreads statsRaw = do data ListSubCmd = ListCounters | ListWindows FilePath + | ListThreads FilePath data AnalyseConfig = AnalyseConfig { analyseFile :: FilePath @@ -455,6 +456,13 @@ listSubCmdParser = subparser <> help "Path to the GHC eventlog file" )) (progDesc "List all windows found in the eventlog file")) + <> command "threads" + (info + (ListThreads <$> argument str + ( metavar "EVENTLOG-FILE" + <> help "Path to the GHC eventlog file" + )) + (progDesc "List all threads found in the eventlog file")) ) analyseConfigParser :: Parser AnalyseConfig @@ -659,6 +667,12 @@ main = do let wins = List.nub $ "default" : fmap fst windowCounterList putStrLn "Available windows:" mapM_ (putStrLn . (" " ++)) wins + CmdList (ListThreads path) -> do + (_, tidMap) <- loadStats path + putStrLn "Threads (id, label):" + mapM_ (\(tid, mlabel) -> + putStrLn $ " " ++ show tid ++ ", " ++ maybe "-" id mlabel) + (Map.toList tidMap) CmdAnalyse AnalyseConfig { analyseFile = path , analyseFoldThreads = mergeThreads From 9919d76a8545c23d000659e9dcf84cb340e0edd0 Mon Sep 17 00:00:00 2001 From: Harendra Kumar Date: Sat, 9 May 2026 06:24:17 +0530 Subject: [PATCH 10/16] Move foreign window filtering out of getWindowCounterList --- hperf/Main.hs | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/hperf/Main.hs b/hperf/Main.hs index f8629ca..b61abea 100644 --- a/hperf/Main.hs +++ b/hperf/Main.hs @@ -113,9 +113,9 @@ combineWindowStats = Fold.kvToMap combineStats -- Statistics collection for each counter -{-# INLINE stats #-} -stats :: Scanl IO Int64 [(String, Int)] -stats = +{-# INLINE statScanner #-} +statScanner :: Scanl IO Int64 [(String, Int)] +statScanner = Scanl.lmap (fromIntegral :: Int64 -> Int) $ Scanl.distribute [ fmap (\x -> ("latest", fromJust x)) Scanl.latest @@ -129,11 +129,11 @@ stats = {-# INLINE threadStats #-} threadStats :: Scanl IO (Either (Maybe Int64) Int64) [(String, Int)] -threadStats = untilLeft stats +threadStats = untilLeft statScanner {-# INLINE windowStats #-} windowStats :: Scanl IO (Either (Maybe Int64) Int64) [(String, Int)] -windowStats = Scanl.scanlMany (untilLeft Scanl.sum) stats +windowStats = Scanl.scanlMany (untilLeft Scanl.sum) statScanner {-# INLINE toStats #-} toStats :: @@ -567,6 +567,14 @@ getStatMapTidMap path = do let statsMap1 = postProcess $ Map.toList statsMap return (statsMap1, tidMap) +getWindowCounterList :: + [((Word32, String, Counter), [(String, Int)])] -> [(String, Counter)] +getWindowCounterList stats = + List.nub + $ filter (\(_,c) -> c `notElem` windowLevelCounters) + $ map (\(_, window, counter) -> (window, counter)) + $ map fst stats + getAllStats :: Bool -> FilePath @@ -588,16 +596,6 @@ getAllStats mergeThreads path = do let windowCounterList = getWindowCounterList foldedStats return (statMap, tidMap, foldedStats, windowCounterList) -getWindowCounterList :: - [((Word32, String, Counter), [(String, Int)])] -> [(String, Counter)] -getWindowCounterList foldedStats = - List.nub - -- XXX Control this by config - $ filter (\(w,_) -> not (":foreign" `List.isSuffixOf` w)) - $ filter (\(_,c) -> c `notElem` windowLevelCounters) - $ map (\(_, window, counter) -> (window, counter)) - $ map fst foldedStats - validateLabels :: Map Word32 (Maybe String) -> IO () validateLabels tidMap = mapM_ checkLabel (Map.toList tidMap) @@ -669,6 +667,7 @@ main = do mapM_ (putStrLn . (" " ++)) wins CmdList (ListThreads path) -> do (_, tidMap) <- loadStats path + validateLabels tidMap putStrLn "Threads (id, label):" mapM_ (\(tid, mlabel) -> putStrLn $ " " ++ show tid ++ ", " ++ maybe "-" id mlabel) @@ -680,10 +679,14 @@ main = do , analyseDetailed = detailed } -> do (statsMap, tidMap, foldedStats, windowCounterList) <- getAllStats mergeThreads path + -- XXX Control this by config + let windowCounterList1 = + filter (\(w,_) -> not (":foreign" `List.isSuffixOf` w)) + windowCounterList validateLabels tidMap if detailed then showOneCounterPerWindow - maxLines statsMap foldedStats tidMap windowCounterList + maxLines statsMap foldedStats tidMap windowCounterList1 else showAllCountersPerWindow maxLines - mergeThreads statsMap foldedStats tidMap windowCounterList + mergeThreads statsMap foldedStats tidMap windowCounterList1 From 4f3190c7beda76147f5441624ad1f18035adc9c2 Mon Sep 17 00:00:00 2001 From: Harendra Kumar Date: Sat, 9 May 2026 06:25:51 +0530 Subject: [PATCH 11/16] Use folded stats for listing windows So that we do not list windows duplicated by threads. --- hperf/Main.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hperf/Main.hs b/hperf/Main.hs index b61abea..097a35c 100644 --- a/hperf/Main.hs +++ b/hperf/Main.hs @@ -661,7 +661,7 @@ main = do putStrLn "Supported counters:" mapM_ (putStrLn . (" " ++) . show) [minBound..maxBound :: Counter] CmdList (ListWindows path) -> do - (_, _, _, windowCounterList) <- getAllStats False path + (_, _, _, windowCounterList) <- getAllStats True path let wins = List.nub $ "default" : fmap fst windowCounterList putStrLn "Available windows:" mapM_ (putStrLn . (" " ++)) wins From 23ebf8734321c8cc07b572b6f9c4ac769e432b61 Mon Sep 17 00:00:00 2001 From: Harendra Kumar Date: Sat, 9 May 2026 14:41:14 +0530 Subject: [PATCH 12/16] Update perf measurement docs --- docs/ghc-rts-performance-analysis.md | 75 +--------- docs/haskell-perf-measurement.md | 200 +++++++++++++++++++++++++++ 2 files changed, 201 insertions(+), 74 deletions(-) create mode 100644 docs/haskell-perf-measurement.md diff --git a/docs/ghc-rts-performance-analysis.md b/docs/ghc-rts-performance-analysis.md index a4d94a2..8d3d7e3 100644 --- a/docs/ghc-rts-performance-analysis.md +++ b/docs/ghc-rts-performance-analysis.md @@ -26,46 +26,7 @@ threads) spent between two points, but we cannot tell which Haskell thread spent how much time or how much time was actually spent by the instructions between those two points. -## Components of a Haskell Process - -* An OS level process -* Multiple OS level threads in the OS process -* Multiple Haskell green threads that are scheduled on the OS threads. Haskell - threads can run on any of the available OS threads every time it is ready to - run. - -## A prototypical program - -[This example](examples/console-loop-multi-thread.hs) is a -a simple yet comprehensive program to understand different components of -performance analysis and stats. You can play with this to understand how things -work, how the stats add up and what they mean. - -## How many OS threads do we have? - -To see how many OS threads a Haskell process is using on Linux. Run -[this example](examples/console-loop-multi-thread.hs), note its pid -printed in the output. All of its OS threads can be printed by: -``` -ls /proc//task -``` - -Even when compiled without the `-threaded` option we might see two threads -because the RTS still uses a separate thread for GC and for forking async -cleanup threads via GC. - -One of the tasks will have the same pid as the process pid, this is the main OS -thread. You can try changing the number of capabilities using +RTS -N and see -the effect. - -GHC may also use independent OS threads for ffi, for GC, for IO manager, -however it will guarantee that only as many user threads can run at a time as -specified with the -N rts option. - -Usually we see 3 threads plus 2 threads per capability when compiled -with `-threaded` option. - -## GHC RTS stats +## getRTSStats The getRTSStats call gives us the CPU time (essentially get_clocktime or getrusage under the hood to get the CPU time) of the process and @@ -121,40 +82,6 @@ in this category. Note that the GC cpu time should be computed by adding the `gc_cpu_ns` and `nonmoving_gc_cpu_ns` when the non-moving gc is enabled. -## Variability of Measurements - -Performance measurement is tricky and there are many factors to take care of if -you want to get reliable results: - -* Disable CPU frequency scaling, can cause run-to-run or variability in the - same run. -* Do not run other things on the same machine. interrupts, kernel - activity, background daemons can also affect: - * Memory contention can affect the measurement. - * cache effects due to context switching can affect it. -* Discard first runs, first runs are usually outliers because of warm up effects, - instruction cache cold, data cache cold, page faults, branch predictor - not trained. -* Use thread affinity. Thread migration to another CPU: causes cache - invalidation, different core state, timing noise. -* Use larger measurements. In smaller one measurement overhead and - variance may dominate: timing calls (clock_gettime), counters, RTS - stats. -* Different CPUs running at different frequencies can make the results - unpredictable. -* The clocks of different CPUs may not be perfectly in sync. - -To counter the last two factors we should use instruction count or -allocation count rather than time as a more reliable measure. Even -the instruction count might vary because of measurement overhead adds -instruction count, which can vary depending on how many times the thread -is context switched. - -## Haskell specific variability - -* Lazy evaluation, may defer work which might get evaluated later in the - context of some other measurement window. - ## Using getRTSStats with haskell-perf In the `haskell-perf` library we do have a convenient way to wrap a diff --git a/docs/haskell-perf-measurement.md b/docs/haskell-perf-measurement.md new file mode 100644 index 0000000..eb4822d --- /dev/null +++ b/docs/haskell-perf-measurement.md @@ -0,0 +1,200 @@ +## Components of a Haskell Process + +* An OS level process +* Multiple OS level threads within the OS process +* Multiple Haskell green threads that are scheduled on the OS threads. Haskell + threads can run on any of the available OS threads every time it is ready to + run. + +## How many OS threads do we have? + +To see how many OS threads a Haskell process is using on Linux. Run +[this example](test/snippets/console-loop-multi-thread.hs), note its pid +printed in the output. All of its OS threads can be printed by: +``` +ls /proc//task +``` + +## OS threads without -threaded option + +Even when compiled without the `-threaded` option we can see two OS +threads running (ghc 9.10.3 on Linux) because the RTS uses one more +thread for some other async tasks. But only one OS thread is used for +running the user program. + +Not using the -threaded option may be more CPU efficient but it can +lead to higher latencies because of FFI calls blocking the thread. When +compiled with -threaded we can run with a single capability using -N1 +and take adavantage of offloading FFI calls to other threads. + +One of the tasks will have the same pid as the process pid, this is the +main OS thread used for executing Haskell threads. + +## OS threads with -threaded build and -N1 RTS option + +When compiled with -threaded option, GHC may also use multiple independent OS +threads for ffi, for GC, for IO manager. + +With -N1 even though only one capability is used for running the Haskell +code we may see more threads being used by the RTS. Using ghc 9.10.3 on +Linux we see five OS threads running with -N1. + +## OS threads with -threaded build + +You can try changing the number of capabilities using +RTS -N and see +the effect. Usually we can see 3 threads plus 2 threads per capability +when compiled with `-threaded` option. However, GHC guarantees that +only as many user threads can run at a time as specified with the -N rts +option. + +## Process Level View + +If we consider the entire Haskell process, the elapsed wall-clock time +of each OS thread consists of: + +* CPU execution time, +* time spent runnable but waiting on the OS scheduler, +* time spent blocked waiting for I/O or synchronization events. + +When a process uses multiple OS threads, `ProcessCPUTime` may exceed elapsed +wall-clock time because multiple threads can execute simultaneously on +different CPUs. + +As a result, `ProcessCPUTime` is generally not suitable as a delta metric for +measuring the CPU consumption of a specific piece of user code in a +multithreaded program. In such cases, `ThreadCPUTime` is usually more +appropriate, since it measures CPU usage attributable to a single OS thread. +In a single-threaded process, however, `ProcessCPUTime` can safely be used for +this purpose. + +`ProcessCPUTime` is nevertheless useful as an aggregate CPU utilization +metric. Its value can approach elapsed wall-clock time multiplied by the +degree of CPU parallelism available to the process. Lower values indicate +that the process spent more time not executing, for example waiting for +scheduling, synchronization, I/O, or other runtime stalls. + +Useful reporting metric: + +* Total Elapsed time +* Entire process CPU Time + * User time (getrusage) + * System time (getrusage) +* For each OS thread - ThreadCPUTime +* Overall rusage stats for the process +* OS Sched run-queue wait time (using sched_wakeup, sched_switch trace events + via perf, libperf, libtraceevent) + +The total elapsed time for any OS thread can be decomposed as: +``` +elapsed = + on_cpu + + runnable_wait + + off_cpu_wait(reason) +``` + +``` +reason in { + futex, + disk_io, + network_io, + epoll, + sleep, + paging, + pipe_wait, + signal_wait, + ... +} +``` + +## Useful Metrics + +Let's consider the following cases. + +### Single Capability, Single Haskell Thread + +This is the simplest execution model for performance analysis. In this +configuration, a single OS thread executes the measured code. If there +is also only one Haskell thread, then when that thread yields there is +no other Haskell thread that can run on the same capability. Likewise, +no parallel runtime worker executes on that capability. Aside from brief +RTS scheduler bookkeeping, the only substantial additional work that may +execute on that capability is garbage collection or foreign-function +(FFI) code. + +In this model, the OS `ThreadCPUTime` delta can be used to measure the +total CPU time consumed by the execution thread between two points. This +measurement includes both user-code execution and RTS activity such as +garbage collection and scheduler overhead. To estimate the CPU time +spent in user code alone, the CPU time attributable to RTS activity must +be subtracted. Since there is no other Haskell thread, this entire CPU +time can be attributed to the single Haskell thread. + +The CPU time between two points on the OS thread (GHC capability) can be +decomposed as follows: + +* User-code execution time +* CPU time spent in FFI code +* Haskell GC CPU time +* RTS scheduler and bookkeeping CPU time + +### Single Capability, Multiple Haskell Threads + +In this configuration, `ThreadCPUTime` for the OS thread can no longer be +used to directly measure the CPU time consumed by a particular Haskell +thread. A Haskell thread may yield execution between the measurement +points, allowing another Haskell thread to run on the same capability and +consume CPU time on the same OS thread. + +To measure CPU usage attributable to an individual Haskell thread, the +measurement must instead be performed at the RTS level. This can be done +using RTS eventlog tracing or by recording timestamps when the Haskell +thread is scheduled onto and descheduled from a capability. + +The mutator CPU time reported by GHC is the cumulative CPU time spent +outside garbage collection. It therefore includes both Haskell thread +execution time and RTS overhead incurred during mutator execution, such +as scheduler bookkeeping and allocation management. + +If we separately measure the cumulative CPU time attributable to +individual Haskell threads, then the difference between the total +mutator CPU time and the cumulative Haskell thread CPU time provides an +estimate of RTS overhead excluding garbage collection. + +### Multiple capabilities + +Same as above applies in this case as well. + +## Variability of Measurements + +Performance measurement is tricky and there are many factors to take care of if +you want to get reliable results: + +* Disable CPU frequency scaling, can cause run-to-run or variability in the + same run. +* Do not run other things on the same machine. interrupts, kernel + activity, background daemons can also affect: + * Memory contention can affect the measurement. + * cache effects due to context switching can affect it. +* Discard first runs, first runs are usually outliers because of warm up effects, + instruction cache cold, data cache cold, page faults, branch predictor + not trained. +* Use thread affinity. Thread migration to another CPU: causes cache + invalidation, different core state, timing noise. +* Use larger measurements. In smaller one measurement overhead and + variance may dominate: timing calls (clock_gettime), counters, RTS + stats. +* Different CPUs running at different frequencies can make the results + unpredictable. +* The clocks of different CPUs may not be perfectly in sync. + +To counter the last two factors we should use instruction count or +allocation count rather than time as a more reliable measure. Even +the instruction count might vary because of measurement overhead adds +instruction count, which can vary depending on how many times the thread +is context switched. + +## Haskell specific variability + +* Lazy evaluation, may defer work which might get evaluated later in the + context of some other measurement window. + From 32fa3c203006dc94fbf1e82337b1557ea42966da Mon Sep 17 00:00:00 2001 From: Harendra Kumar Date: Sat, 9 May 2026 16:10:57 +0530 Subject: [PATCH 13/16] Cleanup, document and write todo comments --- hperf/Main.hs | 91 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 64 insertions(+), 27 deletions(-) diff --git a/hperf/Main.hs b/hperf/Main.hs index 097a35c..7a0480c 100644 --- a/hperf/Main.hs +++ b/hperf/Main.hs @@ -261,8 +261,12 @@ showCounterDetailsForWindow maxLines statsRaw tidMap (w, ctr) = do in printf "%d" tid : lb : map snd v select ((_, window, counter), _) = window == w && counter == ctr -windowLevelCounters :: [Counter] -windowLevelCounters = +-- XXX we can only use the Process level counters to report the entire +-- program's CPU time including all threads in the saummary data. Reporting +-- this in the Windows times is not useful. We can use the OS thread's +-- ThreadCPUTime only if there is only one Haskell thread running. +processLevelCounters :: [Counter] +processLevelCounters = [ ProcessCPUTime , ProcessUserCPUTime , ProcessSystemCPUTime @@ -275,6 +279,10 @@ windowLevelCounters = -- which could be extremely large. Also, we will be able to report online, in -- real time. We will need a Map of windows, which will store a Map of tids -- which will store a list or Map of counters. + +-- | Print a table for the given window, one row per thread that ever entered +-- the window, listing accumulated value for each +-- counter. showAllCountersForWindow :: Int -> Bool @@ -283,10 +291,10 @@ showAllCountersForWindow :: -> [Counter] -> String -> IO () -showAllCountersForWindow maxLines concurrent statsRaw tidMap ctrs w = do +showAllCountersForWindow maxLines concurrent stats tidMap ctrs w = do let windowTotals :: [((Word32, Counter), Int)] - windowTotals = fmap toTotal $ filter selectWindow statsRaw + windowTotals = fmap toTotal $ filter selectWindow stats tidList = fmap @@ -312,7 +320,7 @@ showAllCountersForWindow maxLines concurrent statsRaw tidMap ctrs w = do (\f -> fmap snd $ filter f windowTotals) (fmap selectCounter ctrs1) - windowCounts = fmap toCounts $ filter selectWindow statsRaw + windowCounts = fmap toCounts $ filter selectWindow stats oneCounterCounts = filter (selectCounter (head ctrs1)) windowCounts counts = fmap snd $ oneCounterCounts @@ -338,25 +346,31 @@ showAllCountersForWindow maxLines concurrent statsRaw tidMap ctrs w = do -- we cannot combine the window level counters. putStrLn $ "Window [" ++ w ++ "]" ++ " thread wise stat summary" when (not concurrent) $ do - mapM_ (printWindowLevelCounter windowTotals) - [ProcessCPUTime, ProcessUserCPUTime, ProcessSystemCPUTime] + putStrLn "" + mapM_ (printProcessLevelCounter windowTotals) + [ProcessCPUTime] + mapM_ (\x -> putStr " " >> printProcessLevelCounter windowTotals x) + [ProcessUserCPUTime, ProcessSystemCPUTime] if ":foreign" `List.isSuffixOf` w then return () else do putStrLn "" + mapM_ (printProcessLevelCounter windowTotals) + [ProcessCPUTime] let threadCPUTimeTotal = - getWindowLevelCounter sum windowTotals ThreadCPUTime - putStrLn $ "ThreadCPUTime:" ++ toString threadCPUTimeTotal + getProcessLevelCounter sum windowTotals ThreadCPUTime + putStrLn $ " ThreadCPUTime:" ++ toString threadCPUTimeTotal let gcCPUTime = - getWindowLevelCounter head windowTotals GCCPUTime - putStrLn $ "GcCPUTime:" ++ toString gcCPUTime + getProcessLevelCounter head windowTotals GCCPUTime + putStrLn $ " GcCPUTime:" ++ toString gcCPUTime let processCPUTime = - getWindowLevelCounter head windowTotals ProcessCPUTime + getProcessLevelCounter head windowTotals ProcessCPUTime let rtsCPUTime = processCPUTime - gcCPUTime - threadCPUTimeTotal - putStrLn $ "RtsCPUTime:" ++ toString rtsCPUTime + putStrLn $ " RtsCPUTime(*):" ++ toString rtsCPUTime let cnt = length allRows + putStrLn "" printTable ((colHeaders : take maxLines allRows) ++ [separator, summary]) if cnt > maxLines then putStrLn $ "..." ++ show (cnt - maxLines) ++ " lines omitted ..." @@ -370,15 +384,15 @@ showAllCountersForWindow maxLines concurrent statsRaw tidMap ctrs w = do if (":foreign" `List.isSuffixOf` w) then ctrs List.\\ [ThreadAllocated] else ctrs - getWindowLevelCounter f wt c = f $ fmap snd $ filter (selectCounter c) wt - printWindowLevelCounter wt c = do + getProcessLevelCounter f wt c = f $ fmap snd $ filter (selectCounter c) wt + printProcessLevelCounter wt c = do -- Only one thread should have this let val = fmap snd $ filter (selectCounter c) wt case val of [] -> do {- -- a "foreign window does not have GCCPUTime - putStrLn $ "printWindowLevelCounter: counter " + putStrLn $ "printProcessLevelCounter: counter " ++ show c ++ " not found in windowTotals" -} return () @@ -404,25 +418,36 @@ showAllCountersForWindow maxLines concurrent statsRaw tidMap ctrs w = do Just label -> label Nothing -> "-" --- | Combine stats from all windows with the same name but different thread-id +-- A window tag is of the format "tid:name", extract the "tid" from this. +getTidFromWindowTag :: String -> Maybe Word32 +getTidFromWindowTag w = + let (tid, r) = span (/= ':') w + in if null r then Nothing else Just (read tid :: Word32) + +-- | Combine stats from all windows with the same name but different thread-id. +-- This just gives us more samples for the same window, we are saying we don't +-- care in which thread's context the window code ran, just give us the timings +-- in the context of any thread. +-- +-- We just erase the thread id from the window name and change it to 0, so that +-- all the threads now have the same thread-id 0. foldWindowThreads :: [((Word32, String, Counter), [(String, Int)])] -> IO [((Word32, String, Counter), [(String, Int)])] -foldWindowThreads statsRaw = do - let renameWindow w = +foldWindowThreads stats = do + let changeWindowTidZero w = let (_, r) = span (/= ':') w in if null r then w else '0':r - rename ((tid, tag, ctr), v) = ((tid, renameWindow tag, ctr), v) - getTid w = - let (tid, r) = span (/= ':') w - in if null r then Nothing else Just (read tid :: Word32) + + collapseTid ((tid, tag, ctr), v) = + ((tid, changeWindowTidZero tag, ctr), v) + matching ((tid, tag, _), _) = - case getTid tag of + case getTidFromWindowTag tag of Nothing -> False Just x -> x == tid - statsFiltered = fmap rename $ filter matching statsRaw - return statsFiltered + return $ fmap collapseTid $ filter matching stats ------------------------------------------------------------------------------- -- CLI @@ -571,7 +596,7 @@ getWindowCounterList :: [((Word32, String, Counter), [(String, Int)])] -> [(String, Counter)] getWindowCounterList stats = List.nub - $ filter (\(_,c) -> c `notElem` windowLevelCounters) + $ filter (\(_,c) -> c `notElem` processLevelCounters) $ map (\(_, window, counter) -> (window, counter)) $ map fst stats @@ -579,6 +604,7 @@ getAllStats :: Bool -> FilePath -> IO + -- statsMap :: Map (tid, window tag, counter) [(stat name, value)] ( [((Word32, String, Counter), [(String, Int)])] , Map Word32 (Maybe String) , [((Word32, String, Counter), [(String, Int)])] @@ -666,6 +692,7 @@ main = do putStrLn "Available windows:" mapM_ (putStrLn . (" " ++)) wins CmdList (ListThreads path) -> do + -- XXX does not print all threads (_, tidMap) <- loadStats path validateLabels tidMap putStrLn "Threads (id, label):" @@ -673,6 +700,16 @@ main = do putStrLn $ " " ++ show tid ++ ", " ++ maybe "-" id mlabel) (Map.toList tidMap) CmdAnalyse AnalyseConfig + -- XXX the mergeThreads should just print a single table with one + -- line per window in the summary view. + -- XXX We should have the following views: + -- 1. overall summary, "default" window only (default) + -- 2. thread-agnostic, one line per window, all the counter + -- averages as columns. --windows + -- 3. thread-aware, one table per window, one row per thread, all + -- the counters as columns. --threads + -- 4. thread-aware, one table per window per counter, one row per + -- thread, counter attributes as columns. --counters { analyseFile = path , analyseFoldThreads = mergeThreads , analyseMaxLines = maxLines From 123a43b9b63b217d2ea5f400e4ec3e246dc2d269 Mon Sep 17 00:00:00 2001 From: Harendra Kumar Date: Sat, 9 May 2026 16:14:36 +0530 Subject: [PATCH 14/16] Remove Console.hs, File.hs --- haskell-perf.cabal | 2 -- lib/Streamly/Metrics/Console.hs | 9 --------- lib/Streamly/Metrics/File.hs | 11 ----------- 3 files changed, 22 deletions(-) delete mode 100644 lib/Streamly/Metrics/Console.hs delete mode 100644 lib/Streamly/Metrics/File.hs diff --git a/haskell-perf.cabal b/haskell-perf.cabal index ad3109b..fc9329a 100644 --- a/haskell-perf.cabal +++ b/haskell-perf.cabal @@ -33,8 +33,6 @@ extra-doc-files: extra-source-files: examples/*.hs test/snippets/*.hs - lib/Streamly/Metrics/File.hs - lib/Streamly/Metrics/Console.hs source-repository head type: git diff --git a/lib/Streamly/Metrics/Console.hs b/lib/Streamly/Metrics/Console.hs deleted file mode 100644 index 6f8a60c..0000000 --- a/lib/Streamly/Metrics/Console.hs +++ /dev/null @@ -1,9 +0,0 @@ -module Streamly.Metrics.Console - ( - ) -where - -import Streamly.Metrics.Type -import Streamly.Internal.Data.Tuple.Strict (Tuple'(..)) -import Streamly.Internal.Data.Array (Array) -import Streamly.Internal.Data.Time.Units (AbsTime) diff --git a/lib/Streamly/Metrics/File.hs b/lib/Streamly/Metrics/File.hs deleted file mode 100644 index c79ba0c..0000000 --- a/lib/Streamly/Metrics/File.hs +++ /dev/null @@ -1,11 +0,0 @@ -module Streamly.Metrics.File - ( - ) -where - -import Streamly.Metrics.Type -import Streamly.Internal.Data.Tuple.Strict (Tuple'(..)) -import Streamly.Internal.Data.Array (Array) -import Streamly.Internal.Data.Time.Units (AbsTime) -import Data.Word (Word64) - From 68936d23df9e434254851fa68866c4935f09ed05 Mon Sep 17 00:00:00 2001 From: Harendra Kumar Date: Sun, 10 May 2026 04:03:07 +0530 Subject: [PATCH 15/16] Remove process level counter reporting --- hperf/Main.hs | 57 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 17 deletions(-) diff --git a/hperf/Main.hs b/hperf/Main.hs index 7a0480c..6701746 100644 --- a/hperf/Main.hs +++ b/hperf/Main.hs @@ -7,7 +7,7 @@ import Perf.Eventlog.Aggregate ( collectThreadCounter, translateThreadEvents, ) -import Control.Monad (when) +-- import Control.Monad (when) import Data.Either (isLeft) import Data.Int (Int64) import Data.IntMap (IntMap) @@ -261,10 +261,17 @@ showCounterDetailsForWindow maxLines statsRaw tidMap (w, ctr) = do in printf "%d" tid : lb : map snd v select ((_, window, counter), _) = window == w && counter == ctr --- XXX we can only use the Process level counters to report the entire --- program's CPU time including all threads in the saummary data. Reporting --- this in the Windows times is not useful. We can use the OS thread's --- ThreadCPUTime only if there is only one Haskell thread running. +-- ProcessCPUTime is not useful in estimating the CPU time used by a Window. We +-- should use ThreadCPUTime instead and that too can be used only if we have a +-- single Haskell thread. However ProcessCPUTime can be used in the following +-- ways: +-- +-- (1) for reporting the entire program's CPU usage and compare it against the +-- elapsed time to find the overall CPU utilization. +-- +-- (2) In a Window if we can find the elapsed time as well, we can use it to +-- find the overall Haskell process CPU utilization during the Window. +-- processLevelCounters :: [Counter] processLevelCounters = [ ProcessCPUTime @@ -281,17 +288,15 @@ processLevelCounters = -- which will store a list or Map of counters. -- | Print a table for the given window, one row per thread that ever entered --- the window, listing accumulated value for each --- counter. +-- the window, listing accumulated value for each counter. showAllCountersForWindow :: Int - -> Bool -> [((Word32, String, Counter), [(String, Int)])] -> Map Word32 String -> [Counter] -> String -> IO () -showAllCountersForWindow maxLines concurrent stats tidMap ctrs w = do +showAllCountersForWindow maxLines stats tidMap ctrs w = do let windowTotals :: [((Word32, Counter), Int)] windowTotals = fmap toTotal $ filter selectWindow stats @@ -342,21 +347,34 @@ showAllCountersForWindow maxLines concurrent stats tidMap ctrs w = do if w == "default" then putStrLn $ "Global thread wise stat summary" else do - -- When collapsing windows, if the windows are concurrent then - -- we cannot combine the window level counters. putStrLn $ "Window [" ++ w ++ "]" ++ " thread wise stat summary" + -- XXX ProcessCPUTime can only be useful when elapsed time is + -- also reported to find the cpu utilization during the window. + -- Put it under a CLI option. + {- when (not concurrent) $ do putStrLn "" mapM_ (printProcessLevelCounter windowTotals) [ProcessCPUTime] mapM_ (\x -> putStr " " >> printProcessLevelCounter windowTotals x) [ProcessUserCPUTime, ProcessSystemCPUTime] + -} + + -- XXX In a single-haskell-threaded case ThreadCPUTime can be + -- used to estimate the RTS overhead during the Window (total + -- OSThreadCPUtime - total HaskellThreadCPUTime), provided the + -- OS thread does not change during the entire window. If it is + -- possible we should put that under a separate CLI option, and + -- also ensure that the RTS is single threaded and there are no + -- more than one user threads present in the log. + {- if ":foreign" `List.isSuffixOf` w then return () else do putStrLn "" mapM_ (printProcessLevelCounter windowTotals) [ProcessCPUTime] + -- XXX is this for Haskell thread or OS thread? let threadCPUTimeTotal = getProcessLevelCounter sum windowTotals ThreadCPUTime putStrLn $ " ThreadCPUTime:" ++ toString threadCPUTimeTotal @@ -365,9 +383,12 @@ showAllCountersForWindow maxLines concurrent stats tidMap ctrs w = do putStrLn $ " GcCPUTime:" ++ toString gcCPUTime let processCPUTime = getProcessLevelCounter head windowTotals ProcessCPUTime + -- XXX this calculation is wrong even on a single + -- threaded RTS, as there may be two os threads. let rtsCPUTime = processCPUTime - gcCPUTime - threadCPUTimeTotal putStrLn $ " RtsCPUTime(*):" ++ toString rtsCPUTime + -} let cnt = length allRows putStrLn "" @@ -384,6 +405,7 @@ showAllCountersForWindow maxLines concurrent stats tidMap ctrs w = do if (":foreign" `List.isSuffixOf` w) then ctrs List.\\ [ThreadAllocated] else ctrs + {- getProcessLevelCounter f wt c = f $ fmap snd $ filter (selectCounter c) wt printProcessLevelCounter wt c = do -- Only one thread should have this @@ -399,6 +421,7 @@ showAllCountersForWindow maxLines concurrent stats tidMap ctrs w = do [x] -> putStrLn $ show c ++ ": " ++ toString x _ -> error $ "Multiple values for counter " ++ show c ++ " in window " ++ w + -} toString = Text.unpack . prettyI (Just ',') colHeaders = @@ -633,13 +656,12 @@ validateLabels tidMap = mapM_ checkLabel (Map.toList tidMap) showAllCountersPerWindow :: Int - -> Bool -> [((Word32, String, Counter), [(String, Int)])] -> [((Word32, String, Counter), [(String, Int)])] -> Map Word32 (Maybe String) -> [(String, Counter)] -> IO () -showAllCountersPerWindow maxLines foldWindowStats statsRaw statsFlattened tidMap windowCounterList = do +showAllCountersPerWindow maxLines statsRaw statsFlattened tidMap windowCounterList = do -- TODO: filter the counters to be printed based on Config/CLI -- TODO: filter the windows or threads to be printed let ctrs = List.nub $ fmap snd windowCounterList @@ -649,7 +671,7 @@ showAllCountersPerWindow maxLines foldWindowStats statsRaw statsFlattened tidMap let f w = showAllCountersForWindow - maxLines foldWindowStats (getStats w) (fmap fromJust tidMap) ctrs w + maxLines (getStats w) (fmap fromJust tidMap) ctrs w in mapM_ f wins showOneCounterPerWindow :: @@ -665,7 +687,9 @@ showOneCounterPerWindow maxLines rawStats foldedStats tidMap windowCounterList = -- hack - currently we do not compute avg and stddev in flattened let getStats w = if w == "default" then rawStats else foldedStats -- For each (window, counter) list all threads - let f (w,c) = showCounterDetailsForWindow maxLines (getStats w) (fmap fromJust tidMap) (w,c) + let f (w,c) = + showCounterDetailsForWindow + maxLines (getStats w) (fmap fromJust tidMap) (w,c) in mapM_ f windowCounterList ------------------------------------------------------------------------------- @@ -725,5 +749,4 @@ main = do then showOneCounterPerWindow maxLines statsMap foldedStats tidMap windowCounterList1 else showAllCountersPerWindow - maxLines - mergeThreads statsMap foldedStats tidMap windowCounterList1 + maxLines statsMap foldedStats tidMap windowCounterList1 From 2e459136adecab3989272105477543284a09abce Mon Sep 17 00:00:00 2001 From: Harendra Kumar Date: Sun, 10 May 2026 04:08:52 +0530 Subject: [PATCH 16/16] Add the new perf doc link in README --- README.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index e78b8a9..ade53b0 100644 --- a/README.md +++ b/README.md @@ -49,9 +49,10 @@ With these tools you can find: ## Detailed documents -For more details on each of the performance analysis methods see the following -documents: -* [GHC RTS Stats](docs/ghc-rts-performance-analysis.md) -* [threadCPUTime# RTS primitive](docs/thread-cputime-primop.md) -* [GHC Event logging](docs/eventlog-performance-analysis.md) +For more details on performance measurement and each of the performance +analysis methods see the following documents: +* [Haskell performance measurement](docs/haskell-perf-measurement.md) +* [Using RTS Stats](docs/ghc-rts-performance-analysis.md) +* [Using threadCPUTime RTS primitive](docs/thread-cputime-primop.md) +* [Using Event log based measurement](docs/eventlog-performance-analysis.md) * [GHC patches details](dev/ghc-work.md)