Skip to content

Commit a29092e

Browse files
perf: Group together packages by repo when verifying tarballs (haskell#10121)
verifyFetchedTarball has the effect of deserialising the index tarball (see call to Sec.withIndex). verifyFetchedTarball is called individually for each package in the build plan (see ProjectPlanning.hs). Not once per repo. The hackage tarball is now 880mb so it takes a non significant amount of time to deserialise this (much better after haskell/tar#95). This code path is important as it can add 1s with these 38 calls on to the initial load of a project and scales linearly with the size of your build tree. Reproducer: Simple project with "lens" dependency deserialises the index tarball 38 times. Solution: Refactor verifyFetchedTarball to run once per repo rather than once per package. In future it would be much better to refactor this function so that the items are not immediately grouped and ungrouped but I didn't want to take that on immediately. Fixes haskell#10110 (cherry picked from commit 7d46115) Co-authored-by: Matthew Pickering <[email protected]>
1 parent 2338e1d commit a29092e

File tree

4 files changed

+85
-52
lines changed

4 files changed

+85
-52
lines changed

.hlint.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@
9292
- ignore: {name: "Use unwords"} # 8 hints
9393
- ignore: {name: "Use void"} # 22 hints
9494
- ignore: {name: "Use when"} # 1 hint
95+
- ignore: {name: "Use uncurry"} # 1 hint
9596

9697
- arguments:
9798
- --ignore-glob=Cabal-syntax/src/Distribution/Fields/Lexer.hs

cabal-install/src/Distribution/Client/FetchUtils.hs

Lines changed: 62 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ module Distribution.Client.FetchUtils
2525
-- ** specifically for repo packages
2626
, checkRepoTarballFetched
2727
, fetchRepoTarball
28-
, verifyFetchedTarball
28+
, verifyFetchedTarballs
2929

3030
-- ** fetching packages asynchronously
3131
, asyncFetchPackages
@@ -98,6 +98,7 @@ import System.IO
9898
, openTempFile
9999
)
100100

101+
import Control.Monad (forM)
101102
import Distribution.Client.Errors
102103
import qualified Hackage.Security.Client as Sec
103104
import qualified Hackage.Security.Util.Checked as Sec
@@ -152,40 +153,66 @@ checkRepoTarballFetched repo pkgid = do
152153
then return (Just file)
153154
else return Nothing
154155

155-
verifyFetchedTarball :: Verbosity -> RepoContext -> Repo -> PackageId -> IO Bool
156-
verifyFetchedTarball verbosity repoCtxt repo pkgid =
157-
let file = packageFile repo pkgid
158-
handleError :: IO Bool -> IO Bool
159-
handleError act = do
160-
res <- Safe.try act
161-
case res of
162-
Left e -> warn verbosity ("Error verifying fetched tarball " ++ file ++ ", will redownload: " ++ show (e :: SomeException)) >> pure False
163-
Right b -> pure b
164-
in handleError $ do
165-
exists <- doesFileExist file
166-
if not exists
167-
then return True -- if the file does not exist, it vacuously passes validation, since it will be downloaded as necessary with what we will then check is a valid hash.
168-
else case repo of
169-
-- a secure repo has hashes we can compare against to confirm this is the correct file.
170-
RepoSecure{} ->
171-
repoContextWithSecureRepo repoCtxt repo $ \repoSecure ->
172-
Sec.withIndex repoSecure $ \callbacks ->
173-
let warnAndFail s = warn verbosity ("Fetched tarball " ++ file ++ " does not match server, will redownload: " ++ s) >> return False
174-
in -- the do block in parens is due to dealing with the checked exceptions mechanism.
175-
( do
176-
fileInfo <- Sec.indexLookupFileInfo callbacks pkgid
177-
sz <- Sec.FileLength . fromInteger <$> getFileSize file
178-
if sz /= Sec.fileInfoLength (Sec.trusted fileInfo)
179-
then warnAndFail "file length mismatch"
180-
else do
181-
res <- Sec.compareTrustedFileInfo (Sec.trusted fileInfo) <$> Sec.computeFileInfo (Sec.Path file :: Sec.Path Sec.Absolute)
182-
if res
183-
then pure True
184-
else warnAndFail "file hash mismatch"
185-
)
186-
`Sec.catchChecked` (\(e :: Sec.InvalidPackageException) -> warnAndFail (show e))
187-
`Sec.catchChecked` (\(e :: Sec.VerificationError) -> warnAndFail (show e))
188-
_ -> pure True
156+
verifyFetchedTarballs
157+
:: Verbosity
158+
-> RepoContext
159+
-> Repo
160+
-> [PackageId]
161+
-> IO
162+
( [ Either
163+
(Repo, PackageId) -- Verified
164+
(Repo, PackageId) -- unverified)
165+
]
166+
)
167+
verifyFetchedTarballs verbosity repoCtxt repo pkgids =
168+
-- Establish the context once per repo (see #10110), this codepath is important
169+
-- to be fast as it can happen when no other building happens.
170+
let establishContext k =
171+
case repo of
172+
RepoSecure{} ->
173+
repoContextWithSecureRepo repoCtxt repo $ \repoSecure ->
174+
Sec.withIndex repoSecure $ \callbacks -> k (Just callbacks)
175+
_ -> k Nothing
176+
in do
177+
establishContext $ \mCallbacks ->
178+
forM pkgids $ \pkgid -> do
179+
let file = packageFile repo pkgid
180+
res <- verifyFetchedTarball verbosity file mCallbacks pkgid
181+
return $ if res then Left (repo, pkgid) else Right (repo, pkgid)
182+
183+
verifyFetchedTarball :: Verbosity -> FilePath -> Maybe Sec.IndexCallbacks -> PackageId -> IO Bool
184+
verifyFetchedTarball verbosity file mCallbacks pkgid =
185+
let
186+
handleError :: IO Bool -> IO Bool
187+
handleError act = do
188+
res <- Safe.try act
189+
case res of
190+
Left e -> warn verbosity ("Error verifying fetched tarball " ++ file ++ ", will redownload: " ++ show (e :: SomeException)) >> pure False
191+
Right b -> pure b
192+
in
193+
handleError $ do
194+
exists <- doesFileExist file
195+
if not exists
196+
then return True -- if the file does not exist, it vacuously passes validation, since it will be downloaded as necessary with what we will then check is a valid hash.
197+
else case mCallbacks of
198+
-- a secure repo has hashes we can compare against to confirm this is the correct file.
199+
Just callbacks ->
200+
let warnAndFail s = warn verbosity ("Fetched tarball " ++ file ++ " does not match server, will redownload: " ++ s) >> return False
201+
in -- the do block in parens is due to dealing with the checked exceptions mechanism.
202+
( do
203+
fileInfo <- Sec.indexLookupFileInfo callbacks pkgid
204+
sz <- Sec.FileLength . fromInteger <$> getFileSize file
205+
if sz /= Sec.fileInfoLength (Sec.trusted fileInfo)
206+
then warnAndFail "file length mismatch"
207+
else do
208+
res <- Sec.compareTrustedFileInfo (Sec.trusted fileInfo) <$> Sec.computeFileInfo (Sec.Path file :: Sec.Path Sec.Absolute)
209+
if res
210+
then pure True
211+
else warnAndFail "file hash mismatch"
212+
)
213+
`Sec.catchChecked` (\(e :: Sec.InvalidPackageException) -> warnAndFail (show e))
214+
`Sec.catchChecked` (\(e :: Sec.VerificationError) -> warnAndFail (show e))
215+
_ -> pure True
189216

190217
-- | Fetch a package if we don't have it already.
191218
fetchPackage

cabal-install/src/Distribution/Client/ProjectPlanning.hs

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ import Distribution.Client.SetupWrapper
125125
import Distribution.Client.Store
126126
import Distribution.Client.Targets (userToPackageConstraint)
127127
import Distribution.Client.Types
128-
import Distribution.Client.Utils (incVersion)
128+
import Distribution.Client.Utils (concatMapM, incVersion)
129129

130130
import qualified Distribution.Client.BuildReports.Storage as BuildReports
131131
import qualified Distribution.Client.IndexUtils as IndexUtils
@@ -200,7 +200,7 @@ import qualified Distribution.Solver.Types.ComponentDeps as CD
200200
import qualified Distribution.Compat.Graph as Graph
201201

202202
import Control.Exception (assert)
203-
import Control.Monad (forM, sequence)
203+
import Control.Monad (sequence)
204204
import Control.Monad.IO.Class (liftIO)
205205
import Control.Monad.State as State (State, execState, runState, state)
206206
import Data.Foldable (fold)
@@ -1056,25 +1056,29 @@ getPackageSourceHashes verbosity withRepoCtx solverPlan = do
10561056
-- Tarballs from repositories, either where the repository provides
10571057
-- hashes as part of the repo metadata, or where we will have to
10581058
-- download and hash the tarball.
1059-
repoTarballPkgsWithMetadataUnvalidated :: [(PackageId, Repo)]
1060-
repoTarballPkgsWithoutMetadata :: [(PackageId, Repo)]
1059+
repoTarballPkgsWithMetadataUnvalidated :: [(Repo, [PackageId])]
1060+
repoTarballPkgsWithoutMetadata :: [(Repo, PackageId)]
10611061
( repoTarballPkgsWithMetadataUnvalidated
10621062
, repoTarballPkgsWithoutMetadata
10631063
) =
10641064
partitionEithers
10651065
[ case repo of
1066-
RepoSecure{} -> Left (pkgid, repo)
1067-
_ -> Right (pkgid, repo)
1066+
RepoSecure{} -> Left (repo, [pkgid])
1067+
_ -> Right (repo, pkgid)
10681068
| (pkgid, RepoTarballPackage repo _ _) <- allPkgLocations
10691069
]
10701070

1071+
-- Group up the unvalidated packages by repo so we only read the remote
1072+
-- index once per repo (see #10110). The packages are ungrouped here and then regrouped
1073+
-- below, it would be better in future to refactor this whole code path so that we don't
1074+
-- repeatedly group and ungroup.
1075+
repoTarballPkgsWithMetadataUnvalidatedMap = Map.fromListWith (++) repoTarballPkgsWithMetadataUnvalidated
1076+
10711077
(repoTarballPkgsWithMetadata, repoTarballPkgsToDownloadWithMeta) <- fmap partitionEithers $
10721078
liftIO $
1073-
withRepoCtx $ \repoctx -> forM repoTarballPkgsWithMetadataUnvalidated $
1074-
\x@(pkg, repo) ->
1075-
verifyFetchedTarball verbosity repoctx repo pkg >>= \b -> case b of
1076-
True -> return $ Left x
1077-
False -> return $ Right x
1079+
withRepoCtx $ \repoctx -> flip concatMapM (Map.toList repoTarballPkgsWithMetadataUnvalidatedMap) $
1080+
\(repo, pkgids) ->
1081+
verifyFetchedTarballs verbosity repoctx repo pkgids
10781082

10791083
-- For tarballs from repos that do not have hashes available we now have
10801084
-- to check if the packages were downloaded already.
@@ -1088,9 +1092,9 @@ getPackageSourceHashes verbosity withRepoCtx solverPlan = do
10881092
[ do
10891093
mtarball <- checkRepoTarballFetched repo pkgid
10901094
case mtarball of
1091-
Nothing -> return (Left (pkgid, repo))
1095+
Nothing -> return (Left (repo, pkgid))
10921096
Just tarball -> return (Right (pkgid, tarball))
1093-
| (pkgid, repo) <- repoTarballPkgsWithoutMetadata
1097+
| (repo, pkgid) <- repoTarballPkgsWithoutMetadata
10941098
]
10951099

10961100
let repoTarballPkgsToDownload = repoTarballPkgsToDownloadWithMeta ++ repoTarballPkgsToDownloadWithNoMeta
@@ -1126,9 +1130,9 @@ getPackageSourceHashes verbosity withRepoCtx solverPlan = do
11261130
| pkgid <- pkgids
11271131
]
11281132
| (repo, pkgids) <-
1129-
map (\grp@((_, repo) :| _) -> (repo, map fst (NE.toList grp)))
1130-
. NE.groupBy ((==) `on` (remoteRepoName . repoRemote . snd))
1131-
. sortBy (compare `on` (remoteRepoName . repoRemote . snd))
1133+
map (\grp@((repo, _) :| _) -> (repo, map snd (NE.toList grp)))
1134+
. NE.groupBy ((==) `on` (remoteRepoName . repoRemote . fst))
1135+
. sortBy (compare `on` (remoteRepoName . repoRemote . fst))
11321136
$ repoTarballPkgsWithMetadata
11331137
]
11341138

@@ -1140,7 +1144,7 @@ getPackageSourceHashes verbosity withRepoCtx solverPlan = do
11401144
[ do
11411145
tarball <- fetchRepoTarball verbosity repoctx repo pkgid
11421146
return (pkgid, tarball)
1143-
| (pkgid, repo) <- repoTarballPkgsToDownload
1147+
| (repo, pkgid) <- repoTarballPkgsToDownload
11441148
]
11451149

11461150
return

cabal-install/src/Distribution/Client/Utils.hs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ module Distribution.Client.Utils
3838
, listFilesInside
3939
, safeRead
4040
, hasElem
41+
, concatMapM
4142
, occursOnlyOrBefore
4243
, giveRTSWarning
4344
) where

0 commit comments

Comments
 (0)