Skip to content

Commit c115d47

Browse files
committed
Merge pull request #350 from commercialhaskell/faster-index-populate
Faster index populate
2 parents 644478f + 10d7f05 commit c115d47

File tree

2 files changed

+82
-103
lines changed

2 files changed

+82
-103
lines changed

src/Stack/PackageIndex.hs

Lines changed: 81 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ module Stack.PackageIndex
2323
) where
2424

2525
import qualified Codec.Archive.Tar as Tar
26-
import Control.Applicative
2726
import Control.Exception (Exception)
2827
import Control.Exception.Enclosed (tryIO)
2928
import Control.Monad (unless, when, liftM, mzero)
@@ -38,15 +37,19 @@ import Data.Aeson.Extended
3837
import qualified Data.Binary as Binary
3938
import Data.Binary.VersionTagged (taggedDecodeOrLoad, BinarySchema (..))
4039
import Data.ByteString (ByteString)
40+
import qualified Data.Word8 as Word8
41+
import qualified Data.ByteString as S
42+
import qualified Data.ByteString.Unsafe as SU
43+
import qualified Data.ByteString.Char8 as S8
4144
import qualified Data.ByteString.Lazy as L
42-
import Data.Conduit (($$), (=$), yield, Producer, ZipSink (..))
45+
import Data.Conduit (($$), (=$))
4346
import Data.Conduit.Binary (sinkHandle,
4447
sourceHandle)
45-
import qualified Data.Conduit.List as CL
4648
import Data.Conduit.Zlib (ungzip)
49+
import Data.Foldable (forM_)
4750
import Data.Int (Int64)
4851
import Data.Map (Map)
49-
import qualified Data.Map as Map
52+
import qualified Data.Map.Strict as Map
5053
import Data.Monoid
5154
import Data.Text (Text)
5255
import qualified Data.Text as T
@@ -78,18 +81,6 @@ import System.IO (IOMode (ReadMode, WriteM
7881
withBinaryFile)
7982
import System.Process.Read (readInNull, EnvOverride, doesExecutableExist)
8083

81-
-- | A cabal file with name and version parsed from the filepath, and the
82-
-- package description itself ready to be parsed. It's left in unparsed form
83-
-- for efficiency.
84-
data UnparsedCabalFile = UnparsedCabalFile
85-
{ ucfName :: PackageName
86-
, ucfVersion :: Version
87-
, ucfOffset :: !Int64
88-
-- ^ Byte offset into the 00-index.tar file for the entry contents
89-
, ucfSize :: !Int64
90-
-- ^ Size of the entry contents, in bytes
91-
}
92-
9384
data PackageCache = PackageCache
9485
{ pcOffset :: !Int64
9586
-- ^ offset in bytes into the 00-index.tar file for the .cabal file contents
@@ -100,65 +91,97 @@ data PackageCache = PackageCache
10091
deriving Generic
10192
instance Binary.Binary PackageCache
10293

103-
-- | Stream all of the cabal files from the 00-index tar file.
104-
withSourcePackageIndex
94+
newtype PackageCacheMap = PackageCacheMap (Map PackageIdentifier PackageCache)
95+
deriving Binary.Binary
96+
instance BinarySchema PackageCacheMap where
97+
-- Don't forget to update this if you change the datatype in any way!
98+
binarySchema _ = 1
99+
-- | Populate the package index caches and return them.
100+
populateCache
105101
:: (MonadIO m, MonadThrow m, MonadReader env m, HasConfig env, HasHttpManager env, MonadLogger m, MonadBaseControl IO m, MonadCatch m)
106102
=> EnvOverride
107103
-> PackageIndex
108-
-> (Producer m (Either UnparsedCabalFile (PackageIdentifier, L.ByteString)) -> m a)
109-
-> m a
110-
withSourcePackageIndex menv index cont = do
104+
-> m (Map PackageIdentifier PackageCache)
105+
populateCache menv index = do
106+
$logSticky "Populating index cache ..."
111107
requireIndex menv index
112108
-- This uses full on lazy I/O instead of ResourceT to provide some
113109
-- protections. Caveat emptor
114-
cont $ do
115-
path <- configPackageIndex (indexName index)
116-
lbs <- liftIO $ L.readFile $ Path.toFilePath path
117-
loop 0 (Tar.read lbs)
110+
path <- configPackageIndex (indexName index)
111+
lbs <- liftIO $ L.readFile $ Path.toFilePath path
112+
pis <- loop 0 Map.empty (Tar.read lbs)
113+
114+
when (indexRequireHashes index) $ forM_ (Map.toList pis) $ \(ident, pc) ->
115+
case pcDownload pc of
116+
Just _ -> return ()
117+
Nothing -> throwM $ MissingRequiredHashes (indexName index) ident
118+
119+
$logStickyDone "Populated index cache."
120+
121+
return pis
118122
where
119-
loop blockNo (Tar.Next e es) = do
120-
goE blockNo e
121-
loop blockNo' es
123+
loop !blockNo !m (Tar.Next e es) =
124+
loop (blockNo + entrySizeInBlocks e) (goE blockNo m e) es
125+
loop _ m Tar.Done = return m
126+
loop _ _ (Tar.Fail e) = throwM e
127+
128+
goE blockNo m e =
129+
case Tar.entryContent e of
130+
Tar.NormalFile lbs size ->
131+
case parseNameVersion $ Tar.entryPath e of
132+
Just (ident, ".cabal") -> addCabal ident size
133+
Just (ident, ".json") -> addJSON ident lbs
134+
_ -> m
135+
_ -> m
122136
where
123-
blockNo' = blockNo + entrySizeInBlocks e
124-
loop _ Tar.Done = return ()
125-
loop _ (Tar.Fail e) = throwM e
126-
127-
goE blockNo e
128-
| Just front <- T.stripSuffix ".cabal" $ T.pack $ Tar.entryPath e
129-
, Tar.NormalFile _ size <- Tar.entryContent e = do
130-
PackageIdentifier name version <- parseNameVersion front
131-
yield $ Left UnparsedCabalFile
132-
{ ucfName = name
133-
, ucfVersion = version
134-
, ucfOffset = (blockNo + 1) * 512
135-
, ucfSize = size
137+
addCabal ident size = Map.insertWith
138+
(\_ pcOld -> pcNew { pcDownload = pcDownload pcOld })
139+
ident
140+
pcNew
141+
m
142+
where
143+
pcNew = PackageCache
144+
{ pcOffset = (blockNo + 1) * 512
145+
, pcSize = size
146+
, pcDownload = Nothing
136147
}
137-
| Just front <- T.stripSuffix ".json" $ T.pack $ Tar.entryPath e
138-
, Tar.NormalFile lbs _size <- Tar.entryContent e = do
139-
ident <- parseNameVersion front
140-
yield $ Right (ident, lbs)
141-
| otherwise = return ()
148+
addJSON ident lbs =
149+
case decode lbs of
150+
Nothing -> m
151+
Just pd -> Map.insertWith
152+
(\_ pc -> pc { pcDownload = Just pd })
153+
ident
154+
PackageCache
155+
{ pcOffset = 0
156+
, pcSize = 0
157+
, pcDownload = Just pd
158+
}
159+
m
160+
161+
breakSlash x
162+
| S.null z = Nothing
163+
| otherwise = Just (y, SU.unsafeTail z)
164+
where
165+
(y, z) = S.break (== Word8._slash) x
142166

143167
parseNameVersion t1 = do
144-
let (p', t2) = T.break (== '/') $ T.replace "\\" "/" t1
145-
p <- parsePackageNameFromString $ T.unpack p'
146-
t3 <- maybe (throwM $ InvalidCabalPath t1 "no slash") return
147-
$ T.stripPrefix "/" t2
148-
let (v', t4) = T.break (== '/') t3
149-
v <- parseVersionFromString $ T.unpack v'
150-
when (t4 /= T.cons '/' p') $ throwM $ InvalidCabalPath t1 $ "Expected at end: " <> p'
151-
return $ PackageIdentifier p v
168+
(p', t3) <- breakSlash
169+
$ S.map (\c -> if c == Word8._backslash then Word8._slash else c)
170+
$ S8.pack t1
171+
p <- parsePackageName p'
172+
(v', t5) <- breakSlash t3
173+
v <- parseVersion v'
174+
let (t6, suffix) = S.break (== Word8._period) t5
175+
if t6 == p'
176+
then return (PackageIdentifier p v, suffix)
177+
else Nothing
152178

153179
data PackageIndexException
154-
= InvalidCabalPath Text Text
155-
| GitNotAvailable IndexName
180+
= GitNotAvailable IndexName
156181
| MissingRequiredHashes IndexName PackageIdentifier
157182
deriving Typeable
158183
instance Exception PackageIndexException
159184
instance Show PackageIndexException where
160-
show (InvalidCabalPath x y) =
161-
"Invalid cabal path " ++ T.unpack x ++ ": " ++ T.unpack y
162185
show (GitNotAvailable name) = concat
163186
[ "Package index "
164187
, T.unpack $ indexNameText name
@@ -359,51 +382,6 @@ getPackageCaches menv = do
359382

360383
return (fmap (index,) pis')
361384

362-
newtype PackageCacheMap = PackageCacheMap (Map PackageIdentifier PackageCache)
363-
deriving Binary.Binary
364-
instance BinarySchema PackageCacheMap where
365-
-- Don't forget to update this if you change the datatype in any way!
366-
binarySchema _ = 1
367-
368-
-- | Populate the package index caches and return them.
369-
populateCache :: (MonadIO m, MonadThrow m, MonadReader env m, HasConfig env, MonadLogger m, HasHttpManager env, MonadBaseControl IO m, MonadCatch m)
370-
=> EnvOverride
371-
-> PackageIndex
372-
-> m (Map PackageIdentifier PackageCache)
373-
populateCache menv index = do
374-
$logSticky "Populating index cache ..."
375-
let toIdent (Left ucf) = Just
376-
( PackageIdentifier (ucfName ucf) (ucfVersion ucf)
377-
, PackageCache
378-
{ pcOffset = ucfOffset ucf
379-
, pcSize = ucfSize ucf
380-
, pcDownload = Nothing
381-
}
382-
)
383-
toIdent (Right _) = Nothing
384-
385-
parseDownload (Left _) = Nothing
386-
parseDownload (Right (ident, lbs)) = do
387-
case decode lbs of
388-
Nothing -> Nothing
389-
Just pd -> Just (ident, pd)
390-
391-
withSourcePackageIndex menv index $ \source -> do
392-
(pis, pds) <- source $$ getZipSink ((,)
393-
<$> ZipSink (CL.mapMaybe toIdent =$ CL.consume)
394-
<*> ZipSink (Map.fromList <$> (CL.mapMaybe parseDownload =$ CL.consume)))
395-
396-
pis' <- liftM Map.fromList $ forM pis $ \(ident, pc) ->
397-
case Map.lookup ident pds of
398-
Just d -> return (ident, pc { pcDownload = Just d })
399-
Nothing
400-
| indexRequireHashes index -> throwM $ MissingRequiredHashes (indexName index) ident
401-
| otherwise -> return (ident, pc)
402-
403-
$logStickyDone "Populated index cache."
404-
405-
return pis'
406-
407385
--------------- Lifted from cabal-install, Distribution.Client.Tar:
408386
-- | Return the number of blocks in an entry.
409387
entrySizeInBlocks :: Tar.Entry -> Int64

stack.cabal

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ library
142142
, zlib >= 0.5.4.2
143143
, deepseq
144144
, file-embed
145+
, word8
145146
if !os(windows)
146147
build-depends: unix >= 2.7.0.1
147148
default-language: Haskell2010

0 commit comments

Comments
 (0)