Skip to content

Commit b66f5cc

Browse files
committed
Codec.Archive.Tar.Read: refactor getEntry
1 parent 928c63c commit b66f5cc

File tree

3 files changed

+140
-88
lines changed

3 files changed

+140
-88
lines changed

Codec/Archive/Tar/Read.hs

Lines changed: 123 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
{-# LANGUAGE CPP, DeriveDataTypeable, BangPatterns #-}
1+
{-# LANGUAGE BangPatterns #-}
22
-----------------------------------------------------------------------------
33
-- |
44
-- Module : Codec.Archive.Tar.Read
@@ -12,18 +12,22 @@
1212
-- Portability : portable
1313
--
1414
-----------------------------------------------------------------------------
15-
module Codec.Archive.Tar.Read (read, FormatError(..)) where
15+
module Codec.Archive.Tar.Read
16+
( read
17+
, FormatError(..)
18+
) where
1619

1720
import Codec.Archive.Tar.Types
1821

1922
import Data.Char (ord)
2023
import Data.Int (Int64)
21-
import Data.Bits (Bits(shiftL))
24+
import Data.Bits (Bits(shiftL, (.&.), complement))
2225
import Control.Exception (Exception(..))
2326
import Data.Typeable (Typeable)
2427
import Control.Applicative
2528
import Control.Monad
2629
import Control.DeepSeq
30+
import Control.Monad.Trans.State.Lazy
2731

2832
import qualified Data.ByteString as BS
2933
import qualified Data.ByteString.Char8 as BS.Char8
@@ -64,97 +68,128 @@ instance NFData FormatError where
6468
-- * The conversion is done lazily.
6569
--
6670
read :: LBS.ByteString -> Entries FormatError
67-
read = unfoldEntries getEntry
68-
69-
getEntry :: LBS.ByteString -> Either FormatError (Maybe (Entry, LBS.ByteString))
70-
getEntry bs
71-
| BS.length header < 512 = Left TruncatedArchive
72-
73-
-- Tar files end with at least two blocks of all '0'. Checking this serves
74-
-- two purposes. It checks the format but also forces the tail of the data
75-
-- which is necessary to close the file if it came from a lazily read file.
76-
--
77-
-- It's tempting to fall into trailer parsing as soon as LBS.head bs == '\0',
78-
-- because, if interpreted as an 'Entry', it means that 'entryTarPath' is an empty
79-
-- string. Yet it's not a concern of this function: parse it as an 'Entry'
80-
-- and let further pipeline such as 'checkEntrySecurity' deal with it. After all,
81-
-- it might be a format extension with unknown semantics. Such somewhat malformed
82-
-- archives do exist in the wild, see https://github.com/haskell/tar/issues/73.
83-
--
84-
-- Only if an entire block is null, we assume that we are parsing a trailer.
85-
| LBS.all (== 0) (LBS.take 512 bs) = case LBS.splitAt 1024 bs of
86-
(end, trailing)
87-
| LBS.length end /= 1024 -> Left ShortTrailer
88-
| not (LBS.all (== 0) end) -> Left BadTrailer
89-
| not (LBS.all (== 0) trailing) -> Left TrailingJunk
90-
| otherwise -> Right Nothing
91-
92-
| otherwise = do
93-
94-
case (chksum_, format_) of
71+
read = evalState (readStreaming getN get)
72+
where
73+
getN :: Int64 -> State LBS.ByteString LBS.ByteString
74+
getN n = do
75+
(pref, st) <- LBS.splitAt n <$> get
76+
put st
77+
pure pref
78+
79+
readStreaming
80+
:: Monad m
81+
=> (Int64 -> m LBS.ByteString)
82+
-> m LBS.ByteString
83+
-> m (Entries FormatError)
84+
readStreaming = (unfoldEntriesM id .) . getEntryStreaming
85+
86+
getEntryStreaming
87+
:: Monad m
88+
=> (Int64 -> m LBS.ByteString)
89+
-> m LBS.ByteString
90+
-> m (Either FormatError (Maybe Entry))
91+
getEntryStreaming getN getAll = do
92+
header <- getN 512
93+
if LBS.length header < 512 then pure (Left TruncatedArchive) else do
94+
95+
-- Tar files end with at least two blocks of all '0'. Checking this serves
96+
-- two purposes. It checks the format but also forces the tail of the data
97+
-- which is necessary to close the file if it came from a lazily read file.
98+
--
99+
-- It's tempting to fall into trailer parsing as soon as LBS.head bs == '\0',
100+
-- because, if interpreted as an 'Entry', it means that 'entryTarPath' is an empty
101+
-- string. Yet it's not a concern of this function: parse it as an 'Entry'
102+
-- and let further pipeline such as 'checkEntrySecurity' deal with it. After all,
103+
-- it might be a format extension with unknown semantics. Such somewhat malformed
104+
-- archives do exist in the wild, see https://github.com/haskell/tar/issues/73.
105+
--
106+
-- Only if an entire block is null, we assume that we are parsing a trailer.
107+
if LBS.all (== 0) header then do
108+
nextBlock <- getN 512
109+
if LBS.length nextBlock < 512 then pure (Left ShortTrailer)
110+
else if LBS.all (== 0) nextBlock then do
111+
remainder <- getAll
112+
pure $ if LBS.all (== 0) remainder then Right Nothing else Left TrailingJunk
113+
else pure (Left BadTrailer)
114+
115+
else case parseHeader header of
116+
Left err -> pure $ Left err
117+
Right (name, mode, uid, gid, size, mtime, typecode, linkname, format, uname, gname, devmajor, devminor, prefix) -> do
118+
119+
-- It is crucial to get (size + padding) in one monadic operation
120+
-- and drop padding in a pure. If you get size bytes first,
121+
-- then skip padding, unpacking in constant memory will become impossible.
122+
let paddedSize = (size + 511) .&. complement 511
123+
paddedContent <- getN paddedSize
124+
let content = LBS.take size paddedContent
125+
126+
pure $ Right $ Just $ Entry {
127+
entryTarPath = TarPath name prefix,
128+
entryContent = case typecode of
129+
'\0' -> NormalFile content size
130+
'0' -> NormalFile content size
131+
'1' -> HardLink (LinkTarget linkname)
132+
'2' -> SymbolicLink (LinkTarget linkname)
133+
_ | format == V7Format
134+
-> OtherEntryType typecode content size
135+
'3' -> CharacterDevice devmajor devminor
136+
'4' -> BlockDevice devmajor devminor
137+
'5' -> Directory
138+
'6' -> NamedPipe
139+
'7' -> NormalFile content size
140+
_ -> OtherEntryType typecode content size,
141+
entryPermissions = mode,
142+
entryOwnership = Ownership (BS.Char8.unpack uname)
143+
(BS.Char8.unpack gname) uid gid,
144+
entryTime = mtime,
145+
entryFormat = format
146+
}
147+
148+
parseHeader
149+
:: LBS.ByteString
150+
-> Either FormatError (BS.ByteString, Permissions, Int, Int, Int64, EpochTime, Char, BS.ByteString, Format, BS.ByteString, BS.ByteString, DevMajor, DevMinor, BS.ByteString)
151+
parseHeader header' = do
152+
case (chksum_, format_ magic) of
95153
(Right chksum, _ ) | correctChecksum header chksum -> return ()
96154
(Right _, Right _) -> Left ChecksumIncorrect
97155
_ -> Left NotTarFormat
98156

99-
-- These fields are partial, have to check them
100-
format <- format_; mode <- mode_;
101-
uid <- uid_; gid <- gid_;
102-
size <- size_; mtime <- mtime_;
103-
devmajor <- devmajor_; devminor <- devminor_;
104-
105-
let content = LBS.take size (LBS.drop 512 bs)
106-
padding = (512 - size) `mod` 512
107-
bs' = LBS.drop (512 + size + padding) bs
108-
109-
entry = Entry {
110-
entryTarPath = TarPath name prefix,
111-
entryContent = case typecode of
112-
'\0' -> NormalFile content size
113-
'0' -> NormalFile content size
114-
'1' -> HardLink (LinkTarget linkname)
115-
'2' -> SymbolicLink (LinkTarget linkname)
116-
_ | format == V7Format
117-
-> OtherEntryType typecode content size
118-
'3' -> CharacterDevice devmajor devminor
119-
'4' -> BlockDevice devmajor devminor
120-
'5' -> Directory
121-
'6' -> NamedPipe
122-
'7' -> NormalFile content size
123-
_ -> OtherEntryType typecode content size,
124-
entryPermissions = mode,
125-
entryOwnership = Ownership (BS.Char8.unpack uname)
126-
(BS.Char8.unpack gname) uid gid,
127-
entryTime = mtime,
128-
entryFormat = format
129-
}
130-
131-
return (Just (entry, bs'))
157+
mode <- mode_
158+
uid <- uid_
159+
gid <- gid_
160+
size <- size_
161+
mtime <- mtime_
162+
format <- format_ magic
163+
devmajor <- devmajor_
164+
devminor <- devminor_
132165

166+
pure (name, mode, uid, gid, size, mtime, typecode, linkname, format, uname, gname, devmajor, devminor, prefix)
133167
where
134-
header = LBS.toStrict (LBS.take 512 bs)
135-
136-
name = getString 0 100 header
137-
mode_ = getOct 100 8 header
138-
uid_ = getOct 108 8 header
139-
gid_ = getOct 116 8 header
140-
size_ = getOct 124 12 header
141-
mtime_ = getOct 136 12 header
142-
chksum_ = getOct 148 8 header
143-
typecode = getByte 156 header
144-
linkname = getString 157 100 header
145-
magic = getChars 257 8 header
146-
uname = getString 265 32 header
147-
gname = getString 297 32 header
148-
devmajor_ = getOct 329 8 header
149-
devminor_ = getOct 337 8 header
150-
prefix = getString 345 155 header
151-
-- trailing = getBytes 500 12 header
152-
153-
format_
154-
| magic == ustarMagic = return UstarFormat
155-
| magic == gnuMagic = return GnuFormat
156-
| magic == v7Magic = return V7Format
157-
| otherwise = Left UnrecognisedTarFormat
168+
header = LBS.toStrict header'
169+
170+
name = getString 0 100 header
171+
mode_ = getOct 100 8 header
172+
uid_ = getOct 108 8 header
173+
gid_ = getOct 116 8 header
174+
size_ = getOct 124 12 header
175+
mtime_ = getOct 136 12 header
176+
chksum_ = getOct 148 8 header
177+
typecode = getByte 156 header
178+
linkname = getString 157 100 header
179+
magic = getChars 257 8 header
180+
uname = getString 265 32 header
181+
gname = getString 297 32 header
182+
devmajor_ = getOct 329 8 header
183+
devminor_ = getOct 337 8 header
184+
prefix = getString 345 155 header
185+
-- trailing = getBytes 500 12 header
186+
187+
format_ :: BS.ByteString -> Either FormatError Format
188+
format_ magic
189+
| magic == ustarMagic = return UstarFormat
190+
| magic == gnuMagic = return GnuFormat
191+
| magic == v7Magic = return V7Format
192+
| otherwise = Left UnrecognisedTarFormat
158193

159194
v7Magic, ustarMagic, gnuMagic :: BS.ByteString
160195
v7Magic = BS.Char8.pack "\0\0\0\0\0\0\0\0"

Codec/Archive/Tar/Types.hs

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ module Codec.Archive.Tar.Types (
6565
foldEntries,
6666
foldlEntries,
6767
unfoldEntries,
68+
unfoldEntriesM,
6869
) where
6970

7071
import Data.Int (Int64)
@@ -604,6 +605,21 @@ unfoldEntries f = unfold
604605
Right Nothing -> Done
605606
Right (Just (e, x')) -> Next e (unfold x')
606607

608+
unfoldEntriesM
609+
:: Monad m
610+
=> (forall a. m a -> m a)
611+
-- ^ id or unsafeInterleaveIO
612+
-> m (Either e (Maybe (GenEntry tarPath linkTarget)))
613+
-> m (GenEntries tarPath linkTarget e)
614+
unfoldEntriesM interleave f = unfold
615+
where
616+
unfold = do
617+
f' <- f
618+
case f' of
619+
Left err -> pure $ Fail err
620+
Right Nothing -> pure Done
621+
Right (Just e) -> Next e <$> interleave unfold
622+
607623
-- | This is like the standard 'foldr' function on lists, but for 'Entries'.
608624
-- Compared to 'foldr' it takes an extra function to account for the
609625
-- possibility of failure.

tar.cabal

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ library tar-internal
5656
directory >= 1.3.1 && < 1.4,
5757
filepath < 1.6,
5858
time < 1.13,
59+
transformers < 0.7,
5960

6061
exposed-modules:
6162
Codec.Archive.Tar

0 commit comments

Comments
 (0)