Skip to content

Commit c0cb6eb

Browse files
Merge pull request #380 from ambiata/topic/better-recursive-upload
Topic/better recursive upload
2 parents c3c6997 + 437545f commit c0cb6eb

File tree

8 files changed

+123
-47
lines changed

8 files changed

+123
-47
lines changed

mismi-cli/main/s3.hs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ data UnitPrefix =
6868
deriving (Eq, Show)
6969

7070
data Command =
71-
Upload Recursive FilePath Address WriteMode
71+
Upload Recursive FilePath Address WriteMode Int
7272
| Download Recursive Address FilePath
7373
| Copy Address Address WriteMode
7474
| Concat [Address] Address WriteMode Int
@@ -126,10 +126,10 @@ run c = do
126126
let
127127
e' = configure (over serviceRetry (set retryAttempts 10 . set exponentBase 0.6) s3) e
128128
orDie O.renderError . O.runAWS e' $ case c of
129-
Upload NotRecursive s d m ->
129+
Upload NotRecursive s d m _ ->
130130
uploadWithModeOrFail m s d
131-
Upload Recursive s d m ->
132-
uploadRecursiveWithModeOrFail m s d
131+
Upload Recursive s d m f ->
132+
uploadRecursiveWithModeOrFail m s d f
133133
Download NotRecursive s d ->
134134
renderExit renderDownloadError . download s . optAppendFileName d $ key s
135135
Download Recursive s d ->
@@ -301,7 +301,7 @@ commandP' :: Force -> Parser Command
301301
commandP' f = XOA.subparser $
302302
command' "upload"
303303
"Upload a file to s3."
304-
(Upload <$> recursive' <*> filepath' <*> address' <*> writeMode' f)
304+
(Upload <$> recursive' <*> filepath' <*> address' <*> writeMode' f <*> fork')
305305
<> command' "download"
306306
"Download a file from s3."
307307
(Download <$> recursive' <*> address' <*> filepath')

mismi-s3-core/mismi-s3-core.cabal

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ test-suite test
4040
, ambiata-mismi-s3-core
4141
, ambiata-p
4242
, attoparsec
43+
, filepath >= 1.3 && < 1.5
4344
, text
4445
, QuickCheck >= 2.7 && < 2.10
4546
, quickcheck-instances == 0.3.*

mismi-s3-core/test/Test/Mismi/S3/Core/Arbitrary.hs

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,26 +3,29 @@
33
{-# OPTIONS_GHC -fno-warn-orphans #-}
44
module Test.Mismi.S3.Core.Arbitrary where
55

6-
import Data.Text as T
6+
import qualified Data.List as L
7+
import qualified Data.Text as T
78

8-
import Disorder.Corpus
9+
import Disorder.Corpus (simpsons, southpark)
910

1011
import Mismi.S3.Core.Data
1112

1213
import P
1314

14-
import Test.QuickCheck
15+
import Test.QuickCheck (Arbitrary (..), Gen)
16+
import qualified Test.QuickCheck as QC
1517
import Test.QuickCheck.Instances ()
1618

19+
import System.FilePath (FilePath)
1720

1821
instance Arbitrary WriteMode where
19-
arbitrary = elements [Fail, Overwrite]
22+
arbitrary = QC.elements [Fail, Overwrite]
2023

2124
instance Arbitrary Bucket where
22-
arbitrary = Bucket <$> elements southpark
25+
arbitrary = Bucket <$> QC.elements southpark
2326

2427
instance Arbitrary Address where
25-
arbitrary = frequency [
28+
arbitrary = QC.frequency [
2629
(9, Address <$> arbitrary <*> arbitrary)
2730
, (1, flip Address (Key "") <$> arbitrary)
2831
]
@@ -32,8 +35,17 @@ instance Arbitrary Key where
3235
-- Unfortunately unicode characters aren't supported in the Haskell AWS library
3336
-- https://github.com/ambiata/vee/issues/7
3437
arbitrary =
35-
let genPath = elements ["happy", "sad", ".", ":", "-"]
38+
let genPath = QC.elements ["happy", "sad", ".", ":", "-"]
3639
path = do
37-
sep <- elements ["-", "=", "#", ""]
38-
T.take 256 . T.intercalate "/" <$> listOf1 (T.intercalate sep <$> listOf1 genPath)
39-
in (Key . append "tests/") <$> path
40+
sep <- QC.elements ["-", "=", "#", ""]
41+
T.take 256 . T.intercalate "/" <$> QC.listOf1 (T.intercalate sep <$> QC.listOf1 genPath)
42+
in (Key . T.append "tests/") <$> path
43+
44+
45+
fileNameSizePairs :: Int -> Gen [(FilePath, Int64)]
46+
fileNameSizePairs len = do
47+
names <- QC.vectorOf len $ QC.elements simpsons
48+
lengths <- QC.vectorOf len $ QC.choose (1, 1000000000)
49+
pure $ L.zipWith3 zipper names [(0::Int) ..] lengths
50+
where
51+
zipper n i l = (n <> show i, l)

mismi-s3-core/test/mismi-s3-core-test.cabal

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ library
1111
, ambiata-mismi-s3-core
1212
, ambiata-p
1313
, attoparsec
14+
, filepath >= 1.3 && < 1.5
1415
, text
1516
, QuickCheck >= 2.7 && < 2.10
1617
, quickcheck-instances == 0.3.*

mismi-s3/mismi-s3.cabal

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ library
3535
, filepath >= 1.3 && < 1.5
3636
, http-client >= 0.4.18 && < 0.5
3737
, http-types == 0.8.*
38+
, lifted-async == 0.9.*
3839
, mtl >= 2.1 && < 2.3
3940
, process >= 1.2 && < 1.5
4041
, resourcet == 1.1.*
@@ -89,6 +90,7 @@ test-suite test
8990
, ambiata-p
9091
, ambiata-x-eithert
9192
, conduit
93+
, containers == 0.5.*
9294
, directory == 1.2.*
9395
, exceptions
9496
, filepath

mismi-s3/src/Mismi/S3/Commands.hs

Lines changed: 62 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -66,10 +66,11 @@ module Mismi.S3.Commands (
6666
, grantReadAccess
6767
, hoistUploadError
6868
, hoistDownloadError
69+
, chunkFilesBySize
6970
) where
7071

7172
import Control.Arrow ((***))
72-
73+
import Control.Concurrent.Async.Lifted (mapConcurrently_)
7374
import Control.Exception (ioError)
7475
import qualified Control.Exception as CE
7576
import Control.Lens ((.~), (^.), to, view)
@@ -122,7 +123,7 @@ import System.IO.Error (IOError)
122123
import System.Directory (createDirectoryIfMissing, doesFileExist, getDirectoryContents)
123124
import System.FilePath (FilePath, (</>), takeDirectory)
124125
import System.Posix.IO (OpenMode(..), openFd, closeFd, fdSeek, defaultFileFlags)
125-
import System.Posix.Files (getFileStatus, isDirectory, isRegularFile)
126+
import System.Posix.Files (fileSize, getFileStatus, isDirectory, isRegularFile)
126127
import qualified "unix-bytestring" System.Posix.IO.ByteString as UBS
127128

128129
import System.Timeout.Lifted (timeout)
@@ -370,13 +371,13 @@ upload :: FilePath -> Address -> EitherT UploadError AWS ()
370371
upload =
371372
uploadWithMode Fail
372373

373-
uploadRecursive :: FilePath -> Address -> EitherT UploadError AWS ()
374+
uploadRecursive :: FilePath -> Address -> Int -> EitherT UploadError AWS ()
374375
uploadRecursive =
375376
uploadRecursiveWithMode Fail
376377

377-
uploadRecursiveOrFail :: FilePath -> Address -> AWS ()
378-
uploadRecursiveOrFail f a =
379-
eitherT hoistUploadError pure $ uploadRecursive f a
378+
uploadRecursiveOrFail :: FilePath -> Address -> Int -> AWS ()
379+
uploadRecursiveOrFail f a i =
380+
eitherT hoistUploadError pure $ uploadRecursive f a i
380381

381382
uploadOrFail :: FilePath -> Address -> AWS ()
382383
uploadOrFail f a =
@@ -386,9 +387,9 @@ uploadWithModeOrFail :: WriteMode -> FilePath -> Address -> AWS ()
386387
uploadWithModeOrFail w f a =
387388
eitherT hoistUploadError pure $ uploadWithMode w f a
388389

389-
uploadRecursiveWithModeOrFail :: WriteMode -> FilePath -> Address -> AWS ()
390-
uploadRecursiveWithModeOrFail w f a =
391-
eitherT hoistUploadError pure $ uploadRecursiveWithMode w f a
390+
uploadRecursiveWithModeOrFail :: WriteMode -> FilePath -> Address -> Int -> AWS ()
391+
uploadRecursiveWithModeOrFail w f a i =
392+
eitherT hoistUploadError pure $ uploadRecursiveWithMode w f a i
392393

393394
hoistUploadError :: UploadError -> AWS ()
394395
hoistUploadError e =
@@ -410,8 +411,7 @@ uploadWithMode m f a = do
410411
unlessM (liftIO $ doesFileExist f) . left $ UploadSourceMissing f
411412
s <- liftIO $ withFile f ReadMode $ \h ->
412413
hFileSize h
413-
let chunk = 100 * 1024 * 1024
414-
case s < chunk of
414+
case s < bigChunkSize of
415415
True ->
416416
lift $ uploadSingle f a
417417
False ->
@@ -422,21 +422,27 @@ uploadWithMode m f a = do
422422
-- better default.
423423
case s > 1024 * 1024 * 1024 of
424424
True ->
425-
multipartUpload f a s (2 * chunk) 20
425+
multipartUpload f a s (2 * bigChunkSize) 20
426426
False ->
427-
multipartUpload f a s chunk 20
427+
multipartUpload f a s bigChunkSize 20
428+
429+
430+
431+
bigChunkSize :: Integer
432+
bigChunkSize = 100 * 1024 * 1024
433+
428434

429435
uploadSingle :: FilePath -> Address -> AWS ()
430436
uploadSingle file a = do
431437
rq <- N.chunkedFile (ChunkSize $ 1024 * 1024) file
432438
void . send $ f' A.putObject a rq & A.poServerSideEncryption .~ pure sse
433439

434440
multipartUpload :: FilePath -> Address -> Integer -> Integer -> Int -> EitherT UploadError AWS ()
435-
multipartUpload file a fileSize chunk fork = do
441+
multipartUpload file a fSize chunk fork = do
436442
e <- ask
437443
mpu <- lift $ createMultipartUpload a
438444

439-
let chunks = calculateChunksCapped (fromInteger fileSize) (fromInteger chunk) 4096 -- max 4096 prts returned
445+
let chunks = calculateChunksCapped (fromInteger fSize) (fromInteger chunk) 4096 -- max 4096 prts returned
440446

441447
r <- liftIO $
442448
consume (forM_ chunks . writeQueue) fork $ multipartUploadWorker e mpu file a
@@ -474,22 +480,49 @@ multipartUploadWorker e mpu file a (o, c, i) =
474480
pure $! Right $! PartResponse i m
475481

476482

477-
uploadRecursiveWithMode :: WriteMode -> FilePath -> Address -> EitherT UploadError AWS ()
478-
uploadRecursiveWithMode m src (Address buck ky) = do
483+
uploadRecursiveWithMode :: WriteMode -> FilePath -> Address -> Int -> EitherT UploadError AWS ()
484+
uploadRecursiveWithMode mode src (Address buck ky) fork = do
479485
es <- tryIO $ getFileStatus src
480486
case es of
481487
Left _ -> left $ UploadSourceMissing src
482488
Right st -> unless (isDirectory st) . left $ UploadSourceNotDirectory src
483-
files <- liftIO $ listRecursivelyLocal src
484-
let prefixLen = L.length (src </> "a") - 1
485-
outputAddrs = fmap (\fp -> Address buck (ky // Key (T.pack $ L.drop prefixLen fp))) files
486-
mapM_ (uncurry (uploadWithMode m)) $ L.zip files outputAddrs
489+
files <- liftIO (listRecursivelyLocal src)
490+
mapM_ uploadFiles $ chunkFilesBySize fork (fromIntegral bigChunkSize) files
491+
where
492+
uploadFiles :: [(FilePath, Int64)] -> EitherT UploadError AWS ()
493+
uploadFiles [] = pure ()
494+
uploadFiles [(f,s)]
495+
| fromIntegral s < bigChunkSize = lift . uploadSingle f $ uploadAddress f
496+
| otherwise = uploadWithMode mode f $ uploadAddress f
497+
uploadFiles xs =
498+
mapConcurrently_ (\ (f, _) -> lift . uploadSingle f $ uploadAddress f) xs
499+
500+
501+
prefixLen = L.length (src </> "a") - 1
502+
503+
uploadAddress :: FilePath -> Address
504+
uploadAddress fp = Address buck (ky // Key (T.pack $ L.drop prefixLen fp))
505+
506+
-- Take a list of files and their sizes, and convert it to a list of tests
507+
-- where the total size of of the files in the sub list is less than `maxSize`
508+
-- and the length of the sub lists is <= `maxCount`.
509+
chunkFilesBySize :: Int -> Int64 -> [(FilePath, Int64)] -> [[(FilePath, Int64)]]
510+
chunkFilesBySize maxCount maxSize =
511+
takeFiles 0 [] . L.sortOn snd
512+
where
513+
takeFiles :: Int64 -> [(FilePath, Int64)] -> [(FilePath, Int64)] -> [[(FilePath, Int64)]]
514+
takeFiles _ acc [] = [acc]
515+
takeFiles current acc ((x, s):xs) =
516+
if current + s < maxSize && L.length acc < maxCount
517+
then takeFiles (current + s) ((x, s):acc) xs
518+
else ((x, s):acc) : takeFiles 0 [] xs
487519

488520
-- | Like `listRecursively` but for the local filesystem.
489-
listRecursivelyLocal :: MonadIO m => FilePath -> m [FilePath]
521+
-- Also returns
522+
listRecursivelyLocal :: MonadIO m => FilePath -> m [(FilePath, Int64)]
490523
listRecursivelyLocal topdir = do
491524
entries <- liftIO $ listDirectory topdir
492-
(dirs, files) <- liftIO . partitionDirsFiles $ fmap (topdir </>) entries
525+
(dirs, files) <- liftIO . partitionDirsFilesWithSizes $ fmap (topdir </>) entries
493526
others <- concatMapM listRecursivelyLocal dirs
494527
pure $ files <> others
495528

@@ -502,16 +535,17 @@ listDirectory path =
502535
f filename =
503536
filename /= "." && filename /= ".."
504537

505-
partitionDirsFiles :: MonadIO m => [FilePath] -> m ([FilePath], [FilePath])
506-
partitionDirsFiles =
538+
partitionDirsFilesWithSizes :: MonadIO m => [FilePath] -> m ([FilePath], [(FilePath, Int64)])
539+
partitionDirsFilesWithSizes =
507540
pworker ([], [])
508541
where
509542
pworker (dirs, files) [] = pure (dirs, files)
510543
pworker (dirs, files) (x:xs) = do
511544
xstat <- liftIO $ getFileStatus x
512-
pworker
513-
(if isDirectory xstat then x : dirs else dirs, if isRegularFile xstat then x : files else files)
514-
xs
545+
let xsize = fromIntegral $ fileSize xstat
546+
newDirs = if isDirectory xstat then x : dirs else dirs
547+
newFiles = if isRegularFile xstat then (x, xsize) : files else files
548+
pworker (newDirs, newFiles) xs
515549

516550
write :: Address -> Text -> AWS WriteResult
517551
write =

mismi-s3/test/Test/IO/Mismi/S3/Commands.hs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -532,7 +532,7 @@ prop_upload_recursive = once . testAWS $ do
532532

533533
addr <- withKey (// Key "top") <$> newAddress
534534

535-
eitherT (fail . show) pure $ uploadRecursive tmpdir addr
535+
eitherT (fail . show) pure $ uploadRecursive tmpdir addr 2
536536

537537
a <- read (withKey (// Key "a") addr)
538538
c <- read (withKey (// Key "b/c") addr)

mismi-s3/test/Test/Mismi/S3/Commands.hs

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,12 @@ module Test.Mismi.S3.Commands where
66

77
import Control.Lens ((.~))
88

9-
import Data.Time.Clock
9+
import qualified Data.List as DL
10+
import Data.Map.Strict (Map)
11+
import qualified Data.Map.Strict as DM
12+
import Data.Time.Clock (NominalDiffTime, addUTCTime, getCurrentTime)
1013

11-
import Disorder.Core.IO
14+
import Disorder.Core.IO (testIO)
1215

1316
import Mismi.S3.Commands
1417
import qualified Mismi.S3.Amazonka as A
@@ -17,9 +20,13 @@ import P
1720

1821
import System.IO
1922

20-
import Test.QuickCheck
23+
import Test.Mismi.S3.Core.Arbitrary
24+
25+
import Test.QuickCheck (Positive (..), Property, (===), quickCheckAll)
26+
import qualified Test.QuickCheck as QC
2127
import Test.QuickCheck.Instances ()
2228

29+
2330
prop_filter_old :: Positive NominalDiffTime -> Property
2431
prop_filter_old (Positive i) = testIO $ do
2532
n <- getCurrentTime
@@ -33,6 +40,25 @@ prop_filter_failure = testIO $ do
3340
let r = filterOld n $ A.multipartUpload & A.muInitiated .~ Just n
3441
pure $ r === False
3542

43+
prop_chunk_files_by_size :: Property
44+
prop_chunk_files_by_size =
45+
QC.forAll (QC.choose (2, 10)) $ \ maxFilesPerChunk ->
46+
QC.forAll (QC.choose (10, 100)) $ \ fileCount ->
47+
QC.forAll (QC.choose (1000, 10000)) $ \ maxChunkSize ->
48+
QC.forAll (fileNameSizePairs fileCount) $ \ pairs ->
49+
let chunks = chunkFilesBySize maxFilesPerChunk maxChunkSize pairs
50+
chunkSizes = DL.map (multiChunkSum (DM.fromList pairs) . DL.map fst) chunks
51+
in
52+
DL.filter (> maxChunkSize) chunkSizes === []
53+
54+
where
55+
multiChunkSum :: Map FilePath Int64 -> [FilePath] -> Int64
56+
multiChunkSum _ [] = 0
57+
multiChunkSum _ [_] = 0 -- Don't care about size of single file chunk.
58+
multiChunkSum sizes xs =
59+
sum $ mapMaybe (\ x -> DM.lookup x sizes) xs
60+
61+
3662
return []
3763
tests :: IO Bool
3864
tests = $quickCheckAll

0 commit comments

Comments
 (0)