Skip to content

Commit 4792344

Browse files
committed
mismi-s3: Add test for chunkFilesBySize
1 parent c7c0183 commit 4792344

File tree

3 files changed

+71
-16
lines changed

3 files changed

+71
-16
lines changed

mismi-s3/mismi-s3.cabal

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ test-suite test
8989
, ambiata-p
9090
, ambiata-x-eithert
9191
, conduit
92+
, containers == 0.5.*
9293
, directory == 1.2.*
9394
, exceptions
9495
, filepath

mismi-s3/src/Mismi/S3/Commands.hs

Lines changed: 41 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ module Mismi.S3.Commands (
6666
, grantReadAccess
6767
, hoistUploadError
6868
, hoistDownloadError
69+
, chunkFilesBySize
6970
) where
7071

7172
import Control.Arrow ((***))
@@ -122,7 +123,8 @@ import System.IO.Error (IOError)
122123
import System.Directory (createDirectoryIfMissing, doesFileExist, getDirectoryContents)
123124
import System.FilePath (FilePath, (</>), takeDirectory)
124125
import System.Posix.IO (OpenMode(..), openFd, closeFd, fdSeek, defaultFileFlags)
125-
import System.Posix.Files (getFileStatus, isDirectory, isRegularFile)
126+
import System.Posix.Files (fileSize, getFileStatus, isDirectory, isRegularFile)
127+
import System.Posix.Types (FileOffset)
126128
import qualified "unix-bytestring" System.Posix.IO.ByteString as UBS
127129

128130
import System.Timeout.Lifted (timeout)
@@ -410,8 +412,7 @@ uploadWithMode m f a = do
410412
unlessM (liftIO $ doesFileExist f) . left $ UploadSourceMissing f
411413
s <- liftIO $ withFile f ReadMode $ \h ->
412414
hFileSize h
413-
let chunk = 100 * 1024 * 1024
414-
case s < chunk of
415+
case s < bigChunkSize of
415416
True ->
416417
lift $ uploadSingle f a
417418
False ->
@@ -422,9 +423,15 @@ uploadWithMode m f a = do
422423
-- better default.
423424
case s > 1024 * 1024 * 1024 of
424425
True ->
425-
multipartUpload f a s (2 * chunk) 20
426+
multipartUpload f a s (2 * bigChunkSize) 20
426427
False ->
427-
multipartUpload f a s chunk 20
428+
multipartUpload f a s bigChunkSize 20
429+
430+
431+
432+
bigChunkSize :: Integer
433+
bigChunkSize = 100 * 1024 * 1024
434+
428435

429436
uploadSingle :: FilePath -> Address -> AWS ()
430437
uploadSingle file a = do
@@ -480,16 +487,37 @@ uploadRecursiveWithMode m src (Address buck ky) = do
480487
case es of
481488
Left _ -> left $ UploadSourceMissing src
482489
Right st -> unless (isDirectory st) . left $ UploadSourceNotDirectory src
483-
files <- liftIO $ listRecursivelyLocal src
484-
let prefixLen = L.length (src </> "a") - 1
485-
outputAddrs = fmap (\fp -> Address buck (ky // Key (T.pack $ L.drop prefixLen fp))) files
490+
files <- fmap fst <$> liftIO (listRecursivelyLocal src)
491+
let outputAddrs = fmap (\fp -> Address buck (ky // Key (T.pack $ L.drop prefixLen fp))) files
486492
mapM_ (uncurry (uploadWithMode m)) $ L.zip files outputAddrs
493+
where
494+
prefixLen = L.length (src </> "a") - 1
495+
496+
-- uploadAddress :: FilePath -> Address
497+
-- uploadAddress fp = Address buck (ky // Key (T.pack $ L.drop prefixLen fp))
498+
499+
-- Take a list of files and their sizes, and return a list of list of files
500+
-- where the total size of of the files in the sub list is less than `bigChunkSize`
501+
-- and the length of the sub lists is <= `maxCount`.
502+
chunkFilesBySize :: Int -> Int64 -> [(FilePath, Int64)] -> [[FilePath]]
503+
chunkFilesBySize maxCount maxSize =
504+
takeFiles 0 [] . L.sortOn snd
505+
where
506+
takeFiles :: Int64 -> [FilePath] -> [(FilePath, Int64)] -> [[FilePath]]
507+
takeFiles _ acc [] = [acc]
508+
takeFiles current acc ((x, s):xs) =
509+
if current + s < maxSize && L.length acc < maxCount
510+
then takeFiles (current + s) (x:acc) xs
511+
else (x:acc) : takeFiles 0 [] xs
512+
513+
-- bigChunkSize
487514

488515
-- | Like `listRecursively` but for the local filesystem.
489-
listRecursivelyLocal :: MonadIO m => FilePath -> m [FilePath]
516+
-- Also returns
517+
listRecursivelyLocal :: MonadIO m => FilePath -> m [(FilePath, FileOffset)]
490518
listRecursivelyLocal topdir = do
491519
entries <- liftIO $ listDirectory topdir
492-
(dirs, files) <- liftIO . partitionDirsFiles $ fmap (topdir </>) entries
520+
(dirs, files) <- liftIO . partitionDirsFilesWithSizes $ fmap (topdir </>) entries
493521
others <- concatMapM listRecursivelyLocal dirs
494522
pure $ files <> others
495523

@@ -502,15 +530,15 @@ listDirectory path =
502530
f filename =
503531
filename /= "." && filename /= ".."
504532

505-
partitionDirsFiles :: MonadIO m => [FilePath] -> m ([FilePath], [FilePath])
506-
partitionDirsFiles =
533+
partitionDirsFilesWithSizes :: MonadIO m => [FilePath] -> m ([FilePath], [(FilePath, FileOffset)])
534+
partitionDirsFilesWithSizes =
507535
pworker ([], [])
508536
where
509537
pworker (dirs, files) [] = pure (dirs, files)
510538
pworker (dirs, files) (x:xs) = do
511539
xstat <- liftIO $ getFileStatus x
512540
pworker
513-
(if isDirectory xstat then x : dirs else dirs, if isRegularFile xstat then x : files else files)
541+
(if isDirectory xstat then x : dirs else dirs, if isRegularFile xstat then (x, fileSize xstat) : files else files)
514542
xs
515543

516544
write :: Address -> Text -> AWS WriteResult

mismi-s3/test/Test/Mismi/S3/Commands.hs

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,12 @@ module Test.Mismi.S3.Commands where
66

77
import Control.Lens ((.~))
88

9-
import Data.Time.Clock
9+
import qualified Data.List as DL
10+
import Data.Map.Strict (Map)
11+
import qualified Data.Map.Strict as DM
12+
import Data.Time.Clock (NominalDiffTime, addUTCTime, getCurrentTime)
1013

11-
import Disorder.Core.IO
14+
import Disorder.Core.IO (testIO)
1215

1316
import Mismi.S3.Commands
1417
import qualified Mismi.S3.Amazonka as A
@@ -17,9 +20,13 @@ import P
1720

1821
import System.IO
1922

20-
import Test.QuickCheck
23+
import Test.Mismi.S3.Core.Arbitrary
24+
25+
import Test.QuickCheck (Positive (..), Property, (===), quickCheckAll)
26+
import qualified Test.QuickCheck as QC
2127
import Test.QuickCheck.Instances ()
2228

29+
2330
prop_filter_old :: Positive NominalDiffTime -> Property
2431
prop_filter_old (Positive i) = testIO $ do
2532
n <- getCurrentTime
@@ -33,6 +40,25 @@ prop_filter_failure = testIO $ do
3340
let r = filterOld n $ A.multipartUpload & A.muInitiated .~ Just n
3441
pure $ r === False
3542

43+
prop_chunk_files_by_size :: Property
44+
prop_chunk_files_by_size =
45+
QC.forAll (QC.choose (2, 10)) $ \ maxFilesPerChunk ->
46+
QC.forAll (QC.choose (10, 100)) $ \ fileCount ->
47+
QC.forAll (QC.choose (1000, 10000)) $ \ maxChunkSize ->
48+
QC.forAll (fileNameSizePairs fileCount) $ \ pairs ->
49+
let chunks = chunkFilesBySize maxFilesPerChunk maxChunkSize pairs
50+
chunkSizes = DL.map (multiChunkSum (DM.fromList pairs)) chunks
51+
in
52+
DL.filter (> maxChunkSize) chunkSizes === []
53+
54+
where
55+
multiChunkSum :: Map FilePath Int64 -> [FilePath] -> Int64
56+
multiChunkSum _ [] = 0
57+
multiChunkSum _ [_] = 0 -- Don't care about size of single file chunk.
58+
multiChunkSum sizes xs =
59+
sum $ mapMaybe (\ x -> DM.lookup x sizes) xs
60+
61+
3662
return []
3763
tests :: IO Bool
3864
tests = $quickCheckAll

0 commit comments

Comments
 (0)