11package sync
22
33import (
4+ "crypto/md5"
5+ "encoding/hex"
46 "fmt"
57 "io"
68 "io/ioutil"
@@ -56,8 +58,11 @@ type DownloadTask struct {
5658 Uri string
5759 LocalPath string
5860 Uid string
61+ // uid key is common suffix between local path and remote uri
62+ UidKey string
5963}
6064
65+ // parse bucket and key out of remote object URI
6166func parseObjectUri (uri string ) (string , string , error ) {
6267 parts := strings .SplitN (uri , "//" , 2 )
6368 if len (parts ) != 2 {
@@ -73,6 +78,27 @@ func parseObjectUri(uri string) (string, string, error) {
7378 return pathParts [0 ], pathParts [1 ], nil
7479}
7580
// uidKeyFromLocalPath derives the uid cache key for a local file: the path of
// localPath expressed relative to localDir. It returns the error reported by
// filepath.Rel when localPath cannot be made relative to localDir.
func uidKeyFromLocalPath(localDir string, localPath string) (string, error) {
	key, err := filepath.Rel(localDir, localPath)
	return key, err
}
84+
// uidFromLocalPath computes the UID of a local file: the hex-encoded MD5 digest
// of the file content wrapped in double quotes, so that it can be compared
// verbatim against an AWS S3 ETag.
//
// It returns an error when the file cannot be opened or read.
//
// NOTE(review): S3 ETags of multipart uploads are not a plain MD5 of the
// content, so such objects will never match the value computed here — confirm
// that callers tolerate the resulting mismatch.
func uidFromLocalPath(localPath string) (string, error) {
	f, err := os.Open(localPath)
	if err != nil {
		return "", fmt.Errorf("Invalid file path for checksum calculation: %s, err: %s", localPath, err)
	}
	defer f.Close()

	// Stream the file through the hash so large files are not loaded into memory.
	h := md5.New()
	if _, err := io.Copy(h, f); err != nil {
		return "", fmt.Errorf("Failed to calculate checksum for file: %s, err: %s", localPath, err)
	}

	uid := hex.EncodeToString(h.Sum(nil))
	// AWS S3 ETag is a quoted hex string: no padding inside or outside the
	// quotes, otherwise the value can never equal a remote ETag.
	return fmt.Sprintf("\"%s\"", uid), nil
}
101+
76102func (self * Puller ) downloadHandler (task DownloadTask , downloader GenericDownloader ) {
77103 l := zap .S ()
78104
@@ -120,9 +146,19 @@ func (self *Puller) downloadHandler(task DownloadTask, downloader GenericDownloa
120146
121147 // update cache with new object ID
122148 self .uidLock .Lock ()
123- l .Debugw ("Updaing uid cache" , "key" , task .Uri , "val" , task .Uid )
124- self .uidCache [task .Uri ] = task .Uid
125- defer self .uidLock .Unlock ()
149+ l .Debugw ("Updaing uid cache" , "key" , task .UidKey , "val" , task .Uid )
150+ self .uidCache [task .UidKey ] = task .Uid
151+ self .uidLock .Unlock ()
152+ }
153+
154+ func (self * Puller ) isPathExcluded (path string ) bool {
155+ for _ , pattern := range self .exclude {
156+ matched , _ := doublestar .Match (pattern , path )
157+ if matched {
158+ return true
159+ }
160+ }
161+ return false
126162}
127163
128164func (self * Puller ) handlePageList (
@@ -153,16 +189,9 @@ func (self *Puller) handlePageList(
153189 continue
154190 }
155191 // ignore file that matches exclude rules
156- shouldSkip := false
157- for _ , pattern := range self .exclude {
158- matched , _ := doublestar .Match (pattern , relPath )
159- if matched {
160- l .Debugf ("skipped %s due to exclude pattern: %s" , uri , pattern )
161- shouldSkip = true
162- break
163- }
164- }
192+ shouldSkip := self .isPathExcluded (relPath )
165193 if shouldSkip {
194+ l .Debugf ("skipped %s due to exclude pattern" , uri )
166195 continue
167196 }
168197
@@ -178,10 +207,11 @@ func (self *Puller) handlePageList(
178207
179208 self .fileListedCnt += 1
180209
210+ uidKey := relPath
181211 self .uidLock .Lock ()
182- oldUid , ok := self .uidCache [uri ]
212+ oldUid , ok := self .uidCache [uidKey ]
183213 self .uidLock .Unlock ()
184- l .Debugf ("Comparing object UID: %s <> %s = %v " , oldUid , newUid , oldUid == newUid )
214+ l .Debugf ("Comparing object UID: %s <> %s" , oldUid , newUid )
185215 if ok && oldUid == newUid {
186216 // skip update if uid is the same
187217 continue
@@ -192,6 +222,7 @@ func (self *Puller) handlePageList(
192222 Uri : uri ,
193223 LocalPath : localPath ,
194224 Uid : newUid ,
225+ UidKey : uidKey ,
195226 }
196227 }
197228 return true
@@ -219,8 +250,10 @@ type Puller struct {
219250 filePulledCnt int
220251}
221252
222- func (self * Puller ) AddExcludePattern (pattern string ) {
223- self .exclude = append (self .exclude , pattern )
253+ func (self * Puller ) AddExcludePatterns (patterns []string ) {
254+ for _ , pattern := range patterns {
255+ self .exclude = append (self .exclude , pattern )
256+ }
224257}
225258
226259func (self * Puller ) Pull (remoteUri string , localDir string ) string {
@@ -321,6 +354,76 @@ func (self *Puller) Pull(remoteUri string, localDir string) string {
321354 }
322355}
323356
357+ func (self * Puller ) PopulateChecksum (localDir string ) {
358+ l := zap .S ()
359+
360+ setFileChecksum := func (path string ) {
361+ f , err := os .Open (path )
362+ if err != nil {
363+ l .Errorf ("Invalid file path for checksum calculation: %s, err: %s" , path , err )
364+ }
365+ defer f .Close ()
366+
367+ h := md5 .New ()
368+ if _ , err := io .Copy (h , f ); err != nil {
369+ l .Errorf ("Failed to calculate checksum for file: %s, err: %s" , path , err )
370+ }
371+
372+ uidKey , err := uidKeyFromLocalPath (localDir , path )
373+ if err != nil {
374+ l .Errorf ("Failed to calculate uidKey for file: %s under dir: %s, err: %s" , path , localDir , err )
375+ return
376+ }
377+
378+ uid , err := uidFromLocalPath (path )
379+ if err != nil {
380+ l .Errorf ("Failed to calculate UID: %s" , err )
381+ return
382+ }
383+
384+ self .uidLock .Lock ()
385+ self .uidCache [uidKey ] = uid
386+ self .uidLock .Unlock ()
387+ }
388+
389+ err := filepath .Walk (localDir , func (path string , info os.FileInfo , err error ) error {
390+ if err != nil {
391+ return err
392+ }
393+
394+ // ignore file that matches exclude rules
395+ shouldSkip := false
396+ relPath , err := filepath .Rel (localDir , path )
397+ if err != nil {
398+ l .Errorf ("Got invalid path from filepath.Walk: %s, err: %s" , path , err )
399+ shouldSkip = true
400+ } else {
401+ if info .IsDir () {
402+ // this is so that pattern `foo/**` also matches `foo`
403+ relPath += "/"
404+ }
405+ shouldSkip = self .isPathExcluded (relPath )
406+ }
407+
408+ if info .IsDir () {
409+ if shouldSkip {
410+ return filepath .SkipDir
411+ }
412+ } else {
413+ if shouldSkip {
414+ return nil
415+ }
416+
417+ setFileChecksum (path )
418+ }
419+ return nil
420+ })
421+
422+ if err != nil {
423+ l .Errorf ("Failed to walk directory for populating file checksum, err: %s" , err )
424+ }
425+ }
426+
324427func NewPuller () * Puller {
325428 return & Puller {
326429 workerCnt : 5 ,
0 commit comments