44
55using System . Collections . Concurrent ;
66using System . Diagnostics . CodeAnalysis ;
7+ using System . IO . Abstractions ;
78using System . Security . Cryptography ;
89using Amazon . S3 ;
910using Amazon . S3 . Model ;
1011using Microsoft . Extensions . Logging ;
1112
1213namespace Documentation . Assembler . Deploying ;
1314
14- public class AwsS3SyncPlanStrategy ( ILoggerFactory logFactory , IAmazonS3 s3Client , string bucketName , AssembleContext context ) : IDocsSyncPlanStrategy
15+ public interface IS3EtagCalculator
16+ {
17+ Task < string > CalculateS3ETag ( string filePath , Cancel ctx = default ) ;
18+ }
19+
20+ public class S3EtagCalculator ( ILoggerFactory logFactory , IFileSystem readFileSystem ) : IS3EtagCalculator
1521{
16- internal const long PartSize = 5 * 1024 * 1024 ; // 5MB
1722 private readonly ILogger < AwsS3SyncPlanStrategy > _logger = logFactory . CreateLogger < AwsS3SyncPlanStrategy > ( ) ;
23+
1824 private static readonly ConcurrentDictionary < string , string > EtagCache = new ( ) ;
1925
26+ internal const long PartSize = 5 * 1024 * 1024 ; // 5MB
27+
28+ [ SuppressMessage ( "Security" , "CA5351:Do Not Use Broken Cryptographic Algorithms" ) ]
29+ public async Task < string > CalculateS3ETag ( string filePath , Cancel ctx = default )
30+ {
31+ if ( EtagCache . TryGetValue ( filePath , out var cachedEtag ) )
32+ {
33+ _logger . LogDebug ( "Using cached ETag for {Path}" , filePath ) ;
34+ return cachedEtag ;
35+ }
36+
37+ var fileInfo = readFileSystem . FileInfo . New ( filePath ) ;
38+ var fileSize = fileInfo . Length ;
39+
40+ // For files under 5MB, use simple MD5 (matching TransferUtility behavior)
41+ if ( fileSize <= PartSize )
42+ {
43+ await using var stream = readFileSystem . FileStream . New ( filePath , FileMode . Open , FileAccess . Read , FileShare . Read ) ;
44+ var smallBuffer = new byte [ fileSize ] ;
45+ var bytesRead = await stream . ReadAsync ( smallBuffer . AsMemory ( 0 , ( int ) fileSize ) , ctx ) ;
46+ var hash = MD5 . HashData ( smallBuffer . AsSpan ( 0 , bytesRead ) ) ;
47+ var etag = Convert . ToHexStringLower ( hash ) ;
48+ EtagCache [ filePath ] = etag ;
49+ return etag ;
50+ }
51+
52+ // For files over 5MB, use multipart format with 5MB parts (matching TransferUtility)
53+ var parts = ( int ) Math . Ceiling ( ( double ) fileSize / PartSize ) ;
54+
55+ await using var fileStream = readFileSystem . FileStream . New ( filePath , FileMode . Open , FileAccess . Read , FileShare . Read ) ;
56+ var partBuffer = new byte [ PartSize ] ;
57+ var partHashes = new List < byte [ ] > ( ) ;
58+
59+ for ( var i = 0 ; i < parts ; i ++ )
60+ {
61+ var bytesRead = await fileStream . ReadAsync ( partBuffer . AsMemory ( 0 , partBuffer . Length ) , ctx ) ;
62+ var partHash = MD5 . HashData ( partBuffer . AsSpan ( 0 , bytesRead ) ) ;
63+ partHashes . Add ( partHash ) ;
64+ }
65+
66+ // Concatenate all part hashes
67+ var concatenatedHashes = partHashes . SelectMany ( h => h ) . ToArray ( ) ;
68+ var finalHash = MD5 . HashData ( concatenatedHashes ) ;
69+
70+ var multipartEtag = $ "{ Convert . ToHexStringLower ( finalHash ) } -{ parts } ";
71+ EtagCache [ filePath ] = multipartEtag ;
72+ return multipartEtag ;
73+ }
74+ }
75+
76+ public class AwsS3SyncPlanStrategy (
77+ ILoggerFactory logFactory ,
78+ IAmazonS3 s3Client ,
79+ string bucketName ,
80+ AssembleContext context ,
81+ IS3EtagCalculator ? calculator = null
82+ )
83+ : IDocsSyncPlanStrategy
84+ {
85+ private readonly ILogger < AwsS3SyncPlanStrategy > _logger = logFactory . CreateLogger < AwsS3SyncPlanStrategy > ( ) ;
86+
87+ private readonly IS3EtagCalculator _s3EtagCalculator = calculator ?? new S3EtagCalculator ( logFactory , context . ReadFileSystem ) ;
88+
2089 private bool IsSymlink ( string path )
2190 {
2291 var fileInfo = context . ReadFileSystem . FileInfo . New ( path ) ;
@@ -42,7 +111,7 @@ await Parallel.ForEachAsync(localObjects, ctx, async (localFile, token) =>
42111 if ( remoteObjects . TryGetValue ( destinationPath , out var remoteObject ) )
43112 {
44113 // Check if the ETag differs for updates
45- var localETag = await CalculateS3ETag ( localFile . FullName , token ) ;
114+ var localETag = await _s3EtagCalculator . CalculateS3ETag ( localFile . FullName , token ) ;
46115 var remoteETag = remoteObject . ETag . Trim ( '"' ) ; // Remove quotes from remote ETag
47116 if ( localETag == remoteETag )
48117 {
@@ -89,14 +158,44 @@ await Parallel.ForEachAsync(localObjects, ctx, async (localFile, token) =>
89158
90159 return new SyncPlan
91160 {
161+ TotalSourceFiles = localObjects . Length ,
92162 DeleteRequests = deleteRequests . ToList ( ) ,
93163 AddRequests = addRequests . ToList ( ) ,
94164 UpdateRequests = updateRequests . ToList ( ) ,
95165 SkipRequests = skipRequests . ToList ( ) ,
96- Count = deleteRequests . Count + addRequests . Count + updateRequests . Count + skipRequests . Count
166+ TotalSyncRequests = deleteRequests . Count + addRequests . Count + updateRequests . Count + skipRequests . Count
97167 } ;
98168 }
99169
170+ /// <inheritdoc />
171+ public PlanValidationResult Validate ( SyncPlan plan , float deleteThreshold )
172+ {
173+ if ( plan . TotalSourceFiles == 0 )
174+ {
175+ _logger . LogError ( "No files to sync" ) ;
176+ return new ( false , 1.0f , deleteThreshold ) ;
177+ }
178+
179+ var deleteRatio = ( float ) plan . DeleteRequests . Count / plan . TotalSyncRequests ;
180+ // if the total sync requests are less than 100, we enforce a higher ratio of 0.8
181+ // this allows newer assembled documentation to be in a higher state of flux
182+ if ( plan . TotalSyncRequests <= 100 )
183+ deleteThreshold = Math . Max ( deleteThreshold , 0.8f ) ;
184+
185+ // if the total sync requests are less than 1000, we enforce a higher ratio of 0.5
186+ // this allows newer assembled documentation to be in a higher state of flux
187+ else if ( plan . TotalSyncRequests <= 1000 )
188+ deleteThreshold = Math . Max ( deleteThreshold , 0.5f ) ;
189+
190+ if ( deleteRatio > deleteThreshold )
191+ {
192+ _logger . LogError ( "Delete ratio is {Ratio} which is greater than the threshold of {Threshold}" , deleteRatio , deleteThreshold ) ;
193+ return new ( false , deleteRatio , deleteThreshold ) ;
194+ }
195+
196+ return new ( true , deleteRatio , deleteThreshold ) ;
197+ }
198+
100199 private async Task < Dictionary < string , S3Object > > ListObjects ( Cancel ctx = default )
101200 {
102201 var listBucketRequest = new ListObjectsV2Request
@@ -117,51 +216,4 @@ private async Task<Dictionary<string, S3Object>> ListObjects(Cancel ctx = defaul
117216
118217 return objects . ToDictionary ( o => o . Key ) ;
119218 }
120-
121- [ SuppressMessage ( "Security" , "CA5351:Do Not Use Broken Cryptographic Algorithms" ) ]
122- private async Task < string > CalculateS3ETag ( string filePath , Cancel ctx = default )
123- {
124- if ( EtagCache . TryGetValue ( filePath , out var cachedEtag ) )
125- {
126- _logger . LogDebug ( "Using cached ETag for {Path}" , filePath ) ;
127- return cachedEtag ;
128- }
129-
130- var fileInfo = context . ReadFileSystem . FileInfo . New ( filePath ) ;
131- var fileSize = fileInfo . Length ;
132-
133- // For files under 5MB, use simple MD5 (matching TransferUtility behavior)
134- if ( fileSize <= PartSize )
135- {
136- await using var stream = context . ReadFileSystem . FileStream . New ( filePath , FileMode . Open , FileAccess . Read , FileShare . Read ) ;
137- var smallBuffer = new byte [ fileSize ] ;
138- var bytesRead = await stream . ReadAsync ( smallBuffer . AsMemory ( 0 , ( int ) fileSize ) , ctx ) ;
139- var hash = MD5 . HashData ( smallBuffer . AsSpan ( 0 , bytesRead ) ) ;
140- var etag = Convert . ToHexStringLower ( hash ) ;
141- EtagCache [ filePath ] = etag ;
142- return etag ;
143- }
144-
145- // For files over 5MB, use multipart format with 5MB parts (matching TransferUtility)
146- var parts = ( int ) Math . Ceiling ( ( double ) fileSize / PartSize ) ;
147-
148- await using var fileStream = context . ReadFileSystem . FileStream . New ( filePath , FileMode . Open , FileAccess . Read , FileShare . Read ) ;
149- var partBuffer = new byte [ PartSize ] ;
150- var partHashes = new List < byte [ ] > ( ) ;
151-
152- for ( var i = 0 ; i < parts ; i ++ )
153- {
154- var bytesRead = await fileStream . ReadAsync ( partBuffer . AsMemory ( 0 , partBuffer . Length ) , ctx ) ;
155- var partHash = MD5 . HashData ( partBuffer . AsSpan ( 0 , bytesRead ) ) ;
156- partHashes . Add ( partHash ) ;
157- }
158-
159- // Concatenate all part hashes
160- var concatenatedHashes = partHashes . SelectMany ( h => h ) . ToArray ( ) ;
161- var finalHash = MD5 . HashData ( concatenatedHashes ) ;
162-
163- var multipartEtag = $ "{ Convert . ToHexStringLower ( finalHash ) } -{ parts } ";
164- EtagCache [ filePath ] = multipartEtag ;
165- return multipartEtag ;
166- }
167219}
0 commit comments