diff --git a/src/tooling/docs-assembler/Cli/DeployCommands.cs b/src/tooling/docs-assembler/Cli/DeployCommands.cs
index 0f67a3baf..6a6df9c28 100644
--- a/src/tooling/docs-assembler/Cli/DeployCommands.cs
+++ b/src/tooling/docs-assembler/Cli/DeployCommands.cs
@@ -38,9 +38,15 @@ private void AssignOutputLogger()
	/// <param name="environment">The environment to build</param>
	/// <param name="s3BucketName">The S3 bucket name to deploy to</param>
	/// <param name="out">The file to write the plan to</param>
+	/// <param name="deleteThreshold">The percentage of deletions allowed in the plan as percentage of total files to sync</param>
	/// <param name="ctx"></param>
	public async Task<int> Plan(
-		string environment, string s3BucketName, string @out = "", Cancel ctx = default)
+		string environment,
+		string s3BucketName,
+		string @out = "",
+		float deleteThreshold = 0.2f,
+		Cancel ctx = default
+	)
	{
		AssignOutputLogger();
		await using var collector = new ConsoleDiagnosticsCollector(logFactory, githubActionsService)
@@ -52,11 +58,25 @@ public async Task<int> Plan(
		var s3Client = new AmazonS3Client();
		IDocsSyncPlanStrategy planner = new AwsS3SyncPlanStrategy(logFactory, s3Client, s3BucketName, assembleContext);
		var plan = await planner.Plan(ctx);
-		ConsoleApp.Log("Total files to sync: " + plan.Count);
+		ConsoleApp.Log("Total files to sync: " + plan.TotalSyncRequests);
		ConsoleApp.Log("Total files to delete: " + plan.DeleteRequests.Count);
		ConsoleApp.Log("Total files to add: " + plan.AddRequests.Count);
		ConsoleApp.Log("Total files to update: " + plan.UpdateRequests.Count);
		ConsoleApp.Log("Total files to skip: " + plan.SkipRequests.Count);
+		if (plan.TotalSyncRequests == 0)
+		{
+			collector.EmitError(@out, $"Plan has no files to sync so no plan will be written.");
+			await collector.StopAsync(ctx);
+			return collector.Errors;
+		}
+		var validationResult = planner.Validate(plan, deleteThreshold);
+		if (!validationResult.Valid)
+		{
+			collector.EmitError(@out, $"Plan is invalid, delete ratio: {validationResult.DeleteRatio}, threshold: {validationResult.DeleteThreshold} over {plan.TotalSyncRequests:N0} files while plan has {plan.DeleteRequests:N0} deletions");
+			await collector.StopAsync(ctx);
+			return collector.Errors;
+		}
+
		if (!string.IsNullOrEmpty(@out))
		{
			var output = SyncPlan.Serialize(plan);
@@ -91,7 +111,7 @@ public async Task<int> Apply(
		var transferUtility = new TransferUtility(s3Client, new TransferUtilityConfig
		{
			ConcurrentServiceRequests = Environment.ProcessorCount * 2,
-			MinSizeBeforePartUpload = AwsS3SyncPlanStrategy.PartSize
+			MinSizeBeforePartUpload = S3EtagCalculator.PartSize
		});
		IDocsSyncApplyStrategy applier = new AwsS3SyncApplyStrategy(logFactory, s3Client, transferUtility, s3BucketName, assembleContext, collector);
		if (!File.Exists(planFile))
diff --git a/src/tooling/docs-assembler/Deploying/AwsS3SyncPlanStrategy.cs b/src/tooling/docs-assembler/Deploying/AwsS3SyncPlanStrategy.cs
index b8ea406ea..18c38f65e 100644
--- a/src/tooling/docs-assembler/Deploying/AwsS3SyncPlanStrategy.cs
+++ b/src/tooling/docs-assembler/Deploying/AwsS3SyncPlanStrategy.cs
@@ -4,6 +4,7 @@
using System.Collections.Concurrent;
using System.Diagnostics.CodeAnalysis;
+using System.IO.Abstractions;
using System.Security.Cryptography;
using Amazon.S3;
using Amazon.S3.Model;
@@ -11,12 +12,80 @@ namespace Documentation.Assembler.Deploying;

-public class AwsS3SyncPlanStrategy(ILoggerFactory logFactory, IAmazonS3 s3Client, string bucketName, AssembleContext context) : IDocsSyncPlanStrategy
+public interface IS3EtagCalculator
+{
+	Task<string> CalculateS3ETag(string filePath, Cancel ctx = default);
+}
+
+public class S3EtagCalculator(ILoggerFactory logFactory, IFileSystem readFileSystem) : IS3EtagCalculator
{
-	internal const long PartSize = 5 * 1024 * 1024; // 5MB
	private readonly ILogger<S3EtagCalculator> _logger = logFactory.CreateLogger<S3EtagCalculator>();
+	private static readonly ConcurrentDictionary<string, string> EtagCache = new();
+	internal const long PartSize = 5 * 1024 * 1024; // 5MB
+
+	[SuppressMessage("Security", "CA5351:Do Not Use Broken Cryptographic Algorithms")]
+	public async Task<string> CalculateS3ETag(string filePath, Cancel ctx = default)
+	{
+		if (EtagCache.TryGetValue(filePath, out var cachedEtag))
+		{
+			_logger.LogDebug("Using cached ETag for {Path}", filePath);
+			return cachedEtag;
+		}
+
+		var fileInfo = readFileSystem.FileInfo.New(filePath);
+		var fileSize = fileInfo.Length;
+
+		// For files under 5MB, use simple MD5 (matching TransferUtility behavior)
+		if (fileSize <= PartSize)
+		{
+			await using var stream = readFileSystem.FileStream.New(filePath, FileMode.Open, FileAccess.Read, FileShare.Read);
+			var smallBuffer = new byte[fileSize];
+			var bytesRead = await stream.ReadAsync(smallBuffer.AsMemory(0, (int)fileSize), ctx);
+			var hash = MD5.HashData(smallBuffer.AsSpan(0, bytesRead));
+			var etag = Convert.ToHexStringLower(hash);
+			EtagCache[filePath] = etag;
+			return etag;
+		}
+
+		// For files over 5MB, use multipart format with 5MB parts (matching TransferUtility)
+		var parts = (int)Math.Ceiling((double)fileSize / PartSize);
+
+		await using var fileStream = readFileSystem.FileStream.New(filePath, FileMode.Open, FileAccess.Read, FileShare.Read);
+		var partBuffer = new byte[PartSize];
+		var partHashes = new List<byte[]>();
+
+		for (var i = 0; i < parts; i++)
+		{
+			var bytesRead = await fileStream.ReadAsync(partBuffer.AsMemory(0, partBuffer.Length), ctx);
+			var partHash = MD5.HashData(partBuffer.AsSpan(0, bytesRead));
+			partHashes.Add(partHash);
+		}
+
+		// Concatenate all part hashes
+		var concatenatedHashes = partHashes.SelectMany(h => h).ToArray();
+		var finalHash = MD5.HashData(concatenatedHashes);
+
+		var multipartEtag = $"{Convert.ToHexStringLower(finalHash)}-{parts}";
+		EtagCache[filePath] = multipartEtag;
+		return multipartEtag;
+	}
+}
+
+public class AwsS3SyncPlanStrategy(
+	ILoggerFactory logFactory,
+	IAmazonS3 s3Client,
+	string bucketName,
+	AssembleContext context,
+	IS3EtagCalculator? calculator = null
+)
+	: IDocsSyncPlanStrategy
+{
+	private readonly ILogger<AwsS3SyncPlanStrategy> _logger = logFactory.CreateLogger<AwsS3SyncPlanStrategy>();
+
+	private readonly IS3EtagCalculator _s3EtagCalculator = calculator ??
+		new S3EtagCalculator(logFactory, context.ReadFileSystem);
+
	private bool IsSymlink(string path)
	{
		var fileInfo = context.ReadFileSystem.FileInfo.New(path);
@@ -42,7 +111,7 @@ await Parallel.ForEachAsync(localObjects, ctx, async (localFile, token) =>
			if (remoteObjects.TryGetValue(destinationPath, out var remoteObject))
			{
				// Check if the ETag differs for updates
-				var localETag = await CalculateS3ETag(localFile.FullName, token);
+				var localETag = await _s3EtagCalculator.CalculateS3ETag(localFile.FullName, token);
				var remoteETag = remoteObject.ETag.Trim('"'); // Remove quotes from remote ETag
				if (localETag == remoteETag)
				{
@@ -89,14 +158,44 @@ await Parallel.ForEachAsync(localObjects, ctx, async (localFile, token) =>
		return new SyncPlan
		{
+			TotalSourceFiles = localObjects.Length,
			DeleteRequests = deleteRequests.ToList(),
			AddRequests = addRequests.ToList(),
			UpdateRequests = updateRequests.ToList(),
			SkipRequests = skipRequests.ToList(),
-			Count = deleteRequests.Count + addRequests.Count + updateRequests.Count + skipRequests.Count
+			TotalSyncRequests = deleteRequests.Count + addRequests.Count + updateRequests.Count + skipRequests.Count
		};
	}
+	/// <inheritdoc />
+	public PlanValidationResult Validate(SyncPlan plan, float deleteThreshold)
+	{
+		if (plan.TotalSourceFiles == 0)
+		{
+			_logger.LogError("No files to sync");
+			return new(false, 1.0f, deleteThreshold);
+		}
+
+		var deleteRatio = (float)plan.DeleteRequests.Count / plan.TotalSyncRequests;
+		// if the total sync requests are less than 100, we enforce a higher ratio of 0.8
+		// this allows newer assembled documentation to be in a higher state of flux
+		if (plan.TotalSyncRequests <= 100)
+			deleteThreshold = Math.Max(deleteThreshold, 0.8f);
+
+		// if the total sync requests are less than 1000, we enforce a higher ratio of 0.5
+		// this allows newer assembled documentation to be in a higher state of flux
+		else if (plan.TotalSyncRequests <= 1000)
+			deleteThreshold = Math.Max(deleteThreshold, 0.5f);
+
+		if (deleteRatio > deleteThreshold)
+		{
+			_logger.LogError("Delete ratio is {Ratio} which is greater than the threshold of {Threshold}", deleteRatio, deleteThreshold);
+			return new(false, deleteRatio, deleteThreshold);
+		}
+
+		return new(true, deleteRatio, deleteThreshold);
+	}
+
	private async Task<Dictionary<string, S3Object>> ListObjects(Cancel ctx = default)
	{
		var listBucketRequest = new ListObjectsV2Request
@@ -115,51 +214,4 @@ private async Task<Dictionary<string, S3Object>> ListObjects(Cancel ctx = defaul
		return objects.ToDictionary(o => o.Key);
	}
-
-	[SuppressMessage("Security", "CA5351:Do Not Use Broken Cryptographic Algorithms")]
-	private async Task<string> CalculateS3ETag(string filePath, Cancel ctx = default)
-	{
-		if (EtagCache.TryGetValue(filePath, out var cachedEtag))
-		{
-			_logger.LogDebug("Using cached ETag for {Path}", filePath);
-			return cachedEtag;
-		}
-
-		var fileInfo = context.ReadFileSystem.FileInfo.New(filePath);
-		var fileSize = fileInfo.Length;
-
-		// For files under 5MB, use simple MD5 (matching TransferUtility behavior)
-		if (fileSize <= PartSize)
-		{
-			await using var stream = context.ReadFileSystem.FileStream.New(filePath, FileMode.Open, FileAccess.Read, FileShare.Read);
-			var smallBuffer = new byte[fileSize];
-			var bytesRead = await stream.ReadAsync(smallBuffer.AsMemory(0, (int)fileSize), ctx);
-			var hash = MD5.HashData(smallBuffer.AsSpan(0, bytesRead));
-			var etag = Convert.ToHexStringLower(hash);
-			EtagCache[filePath] = etag;
-			return etag;
-		}
-
-		// For files over 5MB, use multipart format with 5MB parts (matching TransferUtility)
-		var parts = (int)Math.Ceiling((double)fileSize / PartSize);
-
-		await using var fileStream = context.ReadFileSystem.FileStream.New(filePath, FileMode.Open, FileAccess.Read, FileShare.Read);
-		var partBuffer = new byte[PartSize];
-		var partHashes = new List<byte[]>();
-
-		for (var i = 0; i < parts; i++)
-		{
-			var bytesRead = await fileStream.ReadAsync(partBuffer.AsMemory(0, partBuffer.Length), ctx);
-			var partHash = MD5.HashData(partBuffer.AsSpan(0, bytesRead));
-			partHashes.Add(partHash);
-		}
-
-		// Concatenate all part hashes
-		var concatenatedHashes = partHashes.SelectMany(h => h).ToArray();
-		var finalHash = MD5.HashData(concatenatedHashes);
-
-		var multipartEtag = $"{Convert.ToHexStringLower(finalHash)}-{parts}";
-		EtagCache[filePath] = multipartEtag;
-		return multipartEtag;
-	}
}
diff --git a/src/tooling/docs-assembler/Deploying/DocsSync.cs b/src/tooling/docs-assembler/Deploying/DocsSync.cs
index 1c44cb940..2ee8634bd 100644
--- a/src/tooling/docs-assembler/Deploying/DocsSync.cs
+++ b/src/tooling/docs-assembler/Deploying/DocsSync.cs
@@ -10,7 +10,10 @@ namespace Documentation.Assembler.Deploying;
public interface IDocsSyncPlanStrategy
{
	Task<SyncPlan> Plan(Cancel ctx = default);
+
+	PlanValidationResult Validate(SyncPlan plan, float deleteThreshold);
}
+public record PlanValidationResult(bool Valid, float DeleteRatio, float DeleteThreshold);

public interface IDocsSyncApplyStrategy
{
@@ -49,8 +52,11 @@ public record SkipRequest : SyncRequest
public record SyncPlan
{
-	[JsonPropertyName("count")]
-	public required int Count { get; init; }
+	[JsonPropertyName("total_source_files")]
+	public required int TotalSourceFiles { get; init; }
+
+	[JsonPropertyName("total_sync_requests")]
+	public required int TotalSyncRequests { get; init; }

	[JsonPropertyName("delete")]
	public required IReadOnlyList<DeleteRequest> DeleteRequests { get; init; }
diff --git a/tests/docs-assembler.Tests/src/docs-assembler.Tests/DocsSyncTests.cs b/tests/docs-assembler.Tests/src/docs-assembler.Tests/DocsSyncTests.cs
index 11c58b5c8..01f4c72fe 100644
--- a/tests/docs-assembler.Tests/src/docs-assembler.Tests/DocsSyncTests.cs
+++ b/tests/docs-assembler.Tests/src/docs-assembler.Tests/DocsSyncTests.cs
@@ -4,6 +4,7 @@
using System.IO.Abstractions.TestingHelpers;
using Amazon.S3;
+using Amazon.S3.Model;
using Amazon.S3.Transfer;
using Documentation.Assembler.Deploying;
using Elastic.Documentation.Configuration;
@@ -12,6 +13,7 @@
using FakeItEasy;
using FluentAssertions;
using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Logging.Abstractions;

namespace Documentation.Assembler.Tests;
@@ -39,21 +41,18 @@ public async Task TestPlan()
		var configurationContext = TestHelpers.CreateConfigurationContext(fileSystem);
		var config = AssemblyConfiguration.Create(configurationContext.ConfigurationFileProvider);
		var context = new AssembleContext(config, configurationContext, "dev", collector, fileSystem, fileSystem, null, Path.Combine(Paths.WorkingDirectoryRoot.FullName, ".artifacts", "assembly"));
-		A.CallTo(() => mockS3Client.ListObjectsV2Async(A<Amazon.S3.Model.ListObjectsV2Request>._, A<CancellationToken>._))
-			.Returns(new Amazon.S3.Model.ListObjectsV2Response
+		A.CallTo(() => mockS3Client.ListObjectsV2Async(A<ListObjectsV2Request>._, A<CancellationToken>._))
+			.Returns(new ListObjectsV2Response
			{
				S3Objects =
				[
-					new Amazon.S3.Model.S3Object
-					{
-						Key = "docs/delete.md",
-					},
-					new Amazon.S3.Model.S3Object
+					new S3Object { Key = "docs/delete.md" },
+					new S3Object
					{
						Key = "docs/skip.md",
						ETag = "\"69048c0964c9577a399b138b706a467a\""
					}, // This is the result of CalculateS3ETag
-					new Amazon.S3.Model.S3Object
+					new S3Object
					{
						Key = "docs/update.md",
						ETag = "\"existing-etag\""
					}
@@ -63,9 +62,13 @@ public async Task TestPlan()
		var planStrategy = new AwsS3SyncPlanStrategy(new LoggerFactory(), mockS3Client, "fake", context);

		// Act
-		var plan = await planStrategy.Plan(Cancel.None);
+		var plan = await planStrategy.Plan(ctx: Cancel.None);

		// Assert
+
+		plan.TotalSourceFiles.Should().Be(5);
+		plan.TotalSyncRequests.Should().Be(6); //including skip on server
+
		plan.AddRequests.Count.Should().Be(3);
		plan.AddRequests.Should().Contain(i => i.DestinationPath == "docs/add1.md");
		plan.AddRequests.Should().Contain(i => i.DestinationPath == "docs/add2.md");
@@ -81,6 +84,128 @@ public async Task TestPlan()
		plan.DeleteRequests.Should().Contain(i => i.DestinationPath == "docs/delete.md");
	}

+	[Theory]
+	[InlineData(0, 10_000, 10_000, 0, 10_000, 0.2, false)]
+	[InlineData(8_000, 10_000, 10_000, 0, 2000, 0.2, true)]
+	[InlineData(7900, 10_000, 10_000, 0, 2100, 0.2, false)]
+	[InlineData(10_000, 0, 10_000, 10_000, 0, 0.2, true)]
+	[InlineData(2000, 0, 2000, 2000, 0, 0.2, true)]
+	// When total files to sync is lower than 100 we enforce a minimum ratio of 0.8
+	[InlineData(20, 40, 40, 0, 20, 0.2, true)]
+	[InlineData(19, 100, 100, 0, 81, 0.2, false)]
+	// When total files to sync is lower than 1000 we enforce a minimum ratio of 0.5
+	[InlineData(200, 400, 400, 0, 200, 0.2, true)]
+	[InlineData(199, 1000, 1000, 0, 801, 0.2, false)]
+	public async Task ValidateAdditionsPlan(
+		int localFiles,
+		int remoteFiles,
+		int totalFilesToSync,
+		int totalFilesToAdd,
+		int totalFilesToRemove,
+		float deleteThreshold,
+		bool valid
+	)
+	{
+		var (planStrategy, plan) = await SetupS3SyncContextSetup(localFiles, remoteFiles);
+
+		// Assert
+
+		plan.TotalSourceFiles.Should().Be(localFiles);
+		plan.TotalSyncRequests.Should().Be(totalFilesToSync);
+
+		plan.AddRequests.Count.Should().Be(totalFilesToAdd);
+		plan.DeleteRequests.Count.Should().Be(totalFilesToRemove);
+
+		var validationResult = planStrategy.Validate(plan, deleteThreshold);
+		if (plan.TotalSyncRequests <= 100)
+			validationResult.DeleteThreshold.Should().Be(Math.Max(deleteThreshold, 0.8f));
+		else if (plan.TotalSyncRequests <= 1000)
+			validationResult.DeleteThreshold.Should().Be(Math.Max(deleteThreshold, 0.5f));
+
+		validationResult.Valid.Should().Be(valid, $"Delete ratio is {validationResult.DeleteRatio} when maximum is {validationResult.DeleteThreshold}");
+	}
+
+	[Theory]
+	[InlineData(10_000, 0, 10_000, 0, 0, 0.2, true)]
+	[InlineData(2000, 0, 2000, 0, 0, 0.2, true)]
+	[InlineData(0, 10_000, 10_000, 0, 10_000, 0.2, false)]
+	[InlineData(0, 10_000, 10_000, 0, 10_000, 1.0, false)]
+	[InlineData(20, 10_000, 10_000, 20, 9980, 0.2, false)]
+	[InlineData(20, 10_000, 10_000, 20, 9980, 1.0, true)]
+	[InlineData(8_000, 10_000, 10_000, 8000, 2000, 0.2, true)]
+	[InlineData(7900, 10_000, 10_000, 7900, 2100, 0.2, false)]
+	public async Task ValidateUpdatesPlan(
+		int localFiles,
+		int remoteFiles,
+		int totalFilesToSync,
+		int totalFilesToUpdate,
+		int totalFilesToRemove,
+		float deleteThreshold,
+		bool valid
+	)
+	{
+		var (planStrategy, plan) = await SetupS3SyncContextSetup(localFiles, remoteFiles, "different-etag");
+
+		// Assert
+
+		plan.TotalSourceFiles.Should().Be(localFiles);
+		plan.TotalSyncRequests.Should().Be(totalFilesToSync);
+
+		plan.UpdateRequests.Count.Should().Be(totalFilesToUpdate);
+		plan.DeleteRequests.Count.Should().Be(totalFilesToRemove);
+
+		var validationResult = planStrategy.Validate(plan, deleteThreshold);
+		if (plan.TotalSyncRequests <= 100)
+			validationResult.DeleteThreshold.Should().Be(Math.Max(deleteThreshold, 0.8f));
+		else if (plan.TotalSyncRequests <= 1000)
+			validationResult.DeleteThreshold.Should().Be(Math.Max(deleteThreshold, 0.5f));
+
+		validationResult.Valid.Should().Be(valid, $"Delete ratio is {validationResult.DeleteRatio} when maximum is {validationResult.DeleteThreshold}");
+	}
+
+	private static async Task<(AwsS3SyncPlanStrategy planStrategy, SyncPlan plan)> SetupS3SyncContextSetup(
+		int localFiles, int remoteFiles, string etag = "etag")
+	{
+		// Arrange
+		IReadOnlyCollection<IDiagnosticsOutput> diagnosticsOutputs = [];
+		var collector = new DiagnosticsCollector(diagnosticsOutputs);
+		var mockS3Client = A.Fake<IAmazonS3>();
+		var fileSystem = new MockFileSystem(new MockFileSystemOptions
+		{
+			CurrentDirectory = Path.Combine(Paths.WorkingDirectoryRoot.FullName, ".artifacts", "assembly")
+		});
+		foreach (var i in Enumerable.Range(0, localFiles))
+			fileSystem.AddFile($"docs/file-{i}.md", new MockFileData($"# Local Document {i}"));
+
+		var configurationContext = TestHelpers.CreateConfigurationContext(fileSystem);
+		var config = AssemblyConfiguration.Create(configurationContext.ConfigurationFileProvider);
+		var context = new AssembleContext(config, configurationContext, "dev", collector, fileSystem, fileSystem, null, Path.Combine(Paths.WorkingDirectoryRoot.FullName, ".artifacts", "assembly"));
+
+		var s3Objects = new List<S3Object>();
+		foreach (var i in Enumerable.Range(0, remoteFiles))
+		{
+			s3Objects.Add(new S3Object
+			{
+				Key = $"docs/file-{i}.md",
+				ETag = etag
+			});
+		}
+
+		A.CallTo(() => mockS3Client.ListObjectsV2Async(A<ListObjectsV2Request>._, A<CancellationToken>._))
+			.Returns(new ListObjectsV2Response
+			{
+				S3Objects = s3Objects
+			});
+
+		var mockEtagCalculator = A.Fake<IS3EtagCalculator>();
+		A.CallTo(() => mockEtagCalculator.CalculateS3ETag(A<string>._, A<Cancel>._)).Returns("etag");
+		var planStrategy = new AwsS3SyncPlanStrategy(new LoggerFactory(), mockS3Client, "fake", context, mockEtagCalculator);
+
+		// Act
+		var plan = await planStrategy.Plan(ctx: Cancel.None);
+		return (planStrategy, plan);
+	}
+
	[Fact]
	public async Task TestApply()
	{
@@ -102,10 +227,12 @@ public async Task TestApply()
		});
		var configurationContext = TestHelpers.CreateConfigurationContext(fileSystem);
		var config = AssemblyConfiguration.Create(configurationContext.ConfigurationFileProvider);
-		var context = new AssembleContext(config, configurationContext, "dev", collector, fileSystem, fileSystem, null, Path.Combine(Paths.WorkingDirectoryRoot.FullName, ".artifacts", "assembly"));
+		var checkoutDirectory = Path.Combine(Paths.WorkingDirectoryRoot.FullName, ".artifacts", "assembly");
+		var context = new AssembleContext(config, configurationContext, "dev", collector, fileSystem, fileSystem, null, checkoutDirectory);
		var plan = new SyncPlan
		{
-			Count = 6,
+			TotalSourceFiles = 5,
+			TotalSyncRequests = 6,
			AddRequests = [
				new AddRequest { LocalPath = "docs/add1.md", DestinationPath = "docs/add1.md" },
				new AddRequest { LocalPath = "docs/add2.md", DestinationPath = "docs/add2.md" },
@@ -124,8 +251,8 @@ public async Task TestApply()
				{ DestinationPath = "docs/delete.md" }
			]
		};
-		A.CallTo(() => moxS3Client.DeleteObjectsAsync(A<Amazon.S3.Model.DeleteObjectsRequest>._, A<CancellationToken>._))
-			.Returns(new Amazon.S3.Model.DeleteObjectsResponse
+		A.CallTo(() => moxS3Client.DeleteObjectsAsync(A<DeleteObjectsRequest>._, A<CancellationToken>._))
+			.Returns(new DeleteObjectsResponse
			{
				HttpStatusCode = System.Net.HttpStatusCode.OK
			});
@@ -144,7 +271,7 @@ public async Task TestApply()
		transferredFiles.Length.Should().Be(4); // 3 add requests + 1 update request
		transferredFiles.Should().NotContain("docs/skip.md");

-		A.CallTo(() => moxS3Client.DeleteObjectsAsync(A<Amazon.S3.Model.DeleteObjectsRequest>._, A<CancellationToken>._))
+		A.CallTo(() => moxS3Client.DeleteObjectsAsync(A<DeleteObjectsRequest>._, A<CancellationToken>._))
			.MustHaveHappenedOnceExactly();
		A.CallTo(() => moxTransferUtility.UploadDirectoryAsync(A<TransferUtilityUploadDirectoryRequest>._, A<CancellationToken>._))