Skip to content

Commit 57777c9

Browse files
committed
feat(scanning): enhance repository synchronization with rename detection and deletion
- Add automatic repository rename detection and updates when repositories are renamed in GitHub
- Add automatic repository deletion when repositories no longer exist in the GitHub organization
- Add cascading deletion of related records (PolicyViolations and ActionLogs) before removing repositories
- Add comprehensive logging for repository operations (add, rename, remove) with detailed information
- Refactor SyncRepositoriesAsync to use dictionary lookups for improved performance
- Add unit tests for repository deletion and rename scenarios
- Update documentation to reflect repository synchronization capabilities

This ensures the database stays in sync with the GitHub organization state without manual intervention.
1 parent 9daee60 commit 57777c9

File tree

5 files changed

+279
-12
lines changed

5 files changed

+279
-12
lines changed

10xGitHubPolicies.App/Services/Scanning/ScanningService.cs

Lines changed: 57 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -118,24 +118,75 @@ private async Task<Dictionary<string, Policy>> SyncPoliciesAsync(IEnumerable<Pol
118118

119119
/// <summary>
/// Brings the Repositories table in line with the supplied GitHub repository list:
/// adds rows for GitHub IDs not yet stored, updates the stored name when it differs
/// from the GitHub full name, and removes rows whose GitHub ID is absent from the list
/// (together with the PolicyViolations and ActionsLogs rows that reference them).
/// All changes are persisted by the single SaveChangesAsync call at the end.
/// </summary>
/// <param name="repositories">Repositories currently reported by GitHub; matched to database rows by <c>Id</c>.</param>
// NOTE: this method is shown as a diff. Statements preceded by an "NNN-" gutter marker are the
// pre-commit lines removed by this commit; statements preceded by "NNN+" are the lines it adds.
private async Task SyncRepositoriesAsync(IReadOnlyList<Octokit.Repository> repositories)
120120
{
121-
var githubRepoIds = repositories.Select(r => r.Id).ToList();
122-
var reposInDb = await _dbContext.Repositories
123-
.Where(r => githubRepoIds.Contains(r.GitHubRepositoryId))
124-
.ToListAsync();
125-
var reposInDbIds = reposInDb.Select(r => r.GitHubRepositoryId).ToHashSet();
121+
var githubRepoIds = repositories.Select(r => r.Id).ToHashSet();
126122

123+
// Load every repository row (not only those matching the incoming IDs) so that rows
// missing from GitHub can be detected below, then index the rows by GitHub repository ID.
124+
var allReposInDb = await _dbContext.Repositories.ToListAsync();
// NOTE(review): ToDictionary throws ArgumentException on duplicate keys, so this assumes
// GitHubRepositoryId is unique per row — confirm a unique constraint exists.
125+
var reposInDbMap = allReposInDb.ToDictionary(r => r.GitHubRepositoryId);
126+
127+
// Add repositories that are not tracked yet; for tracked ones, refresh the stored name
127128
foreach (var repo in repositories)
128129
{
129-
if (!reposInDbIds.Contains(repo.Id))
// NOTE(review): ContainsKey here plus the indexer in the else branch is a double lookup;
// TryGetValue would do both in one step.
130+
if (!reposInDbMap.ContainsKey(repo.Id))
130131
{
131132
_dbContext.Repositories.Add(new Repository
132133
{
133134
GitHubRepositoryId = repo.Id,
134135
Name = repo.FullName,
135136
ComplianceStatus = "Pending"
136137
});
138+
_logger.LogInformation("Added new repository: {RepoName} (GitHub ID: {GitHubRepoId})", repo.FullName, repo.Id);
139+
}
140+
else
141+
{
142+
// Already tracked: overwrite the stored name only when GitHub reports a different FullName (e.g., repo was renamed)
143+
var existingRepo = reposInDbMap[repo.Id];
144+
if (existingRepo.Name != repo.FullName)
145+
{
146+
_logger.LogInformation("Repository renamed: {OldName} -> {NewName} (GitHub ID: {GitHubRepoId})", existingRepo.Name, repo.FullName, repo.Id);
147+
existingRepo.Name = repo.FullName;
148+
}
149+
}
150+
}
151+
152+
// Rows whose GitHub ID is absent from the incoming list are treated as deleted (exist in DB but not in GitHub).
// NOTE(review): an empty `repositories` list selects every row for removal — confirm callers
// never pass an empty list after a failed or partial fetch.
153+
var reposToRemove = allReposInDb
154+
.Where(r => !githubRepoIds.Contains(r.GitHubRepositoryId))
155+
.ToList();
156+
157+
if (reposToRemove.Any())
158+
{
159+
_logger.LogInformation(
160+
"Removing {Count} repositories that no longer exist in GitHub: {RepoNames}",
161+
reposToRemove.Count,
162+
string.Join(", ", reposToRemove.Select(r => r.Name)));
163+
164+
// Remove dependent rows first (PolicyViolations and ActionLogs). They are matched on the
// internal RepositoryId, not the GitHub ID. Presumably required by foreign keys — TODO confirm.
165+
var repoIdsToRemove = reposToRemove.Select(r => r.RepositoryId).ToList();
166+
167+
var violationsToRemove = await _dbContext.PolicyViolations
168+
.Where(v => repoIdsToRemove.Contains(v.RepositoryId))
169+
.ToListAsync();
170+
171+
var actionLogsToRemove = await _dbContext.ActionsLogs
172+
.Where(a => repoIdsToRemove.Contains(a.RepositoryId))
173+
.ToListAsync();
174+
175+
if (violationsToRemove.Any())
176+
{
177+
_logger.LogInformation("Removing {Count} policy violations for deleted repositories", violationsToRemove.Count);
178+
_dbContext.PolicyViolations.RemoveRange(violationsToRemove);
137179
}
180+
181+
if (actionLogsToRemove.Any())
182+
{
183+
_logger.LogInformation("Removing {Count} action logs for deleted repositories", actionLogsToRemove.Count);
184+
_dbContext.ActionsLogs.RemoveRange(actionLogsToRemove);
185+
}
186+
187+
_dbContext.Repositories.RemoveRange(reposToRemove);
138188
}
189+
// Persist the pending additions, renames and removals with one SaveChangesAsync call
139190
await _dbContext.SaveChangesAsync();
140191
}
141192
}

10xGitHubPolicies.Tests/Services/Scanning/ScanningServiceTests.cs

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -445,6 +445,168 @@ public async Task PerformScanAsync_WhenRepositoriesExist_SkipsExisting()
445445
repositories.Should().Contain(r => r.RepositoryId == existingRepo2.RepositoryId);
446446
}
447447

448+
/// <summary>
/// Seeds three repositories (GitHub IDs 100, 200, 300) with policy violations and action logs
/// for the latter two, has the GitHub service return only ID 100, and verifies that
/// PerformScanAsync leaves only that repository and no violations or action logs that
/// reference the two missing repositories.
/// </summary>
[Fact]
449+
[Trait("Category", "Unit")]
450+
[Trait("Feature", "Scanning")]
451+
public async Task PerformScanAsync_WhenRepositoriesDeletedInGitHub_RemovesFromDatabase()
452+
{
453+
// Arrange
454+
var existingRepo1 = new Repository
455+
{
456+
GitHubRepositoryId = 100,
457+
Name = "test-org/existing-repo-1",
458+
ComplianceStatus = "Compliant"
459+
};
460+
var deletedRepo = new Repository
461+
{
462+
GitHubRepositoryId = 200,
463+
Name = "test-org/deleted-repo",
464+
ComplianceStatus = "NonCompliant"
465+
};
466+
var archivedRepo = new Repository
467+
{
468+
GitHubRepositoryId = 300,
469+
Name = "test-org/archived-repo",
470+
ComplianceStatus = "Pending"
471+
};
472+
_dbContext.Repositories.AddRange(existingRepo1, deletedRepo, archivedRepo);
// Saved before the related rows are built; presumably this populates the generated
// RepositoryId values used as foreign keys below — TODO confirm.
473+
await _dbContext.SaveChangesAsync();
474+
475+
// Create a policy, a scan, and the violations and action logs that reference deletedRepo and archivedRepo
476+
var policy = new Policy
477+
{
478+
PolicyKey = "has_agents_md",
479+
Description = "Test policy",
480+
Action = "create-issue"
481+
};
482+
_dbContext.Policies.Add(policy);
483+
await _dbContext.SaveChangesAsync();
484+
485+
var scan = new Scan
486+
{
487+
Status = "Completed",
488+
StartedAt = DateTime.UtcNow.AddHours(-1),
489+
CompletedAt = DateTime.UtcNow.AddHours(-1)
490+
};
491+
_dbContext.Scans.Add(scan);
492+
await _dbContext.SaveChangesAsync();
493+
494+
var violationForDeletedRepo = new PolicyViolation
495+
{
496+
ScanId = scan.ScanId,
497+
RepositoryId = deletedRepo.RepositoryId,
498+
PolicyId = policy.PolicyId,
499+
PolicyType = "has_agents_md"
500+
};
501+
var violationForArchivedRepo = new PolicyViolation
502+
{
503+
ScanId = scan.ScanId,
504+
RepositoryId = archivedRepo.RepositoryId,
505+
PolicyId = policy.PolicyId,
506+
PolicyType = "has_agents_md"
507+
};
508+
_dbContext.PolicyViolations.AddRange(violationForDeletedRepo, violationForArchivedRepo);
509+
510+
var actionLogForDeletedRepo = new ActionLog
511+
{
512+
RepositoryId = deletedRepo.RepositoryId,
513+
PolicyId = policy.PolicyId,
514+
ActionType = "create-issue",
515+
Timestamp = DateTime.UtcNow,
516+
Status = "Completed",
517+
Details = "Test action"
518+
};
519+
var actionLogForArchivedRepo = new ActionLog
520+
{
521+
RepositoryId = archivedRepo.RepositoryId,
522+
PolicyId = policy.PolicyId,
523+
ActionType = "log-only",
524+
Timestamp = DateTime.UtcNow,
525+
Status = "Completed",
526+
Details = "Test action"
527+
};
528+
_dbContext.ActionsLogs.AddRange(actionLogForDeletedRepo, actionLogForArchivedRepo);
529+
await _dbContext.SaveChangesAsync();
530+
531+
var config = CreateTestConfig(
532+
new PolicyConfig { Type = "has_agents_md", Action = "create-issue" }
533+
);
534+
535+
// Only existingRepo1 (GitHub ID 100) is returned by GitHub now; deletedRepo (200) and
// archivedRepo (300) are simply absent from the returned list.
536+
var repos = new List<Octokit.Repository>
537+
{
538+
CreateTestRepository(100, "existing-repo-1")
539+
};
540+
541+
_configurationService.GetConfigAsync(Arg.Any<bool>()).Returns(config);
542+
_githubService.GetOrganizationRepositoriesAsync().Returns(repos);
// Policy evaluation is stubbed to report no violations, so the scan itself adds none
543+
_policyEvaluationService.EvaluateRepositoryAsync(
544+
Arg.Any<Octokit.Repository>(),
545+
Arg.Any<IEnumerable<PolicyConfig>>()
546+
).Returns(Task.FromResult<IEnumerable<PolicyViolation>>(new List<PolicyViolation>()));
547+
548+
// Act
549+
await _sut.PerformScanAsync();
550+
551+
// Assert
552+
var repositories = await _dbContext.Repositories.ToListAsync();
553+
repositories.Should().HaveCount(1, because: "only existing repository should remain");
554+
repositories.Should().Contain(r => r.RepositoryId == existingRepo1.RepositoryId);
555+
repositories.Should().NotContain(r => r.RepositoryId == deletedRepo.RepositoryId);
556+
repositories.Should().NotContain(r => r.RepositoryId == archivedRepo.RepositoryId);
557+
558+
// Verify related violations were removed
559+
var violations = await _dbContext.PolicyViolations.ToListAsync();
560+
violations.Should().NotContain(v => v.RepositoryId == deletedRepo.RepositoryId);
561+
violations.Should().NotContain(v => v.RepositoryId == archivedRepo.RepositoryId);
562+
563+
// Verify related action logs were removed
564+
var actionLogs = await _dbContext.ActionsLogs.ToListAsync();
565+
actionLogs.Should().NotContain(a => a.RepositoryId == deletedRepo.RepositoryId);
566+
actionLogs.Should().NotContain(a => a.RepositoryId == archivedRepo.RepositoryId);
567+
}
568+
569+
/// <summary>
/// Seeds one repository (GitHub ID 100) under an old name, has the GitHub service return the
/// same ID under a new name, and verifies that PerformScanAsync updates the name on the
/// existing row instead of creating a second one.
/// </summary>
[Fact]
570+
[Trait("Category", "Unit")]
571+
[Trait("Feature", "Scanning")]
572+
public async Task PerformScanAsync_WhenRepositoryRenamed_UpdatesName()
573+
{
574+
// Arrange
575+
var existingRepo = new Repository
576+
{
577+
GitHubRepositoryId = 100,
578+
Name = "test-org/old-repo-name",
579+
ComplianceStatus = "Compliant"
580+
};
581+
_dbContext.Repositories.Add(existingRepo);
582+
await _dbContext.SaveChangesAsync();
583+
584+
var config = CreateTestConfig(
585+
new PolicyConfig { Type = "has_agents_md", Action = "create-issue" }
586+
);
587+
588+
// Repository was renamed in GitHub: same GitHub ID (100), different name
589+
var repos = new List<Octokit.Repository>
590+
{
591+
CreateTestRepository(100, "new-repo-name")
592+
};
593+
594+
_configurationService.GetConfigAsync(Arg.Any<bool>()).Returns(config);
595+
_githubService.GetOrganizationRepositoriesAsync().Returns(repos);
596+
_policyEvaluationService.EvaluateRepositoryAsync(
597+
Arg.Any<Octokit.Repository>(),
598+
Arg.Any<IEnumerable<PolicyConfig>>()
599+
).Returns(Task.FromResult<IEnumerable<PolicyViolation>>(new List<PolicyViolation>()));
600+
601+
// Act
602+
await _sut.PerformScanAsync();
603+
604+
// Assert
// SingleAsync also fails the test if more than one row has GitHub ID 100.
// NOTE(review): the expected "test-org/" prefix presumably comes from CreateTestRepository
// building FullName as "test-org/<name>" — confirm against that helper.
605+
var repository = await _dbContext.Repositories.SingleAsync(r => r.GitHubRepositoryId == 100);
606+
repository.Name.Should().Be("test-org/new-repo-name", because: "repository name should be updated");
607+
repository.RepositoryId.Should().Be(existingRepo.RepositoryId, because: "same repository should be updated, not duplicated");
608+
}
609+
448610
[Fact]
449611
[Trait("Category", "Unit")]
450612
[Trait("Feature", "Scanning")]

CHANGELOG.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,32 @@
22

33
All notable changes to this project will be documented in this file.
44

5+
## 1.5
6+
7+
### Added
8+
- **Enhanced Repository Synchronization**: Improved `ScanningService` repository synchronization logic
9+
- Added automatic repository rename detection and updates when repositories are renamed in GitHub
10+
- Added automatic repository deletion when repositories no longer exist in GitHub organization
11+
- Added cascading deletion of related records (PolicyViolations and ActionLogs) before removing repositories
12+
- Added comprehensive logging for repository operations (add, rename, remove) with detailed information
13+
- Ensures database stays in sync with GitHub organization state without manual intervention
14+
- **Repository Synchronization Unit Tests**: Added comprehensive test coverage for repository synchronization scenarios
15+
- `PerformScanAsync_WhenRepositoriesDeletedInGitHub_RemovesFromDatabase`: Tests repository deletion with cascading cleanup
16+
- `PerformScanAsync_WhenRepositoryRenamed_UpdatesName`: Tests repository rename detection and updates
17+
18+
### Changed
19+
- **ScanningService**: Refactored `SyncRepositoriesAsync` method for improved repository synchronization
20+
- Changed from filtering database repositories by GitHub IDs to loading all repositories and mapping them
21+
- Uses an in-memory dictionary keyed by GitHub repository ID to look up existing records, which is what makes rename detection and updates possible
22+
- Enhanced data consistency by maintaining complete repository state in database
23+
24+
### Technical Details
25+
- **Repository Synchronization Strategy**:
26+
- Loads all repositories from database into memory for efficient comparison
27+
- Uses GitHub repository ID as the unique identifier (preserves identity across renames)
28+
- Performs three-phase synchronization: add new repos, update existing repos, remove deleted repos
29+
- Cascades deletions to maintain referential integrity (PolicyViolations and ActionLogs removed before repositories)
30+
531
## 1.4
632

733
### Added

docs/hangfire-integration.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,16 @@ The `ActionService` processes violations by:
125125

126126
This decouples the scanning process from the action-taking process. This is a robust design because even if the action-taking process fails, it can be retried independently from the scan itself, and the scan results are already safely stored in the database.
127127

128+
### Repository Synchronization During Scans
129+
130+
During each scan, the `ScanningService` automatically synchronizes the database with the current state of the GitHub organization:
131+
132+
- **New Repositories**: Detected and added to the database
133+
- **Renamed Repositories**: Repository names updated automatically (detected by GitHub repository ID)
134+
- **Deleted Repositories**: Removed from database with cascading cleanup of related records
135+
136+
This ensures that, after each scan, the compliance dashboard reflects the state of your GitHub organization as of that scan, even as repositories are added, renamed, or removed over time.
137+
128138
## Best Practices
129139

130140
- **Idempotent Jobs**: Whenever possible, design background jobs to be idempotent. This means that running the job multiple times with the same input will produce the same result. Hangfire has built-in retry mechanisms, so idempotent jobs are safer to run.

docs/policy-evaluation.md

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,30 @@ The key components of the architecture are:
1616

1717
1. The `ScanningService` initiates a scan.
1818
2. It retrieves the policy configuration from the `.github/config.yaml` file.
19-
3. For each repository in the organization, it calls `IPolicyEvaluationService.EvaluateRepositoryAsync()`.
20-
4. The `PolicyEvaluationService` uses dependency injection to get all registered `IPolicyEvaluator` implementations.
21-
5. It matches the `type` of each policy in the configuration with the `PolicyType` property of the available evaluators.
22-
6. When a match is found, it executes the `EvaluateAsync` method of that specific evaluator.
23-
7. If the evaluator finds a violation, it returns a `PolicyViolation` object.
24-
8. The `ScanningService` collects all violations and saves them to the database.
19+
3. It synchronizes policies from the configuration file with the database (adds new policies if needed).
20+
4. It synchronizes repositories between GitHub and the database:
21+
- Adds new repositories that exist in GitHub but not in the database
22+
- Updates repository names if repositories were renamed in GitHub
23+
- Removes repositories that no longer exist in GitHub (with cascading deletion of related PolicyViolations and ActionLogs)
24+
5. For each repository in the organization, it calls `IPolicyEvaluationService.EvaluateRepositoryAsync()`.
25+
6. The `PolicyEvaluationService` uses dependency injection to get all registered `IPolicyEvaluator` implementations.
26+
7. It matches the `type` of each policy in the configuration with the `PolicyType` property of the available evaluators.
27+
8. When a match is found, it executes the `EvaluateAsync` method of that specific evaluator.
28+
9. If the evaluator finds a violation, it returns a `PolicyViolation` object.
29+
10. The `ScanningService` collects all violations and saves them to the database.
30+
11. After the scan completes, it enqueues a background job to process automated actions for the violations found.
31+
32+
### Repository Synchronization
33+
34+
The `ScanningService` ensures that the database repository list stays synchronized with the GitHub organization:
35+
36+
- **New Repositories**: Automatically detected and added to the database with status "Pending"
37+
- **Renamed Repositories**: Detection based on GitHub repository ID (which remains constant across renames). Repository names are automatically updated in the database.
38+
- **Deleted Repositories**: Repositories that no longer exist in GitHub are removed from the database, along with their related records:
39+
- Policy violations associated with the repository
40+
- Action logs for actions taken on the repository
41+
42+
This synchronization happens automatically during each scan, so after every scan the database reflects the state of the GitHub organization as observed by that scan.
2543

2644
---
2745

0 commit comments

Comments
 (0)