
Commit 23a9b49

Add in-memory deduplication for dry_run mode
This addresses the maintainer's concern about matching findings within the
same scan report.

Problem: If findings 100 and 101 in the same report have identical
hash_codes, a real import would match finding 101 against finding 100
(which was just saved to the DB). In the previous dry_run implementation
this match never occurred, since finding 100 was never saved, leading to
inaccurate statistics.

Solution:
1. Track new findings in-memory during dry_run (self.dry_run_new_findings).
2. Update match_new_finding_to_existing_finding() to check both:
   - database findings (existing behavior)
   - in-memory findings from the current scan (new for dry_run)
3. Split the matching logic into helper methods:
   - _get_db_matches(): query the database for matches
   - _get_in_memory_matches(): check in-memory findings (dry_run only)
4. When a new finding is created in dry_run, add it to the tracking list.

Result: Dry run now accurately simulates deduplication within the same scan
report, producing statistics that match what would actually happen in a real
import. This makes the dry_run feature much more reliable for previewing
imports. The documentation has been updated to reflect that this limitation
has been resolved.
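The mechanism is easiest to see in isolation. The following is a minimal
editorial sketch, not DefectDojo code: a simplified Finding stand-in and a
hypothetical dry_run_dedup() helper show how tracking already-seen findings
in memory lets the second of two identical findings be classified as a
duplicate rather than as new.

from dataclasses import dataclass


@dataclass
class Finding:
    id: int
    title: str
    hash_code: str


def dry_run_dedup(report: list[Finding]) -> tuple[list[Finding], list[Finding]]:
    """Classify findings as new or duplicate without touching a database."""
    seen: list[Finding] = []  # in-memory stand-in for "already saved" findings
    new_items: list[Finding] = []
    duplicates: list[Finding] = []
    for finding in report:
        # Matching against an earlier in-memory finding simulates the DB hit
        # a real import would get, because the earlier finding would be saved.
        if any(f.hash_code == finding.hash_code for f in seen):
            duplicates.append(finding)
        else:
            new_items.append(finding)
            seen.append(finding)
    return new_items, duplicates


report = [
    Finding(100, "SQL injection", "abc123"),
    Finding(101, "SQL injection", "abc123"),  # same hash_code as finding 100
]
new_items, duplicates = dry_run_dedup(report)
assert [f.id for f in new_items] == [100]
assert [f.id for f in duplicates] == [101]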
1 parent 3664582 commit 23a9b49

File tree

1 file changed (+64 -6)

dojo/importers/default_reimporter.py

Lines changed: 64 additions & 6 deletions
@@ -65,15 +65,17 @@ class DefaultReImporter(BaseImporter, DefaultReImporterOptions):
     without making any database changes. This allows users to preview what would
     happen during a real reimport.

+    The dry_run mode uses in-memory tracking to accurately simulate deduplication,
+    including matches between findings within the same scan report. This means that
+    if finding 100 and 101 in the report have the same hash_code, finding 101 will
+    correctly be identified as a duplicate of finding 100, just as in a real import.
+
     Known Limitations in Dry Run Mode:
-    - Finding matching within the same report: If two findings in the same scan report
-      have the same hash_code, the second finding will NOT be matched against the first
-      in dry_run mode (since the first is never saved to the database). In a real import,
-      this match would occur. This means dry_run statistics may show slightly more "new"
-      findings than would actually be created.
     - Endpoint updates are not simulated
     - Finding groups are not processed
     - JIRA integration is skipped
+    - No notifications are sent
+    - Test/engagement timestamps are not updated
     """

     def __init__(self, *args, **kwargs):
@@ -260,6 +262,9 @@ def process_findings(
         self.reactivated_items = []
         self.unchanged_items = []
         self.group_names_to_findings_dict = {}
+        # In dry_run mode, track new findings in-memory to enable proper deduplication
+        # within the same scan report (e.g., if finding 100 and 101 have same hash_code)
+        self.dry_run_new_findings = [] if self.dry_run else None
         # Progressive batching for chord execution
         post_processing_task_signatures = []
         current_batch_number = 1
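A note on the sentinel in this hunk: one effect of initializing the tracker
to None outside dry_run, rather than to an always-empty list, is that any
code path touching it during a real import fails immediately instead of
silently matching nothing, so callers must check self.dry_run first. A
self-contained sketch of that pattern, with hypothetical names:

class ImporterState:
    """Hypothetical stand-in illustrating the sentinel-plus-guard pattern."""

    def __init__(self, dry_run: bool):
        self.dry_run = dry_run
        # None during real imports: appending would raise AttributeError,
        # surfacing any code path that wrongly uses dry-run-only state.
        self.dry_run_new_findings = [] if dry_run else None


preview = ImporterState(dry_run=True)
preview.dry_run_new_findings.append("finding-100")  # valid only in dry_run

real = ImporterState(dry_run=False)
assert real.dry_run_new_findings is None  # guard on dry_run before using it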
@@ -339,6 +344,8 @@ def process_findings(
             if self.dry_run:
                 # In dry_run mode, just add to new_items without saving
                 self.new_items.append(unsaved_finding)
+                # Track in-memory for deduplication within the same scan report
+                self.dry_run_new_findings.append(unsaved_finding)
                 finding = unsaved_finding
             else:
                 finding = self.process_finding_that_was_not_matched(unsaved_finding)
@@ -482,10 +489,31 @@ def match_new_finding_to_existing_finding(
         self,
         unsaved_finding: Finding,
     ) -> list[Finding]:
-        """Matches a single new finding to N existing findings and then returns those matches"""
+        """
+        Matches a single new finding to N existing findings and returns those matches.
+        In dry_run mode, also checks against in-memory findings to simulate proper deduplication
+        within the same scan report.
+        """
         # This code should match the logic used for deduplication out of the re-import feature.
         # See utils.py deduplicate_* functions
         deduplicationLogger.debug("return findings bases on algorithm: %s", self.deduplication_algorithm)
+
+        # Get matches from database
+        db_matches = self._get_db_matches(unsaved_finding)
+
+        # In dry_run mode, also check in-memory findings from current scan
+        if self.dry_run and self.dry_run_new_findings:
+            in_memory_matches = self._get_in_memory_matches(unsaved_finding)
+            # Combine matches: in-memory findings should come first (they would have lower IDs)
+            if in_memory_matches:
+                deduplicationLogger.debug(f"Found {len(in_memory_matches)} in-memory matches in dry_run mode")
+                # Return in-memory match (simulates what would happen if it was saved)
+                return [in_memory_matches[0]]
+
+        return db_matches
+
+    def _get_db_matches(self, unsaved_finding: Finding) -> list[Finding]:
+        """Get matches from the database based on deduplication algorithm"""
         if self.deduplication_algorithm == "hash_code":
             return (
                 Finding.objects.filter(
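As written, an in-memory match takes precedence over any database matches:
the method returns the earliest in-memory match and skips db_matches
entirely when one exists. A standalone sketch of that precedence rule, using
SimpleNamespace objects in place of Finding instances:

from types import SimpleNamespace


def pick_match(db_matches: list, in_memory_matches: list) -> list:
    # Mirrors the precedence above: an in-memory finding from the current
    # report wins over database rows; otherwise fall back to the DB matches.
    if in_memory_matches:
        return [in_memory_matches[0]]
    return db_matches


earlier = SimpleNamespace(hash_code="abc123", source="same report")
db_row = SimpleNamespace(hash_code="abc123", source="database")

assert pick_match([db_row], [earlier])[0].source == "same report"
assert pick_match([db_row], [])[0].source == "database"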
@@ -532,6 +560,36 @@ def match_new_finding_to_existing_finding(
         logger.error(f'Internal error: unexpected deduplication_algorithm: "{self.deduplication_algorithm}"')
         return None

+    def _get_in_memory_matches(self, unsaved_finding: Finding) -> list[Finding]:
+        """
+        Check in-memory findings for matches (used in dry_run mode).
+        This simulates the deduplication that would occur within the same scan report.
+        """
+        matches = []
+        for in_memory_finding in self.dry_run_new_findings:
+            if self.deduplication_algorithm == "hash_code":
+                if in_memory_finding.hash_code and in_memory_finding.hash_code == unsaved_finding.hash_code:
+                    matches.append(in_memory_finding)
+            elif self.deduplication_algorithm == "unique_id_from_tool":
+                if (
+                    in_memory_finding.unique_id_from_tool
+                    and in_memory_finding.unique_id_from_tool == unsaved_finding.unique_id_from_tool
+                ):
+                    matches.append(in_memory_finding)
+            elif self.deduplication_algorithm == "unique_id_from_tool_or_hash_code":
+                if (in_memory_finding.hash_code and in_memory_finding.hash_code == unsaved_finding.hash_code) or (
+                    in_memory_finding.unique_id_from_tool
+                    and in_memory_finding.unique_id_from_tool == unsaved_finding.unique_id_from_tool
+                ):
+                    matches.append(in_memory_finding)
+            elif self.deduplication_algorithm == "legacy":
+                if (
+                    in_memory_finding.title.lower() == unsaved_finding.title.lower()
+                    and in_memory_finding.severity == unsaved_finding.severity
+                ):
+                    matches.append(in_memory_finding)
+        return matches
+
     def categorize_matched_finding_for_dry_run(
         self,
         unsaved_finding: Finding,
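A sketch of how the new helper could be unit-tested without database rows,
since it only reads plain attributes: SimpleNamespace stand-ins supply the
importer state and candidate finding. This assumes a test environment where
dojo.importers.default_reimporter imports cleanly (i.e., Django is
configured); the test names are illustrative.

from types import SimpleNamespace

from dojo.importers.default_reimporter import DefaultReImporter


def test_in_memory_match_on_hash_code():
    # Only the attributes _get_in_memory_matches reads are provided.
    importer = SimpleNamespace(
        deduplication_algorithm="hash_code",
        dry_run_new_findings=[SimpleNamespace(hash_code="abc123")],
    )
    candidate = SimpleNamespace(hash_code="abc123")
    matches = DefaultReImporter._get_in_memory_matches(importer, candidate)
    assert len(matches) == 1


def test_no_match_on_different_hash_code():
    importer = SimpleNamespace(
        deduplication_algorithm="hash_code",
        dry_run_new_findings=[SimpleNamespace(hash_code="abc123")],
    )
    candidate = SimpleNamespace(hash_code="zzz999")
    assert DefaultReImporter._get_in_memory_matches(importer, candidate) == []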