-
-
Notifications
You must be signed in to change notification settings - Fork 1
Improve Duplicate File Insight Coverage #519
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -711,3 +711,176 @@ def test_complex_scenario_with_multiple_edge_cases(self): | |
| # Verify that .xcprivacy files were ignored (no group for them) | ||
| privacy_group = next((g for g in result.groups if "xcprivacy" in g.name), None) | ||
| assert privacy_group is None | ||
|
|
||
| def test_nested_children_are_flattened_for_duplicate_detection(self): | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add a test case for what I mentioned above: have two duplicate
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added |
||
| """Test that nested children (e.g., assets inside .car files) are included in duplicate detection.""" | ||
| files = [ | ||
| # Parent .car file with nested children | ||
| FileInfo( | ||
| full_path=Path("Assets.car"), | ||
| path="Assets.car", | ||
| size=10000, | ||
| file_type="car", | ||
| treemap_type=TreemapType.ASSETS, | ||
| hash="car_hash", | ||
| is_dir=False, | ||
| children=[ | ||
| FileInfo( | ||
| full_path=Path("Assets.car/icon.png"), | ||
| path="Assets.car/icon.png", | ||
| size=2000, | ||
| file_type="png", | ||
| treemap_type=TreemapType.ASSETS, | ||
| hash="nested_icon_hash", | ||
| is_dir=False, | ||
| children=[], | ||
| ), | ||
| FileInfo( | ||
| full_path=Path("Assets.car/logo.png"), | ||
| path="Assets.car/logo.png", | ||
| size=3000, | ||
| file_type="png", | ||
| treemap_type=TreemapType.ASSETS, | ||
| hash="nested_logo_hash", | ||
| is_dir=False, | ||
| children=[], | ||
| ), | ||
| ], | ||
| ), | ||
| # Another .car file with duplicate nested assets | ||
| FileInfo( | ||
| full_path=Path("Resources.bundle/Assets.car"), | ||
| path="Resources.bundle/Assets.car", | ||
| size=8000, | ||
| file_type="car", | ||
| treemap_type=TreemapType.ASSETS, | ||
| hash="different_car_hash", | ||
| is_dir=False, | ||
| children=[ | ||
| FileInfo( | ||
| full_path=Path("Resources.bundle/Assets.car/icon.png"), | ||
| path="Resources.bundle/Assets.car/icon.png", | ||
| size=2000, | ||
| file_type="png", | ||
| treemap_type=TreemapType.ASSETS, | ||
| hash="nested_icon_hash", # Same hash as Assets.car/icon.png | ||
| is_dir=False, | ||
| children=[], | ||
| ), | ||
| ], | ||
| ), | ||
| ] | ||
|
|
||
| insights_input = self._create_insights_input(files) | ||
| result = self.insight.generate(insights_input) | ||
|
|
||
| assert isinstance(result, DuplicateFilesInsightResult) | ||
| assert len(result.groups) == 1 | ||
|
|
||
| # Should detect the duplicate nested icon.png files | ||
| group = result.groups[0] | ||
| assert group.name == "icon.png" | ||
| assert len(group.files) == 2 | ||
| assert group.total_savings == 2000 | ||
|
|
||
| # Verify both paths are the nested ones, not the parent .car files | ||
| paths = {f.file_path for f in group.files} | ||
| assert "Assets.car/icon.png" in paths | ||
| assert "Resources.bundle/Assets.car/icon.png" in paths | ||
|
|
||
| def test_duplicate_car_files_dont_double_count_children(self): | ||
| """ | ||
| Test that when entire Assets.car files are duplicates, we only report the | ||
| .car files as duplicates, not their individual nested children (prevents double-counting). | ||
| """ | ||
| files = [ | ||
| # First Assets.car with children | ||
| FileInfo( | ||
| full_path=Path("Assets.car"), | ||
| path="Assets.car", | ||
| size=10000, | ||
| file_type="car", | ||
| treemap_type=TreemapType.ASSETS, | ||
| hash="duplicate_car_hash", | ||
| is_dir=False, | ||
| children=[ | ||
| FileInfo( | ||
| full_path=Path("Assets.car/icon.png"), | ||
| path="Assets.car/icon.png", | ||
| size=2000, | ||
| file_type="png", | ||
| treemap_type=TreemapType.ASSETS, | ||
| hash="icon_hash_inside_car", | ||
| is_dir=False, | ||
| children=[], | ||
| ), | ||
| FileInfo( | ||
| full_path=Path("Assets.car/logo.png"), | ||
| path="Assets.car/logo.png", | ||
| size=3000, | ||
| file_type="png", | ||
| treemap_type=TreemapType.ASSETS, | ||
| hash="logo_hash_inside_car", | ||
| is_dir=False, | ||
| children=[], | ||
| ), | ||
| ], | ||
| ), | ||
| # Duplicate Assets.car with same children | ||
| FileInfo( | ||
| full_path=Path("Backup/Assets.car"), | ||
| path="Backup/Assets.car", | ||
| size=10000, | ||
| file_type="car", | ||
| treemap_type=TreemapType.ASSETS, | ||
| hash="duplicate_car_hash", # Same hash as first .car | ||
| is_dir=False, | ||
| children=[ | ||
| FileInfo( | ||
| full_path=Path("Backup/Assets.car/icon.png"), | ||
| path="Backup/Assets.car/icon.png", | ||
| size=2000, | ||
| file_type="png", | ||
| treemap_type=TreemapType.ASSETS, | ||
| hash="icon_hash_inside_car", # Same hash as first icon | ||
| is_dir=False, | ||
| children=[], | ||
| ), | ||
| FileInfo( | ||
| full_path=Path("Backup/Assets.car/logo.png"), | ||
| path="Backup/Assets.car/logo.png", | ||
| size=3000, | ||
| file_type="png", | ||
| treemap_type=TreemapType.ASSETS, | ||
| hash="logo_hash_inside_car", # Same hash as first logo | ||
| is_dir=False, | ||
| children=[], | ||
| ), | ||
| ], | ||
| ), | ||
| ] | ||
|
|
||
| insights_input = self._create_insights_input(files) | ||
| result = self.insight.generate(insights_input) | ||
|
|
||
| assert isinstance(result, DuplicateFilesInsightResult) | ||
|
|
||
| # Should only have 1 group: the duplicate Assets.car files | ||
| # The nested children should NOT be flagged separately | ||
| assert len(result.groups) == 1 | ||
|
|
||
| group = result.groups[0] | ||
| assert group.name == "Assets.car" | ||
| assert len(group.files) == 2 | ||
| assert group.total_savings == 10000 | ||
|
|
||
| # Verify the group contains the .car files, not their children | ||
| paths = {f.file_path for f in group.files} | ||
| assert "Assets.car" in paths | ||
| assert "Backup/Assets.car" in paths | ||
|
|
||
| # Verify no nested children appear in the results | ||
| for g in result.groups: | ||
| for f in g.files: | ||
| assert "icon.png" not in f.file_path | ||
| assert "logo.png" not in f.file_path | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
made this method just generally more efficient