diff --git a/src/launchpad/artifacts/apple/zipped_xcarchive.py b/src/launchpad/artifacts/apple/zipped_xcarchive.py
index f2b610ed..8f4a709b 100644
--- a/src/launchpad/artifacts/apple/zipped_xcarchive.py
+++ b/src/launchpad/artifacts/apple/zipped_xcarchive.py
@@ -33,6 +33,7 @@ class AssetCatalogElement:
     full_path: Path | None
     idiom: str | None = None
     colorspace: str | None = None
+    content_hash: str | None = None
 
 
 @dataclass
@@ -477,9 +478,10 @@ def _parse_asset_element(self, item: dict[str, Any], parent_path: Path) -> Asset
         filename = item.get("filename", "")
         idiom = item.get("idiom")
         colorspace = item.get("colorspace")
+        content_hash = item.get("contentHash")
         file_extension = Path(filename).suffix.lower()
 
-        if filename and file_extension in {".png", ".jpg", ".jpeg", ".heic", ".heif"}:
+        if filename and file_extension in {".png", ".jpg", ".jpeg", ".heic", ".heif", ".pdf", ".svg"}:
             potential_path = parent_path / f"{image_id}{file_extension}"
             if potential_path.exists():
                 full_path = potential_path
@@ -498,6 +500,7 @@ def _parse_asset_element(self, item: dict[str, Any], parent_path: Path) -> Asset
             full_path=full_path,
             idiom=idiom,
             colorspace=colorspace,
+            content_hash=content_hash,
         )
 
     def _parse_and_cache_all_binaries(self, binary_paths: List[Path]) -> None:
diff --git a/src/launchpad/size/insights/common/duplicate_files.py b/src/launchpad/size/insights/common/duplicate_files.py
index f6577cdc..e4166d99 100644
--- a/src/launchpad/size/insights/common/duplicate_files.py
+++ b/src/launchpad/size/insights/common/duplicate_files.py
@@ -21,10 +21,12 @@ class DuplicateFilesInsight(Insight[DuplicateFilesInsightResult]):
     def generate(self, input: InsightsInput) -> DuplicateFilesInsightResult | None:
         files = input.file_analysis.files
         directories = input.file_analysis.directories
+        all_files = self._flatten_files(files)
 
         groups: List[FileSavingsResultGroup] = []
         total_savings = 0
         covered_dirs: Set[str] = set()
+        covered_files: Set[str] = set()
 
         # -----------------------------
         # 1) Duplicate DIRECTORIES
@@ -66,11 +68,19 @@ def generate(self, input: InsightsInput) -> DuplicateFilesInsightResult | None:
         # -----------------------------
         # 2) Duplicate FILES
         # -----------------------------
         files_by_hash: Dict[str, List[FileInfo]] = defaultdict(list)
-        for f in files:
-            if f.hash and not self._is_allowed_extension(f.path) and not self._is_under_any(f.path, covered_dirs):
+        for f in all_files:
+            if (
+                not f.path.endswith("/Other")  # Skip synthetic /Other nodes
+                and not self._is_under_any(f.path, covered_dirs)  # Skip files under duplicate dirs
+                and f.hash
+                and not self._is_allowed_extension(f.path)
+            ):
                 files_by_hash[f.hash].append(f)
 
-        for dup_files in files_by_hash.values():
+        # Process hash groups by depth (shallowest first) to handle parent files before children
+        for file_hash in sorted(files_by_hash, key=lambda h: self._min_depth([f.path for f in files_by_hash[h]])):
+            dup_files = [f for f in files_by_hash[file_hash] if not self._is_under_any(f.path, covered_files)]
+
             if len(dup_files) < 2:
                 continue
@@ -88,6 +98,7 @@ def generate(self, input: InsightsInput) -> DuplicateFilesInsightResult | None:
                 )
             )
             total_savings += savings
+            covered_files.update(f.path for f in dup_files if f.children)
 
         groups.sort(key=lambda g: (-g.total_savings, g.name))
 
@@ -99,6 +110,15 @@ def generate(self, input: InsightsInput) -> DuplicateFilesInsightResult | None:
     # -------------------------------------------------------------------------
     # Helpers
     # -------------------------------------------------------------------------
+    def _flatten_files(self, files: List[FileInfo]) -> List[FileInfo]:
+        """Recursively flatten files, extracting nested children (e.g., assets within .car files)."""
+        result: List[FileInfo] = []
+        for f in files:
+            result.append(f)
+            if f.children:
+                result.extend(self._flatten_files(f.children))
+        return result
+
     def _directory_duplicate_candidates(self, directories: List[FileInfo]) -> List[List[FileInfo]]:
         """
         Returns a list of duplicate-directory groups (lists of FileInfo) where
@@ -116,8 +136,14 @@ def _is_allowed_extension(self, file_path: str) -> bool:
 
     @staticmethod
     def _is_under_any(path: str, containers: Set[str]) -> bool:
-        for c in containers:
-            if path == c or path.startswith(c + "/"):
+        """Check if path or any of its parents are in containers (O(path_depth))."""
+        if path in containers:
+            return True
+        # Walk up parent hierarchy: "a/b/c" checks "a" then "a/b"
+        parts = path.split("/")
+        for depth in range(1, len(parts)):
+            parent_path = "/".join(parts[:depth])
+            if parent_path in containers:
                 return True
         return False
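
Note: the sorted() key in the hunk above calls self._min_depth, which is not
defined in any of the hunks shown here. A minimal sketch of what it presumably
computes (the shallowest path depth within a hash group, so that groups led by
parent files are processed before groups of nested children); the name and body
below are an assumption, not part of this diff:

    @staticmethod
    def _min_depth(paths: List[str]) -> int:
        # Hypothetical helper, not in this diff: the smallest number of
        # "/"-separated components among the given paths. A group whose
        # shallowest member is a parent file sorts ahead of groups that
        # only contain nested children, so covered_files is populated
        # before the children are considered.
        return min(p.count("/") for p in paths)
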
diff --git a/src/launchpad/size/utils/file_analysis.py b/src/launchpad/size/utils/file_analysis.py
index e958c2b9..5321c00d 100644
--- a/src/launchpad/size/utils/file_analysis.py
+++ b/src/launchpad/size/utils/file_analysis.py
@@ -276,6 +276,8 @@ def _analyze_asset_catalog(xcarchive: ZippedXCArchive, relative_path: Path) -> L
     for element in catalog_details:
         if element.full_path and element.full_path.exists() and element.full_path.is_file():
             file_hash = calculate_file_hash(element.full_path, algorithm="sha256")
+        elif element.content_hash:
+            file_hash = element.content_hash
         else:
             file_hash = element.image_id
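
Note: this fallback matters because an asset compiled into a .car often has no
extracted file on disk; such an element previously fell through to image_id,
which is unlikely to match across catalogs, so identical renditions were
invisible to duplicate detection. For reference, a sketch of the catalog JSON
entry that _parse_asset_element (first file in this diff) consumes; the field
names are inferred from its .get() calls, and every value is hypothetical:

    item = {
        "filename": "icon.png",      # extension gates the full_path lookup
        "idiom": "universal",
        "colorspace": "srgb",
        "contentHash": "3f9a1c...",  # hypothetical digest, now carried into
        # AssetCatalogElement.content_hash and used as the hash fallback
    }
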
diff --git a/tests/unit/size/test_duplicate_files_insight.py b/tests/unit/size/test_duplicate_files_insight.py
index c1000103..f6373683 100644
--- a/tests/unit/size/test_duplicate_files_insight.py
+++ b/tests/unit/size/test_duplicate_files_insight.py
@@ -711,3 +711,176 @@ def test_complex_scenario_with_multiple_edge_cases(self):
         # Verify that .xcprivacy files were ignored (no group for them)
         privacy_group = next((g for g in result.groups if "xcprivacy" in g.name), None)
         assert privacy_group is None
+
+    def test_nested_children_are_flattened_for_duplicate_detection(self):
+        """Test that nested children (e.g., assets inside .car files) are included in duplicate detection."""
+        files = [
+            # Parent .car file with nested children
+            FileInfo(
+                full_path=Path("Assets.car"),
+                path="Assets.car",
+                size=10000,
+                file_type="car",
+                treemap_type=TreemapType.ASSETS,
+                hash="car_hash",
+                is_dir=False,
+                children=[
+                    FileInfo(
+                        full_path=Path("Assets.car/icon.png"),
+                        path="Assets.car/icon.png",
+                        size=2000,
+                        file_type="png",
+                        treemap_type=TreemapType.ASSETS,
+                        hash="nested_icon_hash",
+                        is_dir=False,
+                        children=[],
+                    ),
+                    FileInfo(
+                        full_path=Path("Assets.car/logo.png"),
+                        path="Assets.car/logo.png",
+                        size=3000,
+                        file_type="png",
+                        treemap_type=TreemapType.ASSETS,
+                        hash="nested_logo_hash",
+                        is_dir=False,
+                        children=[],
+                    ),
+                ],
+            ),
+            # Another .car file with duplicate nested assets
+            FileInfo(
+                full_path=Path("Resources.bundle/Assets.car"),
+                path="Resources.bundle/Assets.car",
+                size=8000,
+                file_type="car",
+                treemap_type=TreemapType.ASSETS,
+                hash="different_car_hash",
+                is_dir=False,
+                children=[
+                    FileInfo(
+                        full_path=Path("Resources.bundle/Assets.car/icon.png"),
+                        path="Resources.bundle/Assets.car/icon.png",
+                        size=2000,
+                        file_type="png",
+                        treemap_type=TreemapType.ASSETS,
+                        hash="nested_icon_hash",  # Same hash as Assets.car/icon.png
+                        is_dir=False,
+                        children=[],
+                    ),
+                ],
+            ),
+        ]
+
+        insights_input = self._create_insights_input(files)
+        result = self.insight.generate(insights_input)
+
+        assert isinstance(result, DuplicateFilesInsightResult)
+        assert len(result.groups) == 1
+
+        # Should detect the duplicate nested icon.png files
+        group = result.groups[0]
+        assert group.name == "icon.png"
+        assert len(group.files) == 2
+        assert group.total_savings == 2000
+
+        # Verify both paths are the nested ones, not the parent .car files
+        paths = {f.file_path for f in group.files}
+        assert "Assets.car/icon.png" in paths
+        assert "Resources.bundle/Assets.car/icon.png" in paths
+
+    def test_duplicate_car_files_dont_double_count_children(self):
+        """
+        Test that when entire Assets.car files are duplicates, we only report the
+        .car files as duplicates, not their individual nested children (prevents double-counting).
+        """
+        files = [
+            # First Assets.car with children
+            FileInfo(
+                full_path=Path("Assets.car"),
+                path="Assets.car",
+                size=10000,
+                file_type="car",
+                treemap_type=TreemapType.ASSETS,
+                hash="duplicate_car_hash",
+                is_dir=False,
+                children=[
+                    FileInfo(
+                        full_path=Path("Assets.car/icon.png"),
+                        path="Assets.car/icon.png",
+                        size=2000,
+                        file_type="png",
+                        treemap_type=TreemapType.ASSETS,
+                        hash="icon_hash_inside_car",
+                        is_dir=False,
+                        children=[],
+                    ),
+                    FileInfo(
+                        full_path=Path("Assets.car/logo.png"),
+                        path="Assets.car/logo.png",
+                        size=3000,
+                        file_type="png",
+                        treemap_type=TreemapType.ASSETS,
+                        hash="logo_hash_inside_car",
+                        is_dir=False,
+                        children=[],
+                    ),
+                ],
+            ),
+            # Duplicate Assets.car with same children
+            FileInfo(
+                full_path=Path("Backup/Assets.car"),
+                path="Backup/Assets.car",
+                size=10000,
+                file_type="car",
+                treemap_type=TreemapType.ASSETS,
+                hash="duplicate_car_hash",  # Same hash as first .car
+                is_dir=False,
+                children=[
+                    FileInfo(
+                        full_path=Path("Backup/Assets.car/icon.png"),
+                        path="Backup/Assets.car/icon.png",
+                        size=2000,
+                        file_type="png",
+                        treemap_type=TreemapType.ASSETS,
+                        hash="icon_hash_inside_car",  # Same hash as first icon
+                        is_dir=False,
+                        children=[],
+                    ),
+                    FileInfo(
+                        full_path=Path("Backup/Assets.car/logo.png"),
+                        path="Backup/Assets.car/logo.png",
+                        size=3000,
+                        file_type="png",
+                        treemap_type=TreemapType.ASSETS,
+                        hash="logo_hash_inside_car",  # Same hash as first logo
+                        is_dir=False,
+                        children=[],
+                    ),
+                ],
+            ),
+        ]
+
+        insights_input = self._create_insights_input(files)
+        result = self.insight.generate(insights_input)
+
+        assert isinstance(result, DuplicateFilesInsightResult)
+
+        # Should only have 1 group: the duplicate Assets.car files
+        # The nested children should NOT be flagged separately
+        assert len(result.groups) == 1
+
+        group = result.groups[0]
+        assert group.name == "Assets.car"
+        assert len(group.files) == 2
+        assert group.total_savings == 10000
+
+        # Verify the group contains the .car files, not their children
+        paths = {f.file_path for f in group.files}
+        assert "Assets.car" in paths
+        assert "Backup/Assets.car" in paths
+
+        # Verify no nested children appear in the results
+        for g in result.groups:
+            for f in g.files:
+                assert "icon.png" not in f.file_path
+                assert "logo.png" not in f.file_path
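
Note: the savings assertions in both tests are consistent with duplicate
savings being counted as "all copies but one". A quick sanity check of that
arithmetic (a sketch, not the insight's actual implementation):

    def expected_savings(size: int, copies: int) -> int:
        # Keep one copy, save the rest.
        return size * (copies - 1)

    assert expected_savings(2000, 2) == 2000    # two nested icon.png copies
    assert expected_savings(10000, 2) == 10000  # two identical Assets.car files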