Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion src/launchpad/artifacts/apple/zipped_xcarchive.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class AssetCatalogElement:
full_path: Path | None
idiom: str | None = None
colorspace: str | None = None
content_hash: str | None = None


@dataclass
Expand Down Expand Up @@ -477,9 +478,10 @@ def _parse_asset_element(self, item: dict[str, Any], parent_path: Path) -> Asset
filename = item.get("filename", "")
idiom = item.get("idiom")
colorspace = item.get("colorspace")
content_hash = item.get("contentHash")

file_extension = Path(filename).suffix.lower()
if filename and file_extension in {".png", ".jpg", ".jpeg", ".heic", ".heif"}:
if filename and file_extension in {".png", ".jpg", ".jpeg", ".heic", ".heif", ".pdf", ".svg"}:
potential_path = parent_path / f"{image_id}{file_extension}"
if potential_path.exists():
full_path = potential_path
Expand All @@ -498,6 +500,7 @@ def _parse_asset_element(self, item: dict[str, Any], parent_path: Path) -> Asset
full_path=full_path,
idiom=idiom,
colorspace=colorspace,
content_hash=content_hash,
)

def _parse_and_cache_all_binaries(self, binary_paths: List[Path]) -> None:
Expand Down
36 changes: 31 additions & 5 deletions src/launchpad/size/insights/common/duplicate_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,12 @@ class DuplicateFilesInsight(Insight[DuplicateFilesInsightResult]):
def generate(self, input: InsightsInput) -> DuplicateFilesInsightResult | None:
files = input.file_analysis.files
directories = input.file_analysis.directories
all_files = self._flatten_files(files)

groups: List[FileSavingsResultGroup] = []
total_savings = 0
covered_dirs: Set[str] = set()
covered_files: Set[str] = set()

# -----------------------------
# 1) Duplicate DIRECTORIES
Expand Down Expand Up @@ -66,11 +68,19 @@ def generate(self, input: InsightsInput) -> DuplicateFilesInsightResult | None:
# 2) Duplicate FILES
# -----------------------------
files_by_hash: Dict[str, List[FileInfo]] = defaultdict(list)
for f in files:
if f.hash and not self._is_allowed_extension(f.path) and not self._is_under_any(f.path, covered_dirs):
for f in all_files:
if (
not f.path.endswith("/Other") # Skip synthetic /Other nodes
and not self._is_under_any(f.path, covered_dirs) # Skip files under duplicate dirs
and f.hash
and not self._is_allowed_extension(f.path)
):
files_by_hash[f.hash].append(f)

for dup_files in files_by_hash.values():
# Process hash groups by depth (shallowest first) to handle parent files before children
for file_hash in sorted(files_by_hash, key=lambda h: self._min_depth([f.path for f in files_by_hash[h]])):
dup_files = [f for f in files_by_hash[file_hash] if not self._is_under_any(f.path, covered_files)]

if len(dup_files) < 2:
continue

Expand All @@ -88,6 +98,7 @@ def generate(self, input: InsightsInput) -> DuplicateFilesInsightResult | None:
)
)
total_savings += savings
covered_files.update(f.path for f in dup_files if f.children)

groups.sort(key=lambda g: (-g.total_savings, g.name))

Expand All @@ -99,6 +110,15 @@ def generate(self, input: InsightsInput) -> DuplicateFilesInsightResult | None:
# Helpers
# -------------------------------------------------------------------------

def _flatten_files(self, files: List[FileInfo]) -> List[FileInfo]:
"""Recursively flatten files, extracting nested children (e.g., assets within .car files)."""
result: List[FileInfo] = []
for f in files:
result.append(f)
if f.children:
result.extend(self._flatten_files(f.children))
return result

def _directory_duplicate_candidates(self, directories: List[FileInfo]) -> List[List[FileInfo]]:
"""
Returns a list of duplicate-directory groups (lists of FileInfo) where
Expand All @@ -116,8 +136,14 @@ def _is_allowed_extension(self, file_path: str) -> bool:

@staticmethod
def _is_under_any(path: str, containers: Set[str]) -> bool:
for c in containers:
if path == c or path.startswith(c + "/"):
"""Check if path or any of its parents are in containers (O(path_depth))."""
if path in containers:
return True
# Walk up parent hierarchy: "a/b/c" checks "a" then "a/b"
parts = path.split("/")
for depth in range(1, len(parts)):
parent_path = "/".join(parts[:depth])
if parent_path in containers:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

made this method just generally more efficient

return True
return False

Expand Down
2 changes: 2 additions & 0 deletions src/launchpad/size/utils/file_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,8 @@ def _analyze_asset_catalog(xcarchive: ZippedXCArchive, relative_path: Path) -> L
for element in catalog_details:
if element.full_path and element.full_path.exists() and element.full_path.is_file():
file_hash = calculate_file_hash(element.full_path, algorithm="sha256")
elif element.content_hash:
file_hash = element.content_hash
else:
file_hash = element.image_id

Expand Down
173 changes: 173 additions & 0 deletions tests/unit/size/test_duplicate_files_insight.py
Original file line number Diff line number Diff line change
Expand Up @@ -711,3 +711,176 @@ def test_complex_scenario_with_multiple_edge_cases(self):
# Verify that .xcprivacy files were ignored (no group for them)
privacy_group = next((g for g in result.groups if "xcprivacy" in g.name), None)
assert privacy_group is None

def test_nested_children_are_flattened_for_duplicate_detection(self):
    """Test that nested children (e.g., assets inside .car files) are included in duplicate detection."""
    # NOTE(review): reconstructed — the pasted diff embedded a GitHub
    # review-comment thread between the `def` line and this docstring.
    files = [
        # Parent .car file with nested children
        FileInfo(
            full_path=Path("Assets.car"),
            path="Assets.car",
            size=10000,
            file_type="car",
            treemap_type=TreemapType.ASSETS,
            hash="car_hash",
            is_dir=False,
            children=[
                FileInfo(
                    full_path=Path("Assets.car/icon.png"),
                    path="Assets.car/icon.png",
                    size=2000,
                    file_type="png",
                    treemap_type=TreemapType.ASSETS,
                    hash="nested_icon_hash",
                    is_dir=False,
                    children=[],
                ),
                FileInfo(
                    full_path=Path("Assets.car/logo.png"),
                    path="Assets.car/logo.png",
                    size=3000,
                    file_type="png",
                    treemap_type=TreemapType.ASSETS,
                    hash="nested_logo_hash",
                    is_dir=False,
                    children=[],
                ),
            ],
        ),
        # Another .car file with duplicate nested assets; the .car hashes
        # differ, so only the nested icon.png pair should match.
        FileInfo(
            full_path=Path("Resources.bundle/Assets.car"),
            path="Resources.bundle/Assets.car",
            size=8000,
            file_type="car",
            treemap_type=TreemapType.ASSETS,
            hash="different_car_hash",
            is_dir=False,
            children=[
                FileInfo(
                    full_path=Path("Resources.bundle/Assets.car/icon.png"),
                    path="Resources.bundle/Assets.car/icon.png",
                    size=2000,
                    file_type="png",
                    treemap_type=TreemapType.ASSETS,
                    hash="nested_icon_hash",  # Same hash as Assets.car/icon.png
                    is_dir=False,
                    children=[],
                ),
            ],
        ),
    ]

    insights_input = self._create_insights_input(files)
    result = self.insight.generate(insights_input)

    assert isinstance(result, DuplicateFilesInsightResult)
    assert len(result.groups) == 1

    # Should detect the duplicate nested icon.png files
    group = result.groups[0]
    assert group.name == "icon.png"
    assert len(group.files) == 2
    assert group.total_savings == 2000

    # Verify both paths are the nested ones, not the parent .car files
    paths = {f.file_path for f in group.files}
    assert "Assets.car/icon.png" in paths
    assert "Resources.bundle/Assets.car/icon.png" in paths

def test_duplicate_car_files_dont_double_count_children(self):
    """
    Test that when entire Assets.car files are duplicates, we only report the
    .car files as duplicates, not their individual nested children (prevents double-counting).
    """
    files = [
        # First Assets.car with children
        FileInfo(
            full_path=Path("Assets.car"),
            path="Assets.car",
            size=10000,
            file_type="car",
            treemap_type=TreemapType.ASSETS,
            hash="duplicate_car_hash",
            is_dir=False,
            children=[
                FileInfo(
                    full_path=Path("Assets.car/icon.png"),
                    path="Assets.car/icon.png",
                    size=2000,
                    file_type="png",
                    treemap_type=TreemapType.ASSETS,
                    hash="icon_hash_inside_car",
                    is_dir=False,
                    children=[],
                ),
                FileInfo(
                    full_path=Path("Assets.car/logo.png"),
                    path="Assets.car/logo.png",
                    size=3000,
                    file_type="png",
                    treemap_type=TreemapType.ASSETS,
                    hash="logo_hash_inside_car",
                    is_dir=False,
                    children=[],
                ),
            ],
        ),
        # Duplicate Assets.car with same children
        FileInfo(
            full_path=Path("Backup/Assets.car"),
            path="Backup/Assets.car",
            size=10000,
            file_type="car",
            treemap_type=TreemapType.ASSETS,
            hash="duplicate_car_hash",  # Same hash as first .car
            is_dir=False,
            children=[
                FileInfo(
                    full_path=Path("Backup/Assets.car/icon.png"),
                    path="Backup/Assets.car/icon.png",
                    size=2000,
                    file_type="png",
                    treemap_type=TreemapType.ASSETS,
                    hash="icon_hash_inside_car",  # Same hash as first icon
                    is_dir=False,
                    children=[],
                ),
                FileInfo(
                    full_path=Path("Backup/Assets.car/logo.png"),
                    path="Backup/Assets.car/logo.png",
                    size=3000,
                    file_type="png",
                    treemap_type=TreemapType.ASSETS,
                    hash="logo_hash_inside_car",  # Same hash as first logo
                    is_dir=False,
                    children=[],
                ),
            ],
        ),
    ]

    insights_input = self._create_insights_input(files)
    result = self.insight.generate(insights_input)

    assert isinstance(result, DuplicateFilesInsightResult)

    # Should only have 1 group: the duplicate Assets.car files
    # The nested children should NOT be flagged separately
    # (the insight marks duplicate parents with children as "covered",
    # suppressing their descendants from later hash groups)
    assert len(result.groups) == 1

    group = result.groups[0]
    assert group.name == "Assets.car"
    assert len(group.files) == 2
    assert group.total_savings == 10000

    # Verify the group contains the .car files, not their children
    paths = {f.file_path for f in group.files}
    assert "Assets.car" in paths
    assert "Backup/Assets.car" in paths

    # Verify no nested children appear in the results
    for g in result.groups:
        for f in g.files:
            assert "icon.png" not in f.file_path
            assert "logo.png" not in f.file_path
Loading