Be more flexible with filename matches

iurisilvio · iurisilvio · commit bad8588d8cbd · 2025-10-05T17:57:25.000+02:00
diff --git a/roboflow/util/folderparser.py b/roboflow/util/folderparser.py
@@ -140,7 +140,17 @@ def _build_image_and_annotation_maps(annotationFiles):
         )
         if parsedType == "coco":
             for imageRef in parsed["images"]:
-                imgRefMap[f"{filename}/{imageRef['file_name']}"] = imageRef
+                # Normalize and index by multiple forms to improve matching robustness
+                file_name = _patch_sep(imageRef["file_name"]).lstrip("/")
+                basename = os.path.basename(file_name)
+                stem = os.path.splitext(basename)[0]
+
+                # Prefer full relative path, but also allow basename and stem
+                imgRefMap.update({
+                    f"{filename}/{file_name}": imageRef,
+                    f"{filename}/{basename}": imageRef,
+                    f"{filename}/{stem}": imageRef,
+                })
             for annotation in parsed["annotations"]:
                 annotationMap[f"{dirname}/{annotation['image_id']}"].append(annotation)
     return imgRefMap, annotationMap
@@ -149,7 +159,15 @@ def _build_image_and_annotation_maps(annotationFiles):
 def _filterIndividualAnnotations(image, annotation, format, imgRefMap, annotationMap):
     parsed = annotation["parsed"]
     if format == "coco":
-        imgReference = imgRefMap.get(f"{annotation['file']}/{image['name']}")
+        rel_path = image["file"].lstrip("/")
+        imgReference = (
+            # Try matching by full relative path first
+            imgRefMap.get(f"{annotation['file']}/{rel_path}")
+            # Fallback: basename with extension
+            or imgRefMap.get(f"{annotation['file']}/{image['name']}")
+            # Fallback: stem (no extension)
+            or imgRefMap.get(f"{annotation['file']}/{image['key']}")
+        )
         if imgReference:
             # workaround to make Annotations.js correctly identify this as coco in the backend
             fake_annotation = {
diff --git a/tests/util/test_folderparser.py b/tests/util/test_folderparser.py
@@ -1,4 +1,6 @@
 import json
+import os
+import tempfile
 import unittest
 from os.path import abspath, dirname
 
@@ -95,6 +97,55 @@ def test_parse_multilabel_classification_csv(self):
         self.assertEqual(img1["annotationfile"]["type"], "classification_multilabel")
         self.assertEqual(set(img1["annotationfile"]["labels"]), {"Blackheads"})
 
+    def test_coco_with_subdir_file_name_should_match_annotations(self):
+        # COCO file_name includes a subdirectory, but the actual image is at dataset root.
+        with tempfile.TemporaryDirectory() as tmpdir:
+            # Image file lives at the dataset root; the nested path 2/100002/<name> is used only as COCO file_name
+            image_name = "metaclip_2_100002_02f2f7c6e15f09b401575ae6.jpeg"
+            image_relpath = os.path.join("2", "100002", image_name)
+            image_path = os.path.join(tmpdir, image_name)
+            # Create an empty image file (content not used by parser)
+            open(image_path, "wb").close()
+
+            # Create COCO annotation JSON at dataset root, referencing the image with subdir in file_name
+            coco = {
+                "info": {},
+                "licenses": [],
+                "categories": [{"id": 1, "name": "thing"}],
+                "images": [
+                    {
+                        "id": 10000000,
+                        "file_name": image_relpath.replace(os.sep, "/"),
+                        "width": 800,
+                        "height": 533,
+                    }
+                ],
+                "annotations": [
+                    {
+                        "id": 1,
+                        "image_id": 10000000,
+                        "category_id": 1,
+                        "bbox": [10, 10, 100, 50],
+                        "area": 5000,
+                        "segmentation": [],
+                        "iscrowd": 0,
+                    }
+                ],
+            }
+            coco_path = os.path.join(tmpdir, "_annotations.coco.json")
+            with open(coco_path, "w") as f:
+                json.dump(coco, f)
+
+            parsed = folderparser.parsefolder(tmpdir)
+            # Image entries store file with a leading slash relative to root
+            expected_file_key = f"/{image_name}"
+            img_entries = [i for i in parsed["images"] if i["file"] == expected_file_key]
+            self.assertTrue(len(img_entries) == 1)
+            img_entry = img_entries[0]
+
+            # annotationfile must be populated: the COCO file_name is also indexed by its basename and stem
+            self.assertIsNotNone(img_entry.get("annotationfile"))
+
 
 def _assertJsonMatchesFile(actual, filename):
     with open(filename) as file: