parsefolder: support .jsonl annotation files

tonylampada · tonylampada · commit 5c64f10a2c41 · 2024-09-02T19:26:51.000Z
diff --git a/roboflow/util/folderparser.py b/roboflow/util/folderparser.py
@@ -8,7 +8,7 @@
 from .image_utils import load_labelmap
 
 IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".bmp"}
-ANNOTATION_EXTENSIONS = {".txt", ".json", ".xml", ".csv"}
+ANNOTATION_EXTENSIONS = {".txt", ".json", ".xml", ".csv", ".jsonl"}
 LABELMAPS_EXTENSIONS = {".labels", ".yaml", ".yml"}
 
 
@@ -107,13 +107,14 @@ def _map_annotations_to_images_1tomany(images, annotationFiles):
         dirname = image["dirname"]
         annotationsInSameDir = annotationsByDirname.get(dirname, [])
         if annotationsInSameDir:
-            if len(annotationsInSameDir) > 1:
-                print(f"warning: found multiple annotation files on dir {dirname}")
-            annotationFile = annotationsInSameDir[0]
-            format = annotationFile["parsedType"]
-            image["annotationfile"] = _filterIndividualAnnotations(
-                image, annotationFile, format, imgRefMap, annotationMap
-            )
+            for annotationFile in annotationsInSameDir:
+                format = annotationFile["parsedType"]
+                filtered_annotations = _filterIndividualAnnotations(
+                    image, annotationFile, format, imgRefMap, annotationMap
+                )
+                if filtered_annotations:
+                    image["annotationfile"] = filtered_annotations
+                    break
 
 
 def _build_image_and_annotation_maps(annotationFiles):
@@ -182,11 +183,16 @@ def _filterIndividualAnnotations(image, annotation, format, imgRefMap, annotatio
             return _annotation
         else:
             return None
+    elif format == "jsonl":
+        imgLines = [line for line in parsed if line["image"] == image["name"]]
+        if imgLines:
+            _annotation = {"name": "annotation.jsonl", "rawText": json.dumps(imgLines)}
+            return _annotation
     return None
 
 
 def _loadAnnotations(folder, annotations):
-    valid_extensions = {".json", ".csv"}
+    valid_extensions = {".json", ".csv", ".jsonl"}
     annotations = [a for a in annotations if a["extension"] in valid_extensions]
     for ann in annotations:
         extension = ann["extension"]
@@ -197,12 +203,27 @@ def _loadAnnotations(folder, annotations):
                 if parsedType:
                     ann["parsed"] = parsed
                     ann["parsedType"] = parsedType
+        elif extension == ".jsonl":
+            ann["parsed"] = _read_jsonl(f"{folder}{ann['file']}")
+            ann["parsedType"] = "jsonl"
         elif extension == ".csv":
             ann["parsedType"] = "csv"
             ann["parsed"] = _parseAnnotationCSV(f"{folder}{ann['file']}")
     return annotations
 
 
+def _read_jsonl(path):
+    """Parse the JSON Lines file at `path` into a list, warning on (and skipping) invalid lines."""
+    data = []
+    with open(path, encoding="utf-8") as file:  # the JSON Lines format is UTF-8
+        for line in filter(None, map(str.strip, file)):  # blank lines are not errors
+            try:
+                data.append(json.loads(line))
+            except json.JSONDecodeError:
+                print(f"Warning: Skipping invalid JSON line in {path}")
+    return data
+
+
 def _parseAnnotationCSV(filename):
     # TODO: use a proper CSV library?
     with open(filename) as f:
diff --git a/tests/datasets/paligemma/dataset/_annotations.test.jsonl b/tests/datasets/paligemma/dataset/_annotations.test.jsonl
@@ -3,7 +3,7 @@
 {"image":"de960ddd58344041754d5f984f8f82c2_png.rf.011864613b53c6b6a0c0a7086b657a71.jpg","prefix":"What region in Italy had the highest number of mafia crimes in 2018?","suffix":"Calabria"}
 {"image":"de960ddd58344041754d5f984f8f82c2_png.rf.011864613b53c6b6a0c0a7086b657a71.jpg","prefix":"How many criminal reports were recorded in the region of Calabria in 2018?","suffix":"896"}
 {"image":"de48275e1ff70fab78bee31e09fc896d_png.rf.01a97b1ad053aa1e6525ac0451cee8b7.jpg","prefix":"Which sector had the highest ROI in 2013?","suffix":"Retail"}
-{"image":"de48275e1ff70fab78bee31e09fc896d_png.rf.01a97b1ad053aa1e6525ac0451cee8b7.jpg","prefix":"Which sector had the highest ROI in 2013?","suffix":"Retail"}
+{"image":"de48275e1ff70fab78bee31e09fc896d_png.rf.01a97b1ad053aa1e6525ac0451cee8b7.jpg","prefix":"Which sector had the highest ROI in 2014?","suffix":"Electronics"}
 {"image":"e1893eee3f64bda1eac88da795ad3a00_png.rf.01248d761c27015da1fa5f3c4daea759.jpg","prefix":"How much did Hermes' national general cargo revenue add up to in 2009?","suffix":"100"}
 {"image":"e1893eee3f64bda1eac88da795ad3a00_png.rf.01248d761c27015da1fa5f3c4daea759.jpg","prefix":"How much did Hermes' national general cargo revenue add up to in 2009?","suffix":"100"}
 {"image":"eaab023f1ce380c4c9163415facc3c0d_png.rf.01c5a1f19653c056bbb3b0c8fc2d752d.jpg","prefix":"What's the percentage value of leftmost bar?","suffix":"24"}
diff --git a/tests/util/test_folderparser.py b/tests/util/test_folderparser.py
@@ -52,6 +52,28 @@ def test_parse_mosquitos_csv(self):
         expected += "train_10308.jpeg,1058,943,japonicus/koreicus,28,187,908,815\n"
         assert testImage["annotationfile"]["rawText"] == expected
 
+    def test_paligemma_format(self):
+        """Both JSONL rows for the sample image end up in its annotation rawText."""
+        image_name = "de48275e1ff70fab78bee31e09fc896d_png.rf.01a97b1ad053aa1e6525ac0451cee8b7.jpg"
+        # (prefix, suffix) pairs listed for that image in the _annotations.test.jsonl fixture
+        qa_pairs = [
+            ("Which sector had the highest ROI in 2013?", "Retail"),
+            ("Which sector had the highest ROI in 2014?", "Electronics"),
+        ]
+        expected_rows = [
+            {"image": image_name, "prefix": prefix, "suffix": suffix}
+            for prefix, suffix in qa_pairs
+        ]
+
+        dataset = folderparser.parsefolder(f"{thisdir}/../datasets/paligemma")
+        candidates = [
+            img for img in dataset["images"] if img["file"] == f"/dataset/{image_name}"
+        ]
+        annotation = candidates[0]["annotationfile"]
+
+        assert annotation["name"] == "annotation.jsonl"
+        assert annotation["rawText"] == json.dumps(expected_rows)
+
 
 def _assertJsonMatchesFile(actual, filename):
     with open(filename) as file: