Skip to content

Commit 5c64f10

Browse files
committed
parsefolder with jsonl annotations
1 parent c92dcf7 commit 5c64f10

File tree

3 files changed

+53
-10
lines changed

3 files changed

+53
-10
lines changed

roboflow/util/folderparser.py

Lines changed: 30 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from .image_utils import load_labelmap
99

1010
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".bmp"}
11-
ANNOTATION_EXTENSIONS = {".txt", ".json", ".xml", ".csv"}
11+
ANNOTATION_EXTENSIONS = {".txt", ".json", ".xml", ".csv", ".jsonl"}
1212
LABELMAPS_EXTENSIONS = {".labels", ".yaml", ".yml"}
1313

1414

@@ -107,13 +107,14 @@ def _map_annotations_to_images_1tomany(images, annotationFiles):
107107
dirname = image["dirname"]
108108
annotationsInSameDir = annotationsByDirname.get(dirname, [])
109109
if annotationsInSameDir:
110-
if len(annotationsInSameDir) > 1:
111-
print(f"warning: found multiple annotation files on dir {dirname}")
112-
annotationFile = annotationsInSameDir[0]
113-
format = annotationFile["parsedType"]
114-
image["annotationfile"] = _filterIndividualAnnotations(
115-
image, annotationFile, format, imgRefMap, annotationMap
116-
)
110+
for annotationFile in annotationsInSameDir:
111+
format = annotationFile["parsedType"]
112+
filtered_annotations = _filterIndividualAnnotations(
113+
image, annotationFile, format, imgRefMap, annotationMap
114+
)
115+
if filtered_annotations:
116+
image["annotationfile"] = filtered_annotations
117+
break
117118

118119

119120
def _build_image_and_annotation_maps(annotationFiles):
@@ -182,11 +183,16 @@ def _filterIndividualAnnotations(image, annotation, format, imgRefMap, annotatio
182183
return _annotation
183184
else:
184185
return None
186+
elif format == "jsonl":
187+
imgLines = [line for line in parsed if line["image"] == image["name"]]
188+
if imgLines:
189+
_annotation = {"name": "annotation.jsonl", "rawText": json.dumps(imgLines)}
190+
return _annotation
185191
return None
186192

187193

188194
def _loadAnnotations(folder, annotations):
189-
valid_extensions = {".json", ".csv"}
195+
valid_extensions = {".json", ".csv", ".jsonl"}
190196
annotations = [a for a in annotations if a["extension"] in valid_extensions]
191197
for ann in annotations:
192198
extension = ann["extension"]
@@ -197,12 +203,27 @@ def _loadAnnotations(folder, annotations):
197203
if parsedType:
198204
ann["parsed"] = parsed
199205
ann["parsedType"] = parsedType
206+
elif extension == ".jsonl":
207+
ann["parsed"] = _read_jsonl(f"{folder}{ann['file']}")
208+
ann["parsedType"] = "jsonl"
200209
elif extension == ".csv":
201210
ann["parsedType"] = "csv"
202211
ann["parsed"] = _parseAnnotationCSV(f"{folder}{ann['file']}")
203212
return annotations
204213

205214

215+
def _read_jsonl(path):
216+
data = []
217+
with open(path) as file:
218+
for line in file:
219+
try:
220+
json_object = json.loads(line.strip())
221+
data.append(json_object)
222+
except json.JSONDecodeError:
223+
print(f"Warning: Skipping invalid JSON line in {path}")
224+
return data
225+
226+
206227
def _parseAnnotationCSV(filename):
207228
# TODO: use a proper CSV library?
208229
with open(filename) as f:

tests/datasets/paligemma/dataset/_annotations.test.jsonl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
{"image":"de960ddd58344041754d5f984f8f82c2_png.rf.011864613b53c6b6a0c0a7086b657a71.jpg","prefix":"What region in Italy had the highest number of mafia crimes in 2018?","suffix":"Calabria"}
44
{"image":"de960ddd58344041754d5f984f8f82c2_png.rf.011864613b53c6b6a0c0a7086b657a71.jpg","prefix":"How many criminal reports were recorded in the region of Calabria in 2018?","suffix":"896"}
55
{"image":"de48275e1ff70fab78bee31e09fc896d_png.rf.01a97b1ad053aa1e6525ac0451cee8b7.jpg","prefix":"Which sector had the highest ROI in 2013?","suffix":"Retail"}
6-
{"image":"de48275e1ff70fab78bee31e09fc896d_png.rf.01a97b1ad053aa1e6525ac0451cee8b7.jpg","prefix":"Which sector had the highest ROI in 2013?","suffix":"Retail"}
6+
{"image":"de48275e1ff70fab78bee31e09fc896d_png.rf.01a97b1ad053aa1e6525ac0451cee8b7.jpg","prefix":"Which sector had the highest ROI in 2014?","suffix":"Electronics"}
77
{"image":"e1893eee3f64bda1eac88da795ad3a00_png.rf.01248d761c27015da1fa5f3c4daea759.jpg","prefix":"How much did Hermes' national general cargo revenue add up to in 2009?","suffix":"100"}
88
{"image":"e1893eee3f64bda1eac88da795ad3a00_png.rf.01248d761c27015da1fa5f3c4daea759.jpg","prefix":"How much did Hermes' national general cargo revenue add up to in 2009?","suffix":"100"}
99
{"image":"eaab023f1ce380c4c9163415facc3c0d_png.rf.01c5a1f19653c056bbb3b0c8fc2d752d.jpg","prefix":"What's the percentage value of leftmost bar?","suffix":"24"}

tests/util/test_folderparser.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,28 @@ def test_parse_mosquitos_csv(self):
5252
expected += "train_10308.jpeg,1058,943,japonicus/koreicus,28,187,908,815\n"
5353
assert testImage["annotationfile"]["rawText"] == expected
5454

55+
def test_paligemma_format(self):
    """Check that a PaliGemma-style JSONL annotation is attached to its image.

    Parses the ``paligemma`` fixture folder, picks one image, and verifies
    that its ``annotationfile`` is named ``annotation.jsonl`` and that its
    ``rawText`` is the JSON dump of the two prefix/suffix records that
    reference that image.
    """
    image_name = "de48275e1ff70fab78bee31e09fc896d_png.rf.01a97b1ad053aa1e6525ac0451cee8b7.jpg"
    image_path = "/dataset/de48275e1ff70fab78bee31e09fc896d_png.rf.01a97b1ad053aa1e6525ac0451cee8b7.jpg"

    result = folderparser.parsefolder(f"{thisdir}/../datasets/paligemma")
    matching_images = [img for img in result["images"] if img["file"] == image_path]
    image_entry = matching_images[0]

    assert image_entry["annotationfile"]["name"] == "annotation.jsonl"

    # Key order (image, prefix, suffix) matters: rawText is compared as a string.
    question_answer_pairs = [
        ("Which sector had the highest ROI in 2013?", "Retail"),
        ("Which sector had the highest ROI in 2014?", "Electronics"),
    ]
    expected_records = []
    for question, answer in question_answer_pairs:
        expected_records.append({"image": image_name, "prefix": question, "suffix": answer})

    assert image_entry["annotationfile"]["rawText"] == json.dumps(expected_records)
76+
5577

5678
def _assertJsonMatchesFile(actual, filename):
5779
with open(filename) as file:

0 commit comments

Comments
 (0)