File tree Expand file tree Collapse file tree 7 files changed +3466
-446
lines changed
Expand file tree Collapse file tree 7 files changed +3466
-446
lines changed Original file line number Diff line number Diff line change @@ -68,6 +68,28 @@ This package is compatible with the following platforms:
6868pip install labelformat
6969```
7070
71+ ### ☁️ Using Cloud Storage
72+
73+ To work with annotations stored in cloud storage (like AWS S3,
74+ GCS, or Azure), install the cloud storage dependencies:
75+
76+ ```shell
77+ pip install "labelformat[cloud-storage]"
78+ ```
79+
80+ This installs the required libraries: `s3fs` (for S3), `gcsfs` (for GCS), and
81+ `adlfs` (for Azure).
82+
83+ Labelformat uses `fsspec`, which also supports other file systems. If you need a
84+ different provider (for example FTP or SSH), check the
85+ [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations)
86+ and install the matching implementation manually (for example `pip install sftpfs`).
87+
88+ **Current Support Limitations:**
89+
90+ - **Input format:** Cloud URIs are currently supported only for COCO input (`--input-format coco`).
91+
92+
7193## Usage
7294
7395### CLI
Original file line number Diff line number Diff line change @@ -13,11 +13,19 @@ dependencies = [
1313 " pydantic==2.10.6" ,
1414 " pydantic-xml==2.17.3" ,
1515 " numpy" ,
16+ " fsspec>=2023.1.0" ,
1617]
1718
1819[project .scripts ]
1920labelformat = " labelformat.cli.cli:main"
2021
22+ [project .optional-dependencies ]
23+ cloud-storage = [
24+ " s3fs>=2023.1.0" ,
25+ " gcsfs>=2023.1.0" ,
26+ " adlfs>=2023.1.0" ,
27+ ]
28+
2129[dependency-groups ]
2230dev = [
2331 " mypy==1.14.1" ,
@@ -60,6 +68,10 @@ strict = true
6068disallow_untyped_defs = true
6169ignore_missing_imports = false
6270
71+ [[tool .mypy .overrides ]]
72+ module = " fsspec.*"
73+ ignore_missing_imports = true
74+
6375# Print format
6476show_error_codes = true
6577show_error_context = true
Original file line number Diff line number Diff line change 55from pathlib import Path
66from typing import Dict , Iterable , List
77
8+ import fsspec
9+
810import labelformat .formats .coco_segmentation_helpers as segmentation_helpers
911from labelformat .cli .registry import Task , cli_register
1012from labelformat .formats .coco_segmentation_helpers import (
3133 ObjectDetectionOutput ,
3234 SingleObjectDetection ,
3335)
34- from labelformat .types import JsonDict , ParseError
36+ from labelformat .types import JsonDict , ParseError , PathLike
3537
3638
3739class _COCOBaseInput :
3840 @staticmethod
3941 def add_cli_arguments (parser : ArgumentParser ) -> None :
4042 parser .add_argument (
4143 "--input-file" ,
42- type = Path ,
44+ type = str ,
4345 required = True ,
44- help = "Path to input COCO JSON file" ,
46+ help = "Path or URI to input COCO JSON file" ,
4547 )
4648
47- def __init__ (self , input_file : Path ) -> None :
48- with input_file .open () as file :
49+ def __init__ (self , input_file : PathLike ) -> None :
50+ with fsspec .open (str ( input_file ), mode = "r" ) as file :
4951 self ._data = json .load (file )
5052
5153 def get_categories (self ) -> Iterable [Category ]:
Original file line number Diff line number Diff line change 1- from typing import Any , Dict
1+ from pathlib import Path
2+ from typing import Any , Dict , Union
23
34JsonDict = Dict [str , Any ]
45
56
67class ParseError (Exception ):
78 pass
9+
10+
11+ PathLike = Union [str , Path ]
Original file line number Diff line number Diff line change 1+ import pytest
2+ from fsspec .core import url_to_fs
3+
4+ from labelformat .formats .coco import COCOObjectDetectionInput
5+
6+ COCO_S3_ANNOTATION_FILE = (
7+ "s3://studio-test-datasets-eu/coco_subset_128_images/instances_train2017.json"
8+ )
9+ COCO_S3_IMAGES_DIR = "s3://studio-test-datasets-eu/coco_subset_128_images/images/"
10+
11+
12+ @pytest .mark .skip (reason = "Requires access to S3 dataset" )
13+ def test_coco_od_inputs_read_from_s3__unmocked () -> None :
14+ object_detection_input = COCOObjectDetectionInput (
15+ input_file = COCO_S3_ANNOTATION_FILE
16+ )
17+
18+ object_detection_categories = list (object_detection_input .get_categories ())
19+ object_detection_images = list (object_detection_input .get_images ())
20+ object_detection_labels = list (object_detection_input .get_labels ())
21+
22+ assert len (object_detection_categories ) == 80
23+ assert len (object_detection_images ) == 128
24+ assert len (object_detection_labels ) == 128
25+ assert sum (len (label .objects ) for label in object_detection_labels ) == 900
26+
27+ # Assert that the images found in the annotations can be accessed in S3
28+ for image in object_detection_images :
29+ image_path = COCO_S3_IMAGES_DIR + image .filename
30+ fs , fs_path = url_to_fs (image_path )
31+ assert fs .exists (fs_path )
Original file line number Diff line number Diff line change 1+ import io
2+ import json
3+
4+ from pytest_mock import MockerFixture
5+
6+ from labelformat .formats .coco import COCOObjectDetectionInput
7+
8+ COCO_S3_ANNOTATION_FILE = "s3://some_bucket/some_file.json"
9+ MOCK_COCO_PAYLOAD = {
10+ "categories" : [{"id" : 1 , "name" : "cat" }],
11+ "images" : [],
12+ "annotations" : [],
13+ }
14+
15+
16+ def test_coco_od_inputs_read_from_s3__mocked (mocker : MockerFixture ) -> None :
17+ mock_open = mocker .patch (
18+ "labelformat.formats.coco.fsspec.open" ,
19+ return_value = io .StringIO (json .dumps (MOCK_COCO_PAYLOAD )),
20+ )
21+ object_detection_input = COCOObjectDetectionInput (
22+ input_file = COCO_S3_ANNOTATION_FILE
23+ )
24+ object_detection_categories = list (object_detection_input .get_categories ())
25+
26+ assert len (object_detection_categories ) == 1
27+ assert object_detection_categories [0 ].name == "cat"
28+ mock_open .assert_called_once_with (COCO_S3_ANNOTATION_FILE , mode = "r" )
You can’t perform that action at this time.
0 commit comments