Skip to content

Commit ef60da9

Browse files
authored
Support Cloud Storage for COCO (#66)
1 parent 6787e1e commit ef60da9

File tree

7 files changed

+3466
-446
lines changed

7 files changed

+3466
-446
lines changed

README.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,28 @@ This package is compatible with the following platforms:
6868
pip install labelformat
6969
```
7070

71+
### ☁️ Using Cloud Storage
72+
73+
To work with annotations stored in cloud storage (like AWS S3,
74+
Google Cloud Storage (GCS), or Azure Blob Storage), install the cloud storage dependencies:
75+
76+
```shell
77+
pip install "labelformat[cloud-storage]"
78+
```
79+
80+
This installs the required libraries: `s3fs` (for S3), `gcsfs` (for GCS), and
81+
`adlfs` (for Azure).
82+
83+
Labelformat uses `fsspec`, which also supports other file systems. If you need a
84+
different provider (for example FTP or SSH), check the
85+
[fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations)
86+
and install the matching implementation manually (for example `pip install sshfs` for SSH/SFTP).
87+
88+
**Current Support Limitations:**
89+
90+
- **Input format:** Cloud URIs are currently supported only for COCO input (`--input-format coco`).
91+
92+
7193
## Usage
7294

7395
### CLI

pyproject.toml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,19 @@ dependencies = [
1313
"pydantic==2.10.6",
1414
"pydantic-xml==2.17.3",
1515
"numpy",
16+
"fsspec>=2023.1.0",
1617
]
1718

1819
[project.scripts]
1920
labelformat = "labelformat.cli.cli:main"
2021

22+
[project.optional-dependencies]
23+
cloud-storage = [
24+
"s3fs>=2023.1.0",
25+
"gcsfs>=2023.1.0",
26+
"adlfs>=2023.1.0",
27+
]
28+
2129
[dependency-groups]
2230
dev = [
2331
"mypy==1.14.1",
@@ -60,6 +68,10 @@ strict = true
6068
disallow_untyped_defs = true
6169
ignore_missing_imports = false
6270

71+
[[tool.mypy.overrides]]
72+
module = "fsspec.*"
73+
ignore_missing_imports = true
74+
6375
# Print format
6476
show_error_codes = true
6577
show_error_context = true

src/labelformat/formats/coco.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
from pathlib import Path
66
from typing import Dict, Iterable, List
77

8+
import fsspec
9+
810
import labelformat.formats.coco_segmentation_helpers as segmentation_helpers
911
from labelformat.cli.registry import Task, cli_register
1012
from labelformat.formats.coco_segmentation_helpers import (
@@ -31,21 +33,21 @@
3133
ObjectDetectionOutput,
3234
SingleObjectDetection,
3335
)
34-
from labelformat.types import JsonDict, ParseError
36+
from labelformat.types import JsonDict, ParseError, PathLike
3537

3638

3739
class _COCOBaseInput:
3840
@staticmethod
3941
def add_cli_arguments(parser: ArgumentParser) -> None:
4042
parser.add_argument(
4143
"--input-file",
42-
type=Path,
44+
type=str,
4345
required=True,
44-
help="Path to input COCO JSON file",
46+
help="Path or URI to input COCO JSON file",
4547
)
4648

47-
def __init__(self, input_file: Path) -> None:
48-
with input_file.open() as file:
49+
def __init__(self, input_file: PathLike) -> None:
50+
with fsspec.open(str(input_file), mode="r") as file:
4951
self._data = json.load(file)
5052

5153
def get_categories(self) -> Iterable[Category]:

src/labelformat/types.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
1-
from typing import Any, Dict
1+
from pathlib import Path
2+
from typing import Any, Dict, Union
23

34
JsonDict = Dict[str, Any]
45

56

67
class ParseError(Exception):
78
pass
9+
10+
11+
PathLike = Union[str, Path]
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import pytest
2+
from fsspec.core import url_to_fs
3+
4+
from labelformat.formats.coco import COCOObjectDetectionInput
5+
6+
COCO_S3_ANNOTATION_FILE = (
7+
"s3://studio-test-datasets-eu/coco_subset_128_images/instances_train2017.json"
8+
)
9+
COCO_S3_IMAGES_DIR = "s3://studio-test-datasets-eu/coco_subset_128_images/images/"
10+
11+
12+
@pytest.mark.skip(reason="Requires access to S3 dataset")
13+
def test_coco_od_inputs_read_from_s3__unmocked() -> None:
14+
object_detection_input = COCOObjectDetectionInput(
15+
input_file=COCO_S3_ANNOTATION_FILE
16+
)
17+
18+
object_detection_categories = list(object_detection_input.get_categories())
19+
object_detection_images = list(object_detection_input.get_images())
20+
object_detection_labels = list(object_detection_input.get_labels())
21+
22+
assert len(object_detection_categories) == 80
23+
assert len(object_detection_images) == 128
24+
assert len(object_detection_labels) == 128
25+
assert sum(len(label.objects) for label in object_detection_labels) == 900
26+
27+
# Assert that the images found in the annotations can be accessed in S3
28+
for image in object_detection_images:
29+
image_path = COCO_S3_IMAGES_DIR + image.filename
30+
fs, fs_path = url_to_fs(image_path)
31+
assert fs.exists(fs_path)
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
import io
2+
import json
3+
4+
from pytest_mock import MockerFixture
5+
6+
from labelformat.formats.coco import COCOObjectDetectionInput
7+
8+
COCO_S3_ANNOTATION_FILE = "s3://some_bucket/some_file.json"
9+
MOCK_COCO_PAYLOAD = {
10+
"categories": [{"id": 1, "name": "cat"}],
11+
"images": [],
12+
"annotations": [],
13+
}
14+
15+
16+
def test_coco_od_inputs_read_from_s3__mocked(mocker: MockerFixture) -> None:
17+
mock_open = mocker.patch(
18+
"labelformat.formats.coco.fsspec.open",
19+
return_value=io.StringIO(json.dumps(MOCK_COCO_PAYLOAD)),
20+
)
21+
object_detection_input = COCOObjectDetectionInput(
22+
input_file=COCO_S3_ANNOTATION_FILE
23+
)
24+
object_detection_categories = list(object_detection_input.get_categories())
25+
26+
assert len(object_detection_categories) == 1
27+
assert object_detection_categories[0].name == "cat"
28+
mock_open.assert_called_once_with(COCO_S3_ANNOTATION_FILE, mode="r")

0 commit comments

Comments
 (0)