Skip to content

Commit ec5be8e

Browse files
authored
feat: Implement LabelBox staging brick (#26)
* Implement stage_for_label_box function * Add unit tests for stage_for_label_box function * Update docs with description and example for stage_for_label_box function * Bump version and update CHANGELOG.md * Fix linting issues and implement suggested changes * Update stage_for_label_box docs with a note for uploading files to cloud providers
1 parent 546865f commit ec5be8e

File tree

5 files changed

+306
-2
lines changed

5 files changed

+306
-2
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1-
## 0.2.1-dev5
1+
## 0.2.1-dev6
22

3+
* Added staging brick for LabelBox.
34
* Added ability to upload LabelStudio predictions
45
* Added utility function for JSONL reading and writing
56
* Added staging brick for CSV format for Prodigy

docs/source/bricks.rst

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -562,3 +562,70 @@ Examples:
562562
# The resulting CSV file is ready to be used with Prodigy
563563
with open("prodigy.csv", "w") as csv_file:
564564
csv_file.write(prodigy_csv_data)
565+
566+
567+
568+
``stage_for_label_box``
569+
--------------------------
570+
571+
Formats outputs for use with `LabelBox <https://docs.labelbox.com/docs/overview>`_. LabelBox accepts cloud-hosted data
572+
and does not support importing text directly. The ``stage_for_label_box`` brick does the following:
573+
574+
* Stages the data files in the ``output_directory`` specified in function arguments to be uploaded to a cloud storage service.
575+
* Returns a config of type ``List[Dict[str, Any]]`` that can be written to a ``json`` file and imported into LabelBox.
576+
577+
**Note:** ``stage_for_label_box`` does not upload the data to remote storage such as S3. Users can upload the data to S3
578+
using ``aws s3 sync ${output_directory} ${url_prefix}`` after running the ``stage_for_label_box`` staging brick.
579+
580+
Examples:
581+
582+
The following example demonstrates generating a ``config.json`` file that can be used with LabelBox and uploading the staged data
583+
files to an S3 bucket.
584+
585+
.. code:: python
586+
587+
import os
588+
import json
589+
590+
from unstructured.documents.elements import Title, NarrativeText
591+
from unstructured.staging.label_box import stage_for_label_box
592+
593+
# The S3 Bucket name where data files should be uploaded.
594+
S3_BUCKET_NAME = "labelbox-staging-bucket"
595+
596+
# The S3 key prefix (i.e., directory) where data files should be stored.
597+
S3_BUCKET_KEY_PREFIX = "data/"
598+
599+
# The URL prefix where the data files will be accessed.
600+
S3_URL_PREFIX = f"https://{S3_BUCKET_NAME}.s3.amazonaws.com/{S3_BUCKET_KEY_PREFIX}"
601+
602+
# The local output directory where the data files will be staged for uploading to a Cloud Storage service.
603+
LOCAL_OUTPUT_DIRECTORY = "/tmp/labelbox-staging"
604+
605+
elements = [Title(text="Title"), NarrativeText(text="Narrative")]
606+
607+
labelbox_config = stage_for_label_box(
608+
elements,
609+
output_directory=LOCAL_OUTPUT_DIRECTORY,
610+
url_prefix=S3_URL_PREFIX,
611+
external_ids=["id1", "id2"],
612+
attachments=[[{"type": "RAW_TEXT", "value": "Title description"}], [{"type": "RAW_TEXT", "value": "Narrative Description"}]],
613+
create_directory=True,
614+
)
615+
616+
# The resulting JSON config file is ready to be used with LabelBox.
617+
with open("config.json", "w+") as labelbox_config_file:
618+
json.dump(labelbox_config, labelbox_config_file, indent=4)
619+
620+
621+
# Upload staged data files to S3 from local output directory.
622+
def upload_staged_files():
    """Upload every entry of LOCAL_OUTPUT_DIRECTORY to the S3 bucket under the key prefix."""
    import posixpath

    import boto3

    s3 = boto3.client("s3")
    for filename in os.listdir(LOCAL_OUTPUT_DIRECTORY):
        filepath = os.path.join(LOCAL_OUTPUT_DIRECTORY, filename)
        # S3 object keys are "/"-separated on every platform. os.path.join would
        # insert a backslash on Windows when the prefix has no trailing slash,
        # producing keys that do not match the URLs written to the config.
        upload_key = posixpath.join(S3_BUCKET_KEY_PREFIX, filename)
        s3.upload_file(filepath, Bucket=S3_BUCKET_NAME, Key=upload_key)
629+
630+
upload_staged_files()
631+
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
import os
2+
import pytest
3+
import unstructured.staging.label_box as label_box
4+
from unstructured.documents.elements import Title, NarrativeText
5+
6+
7+
@pytest.fixture
def elements():
    """Provide one title element and one narrative element to stage."""
    title = Title(text="Title 1")
    narrative = NarrativeText(text="Narrative 1")
    return [title, narrative]
10+
11+
12+
@pytest.fixture
def output_directory(tmp_path):
    """Provide pytest's per-test temporary directory as a plain string path."""
    return os.fspath(tmp_path)
15+
16+
17+
@pytest.fixture
def nonexistent_output_directory(tmp_path):
    """Provide a path inside the temporary directory that has not been created."""
    missing_directory = tmp_path / "nonexistent_dir"
    return str(missing_directory)
20+
21+
22+
@pytest.fixture
def url_prefix():
    """Provide the URL prefix under which staged files are expected to be served."""
    prefix = "https://storage.googleapis.com/labelbox-sample-datasets/nlp"
    return prefix
25+
26+
27+
@pytest.mark.parametrize(
    "attachments, raises_error",
    [
        # Well-formed attachments; keys other than "type" and "value" are tolerated.
        (
            [
                {"type": "RAW_TEXT", "value": "Description Text"},
                {"type": "IMAGE", "value": "Image label", "ignored_value": 123},
            ],
            False,
        ),
        # Unknown attachment type.
        ([{"type": "INVALID_TYPE", "value": "Description Text"}], True),
        # Value that is not a string.
        ([{"type": "RAW_TEXT", "value": 1}], True),
        # Missing "value" key.
        ([{"type": "RAW_TEXT"}], True),
        # Missing "type" key.
        ([{"value": "My text label"}], True),
    ],
)
def test_validate_attachments(attachments, raises_error):
    """Check that attachment validation accepts valid lists and rejects invalid ones."""
    if not raises_error:
        label_box._validate_attachments(attachments, 0)
        return

    with pytest.raises(ValueError):
        label_box._validate_attachments(attachments, 0)
49+
50+
51+
attachment = {"type": "RAW_TEXT", "value": "Text description."}
52+
53+
54+
@pytest.mark.parametrize(
    (
        "external_ids, attachments, output_directory_fixture, create_directory, "
        "raises, exception_class"
    ),
    [
        (None, None, "output_directory", True, False, None),
        (["id1", "id2"], None, "output_directory", True, False, None),
        (["id1"], None, "output_directory", True, True, ValueError),
        (None, [[attachment], [attachment]], "output_directory", True, False, None),
        (None, [[attachment]], "output_directory", True, True, ValueError),
        (["id1", "id2"], [[attachment] * 2, [attachment]], "output_directory", True, False, None),
        (
            ["id1", "id2"],
            [[attachment] * 2, [attachment]],
            "nonexistent_output_directory",
            True,
            False,
            None,
        ),
        (
            ["id1", "id2"],
            [[attachment] * 2, [attachment]],
            "nonexistent_output_directory",
            False,
            True,
            FileNotFoundError,
        ),
    ],
)
def test_stage_for_label_box(
    elements,
    url_prefix,
    external_ids,
    attachments,
    output_directory_fixture,
    create_directory,
    raises,
    exception_class,
    request,
):
    """
    Exercise stage_for_label_box across id, attachment and output-directory combinations.

    Failing combinations must raise ``exception_class``. Successful ones must return one
    config entry per element and write each element's text to ``<externalId>.txt``.
    """
    # The directory fixture is chosen by name so one parametrization can cover both
    # an existing and a missing output directory.
    output_directory = request.getfixturevalue(output_directory_fixture)
    stage_kwargs = {
        "external_ids": external_ids,
        "attachments": attachments,
        "create_directory": create_directory,
    }

    if raises:
        with pytest.raises(exception_class):
            label_box.stage_for_label_box(elements, output_directory, url_prefix, **stage_kwargs)
        return

    config = label_box.stage_for_label_box(elements, output_directory, url_prefix, **stage_kwargs)
    assert len(config) == len(elements)

    for index, (element_config, element) in enumerate(zip(config, elements)):
        if external_ids:
            assert element_config["externalId"] == external_ids[index]
        else:
            assert element_config["externalId"] == element.id

        if attachments:
            assert element_config["attachments"] == [
                {"type": item["type"], "value": item["value"]} for item in attachments[index]
            ]
        else:
            # Omitted attachments are staged as an empty list for every element.
            assert element_config["attachments"] == []

        assert element_config["data"].startswith(url_prefix)
        assert element_config["data"].endswith(f'{element_config["externalId"]}.txt')

        output_filepath = os.path.join(output_directory, f'{element_config["externalId"]}.txt')
        with open(output_filepath, "r") as data_file:
            assert data_file.read().strip() == element.text.strip()

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.2.1-dev5" # pragma: no cover
1+
__version__ = "0.2.1-dev6" # pragma: no cover

unstructured/staging/label_box.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
import os
2+
3+
from typing import Any, Dict, List, Optional, Union, Sequence
4+
from unstructured.documents.elements import Text, NoID
5+
6+
7+
VALID_ATTACHMENT_TYPES: List[str] = ["IMAGE", "VIDEO", "RAW_TEXT", "TEXT_URL", "HTML"]
8+
9+
10+
def _validate_attachments(attachment_list: List[Dict[str, str]], element_index: int):
11+
"""
12+
Validates attachment list specified for an element.
13+
Raises a ValueError with error message if the attachment list are not valid.
14+
"""
15+
for attachment_index, attachment in enumerate(attachment_list):
16+
error_message_prefix = (
17+
f"Error at index {attachment_index} of attachments parameter "
18+
f"for element at index {element_index}."
19+
)
20+
try:
21+
attachment_type = attachment["type"]
22+
attachment_value = attachment["value"]
23+
except KeyError as e:
24+
raise ValueError(f" Missing required key: {e.args[0]}")
25+
26+
if (
27+
not isinstance(attachment_type, str)
28+
or attachment_type.upper() not in VALID_ATTACHMENT_TYPES
29+
):
30+
raise ValueError(
31+
f"{error_message_prefix}. Invalid value specified for attachment.type. "
32+
f"Must be one of: {', '.join(VALID_ATTACHMENT_TYPES)}"
33+
)
34+
if not isinstance(attachment_value, str):
35+
raise ValueError(
36+
f"{error_message_prefix}. Invalid value specified for attachment.value. "
37+
"Must be of type string."
38+
)
39+
40+
41+
def stage_for_label_box(
    elements: List[Text],
    output_directory: str,
    url_prefix: str,
    external_ids: Optional[List[str]] = None,
    attachments: Optional[List[List[Dict[str, str]]]] = None,
    create_directory: bool = False,
) -> List[Dict[str, Any]]:
    """
    Stages documents to be uploaded to LabelBox and generates LabelBox configuration.
    ref: https://docs.labelbox.com/reference/data-import-format-overview

    The text of each element is written to ``<id>.txt`` inside ``output_directory``, where
    the id is the matching entry of ``external_ids`` or, when that is omitted, the
    element's own ``id``. Nothing is uploaded: the files are only written locally.

    Returns one dictionary per element containing:
      * "data": ``url_prefix`` joined with the staged file name,
      * "attachments": the element's attachments with upper-cased types,
      * "externalId": the id, included only when the id is a string.

    Raises ValueError when ``external_ids`` or ``attachments`` does not have the same
    length as ``elements`` or when an attachment is invalid, and FileNotFoundError when
    ``output_directory`` is missing and ``create_directory`` is False.
    """
    ids: Sequence[Union[str, NoID]]
    if (external_ids is not None) and len(external_ids) != len(elements):
        raise ValueError(
            "The external_ids parameter must be a list and the length of external_ids parameter "
            "must be the same as the length of elements parameter."
        )
    elif external_ids is None:
        ids = [element.id for element in elements]
    else:
        ids = external_ids

    if (attachments is not None) and len(attachments) != len(elements):
        raise ValueError(
            "The attachments parameter must be a list and the length of attachments parameter "
            "must be the same as the length of elements parameter."
        )
    elif attachments is None:
        attachments = [[] for _ in elements]
    else:
        for index, attachment_list in enumerate(attachments):
            _validate_attachments(attachment_list, index)

    # All inputs are validated before the filesystem is touched, so a rejected call
    # never leaves a newly created directory behind.
    if create_directory:
        os.makedirs(output_directory, exist_ok=True)
    elif not os.path.isdir(output_directory):
        raise FileNotFoundError(output_directory)

    config_data: List[Dict[str, Any]] = []
    for element, element_id, attachment_list in zip(elements, ids, attachments):
        output_filename = f"{element_id}.txt"
        # Strip trailing slashes so the URL gets exactly one separator.
        data_url = "/".join([url_prefix.rstrip("/"), output_filename])
        output_filepath = os.path.join(output_directory, output_filename)
        # Write UTF-8 explicitly: the platform default encoding may be unable to
        # represent the element text or may produce bytes the consumer misreads.
        with open(output_filepath, "w", encoding="utf-8") as output_text_file:
            output_text_file.write(element.text)

        element_config: Dict[str, Any] = {
            "data": data_url,
            "attachments": [
                {"type": attachment["type"].upper(), "value": attachment["value"]}
                for attachment in attachment_list
            ],
        }
        if isinstance(element_id, str):
            element_config["externalId"] = element_id

        config_data.append(element_config)

    return config_data

0 commit comments

Comments
 (0)