
Commit daf5222

upload label files from local
1 parent 3a0e739 commit daf5222

2 files changed: +87 −19 lines


notebooks/analyzer_training.ipynb

Lines changed: 37 additions & 7 deletions
@@ -15,7 +15,7 @@
 "\n",
 "Labeled data is a group of samples that have been tagged with one or more labels to add context or meaning, which is used to improve analyzer's performance.\n",
 "\n",
-"Please go to [Azure AI Foundry]() to use the labling tool to annotate your data.\n",
+"In your own project, you will use the labeling tool in [Azure AI Foundry](https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/quickstart/use-ai-foundry) to annotate your data.\n",
 "\n",
 "In this notebook we will demonstrate after you have the labeled data, how to create analyzer with them and analyze your files.\n",
 "\n",
@@ -42,8 +42,13 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"## Analyzer template\n",
-"In this sample we define a template for purchase order. We labeled the fields in the training data."
+"## Analyzer template and local training folder setup\n",
+"In this sample we define a template for receipts.\n",
+"\n",
+"The training folder should contain a flat (one-level) directory of labeled receipt documents. Each document includes:\n",
+"- The original file (e.g., PDF or image).\n",
+"- A corresponding labels.json file with labeled fields.\n",
+"- A corresponding result.json file with OCR results."
 ]
 },
 {
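For reference, a training folder matching the description in this new markdown cell might look like the following (file names are hypothetical; the full `.labels.json` and `.result.json` suffixes follow the naming convention used by `generate_training_data_on_blob` in the client changes below):

    ../data/document_training/
        receipt_001.pdf
        receipt_001.pdf.labels.json
        receipt_001.pdf.result.json
        receipt_002.png
        receipt_002.png.labels.json
        receipt_002.png.result.json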
@@ -52,7 +57,8 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"analyzer_template = '../analyzer_templates/receipt.json'"
+"analyzer_template = \"../analyzer_templates/receipt.json\"\n",
+"training_docs_folder = \"../data/document_training\""
 ]
 },
 {
@@ -106,14 +112,38 @@
 ")"
 ]
 },
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Prepare labeled data\n",
+"In this step, we will:\n",
+"- Check whether document files in the local folder have corresponding `.labels.json` and `.result.json` files\n",
+"- Upload these files to the designated Azure blob storage.\n",
+"\n",
+"We use **TRAINING_DATA_SAS_URL** and **TRAINING_DATA_PATH** that are set in the Prerequisites step."
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"TRAINING_DATA_SAS_URL = os.getenv(\"TRAINING_DATA_SAS_URL\")\n",
+"TRAINING_DATA_PATH = os.getenv(\"TRAINING_DATA_PATH\")\n",
+"\n",
+"await client.generate_training_data_on_blob(training_docs_folder, TRAINING_DATA_SAS_URL, TRAINING_DATA_PATH)"
+]
+},
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
 "## Create analyzer with defined schema\n",
 "Before creating the analyzer, you should fill in the constant ANALYZER_ID with a relevant name to your task. Here, we generate a unique suffix so this cell can be run multiple times to create different analyzers.\n",
 "\n",
-"We use **TRAINING_DATA_SAS_URL** and **TRAINING_DATA_PATH** that's set in the Prerequisites step."
+"We use **TRAINING_DATA_SAS_URL** and **TRAINING_DATA_PATH** that are set up in the `.env` file and used in the previous step."
 ]
 },
 {
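Given a folder like the one sketched above and a hypothetical TRAINING_DATA_PATH of `training_files/`, the upload in this cell writes each document and its two sidecar files to blobs named by simple concatenation of the path prefix and the local file name:

    training_files/receipt_001.pdf
    training_files/receipt_001.pdf.labels.json
    training_files/receipt_001.pdf.result.json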
@@ -128,8 +158,8 @@
 "response = client.begin_create_analyzer(\n",
 "    CUSTOM_ANALYZER_ID,\n",
 "    analyzer_template_path=analyzer_template,\n",
-"    training_storage_container_sas_url=os.getenv(\"TRAINING_DATA_SAS_URL\"),\n",
-"    training_storage_container_path_prefix=os.getenv(\"TRAINING_DATA_PATH\"),\n",
+"    training_storage_container_sas_url=TRAINING_DATA_SAS_URL,\n",
+"    training_storage_container_path_prefix=TRAINING_DATA_PATH,\n",
 ")\n",
 "result = client.poll_result(response)\n",
 "if result is not None and \"status\" in result and result[\"status\"] == \"Succeeded\":\n",

python/content_understanding_client.py

Lines changed: 50 additions & 12 deletions
@@ -19,10 +19,11 @@ class AzureContentUnderstandingClient:
 
     PREBUILT_DOCUMENT_ANALYZER_ID: str = "prebuilt-documentAnalyzer"
     OCR_RESULT_FILE_SUFFIX: str = ".result.json"
+    LABEL_FILE_SUFFIX: str = ".labels.json"
     KNOWLEDGE_SOURCE_LIST_FILE_NAME: str = "sources.jsonl"
 
     # https://learn.microsoft.com/en-us/azure/ai-services/content-understanding/service-limits#document-and-text
-    SUPPORTED_FILE_TYPES: List[str] = [
+    SUPPORTED_FILE_TYPES_DOCUMENT_TXT: List[str] = [
         ".pdf",
         ".tiff",
         ".jpg",
@@ -41,15 +42,15 @@ class AzureContentUnderstandingClient:
         ".xml",
     ]
 
-    SUPPORTED_FILE_TYPES_PRO_MODE: List[str] = [
+    SUPPORTED_FILE_TYPES_DOCUMENT: List[str] = [
         ".pdf",
         ".tiff",
         ".jpg",
         ".jpeg",
         ".png",
         ".bmp",
         ".heif",
-    ]
+    ]  # Pro mode and Training for Standard mode only support document data
 
     def __init__(
         self,
@@ -130,39 +131,39 @@ def _get_headers(
         return headers
 
     @staticmethod
-    def is_supported_type_by_file_ext(file_ext: str, is_pro_mode: bool=False) -> bool:
+    def is_supported_type_by_file_ext(file_ext: str, is_document: bool=False) -> bool:
         """
         Checks if the given file extension is supported.
 
         Args:
             file_ext (str): The file extension to check.
-            is_pro_mode (bool): If True, checks against Pro mode supported file types.
+            is_document (bool): If True, checks against Document supported file types.
 
         Returns:
             bool: True if the file type is supported, False otherwise.
         """
         supported_types = (
-            AzureContentUnderstandingClient.SUPPORTED_FILE_TYPES_PRO_MODE
-            if is_pro_mode else AzureContentUnderstandingClient.SUPPORTED_FILE_TYPES
+            AzureContentUnderstandingClient.SUPPORTED_FILE_TYPES_DOCUMENT
+            if is_document else AzureContentUnderstandingClient.SUPPORTED_FILE_TYPES_DOCUMENT_TXT
         )
         return file_ext.lower() in supported_types
 
     @staticmethod
-    def is_supported_type_by_file_path(file_path: Path, is_pro_mode: bool=False) -> bool:
+    def is_supported_type_by_file_path(file_path: Path, is_document: bool=False) -> bool:
         """
         Checks if the given file path has a supported file type.
 
         Args:
             file_path (Path): The path to the file to check.
-            is_pro_mode (bool): If True, checks against Pro mode supported file types.
+            is_document (bool): If True, checks against Document supported file types.
 
         Returns:
             bool: True if the file type is supported, False otherwise.
         """
         if not file_path.is_file():
             return False
         file_ext = file_path.suffix.lower()
-        return AzureContentUnderstandingClient.is_supported_type_by_file_ext(file_ext, is_pro_mode)
+        return AzureContentUnderstandingClient.is_supported_type_by_file_ext(file_ext, is_document)
 
     def get_all_analyzers(self) -> Dict[str, Any]:
         """
@@ -318,7 +319,7 @@ def begin_analyze(self, analyzer_id: str, file_location: str) -> Response:
                     "data": base64.b64encode(f.read_bytes()).decode("utf-8")
                 }
                 for f in file_path.rglob("*")
-                if f.is_file() and self.is_supported_type_by_file_path(f, is_pro_mode=True)
+                if f.is_file() and self.is_supported_type_by_file_path(f, is_document=True)
             ]
         }
         headers = {"Content-Type": "application/json"}
@@ -389,6 +390,42 @@ async def upload_jsonl_to_blob(
             await container_client.upload_blob(name=target_blob_path, data=jsonl_bytes, overwrite=True)
         self._logger.info(f"Uploaded jsonl to blob '{target_blob_path}'")
 
+    async def generate_training_data_on_blob(
+        self,
+        training_docs_folder: str,
+        storage_container_sas_url: str,
+        storage_container_path_prefix: str,
+    ) -> None:
+        if not storage_container_path_prefix.endswith("/"):
+            storage_container_path_prefix += "/"
+
+        async with ContainerClient.from_container_url(storage_container_sas_url) as container_client:
+            for filename in os.listdir(training_docs_folder):
+                file_path = os.path.join(training_docs_folder, filename)
+                _, file_ext = os.path.splitext(filename)
+                if os.path.isfile(file_path) and (
+                        file_ext == "" or file_ext.lower() in self.SUPPORTED_FILE_TYPES_DOCUMENT):
+                    # The training feature only supports Standard mode with document data.
+                    # Document files uploaded to AI Foundry are converted to UUID names without an extension.
+                    label_filename = filename + self.LABEL_FILE_SUFFIX
+                    label_path = os.path.join(training_docs_folder, label_filename)
+                    ocr_result_filename = filename + self.OCR_RESULT_FILE_SUFFIX
+                    ocr_result_path = os.path.join(training_docs_folder, ocr_result_filename)
+                    if os.path.exists(label_path) and os.path.exists(ocr_result_path):
+                        file_blob_path = storage_container_path_prefix + filename
+                        label_blob_path = storage_container_path_prefix + label_filename
+                        ocr_result_blob_path = storage_container_path_prefix + ocr_result_filename
+
+                        # Upload files
+                        await self._upload_file_to_blob(container_client, file_path, file_blob_path)
+                        await self._upload_file_to_blob(container_client, label_path, label_blob_path)
+                        await self._upload_file_to_blob(container_client, ocr_result_path, ocr_result_blob_path)
+                        self._logger.info(f"Uploaded training data for {filename}")
+                    else:
+                        self._logger.warning(
+                            f"Label file {label_filename} or OCR result file {ocr_result_filename} does not exist for {filename}, skipping."
+                        )
+
     async def generate_knowledge_base_on_blob(
         self,
         reference_docs_folder: str,
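The new method relies on a private helper `_upload_file_to_blob` that this diff does not show. A minimal sketch of what such a helper could look like, modeled on the `upload_blob` call in `upload_jsonl_to_blob` above (an assumption for illustration, not the repository's actual implementation):

    async def _upload_file_to_blob(
        self, container_client: ContainerClient, file_path: str, target_blob_path: str
    ) -> None:
        # Hypothetical sketch: read the local file and upload its bytes,
        # overwriting any existing blob at the target path.
        with open(file_path, "rb") as f:
            data = f.read()
        await container_client.upload_blob(name=target_blob_path, data=data, overwrite=True)
        self._logger.info(f"Uploaded file to blob '{target_blob_path}'")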
@@ -398,12 +435,13 @@ async def generate_knowledge_base_on_blob(
     ) -> None:
         if not storage_container_path_prefix.endswith("/"):
             storage_container_path_prefix += "/"
+
         resources = []
         async with ContainerClient.from_container_url(storage_container_sas_url) as container_client:
             for dirpath, _, filenames in os.walk(reference_docs_folder):
                 for filename in filenames:
                     filename_no_ext, file_ext = os.path.splitext(filename)
-                    if self.is_supported_type_by_file_ext(file_ext, is_pro_mode=True):
+                    if self.is_supported_type_by_file_ext(file_ext, is_document=True):
                         file_path = os.path.join(dirpath, filename)
                         result_file_name = filename_no_ext + self.OCR_RESULT_FILE_SUFFIX
                         result_file_blob_path = storage_container_path_prefix + result_file_name
