Skip to content

Commit 58f0ccd

Browse files
raise error for unsupported files
1 parent ac8de45 commit 58f0ccd

9 files changed

+40
-11
lines changed
Binary file not shown.
Binary file not shown.

notebooks/analyzer_training.ipynb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
"\n",
2424
"## Prerequisites\n",
2525
"1. Ensure Azure AI service is configured following [steps](../README.md#configure-azure-ai-service-resource)\n",
26-
"1. Follow steps in [Set env for training data](../docs/set_env_for_training_data_and_reference_doc.md) to add training data related env variables `TRAINING_DATA_SAS_URL` and `TRAINING_DATA_PATH` into the `.env` file.\n",
26+
"1. Follow steps in [Set env for training data](../docs/set_env_for_training_data_and_reference_doc.md) to add training data related env variables `TRAINING_DATA_SAS_URL` and `TRAINING_DATA_PATH` into the [.env](./.env) file.\n",
2727
" - `TRAINING_DATA_SAS_URL`: SAS URL for your Azure Blob container. \n",
2828
" - `TRAINING_DATA_PATH`: Folder path within the container to upload training data. \n",
2929
"1. Install packages needed to run the sample\n",
@@ -145,7 +145,7 @@
145145
"## Create analyzer with defined schema\n",
146146
"Before creating the analyzer, you should fill in the constant ANALYZER_ID with a relevant name to your task. Here, we generate a unique suffix so this cell can be run multiple times to create different analyzers.\n",
147147
"\n",
148-
"We use **TRAINING_DATA_SAS_URL** and **TRAINING_DATA_PATH** that are set up in the `.env` file and used in the previous step."
148+
"We use **TRAINING_DATA_SAS_URL** and **TRAINING_DATA_PATH** that are set up in the [.env](./.env) file and used in the previous step."
149149
]
150150
},
151151
{

notebooks/field_extraction_pro_mode.ipynb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
"source": [
2929
"## Prerequisites\n",
3030
"1. Ensure Azure AI service is configured following [steps](../README.md#configure-azure-ai-service-resource)\n",
31-
"1. If using reference documents, please follow [Set env for reference doc](../docs/set_env_for_training_data_and_reference_doc.md) to set up `REFERENCE_DOC_SAS_URL` and `REFERENCE_DOC_PATH` in the `.env` file.\n",
31+
"1. If using reference documents, please follow [Set env for reference doc](../docs/set_env_for_training_data_and_reference_doc.md) to set up `REFERENCE_DOC_SAS_URL` and `REFERENCE_DOC_PATH` in the [.env](./.env) file.\n",
3232
" - `REFERENCE_DOC_SAS_URL`: SAS URL for your Azure Blob container. \n",
3333
" - `REFERENCE_DOC_PATH`: Folder path within the container for uploading reference docs. \n",
3434
"1. Install the required packages to run the sample."
@@ -181,7 +181,7 @@
181181
"## Create analyzer with defined schema for Pro mode\n",
182182
"Before creating the analyzer, you should fill in the constant ANALYZER_ID with a relevant name to your task. Here, we generate a unique suffix so this cell can be run multiple times to create different analyzers.\n",
183183
"\n",
184-
"We use **REFERENCE_DOC_SAS_URL** and **REFERENCE_DOC_PATH** that are set up in the `.env` file and used in the previous step."
184+
"We use **REFERENCE_DOC_SAS_URL** and **REFERENCE_DOC_PATH** that are set up in the [.env](./.env) file and used in the previous step."
185185
]
186186
},
187187
{

python/content_understanding_client.py

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -422,8 +422,10 @@ async def generate_training_data_on_blob(
422422
await self._upload_file_to_blob(container_client, ocr_result_path, ocr_result_blob_path)
423423
self._logger.info(f"Uploaded training data for {filename}")
424424
else:
425-
self._logger.warning(
426-
f"Label file {label_filename} or OCR result file {ocr_result_filename} does not exist for {filename}, skipping."
425+
raise FileNotFoundError(
426+
f"Label file '{label_filename}' or OCR result file '{ocr_result_filename}' "
427+
f"does not exist in '{training_docs_folder}'. "
428+
f"Please ensure both files exist for '{filename}'."
427429
)
428430

429431
async def generate_knowledge_base_on_blob(
@@ -451,20 +453,47 @@ async def generate_knowledge_base_on_blob(
451453
try:
452454
analyze_result = self.get_prebuilt_document_analyze_result(file_path)
453455
except Exception as e:
454-
self._logger.error(f"Error of getting analyze result of {filename}: {e}")
455-
continue
456+
self._logger.error(
457+
f"Error of getting analyze result of '{filename}'. "
458+
f"Please check the error message and consider retrying or removing this file."
459+
)
460+
raise e
456461
await self._upload_json_to_blob(container_client, analyze_result, result_file_blob_path)
457462
else:
458-
self._logger.info(f"Using existing result.json for {filename}")
463+
self._logger.info(f"Using existing result.json for '{filename}'")
459464
result_file_path = os.path.join(dirpath, result_file_name)
460465
if not os.path.exists(result_file_path):
461-
self._logger.warning(f"Result file {result_file_name} does not exist, skipping.")
462-
continue
466+
raise FileNotFoundError(
467+
f"Result file '{result_file_name}' does not exist in '{dirpath}'. "
468+
f"Please run analyze first or remove this file from the folder."
469+
)
463470
await self._upload_file_to_blob(container_client, result_file_path, result_file_blob_path)
464471
# Upload the original file
465472
file_blob_path = storage_container_path_prefix + filename
466473
await self._upload_file_to_blob(container_client, file_path, file_blob_path)
467474
resources.append({"file": filename, "resultFile": result_file_name})
475+
elif filename.endswith(self.OCR_RESULT_FILE_SUFFIX) and skip_analyze:
476+
if filename.replace(self.OCR_RESULT_FILE_SUFFIX, "") in filenames:
477+
# skip result.json files corresponding to the file with supported document type
478+
original_filename = filename.replace(self.OCR_RESULT_FILE_SUFFIX, "")
479+
original_filename_no_ext, original_file_ext = os.path.splitext(original_filename)
480+
if self.is_supported_type_by_file_ext(original_file_ext, is_document=True):
481+
continue
482+
else:
483+
raise ValueError(
484+
f"The original file of '{filename}' is not a supported document type, "
485+
f"please remove the result file '{filename}' and '{original_filename}'."
486+
)
487+
else:
488+
raise ValueError(
489+
f"Result file '{filename}' is not corresponding to an original file, "
490+
f"please remove it."
491+
)
492+
else:
493+
raise ValueError(
494+
f"File '{filename}' is not a supported document type, "
495+
f"please remove it or convert it to a supported type."
496+
)
468497
# Upload sources.jsonl
469498
await self.upload_jsonl_to_blob(
470499
container_client, resources, storage_container_path_prefix + self.KNOWLEDGE_SOURCE_LIST_FILE_NAME)

0 commit comments

Comments
 (0)