Skip to content

Commit f07513c

Browse files
author
Yalin Li
authored
[DI] Enable to run sphinx in pipeline (#35078)
1 parent dba02d4 commit f07513c

File tree

10 files changed

+146
-50
lines changed

10 files changed

+146
-50
lines changed

sdk/documentintelligence/azure-ai-documentintelligence/README.md

Lines changed: 87 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,91 @@ print("----------------------------------------")
266266

267267
<!-- END SNIPPET -->
268268

269+
### Using the General Document Model
270+
271+
Analyze key-value pairs, tables, styles, and selection marks from documents using the general document model provided by the Document Intelligence service.
272+
Run this analysis by passing the `"prebuilt-layout"` model ID together with `features=[DocumentAnalysisFeature.KEY_VALUE_PAIRS]` into the `begin_analyze_document` method:
273+
274+
<!-- SNIPPET:sample_analyze_general_documents.analyze_general_documents -->
275+
276+
```python
277+
from azure.core.credentials import AzureKeyCredential
278+
from azure.ai.documentintelligence import DocumentIntelligenceClient
279+
from azure.ai.documentintelligence.models import DocumentAnalysisFeature, AnalyzeResult
280+
281+
endpoint = os.environ["DOCUMENTINTELLIGENCE_ENDPOINT"]
282+
key = os.environ["DOCUMENTINTELLIGENCE_API_KEY"]
283+
284+
document_intelligence_client = DocumentIntelligenceClient(endpoint=endpoint, credential=AzureKeyCredential(key))
285+
with open(path_to_sample_documents, "rb") as f:
286+
poller = document_intelligence_client.begin_analyze_document(
287+
"prebuilt-layout",
288+
analyze_request=f,
289+
features=[DocumentAnalysisFeature.KEY_VALUE_PAIRS],
290+
content_type="application/octet-stream",
291+
)
292+
result: AnalyzeResult = poller.result()
293+
294+
if result.styles:
295+
for style in result.styles:
296+
if style.is_handwritten:
297+
print("Document contains handwritten content: ")
298+
print(",".join([result.content[span.offset : span.offset + span.length] for span in style.spans]))
299+
300+
print("----Key-value pairs found in document----")
301+
if result.key_value_pairs:
302+
for kv_pair in result.key_value_pairs:
303+
if kv_pair.key:
304+
print(f"Key '{kv_pair.key.content}' found within " f"'{kv_pair.key.bounding_regions}' bounding regions")
305+
if kv_pair.value:
306+
print(
307+
f"Value '{kv_pair.value.content}' found within "
308+
f"'{kv_pair.value.bounding_regions}' bounding regions\n"
309+
)
310+
311+
for page in result.pages:
312+
print(f"----Analyzing document from page #{page.page_number}----")
313+
print(f"Page has width: {page.width} and height: {page.height}, measured with unit: {page.unit}")
314+
315+
if page.lines:
316+
for line_idx, line in enumerate(page.lines):
317+
words = get_words(page.words, line)
318+
print(
319+
f"...Line #{line_idx} has {len(words)} words and text '{line.content}' within "
320+
f"bounding polygon '{line.polygon}'"
321+
)
322+
323+
for word in words:
324+
print(f"......Word '{word.content}' has a confidence of {word.confidence}")
325+
326+
if page.selection_marks:
327+
for selection_mark in page.selection_marks:
328+
print(
329+
f"Selection mark is '{selection_mark.state}' within bounding polygon "
330+
f"'{selection_mark.polygon}' and has a confidence of "
331+
f"{selection_mark.confidence}"
332+
)
333+
334+
if result.tables:
335+
for table_idx, table in enumerate(result.tables):
336+
print(f"Table # {table_idx} has {table.row_count} rows and {table.column_count} columns")
337+
if table.bounding_regions:
338+
for region in table.bounding_regions:
339+
print(f"Table # {table_idx} location on page: {region.page_number} is {region.polygon}")
340+
for cell in table.cells:
341+
print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
342+
if cell.bounding_regions:
343+
for region in cell.bounding_regions:
344+
print(
345+
f"...content on page {region.page_number} is within bounding polygon '{region.polygon}'\n"
346+
)
347+
print("----------------------------------------")
348+
```
349+
350+
<!-- END SNIPPET -->
351+
352+
- Read more about the features provided by the `prebuilt-document` model [here][service_prebuilt_document].
353+
269354
### Using Prebuilt Models
270355

271356
Extract fields from select document types such as receipts, invoices, business cards, identity documents, and U.S. W-2 tax documents using prebuilt models provided by the Document Intelligence service.
@@ -467,8 +552,7 @@ if result.documents:
467552
value_obj = obj[KEY_OF_VALUE_OBJECT]
468553
extract_value_by_col_name = lambda key: (
469554
value_obj[key].get(KEY_OF_CELL_CONTENT)
470-
if key in value_obj
471-
and KEY_OF_CELL_CONTENT in value_obj[key]
555+
if key in value_obj and KEY_OF_CELL_CONTENT in value_obj[key]
472556
else "None"
473557
)
474558
row_data = list(map(extract_value_by_col_name, col_names))
@@ -720,3 +804,4 @@ additional questions or comments.
720804
[addon_languages_sample]: https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/documentintelligence/azure-ai-documentintelligence/samples/sample_analyze_addon_languages.py
721805
[query_fields_sample]: https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/documentintelligence/azure-ai-documentintelligence/samples/sample_analyze_addon_query_fields.py
722806
[service-rename]: https://techcommunity.microsoft.com/t5/azure-ai-services-blog/azure-form-recognizer-is-now-azure-ai-document-intelligence-with/ba-p/3875765
807+
[service_prebuilt_document]: https://docs.microsoft.com/azure/ai-services/document-intelligence/concept-general-document#general-document-features

sdk/documentintelligence/azure-ai-documentintelligence/azure/ai/documentintelligence/_operations/_operations.py

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from io import IOBase
1010
import json
1111
import sys
12-
from typing import Any, Callable, Dict, IO, Iterable, List, Optional, TypeVar, Union, cast, overload
12+
from typing import Any, Callable, Dict, IO, Iterable, List, Optional, Type, TypeVar, Union, cast, overload
1313
import urllib.parse
1414

1515
from azure.core.exceptions import (
@@ -460,7 +460,7 @@ def _analyze_document_initial( # pylint: disable=inconsistent-return-statements
460460
output_content_format: Optional[Union[str, _models.ContentFormat]] = None,
461461
**kwargs: Any,
462462
) -> None:
463-
error_map = {
463+
error_map: MutableMapping[int, Type[HttpResponseError]] = {
464464
401: ClientAuthenticationError,
465465
404: ResourceNotFoundError,
466466
409: ResourceExistsError,
@@ -3686,7 +3686,7 @@ def _classify_document_initial( # pylint: disable=inconsistent-return-statement
36863686
split: Optional[Union[str, _models.SplitMode]] = None,
36873687
**kwargs: Any,
36883688
) -> None:
3689-
error_map = {
3689+
error_map: MutableMapping[int, Type[HttpResponseError]] = {
36903690
401: ClientAuthenticationError,
36913691
404: ResourceNotFoundError,
36923692
409: ResourceExistsError,
@@ -6831,7 +6831,7 @@ class DocumentIntelligenceAdministrationClientOperationsMixin( # pylint: disabl
68316831
def _build_document_model_initial( # pylint: disable=inconsistent-return-statements
68326832
self, build_request: Union[_models.BuildDocumentModelRequest, JSON, IO[bytes]], **kwargs: Any
68336833
) -> None:
6834-
error_map = {
6834+
error_map: MutableMapping[int, Type[HttpResponseError]] = {
68356835
401: ClientAuthenticationError,
68366836
404: ResourceNotFoundError,
68376837
409: ResourceExistsError,
@@ -7331,7 +7331,7 @@ def get_long_running_output(pipeline_response):
73317331
def _compose_model_initial( # pylint: disable=inconsistent-return-statements
73327332
self, compose_request: Union[_models.ComposeDocumentModelRequest, JSON, IO[bytes]], **kwargs: Any
73337333
) -> None:
7334-
error_map = {
7334+
error_map: MutableMapping[int, Type[HttpResponseError]] = {
73357335
401: ClientAuthenticationError,
73367336
404: ResourceNotFoundError,
73377337
409: ResourceExistsError,
@@ -7982,7 +7982,7 @@ def authorize_model_copy(
79827982
the document model should be copied to. Required.
79837983
}
79847984
"""
7985-
error_map = {
7985+
error_map: MutableMapping[int, Type[HttpResponseError]] = {
79867986
401: ClientAuthenticationError,
79877987
404: ResourceNotFoundError,
79887988
409: ResourceExistsError,
@@ -8042,7 +8042,7 @@ def authorize_model_copy(
80428042
def _copy_model_to_initial( # pylint: disable=inconsistent-return-statements
80438043
self, model_id: str, copy_to_request: Union[_models.CopyAuthorization, JSON, IO[bytes]], **kwargs: Any
80448044
) -> None:
8045-
error_map = {
8045+
error_map: MutableMapping[int, Type[HttpResponseError]] = {
80468046
401: ClientAuthenticationError,
80478047
404: ResourceNotFoundError,
80488048
409: ResourceExistsError,
@@ -8619,7 +8619,7 @@ def get_model(self, model_id: str, **kwargs: Any) -> _models.DocumentModelDetail
86198619
]
86208620
}
86218621
"""
8622-
error_map = {
8622+
error_map: MutableMapping[int, Type[HttpResponseError]] = {
86238623
401: ClientAuthenticationError,
86248624
404: ResourceNotFoundError,
86258625
409: ResourceExistsError,
@@ -8755,7 +8755,7 @@ def list_models(self, **kwargs: Any) -> Iterable["_models.DocumentModelDetails"]
87558755

87568756
cls: ClsType[List[_models.DocumentModelDetails]] = kwargs.pop("cls", None)
87578757

8758-
error_map = {
8758+
error_map: MutableMapping[int, Type[HttpResponseError]] = {
87598759
401: ClientAuthenticationError,
87608760
404: ResourceNotFoundError,
87618761
409: ResourceExistsError,
@@ -8837,7 +8837,7 @@ def delete_model(self, model_id: str, **kwargs: Any) -> None: # pylint: disable
88378837
:rtype: None
88388838
:raises ~azure.core.exceptions.HttpResponseError:
88398839
"""
8840-
error_map = {
8840+
error_map: MutableMapping[int, Type[HttpResponseError]] = {
88418841
401: ClientAuthenticationError,
88428842
404: ResourceNotFoundError,
88438843
409: ResourceExistsError,
@@ -8911,7 +8911,7 @@ def get_resource_info(self, **kwargs: Any) -> _models.ResourceDetails:
89118911
}
89128912
}
89138913
"""
8914-
error_map = {
8914+
error_map: MutableMapping[int, Type[HttpResponseError]] = {
89158915
401: ClientAuthenticationError,
89168916
404: ResourceNotFoundError,
89178917
409: ResourceExistsError,
@@ -8971,6 +8971,7 @@ def get_operation(self, operation_id: str, **kwargs: Any) -> _models.OperationDe
89718971

89728972
Example:
89738973
.. code-block:: python
8974+
89748975
# The response is polymorphic. The following are possible polymorphic responses based
89758976
off discriminator "kind":
89768977

@@ -9263,7 +9264,7 @@ def get_operation(self, operation_id: str, **kwargs: Any) -> _models.OperationDe
92639264
# response body for status code(s): 200
92649265
response == operation_details
92659266
"""
9266-
error_map = {
9267+
error_map: MutableMapping[int, Type[HttpResponseError]] = {
92679268
401: ClientAuthenticationError,
92689269
404: ResourceNotFoundError,
92699270
409: ResourceExistsError,
@@ -9327,6 +9328,7 @@ def list_operations(self, **kwargs: Any) -> Iterable["_models.OperationDetails"]
93279328

93289329
Example:
93299330
.. code-block:: python
9331+
93309332
# The response is polymorphic. The following are possible polymorphic responses based
93319333
off discriminator "kind":
93329334

@@ -9624,7 +9626,7 @@ def list_operations(self, **kwargs: Any) -> Iterable["_models.OperationDetails"]
96249626

96259627
cls: ClsType[List[_models.OperationDetails]] = kwargs.pop("cls", None)
96269628

9627-
error_map = {
9629+
error_map: MutableMapping[int, Type[HttpResponseError]] = {
96289630
401: ClientAuthenticationError,
96299631
404: ResourceNotFoundError,
96309632
409: ResourceExistsError,
@@ -9699,7 +9701,7 @@ def get_next(next_link=None):
96999701
def _build_classifier_initial( # pylint: disable=inconsistent-return-statements
97009702
self, build_request: Union[_models.BuildDocumentClassifierRequest, JSON, IO[bytes]], **kwargs: Any
97019703
) -> None:
9702-
error_map = {
9704+
error_map: MutableMapping[int, Type[HttpResponseError]] = {
97039705
401: ClientAuthenticationError,
97049706
404: ResourceNotFoundError,
97059707
409: ResourceExistsError,
@@ -10170,7 +10172,7 @@ def get_classifier(self, classifier_id: str, **kwargs: Any) -> _models.DocumentC
1017010172
]
1017110173
}
1017210174
"""
10173-
error_map = {
10175+
error_map: MutableMapping[int, Type[HttpResponseError]] = {
1017410176
401: ClientAuthenticationError,
1017510177
404: ResourceNotFoundError,
1017610178
409: ResourceExistsError,
@@ -10282,7 +10284,7 @@ def list_classifiers(self, **kwargs: Any) -> Iterable["_models.DocumentClassifie
1028210284

1028310285
cls: ClsType[List[_models.DocumentClassifierDetails]] = kwargs.pop("cls", None)
1028410286

10285-
error_map = {
10287+
error_map: MutableMapping[int, Type[HttpResponseError]] = {
1028610288
401: ClientAuthenticationError,
1028710289
404: ResourceNotFoundError,
1028810290
409: ResourceExistsError,
@@ -10366,7 +10368,7 @@ def delete_classifier( # pylint: disable=inconsistent-return-statements
1036610368
:rtype: None
1036710369
:raises ~azure.core.exceptions.HttpResponseError:
1036810370
"""
10369-
error_map = {
10371+
error_map: MutableMapping[int, Type[HttpResponseError]] = {
1037010372
401: ClientAuthenticationError,
1037110373
404: ResourceNotFoundError,
1037210374
409: ResourceExistsError,

0 commit comments

Comments
 (0)