@@ -63,7 +63,7 @@ The Document Intelligence Layout model **2023-10-31-preview** supports the follo
63
63
64
64
* [REST API](/rest/api/aiservices/document-models/analyze-document?view=rest-aiservices-2023-10-31-preview&branch=main&tabs=HTTP&preserve-view=true)
65
65
66
- * [ .NET, Java, JavaScript, and Python programming language SDKs.] ( sdk-overview-v4-0.md#supported-programming-languages )
66
+ * [.NET &bull; Java &bull; JavaScript &bull; Python programming language SDKs.](sdk-overview-v4-0.md#supported-programming-languages)
67
67
68
68
**Ready to begin?**
69
69
@@ -119,31 +119,36 @@ If you're looking for a specific section in a document, you can use semantic chu
119
119
120
120
# Using SDK targeting 2023-10-31-preview
121
121
# pip install azure-ai-documentintelligence==1.0.0b1
122
+ # pip install langchain langchain-community azure-ai-documentintelligence
122
123
123
- from azure.ai.documentintelligence import DocumentIntelligenceClient
124
+ from azure.ai.documentintelligence import DocumentIntelligenceClient
124
125
from azure.core.credentials import AzureKeyCredential
125
126
126
127
endpoint = " https://<my-custom-subdomain>.cognitiveservices.azure.com/"
127
128
credential = AzureKeyCredential(" <api_key>" )
128
129
129
130
document_intelligence_client = DocumentIntelligenceClient(
130
131
endpoint, credential)
131
-
132
- from langchain.document_loaders.doc_intelligence import DocumentIntelligenceLoader
133
- from langchain.text_splitter import MarkdownHeaderTextSplitter
134
- # Initiate Azure AI Document Intelligence to load the document and split it into chunks
135
- loader = DocumentIntelligenceLoader(file_path = < your file path> , credential, endpoint)
136
- docs = loader.load()
137
- # text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
138
- headers_to_split_on = [
139
- (" #" , " Header 1" ),
140
- (" ##" , " Header 2" ),
141
- (" ###" , " Header 3" ),
142
- ]
143
- text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on = headers_to_split_on)
144
- splits = text_splitter.split_text(docs_string)
145
- splits
146
-
132
+
133
+ from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
134
+
135
+ from langchain.text_splitter import MarkdownHeaderTextSplitter
136
+
137
+ # Initialize the Azure AI Document Intelligence loader. Specify either file_path or url_path to load the document.
138
+ loader = AzureAIDocumentIntelligenceLoader(file_path = " <path to your file>" , api_key = doc_intelligence_key, api_endpoint = doc_intelligence_endpoint, api_model = " prebuilt-layout" )
139
+ docs = loader.load()
140
+
141
+ # Split the document into chunks based on Markdown headers.
142
+ headers_to_split_on = [
143
+ (" #" , " Header 1" ),
144
+ (" ##" , " Header 2" ),
145
+ (" ###" , " Header 3" ),
146
+ ]
147
+ text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on = headers_to_split_on)
148
+
149
+ docs_string = docs[0 ].page_content
150
+ splits = text_splitter.split_text(docs_string)
151
+ splits
147
152
```
148
153
149
154
## Next steps
0 commit comments