@@ -119,31 +119,27 @@ If you're looking for a specific section in a document, you can use semantic chu
# Using SDK targeting 2023-10-31-preview
# pip install azure-ai-documentintelligence==1.0.0b1
-
- from azure.ai.documentintelligence import DocumentIntelligenceClient
- from azure.core.credentials import AzureKeyCredential
-
- endpoint = "https://<my-custom-subdomain>.cognitiveservices.azure.com/"
- credential = AzureKeyCredential("<api_key>")
-
- document_intelligence_client = DocumentIntelligenceClient(
-     endpoint, credential)
-
- from langchain.document_loaders.doc_intelligence import DocumentIntelligenceLoader
- from langchain.text_splitter import MarkdownHeaderTextSplitter
- # Initiate Azure AI Document Intelligence to load the document and split it into chunks
- loader = DocumentIntelligenceLoader(file_path=<your file path>, credential, endpoint)
- docs = loader.load()
- # text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
- headers_to_split_on = [
-     ("#", "Header 1"),
-     ("##", "Header 2"),
-     ("###", "Header 3"),
- ]
- text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
- splits = text_splitter.split_text(docs_string)
- splits
-
+ # pip install langchain langchain-community azure-ai-documentintelligence
+
+ from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
+
+ from langchain.text_splitter import MarkdownHeaderTextSplitter
+
+ # Initiate Azure AI Document Intelligence to load the document. You can either specify file_path or url_path to load the document.
+ loader = AzureAIDocumentIntelligenceLoader(file_path="<path to your file>", api_key=doc_intelligence_key, api_endpoint=doc_intelligence_endpoint, api_model="prebuilt-layout")
+ docs = loader.load()
+
+ # Split the document into chunks based on markdown headers.
+ headers_to_split_on = [
+     ("#", "Header 1"),
+     ("##", "Header 2"),
+     ("###", "Header 3"),
+ ]
+ text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
+
+ docs_string = docs[0].page_content
+ splits = text_splitter.split_text(docs_string)
+ splits
```
## Next steps
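The comment added in this commit notes that the loader accepts either `file_path` or `url_path`. As a minimal sketch of the `url_path` variant, assuming the same `doc_intelligence_key` and `doc_intelligence_endpoint` variables used in the new snippet and a placeholder document URL:

```python
# Sketch: loading a remote document via url_path instead of file_path.
# Assumes doc_intelligence_key and doc_intelligence_endpoint are already defined,
# and "<url to your file>" is a placeholder for a reachable document URL.
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter

loader = AzureAIDocumentIntelligenceLoader(
    url_path="<url to your file>",          # remote document instead of file_path
    api_key=doc_intelligence_key,
    api_endpoint=doc_intelligence_endpoint,
    api_model="prebuilt-layout",
)
docs = loader.load()

# The rest of the pipeline is unchanged: split the returned markdown on headers.
headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
splits = text_splitter.split_text(docs[0].page_content)
```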