Skip to content

Commit 5e35bf2

Browse files
committed
update sections
1 parent 293611f commit 5e35bf2

File tree

1 file changed

+114
-112
lines changed

1 file changed

+114
-112
lines changed

articles/ai-services/content-understanding/tutorial/RAG-tutorial.md

Lines changed: 114 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,120 @@ Analyzers are reusable components in Content Understanding that streamline the d
6565

6666
The following code samples demonstrate how to create analyzers for each modality, specifying the structured data to be extracted, such as key fields, summaries, or classifications. These analyzers will serve as the foundation for extracting and enriching content in your RAG solution.
6767

68-
**Starting off with the schema details for each modality:**
68+
#### Load all environment variables and necessary libraries from Langchain
69+
70+
``` python
71+
72+
import os
73+
from dotenv import load_dotenv
74+
load_dotenv()
75+
76+
# Load and validate Azure AI Services configs
77+
AZURE_AI_SERVICE_ENDPOINT = os.getenv("AZURE_AI_SERVICE_ENDPOINT")
78+
AZURE_AI_SERVICE_API_VERSION = os.getenv("AZURE_AI_SERVICE_API_VERSION") or "2024-12-01-preview"
79+
AZURE_DOCUMENT_INTELLIGENCE_API_VERSION = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_API_VERSION") or "2024-11-30"
80+
81+
# Load and validate Azure OpenAI configs
82+
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
83+
AZURE_OPENAI_CHAT_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME")
84+
AZURE_OPENAI_CHAT_API_VERSION = os.getenv("AZURE_OPENAI_CHAT_API_VERSION") or "2024-08-01-preview"
85+
AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME")
86+
AZURE_OPENAI_EMBEDDING_API_VERSION = os.getenv("AZURE_OPENAI_EMBEDDING_API_VERSION") or "2023-05-15"
87+
88+
# Load and validate Azure Search Services configs
89+
AZURE_SEARCH_ENDPOINT = os.getenv("AZURE_SEARCH_ENDPOINT")
90+
AZURE_SEARCH_INDEX_NAME = os.getenv("AZURE_SEARCH_INDEX_NAME") or "sample-doc-index"
91+
92+
# Import libraries from Langchain
93+
from langchain import hub
94+
from langchain_openai import AzureChatOpenAI
95+
from langchain_openai import AzureOpenAIEmbeddings
96+
from langchain.schema import StrOutputParser
97+
from langchain.schema.runnable import RunnablePassthrough
98+
from langchain.text_splitter import MarkdownHeaderTextSplitter
99+
from langchain.vectorstores.azuresearch import AzureSearch
100+
from langchain_core.prompts import ChatPromptTemplate
101+
from langchain.schema import Document
102+
import requests
103+
import json
104+
import sys
105+
import uuid
106+
from pathlib import Path
107+
from dotenv import find_dotenv, load_dotenv
108+
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
109+
110+
# Add the parent directory to the path to use shared modules
111+
parent_dir = Path(Path.cwd()).parent
112+
sys.path.append(str(parent_dir))
113+
114+
```
115+
---
116+
117+
#### Create analyzers
118+
119+
``` python
120+
from pathlib import Path
121+
from python.content_understanding_client import AzureContentUnderstandingClient
122+
credential = DefaultAzureCredential()
123+
token_provider = get_bearer_token_provider(credential, "https://cognitiveservices.azure.com/.default")
124+
125+
#set analyzer configs
126+
analyzer_configs = [
127+
{
128+
"id": "doc-analyzer" + str(uuid.uuid4()),
129+
"template_path": "../analyzer_templates/content_document.json",
130+
"location": Path("../data/sample_layout.pdf"),
131+
},
132+
{
133+
"id": "image-analyzer" + str(uuid.uuid4()),
134+
"template_path": "../analyzer_templates/image_chart_diagram_understanding.json",
135+
"location": Path("../data/sample_report.pdf"),
136+
},
137+
{
138+
"id": "audio-analyzer" + str(uuid.uuid4()),
139+
"template_path": "../analyzer_templates/call_recording_analytics.json",
140+
"location": Path("../data/callCenterRecording.mp3"),
141+
},
142+
{
143+
"id": "video-analyzer" + str(uuid.uuid4()),
144+
"template_path": "../analyzer_templates/video_content_understanding.json",
145+
"location": Path("../data/FlightSimulator.mp4"),
146+
},
147+
]
148+
149+
# Create Content Understanding client
150+
content_understanding_client = AzureContentUnderstandingClient(
151+
endpoint=AZURE_AI_SERVICE_ENDPOINT,
152+
api_version=AZURE_AI_SERVICE_API_VERSION,
153+
token_provider=token_provider,
154+
x_ms_useragent="azure-ai-content-understanding-python/content_extraction", # This header is used for sample usage telemetry, please comment out this line if you want to opt out.
155+
)
156+
157+
# Iterate through each config and create an analyzer
158+
for analyzer in analyzer_configs:
159+
analyzer_id = analyzer["id"]
160+
template_path = analyzer["template_path"]
161+
162+
try:
163+
164+
# Create the analyzer using the content understanding client
165+
response = content_understanding_client.begin_create_analyzer(
166+
analyzer_id=analyzer_id,
167+
analyzer_template_path=template_path
168+
)
169+
result = content_understanding_client.poll_result(response)
170+
print(f"Successfully created analyzer: {analyzer_id}")
171+
172+
except Exception as e:
173+
print(f"Failed to create analyzer: {analyzer_id}")
174+
print(f"Error: {e}")
175+
176+
```
177+
---
178+
179+
**Note:** Field extraction schemas are optional and not required for performing content extraction. To run content extraction and create analyzers without defining field schemas, provide only the analyzer ID and the file to be analyzed.
180+
181+
**Here is a sample schema definition:**
69182

70183
# [Document](#tab/document)
71184

@@ -198,117 +311,6 @@ To create a custom analyzer, you need to define a field schema that describes th
198311

199312
---
200313

201-
#### Load all environment variables and necessary libraries from Langchain
202-
203-
``` python
204-
205-
import os
206-
from dotenv import load_dotenv
207-
load_dotenv()
208-
209-
# Load and validate Azure AI Services configs
210-
AZURE_AI_SERVICE_ENDPOINT = os.getenv("AZURE_AI_SERVICE_ENDPOINT")
211-
AZURE_AI_SERVICE_API_VERSION = os.getenv("AZURE_AI_SERVICE_API_VERSION") or "2024-12-01-preview"
212-
AZURE_DOCUMENT_INTELLIGENCE_API_VERSION = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_API_VERSION") or "2024-11-30"
213-
214-
# Load and validate Azure OpenAI configs
215-
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
216-
AZURE_OPENAI_CHAT_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME")
217-
AZURE_OPENAI_CHAT_API_VERSION = os.getenv("AZURE_OPENAI_CHAT_API_VERSION") or "2024-08-01-preview"
218-
AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME")
219-
AZURE_OPENAI_EMBEDDING_API_VERSION = os.getenv("AZURE_OPENAI_EMBEDDING_API_VERSION") or "2023-05-15"
220-
221-
# Load and validate Azure Search Services configs
222-
AZURE_SEARCH_ENDPOINT = os.getenv("AZURE_SEARCH_ENDPOINT")
223-
AZURE_SEARCH_INDEX_NAME = os.getenv("AZURE_SEARCH_INDEX_NAME") or "sample-doc-index"
224-
225-
# Import libraries from Langchain
226-
from langchain import hub
227-
from langchain_openai import AzureChatOpenAI
228-
from langchain_openai import AzureOpenAIEmbeddings
229-
from langchain.schema import StrOutputParser
230-
from langchain.schema.runnable import RunnablePassthrough
231-
from langchain.text_splitter import MarkdownHeaderTextSplitter
232-
from langchain.vectorstores.azuresearch import AzureSearch
233-
from langchain_core.prompts import ChatPromptTemplate
234-
from langchain.schema import Document
235-
import requests
236-
import json
237-
import sys
238-
import uuid
239-
from pathlib import Path
240-
from dotenv import find_dotenv, load_dotenv
241-
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
242-
243-
# Add the parent directory to the path to use shared modules
244-
parent_dir = Path(Path.cwd()).parent
245-
sys.path.append(str(parent_dir))
246-
247-
```
248-
---
249-
250-
#### Create analyzers
251-
252-
``` python
253-
from pathlib import Path
254-
from python.content_understanding_client import AzureContentUnderstandingClient
255-
credential = DefaultAzureCredential()
256-
token_provider = get_bearer_token_provider(credential, "https://cognitiveservices.azure.com/.default")
257-
258-
#set analyzer configs
259-
analyzer_configs = [
260-
{
261-
"id": "doc-analyzer" + str(uuid.uuid4()),
262-
"template_path": "../analyzer_templates/content_document.json",
263-
"location": Path("../data/sample_layout.pdf"),
264-
},
265-
{
266-
"id": "image-analyzer" + str(uuid.uuid4()),
267-
"template_path": "../analyzer_templates/image_chart_diagram_understanding.json",
268-
"location": Path("../data/sample_report.pdf"),
269-
},
270-
{
271-
"id": "audio-analyzer" + str(uuid.uuid4()),
272-
"template_path": "../analyzer_templates/call_recording_analytics.json",
273-
"location": Path("../data/callCenterRecording.mp3"),
274-
},
275-
{
276-
"id": "video-analyzer" + str(uuid.uuid4()),
277-
"template_path": "../analyzer_templates/video_content_understanding.json",
278-
"location": Path("../data/FlightSimulator.mp4"),
279-
},
280-
]
281-
282-
# Create Content Understanding client
283-
content_understanding_client = AzureContentUnderstandingClient(
284-
endpoint=AZURE_AI_SERVICE_ENDPOINT,
285-
api_version=AZURE_AI_SERVICE_API_VERSION,
286-
token_provider=token_provider,
287-
x_ms_useragent="azure-ai-content-understanding-python/content_extraction", # This header is used for sample usage telemetry, please comment out this line if you want to opt out.
288-
)
289-
290-
# Iterate through each config and create an analyzer
291-
for analyzer in analyzer_configs:
292-
analyzer_id = analyzer["id"]
293-
template_path = analyzer["template_path"]
294-
295-
try:
296-
297-
# Create the analyzer using the content understanding client
298-
response = content_understanding_client.begin_create_analyzer(
299-
analyzer_id=analyzer_id,
300-
analyzer_template_path=template_path
301-
)
302-
result = content_understanding_client.poll_result(response)
303-
print(f"Successfully created analyzer: {analyzer_id}")
304-
305-
except Exception as e:
306-
print(f"Failed to create analyzer: {analyzer_id}")
307-
print(f"Error: {e}")
308-
309-
```
310-
---
311-
312314
## Perform Content and Field Analysis
313315
**Content extraction** is the first step in the RAG implementation process. It transforms raw multimodal data—such as documents, images, audio, and video—into structured, searchable formats. This foundational step ensures that the content is organized and ready for indexing and retrieval. Content extraction provides the baseline for indexing and retrieval but may not fully address domain-specific needs or provide deeper contextual insights.
314316
Learn more about the content extraction capabilities for each modality in the Azure AI Content Understanding documentation.

0 commit comments

Comments
 (0)