Skip to content
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .azdo/pipelines/azure-dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ steps:
AZURE_SEARCH_QUERY_SPELLER: $(AZURE_SEARCH_QUERY_SPELLER)
AZURE_SEARCH_SEMANTIC_RANKER: $(AZURE_SEARCH_SEMANTIC_RANKER)
AZURE_SEARCH_QUERY_REWRITING: $(AZURE_SEARCH_QUERY_REWRITING)
AZURE_SEARCH_FIELD_NAME_EMBEDDING: $(AZURE_SEARCH_FIELD_NAME_EMBEDDING)
AZURE_SEARCH_FIELD_NAME_IMAGE_EMBEDDING: $(AZURE_SEARCH_FIELD_NAME_IMAGE_EMBEDDING)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Deal with the image embedding field in a future PR.

AZURE_STORAGE_ACCOUNT: $(AZURE_STORAGE_ACCOUNT)
AZURE_STORAGE_RESOURCE_GROUP: $(AZURE_STORAGE_RESOURCE_GROUP)
AZURE_STORAGE_SKU: $(AZURE_STORAGE_SKU)
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/azure-dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ jobs:
AZURE_SEARCH_QUERY_SPELLER: ${{ vars.AZURE_SEARCH_QUERY_SPELLER }}
AZURE_SEARCH_SEMANTIC_RANKER: ${{ vars.AZURE_SEARCH_SEMANTIC_RANKER }}
AZURE_SEARCH_QUERY_REWRITING: ${{ vars.AZURE_SEARCH_QUERY_REWRITING }}
AZURE_SEARCH_FIELD_NAME_EMBEDDING: ${{ vars.AZURE_SEARCH_FIELD_NAME_EMBEDDING }}
AZURE_SEARCH_FIELD_NAME_IMAGE_EMBEDDING: ${{ vars.AZURE_SEARCH_FIELD_NAME_IMAGE_EMBEDDING }}
AZURE_STORAGE_ACCOUNT: ${{ vars.AZURE_STORAGE_ACCOUNT }}
AZURE_STORAGE_RESOURCE_GROUP: ${{ vars.AZURE_STORAGE_RESOURCE_GROUP }}
AZURE_STORAGE_SKU: ${{ vars.AZURE_STORAGE_SKU }}
Expand Down
13 changes: 12 additions & 1 deletion app/backend/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,9 @@ async def setup_clients():
AZURE_SEARCH_QUERY_SPELLER = os.getenv("AZURE_SEARCH_QUERY_SPELLER") or "lexicon"
AZURE_SEARCH_SEMANTIC_RANKER = os.getenv("AZURE_SEARCH_SEMANTIC_RANKER", "free").lower()
AZURE_SEARCH_QUERY_REWRITING = os.getenv("AZURE_SEARCH_QUERY_REWRITING", "false").lower()
# These default to the previous field names ("embedding" and "imageEmbedding") for backwards compatibility
AZURE_SEARCH_FIELD_NAME_EMBEDDING = os.getenv("AZURE_SEARCH_FIELD_NAME_EMBEDDING", "embedding")
AZURE_SEARCH_FIELD_NAME_IMAGE_EMBEDDING = os.getenv("AZURE_SEARCH_FIELD_NAME_IMAGE_EMBEDDING", "imageEmbedding")

AZURE_SPEECH_SERVICE_ID = os.getenv("AZURE_SPEECH_SERVICE_ID")
AZURE_SPEECH_SERVICE_LOCATION = os.getenv("AZURE_SPEECH_SERVICE_LOCATION")
Expand Down Expand Up @@ -579,7 +582,11 @@ async def setup_clients():
disable_vectors=os.getenv("USE_VECTORS", "").lower() == "false",
)
ingester = UploadUserFileStrategy(
search_info=search_info, embeddings=text_embeddings_service, file_processors=file_processors
search_info=search_info,
embeddings=text_embeddings_service,
file_processors=file_processors,
search_field_name_embedding=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
search_field_name_image_embedding=AZURE_SEARCH_FIELD_NAME_IMAGE_EMBEDDING,
)
current_app.config[CONFIG_INGESTER] = ingester

Expand Down Expand Up @@ -676,6 +683,7 @@ async def setup_clients():
embedding_model=OPENAI_EMB_MODEL,
embedding_deployment=AZURE_OPENAI_EMB_DEPLOYMENT,
embedding_dimensions=OPENAI_EMB_DIMENSIONS,
embedding_field=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
sourcepage_field=KB_FIELDS_SOURCEPAGE,
content_field=KB_FIELDS_CONTENT,
query_language=AZURE_SEARCH_QUERY_LANGUAGE,
Expand All @@ -694,6 +702,7 @@ async def setup_clients():
embedding_model=OPENAI_EMB_MODEL,
embedding_deployment=AZURE_OPENAI_EMB_DEPLOYMENT,
embedding_dimensions=OPENAI_EMB_DIMENSIONS,
embedding_field=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
sourcepage_field=KB_FIELDS_SOURCEPAGE,
content_field=KB_FIELDS_CONTENT,
query_language=AZURE_SEARCH_QUERY_LANGUAGE,
Expand Down Expand Up @@ -733,6 +742,7 @@ async def setup_clients():
embedding_model=OPENAI_EMB_MODEL,
embedding_deployment=AZURE_OPENAI_EMB_DEPLOYMENT,
embedding_dimensions=OPENAI_EMB_DIMENSIONS,
embedding_field=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
sourcepage_field=KB_FIELDS_SOURCEPAGE,
content_field=KB_FIELDS_CONTENT,
query_language=AZURE_SEARCH_QUERY_LANGUAGE,
Expand All @@ -754,6 +764,7 @@ async def setup_clients():
embedding_model=OPENAI_EMB_MODEL,
embedding_deployment=AZURE_OPENAI_EMB_DEPLOYMENT,
embedding_dimensions=OPENAI_EMB_DIMENSIONS,
embedding_field=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
sourcepage_field=KB_FIELDS_SOURCEPAGE,
content_field=KB_FIELDS_CONTENT,
query_language=AZURE_SEARCH_QUERY_LANGUAGE,
Expand Down
11 changes: 8 additions & 3 deletions app/backend/approaches/approach.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,10 @@ class Document:
reranker_score: Optional[float] = None

def serialize_for_results(self) -> dict[str, Any]:
return {
result_dict = {
"id": self.id,
"content": self.content,
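# NOTE: this key is always "embedding", even when the index uses a different embedding field name. Should we rename it to the actual field name in the index?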
"embedding": Document.trim_embedding(self.embedding),
"imageEmbedding": Document.trim_embedding(self.image_embedding),
"category": self.category,
Expand All @@ -78,6 +79,7 @@ def serialize_for_results(self) -> dict[str, Any]:
"score": self.score,
"reranker_score": self.reranker_score,
}
return result_dict

@classmethod
def trim_embedding(cls, embedding: Optional[List[float]]) -> Optional[str]:
Expand Down Expand Up @@ -162,6 +164,7 @@ def __init__(
embedding_deployment: Optional[str], # Not needed for non-Azure OpenAI or for retrieval_mode="text"
embedding_model: str,
embedding_dimensions: int,
embedding_field: str,
openai_host: str,
vision_endpoint: str,
vision_token_provider: Callable[[], Awaitable[str]],
Expand All @@ -176,6 +179,7 @@ def __init__(
self.embedding_deployment = embedding_deployment
self.embedding_model = embedding_model
self.embedding_dimensions = embedding_dimensions
self.embedding_field = embedding_field
self.openai_host = openai_host
self.vision_endpoint = vision_endpoint
self.vision_token_provider = vision_token_provider
Expand Down Expand Up @@ -241,7 +245,7 @@ async def search(
Document(
id=document.get("id"),
content=document.get("content"),
embedding=document.get("embedding"),
embedding=document.get(self.embedding_field),
image_embedding=document.get("imageEmbedding"),
category=document.get("category"),
sourcepage=document.get("sourcepage"),
Expand Down Expand Up @@ -317,7 +321,8 @@ class ExtraArgs(TypedDict, total=False):
**dimensions_args,
)
query_vector = embedding.data[0].embedding
return VectorizedQuery(vector=query_vector, k_nearest_neighbors=50, fields="embedding")
# TODO: use the optimizations from RAG Time Journey 3 for this vector query
return VectorizedQuery(vector=query_vector, k_nearest_neighbors=50, fields=self.embedding_field)

async def compute_image_embedding(self, q: str):
endpoint = urljoin(self.vision_endpoint, "computervision/retrieval:vectorizeText")
Expand Down
2 changes: 2 additions & 0 deletions app/backend/approaches/chatreadretrieveread.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def __init__(
embedding_deployment: Optional[str], # Not needed for non-Azure OpenAI or for retrieval_mode="text"
embedding_model: str,
embedding_dimensions: int,
embedding_field: str,
sourcepage_field: str,
content_field: str,
query_language: str,
Expand All @@ -49,6 +50,7 @@ def __init__(
self.embedding_deployment = embedding_deployment
self.embedding_model = embedding_model
self.embedding_dimensions = embedding_dimensions
self.embedding_field = embedding_field
self.sourcepage_field = sourcepage_field
self.content_field = content_field
self.query_language = query_language
Expand Down
10 changes: 6 additions & 4 deletions app/backend/approaches/chatreadretrievereadvision.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def __init__(
embedding_deployment: Optional[str], # Not needed for non-Azure OpenAI or for retrieval_mode="text"
embedding_model: str,
embedding_dimensions: int,
embedding_field: str,
sourcepage_field: str,
content_field: str,
query_language: str,
Expand All @@ -57,6 +58,7 @@ def __init__(
self.embedding_deployment = embedding_deployment
self.embedding_model = embedding_model
self.embedding_dimensions = embedding_dimensions
self.embedding_field = embedding_field
self.sourcepage_field = sourcepage_field
self.content_field = content_field
self.query_language = query_language
Expand Down Expand Up @@ -88,7 +90,7 @@ async def run_until_final_call(
minimum_reranker_score = overrides.get("minimum_reranker_score", 0.0)
filter = self.build_filter(overrides, auth_claims)

vector_fields = overrides.get("vector_fields", ["embedding"])
vector_fields = overrides.get("vector_fields", [self.embedding_field])
send_text_to_gptvision = overrides.get("gpt4v_input") in ["textAndImages", "texts", None]
send_images_to_gptvision = overrides.get("gpt4v_input") in ["textAndImages", "images", None]

Expand Down Expand Up @@ -123,9 +125,9 @@ async def run_until_final_call(
if use_vector_search:
for field in vector_fields:
vector = (
await self.compute_text_embedding(query_text)
if field == "embedding"
else await self.compute_image_embedding(query_text)
await self.compute_image_embedding(query_text)
if field.startswith("image")
else await self.compute_text_embedding(query_text)
)
vectors.append(vector)

Expand Down
2 changes: 2 additions & 0 deletions app/backend/approaches/retrievethenread.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def __init__(
embedding_model: str,
embedding_deployment: Optional[str], # Not needed for non-Azure OpenAI or for retrieval_mode="text"
embedding_dimensions: int,
embedding_field: str,
sourcepage_field: str,
content_field: str,
query_language: str,
Expand All @@ -44,6 +45,7 @@ def __init__(
self.embedding_dimensions = embedding_dimensions
self.chatgpt_deployment = chatgpt_deployment
self.embedding_deployment = embedding_deployment
self.embedding_field = embedding_field
self.sourcepage_field = sourcepage_field
self.content_field = content_field
self.query_language = query_language
Expand Down
10 changes: 6 additions & 4 deletions app/backend/approaches/retrievethenreadvision.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def __init__(
embedding_deployment: Optional[str], # Not needed for non-Azure OpenAI or for retrieval_mode="text"
embedding_model: str,
embedding_dimensions: int,
embedding_field: str,
sourcepage_field: str,
content_field: str,
query_language: str,
Expand All @@ -47,6 +48,7 @@ def __init__(
self.embedding_model = embedding_model
self.embedding_deployment = embedding_deployment
self.embedding_dimensions = embedding_dimensions
self.embedding_field = embedding_field
self.sourcepage_field = sourcepage_field
self.content_field = content_field
self.gpt4v_deployment = gpt4v_deployment
Expand Down Expand Up @@ -83,7 +85,7 @@ async def run(
minimum_reranker_score = overrides.get("minimum_reranker_score", 0.0)
filter = self.build_filter(overrides, auth_claims)

vector_fields = overrides.get("vector_fields", ["embedding"])
vector_fields = overrides.get("vector_fields", [self.embedding_field])
send_text_to_gptvision = overrides.get("gpt4v_input") in ["textAndImages", "texts", None]
send_images_to_gptvision = overrides.get("gpt4v_input") in ["textAndImages", "images", None]

Expand All @@ -92,9 +94,9 @@ async def run(
if use_vector_search:
for field in vector_fields:
vector = (
await self.compute_text_embedding(q)
if field == "embedding"
else await self.compute_image_embedding(q)
await self.compute_image_embedding(q)
if field.startswith("image")
else await self.compute_text_embedding(q)
)
vectors.append(vector)

Expand Down
5 changes: 5 additions & 0 deletions app/backend/prepdocs.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,8 @@ async def main(strategy: Strategy, setup_index: bool = True):
blob_manager=blob_manager,
document_action=document_action,
embeddings=openai_embeddings_service,
search_field_name_embedding=os.environ["AZURE_SEARCH_FIELD_NAME_EMBEDDING"],
search_field_name_image_embedding=os.environ["AZURE_SEARCH_FIELD_NAME_IMAGE_EMBEDDING"],
subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"],
search_service_user_assigned_id=args.searchserviceassignedid,
search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"),
Expand Down Expand Up @@ -430,6 +432,9 @@ async def main(strategy: Strategy, setup_index: bool = True):
embeddings=openai_embeddings_service,
image_embeddings=image_embeddings_service,
search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"),
# Default to the previous field names for backward compatibility
search_field_name_embedding=os.getenv("AZURE_SEARCH_FIELD_NAME_EMBEDDING", "embedding"),
search_field_name_image_embedding=os.getenv("AZURE_SEARCH_FIELD_NAME_IMAGE_EMBEDDING", "imageEmbedding"),
use_acls=use_acls,
category=args.category,
use_content_understanding=use_content_understanding,
Expand Down
40 changes: 30 additions & 10 deletions app/backend/prepdocslib/filestrategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ def __init__(
embeddings: Optional[OpenAIEmbeddings] = None,
image_embeddings: Optional[ImageEmbeddings] = None,
search_analyzer_name: Optional[str] = None,
search_field_name_embedding: Optional[str] = None,
search_field_name_image_embedding: Optional[str] = None,
use_acls: bool = False,
category: Optional[str] = None,
use_content_understanding: bool = False,
Expand All @@ -63,22 +65,29 @@ def __init__(
self.embeddings = embeddings
self.image_embeddings = image_embeddings
self.search_analyzer_name = search_analyzer_name
self.search_field_name_embedding = search_field_name_embedding
self.search_field_name_image_embedding = search_field_name_image_embedding
self.search_info = search_info
self.use_acls = use_acls
self.category = category
self.use_content_understanding = use_content_understanding
self.content_understanding_endpoint = content_understanding_endpoint

async def setup(self):
search_manager = SearchManager(
def setup_search_manager(self):
self.search_manager = SearchManager(
self.search_info,
self.search_analyzer_name,
self.use_acls,
False,
self.embeddings,
field_name_embedding=self.search_field_name_embedding,
field_name_image_embedding=self.search_field_name_image_embedding,
search_images=self.image_embeddings is not None,
)
await search_manager.create_index()

async def setup(self):
self.setup_search_manager()
await self.search_manager.create_index()

if self.use_content_understanding:
if self.content_understanding_endpoint is None:
Expand All @@ -91,9 +100,7 @@ async def setup(self):
await cu_manager.create_analyzer()

async def run(self):
search_manager = SearchManager(
self.search_info, self.search_analyzer_name, self.use_acls, False, self.embeddings
)
self.setup_search_manager()
if self.document_action == DocumentAction.Add:
files = self.list_file_strategy.list()
async for file in files:
Expand All @@ -104,18 +111,18 @@ async def run(self):
blob_image_embeddings: Optional[List[List[float]]] = None
if self.image_embeddings and blob_sas_uris:
blob_image_embeddings = await self.image_embeddings.create_embeddings(blob_sas_uris)
await search_manager.update_content(sections, blob_image_embeddings, url=file.url)
await self.search_manager.update_content(sections, blob_image_embeddings, url=file.url)
finally:
if file:
file.close()
elif self.document_action == DocumentAction.Remove:
paths = self.list_file_strategy.list_paths()
async for path in paths:
await self.blob_manager.remove_blob(path)
await search_manager.remove_content(path)
await self.search_manager.remove_content(path)
elif self.document_action == DocumentAction.RemoveAll:
await self.blob_manager.remove_blob()
await search_manager.remove_content()
await self.search_manager.remove_content()


class UploadUserFileStrategy:
Expand All @@ -129,12 +136,25 @@ def __init__(
file_processors: dict[str, FileProcessor],
embeddings: Optional[OpenAIEmbeddings] = None,
image_embeddings: Optional[ImageEmbeddings] = None,
search_field_name_embedding: Optional[str] = None,
search_field_name_image_embedding: Optional[str] = None,
):
self.file_processors = file_processors
self.embeddings = embeddings
self.image_embeddings = image_embeddings
self.search_info = search_info
self.search_manager = SearchManager(self.search_info, None, True, False, self.embeddings)
self.search_manager = SearchManager(
search_info=self.search_info,
search_analyzer_name=None,
use_acls=True,
use_int_vectorization=False,
embeddings=self.embeddings,
field_name_embedding=search_field_name_embedding,
field_name_image_embedding=search_field_name_image_embedding,
search_images=False,
)
self.search_field_name_embedding = search_field_name_embedding
self.search_field_name_image_embedding = search_field_name_image_embedding

async def add_file(self, file: File):
if self.image_embeddings:
Expand Down
Loading
Loading