Skip to content

Commit d0a300c

Browse files
committed
Test vision feature, refactor vector_fields to make sense, address feedback
1 parent 6e76618 commit d0a300c

File tree

94 files changed

+109
-278
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

94 files changed

+109
-278
lines changed

.azdo/pipelines/azure-dev.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,6 @@ steps:
6161
AZURE_SEARCH_SEMANTIC_RANKER: $(AZURE_SEARCH_SEMANTIC_RANKER)
6262
AZURE_SEARCH_QUERY_REWRITING: $(AZURE_SEARCH_QUERY_REWRITING)
6363
AZURE_SEARCH_FIELD_NAME_EMBEDDING: $(AZURE_SEARCH_FIELD_NAME_EMBEDDING)
64-
AZURE_SEARCH_FIELD_NAME_IMAGE_EMBEDDING: $(AZURE_SEARCH_FIELD_NAME_IMAGE_EMBEDDING)
6564
AZURE_STORAGE_ACCOUNT: $(AZURE_STORAGE_ACCOUNT)
6665
AZURE_STORAGE_RESOURCE_GROUP: $(AZURE_STORAGE_RESOURCE_GROUP)
6766
AZURE_STORAGE_SKU: $(AZURE_STORAGE_SKU)

.github/workflows/azure-dev.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,6 @@ jobs:
5151
AZURE_SEARCH_SEMANTIC_RANKER: ${{ vars.AZURE_SEARCH_SEMANTIC_RANKER }}
5252
AZURE_SEARCH_QUERY_REWRITING: ${{ vars.AZURE_SEARCH_QUERY_REWRITING }}
5353
AZURE_SEARCH_FIELD_NAME_EMBEDDING: ${{ vars.AZURE_SEARCH_FIELD_NAME_EMBEDDING }}
54-
AZURE_SEARCH_FIELD_NAME_IMAGE_EMBEDDING: ${{ vars.AZURE_SEARCH_FIELD_NAME_IMAGE_EMBEDDING }}
5554
AZURE_STORAGE_ACCOUNT: ${{ vars.AZURE_STORAGE_ACCOUNT }}
5655
AZURE_STORAGE_RESOURCE_GROUP: ${{ vars.AZURE_STORAGE_RESOURCE_GROUP }}
5756
AZURE_STORAGE_SKU: ${{ vars.AZURE_STORAGE_SKU }}

app/backend/app.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -465,7 +465,6 @@ async def setup_clients():
465465
AZURE_SEARCH_QUERY_REWRITING = os.getenv("AZURE_SEARCH_QUERY_REWRITING", "false").lower()
466466
# This defaults to the previous field name "embedding", for backwards compatibility
467467
AZURE_SEARCH_FIELD_NAME_EMBEDDING = os.getenv("AZURE_SEARCH_FIELD_NAME_EMBEDDING", "embedding")
468-
AZURE_SEARCH_FIELD_NAME_IMAGE_EMBEDDING = os.getenv("AZURE_SEARCH_FIELD_NAME_IMAGE_EMBEDDING", "imageEmbedding")
469468

470469
AZURE_SPEECH_SERVICE_ID = os.getenv("AZURE_SPEECH_SERVICE_ID")
471470
AZURE_SPEECH_SERVICE_LOCATION = os.getenv("AZURE_SPEECH_SERVICE_LOCATION")
@@ -586,7 +585,6 @@ async def setup_clients():
586585
embeddings=text_embeddings_service,
587586
file_processors=file_processors,
588587
search_field_name_embedding=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
589-
search_field_name_image_embedding=AZURE_SEARCH_FIELD_NAME_IMAGE_EMBEDDING,
590588
)
591589
current_app.config[CONFIG_INGESTER] = ingester
592590

app/backend/approaches/approach.py

Lines changed: 3 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,6 @@
4141
class Document:
4242
id: Optional[str]
4343
content: Optional[str]
44-
embedding: Optional[List[float]]
45-
image_embedding: Optional[List[float]]
4644
category: Optional[str]
4745
sourcepage: Optional[str]
4846
sourcefile: Optional[str]
@@ -56,9 +54,6 @@ def serialize_for_results(self) -> dict[str, Any]:
5654
result_dict = {
5755
"id": self.id,
5856
"content": self.content,
59-
# Should we rename to its actual field name in the index?
60-
"embedding": Document.trim_embedding(self.embedding),
61-
"imageEmbedding": Document.trim_embedding(self.image_embedding),
6257
"category": self.category,
6358
"sourcepage": self.sourcepage,
6459
"sourcefile": self.sourcefile,
@@ -81,18 +76,6 @@ def serialize_for_results(self) -> dict[str, Any]:
8176
}
8277
return result_dict
8378

84-
@classmethod
85-
def trim_embedding(cls, embedding: Optional[List[float]]) -> Optional[str]:
86-
"""Returns a trimmed list of floats from the vector embedding."""
87-
if embedding:
88-
if len(embedding) > 2:
89-
# Format the embedding list to show the first 2 items followed by the count of the remaining items."""
90-
return f"[{embedding[0]}, {embedding[1]} ...+{len(embedding) - 2} more]"
91-
else:
92-
return str(embedding)
93-
94-
return None
95-
9679

9780
@dataclass
9881
class ThoughtStep:
@@ -245,8 +228,6 @@ async def search(
245228
Document(
246229
id=document.get("id"),
247230
content=document.get("content"),
248-
embedding=document.get(self.embedding_field),
249-
image_embedding=document.get("imageEmbedding"),
250231
category=document.get("category"),
251232
sourcepage=document.get("sourcepage"),
252233
sourcefile=document.get("sourcefile"),
@@ -321,13 +302,14 @@ class ExtraArgs(TypedDict, total=False):
321302
**dimensions_args,
322303
)
323304
query_vector = embedding.data[0].embedding
324-
# TODO: use optimizations from rag time journey 3
305+
# This performs an oversampling due to how the search index was set up,
306+
# so we do not need to explicitly pass in an oversampling parameter here
325307
return VectorizedQuery(vector=query_vector, k_nearest_neighbors=50, fields=self.embedding_field)
326308

327309
async def compute_image_embedding(self, q: str):
328310
endpoint = urljoin(self.vision_endpoint, "computervision/retrieval:vectorizeText")
329311
headers = {"Content-Type": "application/json"}
330-
params = {"api-version": "2023-02-01-preview", "modelVersion": "latest"}
312+
params = {"api-version": "2024-02-01", "model-version": "2023-04-15"}
331313
data = {"text": q}
332314

333315
headers["Authorization"] = "Bearer " + await self.vision_token_provider()

app/backend/approaches/chatreadretrievereadvision.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ async def run_until_final_call(
9090
minimum_reranker_score = overrides.get("minimum_reranker_score", 0.0)
9191
filter = self.build_filter(overrides, auth_claims)
9292

93-
vector_fields = overrides.get("vector_fields", [self.embedding_field])
93+
vector_fields = overrides.get("vector_fields", "textAndImageEmbeddings")
9494
send_text_to_gptvision = overrides.get("gpt4v_input") in ["textAndImages", "texts", None]
9595
send_images_to_gptvision = overrides.get("gpt4v_input") in ["textAndImages", "images", None]
9696

@@ -123,13 +123,10 @@ async def run_until_final_call(
123123
# If retrieval mode includes vectors, compute an embedding for the query
124124
vectors = []
125125
if use_vector_search:
126-
for field in vector_fields:
127-
vector = (
128-
await self.compute_image_embedding(query_text)
129-
if field.startswith("image")
130-
else await self.compute_text_embedding(query_text)
131-
)
132-
vectors.append(vector)
126+
if vector_fields == "textEmbeddingOnly" or vector_fields == "textAndImageEmbeddings":
127+
vectors.append(await self.compute_text_embedding(query_text))
128+
if vector_fields == "imageEmbeddingOnly" or vector_fields == "textAndImageEmbeddings":
129+
vectors.append(await self.compute_image_embedding(query_text))
133130

134131
results = await self.search(
135132
top,

app/backend/approaches/retrievethenreadvision.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -85,20 +85,17 @@ async def run(
8585
minimum_reranker_score = overrides.get("minimum_reranker_score", 0.0)
8686
filter = self.build_filter(overrides, auth_claims)
8787

88-
vector_fields = overrides.get("vector_fields", [self.embedding_field])
88+
vector_fields = overrides.get("vector_fields", "textAndImageEmbeddings")
8989
send_text_to_gptvision = overrides.get("gpt4v_input") in ["textAndImages", "texts", None]
9090
send_images_to_gptvision = overrides.get("gpt4v_input") in ["textAndImages", "images", None]
9191

9292
# If retrieval mode includes vectors, compute an embedding for the query
9393
vectors = []
9494
if use_vector_search:
95-
for field in vector_fields:
96-
vector = (
97-
await self.compute_image_embedding(q)
98-
if field.startswith("image")
99-
else await self.compute_text_embedding(q)
100-
)
101-
vectors.append(vector)
95+
if vector_fields == "textEmbeddingOnly" or vector_fields == "textAndImageEmbeddings":
96+
vectors.append(await self.compute_text_embedding(q))
97+
if vector_fields == "imageEmbeddingOnly" or vector_fields == "textAndImageEmbeddings":
98+
vectors.append(await self.compute_image_embedding(q))
10299

103100
results = await self.search(
104101
top,

app/backend/prepdocs.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -399,7 +399,6 @@ async def main(strategy: Strategy, setup_index: bool = True):
399399
document_action=document_action,
400400
embeddings=openai_embeddings_service,
401401
search_field_name_embedding=os.environ["AZURE_SEARCH_FIELD_NAME_EMBEDDING"],
402-
search_field_name_image_embedding=os.environ["AZURE_SEARCH_FIELD_NAME_IMAGE_EMBEDDING"],
403402
subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"],
404403
search_service_user_assigned_id=args.searchserviceassignedid,
405404
search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"),
@@ -434,7 +433,6 @@ async def main(strategy: Strategy, setup_index: bool = True):
434433
search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"),
435434
# Default to the previous field names for backward compatibility
436435
search_field_name_embedding=os.getenv("AZURE_SEARCH_FIELD_NAME_EMBEDDING", "embedding"),
437-
search_field_name_image_embedding=os.getenv("AZURE_SEARCH_FIELD_NAME_IMAGE_EMBEDDING", "imageEmbedding"),
438436
use_acls=use_acls,
439437
category=args.category,
440438
use_content_understanding=use_content_understanding,

app/backend/prepdocslib/embeddings.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,7 @@ def __init__(self, endpoint: str, token_provider: Callable[[], Awaitable[str]]):
238238
async def create_embeddings(self, blob_urls: List[str]) -> List[List[float]]:
239239
endpoint = urljoin(self.endpoint, "computervision/retrieval:vectorizeImage")
240240
headers = {"Content-Type": "application/json"}
241-
params = {"api-version": "2023-02-01-preview", "modelVersion": "latest"}
241+
params = {"api-version": "2024-02-01", "model-version": "2023-04-15"}
242242
headers["Authorization"] = "Bearer " + await self.token_provider()
243243

244244
embeddings: List[List[float]] = []

app/backend/prepdocslib/filestrategy.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,6 @@ def __init__(
5252
image_embeddings: Optional[ImageEmbeddings] = None,
5353
search_analyzer_name: Optional[str] = None,
5454
search_field_name_embedding: Optional[str] = None,
55-
search_field_name_image_embedding: Optional[str] = None,
5655
use_acls: bool = False,
5756
category: Optional[str] = None,
5857
use_content_understanding: bool = False,
@@ -66,7 +65,6 @@ def __init__(
6665
self.image_embeddings = image_embeddings
6766
self.search_analyzer_name = search_analyzer_name
6867
self.search_field_name_embedding = search_field_name_embedding
69-
self.search_field_name_image_embedding = search_field_name_image_embedding
7068
self.search_info = search_info
7169
self.use_acls = use_acls
7270
self.category = category
@@ -81,7 +79,6 @@ def setup_search_manager(self):
8179
False,
8280
self.embeddings,
8381
field_name_embedding=self.search_field_name_embedding,
84-
field_name_image_embedding=self.search_field_name_image_embedding,
8582
search_images=self.image_embeddings is not None,
8683
)
8784

@@ -137,7 +134,6 @@ def __init__(
137134
embeddings: Optional[OpenAIEmbeddings] = None,
138135
image_embeddings: Optional[ImageEmbeddings] = None,
139136
search_field_name_embedding: Optional[str] = None,
140-
search_field_name_image_embedding: Optional[str] = None,
141137
):
142138
self.file_processors = file_processors
143139
self.embeddings = embeddings
@@ -150,11 +146,9 @@ def __init__(
150146
use_int_vectorization=False,
151147
embeddings=self.embeddings,
152148
field_name_embedding=search_field_name_embedding,
153-
field_name_image_embedding=search_field_name_image_embedding,
154149
search_images=False,
155150
)
156151
self.search_field_name_embedding = search_field_name_embedding
157-
self.search_field_name_image_embedding = search_field_name_image_embedding
158152

159153
async def add_file(self, file: File):
160154
if self.image_embeddings:

app/backend/prepdocslib/integratedvectorizerstrategy.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@ def __init__(
4141
search_info: SearchInfo,
4242
embeddings: AzureOpenAIEmbeddingService,
4343
search_field_name_embedding: str,
44-
search_field_name_image_embedding: str,
4544
subscription_id: str,
4645
search_service_user_assigned_id: str,
4746
document_action: DocumentAction = DocumentAction.Add,
@@ -55,7 +54,6 @@ def __init__(
5554
self.document_action = document_action
5655
self.embeddings = embeddings
5756
self.search_field_name_embedding = search_field_name_embedding
58-
self.search_field_name_image_embedding = search_field_name_image_embedding
5957
self.subscription_id = subscription_id
6058
self.search_user_assigned_identity = search_service_user_assigned_id
6159
self.search_analyzer_name = search_analyzer_name
@@ -139,7 +137,6 @@ async def setup(self):
139137
use_int_vectorization=True,
140138
embeddings=self.embeddings,
141139
field_name_embedding=self.search_field_name_embedding,
142-
field_name_image_embedding=self.search_field_name_image_embedding,
143140
search_images=False,
144141
)
145142

0 commit comments

Comments
 (0)