|
5 | 5 |
|
6 | 6 | import asyncio |
7 | 7 | import requests |
| 8 | +import hashlib |
8 | 9 |
|
9 | 10 | from huggingface_hub import snapshot_download |
10 | 11 | from langchain.retrievers import ContextualCompressionRetriever, EnsembleRetriever |
@@ -175,46 +176,41 @@ def merge_get_results(get_results: list[dict]) -> dict: |
175 | 176 |
|
def merge_and_sort_query_results(
    query_results: list[dict], k: int, reverse: bool = False
) -> dict:
    """Merge several query-result dicts, de-duplicate documents, sort by
    distance, and keep only the top ``k`` entries.

    Args:
        query_results: Result dicts whose "distances", "documents" and
            "metadatas" values are single-element lists wrapping the actual
            per-hit lists (presumably the Chroma result shape — confirm
            against callers).
        k: Maximum number of entries to keep after sorting.
        reverse: Sort descending when True, ascending otherwise.

    Returns:
        A dict in the same single-element-list shape, holding at most ``k``
        unique documents with their distances and metadatas.
    """
    combined = []
    seen_hashes = set()  # hashes of documents already kept, for de-duplication

    for data in query_results:
        distances = data["distances"][0]
        documents = data["documents"][0]
        metadatas = data["metadatas"][0]

        for distance, document, metadata in zip(distances, documents, metadatas):
            # Non-string documents cannot be encoded for hashing; skip them
            # (matches the original behavior of silently dropping them).
            if not isinstance(document, str):
                continue

            # md5 is used purely for de-duplication, not security.
            doc_hash = hashlib.md5(document.encode()).hexdigest()
            if doc_hash not in seen_hashes:
                seen_hashes.add(doc_hash)
                combined.append((distance, document, metadata))

    # Sort by distance (first tuple element).
    combined.sort(key=lambda x: x[0], reverse=reverse)

    # Guard on the SLICE, not on `combined`: with a non-empty `combined`
    # and k <= 0, zip(*[]) would otherwise raise ValueError on unpacking.
    top_k = combined[:k]
    if top_k:
        sorted_distances, sorted_documents, sorted_metadatas = map(list, zip(*top_k))
    else:
        sorted_distances, sorted_documents, sorted_metadatas = [], [], []

    return {
        "distances": [sorted_distances],
        "documents": [sorted_documents],
        "metadatas": [sorted_metadatas],
    }
|
219 | 215 | def get_all_items_from_collections(collection_names: list[str]) -> dict: |
220 | 216 | results = [] |
|
0 commit comments