|
17 | 17 | from ...data_type import DataType |
18 | 18 | from ...global_metadata import GlobalMetadataDesc |
19 | 19 |
|
| 20 | + |
| 21 | +def _is_empty_embedding_value(v) -> bool: |
| 22 | + if v is None: |
| 23 | + return True |
| 24 | + if isinstance(v, (list, tuple)): |
| 25 | + return len(v) == 0 |
| 26 | + if isinstance(v, dict): |
| 27 | + return not v |
| 28 | + return False |
| 29 | + |
20 | 30 | MILVUS_UPSERT_BATCH_SIZE = 500 |
21 | 31 | MILVUS_PAGINATION_OFFSET = 1000 |
22 | 32 | MILVUS_INDEX_MAX_RETRY = 3 |
@@ -141,10 +151,31 @@ def _client_context(self): |
141 | 151 | finally: |
142 | 152 | self._client_pool.release(c) |
143 | 153 |
|
def _row_has_valid_embedding(self, d: dict) -> bool:
    '''Check that a row provides a non-empty embedding for every configured embed key.

    Args:
        d: one row to be upserted; its ``'embedding'`` entry is expected to be a dict
            keyed by embed key.

    Returns:
        False when ``'embedding'`` is missing, falsy or not a dict, or when any key in
        ``self._embed_datatypes`` maps to an empty value; True otherwise.
    '''
    vectors = d.get('embedding')
    if not (vectors and isinstance(vectors, dict)):
        return False
    # A single missing/empty vector disqualifies the whole row.
    return not any(_is_empty_embedding_value(vectors.get(key)) for key in self._embed_datatypes)
| 163 | + |
144 | 164 | @override |
145 | | - def upsert(self, collection_name: str, data: List[dict]) -> bool: |
| 165 | + def upsert(self, collection_name: str, data: List[dict]) -> bool: # noqa: C901 |
146 | 166 | try: |
147 | 167 | if not data: return True |
| 168 | + # Only upsert rows that have valid embedding for every key. _serialize_data omits missing/empty |
| 169 | + # embedding fields, which would make pymilvus build columns with different lengths (e.g. uid 230 vs |
| 170 | + # embedding___default__ 229) and raise num_rows mismatch. |
| 171 | + valid_data = [d for d in data if self._row_has_valid_embedding(d)] |
| 172 | + dropped = len(data) - len(valid_data) |
| 173 | + if dropped: |
| 174 | + LOG.warning(f'[Milvus Store - upsert] Dropping {dropped} rows with missing/empty embedding for ' |
| 175 | + f'collection {collection_name}.') |
| 176 | + data = valid_data |
| 177 | + if not data: |
| 178 | + return True |
148 | 179 | data_embeddings = data[0].get('embedding', {}) |
149 | 180 | if not data_embeddings: return True |
150 | 181 | with self._client_context() as client: |
|
0 commit comments