Skip to content

Commit 6b00fca

Browse files
committed
feat(search): add iscc_id lookup with self-exclusion
Implement iscc_id-based similarity search across all index backends (memory, lmdb, usearch). When an iscc_id is provided in the query, the system looks up the asset and uses its iscc_code/units/simprints for the similarity search, while excluding the query asset itself from the results. - Add iscc_id query handling with precedence over other query fields - Implement self-exclusion to keep the query asset out of the results - Return 404 when the iscc_id is not found (FileNotFoundError) - Add comprehensive tests for iscc_id search behavior - Update existing tests to avoid iscc_id conflicts
1 parent f8be565 commit 6b00fca

File tree

6 files changed

+154
-28
lines changed

6 files changed

+154
-28
lines changed

iscc_search/indexes/lmdb/index.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,17 @@ def search_assets(self, query, limit=100):
204204
:return: IsccSearchResult with matches sorted by score (descending, normalized 0.0-1.0)
205205
:raises ValueError: If query has neither iscc_code nor units
206206
"""
207+
# Handle iscc_id lookup if provided (takes precedence over other fields)
208+
query_iscc_id = None # Track original query iscc_id for self-exclusion
209+
if query.iscc_id:
210+
query_iscc_id = query.iscc_id
211+
# Look up asset by iscc_id (raises FileNotFoundError if not found -> HTTP 404)
212+
asset = self.get_asset(query.iscc_id)
213+
# Create new query with extracted iscc_code, units and simprints
214+
from iscc_search.schema import IsccQuery
215+
216+
query = IsccQuery(iscc_code=asset.iscc_code, units=asset.units, simprints=asset.simprints)
217+
207218
# Normalize query to ensure it has units (derive from iscc_code if needed)
208219
query = common.normalize_query(query)
209220

@@ -273,6 +284,10 @@ def search_assets(self, query, limit=100):
273284
# Sort by score descending
274285
match_list.sort(key=lambda x: x.score, reverse=True)
275286

287+
# Exclude query asset from results (self-exclusion for iscc_id queries)
288+
if query_iscc_id:
289+
match_list = [match for match in match_list if match.iscc_id != query_iscc_id]
290+
276291
return IsccSearchResult(query=query, global_matches=match_list[:limit], chunk_matches=[])
277292

278293
def get_asset_count(self):

iscc_search/indexes/memory/index.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,17 @@ def search_assets(self, index_name, query, limit=100):
190190
if index_name not in self._indexes:
191191
raise FileNotFoundError(f"Index '{index_name}' not found")
192192

193+
# Handle iscc_id lookup if provided (takes precedence over other fields)
194+
query_iscc_id = None # Track original query iscc_id for self-exclusion
195+
if query.iscc_id:
196+
query_iscc_id = query.iscc_id
197+
# Look up asset by iscc_id (raises FileNotFoundError if not found -> HTTP 404)
198+
asset = self.get_asset(index_name, query.iscc_id)
199+
# Create new query with extracted iscc_code, units and simprints
200+
from iscc_search.schema import IsccQuery
201+
202+
query = IsccQuery(iscc_code=asset.iscc_code, units=asset.units, simprints=asset.simprints)
203+
193204
# Normalize query to ensure it has units (derive from iscc_code if needed)
194205
# This ensures consistent behavior across backends
195206
query = common.normalize_query(query)
@@ -211,6 +222,10 @@ def search_assets(self, index_name, query, limit=100):
211222
)
212223
)
213224

225+
# Exclude query asset from results (self-exclusion for iscc_id queries)
226+
if query_iscc_id:
227+
match_list = [match for match in match_list if match.iscc_id != query_iscc_id]
228+
214229
return IsccSearchResult(
215230
query=query,
216231
global_matches=match_list[:limit],

iscc_search/indexes/usearch/index.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,17 @@ def search_assets(self, query, limit=100):
343343
:param limit: Maximum number of results
344344
:return: IsccSearchResult with query and list of matches (scores normalized 0.0-1.0)
345345
"""
346+
# Handle iscc_id lookup if provided (takes precedence over other fields)
347+
query_iscc_id = None # Track original query iscc_id for self-exclusion
348+
if query.iscc_id:
349+
query_iscc_id = query.iscc_id
350+
# Look up asset by iscc_id (raises FileNotFoundError if not found -> HTTP 404)
351+
asset = self.get_asset(query.iscc_id)
352+
# Create new query with extracted iscc_code, units and simprints
353+
from iscc_search.schema import IsccQuery
354+
355+
query = IsccQuery(iscc_code=asset.iscc_code, units=asset.units, simprints=asset.simprints)
356+
346357
# Normalize query
347358
query = common.normalize_query(query)
348359

@@ -399,6 +410,11 @@ def search_assets(self, query, limit=100):
399410

400411
scored_results.append((key, total_score, unit_scores))
401412

413+
# Exclude query asset from results (self-exclusion for iscc_id queries)
414+
if query_iscc_id:
415+
query_key = int(IsccID(query_iscc_id))
416+
scored_results = [result for result in scored_results if result[0] != query_key]
417+
402418
# Sort by total score descending
403419
scored_results.sort(key=lambda x: x[1], reverse=True)
404420

@@ -441,6 +457,10 @@ def search_assets(self, query, limit=100):
441457
iscc_id = str(IsccID.from_int(key, self._realm_id))
442458
matches.append(IsccGlobalMatch(iscc_id=iscc_id, score=total_score, types=unit_scores))
443459

460+
# Exclude query asset from chunk matches (self-exclusion for iscc_id queries)
461+
if query_iscc_id:
462+
chunk_matches = [match for match in chunk_matches if match.iscc_id != query_iscc_id]
463+
444464
return IsccSearchResult(query=query, global_matches=matches, chunk_matches=chunk_matches)
445465

446466
def flush(self):

tests/test_indexes_lmdb_index.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -181,14 +181,6 @@ def test_search_assets_basic(lmdb_index, sample_assets):
181181
assert result.global_matches[0].metadata.model_dump(exclude_none=True) == asset.metadata
182182

183183

184-
def test_search_assets_no_units(lmdb_index, sample_iscc_ids):
185-
"""Test search without units or iscc_code raises error."""
186-
query = IsccQuery(iscc_id=sample_iscc_ids[0])
187-
188-
with pytest.raises(ValueError, match="must have 'iscc_code', 'units', or 'simprints'"):
189-
lmdb_index.search_assets(query)
190-
191-
192184
def test_search_assets_empty_index(lmdb_index, sample_content_units):
193185
"""Test search on empty index returns no matches."""
194186
query = IsccQuery(units=[sample_content_units[0], sample_content_units[1]])

tests/test_indexes_memory_index.py

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -300,20 +300,22 @@ def test_search_assets_by_iscc_code(sample_iscc_ids, sample_iscc_codes):
300300

301301

302302
def test_search_assets_by_iscc_id(sample_iscc_ids, sample_iscc_codes):
303-
"""Test searching assets by iscc_id with iscc_code."""
303+
"""Test searching assets by iscc_id uses asset's code for similarity search and excludes query asset."""
304304
index = MemoryIndex()
305305
index.create_index(IsccIndex(name="testindex"))
306306

307-
# Add assets with iscc_code (required for search)
308-
asset = IsccEntry(iscc_id=sample_iscc_ids[0], iscc_code=sample_iscc_codes[0])
309-
index.add_assets("testindex", [asset])
307+
# Add two assets with same iscc_code (similar assets)
308+
asset1 = IsccEntry(iscc_id=sample_iscc_ids[0], iscc_code=sample_iscc_codes[0])
309+
asset2 = IsccEntry(iscc_id=sample_iscc_ids[1], iscc_code=sample_iscc_codes[0])
310+
index.add_assets("testindex", [asset1, asset2])
310311

311-
# Search by iscc_id and iscc_code
312-
query = IsccQuery(iscc_id=sample_iscc_ids[0], iscc_code=sample_iscc_codes[0])
312+
# Search by iscc_id of first asset
313+
query = IsccQuery(iscc_id=sample_iscc_ids[0])
313314
result = index.search_assets("testindex", query)
314315

316+
# Should find the similar asset but exclude the query asset itself (self-exclusion)
315317
assert len(result.global_matches) == 1
316-
assert result.global_matches[0].iscc_id == sample_iscc_ids[0]
318+
assert result.global_matches[0].iscc_id == sample_iscc_ids[1]
317319

318320

319321
def test_search_assets_no_matches(sample_iscc_ids, sample_iscc_codes):
@@ -447,20 +449,20 @@ def test_metadata_field(sample_iscc_ids, sample_iscc_codes):
447449

448450

449451
def test_search_assets_no_matching_iscc_id(sample_iscc_ids, sample_iscc_codes):
450-
"""Test searching by different iscc_code when no match exists."""
452+
"""Test searching by non-existent iscc_id raises FileNotFoundError."""
451453
index = MemoryIndex()
452454
index.create_index(IsccIndex(name="testindex"))
453455

454-
# Add an asset with one iscc_code
456+
# Add an asset with one iscc_id
455457
asset = IsccEntry(iscc_id=sample_iscc_ids[0], iscc_code=sample_iscc_codes[0])
456458
index.add_assets("testindex", [asset])
457459

458-
# Search with different iscc_id and iscc_code
459-
query = IsccQuery(iscc_id=sample_iscc_ids[1], iscc_code=sample_iscc_codes[1])
460-
result = index.search_assets("testindex", query)
460+
# Search with different iscc_id (not in index)
461+
query = IsccQuery(iscc_id=sample_iscc_ids[1])
461462

462-
# Should not match (different iscc_code)
463-
assert len(result.global_matches) == 0
463+
# Should raise FileNotFoundError (asset lookup fails -> HTTP 404)
464+
with pytest.raises(FileNotFoundError, match=f"Asset '{sample_iscc_ids[1]}' not found"):
465+
index.search_assets("testindex", query)
464466

465467

466468
def test_search_assets_no_iscc_code_in_asset(sample_iscc_ids, sample_iscc_codes):

tests/test_server_search.py

Lines changed: 88 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@ def test_search_post_success(test_client, sample_assets):
1010
assets_dict = [a.model_dump(mode="json", exclude_none=True) for a in sample_assets]
1111
test_client.post("/indexes/testindex/assets", json=assets_dict)
1212

13-
# Search using first asset as query
14-
query_dict = sample_assets[0].model_dump(mode="json", exclude_none=True)
13+
# Search using first asset as query (exclude iscc_id to avoid iscc_id search)
14+
query_dict = sample_assets[0].model_dump(mode="json", exclude_none=True, exclude={"iscc_id"})
1515
response = test_client.post("/indexes/testindex/search", json=query_dict)
1616

1717
assert response.status_code == 200
@@ -147,8 +147,8 @@ def test_search_result_structure(test_client, sample_assets):
147147
assets_dict = [a.model_dump(mode="json", exclude_none=True) for a in sample_assets]
148148
test_client.post("/indexes/testindex/assets", json=assets_dict)
149149

150-
# Search
151-
query_dict = sample_assets[0].model_dump(mode="json", exclude_none=True)
150+
# Search (exclude iscc_id to avoid iscc_id search)
151+
query_dict = sample_assets[0].model_dump(mode="json", exclude_none=True, exclude={"iscc_id"})
152152
response = test_client.post("/indexes/testindex/search", json=query_dict)
153153

154154
assert response.status_code == 200
@@ -174,8 +174,8 @@ def test_search_empty_index(test_client, sample_assets):
174174
# Create empty index
175175
test_client.post("/indexes", json={"name": "testindex"})
176176

177-
# Search empty index
178-
query_dict = sample_assets[0].model_dump(mode="json", exclude_none=True)
177+
# Search empty index (exclude iscc_id to avoid iscc_id lookup failure)
178+
query_dict = sample_assets[0].model_dump(mode="json", exclude_none=True, exclude={"iscc_id"})
179179
response = test_client.post("/indexes/testindex/search", json=query_dict)
180180

181181
assert response.status_code == 200
@@ -284,3 +284,85 @@ def test_response_excludes_unset_fields(test_client, request):
284284
assert "iscc_id" not in query
285285
assert "iscc_code" not in query
286286
assert "units" not in query
287+
288+
289+
def test_search_by_iscc_id_success(test_client, sample_assets):
290+
"""Test POST search with iscc_id parameter finds similar assets."""
291+
# Create index and add multiple assets
292+
test_client.post("/indexes", json={"name": "testindex"})
293+
assets_dict = [a.model_dump(mode="json", exclude_none=True) for a in sample_assets]
294+
test_client.post("/indexes/testindex/assets", json=assets_dict)
295+
296+
# Search using iscc_id from first asset
297+
query_dict = {"iscc_id": sample_assets[0].iscc_id}
298+
response = test_client.post("/indexes/testindex/search", json=query_dict)
299+
300+
assert response.status_code == 200
301+
data = response.json()
302+
303+
# Verify response structure
304+
assert "query" in data
305+
assert "global_matches" in data
306+
307+
# Should find similar assets (excluding the query asset itself)
308+
assert isinstance(data["global_matches"], list)
309+
310+
311+
def test_search_by_iscc_id_self_exclusion(test_client, sample_assets):
312+
"""Test that query asset is excluded from results when searching by iscc_id."""
313+
# Create index and add multiple assets
314+
test_client.post("/indexes", json={"name": "testindex"})
315+
assets_dict = [a.model_dump(mode="json", exclude_none=True) for a in sample_assets]
316+
test_client.post("/indexes/testindex/assets", json=assets_dict)
317+
318+
# Search using iscc_id from first asset
319+
query_iscc_id = sample_assets[0].iscc_id
320+
query_dict = {"iscc_id": query_iscc_id}
321+
response = test_client.post("/indexes/testindex/search", json=query_dict)
322+
323+
assert response.status_code == 200
324+
data = response.json()
325+
326+
# Verify query asset is NOT in results
327+
result_iscc_ids = [match["iscc_id"] for match in data["global_matches"]]
328+
assert query_iscc_id not in result_iscc_ids
329+
330+
331+
def test_search_by_iscc_id_not_found(test_client, sample_iscc_ids):
332+
"""Test POST search with iscc_id that doesn't exist returns 404."""
333+
# Create empty index
334+
test_client.post("/indexes", json={"name": "testindex"})
335+
336+
# Search for non-existent iscc_id
337+
query_dict = {"iscc_id": sample_iscc_ids[0]}
338+
response = test_client.post("/indexes/testindex/search", json=query_dict)
339+
340+
# Should return 404 as documented
341+
assert response.status_code == 404
342+
data = response.json()
343+
assert "detail" in data
344+
assert sample_iscc_ids[0] in data["detail"]
345+
346+
347+
def test_search_by_iscc_id_precedence(test_client, sample_assets, sample_iscc_codes):
348+
"""Test that iscc_id takes precedence over other query fields."""
349+
# Create index and add assets
350+
test_client.post("/indexes", json={"name": "testindex"})
351+
assets_dict = [a.model_dump(mode="json", exclude_none=True) for a in sample_assets]
352+
test_client.post("/indexes/testindex/assets", json=assets_dict)
353+
354+
# Search with iscc_id AND other fields (iscc_id should take precedence)
355+
query_dict = {
356+
"iscc_id": sample_assets[0].iscc_id,
357+
"iscc_code": sample_iscc_codes[5], # Different code - should be ignored
358+
"units": ["ISCC:AAAUHBUDQUT3LPWR"], # Different units - should be ignored
359+
}
360+
response = test_client.post("/indexes/testindex/search", json=query_dict)
361+
362+
assert response.status_code == 200
363+
data = response.json()
364+
365+
# Verify query asset is excluded (self-exclusion for iscc_id)
366+
query_iscc_id = sample_assets[0].iscc_id
367+
result_iscc_ids = [match["iscc_id"] for match in data["global_matches"]]
368+
assert query_iscc_id not in result_iscc_ids

0 commit comments

Comments
 (0)