Skip to content

Commit b121447

Browse files
authored
♻️(backends) change how duplicate statement ids are handled
Previously this would fail with a 429 return code on any duplicate event id, now it only fails if the events differ per the LRS spec.
1 parent 55f567a commit b121447

File tree

7 files changed

+99
-36
lines changed

7 files changed

+99
-36
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ and this project adheres to
99
## [Unreleased]
1010

1111
### Changed
12+
13+
- Change how duplicate xAPI statements are handled by backends
1214
General improvement for the Helm Chart:
1315
- add dependencies for MongoDB and Clickhouse
1416
- make persistence optional

src/ralph/api/routers/statements.py

Lines changed: 46 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
HTTPException,
1414
Query,
1515
Request,
16+
Response,
1617
status,
1718
)
1819
from pydantic import parse_raw_as
@@ -332,21 +333,24 @@ async def put(
332333
)
333334

334335
try:
335-
statements_ids_result = DATABASE_CLIENT.query_statements_by_ids([statementId])
336+
existing_statement = DATABASE_CLIENT.query_statements_by_ids([statementId])
336337
except BackendException as error:
337338
raise HTTPException(
338339
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
339340
detail="xAPI statements query failed",
340341
) from error
341342

342-
if len(statements_ids_result) > 0:
343-
# NB: LRS specification calls for performing a deep comparison of incoming
344-
# statements and existing statements with the same ID.
345-
# This seems too costly for performance and was not implemented for this POC.
346-
raise HTTPException(
347-
status_code=status.HTTP_409_CONFLICT,
348-
detail="Statement already exists with the same ID",
349-
)
343+
if existing_statement:
344+
# The LRS specification calls for deep comparison of duplicate statement ids.
345+
# In the case that the current statement is not an exact duplicate of the one
346+
# found in the database we return a 409, otherwise the usual 204.
347+
for existing in existing_statement:
348+
if statement_dict != existing["_source"]:
349+
raise HTTPException(
350+
status_code=status.HTTP_409_CONFLICT,
351+
detail="A different statement already exists with the same ID",
352+
)
353+
return
350354

351355
# For valid requests, perform the bulk indexing of all incoming statements
352356
try:
@@ -368,6 +372,7 @@ async def put(
368372
async def post(
369373
statements: Union[LaxStatement, List[LaxStatement]],
370374
background_tasks: BackgroundTasks,
375+
response: Response,
371376
):
372377
"""Stores a set of statements (or a single statement as a single member of a set).
373378
@@ -405,21 +410,44 @@ async def post(
405410
)
406411

407412
try:
408-
statements_ids_result = DATABASE_CLIENT.query_statements_by_ids(statements_ids)
413+
existing_statements = DATABASE_CLIENT.query_statements_by_ids(statements_ids)
409414
except BackendException as error:
410415
raise HTTPException(
411416
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
412417
detail="xAPI statements query failed",
413418
) from error
414419

415-
if len(statements_ids_result) > 0:
416-
# NB: LRS specification calls for performing a deep comparison of incoming
417-
# statements and existing statements with the same ID.
418-
# This seems too costly for performance and was not implemented for this POC.
419-
raise HTTPException(
420-
status_code=status.HTTP_409_CONFLICT,
421-
detail="Statements already exist with the same ID",
422-
)
420+
# If there are duplicate statements, remove them from our id list and
421+
# dictionary for insertion. We will return the shortened list of ids below
422+
# so that consumers can derive which statements were inserted and which
423+
# were skipped for being duplicates.
424+
# See: https://github.com/openfun/ralph/issues/345
425+
if existing_statements:
426+
existing_ids = []
427+
for existing in existing_statements:
428+
existing_ids.append(existing["_id"])
429+
430+
# The LRS specification calls for deep comparison of duplicates. This
431+
# is done here. If they are not exactly the same, we raise an error.
432+
if statements_dict[existing["_id"]] != existing["_source"]:
433+
raise HTTPException(
434+
status_code=status.HTTP_409_CONFLICT,
435+
detail="Differing statements already exist with the same ID: "
436+
f"{existing['_id']}",
437+
)
438+
439+
# Overwrite our statements and ids with the deduplicated ones.
440+
statements_ids = [id for id in statements_ids if id not in existing_ids]
441+
442+
statements_dict = {
443+
key: value
444+
for key, value in statements_dict.items()
445+
if key in statements_ids
446+
}
447+
448+
if not statements_ids:
449+
response.status_code = status.HTTP_204_NO_CONTENT
450+
return
423451

424452
# For valid requests, perform the bulk indexing of all incoming statements
425453
try:

src/ralph/backends/database/clickhouse.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -238,15 +238,15 @@ def put(
238238

239239
return rows_inserted
240240

241-
def query_statements_by_ids(self, ids: List[str]) -> List:
242-
"""Returns the list of matching statement IDs from the database."""
241+
def query_statements_by_ids(self, ids: List[str]) -> List[dict]:
242+
"""Returns the list of matching statements from the database."""
243243

244244
def chunk_id_list(chunk_size=10000):
245245
for i in range(0, len(ids), chunk_size):
246246
yield ids[i : i + chunk_size]
247247

248248
sql = """
249-
SELECT event_id
249+
SELECT event_id, event_str
250250
FROM {table_name:Identifier}
251251
WHERE event_id IN ({ids:Array(String)})
252252
"""
@@ -257,15 +257,22 @@ def chunk_id_list(chunk_size=10000):
257257
column_oriented=True,
258258
)
259259

260-
found_ids = []
260+
found_statements = []
261261

262262
try:
263263
for chunk_ids in chunk_id_list():
264264
query_context.set_parameter("ids", chunk_ids)
265265
result = self.client.query(context=query_context).named_results()
266-
found_ids.extend(result)
267-
268-
return found_ids
266+
for row in result:
267+
# This is the format to match the other backends
268+
found_statements.append(
269+
{
270+
"_id": str(row["event_id"]),
271+
"_source": json.loads(row["event_str"]),
272+
}
273+
)
274+
275+
return found_statements
269276
except (ClickHouseError, IndexError, TypeError, ValueError) as error:
270277
msg = "Failed to execute ClickHouse query"
271278
logger.error("%s. %s", msg, error)

src/ralph/backends/database/mongo.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -254,8 +254,11 @@ def query_statements(self, params: StatementParameters) -> StatementQueryResult:
254254
)
255255

256256
def query_statements_by_ids(self, ids: List[str]) -> List:
257-
"""Returns the list of matching statement IDs from the database."""
258-
return self._find(filter={"_source.id": {"$in": ids}})
257+
"""Returns the list of matching statements from the database."""
258+
return [
259+
{"_id": statement["_source"]["id"], "_source": statement["_source"]}
260+
for statement in self._find(filter={"_source.id": {"$in": ids}})
261+
]
259262

260263
def _find(self, **kwargs):
261264
"""Wraps the MongoClient.collection.find method.

tests/api/test_statements_post.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,8 @@ def test_api_statements_post_statements_list_with_duplicate_of_existing_statemen
278278
# pylint: disable=invalid-name,unused-argument
279279

280280
monkeypatch.setattr("ralph.api.routers.statements.DATABASE_CLIENT", backend())
281+
282+
statement_uuid = str(uuid4())
281283
statement = {
282284
"actor": {
283285
"account": {
@@ -286,7 +288,7 @@ def test_api_statements_post_statements_list_with_duplicate_of_existing_statemen
286288
},
287289
"objectType": "Agent",
288290
},
289-
"id": str(uuid4()),
291+
"id": statement_uuid,
290292
"object": {"id": "https://example.com/object-id/1/"},
291293
"timestamp": "2022-03-15T14:07:51Z",
292294
"verb": {"id": "https://example.com/verb-id/1/"},
@@ -299,18 +301,33 @@ def test_api_statements_post_statements_list_with_duplicate_of_existing_statemen
299301
json=statement,
300302
)
301303
assert response.status_code == 200
304+
assert response.json() == [statement_uuid]
305+
306+
es.indices.refresh()
307+
308+
# Post the statement twice, the data is identical so it should succeed but not
309+
# include the ID in the response as it wasn't inserted.
310+
response = client.post(
311+
"/xAPI/statements/",
312+
headers={"Authorization": f"Basic {auth_credentials}"},
313+
json=statement,
314+
)
315+
assert response.status_code == 204
302316

303317
es.indices.refresh()
304318

305-
# Post the statement twice, trying to change the version field which is not allowed.
319+
# Post the statement again, trying to change the version field which is not allowed.
306320
response = client.post(
307321
"/xAPI/statements/",
308322
headers={"Authorization": f"Basic {auth_credentials}"},
309323
json=[dict(statement, **{"version": "1.0.0"})],
310324
)
311325

312326
assert response.status_code == 409
313-
assert response.json() == {"detail": "Statements already exist with the same ID"}
327+
assert response.json() == {
328+
"detail": f"Differing statements already exist with the same ID: "
329+
f"{statement_uuid}"
330+
}
314331

315332
response = client.get(
316333
"/xAPI/statements/", headers={"Authorization": f"Basic {auth_credentials}"}

tests/api/test_statements_put.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,9 @@ def test_api_statements_put_statement_duplicate_of_existing_statement(
224224
)
225225

226226
assert response.status_code == 409
227-
assert response.json() == {"detail": "Statement already exists with the same ID"}
227+
assert response.json() == {
228+
"detail": "A different statement already exists with the same ID"
229+
}
228230

229231
response = client.get(
230232
f"/xAPI/statements/?statementId={statement['id']}",

tests/backends/database/test_mongo.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -470,16 +470,20 @@ def test_backends_database_mongo_query_statements_by_ids_with_multiple_collectio
470470

471471
# Insert documents
472472
timestamp = {"timestamp": "2022-06-27T15:36:50"}
473-
collection_1_document = list(MongoDatabase.to_documents([{"id": "1", **timestamp}]))
474-
collection_2_document = list(MongoDatabase.to_documents([{"id": "2", **timestamp}]))
473+
statement_1 = {"id": "1", **timestamp}
474+
statement_1_expected = [{"_id": "1", "_source": statement_1}]
475+
statement_2 = {"id": "2", **timestamp}
476+
statement_2_expected = [{"_id": "2", "_source": statement_2}]
477+
collection_1_document = list(MongoDatabase.to_documents([statement_1]))
478+
collection_2_document = list(MongoDatabase.to_documents([statement_2]))
475479
backend_1.bulk_import(collection_1_document)
476480
backend_2.bulk_import(collection_2_document)
477481

478482
# Check the expected search query results
479-
assert backend_1.query_statements_by_ids(["1"]) == collection_1_document
480-
assert backend_1.query_statements_by_ids(["2"]) == []
483+
assert backend_1.query_statements_by_ids(["1"]) == statement_1_expected
481484
assert backend_2.query_statements_by_ids(["1"]) == []
482-
assert backend_2.query_statements_by_ids(["2"]) == collection_2_document
485+
assert backend_2.query_statements_by_ids(["2"]) == statement_2_expected
486+
assert backend_1.query_statements_by_ids(["2"]) == []
483487

484488

485489
def test_backends_database_mongo_status(mongo):

0 commit comments

Comments
 (0)