|
6 | 6 | - Label clashes within and across sheets |
7 | 7 | - Parent classes/properties referenced but not defined |
8 | 8 | - Self-referential parent definitions |
| 9 | +- Assay outcome pairing (synonym +/- pairs must be consistent) |
9 | 10 | - Structural issues (missing IDs, labels, malformed IDs) |
10 | 11 | """ |
11 | 12 |
|
12 | 13 | import csv |
| 14 | +import re |
13 | 15 | import sys |
14 | 16 | import urllib.request |
15 | 17 | from collections import defaultdict |
@@ -268,50 +270,166 @@ def check_undefined_parents(sheets: list[SheetData]) -> list[QCIssue]: |
268 | 270 | if "stub" in parent_ref.lower(): |
269 | 271 | continue |
270 | 272 |
|
271 | | - # Check if parent is defined (could be ID or label) |
272 | | - is_id = parent_ref.startswith("METPO:") or ":" in parent_ref |
273 | | - is_label = not is_id |
| 273 | + # ROBOT templates use pipe-separated multiple parents (SPLIT=|) |
| 274 | + parent_parts = [p.strip() for p in parent_ref.split("|") if p.strip()] |
274 | 275 |
|
275 | | - if is_id and parent_ref not in all_ids: |
276 | | - issues.append( |
277 | | - QCIssue( |
278 | | - "ERROR", |
279 | | - "UNDEFINED_PARENT_ID", |
280 | | - f"Parent ID '{parent_ref}' not defined anywhere", |
281 | | - f"{sheet.filename}: row {row_num}, ID {id_val}", |
| 276 | + for part in parent_parts: |
| 277 | + # Check if parent is defined (could be ID or label) |
| 278 | + is_id = part.startswith("METPO:") or ":" in part |
| 279 | + is_label = not is_id |
| 280 | + |
| 281 | + if is_id and part not in all_ids: |
| 282 | + issues.append( |
| 283 | + QCIssue( |
| 284 | + "ERROR", |
| 285 | + "UNDEFINED_PARENT_ID", |
| 286 | + f"Parent ID '{part}' not defined anywhere", |
| 287 | + f"{sheet.filename}: row {row_num}, ID {id_val}", |
| 288 | + ) |
282 | 289 | ) |
283 | | - ) |
284 | | - elif is_label and parent_ref not in all_labels: |
285 | | - issues.append( |
286 | | - QCIssue( |
287 | | - "WARNING", |
288 | | - "UNDEFINED_PARENT_LABEL", |
289 | | - f"Parent label '{parent_ref}' not defined anywhere (using labels for parents may cause issues)", |
290 | | - f"{sheet.filename}: row {row_num}, ID {id_val}", |
| 290 | + elif is_label and part not in all_labels: |
| 291 | + issues.append( |
| 292 | + QCIssue( |
| 293 | + "WARNING", |
| 294 | + "UNDEFINED_PARENT_LABEL", |
| 295 | + f"Parent label '{part}' not defined anywhere (using labels for parents may cause issues)", |
| 296 | + f"{sheet.filename}: row {row_num}, ID {id_val}", |
| 297 | + ) |
291 | 298 | ) |
292 | | - ) |
293 | 299 |
|
294 | | - # Check for self-referential parents |
295 | | - # Get the label for this ID |
296 | | - current_label = sheet.ids.get(id_val, (None, None, None, None))[1] |
297 | | - if parent_ref == id_val: |
298 | | - issues.append( |
299 | | - QCIssue( |
300 | | - "ERROR", |
301 | | - "SELF_REFERENTIAL_PARENT_ID", |
302 | | - "Parent references itself via ID", |
303 | | - f"{sheet.filename}: row {row_num}, ID {id_val}", |
| 300 | + # Check for self-referential parents |
| 301 | + current_label = sheet.ids.get(id_val, (None, None, None, None))[1] |
| 302 | + if part == id_val: |
| 303 | + issues.append( |
| 304 | + QCIssue( |
| 305 | + "ERROR", |
| 306 | + "SELF_REFERENTIAL_PARENT_ID", |
| 307 | + "Parent references itself via ID", |
| 308 | + f"{sheet.filename}: row {row_num}, ID {id_val}", |
| 309 | + ) |
304 | 310 | ) |
| 311 | + elif part == current_label: |
| 312 | + issues.append( |
| 313 | + QCIssue( |
| 314 | + "ERROR", |
| 315 | + "SELF_REFERENTIAL_PARENT_LABEL", |
| 316 | + f"Parent references itself via label '{part}'", |
| 317 | + f"{sheet.filename}: row {row_num}, ID {id_val}", |
| 318 | + ) |
| 319 | + ) |
| 320 | + |
| 321 | + return issues |
| 322 | + |
| 323 | + |
def check_assay_outcome_pairing(sheets: list[SheetData]) -> list[QCIssue]:
    """Collect assay-outcome pairing issues from every applicable sheet.

    A sheet is inspected only when one of its first two rows mentions an
    "assay outcome" column; all other sheets are passed over.  For each
    inspected sheet the synonym strings are grouped with the properties that
    carry them, and the groups are turned into issues:

    - ERROR when two properties share a synonym and have the same outcome
    - WARNING when a '+' or '-' property has no counterpart for its synonym
    - WARNING when more than two properties share a synonym

    Returns:
        The issues of all inspected sheets, in sheet order.
    """
    collected: list[QCIssue] = []

    for current in sheets:
        if _sheet_has_assay_outcome_column(current):
            by_synonym = _extract_synonym_map_for_assay_outcomes(current)
            collected += _build_assay_outcome_issues(current, by_synonym)

    return collected
| 342 | + |
| 343 | + |
def _sheet_has_assay_outcome_column(sheet: SheetData) -> bool:
    """Tell whether the sheet's header area names an assay outcome column.

    Only the first two rows of ``sheet.rows`` are examined.  Empty cells are
    ignored; every other cell is stringified, stripped and lower-cased before
    being searched for the substring "assay outcome".
    """
    return any(
        "assay outcome" in str(header_cell).strip().lower()
        for header_line in sheet.rows[:2]
        for header_cell in header_line
        if header_cell
    )
| 352 | + |
| 353 | + |
def _extract_synonym_map_for_assay_outcomes(
    sheet: SheetData,
) -> dict[str, list[tuple[int, str, str, str]]]:
    """Group the assay-outcome rows of a sheet by their synonym string.

    The first two rows are treated as header rows and skipped, as is any row
    with fewer than 12 cells.  For the remaining rows the ID is read from
    column 0, the label from column 1, the synonym tuple from column 9 and the
    assay outcome from column 11 (zero-based).  Rows lacking an ID, a synonym
    tuple or an assay outcome are ignored.

    The synonym is the first single-quoted string found in the synonym cell,
    e.g. ``fermentation`` for ``oboInOwl:hasRelatedSynonym 'fermentation'``.
    Any further quoted strings in the same cell are not considered, and rows
    whose synonym cell contains no quoted string are ignored.

    Returns:
        A plain ``dict`` mapping each synonym to a list of
        ``(row_num, id, label, assay_outcome)`` tuples in sheet order, where
        ``row_num`` is the 1-based row position within ``sheet.rows``.
    """
    header_row_count = 2
    id_col, label_col, synonym_col, outcome_col = 0, 1, 9, 11
    # The outcome column is the right-most one read, so it fixes the minimum
    # row width; rows that pass this check can be indexed without re-checking.
    min_cells = outcome_col + 1
    quoted_text = re.compile(r"'([^']+)'")

    def cell_text(row, index: int) -> str:
        """Return the stripped cell at *index*, or '' for an empty cell."""
        value = row[index]
        return value.strip() if value else ""

    grouped: dict[str, list[tuple[int, str, str, str]]] = defaultdict(list)

    for row_num, row in enumerate(sheet.rows, start=1):
        if row_num <= header_row_count or len(row) < min_cells:
            continue

        row_id = cell_text(row, id_col)
        label = cell_text(row, label_col)
        synonym_tuples = cell_text(row, synonym_col)
        assay_outcome = cell_text(row, outcome_col)

        if not (row_id and synonym_tuples and assay_outcome):
            continue

        match = quoted_text.search(synonym_tuples)
        if match:
            grouped[match.group(1)].append((row_num, row_id, label, assay_outcome))

    # Hand back a plain dict so that a lookup by a caller cannot silently
    # create empty entries.
    return dict(grouped)
| 379 | + |
| 380 | + |
def _build_assay_outcome_issues(
    sheet: SheetData,
    synonym_map: dict[str, list[tuple[int, str, str, str]]],
) -> list[QCIssue]:
    """Turn a synonym -> entries map into assay outcome QC issues.

    Args:
        sheet: The sheet the map was extracted from; only ``sheet.filename``
            is read, to build issue locations.
        synonym_map: Maps a synonym string to a list of
            ``(row_num, id, label, assay_outcome)`` tuples.

    Returns:
        One issue per offending synonym, in map order:

        - ERROR ``ASSAY_OUTCOME_MISMATCH`` when exactly two entries share a
          synonym and both have the same outcome.
        - WARNING ``UNPAIRED_ASSAY_OUTCOME`` when a single entry has outcome
          '+' or '-'.
        - WARNING ``MULTIPLE_ASSAY_OUTCOMES`` when more than two entries share
          a synonym.

        Two entries with differing outcomes produce no issue; the outcomes
        are only compared for equality, not checked to be exactly '+' and '-'.
    """
    issues: list[QCIssue] = []

    for synonym, entries in synonym_map.items():
        entry_count = len(entries)

        if entry_count == 1:
            row_num, row_id, label, outcome = entries[0]
            # Single entry with outcome is OK for parent properties
            # (e.g. enzyme activity analyzed), but warn for +/- properties
            # that lack a counterpart.
            if outcome in ("+", "-"):
                issues.append(
                    QCIssue(
                        "WARNING",
                        "UNPAIRED_ASSAY_OUTCOME",
                        f"Synonym '{synonym}' has only one property with outcome "
                        f"'{outcome}': {row_id} ({label}). Expected a +/- pair.",
                        f"{sheet.filename}: row {row_num}",
                    )
                )
        elif entry_count == 2:
            (row1, id1, label1, outcome1), (row2, id2, label2, outcome2) = entries
            if outcome1 == outcome2:
                issues.append(
                    QCIssue(
                        "ERROR",
                        "ASSAY_OUTCOME_MISMATCH",
                        f"Synonym '{synonym}' has two properties with same outcome "
                        f"'{outcome1}': {id1} ({label1}) and {id2} ({label2}). "
                        f"One should be '+' and the other '-'.",
                        f"{sheet.filename}: rows {row1}, {row2}",
                    )
                )
        elif entry_count > 2:
            ids = ", ".join(f"{e[1]} ({e[2]}, {e[3]})" for e in entries)
            # Report the rows as the other two issue kinds do, so the
            # offending entries can be found without searching the sheet.
            rows = ", ".join(str(e[0]) for e in entries)
            issues.append(
                QCIssue(
                    "WARNING",
                    "MULTIPLE_ASSAY_OUTCOMES",
                    f"Synonym '{synonym}' has {entry_count} properties: {ids}",
                    f"{sheet.filename}: rows {rows}",
                )
            )

    return issues
317 | 435 |
|
@@ -458,6 +576,9 @@ def main(download: bool, main_sheet: str, properties_sheet: str): |
458 | 576 | click.echo("Checking for undefined parents...") |
459 | 577 | all_issues.extend(check_undefined_parents(sheets)) |
460 | 578 |
|
| 579 | + click.echo("Checking for assay outcome pairing...") |
| 580 | + all_issues.extend(check_assay_outcome_pairing(sheets)) |
| 581 | + |
461 | 582 | click.echo("Checking for structural issues...") |
462 | 583 | all_issues.extend(check_structural_issues(sheets)) |
463 | 584 |
|
|
0 commit comments