diff --git a/src/reporting/filter_results/config.vsh.yaml b/src/reporting/filter_results/config.vsh.yaml
new file mode 100644
index 000000000..09e22ba7a
--- /dev/null
+++ b/src/reporting/filter_results/config.vsh.yaml
@@ -0,0 +1,162 @@
+name: filter_results
+namespace: reporting
+description: Filter dataset, method, and metric info, as well as results, based on include/exclude criteria
+
+argument_groups:
+  - name: Inputs
+    arguments:
+      - name: --input_dataset_info
+        type: file
+        description: JSON file containing dataset information
+        required: true
+        example: resources_test/openproblems/task_results_v4/processed/dataset_info.json
+
+      - name: --input_method_info
+        type: file
+        description: JSON file containing method information
+        required: true
+        example: resources_test/openproblems/task_results_v4/processed/method_info.json
+
+      - name: --input_metric_info
+        type: file
+        description: JSON file containing metric information
+        required: true
+        example: resources_test/openproblems/task_results_v4/processed/metric_info.json
+
+      - name: --input_results
+        type: file
+        description: JSON file containing results
+        required: true
+        example: resources_test/openproblems/task_results_v4/processed/results.json
+
+  - name: Dataset filtering
+    description: |
+      Use these arguments to filter datasets by name. By default, all datasets are
+      included. If `--datasets_include` is defined, only those datasets are included.
+      If `--datasets_exclude` is defined, all datasets except those specified are
+      included. These arguments are mutually exclusive, so only `--datasets_include`
+      OR `--datasets_exclude` can be set, but not both.
+    arguments:
+      - name: "--datasets_include"
+        type: string
+        multiple: true
+        description: |
+          A list of dataset ids to include. If specified, only these datasets will be included.
+      - name: "--datasets_exclude"
+        type: string
+        multiple: true
+        description: |
+          A list of dataset ids to exclude. If specified, all datasets except the ones listed will be included.
+
+  - name: Method filtering
+    description: |
+      Use these arguments to filter methods by name. By default, all methods are
+      included. If `--methods_include` is defined, only those methods are included.
+      If `--methods_exclude` is defined, all methods except those specified are
+      included. These arguments are mutually exclusive, so only `--methods_include`
+      OR `--methods_exclude` can be set, but not both.
+    arguments:
+      - name: "--methods_include"
+        type: string
+        multiple: true
+        description: |
+          A list of method ids to include. If specified, only these methods will be included.
+      - name: "--methods_exclude"
+        type: string
+        multiple: true
+        description: |
+          A list of method ids to exclude. If specified, all methods except the ones listed will be included.
+
+  - name: Metric filtering
+    description: |
+      Use these arguments to filter metrics by name. By default, all metrics are
+      included. If `--metrics_include` is defined, only those metrics are included.
+      If `--metrics_exclude` is defined, all metrics except those specified are
+      included. These arguments are mutually exclusive, so only `--metrics_include`
+      OR `--metrics_exclude` can be set, but not both.
+    arguments:
+      - name: "--metrics_include"
+        type: string
+        multiple: true
+        description: |
+          A list of metric ids to include. If specified, only these metrics will be included.
+      - name: "--metrics_exclude"
+        type: string
+        multiple: true
+        description: |
+          A list of metric ids to exclude. If specified, all metrics except the ones listed will be included.
+
+  - name: Outputs
+    arguments:
+      - name: --output_dataset_info
+        type: file
+        direction: output
+        default: filtered_dataset_info.json
+        description: Filtered dataset info JSON file
+        info:
+          format:
+            type: json
+            schema: /common/schemas/results_v4/dataset_info.json
+        example: resources_test/openproblems/task_results_v4/processed/filtered_dataset_info.json
+
+      - name: --output_method_info
+        type: file
+        direction: output
+        default: filtered_method_info.json
+        description: Filtered method info JSON file
+        info:
+          format:
+            type: json
+            schema: /common/schemas/results_v4/method_info.json
+        example: resources_test/openproblems/task_results_v4/processed/filtered_method_info.json
+
+      - name: --output_metric_info
+        type: file
+        direction: output
+        default: filtered_metric_info.json
+        description: Filtered metric info JSON file
+        info:
+          format:
+            type: json
+            schema: /common/schemas/results_v4/metric_info.json
+        example: resources_test/openproblems/task_results_v4/processed/filtered_metric_info.json
+
+      - name: --output_results
+        type: file
+        direction: output
+        default: filtered_results.json
+        description: Filtered results JSON file
+        info:
+          format:
+            type: json
+            schema: /common/schemas/results_v4/results.json
+        example: resources_test/openproblems/task_results_v4/processed/filtered_results.json
+
+resources:
+  - type: python_script
+    path: script.py
+  - path: /common/schemas
+    dest: schemas
+
+test_resources:
+  - type: python_script
+    path: /common/component_tests/run_and_check_output.py
+  - path: /resources_test/openproblems/task_results_v4
+    dest: resources_test/openproblems/task_results_v4
+
+engines:
+  - type: docker
+    image: openproblems/base_python:1
+    setup:
+      - type: apt
+        packages:
+          - nodejs
+          - npm
+      - type: docker
+        run: npm install -g ajv-cli
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [lowmem, lowtime, lowcpu]
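Usage sketch (hypothetical paths; assumes the component has been built into an
executable named `filter_results`):

  filter_results \
    --input_dataset_info dataset_info.json \
    --input_method_info method_info.json \
    --input_metric_info metric_info.json \
    --input_results results.json \
    --datasets_exclude cellxgene_census/tabula_sapiens \
    --output_results filtered_results.json

Outputs that are not specified fall back to the defaults declared above
(e.g. `filtered_dataset_info.json`).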
diff --git a/src/reporting/filter_results/script.py b/src/reporting/filter_results/script.py
new file mode 100644
index 000000000..952e5fab7
--- /dev/null
+++ b/src/reporting/filter_results/script.py
@@ -0,0 +1,311 @@
+## VIASH START
+par = {
+    "input_dataset_info": "resources_test/openproblems/task_results_v4/processed/dataset_info.json",
+    "input_method_info": "resources_test/openproblems/task_results_v4/processed/method_info.json",
+    "input_metric_info": "resources_test/openproblems/task_results_v4/processed/metric_info.json",
+    "input_results": "resources_test/openproblems/task_results_v4/processed/results.json",
+    "output_dataset_info": "resources_test/openproblems/task_results_v4/processed/filtered_dataset_info.json",
+    "output_method_info": "resources_test/openproblems/task_results_v4/processed/filtered_method_info.json",
+    "output_metric_info": "resources_test/openproblems/task_results_v4/processed/filtered_metric_info.json",
+    "output_results": "resources_test/openproblems/task_results_v4/processed/filtered_results.json",
+    "datasets_exclude": [
+        "cellxgene_census/tabula_sapiens",
+        "cellxgene_census/mouse_pancreas_atlas",
+    ],
+    "datasets_include": None,
+    "methods_exclude": None,
+    "methods_include": None,
+    "metrics_exclude": None,
+    "metrics_include": None,
+}
+meta = {"resources_dir": "target/executable/reporting/filter_results"}
+## VIASH END
+
+import json
+import subprocess
+import sys
+from pathlib import Path
+from typing import List, Dict, Any, Optional
+
+
+def validate_filtering_args():
+    """Validate that include/exclude arguments are mutually exclusive."""
+    if par["datasets_include"] and par["datasets_exclude"]:
+        raise ValueError(
+            "Cannot specify both --datasets_include and --datasets_exclude"
+        )
+
+    if par["methods_include"] and par["methods_exclude"]:
+        raise ValueError("Cannot specify both --methods_include and --methods_exclude")
+
+    if par["metrics_include"] and par["metrics_exclude"]:
+        raise ValueError("Cannot specify both --metrics_include and --metrics_exclude")
+
+
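+# Illustration (hypothetical data): with data_list = [{"name": "a"}, {"name": "b"}],
+# include_list=["a"] yields [{"name": "a"}], while exclude_list=["a"] yields
+# [{"name": "b"}]; filtering always matches on each item's "name" field.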
+def apply_name_filter(
+    data_list: List[Dict[str, Any]],
+    include_list: Optional[List[str]] = None,
+    exclude_list: Optional[List[str]] = None,
+    item_type: str = "item",
+) -> List[Dict[str, Any]]:
+    """Apply filtering to a list based on the "name" field."""
+    if not data_list:
+        return data_list
+
+    original_count = len(data_list)
+    item_names = [item["name"] for item in data_list]
+
+    if include_list:
+        items_to_include = set(item_names) & set(include_list)
+        if not items_to_include:
+            print(
+                f"Warning: None of the specified {item_type}s to include were found in the data",
+                file=sys.stderr,
+            )
+            return []
+
+        missing_items = set(include_list) - set(item_names)
+        if missing_items:
+            print(
+                f"Warning: The following {item_type}s specified in include list were not found: "
+                + ", ".join(missing_items),
+                file=sys.stderr,
+            )
+
+        filtered_data = [item for item in data_list if item["name"] in items_to_include]
+        print(f"Included {len(filtered_data)} out of {original_count} {item_type}s")
+        return filtered_data
+
+    elif exclude_list:
+        items_to_exclude = set(item_names) & set(exclude_list)
+
+        missing_items = set(exclude_list) - set(item_names)
+        if missing_items:
+            print(
+                f"Warning: The following {item_type}s specified in exclude list were not found: "
+                + ", ".join(missing_items),
+                file=sys.stderr,
+            )
+
+        filtered_data = [
+            item for item in data_list if item["name"] not in items_to_exclude
+        ]
+        print(
+            f"Excluded {len(items_to_exclude)} {item_type}s, keeping {len(filtered_data)} out of {original_count} {item_type}s"
+        )
+        return filtered_data
+
+    # No filtering applied
+    return data_list
+
+
+def filter_results_data(
+    results_data: List[Dict[str, Any]],
+    dataset_names: List[str],
+    method_names: List[str],
+    metric_names: List[str],
+) -> List[Dict[str, Any]]:
+    """Filter results based on dataset, method, and metric filters."""
+    if not results_data:
+        return results_data
+
+    original_count = len(results_data)
+
+    # Filter result entries based on dataset_name, method_name, and metric_names
+    filtered_results = []
+    for result in results_data:
+        dataset_keep = result["dataset_name"] in dataset_names
+        method_keep = result["method_name"] in method_names
+
+        # Check whether this result should be kept
+        if dataset_keep and method_keep:
+            filtered_result = result.copy()
+
+            filtered_metrics = [
+                (i, name)
+                for i, name in enumerate(result["metric_names"])
+                if name in metric_names
+            ]
+
+            # store metric names
+            filtered_result["metric_names"] = [name for _, name in filtered_metrics]
+
+            # store metric values
+            filtered_result["metric_values"] = [
+                result["metric_values"][i] for i, _ in filtered_metrics
+            ]
+
+            # store metric components
+            new_metric_components = []
+            for component in result.get("metric_components", []):
+                new_component = component.copy()
+                new_component["metric_names"] = [
+                    name for name in component["metric_names"] if name in metric_names
+                ]
+
+                # keep the component only if any of its metrics survived the filter
+                if new_component["metric_names"]:
+                    new_metric_components.append(new_component)
+            filtered_result["metric_components"] = new_metric_components
+
+            filtered_results.append(filtered_result)
+
+    print(
+        f"Filtered results: keeping {len(filtered_results)} out of {original_count} result entries"
+    )
+    return filtered_results
+
+
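+# Illustration (hypothetical entry): {"dataset_name": "d1", "method_name": "m1",
+# "metric_names": ["auc", "f1"], "metric_values": [0.9, 0.8]} filtered with
+# metric_names=["auc"] keeps the parallel arrays in sync, yielding
+# metric_names == ["auc"] and metric_values == [0.9]; metric_components, if
+# present, are pruned the same way.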
+def validate_json_against_schema(
+    json_file: str, schema_file: str, name: str
+) -> tuple[bool, str]:
+    """Validate a JSON file against its schema using ajv-cli.
+
+    Returns:
+        tuple[bool, str]: (is_valid, error_message)
+    """
+    try:
+        # Builds a command roughly equivalent to:
+        #   ajv validate --spec draft2020 -s <schema> -r schemas/results_v4/core.json -d <json_file>
+        cmd = [
+            "ajv",
+            "validate",
+            "--spec",
+            "draft2020",
+            "-s",
+            schema_file,
+            "-r",
+            str(Path(meta["resources_dir"]) / "schemas" / "results_v4" / "core.json"),
+            "-d",
+            json_file,
+        ]
+
+        result = subprocess.run(cmd, capture_output=True, text=True)
+
+        if result.returncode == 0:
+            print(f"✓ {name} validation passed")
+            return True, ""
+        else:
+            error_msg = ""
+            if result.stderr:
+                error_msg += f"stderr: {result.stderr.strip()}"
+            if result.stdout:
+                error_msg += f"\nstdout: {result.stdout.strip()}"
+            if not error_msg:
+                error_msg = "Unknown validation error"
+
+            return False, error_msg
+
+    except FileNotFoundError:
+        return False, "ajv-cli not found. Cannot validate schema"
+
+
+print("====== Filter results ======")
+
+# Validation
+print("\n>>> Validating arguments...")
+validate_filtering_args()
+
+# Read input files
+print("\n>>> Reading input files...")
+
+print(f'Reading dataset info from "{par["input_dataset_info"]}"...')
+with open(par["input_dataset_info"], "r") as f:
+    dataset_info = json.load(f)
+
+print(f'Reading method info from "{par["input_method_info"]}"...')
+with open(par["input_method_info"], "r") as f:
+    method_info = json.load(f)
+
+print(f'Reading metric info from "{par["input_metric_info"]}"...')
+with open(par["input_metric_info"], "r") as f:
+    metric_info = json.load(f)
+
+print(f'Reading results from "{par["input_results"]}"...')
+with open(par["input_results"], "r") as f:
+    results = json.load(f)
+
+# Apply filters
+print("\n>>> Applying filters...")
+
+print("Filtering datasets...")
+filtered_dataset_info = apply_name_filter(
+    dataset_info, par["datasets_include"], par["datasets_exclude"], "dataset"
+)
+
+print("Filtering methods...")
+filtered_method_info = apply_name_filter(
+    method_info, par["methods_include"], par["methods_exclude"], "method"
+)
+
+print("Filtering metrics...")
+filtered_metric_info = apply_name_filter(
+    metric_info, par["metrics_include"], par["metrics_exclude"], "metric"
+)
+
+# Get names for results filtering
+filtered_dataset_names = [item["name"] for item in filtered_dataset_info]
+filtered_method_names = [item["name"] for item in filtered_method_info]
+filtered_metric_names = [item["name"] for item in filtered_metric_info]
+
+print("Filtering results...")
+filtered_results = filter_results_data(
+    results, filtered_dataset_names, filtered_method_names, filtered_metric_names
+)
+
+# Write and validate output files
+print("\n>>> Writing and validating output files...")
+results_schemas_dir = Path(meta["resources_dir"]) / "schemas" / "results_v4"
+
+validation_files = [
+    {
+        "data": filtered_dataset_info,
+        "schema": "dataset_info.json",
+        "file": par["output_dataset_info"],
+        "name": "dataset info",
+    },
+    {
+        "data": filtered_method_info,
+        "schema": "method_info.json",
+        "file": par["output_method_info"],
+        "name": "method info",
+    },
+    {
+        "data": filtered_metric_info,
+        "schema": "metric_info.json",
+        "file": par["output_metric_info"],
+        "name": "metric info",
+    },
+    {
+        "data": filtered_results,
+        "schema": "results.json",
+        "file": par["output_results"],
+        "name": "results",
+    },
+]
+
+# Write each filtered output and validate it against its schema; failures are
+# collected so every file is checked before an error is raised.
+all_valid = True
+for validation in validation_files:
+    print(f'Writing {validation["name"]} to "{validation["file"]}"...')
+    with open(validation["file"], "w") as f:
+        json.dump(validation["data"], f, indent=2, ensure_ascii=False)
+
+    print(f'Validating {validation["name"]}...')
+    schema_file = str(results_schemas_dir / validation["schema"])
+    is_valid, error_msg = validate_json_against_schema(
+        validation["file"], schema_file, validation["name"]
+    )
+    if not is_valid:
+        print(f'✗ {validation["name"]} validation failed')
+        print(f"Validation error: {error_msg}")
+        all_valid = False
+
+if not all_valid:
+    raise RuntimeError("One or more output files do not conform to their schemas")
+
+# Summary
+print("\n>>> Summary of filtering results:")
+print(f"Datasets: {len(filtered_dataset_info)} (from {len(dataset_info)})")
+print(f"Methods: {len(filtered_method_info)} (from {len(method_info)})")
+print(f"Metrics: {len(filtered_metric_info)} (from {len(metric_info)})")
+print(f"Results: {len(filtered_results)} (from {len(results)})")
+
+print("\n>>> Done!")
diff --git a/src/reporting/process_task_results/config.vsh.yaml b/src/reporting/process_task_results/config.vsh.yaml
index e1703bf52..c3dcd3699 100644
--- a/src/reporting/process_task_results/config.vsh.yaml
+++ b/src/reporting/process_task_results/config.vsh.yaml
@@ -44,6 +44,63 @@ argument_groups:
       description: Nextflow execution trace file
       example: resources_test/openproblems/task_results_v4/raw/trace.txt

+  - name: Dataset filtering
+    description: |
+      Use these arguments to filter datasets by name. By default, all datasets are
+      included. If `--datasets_include` is defined, only those datasets are included.
+      If `--datasets_exclude` is defined, all datasets except those specified are
+      included. These arguments are mutually exclusive, so only `--datasets_include`
+      OR `--datasets_exclude` can be set, but not both.
+    arguments:
+      - name: "--datasets_include"
+        type: string
+        multiple: true
+        description: |
+          A list of dataset ids to include. If specified, only these datasets will be included.
+      - name: "--datasets_exclude"
+        type: string
+        multiple: true
+        description: |
+          A list of dataset ids to exclude. If specified, all datasets except the ones listed will be included.
+
+  - name: Method filtering
+    description: |
+      Use these arguments to filter methods by name. By default, all methods are
+      included. If `--methods_include` is defined, only those methods are included.
+      If `--methods_exclude` is defined, all methods except those specified are
+      included. These arguments are mutually exclusive, so only `--methods_include`
+      OR `--methods_exclude` can be set, but not both.
+    arguments:
+      - name: "--methods_include"
+        type: string
+        multiple: true
+        description: |
+          A list of method ids to include. If specified, only these methods will be included.
+      - name: "--methods_exclude"
+        type: string
+        multiple: true
+        description: |
+          A list of method ids to exclude. If specified, all methods except the ones listed will be included.
+
+  - name: Metric filtering
+    description: |
+      Use these arguments to filter metrics by name. By default, all metrics are
+      included. If `--metrics_include` is defined, only those metrics are included.
+      If `--metrics_exclude` is defined, all metrics except those specified are
+      included. These arguments are mutually exclusive, so only `--metrics_include`
+      OR `--metrics_exclude` can be set, but not both.
+    arguments:
+      - name: "--metrics_include"
+        type: string
+        multiple: true
+        description: |
+          A list of metric ids to include. If specified, only these metrics will be included.
+      - name: "--metrics_exclude"
+        type: string
+        multiple: true
+        description: |
+          A list of metric ids to exclude. If specified, all metrics except the ones listed will be included.
+
   - name: Outputs
     arguments:
       - name: "--output_combined"
@@ -137,6 +194,7 @@ dependencies:
   - name: reporting/get_metric_info
   - name: reporting/get_dataset_info
   - name: reporting/get_task_info
+  - name: reporting/filter_results
   - name: reporting/generate_qc
   - name: reporting/combine_output
   - name: reporting/render_report
diff --git a/src/reporting/process_task_results/main.nf b/src/reporting/process_task_results/main.nf
index 1fc64f389..059960d65 100644
--- a/src/reporting/process_task_results/main.nf
+++ b/src/reporting/process_task_results/main.nf
@@ -59,6 +59,32 @@ workflow run_wf {
       ]
     )

+    | filter_results.run(
+      runIf: { id, state ->
+        // Only run filtering if there are include/exclude lists defined
+        return state.datasets_exclude || state.methods_exclude || state.metrics_exclude ||
+          state.datasets_include || state.methods_include || state.metrics_include
+      },
+      fromState: [
+        "input_dataset_info": "output_dataset",
+        "input_method_info": "output_method",
+        "input_metric_info": "output_metric",
+        "input_results": "output_results",
+        "datasets_include": "datasets_include",
+        "datasets_exclude": "datasets_exclude",
+        "methods_include": "methods_include",
+        "methods_exclude": "methods_exclude",
+        "metrics_include": "metrics_include",
+        "metrics_exclude": "metrics_exclude"
+      ],
+      toState: [
+        "output_dataset": "output_dataset_info",
+        "output_method": "output_method_info",
+        "output_metric": "output_metric_info",
+        "output_results": "output_results"
+      ]
+    )
+
     | generate_qc.run(
       fromState: [
         "input_task_info": "output_task",