Plotting Provenance (#24)

M-Jafarkhani · web-flow · commit 5398ceeb5663 · 2025-09-24T05:59:10.000+02:00
* add process-artifacts job

* Add parameter extractor

* Fix yml file

* Fix Parameter Extractor

* Add provenance plotting

* Fix python path

* Add python packages

* Edit Extractor

* Edit filename

* Edit SPARQL query

* Fix SPARQL query

* Switch to conda for plotting

* Fix Env

* Merge two jobs

* Edit tool name extraction

* Print packages

* Rollback to separate jobs

* Fix Extractor

* Fix post-processing env

* Fix plotting bug

* Fix tool extraction

* Fix SPARQL query and parameter extraction

* Fix file name and duplicate envs

* Fix param extraction

* Remove unit

* SPARQL refactor

* Fix version finding

* Fix Param Extractor

* Fix case-insensitive

* Remove tool from paramscript

* Add dynamic tool names

* Fix QUDT units

* Remove qudt extraction

* Remove unused method

* Installing version 1.0.0 and unzipping with file name

* Fix file name

* Merge run-benchmark

* Fix renaming
diff --git a/.github/workflows/run-benchmark.yml b/.github/workflows/run-benchmark.yml
@@ -49,8 +49,11 @@ jobs:
         run: |
           cd $GITHUB_WORKSPACE/benchmarks/linear-elastic-plate-with-hole/
           snakemake --use-conda --force --cores 'all'
-          snakemake --use-conda --force --cores 'all' --reporter metadata4ing
-
+          snakemake --use-conda --force --cores all \
+            --reporter metadata4ing \
+            --report-metadata4ing-paramscript parameter_extractor.py \
+            --report-metadata4ing-filename metadata4ing_provenance
+      
       - name: run_linear-elastic-plate-with-hole-benchmarks_nextflow
         shell: bash -l {0}
         run: |
@@ -62,11 +65,48 @@ jobs:
         with:
             name: snakemake_results_linear-elastic-plate-with-hole
             path: |
-              benchmarks/linear-elastic-plate-with-hole/*.zip
+              benchmarks/linear-elastic-plate-with-hole/metadata4ing_provenance.zip
 
       - name: Archive Linear Elastic plate with a hole benchmark data for nextflow
         uses: actions/upload-artifact@v4
         with:
             name: nextflow_results_linear-elastic-plate-with-hole
             path: |
               benchmarks/linear-elastic-plate-with-hole/nextflow_results/
+
+  process-artifacts:
+    runs-on: ubuntu-latest
+    needs: tests  # starts only after the "tests" job has completed successfully
+    steps:
+      - name: Checkout repo content
+        uses: actions/checkout@v4  # same major version as the other v4 actions used here
+
+      - name: Download artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: snakemake_results_linear-elastic-plate-with-hole
+          path: ./artifact_files
+
+      - name: Unzip metadata4ing_provenance.zip
+        run: |
+          mkdir -p ./metadata4ing_provenance
+          unzip -o ./artifact_files/metadata4ing_provenance.zip -d ./metadata4ing_provenance
+
+      - name: Setup Mambaforge with postprocessing env
+        uses: conda-incubator/setup-miniconda@v3
+        with:
+            miniforge-version: latest
+            activate-environment: postprocessing
+            use-mamba: true
+            environment-file: benchmarks/linear-elastic-plate-with-hole/environment_postprocessing.yml
+
+      - name: Run plotting script
+        shell: bash -l {0}  # login shell, so the activated conda environment is picked up
+        run: |
+          python benchmarks/linear-elastic-plate-with-hole/plot_provenance.py ./metadata4ing_provenance
+
+      - name: Upload PDF plot as artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: element-size-vs-stress-plot
+          path: element_size_vs_stress.pdf  # relative path: the plot lands in the working directory
diff --git a/benchmarks/linear-elastic-plate-with-hole/environment_postprocessing.yml b/benchmarks/linear-elastic-plate-with-hole/environment_postprocessing.yml
@@ -1,10 +1,13 @@
 name: postprocessing
 channels:
   - conda-forge
+  - defaults
 
 channel_priority: strict
 
 dependencies:
   - python=3.12
   - pint
   - pyvista
+  - rdflib
+  - matplotlib
diff --git a/benchmarks/linear-elastic-plate-with-hole/parameter_extractor.py b/benchmarks/linear-elastic-plate-with-hole/parameter_extractor.py
@@ -0,0 +1,59 @@
+import json
+import os
+from snakemake_report_plugin_metadata4ing.interfaces import (
+    ParameterExtractorInterface,
+)
+
+class ParameterExtractor(ParameterExtractorInterface):
+    """Collect report metadata from the JSON files of ``run_*``/``postprocess_*`` rules."""
+
+    def extract_params(self, rule_name: str, file_path: str) -> dict:
+        """Return ``{rule_name: {relation: [entries]}}`` for one rule file.
+
+        ``parameters_*.json`` gives "has parameter" entries for every top-level
+        key; ``solution_*.json`` gives an "investigates" entry for
+        ``max_von_mises_stress_nodes``. Other files or rules give ``{}``.
+        """
+        file_name = os.path.basename(file_path)
+        is_rule = rule_name.startswith(("postprocess_", "run_"))
+        if not (is_rule and file_name.endswith(".json")):
+            return {}
+        if file_name.startswith("parameters_"):
+            relation = "has parameter"
+        elif file_name.startswith("solution_"):
+            relation = "investigates"
+        else:
+            return {}
+        with open(file_path) as f:
+            data = json.load(f)
+        entries = []
+        for key, val in data.items():
+            if relation == "investigates":
+                if key != "max_von_mises_stress_nodes":
+                    continue
+                entry = self._entry(val, None, f"/{key}", "schema:Float")
+            elif isinstance(val, dict):
+                # str(): a nested-quote f-string needs Python >= 3.12 (PEP 701).
+                unit = str(val["unit"]) if "unit" in val else None
+                entry = self._entry(val["value"], unit, f"/{key}/value")
+            else:
+                entry = self._entry(val, None, f"/{key}")
+            entries.append({key: entry})
+        return {rule_name: {relation: entries}}
+
+    def _entry(self, value, unit, json_path, data_type=None):
+        """Build one entry; ``data_type`` is inferred from ``value`` if omitted."""
+        if data_type is None:
+            data_type = self._get_type(value)
+        return {"value": value, "unit": unit, "json-path": json_path, "data-type": data_type}
+
+    def _get_type(self, val):
+        """Map a Python scalar to a schema.org datatype label, or ``None``."""
+        # NOTE(review): bool subclasses int, so True/False come back as Integer.
+        if isinstance(val, float):
+            return "schema:Float"
+        if isinstance(val, int):
+            return "schema:Integer"
+        if isinstance(val, str):
+            return "schema:Text"
+        return None
diff --git a/benchmarks/linear-elastic-plate-with-hole/plot_provenance.py b/benchmarks/linear-elastic-plate-with-hole/plot_provenance.py
@@ -0,0 +1,153 @@
+import os
+import argparse
+from rdflib import Graph
+import matplotlib.pyplot as plt
+from collections import defaultdict
+from generate_config import workflow_config
+
+def load_graphs(base_dir):
+    """Parse every ``*.jsonld`` file found under base_dir into an rdflib Graph."""
+    loaded = []
+    for dirpath, _, filenames in os.walk(base_dir):
+        for name in filenames:
+            if not name.endswith(".jsonld"):
+                continue
+            path = os.path.join(dirpath, name)
+            # Best effort: a file that cannot be parsed is reported and skipped.
+            try:
+                graph = Graph()
+                graph.parse(path, format='json-ld')
+                loaded.append(graph)
+                print(f"✅ Parsed: {path}")
+            except Exception as err:
+                print(f"❌ Failed to parse {path}: {err}")
+    print(f"\nTotal graphs loaded: {len(loaded)}")
+    return loaded
+
+
+def query_and_build_table(graph_list):
+    """Query every graph for first-order results and tabulate them.
+
+    Each graph is searched with the same SPARQL query for processing steps
+    whose ``element_order`` and ``element_degree`` parameters are both 1 and
+    whose tool label contains, case-insensitively, one of the names in
+    ``workflow_config["tools"]``.
+
+    Returns:
+        ``(headers, table_data)``: the column names, and one
+        ``[element size, max von Mises stress, tool name]`` row per query
+        result, sorted by element size.
+    """
+    # One CONTAINS() clause per configured tool, OR-ed together.
+    filter_conditions = " || ".join(
+        f'CONTAINS(LCASE(?tool_name), "{name.lower()}")'
+        for name in workflow_config["tools"]
+    )
+    # NOTE(review): schema:, m4i: and rdfs: are not declared in the query text;
+    # presumably they resolve through each graph's namespace bindings - confirm.
+    query = f"""
+    PREFIX cr: <http://mlcommons.org/croissant/>
+    PREFIX sio: <http://semanticscience.org/resource/>
+
+    SELECT DISTINCT ?value_element_size ?value_max_von_mises_stress_gauss_points ?tool_name
+    WHERE {{
+      ?processing_step a schema:Action ;
+            m4i:hasParameter ?element_size ;
+            m4i:hasParameter ?element_order ;
+            m4i:hasParameter ?element_degree ;
+            m4i:investigates ?max_von_mises_stress_gauss_points ;
+            schema:instrument ?tool .
+    
+      ?max_von_mises_stress_gauss_points a schema:PropertyValue ;
+            rdfs:label "max_von_mises_stress_nodes" ;
+            schema:value ?value_max_von_mises_stress_gauss_points .
+            
+      ?element_order a schema:PropertyValue ;
+            rdfs:label "element_order" ;
+            schema:value 1 .
+
+      ?element_degree a schema:PropertyValue ;
+            rdfs:label "element_degree" ;
+            schema:value 1 .
+
+      ?element_size a schema:PropertyValue ;
+            rdfs:label "element_size" ;
+            schema:value ?value_element_size .
+
+      ?tool a schema:SoftwareApplication ;
+            rdfs:label ?tool_name .
+            
+      FILTER ({filter_conditions})
+    }}
+    """
+
+    headers = ["element-size", "max-mises-stress", "Tool Name"]
+    table_data = [
+        [
+            hit.value_element_size,
+            hit.value_max_von_mises_stress_gauss_points,
+            hit.tool_name,
+        ]
+        for graph in graph_list
+        for hit in graph.query(query)
+    ]
+
+    # Order the rows by the element-size column.
+    size_col = headers.index("element-size")
+    table_data.sort(key=lambda entry: entry[size_col])
+
+    return headers, table_data
+
+
+def plot_element_size_vs_stress(headers, table_data, output_file="element_size_vs_stress.pdf"):
+    """Plot max von Mises stress against element size, one line per tool.
+
+    Columns are located through ``headers``; the "element-size" and
+    "max-mises-stress" values of every row in ``table_data`` are converted
+    with ``float``. The figure is saved to ``output_file`` and then closed so
+    that repeated calls do not accumulate open figures.
+    """
+    size_col = headers.index("element-size")
+    stress_col = headers.index("max-mises-stress")
+    tool_col = headers.index("Tool Name")
+
+    series = defaultdict(list)
+    for row in table_data:
+        point = (float(row[size_col]), float(row[stress_col]))
+        series[row[tool_col]].append(point)
+    # Every distinct element size gets its own tick.
+    x_ticks = sorted({x for points in series.values() for x, _ in points})
+
+    fig = plt.figure(figsize=(12, 5))
+    try:
+        for tool, points in series.items():
+            # Sort by element size so each line runs left to right.
+            x_vals, y_vals = zip(*sorted(points))
+            plt.plot(x_vals, y_vals, marker='o', linestyle='-', label=tool)
+
+        plt.xlabel("element-size")
+        plt.ylabel("max-mises-stress")
+        plt.title("element-size vs max-mises-stress by Tool\n(element-order = 1 , element-degree = 1)")
+        plt.legend(title="Tool Name")
+        plt.grid(True)
+
+        # Log-scaled x-axis, with ticks labelled by the actual element sizes.
+        plt.xscale('log')
+        plt.xticks(ticks=x_ticks, labels=[str(x) for x in x_ticks], rotation=45)
+        plt.tight_layout()
+
+        plt.savefig(output_file)
+        print(f"Plot saved as {output_file}")
+    finally:
+        # pyplot keeps a figure registered until it is closed explicitly.
+        plt.close(fig)
+
+
+if __name__ == "__main__":
+    # CLI entry point: load the artifact folder, query it, and write the PDF plot.
+    cli = argparse.ArgumentParser(description="Process JSON-LD artifacts and display simulation results.")
+    cli.add_argument("artifact_folder", type=str, help="Path to the folder containing unzipped artifacts")
+    artifact_folder = cli.parse_args().artifact_folder
+
+    headers, table_data = query_and_build_table(load_graphs(artifact_folder))
+    plot_element_size_vs_stress(headers, table_data, output_file="element_size_vs_stress.pdf")
diff --git a/environment_benchmarks.yml b/environment_benchmarks.yml
@@ -11,4 +11,4 @@ dependencies:
   - conda
   - pip
   - pip:
-      - "--editable=git+https://github.com/izus-fokus/snakemake-report-plugin-metadata4ing.git#egg=snakemake-report-plugin-metadata4ing"
+      - "git+https://github.com/izus-fokus/snakemake-report-plugin-metadata4ing@v1.0.0#egg=snakemake-report-plugin-metadata4ing"