Test 4

jgchn · jgchn · commit 816193f1fef2 · 2025-03-29T14:46:57.000-04:00
Signed-off-by: Jing Chen <jing.chen2@ibm.com>
diff --git a/.github/workflows/run-examples-jing.yml b/.github/workflows/run-examples-jing.yml
@@ -61,6 +61,8 @@ jobs:
 
     # Run tests
     - uses: actions/checkout@v4
+      with:
+        ref: ${{ github.head_ref }}
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v5
       with:
@@ -90,4 +92,13 @@ jobs:
         WATSONX_URL: ${{ secrets.WATSONX_URL }}
         REPLICATE_API_TOKEN: ${{ secrets.REPLICATE_API_TOKEN }}
         OLLAMA_GHACTIONS_RESULTS: true
-      run: py.test -v --capture=tee-sys -rfE -s tests/test_examples_run.py
+      run: py.test -v --capture=tee-sys -rfE -s tests/test_examples_run_jing.py
+    # Commit result files that the test run created or changed under
+    # tests/results/ and push them to the checked-out branch.
+    - name: Update example result files (if any) generated from Ollama running on GH Actions
+      run: |
+        git config --local user.name github-actions[bot]
+        git config --local user.email "${{ github.actor_id }}+${{ github.actor }}@users.noreply.github.com"
+        # Stage first and then inspect the index: a plain `git diff` ignores
+        # untracked files, so brand-new result files would never be committed,
+        # and changes outside tests/results/ would lead to an empty commit.
+        git add tests/results/
+        if ! git diff --cached --quiet; then
+          git commit -m "GH Action updated results file when running examples $(date)"
+          git push
+        fi
diff --git a/.github/workflows/run-examples.yml b/.github/workflows/run-examples.yml
@@ -34,13 +34,14 @@ jobs:
       shell: bash
       run: |
         ollama pull granite3.2:2b
+        ollama pull granite3.2:8b
         ollama pull mxbai-embed-large
         ollama list
 
     - name: Check that all required models are available
       shell: bash
       run: |
-        models=("mxbai-embed-large" "granite3.2:2b")
+        models=("mxbai-embed-large" "granite3.2:2b" "granite3.2:8b")
         missing=0
         for model in "${models[@]}"; do
           if ! ollama list | awk 'NR>1 {print $1}' | grep -q "$model"; then
@@ -91,4 +92,5 @@ jobs:
         WATSONX_APIKEY: ${{ secrets.WATSONX_APIKEY }}
         WATSONX_URL: ${{ secrets.WATSONX_URL }}
         REPLICATE_API_TOKEN: ${{ secrets.REPLICATE_API_TOKEN }}
+        OLLAMA_GHACTIONS_RESULTS: true
       run: py.test -v --capture=tee-sys -rfE -s tests/test_examples_run.py
diff --git a/tests/test_examples_run_jing.py b/tests/test_examples_run_jing.py
@@ -0,0 +1,269 @@
+import io
+import os
+import pathlib
+import random
+from dataclasses import dataclass
+from typing import Optional
+
+from pytest import CaptureFixture, MonkeyPatch
+
+from pdl import pdl
+from pdl.pdl_ast import ScopeType
+from pdl.pdl_dumper import block_to_dict
+from pdl.pdl_lazy import PdlDict
+from pdl.pdl_parser import PDLParseError
+
+# This module (test_examples_run_jing.py) runs the examples and compares
+# each result to the expected results stored under tests/results/
+
+# When True, a mismatching result is written to a versioned result file
+UPDATE_RESULTS = True
+# Version tag used in the name of the result files written when updating
+RESULTS_VERSION = 1
+# Raw value of the OLLAMA_GHACTIONS_RESULTS environment variable ("" if unset)
+OLLAMA_GHACTIONS_RESULTS_ENV_VAR = os.getenv("OLLAMA_GHACTIONS_RESULTS", "")
+# True only for the value "true" (any letter case, surrounding blanks ignored)
+OLLAMA_GHACTIONS_RESULTS = OLLAMA_GHACTIONS_RESULTS_ENV_VAR.lower().strip() == "true"
+
+# Path strings of the programs that test_valid_programs never executes
+TO_SKIP = {
+    str(pathlib.Path(*parts))
+    for parts in [
+        # Requires dataset dependency
+        ("examples", "cldk", "cldk-assistant.pdl"),
+        ("examples", "gsm8k", "gsm8.pdl"),
+        ("examples", "gsm8k", "gsm8k-plan.pdl"),
+        # Requires installation dependencies
+        ("examples", "intrinsics", "demo-hallucination.pdl"),
+        # Skip RAG examples
+        ("examples", "rag", "pdf_index.pdl"),
+        ("examples", "rag", "pdf_query.pdl"),
+        # (This is glue to Python, it doesn't "run" alone)
+        ("examples", "rag", "rag_library1.pdl"),
+        # Skip structured decoding example (Jing doesn't have WATSONX API KEY)
+        ("examples", "tutorial", "structured_decoding.pdl"),
+        # Output result includes trace (and thus timing) for some reason. Investigate why
+        ("examples", "react", "react_call.pdl"),  # Very non-deterministic
+        ("pdl-live-react", "demos", "error.pdl"),
+        ("pdl-live-react", "demos", "demo1.pdl"),
+        ("pdl-live-react", "demos", "demo2.pdl"),
+        # For now, skip the granite-io examples
+        ("examples", "granite-io", "granite_io_hallucinations.pdl"),
+        ("examples", "granite-io", "granite_io_openai.pdl"),
+        ("examples", "granite-io", "granite_io_thinking.pdl"),
+        ("examples", "granite-io", "granite_io_transformers.pdl"),
+    ]
+}
+
+
+@dataclass
+class InputsType:
+    """Optional inputs supplied to a program when test_valid_programs runs it."""
+
+    # Text installed as sys.stdin before the program runs (None: stdin is left alone)
+    stdin: Optional[str] = None
+    # Initial scope passed to pdl.exec_file (None: an empty PdlDict is used)
+    scope: Optional[ScopeType] = None
+
+
+# Inputs (stdin text and/or initial scope) for the programs that need them,
+# keyed by the program's path string
+TESTS_WITH_INPUT: dict[str, InputsType] = {
+    str(pathlib.Path("examples", "tutorial", "programs", "chatbot.pdl")): InputsType(
+        stdin="What is APR?\nyes\n"
+    ),
+    str(pathlib.Path("examples", "tutorial", "input_stdin.pdl")): InputsType(
+        stdin="Hello\n"
+    ),
+    str(pathlib.Path("examples", "tutorial", "input_stdin_multiline.pdl")): InputsType(
+        stdin="Hello\nBye\n"
+    ),
+    str(pathlib.Path("examples", "input", "input_test1.pdl")): InputsType(
+        stdin="Hello\n"
+    ),
+    str(pathlib.Path("examples", "input", "input_test2.pdl")): InputsType(
+        stdin="Hello\n"
+    ),
+    str(pathlib.Path("examples", "chatbot", "chatbot.pdl")): InputsType(
+        stdin="What is APR?\nyes\n"
+    ),
+    str(pathlib.Path("examples", "demo", "7-chatbot-roles.pdl")): InputsType(
+        stdin="What is APR?\nquit\n"
+    ),
+    str(pathlib.Path("examples", "tutorial", "free_variables.pdl")): InputsType(
+        scope=PdlDict({"something": "ABC"})
+    ),
+}
+
+
+# Programs under tests/data/line listed as expected to fail with a parse error
+EXPECTED_PARSE_ERROR = [
+    pathlib.Path("tests", "data", "line", f"hello{suffix}.pdl")
+    for suffix in ("", "1", "4", "7", "8", "10", "11", "31")
+]
+
+# Programs listed as expected to fail while running: three examples followed
+# by the tests/data/line/hello<N>.pdl files, in the original listing order
+EXPECTED_RUNTIME_ERROR = [
+    pathlib.Path("examples", "callback", "repair_prompt.pdl"),
+    pathlib.Path("examples", "tutorial", "type_list.pdl"),
+    pathlib.Path("examples", "tutorial", "type_checking.pdl"),
+    *(
+        pathlib.Path("tests", "data", "line", f"hello{number}.pdl")
+        for number in [*range(12, 30), 3, 30, 9]
+    ),
+]
+
+
+def __write_to_results_file(
+    dir_name: pathlib.Path, filename: str, content: str
+) -> None:
+    """
+    Store content as UTF-8 text in dir_name / filename, creating dir_name
+    (and any missing parent directories) first; an existing file is overwritten
+    """
+
+    dir_name.mkdir(parents=True, exist_ok=True)
+    target = dir_name / filename
+    target.write_text(content, encoding="utf-8")
+
+
+def __find_and_compare_results(
+    test_file_name: pathlib.Path, actual_result: str
+) -> bool:
+    """
+    Return True when at least one <stem>.*.result file stored under
+    tests/results/<parent directory of test_file_name> holds the actual
+    output; leading and trailing whitespace is ignored on both sides
+    """
+
+    candidates_dir = pathlib.Path(".") / "tests" / "results" / test_file_name.parent
+    pattern = test_file_name.stem + ".*.result"
+    wanted = str(actual_result).strip()
+
+    # any() stops at the first matching file, like an early return in a loop
+    return any(
+        candidate.read_text(encoding="utf-8").strip() == wanted
+        for candidate in candidates_dir.glob(pattern)
+    )
+
+
+def test_valid_programs(capsys: CaptureFixture[str], monkeypatch: MonkeyPatch) -> None:
+    """
+    Run every program listed in `files` and compare its result with the
+    result files stored under tests/results/.
+
+    Programs in TO_SKIP are not run. Programs in TESTS_WITH_INPUT get their
+    stdin and/or initial scope from that table. A result that matches no
+    stored result file may be written to disk (see OLLAMA_GHACTIONS_RESULTS
+    and UPDATE_RESULTS) and is otherwise recorded as wrong. The test fails
+    on any parse error, runtime error or wrong result.
+
+    NOTE(review): capsys is requested but never used in the body.
+    """
+    # Path strings of the programs that raised PDLParseError / another exception
+    actual_parse_error: set[str] = set()
+    actual_runtime_error: set[str] = set()
+    # Maps a program's path string to {"actual": <result string>} on mismatch
+    wrong_results = {}
+
+    # Only a single example is exercised; the full glob is kept for reference
+    # files = pathlib.Path(".").glob("**/*.pdl")
+    files = [
+        pathlib.Path("examples") / "demo" / "4-function.pdl",
+    ]
+
+    for pdl_file_name in files:
+
+        # Default: start every program from an empty scope
+        scope: ScopeType = PdlDict({})
+        if str(pdl_file_name) in TO_SKIP:
+            continue
+        if str(pdl_file_name) in TESTS_WITH_INPUT:
+            inputs = TESTS_WITH_INPUT[str(pdl_file_name)]
+            if inputs.stdin is not None:
+                # Feed the canned text to the program through sys.stdin
+                monkeypatch.setattr(
+                    "sys.stdin",
+                    io.StringIO(inputs.stdin),
+                )
+            if inputs.scope is not None:
+                scope = inputs.scope
+        try:
+            # Fixed seed before each run of a program
+            random.seed(11)
+            output = pdl.exec_file(
+                pdl_file_name,
+                scope=scope,
+                output="all",
+                config=pdl.InterpreterConfig(batch=0),
+            )
+            result = output["result"]
+
+            # NOTE(review): the return value is discarded; presumably this only
+            # checks that the trace can be converted - confirm
+            block_to_dict(output["trace"], json_compatible=True)
+            result_dir_name = (
+                pathlib.Path(".") / "tests" / "results" / pdl_file_name.parent
+            )
+
+            if not __find_and_compare_results(pdl_file_name, str(result)):
+
+                if OLLAMA_GHACTIONS_RESULTS:
+                    print(
+                        "-------------------- Updating result from running Ollama on GitHub Actions -------------------- "
+                    )
+                    # This name matches the "<stem>.*.result" pattern globbed by
+                    # __find_and_compare_results, so the re-check below reads it back
+                    result_file_name = f"{pdl_file_name.stem}.ollama_ghactions.result"
+                    __write_to_results_file(
+                        result_dir_name, result_file_name, str(result)
+                    )
+
+                    # Evaluate the results again. If fails again, then consider this program as failing
+                    if not __find_and_compare_results(pdl_file_name, str(result)):
+                        wrong_results[str(pdl_file_name)] = {
+                            "actual": str(result),
+                        }
+                    # If evaluating results produces correct result, then this is considered passing
+                    else:
+                        continue
+
+                if UPDATE_RESULTS:
+                    # Also store the result as "<stem>.<RESULTS_VERSION>.result"
+                    result_file_name = (
+                        f"{pdl_file_name.stem}.{str(RESULTS_VERSION)}.result"
+                    )
+                    __write_to_results_file(
+                        result_dir_name, result_file_name, str(result)
+                    )
+
+                # Reached for every mismatch that was not resolved by the
+                # `continue` above, whether or not a result file was written
+                wrong_results[str(pdl_file_name)] = {
+                    "actual": str(result),
+                }
+        except PDLParseError:
+            actual_parse_error |= {str(pdl_file_name)}
+        except Exception as exc:
+            if str(pdl_file_name) not in set(str(p) for p in EXPECTED_RUNTIME_ERROR):
+                print(f"{pdl_file_name}: {exc}")  # unexpected error: breakpoint
+            actual_runtime_error |= {str(pdl_file_name)}
+            print(exc)
+
+    # Parse errors
+    # NOTE(review): built from an empty list, not from EXPECTED_PARSE_ERROR,
+    # so every parse error counts as unexpected - confirm this is intended
+    expected_parse_error = set(str(p) for p in [])
+    unexpected_parse_error = sorted(list(actual_parse_error - expected_parse_error))
+    assert (
+        len(unexpected_parse_error) == 0
+    ), f"Unexpected parse error: {unexpected_parse_error}"
+
+    # Runtime errors
+    # NOTE(review): built from an empty list, not from EXPECTED_RUNTIME_ERROR,
+    # so every runtime error counts as unexpected - confirm this is intended
+    expected_runtime_error = set(str(p) for p in [])
+    unexpected_runtime_error = sorted(
+        list(actual_runtime_error - expected_runtime_error)
+    )
+    assert (
+        len(unexpected_runtime_error) == 0
+    ), f"Unexpected runtime error: {unexpected_runtime_error}"
+
+    # Unexpected valid
+    # Programs that were expected to fail but did not; always empty while
+    # both expected sets above are empty
+    unexpected_valid = sorted(
+        list(
+            (expected_parse_error - actual_parse_error).union(
+                expected_runtime_error - actual_runtime_error
+            )
+        )
+    )
+    assert len(unexpected_valid) == 0, f"Unexpected valid: {unexpected_valid}"
+    # Unexpected results
+    assert len(wrong_results) == 0, f"Wrong results: {wrong_results}"