
Commit 44af848

[Build] Various fixes (#936)
The builds have broken in a number of interesting ways:
- The required CMake configuration for `llama-cpp-python` has changed slightly, so update the various workflow files (not to mention the README).
- A new version of Phi3 also appears to have started giving trouble with another test, so fix that as well.
- One of the LlamaCpp tests is consistently failing on Windows (and only Windows), so add a conditional xfail (see the sketch below).
1 parent 6e4ee06 commit 44af848
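
As a quick illustration of the conditional-xfail pattern referenced in the last bullet, here is a minimal, hypothetical sketch (the test name and assertion are placeholders, not code from this commit); the marker only takes effect when its condition is true, so a failure on any other platform still fails the build:

```python
import platform

import pytest


@pytest.mark.xfail(
    condition=platform.system() == "Windows",
    reason="illustrative: this check is expected to fail on Windows only",
)
def test_platform_sensitive_behaviour():
    # Reported as XFAIL on Windows instead of breaking the run;
    # on Linux/macOS the marker is inert and a failure is a real failure.
    assert platform.system() != "Windows"
```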

File tree

6 files changed: +19 −19


.github/workflows/action_gpu_basic_tests.yml

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ jobs:
      run: |
        pip install accelerate
        pip uninstall -y llama-cpp-python
-       CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75"
+       CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75"
    - name: Check GPU available
      run: |
        python -c "import torch; assert torch.cuda.is_available()"

.github/workflows/ci_tests.yml

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@ jobs:
    - name: GPU pip installs
      run: |
        pip install accelerate
-       CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75"
+       CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75"
    - name: Check GPU available
      run: |
        python -c "import torch; assert torch.cuda.is_available()"

.github/workflows/notebook_tests.yml

Lines changed: 1 addition & 1 deletion
@@ -56,7 +56,7 @@ jobs:
    - name: GPU pip installs
      run: |
        pip install accelerate
-       CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75"
+       CMAKE_ARGS="-DGGML_CUDA=on" pip install "llama-cpp-python!=0.2.58,!=0.2.75"
    - name: Check GPU available
      run: |
        python -c "import torch; assert torch.cuda.is_available()"

README.md

Lines changed: 1 addition & 1 deletion
@@ -648,7 +648,7 @@ time.time() - a
### llama.cpp
Install the python bindings:
```bash
-CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
+CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python
```
Loading the model:
```python
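
The hunk above stops just short of the README's model-loading snippet. As a rough, hypothetical sketch of that step (the GGUF path and `n_gpu_layers` value are placeholders, and passing `n_gpu_layers` through `guidance.models.LlamaCpp` to llama-cpp-python is an assumption, not something this commit shows):

```python
from guidance import gen, models

# Placeholder path to a local GGUF model; n_gpu_layers=-1 asks llama-cpp-python
# (built with GGML_CUDA) to offload every layer to the GPU.
llama2 = models.LlamaCpp("path/to/model.gguf", n_gpu_layers=-1)

lm = llama2 + "How much is 2 + 2? " + gen(name="answer", max_tokens=10)
print(lm["answer"])
```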

tests/model_specific/test_llama_cpp.py

Lines changed: 6 additions & 1 deletion
@@ -1,3 +1,5 @@
+import platform
+
import numpy as np
import pytest

@@ -49,8 +51,11 @@ def test_llama_cpp_select2(llamacpp_model: guidance.models.Model):
    ]


+@pytest.mark.xfail(
+    condition=platform.system() == "Windows",
+    reason="llama-cpp-python >=0.2.79 appears to have made models non-deterministic on Windows",
+)
def test_repeat_calls(llamacpp_model: guidance.models.Model):
-    # llama-cpp-python 0.2.79 appears to have made models non-deterministic on Windows
    llama2 = llamacpp_model
    a = []
    lm = llama2 + "How much is 2 + 2? " + gen(name="test", max_tokens=10)

tests/model_specific/test_transformers.py

Lines changed: 9 additions & 14 deletions
@@ -15,10 +15,7 @@ def phi3_model(selected_model, selected_model_name):

@pytest.fixture(scope="module")
def llama3_model(selected_model, selected_model_name):
-    if (
-        selected_model_name in ["transformers_llama3cpu_8b"]
-        and selected_model is not None
-    ):
+    if selected_model_name in ["transformers_llama3cpu_8b"] and selected_model is not None:
        return selected_model
    else:
        pytest.skip("Requires Llama3 model (needs HF_TOKEN to be set)")
@@ -27,7 +24,7 @@ def llama3_model(selected_model, selected_model_name):
def test_gpt2():
    gpt2 = get_model("transformers:gpt2")
    lm = gpt2 + "this is a test" + gen("test", max_tokens=10)
-
+
    assert len(str(lm)) > len("this is a test")


@@ -42,9 +39,7 @@ def test_recursion_error():
    {gen('verse', max_tokens=2)}
    """
    )
-    assert len(str(lm)) > len(
-        "Tweak this proverb to apply to model instructions instead.\n\n"
-    )
+    assert len(str(lm)) > len("Tweak this proverb to apply to model instructions instead.\n\n")


TRANSFORMER_MODELS = {
@@ -81,6 +76,7 @@ def test_transformer_smoke_select(model_name, model_kwargs):

# Phi-3 specific tests

+
@pytest.mark.skip("Don't overload the build machines")
def test_phi3_transformers_orig():
    import torch
@@ -116,11 +112,10 @@ def test_phi3_transformers_orig():
def test_phi3_chat_basic(phi3_model: models.Model):
    lm = phi3_model

-    lm += "You are a counting bot. Just keep counting numbers."
    with user():
-        lm += "1,2,3,4"
+        lm += "You are a counting bot. Just keep counting numbers."
    with assistant():
-        lm += gen(name="five", max_tokens=10)
+        lm += "1,2,3,4," + gen(name="five", max_tokens=20)

    assert "5" in lm["five"]

@@ -143,7 +138,7 @@ def test_phi3_newline_chat(phi3_model: models.Model):
    with assistant():
        lm += "\n" + gen(name="five", max_tokens=1)
        lm += "\n" + gen(name="six", max_tokens=1)
-
+
    # This test would raise an exception earlier if we didn't fix the tokenizer.
    assert True

@@ -155,7 +150,7 @@ def test_phi3_unstable_tokenization(phi3_model: models.Model):
    with user():
        lm += "1,2,3,4,"
    with assistant():
-        lm += "\n" # comment and uncomment this line to get the error
+        lm += "\n"  # comment and uncomment this line to get the error
        lm += gen(name="five", max_tokens=1)
        lm += "," + gen(name="six", max_tokens=1)

@@ -168,4 +163,4 @@ def test_phi3_basic_completion_badtokens(phi3_model: models.Model):
    lm += f"""<|use\n\nYou are a counting bot. Just keep counting numbers.<|end|><|assistant|>1,2,3,4,"""
    lm += gen("five", max_tokens=10)

-    assert len(lm["five"]) > 0
+    assert len(lm["five"]) > 0
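
Restating the reworked Phi-3 chat test as a standalone, hypothetical helper (the function name is illustrative, not from the repository): the counting instruction now lives inside the user turn and the assistant turn is primed with "1,2,3,4,", which is the prompt layout the newer Phi3 release appears to handle correctly:

```python
from guidance import assistant, gen, user


def count_to_five(phi3_model):
    # Mirrors the fixed test: the instruction goes in the user turn, then the
    # assistant turn is primed with "1,2,3,4," before generating.
    lm = phi3_model
    with user():
        lm += "You are a counting bot. Just keep counting numbers."
    with assistant():
        lm += "1,2,3,4," + gen(name="five", max_tokens=20)
    return lm["five"]
```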

0 commit comments
