diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index b6fc8766..9f8c0146 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -7,7 +7,8 @@ steps:
           - JuliaCI/julia#v1:
               version: "{{matrix.julia}}"
           - JuliaCI/julia-test#v1: ~
-          - JuliaCI/julia-coverage#v1: ~
+          - JuliaCI/julia-coverage#v1:
+              coverage: "{{matrix.coverage}}"
         agents:
           queue: "juliagpu"
           cuda: "*"
@@ -31,12 +32,16 @@ steps:
               - "1.6"
               - "1.7"
               - "1.8"
-              - "1.9"
-              # - "nightly"
-          # adjustments:
-          #   - with:
-          #       julia: "nightly"
-          #       soft_fail: true
+            coverage:
+              - "false"
+          adjustments:
+            - with:
+                julia: "1.9"
+                coverage: true
+            # - with:
+            #     julia: "nightly"
+            #     coverage: "false"
+            #     soft_fail: true
 
   - group: ":racehorse: Benchmarks"
     steps:
diff --git a/test/Project.toml b/test/Project.toml
index 8828b9af..8d1be0aa 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -5,5 +5,6 @@ Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 XUnit = "3e3c03f2-1a94-11e9-2981-050a4ca824ab"
diff --git a/test/matmul.jl b/test/matmul.jl
index 90fc0baa..808d6971 100644
--- a/test/matmul.jl
+++ b/test/matmul.jl
@@ -1,7 +1,26 @@
 using CUDA
 using ForwardDiff
 using GemmKernels
-using LinearAlgebra
+import Octavian, LinearAlgebra
+
+# for large, non-BLAS-compatible matrices, use Octavian.
+matmul!(C, A, B, alpha=true, beta=false) = LinearAlgebra.mul!(C, A, B, alpha, beta)
+function matmul!(C::Array,
+                 A::Union{Array, LinearAlgebra.Transpose{<:Any, <:Array},
+                          LinearAlgebra.Adjoint{<:Any, <:Array}},
+                 B::Union{Array, LinearAlgebra.Transpose{<:Any, <:Array},
+                          LinearAlgebra.Adjoint{<:Any, <:Array}},
+                 alpha::Bool=true, beta::Bool=false)
+    supported = eltype(C) <: LinearAlgebra.BlasFloat &&
+                eltype(A) <: LinearAlgebra.BlasFloat &&
+                eltype(B) <: LinearAlgebra.BlasFloat &&
+                eltype(C) == eltype(A) == eltype(B)
+    if !supported && (sizeof(C) > 2^20 || sizeof(A) > 2^20 || sizeof(B) > 2^20)
+        Octavian.matmul!(C, A, B, alpha, beta)
+    else
+        LinearAlgebra.mul!(C, A, B, alpha, beta)
+    end
+end
 
 
 ################################################################################
@@ -63,7 +82,7 @@ using LinearAlgebra
             new_a_h = transpose_a ? transpose(a_h) : a_h
             new_b_h = transpose_b ? transpose(b_h) : b_h
 
-            mul!(c_h, new_a_h, new_b_h, alpha, beta)
+            matmul!(c_h, new_a_h, new_b_h, alpha, beta)
             if A_type <: Integer
                 @test c_h ≈ Array(d)
             else
@@ -121,7 +140,7 @@ using LinearAlgebra
             new_a_h = transpose_a ? transpose(a_h) : a_h
             new_b_h = transpose_b ? transpose(b_h) : b_h
 
-            mul!(c_h, new_a_h, new_b_h, alpha, beta)
+            matmul!(c_h, new_a_h, new_b_h, alpha, beta)
             @test c_h ≈ Array(d) rtol=sqrt(eps(A_type))
         end
     end
@@ -222,7 +241,7 @@ using LinearAlgebra
             new_a_h = transpose_a ? transpose(a_h) : a_h
             new_b_h = transpose_b ? transpose(b_h) : b_h
 
-            mul!(c_h, new_a_h, new_b_h, alpha, beta)
+            matmul!(c_h, new_a_h, new_b_h, alpha, beta)
             @test c_h ≈ Array(d) rtol=sqrt(eps(AB_type))
         end
     end
@@ -274,7 +293,7 @@ using LinearAlgebra
             new_a_h = transpose_a ? transpose(a_h) : a_h
             new_b_h = transpose_b ? transpose(b_h) : b_h
 
-            mul!(c_h, new_a_h, new_b_h, true, true)
+            matmul!(c_h, new_a_h, new_b_h, true, true)
             @test c_h .+ Array(bias) ≈ Array(d) rtol=sqrt(eps(Float16))
         end
     end
@@ -319,7 +338,7 @@ using LinearAlgebra
             new_a_h = transpose_a ? transpose(a_h) : a_h
             new_b_h = transpose_b ? transpose(b_h) : b_h
 
-            mul!(c_h, Diagonal(new_a_h), new_b_h, true, true)
+            matmul!(c_h, Diagonal(new_a_h), new_b_h, true, true)
             @test c_h ≈ Array(d) rtol=sqrt(eps(Float16))
         end
     end
@@ -383,7 +402,7 @@ using LinearAlgebra
             new_a_h = transpose_a ? transpose(new_a_h) : new_a_h
             new_b_h = transpose_b ? transpose(new_b_h) : new_b_h
 
-            mul!(c_h, new_a_h, new_b_h, true, true)
+            matmul!(c_h, new_a_h, new_b_h, true, true)
             @test c_h ≈ Array(d) rtol=sqrt(eps(Float16))
         end
     end
@@ -436,7 +455,7 @@ using LinearAlgebra
             c_dual = reinterpret(ForwardDiff.Dual{Float32,Float32,1}, c_h)
             d_dual = reinterpret(ForwardDiff.Dual{Float32,Float32,1}, Array(d))
 
-            mul!(c_dual, a_dual, b_dual, true, true)
+            matmul!(c_dual, a_dual, b_dual, true, true)
             @test c_dual ≈ d_dual rtol=sqrt(eps(Float16))
         end
     end