diff --git a/Project.toml b/Project.toml
index cd47a89..7fd9f0d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -11,12 +11,14 @@ Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 
 [compat]
 CUDA = "2, 3"
-CUDAKernels = "0.1, 0.2"
+CUDAKernels = "0.1, 0.2, 0.3"
 ChainRulesCore = "0.10"
 DiffRules = "1"
 FillArrays = "0.11"
+FLoops = "0.1.10"
+FoldsCUDA = "0.1.5"
 ForwardDiff = "0.10"
-KernelAbstractions = "0.6"
+KernelAbstractions = "0.6, 0.7"
 LoopVectorization = "0.12.48"
 NamedDims = "0.2"
 OffsetArrays = "1"
@@ -31,6 +33,8 @@ julia = "1.5"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57"
 FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
+FLoops = "cc61a311-1640-44b5-9fba-1b764f453329"
+FoldsCUDA = "6cd66ae4-5932-4b96-926d-e73e578e42cc"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -46,4 +50,4 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [targets]
-test = ["Test", "CUDA", "CUDAKernels", "FillArrays", "ForwardDiff", "KernelAbstractions", "LinearAlgebra", "LoopVectorization", "NamedDims", "OffsetArrays", "Printf", "Random", "TensorOperations", "Tracker", "VectorizationBase", "Zygote"]
+test = ["Test", "CUDA", "CUDAKernels", "FillArrays", "FLoops", "FoldsCUDA", "ForwardDiff", "KernelAbstractions", "LinearAlgebra", "LoopVectorization", "NamedDims", "OffsetArrays", "Printf", "Random", "TensorOperations", "Tracker", "VectorizationBase", "Zygote"]
diff --git a/README.md b/README.md
index 2178548..6854d5b 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,9 @@ But it also co-operates with various other packages, provided they are loaded be
 
 * It uses [`LoopVectorization.@avx`](https://github.com/chriselrod/LoopVectorization.jl) to speed many things up. (Disable with keyword `avx=false`.) On a good day this will match the speed of OpenBLAS for matrix multiplication.
 
-* It uses [`KernelAbstractions.@kernel`](https://github.com/JuliaGPU/KernelAbstractions.jl) (plus CUDAKernels) to make a GPU version. (Disable with `cuda=false`.) This is somewhat experimental, and may not be fast.
+* It can use [`KernelAbstractions.@kernel`](https://github.com/JuliaGPU/KernelAbstractions.jl) (plus CUDAKernels) to make a GPU version. (Disable with `cuda=false`.) This is somewhat experimental, and may not be fast.
+
+* It can also use [`FLoops.@floop`](https://github.com/JuliaFolds/FLoops.jl), in particular to execute using [FoldsCUDA.jl](https://github.com/JuliaFolds/FoldsCUDA.jl) on the GPU.
 
 The macro also tries to provide a gradient for use with [Tracker](https://github.com/FluxML/Tracker.jl) or (via [ChainRules](https://github.com/JuliaDiff/ChainRules.jl)) for [Zygote](https://github.com/FluxML/Zygote.jl), [Yota](https://github.com/dfdx/Yota.jl), etc. (Disable with `grad=false`, or `nograd=A`.) This is done in one of two ways:
 
diff --git a/src/macro.jl b/src/macro.jl
index 0f0aa0b..60765aa 100644
--- a/src/macro.jl
+++ b/src/macro.jl
@@ -102,6 +102,7 @@ OPTS = Dict(
     :grad => [false, :Base, :Dual],
     :avx => Integer,
     :cuda => Integer,
+    :floops => [true, false],
     :tensor => [true, false],
     )
 
@@ -111,6 +112,7 @@ _THREADS = Ref{Any}(true)
 _GRAD = Ref{Any}(:Base)
 _AVX = Ref{Any}(true)
 _CUDA = Ref{Any}(true)
+_FLOOPS = Ref{Any}(false)
 
 function parse_options(exs...)
     opts = Dict{Symbol,Any}(
@@ -123,6 +125,7 @@ function parse_options(exs...)
         :grad => _GRAD[],
         :avx => _AVX[],
         :cuda => _CUDA[],
+        :floops => _FLOOPS[],
         :tensor => false,
         )
     expr = nothing
@@ -178,6 +181,7 @@ function parse_options(exs...)
         _GRAD[] = opts[:grad]
         _AVX[] = opts[:avx]
         _CUDA[] = opts[:cuda]
+        _FLOOPS[] = opts[:floops]
     end
     opts[:tensor] == false || @warn "option tensor=true is deprecated, try Tullio.@tensor"
     (redfun=opts[:redfun],
@@ -189,6 +193,7 @@ function parse_options(exs...)
     grad=opts[:grad],
     avx=opts[:avx],
     cuda=opts[:cuda],
+    floops=opts[:floops],
     nograd=nograd,
     ), ranges, expr
 end
@@ -1059,6 +1064,42 @@ function make_many_actors(act!, args, ex1, outer::Vector, ex3, inner::Vector, ex
         store.verbose>0 && @warn "can't parallelise this gradient, no shared indices $note"
     end
 
+    #===== FLoops =====#
+
+    if store.floops != false && isdefined(store.mod, :FLoops)
+        try
+            info1 = store.verbose>0 ? :(@info "running FLoops actor $($note)" maxlog=3 _id=$(hash(store))) : nothing
+            fex1 = quote
+
+                local @inline function $act!(::Type{<:AbstractArray}, $(args...), $KEEP=nothing, $FINAL=true) where {$TYP}
+                    $info1
+                    FLoops.@floop begin $ex1; $ex2 end
+                end
+
+            end
+            store.verbose==2 && @info "=====FL===== FLoops actor $note" verbosetidy(fex1)
+            if store.threads==false
+                # same dodgy switch as for KernelAbstractions, threads=false routes CPU calculation here:
+                push!(store.outpre, macroexpand(store.mod, fex1))
+            end
+            if isdefined(store.mod, :FoldsCUDA) && isdefined(store.mod, :CUDA)
+                info2 = store.verbose>0 ? :(@info "running FLoops + CUDA actor $($note)" maxlog=3 _id=$(hash(store))) : nothing
+                fex2 = quote
+
+                    local @inline function $act!(::Type{<:CUDA.CuArray}, $(args...), $KEEP=nothing, $FINAL=true) where {$TYP}
+                        $info2
+                        FLoops.@floop FoldsCUDA.CUDAEx() begin $ex1; $ex2 end
+                    end
+
+                end
+                push!(store.outpre, macroexpand(store.mod, fex2))
+            end
+            store.verbose==2 && @info "success expanding FLoops.@floop"
+        catch err
+            store.verbose>0 && @warn "FLoops failed $note" err
+        end
+    end
+
     #===== LoopVectorization =====#
 
     expre, exloop0, expost = if isempty(outer)
@@ -1159,11 +1200,11 @@ function make_many_actors(act!, args, ex1, outer::Vector, ex3, inner::Vector, ex
         end
         store.verbose==2 && @info "=====KA===== KernelAbstractions kernel $note" verbosetidy(kex1)
         push!(store.outpre, macroexpand(store.mod, kex1))
-        if isdefined(store.mod, :CUDA) && isdefined(store.mod, :CuArray) # new-style, CUDA.jl, with CUDADevice()
+        if isdefined(store.mod, :CUDA)
             info2 = store.verbose>0 ? :(@info "running KernelAbstractions + CUDA actor $($note)" maxlog=3 _id=$(hash(store))) : nothing
             kex2 = quote
 
-                local @inline function $act!(::Type{<:CuArray}, $(args...), $KEEP=nothing, $FINAL=true) where {$TYP}
+                local @inline function $act!(::Type{<:CUDA.CuArray}, $(args...), $KEEP=nothing, $FINAL=true) where {$TYP}
                     $info2
                     cu_kern! = $kernel(CUDADevice())
                     $(asserts...)
diff --git a/test/cuda.jl b/test/cuda.jl
index 2c1624a..0413a3b 100644
--- a/test/cuda.jl
+++ b/test/cuda.jl
@@ -1,7 +1,11 @@
 using Tullio, Test
-using CUDA, CUDAKernels, KernelAbstractions
+using CUDA
 CUDA.allowscalar(false)
+
+# using CUDAKernels, KernelAbstractions
+# using FoldsCUDA, FLoops
+
 
 using Tracker, ForwardDiff
 
 @tullio grad=Base
diff --git a/test/group-2.jl b/test/group-2.jl
index 6381206..3ddaca3 100644
--- a/test/group-2.jl
+++ b/test/group-2.jl
@@ -4,7 +4,6 @@ t4 = time()
 
 using KernelAbstractions
 using Tracker
-
 GRAD = :Tracker
 _gradient(x...) = Tracker.gradient(x...)
 
@@ -23,20 +22,62 @@ _gradient(x...) = Tracker.gradient(x...)
     end
 end
 
-using CUDA
+using CUDA, CUDAKernels
 
 if is_buildkite
     # If we are on Buildkite, we should assert that we have a CUDA GPU available
     @test CUDA.has_cuda_gpu()
 end
 
-if CUDA.has_cuda_gpu()
+if false # CUDA.has_cuda_gpu()
     @info "===== found a GPU, starting CUDA tests ====="
-    @testset "===== CUDA tests on GPU =====" begin
+    @testset "===== KernelAbstractions CUDA tests on GPU =====" begin
         include("cuda.jl")
     end
+else
+    @info "===== skipping KernelAbstractions + CUDA tests ====="
 end
 
 @info @sprintf("KernelAbstractions tests took %.1f seconds", time()-t4)
 
 @tullio cuda=false
+
+#===== FLoops =====#
+
+t5 = time()
+using FLoops
+@tullio floops=true
+
+using Tracker
+GRAD = :Tracker
+_gradient(x...) = Tracker.gradient(x...)
+
+@testset "FLoops + parsing + gradients" begin
+    A = (rand(3,4));
+    B = (rand(4,5));
+    @tullio C[i,k] := A[i,j] * B[j,k] threads=false verbose=1
+    @test C ≈ A * B
+
+    @tullio threads=false
+    include("parsing.jl")
+    include("gradients.jl")
+    @tullio threads=true
+
+    for sy in Tullio.SYMBOLS
+        @test !isdefined(@__MODULE__, sy)
+    end
+end
+
+using CUDA, FoldsCUDA
+
+if CUDA.has_cuda_gpu()
+    @info "===== found a GPU, starting CUDA tests ====="
+    @testset "===== FLoops + FoldsCUDA tests on GPU =====" begin
+        include("cuda.jl")
+    end
+end
+
+@info @sprintf("FLoops tests took %.1f seconds", time()-t5)
+
+@tullio floops=false
+
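#===== usage sketch (not part of the patch) =====#

# A rough illustration of what the new `floops=true` option is intended to enable, pieced
# together from the hunks above (the OPTS entry in src/macro.jl and the new test in
# test/group-2.jl). The option name and the isdefined(FLoops)/isdefined(FoldsCUDA) checks
# come from the patch; the calls below are an untested assumption, not verified behaviour.

using Tullio, FLoops               # FLoops must be loaded for the FLoops actor to be generated

A, B = rand(3,40), rand(40,5)
@tullio C[i,k] := A[i,j] * B[j,k] floops=true threads=false   # threads=false routes the CPU path through FLoops.@floop
@assert C ≈ A * B

using CUDA, FoldsCUDA              # both must be loaded for the CuArray actor to be generated
if CUDA.has_cuda_gpu()
    Ag, Bg = cu(A), cu(B)
    @tullio Cg[i,k] := Ag[i,j] * Bg[j,k] floops=true          # should dispatch to the FoldsCUDA.CUDAEx() actor
end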