diff --git a/.gitignore b/.gitignore
index 612bdcb..29f58f2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,8 +4,7 @@
 .vscode
 docs/build
 docs/site
-docs/Manifest.toml
 docs/src/democards
-/Manifest.toml
+Manifest.toml
 /.benchmarkci
 /benchmark/*.json
diff --git a/Project.toml b/Project.toml
index 99ffc84..3ce7c84 100644
--- a/Project.toml
+++ b/Project.toml
@@ -9,6 +9,7 @@ ComputationalResources = "ed09eef8-17a6-5b46-8889-db040fac31e3"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 FFTViews = "4f61f5a4-77b1-5117-aa51-3ab5ef4ef0cd"
 FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
+ImageBase = "c817782e-172a-44cc-b673-b171935fbb9e"
 ImageCore = "a09fc81d-aa75-5fe9-8630-4744c3626534"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
@@ -24,6 +25,7 @@ ComputationalResources = "0.3"
 DataStructures = "0.17.7, 0.18"
 FFTViews = "0.3"
 FFTW = "0.3, 1"
+ImageBase = "0.1.5"
 ImageCore = "0.9"
 OffsetArrays = "1.9"
 Reexport = "1.1"
@@ -33,10 +35,14 @@ julia = "1"
 
 [extras]
 AxisArrays = "39de3d68-74b9-583c-8d2d-e117c070f3a9"
+ImageIO = "82e4d734-157c-48bb-816b-45c225c6df19"
+ImageMagick = "6218d12a-5da1-5696-b52f-db25d2ecc6d1"
 ImageMetadata = "bc367c6b-8a6b-528e-b4bd-a4b897500b49"
+ImageQualityIndexes = "2996bd0c-7a13-11e9-2da2-2f5ce47296a9"
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+TestImages = "5e47fb64-e119-507b-a336-dd2b206d9990"
 
 [targets]
-test = ["AxisArrays", "ImageMetadata", "Logging", "Random", "Test"]
+test = ["AxisArrays", "ImageIO", "ImageMagick", "ImageMetadata", "ImageQualityIndexes", "Logging", "Random", "Test", "TestImages"]
diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl
index fc8d2f9..4fa61f5 100644
--- a/benchmark/benchmarks.jl
+++ b/benchmark/benchmarks.jl
@@ -2,6 +2,7 @@ using ImageFiltering, ImageCore
 using PkgBenchmark
 using BenchmarkTools
 using Statistics: quantile, mean, median!
+using ImageFiltering.Models
 
 function makeimages(sz)
     imgF32      = rand(Float32, sz)
@@ -56,3 +57,14 @@ let grp = SUITE["imfilter"]
         end
     end
 end
+
+
+SUITE["ROF"] = BenchmarkGroup()
+let grp = SUITE["ROF"]
+    for sz in ((100, 100), (256, 256), (2048, 2048), (256, 256, 30))
+        for (aname, img) in makeimages(sz)
+            szstr = sz2str(sz)
+            grp["PrimalDual"*"_"*aname*"_"*szstr] = @benchmarkable solve_ROF_PD($img, 0.1, 10)
+        end
+    end
+end
diff --git a/docs/src/function_reference.md b/docs/src/function_reference.md
index 9199c37..d727130 100644
--- a/docs/src/function_reference.md
+++ b/docs/src/function_reference.md
@@ -72,6 +72,12 @@ Algorithm.IIR
 Algorithm.Mixed
 ```
 
+# Solvers for predefined models
+
+```@autodocs
+Modules = [ImageFiltering.Models]
+```
+
 # Internal machinery
 
 ```@docs
diff --git a/src/ImageFiltering.jl b/src/ImageFiltering.jl
index 58df01f..7f10bfe 100644
--- a/src/ImageFiltering.jl
+++ b/src/ImageFiltering.jl
@@ -92,6 +92,8 @@ include("mapwindow.jl")
 using .MapWindow
 include("extrema.jl")
 
+include("models.jl")
+
 function __init__()
     # See ComputationalResources README for explanation
     push!(LOAD_PATH, dirname(@__FILE__))
diff --git a/src/models.jl b/src/models.jl
new file mode 100644
index 0000000..be3bdc7
--- /dev/null
+++ b/src/models.jl
@@ -0,0 +1,173 @@
+module Models
+
+using ImageBase
+using ImageBase.ImageCore.MappedArrays: of_eltype
+using ImageBase.FiniteDiff
+
+# Introduced in ColorVectorSpace v0.9.3
+# https://github.com/JuliaGraphics/ColorVectorSpace.jl/pull/172
+using ImageBase.ImageCore.ColorVectorSpace.Future: abs2
+
+"""
+This submodule provides predefined image-related models and its solvers that can be reused
+by many image processing tasks.
+
+- solve the Rudin Osher Fatemi (ROF) model using the primal-dual method: [`solve_ROF_PD`](@ref) and [`solve_ROF_PD!`](@ref)
+"""
+Models
+
+export solve_ROF_PD, solve_ROF_PD!
+
+
+##### implementation details
+
+"""
+    solve_ROF_PD([T], img::AbstractArray, λ; kwargs...)
+
+Return a smoothed version of `img`, using Rudin-Osher-Fatemi (ROF) filtering, more commonly
+known as Total Variation (TV) denoising or TV regularization. This algorithm is based on the
+primal-dual method.
+
+This function applies to generic N-dimensional colorant array and is also CUDA-compatible.
+See also [`solve_ROF_PD!`](@ref) for the in-place version.
+
+# Arguments
+
+- `T`: the output element type. By default it is `float32(eltype(img))`.
+- `img`: the input image, usually a noisy image.
+- `λ`: the regularization coefficient. Larger `λ` results in more smoothing.
+
+# Parameters
+
+- `num_iters::Int`: The number of iterations before stopping.
+
+# Examples
+
+```julia
+using ImageFiltering
+using ImageFiltering.Models: solve_ROF_PD
+using ImageQualityIndexes
+using TestImages
+
+img_ori = float.(testimage("cameraman"))
+img_noisy = img_ori .+ 0.1 .* randn(size(img_ori))
+assess_psnr(img_noisy, img_ori) # ~20 dB
+
+img_smoothed = solve_ROF_PD(img_noisy, 0.015, 50)
+assess_psnr(img_smoothed, img_ori) # ~27 dB
+
+# larger λ produces over-smoothed result
+img_smoothed = solve_ROF_PD(img_noisy, 5, 50)
+assess_psnr(img_smoothed, img_ori) # ~21 dB
+```
+
+# Extended help
+
+Mathematically, this function solves the following ROF model using the primal-dual method:
+
+```math
+\\min_u \\lVert u - g \\rVert^2 + \\lambda\\lvert\\nabla u\\rvert
+```
+
+# References
+
+- [1] Chambolle, A. (2004). "An algorithm for total variation minimization and applications". _Journal of Mathematical Imaging and Vision_. 20: 89–97
+- [2] https://en.wikipedia.org/wiki/Total_variation_denoising
+"""
+solve_ROF_PD(img::AbstractArray{T}, args...) where T = solve_ROF_PD(float32(T), img, args...)
+function solve_ROF_PD(::Type{T}, img::AbstractArray, args...) where T
+    u = similar(img, T)
+    buffer = preallocate_solve_ROF_PD(T, img)
+    solve_ROF_PD!(u, buffer, img, args...)
+end
+
+# non-exported helper
+preallocate_solve_ROF_PD(img::AbstractArray{T}) where T = preallocate_solve_ROF_PD(float32(T), img)
+function preallocate_solve_ROF_PD(::Type{T}, img) where T
+    div_p = similar(img, T)
+    p = ntuple(i->similar(img, T), ndims(img))
+    ∇u = ntuple(i->similar(img, T), ndims(img))
+    ∇u_mag = similar(img, eltype(T))
+    return div_p, p, ∇u, ∇u_mag
+end
+
+"""
+    solve_ROF_PD!(out, buffer, img, λ, num_iters)
+
+The in-place version of [`solve_ROF_PD`](@ref).
+
+It is not uncommon to use ROF solver in a higher-level loop, in which case it makes sense to
+preallocate the output and intermediate arrays to make it faster.
+
+!!! note "Buffer"
+    The content and meaning of `buffer` might change without any notice if the internal
+    implementation is changed. Use `preallocate_solve_ROF_PD` helper function to avoid
+    potential changes.
+
+# Examples
+
+```julia
+using ImageFiltering.Models: preallocate_solve_ROF_PD
+
+out = similar(img)
+buffer = preallocate_solve_ROF_PD(img)
+solve_ROF_PD!(out, buffer, img, 0.2, 30)
+```
+
+"""
+function solve_ROF_PD!(
+        out::AbstractArray{T},
+        buffer::Tuple,
+        img::AbstractArray,
+        λ::Real,
+        num_iters::Integer) where T
+    # seperate a stub method to reduce latency
+    FT = float32(T)
+    if FT == T
+        solve_ROF_PD!(out, buffer, img, Float32(λ), Int(num_iters))
+    else
+        solve_ROF_PD!(out, buffer, FT.(img), Float32(λ), Int(num_iters))
+    end
+end
+function solve_ROF_PD!(
+        out::AbstractArray,
+        (div_p, p, ∇u, ∇u_mag)::Tuple,
+        img::AbstractArray,
+        λ::Float32,
+        num_iters::Int)
+    # Total Variation regularized image denoising using the primal dual algorithm
+    # Implement according to reference [1]
+    τ = 1//4 # see 2nd remark after proof of Theorem 3.1.
+
+    # use the same symbol in the paper
+    u, g = out, img
+
+    fgradient!(p, g)
+    # This iterates Eq. (9) of [1]
+    # TODO(johnnychen94): set better stop criterion
+    for _ in 1:num_iters
+        fdiv!(div_p, p)
+        # multiply term inside ∇ by -λ. Thm. 3.1 relates this to `u` via Eq. 7.
+        @. u = g - λ*div_p
+        fgradient!(∇u, u)
+        _l2norm_vec!(∇u_mag, ∇u) # |∇(g - λdiv p)|
+        # Eq. (9): update p
+        for i in 1:length(p)
+            @. p[i] = (p[i] - (τ/λ)*∇u[i])/(1 + (τ/λ) * ∇u_mag)
+        end
+    end
+    return u
+end
+
+function _l2norm_vec!(out, Vs::Tuple)
+    all(v->axes(out) == axes(v), Vs) || throw(ArgumentError("All axes of input data should be the same."))
+    @. out = abs2(Vs[1])
+    for v in Vs[2:end]
+        @. out += abs2(v)
+    end
+    @. out = sqrt(out)
+    return out
+end
+
+
+end # module
diff --git a/test/cuda/Project.toml b/test/cuda/Project.toml
new file mode 100644
index 0000000..da371cd
--- /dev/null
+++ b/test/cuda/Project.toml
@@ -0,0 +1,10 @@
+[deps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+ImageBase = "c817782e-172a-44cc-b673-b171935fbb9e"
+ImageFiltering = "6a3955dd-da59-5b1f-98d4-e7296123deb5"
+ImageIO = "82e4d734-157c-48bb-816b-45c225c6df19"
+ImageMagick = "6218d12a-5da1-5696-b52f-db25d2ecc6d1"
+ImageQualityIndexes = "2996bd0c-7a13-11e9-2da2-2f5ce47296a9"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+TestImages = "5e47fb64-e119-507b-a336-dd2b206d9990"
diff --git a/test/cuda/models.jl b/test/cuda/models.jl
new file mode 100644
index 0000000..8712c82
--- /dev/null
+++ b/test/cuda/models.jl
@@ -0,0 +1,67 @@
+using ImageFiltering.Models
+
+@testset "solve_ROF_PD" begin
+    # This testset is modified from its CPU version
+
+    @testset "Numerical" begin
+        # 2D Gray
+        img = restrict(testimage("cameraman"))
+        img_noisy = img .+ 0.05randn(MersenneTwister(0), size(img))
+        img_smoothed = solve_ROF_PD(img_noisy, 0.05, 20)
+        @test ndims(img_smoothed) == 2
+        @test eltype(img_smoothed) <: Gray
+        @test assess_psnr(img_smoothed, img) > 31.67
+        @test assess_ssim(img_smoothed, img) > 0.90
+
+        img_noisy_cu = CuArray(float32.(img_noisy))
+        img_smoothed_cu = solve_ROF_PD(img_noisy_cu, 0.05, 20)
+        @test img_smoothed_cu isa CuArray
+        @test eltype(eltype(img_smoothed_cu)) == Float32
+        @test Array(img_smoothed_cu) ≈ img_smoothed
+
+        # 2D RGB
+        img = restrict(testimage("lighthouse"))
+        img_noisy = img .+ colorview(RGB, ntuple(i->0.05.*randn(MersenneTwister(i), size(img)), 3)...)
+        img_smoothed = solve_ROF_PD(img_noisy, 0.03, 20)
+        @test ndims(img_smoothed) == 2
+        @test eltype(img_smoothed) <: RGB
+        @test assess_psnr(img_smoothed, img) > 32.15
+        @test assess_ssim(img_smoothed, img) > 0.90
+
+        img_noisy_cu = CuArray(float32.(img_noisy))
+        img_smoothed_cu = solve_ROF_PD(img_noisy_cu, 0.03, 20)
+        @test img_smoothed_cu isa CuArray
+        @test eltype(eltype(img_smoothed_cu)) == Float32
+        @test Array(img_smoothed_cu) ≈ img_smoothed
+
+        # 3D Gray
+        img = Gray.(restrict(testimage("mri"), (1, 2)))
+        img_noisy = img .+ 0.05randn(MersenneTwister(0), size(img))
+        img_smoothed = solve_ROF_PD(img_noisy, 0.02, 20)
+        @test ndims(img_smoothed) == 3
+        @test eltype(img_smoothed) <: Gray
+        @test assess_psnr(img_smoothed, img) > 31.78
+        @test assess_ssim(img_smoothed, img) > 0.85
+
+        img_noisy_cu = CuArray(float32.(img_noisy))
+        img_smoothed_cu = solve_ROF_PD(img_noisy_cu, 0.02, 20)
+        @test img_smoothed_cu isa CuArray
+        @test eltype(eltype(img_smoothed_cu)) == Float32
+        @test Array(img_smoothed_cu) ≈ img_smoothed
+
+        # 3D RGB
+        img = RGB.(restrict(testimage("mri"), (1, 2)))
+        img_noisy = img .+ colorview(RGB, ntuple(i->0.05.*randn(MersenneTwister(i), size(img)), 3)...)
+        img_smoothed = solve_ROF_PD(img_noisy, 0.02, 20)
+        @test ndims(img_smoothed) == 3
+        @test eltype(img_smoothed) <: RGB
+        @test assess_psnr(img_smoothed, img) > 31.17
+        @test assess_ssim(img_smoothed, img) > 0.79
+
+        img_noisy_cu = CuArray(float32.(img_noisy))
+        img_smoothed_cu = solve_ROF_PD(img_noisy_cu, 0.02, 20)
+        @test img_smoothed_cu isa CuArray
+        @test eltype(eltype(img_smoothed_cu)) == Float32
+        @test Array(img_smoothed_cu) ≈ img_smoothed
+    end
+end
diff --git a/test/cuda/runtests.jl b/test/cuda/runtests.jl
new file mode 100644
index 0000000..5789d00
--- /dev/null
+++ b/test/cuda/runtests.jl
@@ -0,0 +1,17 @@
+# This file is maintained in a way to support CUDA-only test via
+# `julia --project=test/cuda -e 'include("runtests.jl")'`
+using ImageFiltering
+using CUDA
+using TestImages
+using ImageBase
+using ImageQualityIndexes
+using Test
+using Random
+
+CUDA.allowscalar(false)
+
+@testset "ImageFiltering" begin
+    if CUDA.functional()
+        include("models.jl")
+    end
+end
diff --git a/test/models.jl b/test/models.jl
new file mode 100644
index 0000000..274791d
--- /dev/null
+++ b/test/models.jl
@@ -0,0 +1,70 @@
+using ImageFiltering.Models
+using ImageFiltering.Models: preallocate_solve_ROF_PD
+
+@testset "solve_ROF_PD" begin
+    # Note: random seed really matters a lot
+
+    @testset "API" begin
+        img = rand(MersenneTwister(0), Float64, 64)
+        out = solve_ROF_PD(img, 0.2, 30)
+        @test eltype(out) == Float32
+
+        out2 = solve_ROF_PD(Float64, img, 0.2, 30)
+        @test eltype(out2) == Float64
+        @test out ≈ out2
+
+        out3 = similar(img, Float32)
+        buffer = preallocate_solve_ROF_PD(img)
+        solve_ROF_PD!(out3, buffer, img, 0.2, 30)
+        @test out == out3
+    end
+
+    @testset "Numerical" begin
+        # 2D Gray
+        img = restrict(testimage("cameraman"))
+        img_noisy = img .+ 0.05randn(MersenneTwister(0), size(img))
+        img_smoothed = solve_ROF_PD(img_noisy, 0.05, 20)
+        @test ndims(img_smoothed) == 2
+        @test eltype(img_smoothed) <: Gray
+        @test assess_psnr(img_smoothed, img) > 31.67
+        @test assess_ssim(img_smoothed, img) > 0.90
+
+        # 2D RGB
+        img = restrict(testimage("lighthouse"))
+        img_noisy = img .+ colorview(RGB, ntuple(i->0.05.*randn(MersenneTwister(i), size(img)), 3)...)
+        img_smoothed = solve_ROF_PD(img_noisy, 0.03, 20)
+        @test ndims(img_smoothed) == 2
+        @test eltype(img_smoothed) <: RGB
+        @test assess_psnr(img_smoothed, img) > 32.15
+        @test assess_ssim(img_smoothed, img) > 0.90
+
+        # 3D Gray
+        img = Gray.(restrict(testimage("mri"), (1, 2)))
+        img_noisy = img .+ 0.05randn(MersenneTwister(0), size(img))
+        img_smoothed = solve_ROF_PD(img_noisy, 0.02, 20)
+        @test ndims(img_smoothed) == 3
+        @test eltype(img_smoothed) <: Gray
+        @test assess_psnr(img_smoothed, img) > 31.78
+        @test assess_ssim(img_smoothed, img) > 0.85
+
+        # 3D RGB
+        img = RGB.(restrict(testimage("mri"), (1, 2)))
+        img_noisy = img .+ colorview(RGB, ntuple(i->0.05.*randn(MersenneTwister(i), size(img)), 3)...)
+        img_smoothed = solve_ROF_PD(img_noisy, 0.02, 20)
+        @test ndims(img_smoothed) == 3
+        @test eltype(img_smoothed) <: RGB
+        @test assess_psnr(img_smoothed, img) > 31.17
+        @test assess_ssim(img_smoothed, img) > 0.79
+    end
+
+    @testset "FixedPointNumbers" begin
+        A = rand(N0f8, 20, 20)
+        @test solve_ROF_PD(A, 0.01, 5) ≈ solve_ROF_PD(float32.(A), 0.01, 5)
+    end
+
+    @testset "OffsetArray" begin
+        Ao = OffsetArray(rand(N0f8, 20, 20), -1, -1)
+        out = solve_ROF_PD(Ao, 0.01, 5)
+        @test axes(out) == axes(Ao)
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index fd407e9..4469629 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,5 +1,8 @@
-using ImageFiltering, ImageCore
+using ImageFiltering, ImageCore, ImageBase
+using OffsetArrays
 using Test
+using TestImages
+using ImageQualityIndexes
 import StaticArrays
 using Random
 
@@ -31,5 +34,30 @@ include("mapwindow.jl")
 include("extrema.jl")
 include("basic.jl")
 include("gabor.jl")
+include("models.jl")
 
+
+CUDA_INSTALLED = false
+try
+    global CUDA_INSTALLED
+    # This errors with `IOError` when nvidia driver is not available,
+    # in which case we don't even need to try `using CUDA`
+    run(pipeline(`nvidia-smi`, stdout=devnull, stderr=devnull))
+    push!(LOAD_PATH, "@v#.#") # force using global CUDA installation
+
+    @eval using CUDA
+    CUDA.allowscalar(false)
+    CUDA_INSTALLED = true
+catch e
+    e isa Base.IOError || @warn e LOAD_PATH
+end
+CUDA_FUNCTIONAL = CUDA_INSTALLED && CUDA.functional()
+if CUDA_FUNCTIONAL
+    @info "CUDA test: enabled"
+    @testset "CUDA" begin
+        include("cuda/runtests.jl")
+    end
+else
+    @warn "CUDA test: disabled"
+end
 nothing