diff --git a/src/PerformanceProfilingHttpEndpoints.jl b/src/PerformanceProfilingHttpEndpoints.jl
index 47d2de7..edb52db 100644
--- a/src/PerformanceProfilingHttpEndpoints.jl
+++ b/src/PerformanceProfilingHttpEndpoints.jl
@@ -21,11 +21,21 @@ using Serialization: serialize
 #
 #----------------------------------------------------------
 
+function _http_response(binary_data, filename)
+    return HTTP.Response(200, [
+        "Content-Type" => "application/octet-stream"
+        "Content-Disposition" => "attachment; filename=$(repr(filename))"
+    ], body = binary_data)
+end
+
+###
+### CPU
+###
+
 default_n() = "1e8"
 default_delay() = "0.01"
 default_duration() = "10.0"
 default_pprof() = "true"
-default_alloc_sample_rate() = "0.0001"
 
 cpu_profile_error_message() = """Need to provide query params:
     - duration=$(default_duration())
@@ -50,24 +60,6 @@ controlled by `n=`. If you assume an average stack depth of 100, and you were ai
 
 The default `n` is 1e8, which should be big enough for most profiles.
 """
-allocs_profile_error_message() = """Need to provide query params:
-    - duration=$(default_duration())
-    - sample_rate=$(default_alloc_sample_rate())
-
-Hint: A good goal is to shoot for around 1,000 to 10,000 samples. So if you know what
-duration you want to profile for, and you *already have an expectation for how much your
-program will allocate,* you can pick a sample_rate via `sample_rate = 1,000 / expected_allocations`.
-
-For example, if you expect your program will actually perform 1 million allocations:
-1_000 / 1_000_000 = 0.001
-for `duration=30&sample_rate=0.001`
-
-Note that if your sample_rate gets too large, you can really slow down the program you're
-profiling, and thus end up with an inaccurate profile.
-
-Finally, if you think your program only allocates a small amount, you can capture *all*
-allocations by passing sample_rate=1.
-"""
 
 function cpu_profile_endpoint(req::HTTP.Request)
     uri = HTTP.URI(req.target)
@@ -82,45 +74,90 @@ function cpu_profile_endpoint(req::HTTP.Request)
     delay = parse(Float64, get(qp, "delay", default_delay()))
     duration = parse(Float64, get(qp, "duration", default_duration()))
     with_pprof = parse(Bool, get(qp, "pprof", default_pprof()))
-
     return _do_cpu_profile(n, delay, duration, with_pprof)
 end
 
+function cpu_profile_start_endpoint(req::HTTP.Request)
+    uri = HTTP.URI(req.target)
+    qp = HTTP.queryparams(uri)
+
+    # Run the profile
+    n = convert(Int, parse(Float64, get(qp, "n", default_n())))
+    delay = parse(Float64, get(qp, "delay", default_delay()))
+    return _start_cpu_profile(n, delay)
+end
+
+function cpu_profile_stop_endpoint(req::HTTP.Request)
+    Profile.stop_timer()
+    @info "Stopping CPU Profiling from PerformanceProfilingHttpEndpoints"
+    uri = HTTP.URI(req.target)
+    qp = HTTP.queryparams(uri)
+    with_pprof = parse(Bool, get(qp, "pprof", default_pprof()))
+    filename = "cpu_profile"
+    return _cpu_profile_response(filename; with_pprof)
+end
+
 function _do_cpu_profile(n, delay, duration, with_pprof)
     @info "Starting CPU Profiling from PerformanceProfilingHttpEndpoints with configuration:" n delay duration
-
     Profile.clear()
-
     Profile.init(n, delay)
-
     Profile.@profile sleep(duration)
+    filename = "cpu_profile-duration=$duration&delay=$delay&n=$n"
+    return _cpu_profile_response(filename; with_pprof)
+end
 
-    data = Profile.retrieve()
+function _start_cpu_profile(n, delay)
+    @info "Starting CPU Profiling from PerformanceProfilingHttpEndpoints with configuration:" n delay
+    resp = HTTP.Response(200, "CPU profiling started.")
+    Profile.clear()
+    Profile.init(n, delay)
+    Profile.start_timer()
+    return resp
+end
+
+function _cpu_profile_response(filename; with_pprof::Bool)
     if with_pprof
         prof_name = tempname()
         PProf.pprof(out=prof_name, web=false)
         prof_name = "$prof_name.pb.gz"
-        return _http_response(read(prof_name),
-            "cpu_profile-duration=$duration&delay=$delay&n=$n.pb.gz")
+        return _http_response(read(prof_name), "$filename.pb.gz")
     else
         iobuf = IOBuffer()
+        data = Profile.retrieve()
         serialize(iobuf, data)
-        return _http_response(take!(iobuf),
-            "cpu_profile&duration=$duration&delay=$delay&n=$n.prof.bin")
+        return _http_response(take!(iobuf), "$filename.prof.bin")
     end
 end
 
-function _http_response(binary_data, filename)
-    return HTTP.Response(200, [
-        "Content-Type" => "application/octet-stream"
-        "Content-Disposition" => "attachment; filename=$(repr(filename))"
-    ], body = binary_data)
-end
+###
+### Allocs
+###
 
 function heap_snapshot_endpoint(req::HTTP.Request)
     # TODO: implement this once https://github.com/JuliaLang/julia/pull/42286 is merged
 end
 
+default_alloc_sample_rate() = "0.0001"
+
+allocs_profile_error_message() = """Need to provide query params:
+    - duration=$(default_duration())
+    - sample_rate=$(default_alloc_sample_rate())
+
+Hint: A good goal is to shoot for around 1,000 to 10,000 samples. So if you know what
+duration you want to profile for, and you *already have an expectation for how much your
+program will allocate,* you can pick a sample_rate via `sample_rate = 1,000 / expected_allocations`.
+
+For example, if you expect your program will actually perform 1 million allocations:
+1_000 / 1_000_000 = 0.001
+for `duration=30&sample_rate=0.001`
+
+Note that if your sample_rate gets too large, you can really slow down the program you're
+profiling, and thus end up with an inaccurate profile.
+
+Finally, if you think your program only allocates a small amount, you can capture *all*
+allocations by passing sample_rate=1.
+"""
+
 @static if !(isdefined(Profile, :Allocs) && isdefined(PProf, :Allocs))
 
 for f in (:allocations_profile_endpoint, :allocations_start_endpoint, :allocations_stop_endpoint)
@@ -170,9 +207,10 @@ end
 
 function _start_alloc_profile(sample_rate)
     @info "Starting allocation Profiling from PerformanceProfilingHttpEndpoints with configuration:" sample_rate
+    resp = HTTP.Response(200, "Allocation profiling started.")
     Profile.Allocs.clear()
     Profile.Allocs.start(; sample_rate)
-    return HTTP.Response(200, "Allocation profiling started.")
+    return resp
 end
 
 function _stop_alloc_profile()
@@ -185,10 +223,16 @@ end
 
 end  # if isdefined
 
+###
+### Server
+###
+
 function serve_profiling_server(;addr="127.0.0.1", port=16825, verbose=false, kw...)
     verbose >= 0 && @info "Starting HTTP profiling server on port $port"
     router = HTTP.Router()
     HTTP.register!(router, "/profile", cpu_profile_endpoint)
+    HTTP.register!(router, "/profile_start", cpu_profile_start_endpoint)
+    HTTP.register!(router, "/profile_stop", cpu_profile_stop_endpoint)
     HTTP.register!(router, "/allocs_profile", allocations_profile_endpoint)
     HTTP.register!(router, "/allocs_profile_start", allocations_start_endpoint)
     HTTP.register!(router, "/allocs_profile_stop", allocations_stop_endpoint)
@@ -200,8 +244,13 @@ end
 # up profiling compilation!
 function __init__()
     precompile(serve_profiling_server, ()) || error("precompilation of package functions is not supposed to fail")
+
     precompile(cpu_profile_endpoint, (HTTP.Request,)) || error("precompilation of package functions is not supposed to fail")
+    precompile(cpu_profile_start_endpoint, (HTTP.Request,)) || error("precompilation of package functions is not supposed to fail")
+    precompile(cpu_profile_stop_endpoint, (HTTP.Request,)) || error("precompilation of package functions is not supposed to fail")
     precompile(_do_cpu_profile, (Int,Float64,Float64,Bool)) || error("precompilation of package functions is not supposed to fail")
+    precompile(_start_cpu_profile, (Int,Float64,)) || error("precompilation of package functions is not supposed to fail")
+
     precompile(allocations_profile_endpoint, (HTTP.Request,)) || error("precompilation of package functions is not supposed to fail")
     precompile(allocations_start_endpoint, (HTTP.Request,)) || error("precompilation of package functions is not supposed to fail")
     precompile(allocations_stop_endpoint, (HTTP.Request,)) || error("precompilation of package functions is not supposed to fail")
diff --git a/test/runtests.jl b/test/runtests.jl
index e78e313..d755816 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -18,26 +18,61 @@ const url = "http://127.0.0.1:$port"
     @testset "CPU profiling" begin
         done = Threads.Atomic{Bool}(false)
         # Schedule some work that's known to be expensive, to profile it
-        t = @async begin
-            for _ in 1:200
+        workload() = @async begin
+            for _ in 1:1000
                 if done[] return end
                 InteractiveUtils.peakflops()
                 yield()  # yield to allow the tests to run
             end
         end
 
-        req = HTTP.get("$url/profile?duration=3&pprof=false")
-        @test req.status == 200
-        @test length(req.body) > 0
+        @testset "profile endpoint" begin
+            done[] = false
+            t = workload()
+            req = HTTP.get("$url/profile?duration=3&pprof=false")
+            @test req.status == 200
+            @test length(req.body) > 0
+
+            data, lidict = deserialize(IOBuffer(req.body))
+            # Test that the profile seems like valid profile data
+            @test data isa Vector{UInt64}
+            @test lidict isa Dict{UInt64, Vector{Base.StackTraces.StackFrame}}
+
+            @info "Finished `profile` tests, waiting for peakflops workload to finish."
+            done[] = true
+            wait(t)  # handle errors
+        end
+
+        @testset "profile_start/stop endpoints" begin
+            done[] = false
+            t = workload()
+            req = HTTP.get("$url/profile_start")
+            @test req.status == 200
+            @test String(req.body) == "CPU profiling started."
+
+            sleep(3)  # Allow workload to run a while before we stop profiling.
 
-        data, lidict = deserialize(IOBuffer(req.body))
-        # Test that the profile seems like valid profile data
-        @test data isa Vector{UInt64}
-        @test lidict isa Dict{UInt64, Vector{Base.StackTraces.StackFrame}}
+            req = HTTP.get("$url/profile_stop?pprof=false")
+            @test req.status == 200
+            data, lidict = deserialize(IOBuffer(req.body))
+            # Test that the profile seems like valid profile data
+            @test data isa Vector{UInt64}
+            @test lidict isa Dict{UInt64, Vector{Base.StackTraces.StackFrame}}
 
-        @info "Finished tests, waiting for peakflops workload to finish."
-        done[] = true
-        wait(t)  # handle errors
+            @info "Finished `profile_start/stop` tests, waiting for peakflops workload to finish."
+            done[] = true
+            wait(t)  # handle errors
+
+            # We retrive data via PProf directly if `pprof=true`; make sure that path's tested.
+            # This second call to `profile_stop` should still return the profile, even though
+            # the profiler is already stopped, as it's `profile_start` that calls `clear()`.
+            req = HTTP.get("$url/profile_stop?pprof=true")
+            @test req.status == 200
+            # Test that there's something here
+            # TODO: actually parse the profile
+            data = read(IOBuffer(req.body), String)
+            @test length(data) > 100
+        end
     end
 
     @testset "Allocation profiling" begin