diff --git a/Compiler/extras/CompilerDevTools/src/CompilerDevTools.jl b/Compiler/extras/CompilerDevTools/src/CompilerDevTools.jl index 5d0df5ccaa4e4..dd32564d7fa8d 100644 --- a/Compiler/extras/CompilerDevTools/src/CompilerDevTools.jl +++ b/Compiler/extras/CompilerDevTools/src/CompilerDevTools.jl @@ -9,12 +9,13 @@ struct SplitCacheInterp <: Compiler.AbstractInterpreter inf_params::Compiler.InferenceParams opt_params::Compiler.OptimizationParams inf_cache::Vector{Compiler.InferenceResult} + codegen_cache::IdDict{CodeInstance,CodeInfo} function SplitCacheInterp(; world::UInt = Base.get_world_counter(), inf_params::Compiler.InferenceParams = Compiler.InferenceParams(), opt_params::Compiler.OptimizationParams = Compiler.OptimizationParams(), inf_cache::Vector{Compiler.InferenceResult} = Compiler.InferenceResult[]) - new(world, inf_params, opt_params, inf_cache) + new(world, inf_params, opt_params, inf_cache, IdDict{CodeInstance,CodeInfo}()) end end @@ -23,10 +24,11 @@ Compiler.OptimizationParams(interp::SplitCacheInterp) = interp.opt_params Compiler.get_inference_world(interp::SplitCacheInterp) = interp.world Compiler.get_inference_cache(interp::SplitCacheInterp) = interp.inf_cache Compiler.cache_owner(::SplitCacheInterp) = SplitCacheOwner() +Compiler.codegen_cache(interp::SplitCacheInterp) = interp.codegen_cache import Core.OptimizedGenerics.CompilerPlugins: typeinf, typeinf_edge @eval @noinline typeinf(::SplitCacheOwner, mi::MethodInstance, source_mode::UInt8) = - Base.invoke_in_world(which(typeinf, Tuple{SplitCacheOwner, MethodInstance, UInt8}).primary_world, Compiler.typeinf_ext, SplitCacheInterp(; world=Base.tls_world_age()), mi, source_mode) + Base.invoke_in_world(which(typeinf, Tuple{SplitCacheOwner, MethodInstance, UInt8}).primary_world, Compiler.typeinf_ext_toplevel, SplitCacheInterp(; world=Base.tls_world_age()), mi, source_mode) @eval @noinline function typeinf_edge(::SplitCacheOwner, mi::MethodInstance, parent_frame::Compiler.InferenceState, world::UInt, source_mode::UInt8) # TODO: This isn't quite right, we're just sketching things for now diff --git a/Compiler/src/abstractinterpretation.jl b/Compiler/src/abstractinterpretation.jl index 1c12bbf0c5e64..2a535cb20b4d0 100644 --- a/Compiler/src/abstractinterpretation.jl +++ b/Compiler/src/abstractinterpretation.jl @@ -2561,7 +2561,6 @@ function abstract_eval_setglobalonce!(interp::AbstractInterpreter, sv::AbsIntSta end end - function abstract_eval_replaceglobal!(interp::AbstractInterpreter, sv::AbsIntState, saw_latestworld::Bool, argtypes::Vector{Any}) if length(argtypes) in (5, 6, 7) (M, s, x, v) = argtypes[2], argtypes[3], argtypes[4], argtypes[5] @@ -3624,7 +3623,7 @@ end function global_assignment_rt_exct(interp::AbstractInterpreter, sv::AbsIntState, saw_latestworld::Bool, g::GlobalRef, @nospecialize(newty)) if saw_latestworld - return Pair{Any,Any}(newty, Union{ErrorException, TypeError}) + return Pair{Any,Any}(newty, ErrorException) end (valid_worlds, ret) = scan_partitions((interp, _, partition)->global_assignment_binding_rt_exct(interp, partition, newty), interp, g, sv.world) update_valid_age!(sv, valid_worlds) @@ -3641,10 +3640,10 @@ function global_assignment_binding_rt_exct(interp::AbstractInterpreter, partitio ty = kind == PARTITION_KIND_DECLARED ? 
Any : partition_restriction(partition) wnewty = widenconst(newty) if !hasintersect(wnewty, ty) - return Pair{Any,Any}(Bottom, TypeError) + return Pair{Any,Any}(Bottom, ErrorException) elseif !(wnewty <: ty) retty = tmeet(typeinf_lattice(interp), newty, ty) - return Pair{Any,Any}(retty, TypeError) + return Pair{Any,Any}(retty, ErrorException) end return Pair{Any,Any}(newty, Bottom) end diff --git a/Compiler/src/bootstrap.jl b/Compiler/src/bootstrap.jl index 2671ea114e818..a847d1fb835c7 100644 --- a/Compiler/src/bootstrap.jl +++ b/Compiler/src/bootstrap.jl @@ -67,17 +67,10 @@ function bootstrap!() end mi = specialize_method(m.method, Tuple{params...}, m.sparams) #isa_compileable_sig(mi) || println(stderr, "WARNING: inferring `", mi, "` which isn't expected to be called.") - push!(methods, mi) + typeinf_ext_toplevel(mi, world, isa_compileable_sig(mi) ? SOURCE_MODE_ABI : SOURCE_MODE_NOT_REQUIRED) end end end - codeinfos = typeinf_ext_toplevel(methods, [world], TRIM_NO) - for i = 1:2:length(codeinfos) - ci = codeinfos[i]::CodeInstance - src = codeinfos[i + 1]::CodeInfo - isa_compileable_sig(ci.def) || continue # println(stderr, "WARNING: compiling `", ci.def, "` which isn't expected to be called.") - ccall(:jl_add_codeinst_to_jit, Cvoid, (Any, Any), ci, src) - end endtime = time() println("Base.Compiler ──── ", sub_float(endtime,starttime), " seconds") end diff --git a/Compiler/src/ssair/irinterp.jl b/Compiler/src/ssair/irinterp.jl index 084f28f0aa523..3d72da72625be 100644 --- a/Compiler/src/ssair/irinterp.jl +++ b/Compiler/src/ssair/irinterp.jl @@ -32,7 +32,7 @@ function concrete_eval_invoke(interp::AbstractInterpreter, ci::CodeInstance, arg end function abstract_eval_invoke_inst(interp::AbstractInterpreter, inst::Instruction, irsv::IRInterpretationState) - stmt = inst[:stmt] + stmt = inst[:stmt]::Expr ci = stmt.args[1] if ci isa MethodInstance world = frame_world(irsv) diff --git a/Compiler/src/ssair/passes.jl b/Compiler/src/ssair/passes.jl index 14fc0ab20913c..46ed299167060 100644 --- a/Compiler/src/ssair/passes.jl +++ b/Compiler/src/ssair/passes.jl @@ -183,7 +183,7 @@ function find_def_for_use( end function collect_leaves(compact::IncrementalCompact, @nospecialize(val), @nospecialize(typeconstraint), 𝕃ₒ::AbstractLattice, - predecessors = ((@nospecialize(def), compact::IncrementalCompact) -> isa(def, PhiNode) ? def.values : nothing)) + predecessors::Pre = ((@nospecialize(def), compact::IncrementalCompact) -> isa(def, PhiNode) ? def.values : nothing)) where {Pre} if isa(val, Union{OldSSAValue, SSAValue}) val, typeconstraint = simple_walk_constraint(compact, val, typeconstraint) end @@ -271,7 +271,7 @@ Starting at `val` walk use-def chains to get all the leaves feeding into this `v `predecessors(def, compact)` is a callback which should return the set of possible predecessors for a "phi-like" node (PhiNode or Core.ifelse) or `nothing` otherwise.
""" -function walk_to_defs(compact::IncrementalCompact, @nospecialize(defssa), @nospecialize(typeconstraint), predecessors, ๐•ƒโ‚’::AbstractLattice) +function walk_to_defs(compact::IncrementalCompact, @nospecialize(defssa), @nospecialize(typeconstraint), predecessors::Pre, ๐•ƒโ‚’::AbstractLattice) where {Pre} visited_philikes = AnySSAValue[] isa(defssa, AnySSAValue) || return Any[defssa], visited_philikes def = compact[defssa][:stmt] diff --git a/Compiler/src/typeinfer.jl b/Compiler/src/typeinfer.jl index e07ff4a842e3c..59e0fbd8262e5 100644 --- a/Compiler/src/typeinfer.jl +++ b/Compiler/src/typeinfer.jl @@ -144,15 +144,16 @@ function finish!(interp::AbstractInterpreter, caller::InferenceState, validation ci, inferred_result, const_flag, first(result.valid_worlds), last(result.valid_worlds), encode_effects(result.ipo_effects), result.analysis_results, time_total, caller.time_caches, time_self_ns * 1e-9, di, edges) engine_reject(interp, ci) - if !discard_src && isdefined(interp, :codegen) && uncompressed isa CodeInfo + codegen = codegen_cache(interp) + if !discard_src && codegen !== nothing && uncompressed isa CodeInfo # record that the caller could use this result to generate code when required, if desired, to avoid repeating n^2 work - interp.codegen[ci] = uncompressed + codegen[ci] = uncompressed if bootstrapping_compiler && inferred_result == nothing # This is necessary to get decent bootstrapping performance # when compiling the compiler to inject everything eagerly # where codegen can start finding and using it right away mi = result.linfo - if mi.def isa Method && isa_compileable_sig(mi) + if mi.def isa Method && isa_compileable_sig(mi) && is_cached(caller) ccall(:jl_add_codeinst_to_jit, Cvoid, (Any, Any), ci, uncompressed) end end @@ -186,8 +187,9 @@ function finish!(interp::AbstractInterpreter, mi::MethodInstance, ci::CodeInstan ccall(:jl_update_codeinst, Cvoid, (Any, Any, Int32, UInt, UInt, UInt32, Any, Float64, Float64, Float64, Any, Any), ci, nothing, const_flag, min_world, max_world, ipo_effects, nothing, 0.0, 0.0, 0.0, di, edges) code_cache(interp)[mi] = ci - if isdefined(interp, :codegen) - interp.codegen[ci] = src + codegen = codegen_cache(interp) + if codegen !== nothing + codegen[ci] = src end engine_reject(interp, ci) return nothing @@ -1097,10 +1099,10 @@ end """ SOURCE_MODE_NOT_REQUIRED -Indicates to inference that the source is not required and the only fields -of the resulting `CodeInstance` that the caller is interested in are types -and effects. Inference is still free to create a CodeInstance with source, -but is not required to do so. +Indicates to inference that the source is not required and the only fields of +the resulting `CodeInstance` that the caller is interested in are return or +exception types and IPO effects. Inference is still free to create source for +it or add it to the JIT even, but is not required or expected to do so. """ const SOURCE_MODE_NOT_REQUIRED = 0x0 @@ -1108,28 +1110,51 @@ const SOURCE_MODE_NOT_REQUIRED = 0x0 SOURCE_MODE_ABI Indicates to inference that it should return a CodeInstance that can -either be `->invoke`'d (because it has already been compiled or because -it has constabi) or one that can be made so by compiling its `->inferred` -field. - -N.B.: The `->inferred` field is volatile and the compiler may delete it. +be `->invoke`'d (because it has already been compiled). 
""" const SOURCE_MODE_ABI = 0x1 """ - ci_has_abi(code::CodeInstance) + SOURCE_MODE_GET_SOURCE + +Indicates to inference that it should return a CodeInstance after it has +prepared interp to be able to provide source code for it. +""" +const SOURCE_MODE_GET_SOURCE = 0xf + +""" + ci_has_abi(interp::AbstractInterpreter, code::CodeInstance) -Determine whether this CodeInstance is something that could be invoked if we gave it -to the runtime system (either because it already has an ->invoke ptr, or -because it has source that could be compiled). Note that this information may -be stale by the time the user see it, so the user will need to perform their -own checks if they actually need the abi from it. +Determine whether this CodeInstance is something that could be invoked if +interp gave it to the runtime system (either because it already has an ->invoke +ptr, or because interp has source that could be compiled). """ -function ci_has_abi(code::CodeInstance) +function ci_has_abi(interp::AbstractInterpreter, code::CodeInstance) (@atomic :acquire code.invoke) !== C_NULL && return true + return ci_has_source(interp, code) +end + +""" + ci_has_source(interp::AbstractInterpreter, code::CodeInstance) + +Determine whether this CodeInstance is something that could be compiled from +source that interp has. +""" +function ci_has_source(interp::AbstractInterpreter, code::CodeInstance) + codegen = codegen_cache(interp) + codegen === nothing && return false + use_const_api(code) && return true + haskey(codegen, code) && return true inf = @atomic :monotonic code.inferred - if code.owner === nothing ? (isa(inf, CodeInfo) || isa(inf, String)) : inf !== nothing - # interp.codegen[code] = maybe_uncompress(code, inf) # TODO: the correct way to ensure this information doesn't become stale would be to push it into the stable codegen cache + if isa(inf, String) + inf = _uncompressed_ir(code, inf) + end + if code.owner === nothing + if isa(inf, CodeInfo) + codegen[code] = inf + return true + end + elseif inf !== nothing return true end return false @@ -1139,9 +1164,10 @@ function ci_has_invoke(code::CodeInstance) return (@atomic :monotonic code.invoke) !== C_NULL end -function ci_meets_requirement(code::CodeInstance, source_mode::UInt8) +function ci_meets_requirement(interp::AbstractInterpreter, code::CodeInstance, source_mode::UInt8) source_mode == SOURCE_MODE_NOT_REQUIRED && return true - source_mode == SOURCE_MODE_ABI && return ci_has_abi(code) + source_mode == SOURCE_MODE_ABI && return ci_has_abi(interp, code) + source_mode == SOURCE_MODE_GET_SOURCE && return ci_has_source(interp, code) return false end @@ -1151,7 +1177,7 @@ function typeinf_ext(interp::AbstractInterpreter, mi::MethodInstance, source_mod let code = get(code_cache(interp), mi, nothing) if code isa CodeInstance # see if this code already exists in the cache - if ci_meets_requirement(code, source_mode) + if ci_meets_requirement(interp, code, source_mode) ccall(:jl_typeinf_timing_end, Cvoid, (UInt64,), start_time) return code end @@ -1163,7 +1189,7 @@ function typeinf_ext(interp::AbstractInterpreter, mi::MethodInstance, source_mod let code = get(code_cache(interp), mi, nothing) if code isa CodeInstance # see if this code already exists in the cache - if ci_meets_requirement(code, source_mode) + if ci_meets_requirement(interp, code, source_mode) engine_reject(interp, ci) ccall(:jl_typeinf_timing_end, Cvoid, (UInt64,), start_time) return code @@ -1194,15 +1220,11 @@ function typeinf_ext(interp::AbstractInterpreter, mi::MethodInstance, source_mod 
ccall(:jl_typeinf_timing_end, Cvoid, (UInt64,), start_time) ci = result.ci # reload from result in case it changed + codegen = codegen_cache(interp) @assert frame.cache_mode != CACHE_MODE_NULL - @assert is_result_constabi_eligible(result) || (!isdefined(interp, :codegen) || haskey(interp.codegen, ci)) + @assert is_result_constabi_eligible(result) || codegen === nothing || haskey(codegen, ci) @assert is_result_constabi_eligible(result) == use_const_api(ci) @assert isdefined(ci, :inferred) "interpreter did not fulfill our expectations" - if !is_cached(frame) && source_mode == SOURCE_MODE_ABI - # XXX: jl_type_infer somewhat ambiguously assumes this must be cached - # XXX: this should be using the CI from the cache, if possible instead: haskey(cache, mi) && (ci = cache[mi]) - code_cache(interp)[mi] = ci - end return ci end @@ -1216,35 +1238,9 @@ end typeinf_type(interp::AbstractInterpreter, match::MethodMatch) = typeinf_type(interp, specialize_method(match)) function typeinf_type(interp::AbstractInterpreter, mi::MethodInstance) - # n.b.: this could be replaced with @something(typeinf_ext(interp, mi, SOURCE_MODE_NOT_REQUIRED), return nothing).rettype - start_time = ccall(:jl_typeinf_timing_begin, UInt64, ()) - let code = get(code_cache(interp), mi, nothing) - if code isa CodeInstance - # see if this rettype already exists in the cache - ccall(:jl_typeinf_timing_end, Cvoid, (UInt64,), start_time) - return code.rettype - end - end - ci = engine_reserve(interp, mi) - let code = get(code_cache(interp), mi, nothing) - if code isa CodeInstance - engine_reject(interp, ci) - # see if this rettype already exists in the cache - ccall(:jl_typeinf_timing_end, Cvoid, (UInt64,), start_time) - return code.rettype - end - end - result = InferenceResult(mi, typeinf_lattice(interp)) - result.ci = ci - frame = InferenceState(result, #=cache_mode=#:global, interp) - if frame === nothing - engine_reject(interp, ci) - return nothing - end - typeinf(interp, frame) - ccall(:jl_typeinf_timing_end, Cvoid, (UInt64,), start_time) - is_inferred(result) || return nothing - return widenconst(ignorelimited(result.result)) + ci = typeinf_ext(interp, mi, SOURCE_MODE_NOT_REQUIRED) + ci isa CodeInstance || return nothing + return ci.rettype end # collect a list of all code that is needed along with CodeInstance to codegen it fully @@ -1261,44 +1257,68 @@ function collectinvokes!(wq::Vector{CodeInstance}, ci::CodeInfo) end end -# This is a bridge for the C code calling `jl_typeinf_func()` on a single Method match -function typeinf_ext_toplevel(mi::MethodInstance, world::UInt, source_mode::UInt8) - interp = NativeInterpreter(world) - ci = typeinf_ext(interp, mi, source_mode) - if source_mode == SOURCE_MODE_ABI && ci isa CodeInstance && !ci_has_invoke(ci) - inspected = IdSet{CodeInstance}() - tocompile = Vector{CodeInstance}() - push!(tocompile, ci) - while !isempty(tocompile) - # ci_has_real_invoke(ci) && return ci # optimization: cease looping if ci happens to get compiled (not just jl_fptr_wait_for_compiled, but fully jl_is_compiled_codeinst) - callee = pop!(tocompile) - ci_has_invoke(callee) && continue - callee in inspected && continue - src = get(interp.codegen, callee, nothing) +function add_codeinsts_to_jit!(interp::AbstractInterpreter, ci, source_mode::UInt8) + source_mode == SOURCE_MODE_ABI || return ci + ci isa CodeInstance && !ci_has_invoke(ci) || return ci + codegen = codegen_cache(interp) + codegen === nothing && return ci + inspected = IdSet{CodeInstance}() + tocompile = Vector{CodeInstance}() + push!(tocompile, ci) + 
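+    # Worklist: pop one CodeInstance at a time, locate source for it (from the
+    # codegen cache, from its `inferred` field, or by re-running inference),
+    # hand that source to the JIT, and enqueue any callees discovered by
+    # `collectinvokes!` so their code gets emitted as well.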
while !isempty(tocompile) + # ci_has_real_invoke(ci) && return ci # optimization: cease looping if ci happens to get compiled (not just jl_fptr_wait_for_compiled, but fully jl_is_compiled_codeinst) + callee = pop!(tocompile) + ci_has_invoke(callee) && continue + callee in inspected && continue + src = get(codegen, callee, nothing) + if !isa(src, CodeInfo) + src = @atomic :monotonic callee.inferred + if isa(src, String) + src = _uncompressed_ir(callee, src) + end if !isa(src, CodeInfo) - src = @atomic :monotonic callee.inferred - if isa(src, String) - src = _uncompressed_ir(callee, src) + newcallee = typeinf_ext(interp, callee.def, source_mode) # always SOURCE_MODE_ABI + if newcallee isa CodeInstance + callee === ci && (ci = newcallee) # ci stopped meeting the requirements after typeinf_ext last checked, try again with newcallee + push!(tocompile, newcallee) end - if !isa(src, CodeInfo) - newcallee = typeinf_ext(interp, callee.def, source_mode) - if newcallee isa CodeInstance - callee === ci && (ci = newcallee) # ci stopped meeting the requirements after typeinf_ext last checked, try again with newcallee - push!(tocompile, newcallee) - #else - # println("warning: could not get source code for ", callee.def) - end - continue + if newcallee !== callee + push!(inspected, callee) end + continue end - push!(inspected, callee) - collectinvokes!(tocompile, src) - ccall(:jl_add_codeinst_to_jit, Cvoid, (Any, Any), callee, src) end + push!(inspected, callee) + collectinvokes!(tocompile, src) + mi = get_ci_mi(callee) + if iszero(ccall(:jl_mi_cache_has_ci, Cint, (Any, Any), mi, callee)) + cached = ccall(:jl_get_ci_equiv, Any, (Any, UInt), callee, get_inference_world(interp))::CodeInstance + if cached === callee + # make sure callee is gc-rooted and cached, as required by jl_add_codeinst_to_jit + code_cache(interp)[mi] = callee + else + # use an existing CI from the cache, if there is a compatible one available + callee === ci && (ci = cached) + callee = cached + end + end + ccall(:jl_add_codeinst_to_jit, Cvoid, (Any, Any), callee, src) end return ci end +function typeinf_ext_toplevel(interp::AbstractInterpreter, mi::MethodInstance, source_mode::UInt8) + ci = typeinf_ext(interp, mi, source_mode) + ci = add_codeinsts_to_jit!(interp, ci, source_mode) + return ci +end + +# This is a bridge for the C code calling `jl_typeinf_func()` on a single Method match +function typeinf_ext_toplevel(mi::MethodInstance, world::UInt, source_mode::UInt8) + interp = NativeInterpreter(world) + return typeinf_ext_toplevel(interp, mi, source_mode) +end + # This is a bridge for the C code calling `jl_typeinf_func()` on set of Method matches # The trim_mode can be any of: const TRIM_NO = 0 @@ -1325,7 +1345,7 @@ function typeinf_ext_toplevel(methods::Vector{Any}, worlds::Vector{UInt}, trim_m # and this is either the primary world, or not applicable in the primary world # then we want to compile and emit this if item.def.primary_world <= this_world <= item.def.deleted_world - ci = typeinf_ext(interp, item, SOURCE_MODE_NOT_REQUIRED) + ci = typeinf_ext(interp, item, SOURCE_MODE_GET_SOURCE) ci isa CodeInstance && push!(tocompile, ci) end elseif item isa SimpleVector && latest @@ -1336,7 +1356,7 @@ function typeinf_ext_toplevel(methods::Vector{Any}, worlds::Vector{UInt}, trim_m sig, this_world, #= mt_cache =# 0) if ptr !== C_NULL mi = unsafe_pointer_to_objref(ptr)::MethodInstance - ci = typeinf_ext(interp, mi, SOURCE_MODE_NOT_REQUIRED) + ci = typeinf_ext(interp, mi, SOURCE_MODE_GET_SOURCE) ci isa CodeInstance &&
push!(tocompile, ci) end # additionally enqueue the ccallable entrypoint / adapter, which implicitly @@ -1348,26 +1368,37 @@ function typeinf_ext_toplevel(methods::Vector{Any}, worlds::Vector{UInt}, trim_m while !isempty(tocompile) callee = pop!(tocompile) callee in inspected && continue - push!(inspected, callee) # now make sure everything has source code, if desired mi = get_ci_mi(callee) def = mi.def if use_const_api(callee) src = codeinfo_for_const(interp, mi, callee.rettype_const) - elseif haskey(interp.codegen, callee) - src = interp.codegen[callee] - elseif isa(def, Method) && !InferenceParams(interp).force_enable_inference && ccall(:jl_get_module_infer, Cint, (Any,), def.module) == 0 - src = retrieve_code_info(mi, get_inference_world(interp)) else - # TODO: typeinf_code could return something with different edges/ages/owner/abi (needing an update to callee), which we don't handle here - src = typeinf_code(interp, mi, true) + src = get(interp.codegen, callee, nothing) + if src === nothing + newcallee = typeinf_ext(interp, mi, SOURCE_MODE_GET_SOURCE) + if newcallee isa CodeInstance + @assert use_const_api(newcallee) || haskey(interp.codegen, newcallee) + push!(tocompile, newcallee) + end + if newcallee !== callee + push!(inspected, callee) + end + continue + end end + push!(inspected, callee) if src isa CodeInfo collectinvokes!(tocompile, src) - # It is somewhat ambiguous if typeinf_ext might have callee in the caches, - # but for the purpose of native compile, we always want them put there. + # try to reuse an existing CodeInstance from before to avoid making duplicates in the cache if iszero(ccall(:jl_mi_cache_has_ci, Cint, (Any, Any), mi, callee)) - code_cache(interp)[mi] = callee + cached = ccall(:jl_get_ci_equiv, Any, (Any, UInt), callee, this_world)::CodeInstance + if cached === callee + code_cache(interp)[mi] = callee + else + # Use an existing CI from the cache, if there is a compatible one available + callee = cached + end end push!(codeinfos, callee) push!(codeinfos, src) diff --git a/Compiler/src/types.jl b/Compiler/src/types.jl index eb05ba2b8daa6..a04c9e70174fe 100644 --- a/Compiler/src/types.jl +++ b/Compiler/src/types.jl @@ -23,6 +23,10 @@ the following methods to satisfy the `AbstractInterpreter` API requirement: - `get_inference_world(interp::NewInterpreter)` - return the world age for this interpreter - `get_inference_cache(interp::NewInterpreter)` - return the local inference cache - `cache_owner(interp::NewInterpreter)` - return the owner of any new cache entries + +If `CodeInstance`s compiled using `interp::NewInterpreter` are meant to be executed with `invoke`, +a method `codegen_cache(interp::NewInterpreter) -> IdDict{CodeInstance, CodeInfo}` must be defined, +and inference must be triggered via `typeinf_ext_toplevel` with source mode `SOURCE_MODE_ABI`. """ abstract type AbstractInterpreter end @@ -446,6 +450,19 @@ to incorporate customized dispatches for the overridden methods. method_table(interp::AbstractInterpreter) = InternalMethodTable(get_inference_world(interp)) method_table(interp::NativeInterpreter) = interp.method_table +""" + codegen_cache(interp::AbstractInterpreter) -> Union{Nothing, IdDict{CodeInstance, CodeInfo}} + +Optionally return a cache associating a `CodeInfo` to a `CodeInstance` that should be added to the JIT +for future execution via `invoke(f, ::CodeInstance, args...)`. This cache is used during `typeinf_ext_toplevel`, +and may be safely discarded between calls to this function.
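+
+For example, a custom interpreter can opt in by storing such a dict in a field (a
+sketch using a hypothetical `MyInterp`; the rest of the `AbstractInterpreter` API
+must still be implemented, as is done for `SplitCacheInterp` above):
+
+    struct MyInterp <: AbstractInterpreter
+        codegen::IdDict{CodeInstance, CodeInfo}
+    end
+    codegen_cache(interp::MyInterp) = interp.codegen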
+ +By default, a value of `nothing` is returned indicating that `CodeInstance`s should not be added to the JIT. +Attempting to execute them via `invoke` will result in an error. +""" +codegen_cache(interp::AbstractInterpreter) = nothing +codegen_cache(interp::NativeInterpreter) = interp.codegen + """ By default `AbstractInterpreter` implements the following inference bail out logic: - `bail_out_toplevel_call(::AbstractInterpreter, sig, ::InferenceState)`: bail out from diff --git a/Compiler/src/verifytrim.jl b/Compiler/src/verifytrim.jl index 5a80082c63330..2365d885efd79 100644 --- a/Compiler/src/verifytrim.jl +++ b/Compiler/src/verifytrim.jl @@ -110,7 +110,7 @@ end function verify_print_error(io::IOContext{IO}, desc::CallMissing, parents::ParentMap) (; codeinst, codeinfo, sptypes, stmtidx, desc) = desc frames = verify_create_stackframes(codeinst, stmtidx, parents) - print(io, desc, " from ") + print(io, desc, " from statement ") verify_print_stmt(io, codeinfo, sptypes, stmtidx) Base.show_backtrace(io, frames) print(io, "\n\n") @@ -181,6 +181,11 @@ function verify_codeinstance!(codeinst::CodeInstance, codeinfo::CodeInfo, inspec if edge isa CodeInstance haskey(parents, edge) || (parents[edge] = (codeinst, i)) edge in inspected && continue + edge_mi = get_ci_mi(edge) + if edge_mi === edge.def + ci = get(caches, edge_mi, nothing) + ci isa CodeInstance && continue # assume that only this_world matters for trim + end end # TODO: check for calls to Base.atexit? elseif isexpr(stmt, :call) @@ -287,7 +292,7 @@ function get_verify_typeinf_trim(codeinfos::Vector{Any}) # TODO: should we find a way to indicate to the user that this gets called via ccallable? # parent[ci] = something asrt = ci.rettype - ci in inspected + true else false end @@ -326,6 +331,14 @@ function verify_typeinf_trim(io::IO, codeinfos::Vector{Any}, onlywarn::Bool) verify_print_error(io, desc, parents) end + ## TODO: compute and display the minimum and/or full call graph instead of merely the first parent stacktrace? + #for i = 1:length(codeinfos) + # item = codeinfos[i] + # if item isa CodeInstance + # println(item, "::", item.rettype) + # end + #end + let severity = 0 if counts[1] > 0 || counts[2] > 0 print("Trim verify finished with ") diff --git a/Compiler/test/AbstractInterpreter.jl b/Compiler/test/AbstractInterpreter.jl index 533eaf93937a3..83218d73cad69 100644 --- a/Compiler/test/AbstractInterpreter.jl +++ b/Compiler/test/AbstractInterpreter.jl @@ -534,3 +534,17 @@ let interp = DebugInterp() end @test found end + +@newinterp InvokeInterp +struct InvokeOwner end +codegen = IdDict{CodeInstance, CodeInfo}() +Compiler.cache_owner(::InvokeInterp) = InvokeOwner() +Compiler.codegen_cache(::InvokeInterp) = codegen +let interp = InvokeInterp() + source_mode = Compiler.SOURCE_MODE_ABI + f = (+) + args = (1, 1) + mi = @ccall jl_method_lookup(Any[f, args...]::Ptr{Any}, (1+length(args))::Csize_t, Base.tls_world_age()::Csize_t)::Ref{Core.MethodInstance} + ci = Compiler.typeinf_ext_toplevel(interp, mi, source_mode) + @test invoke(f, ci, args...) == 2 +end diff --git a/Compiler/test/inference.jl b/Compiler/test/inference.jl index b77c99513a8b6..ec569a0ba04b5 100644 --- a/Compiler/test/inference.jl +++ b/Compiler/test/inference.jl @@ -6194,3 +6194,8 @@ f57292(xs::Union{Tuple{String}, Int}...) = getfield(xs...) g57292(xs::String...) = getfield(("abc",), 1, :not_atomic, xs...) 
@test Base.infer_return_type(f57292) == String @test Base.infer_return_type(g57292) == String + +global invalid_setglobal!_exct_modeling::Int +@test Base.infer_exception_type((Float64,)) do x + setglobal!(@__MODULE__, :invalid_setglobal!_exct_modeling, x) +end == ErrorException diff --git a/Compiler/test/verifytrim.jl b/Compiler/test/verifytrim.jl index a03804a94cb62..0e9d040ef0c9b 100644 --- a/Compiler/test/verifytrim.jl +++ b/Compiler/test/verifytrim.jl @@ -33,7 +33,7 @@ let infos = typeinf_ext_toplevel(Any[Core.svec(Base.SecretBuffer, Tuple{Type{Bas @test occursin("finalizer", desc.desc) repr = sprint(verify_print_error, desc, parents) @test occursin( - r"""^unresolved finalizer registered from \(Core.finalizer\)\(Base.final_shred!, %new\(\)::Base.SecretBuffer\)::Nothing + r"""^unresolved finalizer registered from statement \(Core.finalizer\)\(Base.final_shred!, %new\(\)::Base.SecretBuffer\)::Nothing Stacktrace: \[1\] finalizer\(f::typeof\(Base.final_shred!\), o::Base.SecretBuffer\) @ Base gcutils.jl:(\d+) \[inlined\] diff --git a/base/boot.jl b/base/boot.jl index 8cd032817cebe..32975e96af583 100644 --- a/base/boot.jl +++ b/base/boot.jl @@ -1017,7 +1017,7 @@ _setparser!(parser) = setglobal!(Core, :_parse, parser) # support for deprecated uses of builtin functions _apply(x...) = _apply_iterate(Main.Base.iterate, x...) -_apply_pure(x...) = invoke_in_world_total(typemax_UInt, x...) +const _apply_pure = _apply const _call_latest = invokelatest const _call_in_world = invoke_in_world diff --git a/base/iobuffer.jl b/base/iobuffer.jl index 144b0a20568e9..5e08a21d53186 100644 --- a/base/iobuffer.jl +++ b/base/iobuffer.jl @@ -1,45 +1,168 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license -## work with AbstractVector{UInt8} via I/O primitives ## +# IOBuffer is a Memory{UInt8} backed IO type for in-memory IO. + +# Here, u represents used bytes (already read), X represents bytes still to read, +# - represents uninitialized bytes, which can be written to later. +# . represents bytes before offset, which the buffer will not touch until +# a write operation happens. + +# .....uuuuuuuuuuuuuXXXXXXXXXXXXX------------ +# | | | | | | +# | offset ptr size | maxsize +# 1 lastindex(data) + +# N.B: `mark` does not correspond to any index in the buffer. Instead, it stores +# the mark as a virtual offset in the buffer. + +# AFTER COMPACTION + +# XXXXXXXXXXXXX-------------------------- +# || | | | | +# |1 ptr size | maxsize +# | lastindex(data) +# offset (set to zero) + +# * The underlying array is always 1-indexed +# * The IOBuffer has full control (ownership) of the underlying array only when +# buffer.writable == true. +# * Unreachable data can be deleted from the buffer's data, shifting the whole thing to the left +# to make room for more data, without replacing or resizing data. +# This can be done only if the buffer is not seekable -# Stateful string mutable struct GenericIOBuffer{T<:AbstractVector{UInt8}} <: IO - data::T # T should support: getindex, setindex!, length, copyto!, similar, and (optionally) resize! - reinit::Bool # if true, data needs to be re-allocated (after take!) + # T should support: getindex, setindex!, length, copyto!, similar, size and (optionally) resize! + data::T + + # The user can take control of `data` out of this struct. When that happens, instead of eagerly allocating + # a new array, we set `.reinit` to true, and then allocate a new one when needed. + # If reinit is true, the buffer is writable, and offset_or_compacted and size are zero.
See `take!` + reinit::Bool readable::Bool writable::Bool - seekable::Bool # if not seekable, implementation is free to destroy (compact) past read data - append::Bool # add data at end instead of at pointer - size::Int # end pointer (and write pointer if append == true) + offset - maxsize::Int # fixed array size (typically pre-allocated) - ptr::Int # read (and maybe write) pointer + offset - offset::Int # offset of ptr and size from actual start of data and actual size - mark::Int # reset mark location for ptr (or <0 for no mark) - function GenericIOBuffer{T}(data::T, readable::Bool, writable::Bool, seekable::Bool, append::Bool, - maxsize::Integer) where T<:AbstractVector{UInt8} - require_one_based_indexing(data) - return new(data, false, readable, writable, seekable, append, length(data), maxsize, 1, 0, -1) - end + # If not seekable, implementation is free to destroy (compact) data before ptr, unless + # it can be recovered using the mark via `reset`. + # If it IS seekable, the user may always recover any data in 1:size by seeking, + # so no data can be destroyed. + # Non-seekable IOBuffers can only be constructed with `PipeBuffer`, which are writable, + # readable and append. + seekable::Bool + + # If true, write new data to the index size+1 instead of the index ptr. + append::Bool + + # Last index of `data` that has been written to. Data in size+1:end has not yet been used, + # and may contain arbitrary values. + # This value is always in 0 : lastindex(data) + size::Int + + # When the buffer is resized, or a new buffer allocated, this is the maximum size of the buffer. + # A new GenericIOBuffer may be constructed with existing data larger than `maxsize`. + # When that happens, we must make sure not to have more than `maxsize` bytes in the buffer, + # else reallocating will lose data. So, never write to indices > `maxsize + get_offset(io)` + # This value is always in 0:typemax(Int). + maxsize::Int + + # Data is read/written from/to ptr, except in situations where append is true, in which case + # data is still read from ptr, but written to size+1. + # This value is always in offset + 1 : size+1 + ptr::Int + + # This field has two distinct meanings: + # If the value is positive, it encodes an offset of the start of the data in `data`. + # This is used if the buffer is instantiated from a Vector with non-zero memory offset. + # Then, the IOBuffer stores the underlying memory, and so the first data in the buffer + # is not at index 1. + # If the value is negative, then `-io.offset_or_compacted` gets the number of compacted + # bytes. That's the number of unused bytes deleted from a non-seekable stream to make space. + # We need to keep track of it in order to make `mark` and `position` etc work, that is, + # we need to know the virtual position of the mark even when an arbitrary number + # of unused bytes has been deleted due to compaction. + # Since compaction will move data in the buffer and thereby zero the offset, either the + # offset or the number of compacted bytes will be zero at any point, so both can be + # stored in one field. + # If offset: Value is always in 0:lastindex(data) + # If compacted: Value is in typemin(Int):0 + offset_or_compacted::Int + + # The mark is -1 if not set, else the zero-indexed virtual position of ptr in the buffer. + # Due to compaction and offset, this value is not an index into the buffer, but may be translated + # to an index.
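+    # For example (illustrative numbers): if 4 leading bytes have been compacted
+    # away (offset_or_compacted == -4), a mark at virtual position 6 refers to data
+    # index 6 + (-4) + 1 == 3, which is what `translate_seek_position` computes.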
+ # This value is in -1:typemax(Int) + mark::Int + + # Unsafe constructor which does not do any checking + global function _new_generic_iobuffer( + ::Type{T}, + data::T, + readable::Bool, + writable::Bool, + seekable::Bool, + append::Bool, + maxsize::Int, + ) where T<:AbstractVector{UInt8} + len = Int(length(data))::Int + return new{T}(data, false, readable, writable, seekable, append, len, maxsize, 1, 0, -1) + end +end + +function GenericIOBuffer{T}( + data::T, + readable::Bool, + writable::Bool, + seekable::Bool, + append::Bool, + maxsize::Integer, + truncate::Bool, + ) where T<:AbstractVector{UInt8} + require_one_based_indexing(data) + mz = Int(maxsize)::Int + len = Int(length(data))::Int + if !truncate && mz < len + throw(ArgumentError("maxsize must not be smaller than data length")) + end + buf = _new_generic_iobuffer(T, data, readable, writable, seekable, append, mz) + if truncate + buf.size = buf.offset_or_compacted + end + buf end const IOBuffer = GenericIOBuffer{Memory{UInt8}} function GenericIOBuffer(data::T, readable::Bool, writable::Bool, seekable::Bool, append::Bool, - maxsize::Integer) where T<:AbstractVector{UInt8} - GenericIOBuffer{T}(data, readable, writable, seekable, append, maxsize) + maxsize::Integer, truncate::Bool) where T<:AbstractVector{UInt8} + GenericIOBuffer{T}(data, readable, writable, seekable, append, maxsize, truncate) end + +# For this method, we use the underlying Memory of the vector. Therefore, we need to set the +# ptr and size accordingly, so the buffer only uses the part of the memory that the vector does. function GenericIOBuffer(data::Vector{UInt8}, readable::Bool, writable::Bool, seekable::Bool, append::Bool, - maxsize::Integer) + maxsize::Integer, truncate::Bool) ref = data.ref - buf = GenericIOBuffer(ref.mem, readable, writable, seekable, append, maxsize) + mem = ref.mem offset = memoryrefoffset(ref) - 1 - buf.ptr += offset - buf.size = length(data) + offset - buf.offset = offset + # The user may pass a vector of length <= maxsize, but where the underlying memory + # is larger than maxsize. Don't throw an error in that case.
+ mz = Int(maxsize)::Int + if !truncate && mz < length(data) + throw(ArgumentError("maxsize must not be smaller than data length")) + end + buf = _new_generic_iobuffer(Memory{UInt8}, mem, readable, writable, seekable, append, mz) + buf.offset_or_compacted = offset + buf.ptr = offset + 1 + if truncate + buf.size = offset + else + buf.size = length(data) + offset + end return buf end +get_offset(io::GenericIOBuffer) = max(0, io.offset_or_compacted) +get_compacted(io::GenericIOBuffer) = max(0, -io.offset_or_compacted) + # allocate Vector{UInt8}s for IOBuffer storage that can efficiently become Strings StringMemory(n::Integer) = unsafe_wrap(Memory{UInt8}, _string_n(n)) StringVector(n::Integer) = wrap(Array, StringMemory(n)) @@ -111,17 +234,11 @@ function IOBuffer( truncate::Union{Bool,Nothing}=nothing, maxsize::Integer=typemax(Int), sizehint::Union{Integer,Nothing}=nothing) - if maxsize < 0 - throw(ArgumentError("negative maxsize")) - end if sizehint !== nothing sizehint!(data, sizehint) end flags = open_flags(read=read, write=write, append=append, truncate=truncate) - buf = GenericIOBuffer(data, flags.read, flags.write, true, flags.append, Int(maxsize)) - if flags.truncate - buf.size = buf.offset - end + buf = GenericIOBuffer(data, flags.read, flags.write, true, flags.append, maxsize, flags.truncate) return buf end @@ -131,17 +248,23 @@ function IOBuffer(; append::Union{Bool,Nothing}=nothing, truncate::Union{Bool,Nothing}=true, maxsize::Integer=typemax(Int), - sizehint::Union{Integer,Nothing}=nothing) - size = sizehint !== nothing ? Int(sizehint) : maxsize != typemax(Int) ? Int(maxsize) : 32 + sizehint::Union{Integer,Nothing}=nothing, + ) + mz = Int(maxsize)::Int + if mz < 0 + throw(ArgumentError("negative maxsize")) + end + size = if sizehint !== nothing + # Allow negative sizehint, just like `sizehint!` does + min(mz, max(0, Int(sizehint)::Int)) + else + min(mz, 32) + end flags = open_flags(read=read, write=write, append=append, truncate=truncate) - buf = IOBuffer( - StringMemory(size), - read=flags.read, - write=flags.write, - append=flags.append, - truncate=flags.truncate, - maxsize=maxsize) - fill!(buf.data, 0) + # A common use case of IOBuffer is to incrementally construct strings. By using StringMemory + # as the default storage, we can turn the result into a string without copying. + buf = _new_generic_iobuffer(Memory{UInt8}, StringMemory(size), flags.read, flags.write, true, flags.append, mz) + buf.size = 0 return buf end @@ -158,21 +281,53 @@ If `data` is given, creates a `PipeBuffer` to operate on a data vector, optionally specifying a size beyond which the underlying `Array` may not be grown. """ PipeBuffer(data::AbstractVector{UInt8}=Memory{UInt8}(); maxsize::Int = typemax(Int)) = - GenericIOBuffer(data, true, true, false, true, maxsize) + GenericIOBuffer(data, true, true, false, true, maxsize, false) PipeBuffer(maxsize::Integer) = (x = PipeBuffer(StringMemory(maxsize), maxsize = maxsize); x.size = 0; x) +# Internal method where truncation IS supported +function _truncated_pipebuffer(data::AbstractVector{UInt8}=Memory{UInt8}(); maxsize::Int = typemax(Int)) + buf = PipeBuffer(data) + buf.size = get_offset(buf) + buf.maxsize = maxsize + buf +end + _similar_data(b::GenericIOBuffer, len::Int) = similar(b.data, len) _similar_data(b::IOBuffer, len::Int) = StringMemory(len) -function copy(b::GenericIOBuffer) - ret = typeof(b)(b.reinit ? _similar_data(b, 0) : b.writable ?
copyto!(_similar_data(b, length(b.data)), b.data) : b.data, - b.readable, b.writable, b.seekable, b.append, b.maxsize) - ret.size = b.size - ret.ptr = b.ptr - ret.mark = b.mark - ret.offset = b.offset - return ret +# Note: Copying may change the value of the position (and mark) for un-seekable streams. +# However, these values are not stable anyway due to compaction. + +function copy(b::GenericIOBuffer{T}) where T + if b.reinit + # If buffer is used up, allocate a new size-zero buffer + # Reinit implies writable, and that ptr, size, offset and mark are already the default values + return typeof(b)(_similar_data(b, 0), b.readable, b.writable, b.seekable, b.append, b.maxsize, false) + elseif b.writable + # Else, we just copy the reachable bytes. If buffer is seekable, all bytes + # after offset are reachable, since they can be seeked to + used_span = get_used_span(b) + compacted = first(used_span) - get_offset(b) - 1 + len = length(used_span) + data = copyto!(_similar_data(b, len), view(b.data, used_span)) + ret = typeof(b)(data, b.readable, b.writable, b.seekable, b.append, b.maxsize, false) + ret.size = len + # Copying the data over implicitly compacts, and may increase the number of compacted bytes + ret.offset_or_compacted = -get_compacted(b) - compacted + ret.ptr = b.ptr - first(used_span) + 1 + ret.mark = b.mark + return ret + else + # When the buffer is just readable, the copy and the original can share the same data, + # so we just make a shallow copy of the IOBuffer struct. + # Use the internal constructor because we want to allow b.maxsize to be larger than the data, + # as may be the case for `b`. + ret = _new_generic_iobuffer(T, b.data, b.readable, b.writable, b.seekable, b.append, b.maxsize) + ret.offset_or_compacted = b.offset_or_compacted + ret.ptr = b.ptr + ret.mark = b.mark + return ret + end end show(io::IO, b::GenericIOBuffer) = print(io, "IOBuffer(data=UInt8[...], ", @@ -180,9 +335,9 @@ show(io::IO, b::GenericIOBuffer) = print(io, "IOBuffer(data=UInt8[...], ", "writable=", b.writable, ", ", "seekable=", b.seekable, ", ", "append=", b.append, ", ", - "size=", b.size - b.offset, ", ", + "size=", b.size - get_offset(b), ", ", "maxsize=", b.maxsize == typemax(Int) ?
"Inf" : b.maxsize, ", ", - "ptr=", b.ptr - b.offset, ", ", + "ptr=", b.ptr - get_offset(b), ", ", "mark=", b.mark, ")") @noinline function _throw_not_readable() @@ -192,7 +347,7 @@ end function unsafe_read(from::GenericIOBuffer, p::Ptr{UInt8}, nb::UInt) from.readable || _throw_not_readable() - avail = bytesavailable(from) + avail = bytesavailable(from) % UInt adv = min(avail, nb) unsafe_read!(p, from.data, from.ptr, adv) from.ptr += adv @@ -221,7 +376,45 @@ function unsafe_read!(dest::Ptr{UInt8}, src::DenseBytes, so::Integer, nbytes::UI nothing end -function peek(from::GenericIOBuffer, T::Union{Type{Int16},Type{UInt16},Type{Int32},Type{UInt32},Type{Int64},Type{UInt64},Type{Int128},Type{UInt128},Type{Float16},Type{Float32},Type{Float64}}) +const MultiByteBitNumberType = Union{ + Type{UInt16}, + Type{Int16}, + Type{UInt32}, + Type{Int32}, + Type{UInt64}, + Type{Int64}, + Type{UInt128}, + Type{Int128}, + Type{Float16}, + Type{Float32}, + Type{Float64}, +} + +function load_from_array(T::MultiByteBitNumberType, data::AbstractArray{UInt8}, from::Int) + x = if T <: AbstractFloat + uinttype(T)(0) + else + unsigned(T)(0) + end + for i in 0:sizeof(x)-1 + x |= typeof(x)(data[from + i]) << (8 * i) + end + reinterpret(T, ltoh(x)) +end + +function peek(from::GenericIOBuffer, T::MultiByteBitNumberType) + from.readable || _throw_not_readable() + avail = bytesavailable(from) + nb = sizeof(T) + if nb > avail + throw(EOFError()) + end + return load_from_array(T, from.data, from.ptr) +end + +# This method can use a pointer, since the underlying buffer is dense +# and memory backed +function peek(from::GenericIOBuffer{<:MutableDenseArrayType}, T::MultiByteBitNumberType) from.readable || _throw_not_readable() avail = bytesavailable(from) nb = sizeof(T) @@ -235,29 +428,12 @@ function peek(from::GenericIOBuffer, T::Union{Type{Int16},Type{UInt16},Type{Int3 return x end -function read(from::GenericIOBuffer, T::Union{Type{Int16},Type{UInt16},Type{Int32},Type{UInt32},Type{Int64},Type{UInt64},Type{Int128},Type{UInt128},Type{Float16},Type{Float32},Type{Float64}}) +function read(from::GenericIOBuffer, T::MultiByteBitNumberType) x = peek(from, T) from.ptr += sizeof(T) return x end -function read_sub(from::GenericIOBuffer, a::AbstractArray{T}, offs, nel) where T - require_one_based_indexing(a) - from.readable || _throw_not_readable() - if offs+nel-1 > length(a) || offs < 1 || nel < 0 - throw(BoundsError()) - end - if isa(a, MutableDenseArrayType{UInt8}) - nb = UInt(nel * sizeof(T)) - GC.@preserve a unsafe_read(from, pointer(a, offs), nb) - else - for i = offs:offs+nel-1 - a[i] = read(from, T) - end - end - return a -end - @inline function read(from::GenericIOBuffer, ::Type{UInt8}) from.readable || _throw_not_readable() ptr = from.ptr @@ -283,20 +459,35 @@ read(from::GenericIOBuffer, ::Type{Ptr{T}}) where {T} = convert(Ptr{T}, read(fro isreadable(io::GenericIOBuffer) = io.readable iswritable(io::GenericIOBuffer) = io.writable -filesize(io::GenericIOBuffer) = (io.seekable ? io.size - io.offset : bytesavailable(io)) +# Number of bytes that can be read from the buffer, if you seek to the start first. +filesize(io::GenericIOBuffer) = (io.seekable ? io.size - get_offset(io) : bytesavailable(io)) + +# Number of bytes that can be read from the buffer. bytesavailable(io::GenericIOBuffer) = io.size - io.ptr + 1 -position(io::GenericIOBuffer) = io.ptr - io.offset - 1 + +# TODO: Document that position for an unmarked and unseekable stream is invalid (and make it error?) 
+function position(io::GenericIOBuffer) + # Position is zero-indexed, but ptr is one-indexed, hence the -1 + io.ptr - io.offset_or_compacted - 1 +end function skip(io::GenericIOBuffer, n::Integer) skip(io, clamp(n, Int)) end + function skip(io::GenericIOBuffer, n::Int) + # In both cases, the result will never go before the first position, + # nor beyond the last position, and will not throw an error unless the stream + # is not seekable and we try to skip a negative number of bytes. if signbit(n) + # Skipping a negative number of bytes is equivalent to seeking backwards. seekto = clamp(widen(position(io)) + widen(n), Int) seek(io, seekto) # Does error checking else - n_max = io.size + 1 - io.ptr - io.ptr += min(n, n_max) + # Don't use seek in order to allow a non-seekable IO to still skip bytes. + # Handle overflow. + maxptr = io.size + 1 + io.ptr = n > maxptr || io.ptr - n > maxptr ? maxptr : io.ptr + n io end end @@ -304,16 +495,30 @@ end function seek(io::GenericIOBuffer, n::Integer) seek(io, clamp(n, Int)) end + +function translate_seek_position(io::GenericIOBuffer, n::Int) + # If there is an offset (the field F is positive), then there are F unused bytes at the beginning + # of the data, and we need to seek to n + F + 1. (Also compensate for `seek` being zero- + # indexed) + + # If bytes have been compacted (the field F is negative), then -F bytes have been deleted from + # the buffer, and a virtual position n means a position n + F in the data. + # Remember that F is negative, so n + F is subtracting from n. So we also end up with + # n + F + 1. + clamp(widen(n) + widen(io.offset_or_compacted) + widen(1), Int) +end + function seek(io::GenericIOBuffer, n::Int) if !io.seekable ismarked(io) || throw(ArgumentError("seek failed, IOBuffer is not seekable and is not marked")) n == io.mark || throw(ArgumentError("seek failed, IOBuffer is not seekable and n != mark")) end + # TODO: REPL.jl relies on the fact that this does not throw (by seeking past the beginning or end # of an GenericIOBuffer), so that would need to be fixed in order to throw an error here - #(n < 0 || n > io.size - io.offset) && throw(ArgumentError("Attempted to seek outside IOBuffer boundaries.")) - #io.ptr = n + io.offset + 1 - io.ptr = clamp(n, 0, io.size - io.offset) + io.offset + 1 + max_ptr = io.size + 1 + min_ptr = get_offset(io) + 1 + io.ptr = clamp(translate_seek_position(io, n), min_ptr, max_ptr) return io end @@ -322,113 +527,163 @@ function seekend(io::GenericIOBuffer) return io end -# choose a resize strategy based on whether `resize!` is defined: -# for a Vector, we use `resize!`, but for most other types, -# this calls `similar`+copy -function _resize!(io::GenericIOBuffer, sz::Int) - a = io.data - offset = io.offset - if applicable(resize!, a, sz) - if offset != 0 - size = io.size - size > offset && copyto!(a, 1, a, offset + 1, min(sz, size - offset)) - io.ptr -= offset - io.size -= offset - io.offset = 0 - end - resize!(a, sz) +# Resize the io's data to `new_size`, which must not be > io.maxsize. +# Use `resize!` if the data supports it, else reallocate a new one and +# copy the old data over. +# If not `exact` and resizing is not supported, overallocate in order to +# prevent excessive resizing. +function _resize!(io::GenericIOBuffer, new_size::Int, exact::Bool) + old_data = io.data + if applicable(resize!, old_data, new_size) + resize!(old_data, new_size) else - size = io.size - if size >= sz && sz != 0 - b = a - else - b = _similar_data(io, sz == 0 ?
0 : max(overallocation(size - io.offset), sz)) - end - size > offset && copyto!(b, 1, a, offset + 1, min(sz, size - offset)) - io.data = b - io.ptr -= offset - io.size -= offset - io.offset = 0 + new_size = exact ? new_size : min(io.maxsize, overallocation(new_size)) + used_span = get_used_span(io) + deleted = first(used_span) - 1 + compacted = deleted - get_offset(io) + new_data = _similar_data(io, new_size) + io.data = new_data + iszero(new_size) && return io + len_used = length(used_span) + iszero(len_used) || copyto!(new_data, 1, old_data, first(used_span), len_used) + # Copying will implicitly compact, and so compaction must be updated + io.offset_or_compacted = -get_compacted(io) - compacted + io.ptr -= deleted + io.size = len_used end return io end function truncate(io::GenericIOBuffer, n::Integer) io.writable || throw(ArgumentError("truncate failed, IOBuffer is not writeable")) + # Non-seekable buffers can only be constructed with `PipeBuffer`, which is explicitly + # documented to not be truncatable. io.seekable || throw(ArgumentError("truncate failed, IOBuffer is not seekable")) n < 0 && throw(ArgumentError("truncate failed, n bytes must be ≥ 0, got $n")) n > io.maxsize && throw(ArgumentError("truncate failed, $(n) bytes is exceeds IOBuffer maxsize $(io.maxsize)")) - n = Int(n) + n = Int(n)::Int + offset = get_offset(io) + current_size = io.size - offset if io.reinit - io.data = _similar_data(io, n) + # If reinit, we don't need to truncate anything but just reinitialize + # the buffer with zeros. Mark, ptr and offset have already been reset. + io.data = fill!(_similar_data(io, n), 0x00) io.reinit = false - elseif n > length(io.data) + io.offset - _resize!(io, n) - end - ismarked(io) && io.mark > n && unmark(io) - n += io.offset - io.data[io.size+1:n] .= 0 - io.size = n - io.ptr = min(io.ptr, n+1) + io.size = n + elseif n < current_size + # Else, if we need to shrink the iobuffer, we simply change the pointers without + # actually shrinking the underlying storage, or copying data. + + # Clear the mark if it points to data that has now been deleted. + if translate_seek_position(io, io.mark) > n+offset + io.mark = -1 + end + io.size = n + offset + io.ptr = min(io.ptr, n + offset + 1) + elseif n > current_size + if n + offset > io.maxsize + compact!(io) + end + _resize!(io, n + get_offset(io), false) + fill!(view(io.data, io.size + 1:min(length(io.data), n + get_offset(io))), 0x00) + io.size = min(length(io.data), n + get_offset(io)) + end return io end -function compact(io::GenericIOBuffer) - io.writable || throw(ArgumentError("compact failed, IOBuffer is not writeable")) - io.seekable && throw(ArgumentError("compact failed, IOBuffer is seekable")) - io.reinit && return - local ptr::Int, bytes_to_move::Int - if ismarked(io) && io.mark < position(io) - io.mark == 0 && return - ptr = io.mark + io.offset - bytes_to_move = bytesavailable(io) + (io.ptr - ptr) - else - ptr = io.ptr - bytes_to_move = bytesavailable(io) +# Ensure that the buffer has room for at least `nshort` more bytes, except when +# doing that would exceed maxsize. +@inline ensureroom(io::GenericIOBuffer, nshort::Int) = ensureroom(io, UInt(nshort)) + +@inline function ensureroom(io::GenericIOBuffer, nshort::UInt) + # If the IO is not writable, we call the slow path only to error. + # If reinit, the data has been handed out to the user, and the IOBuffer + # no longer controls it, so we need to allocate a new one.
+ if !io.writable || io.reinit + return ensureroom_reallocate(io, nshort) + end + # The fast path here usually checks there is already room, then does nothing. + # When append is true, new data is added after io.size, not io.ptr + existing_space = min(lastindex(io.data), io.maxsize + get_offset(io)) - (io.append ? io.size : io.ptr - 1) + if existing_space < nshort % Int + # Outline this function to make it more likely that ensureroom inlines itself + return ensureroom_slowpath(io, nshort, existing_space) end - copyto!(io.data, 1, io.data, ptr, bytes_to_move) - io.size -= ptr - 1 - io.ptr -= ptr - 1 - io.offset = 0 - return + return io end -@noinline function ensureroom_slowpath(io::GenericIOBuffer, nshort::UInt) +# Throw error (placed in this function to outline it) or reinit the buffer +@noinline function ensureroom_reallocate(io::GenericIOBuffer, nshort::UInt) io.writable || throw(ArgumentError("ensureroom failed, IOBuffer is not writeable")) - if io.reinit - io.data = _similar_data(io, nshort % Int) - io.reinit = false - end - if !io.seekable - if !ismarked(io) && io.ptr > io.offset+1 && io.size <= io.ptr - 1 - io.ptr = 1 - io.size = 0 - io.offset = 0 - else - datastart = (ismarked(io) ? io.mark : io.ptr - io.offset) - if (io.size-io.offset+nshort > io.maxsize) || - (datastart > 4096 && datastart > io.size - io.ptr) || - (datastart > 262144) - # apply somewhat arbitrary heuristics to decide when to destroy - # old, read data to make more room for new data - compact(io) - end + io.data = _similar_data(io, min(io.maxsize, nshort % Int)) + io.reinit = false + io.offset_or_compacted = -get_compacted(io) + return io +end + +# Here, we already know there is not enough room at the end of the io's data. +@noinline function ensureroom_slowpath(io::GenericIOBuffer, nshort::UInt, available_bytes::Int) + reclaimable_bytes = first(get_used_span(io)) - 1 + # Avoid resizing and instead compact the buffer only if we gain enough bytes from + # doing so (at least 32 bytes and 1/8th of the data length). Also, if we would have + # to resize anyway, there would be no point in compacting, so also check that. + if ( + reclaimable_bytes ≥ 32 && + reclaimable_bytes ≥ length(io.data) >>> 3 && + (reclaimable_bytes + available_bytes) % UInt ≥ nshort + ) + compact!(io) + return io + end + + desired_size = length(io.data) + Int(nshort) - available_bytes + if desired_size > io.maxsize + # If we can't fit all the requested data in the new buffer, we need to + # fit as much as possible, so we must compact + if !iszero(reclaimable_bytes) + desired_size -= compact!(io) + end + # Max out the buffer size if we want more than the buffer size + if length(io.data) < io.maxsize + _resize!(io, io.maxsize, true) end + else + # Else, we request only the requested size, but set `exact` to `false`, + # in order to overallocate to avoid growing the buffer by too little + _resize!(io, desired_size, false) end - return + + return io end -@inline ensureroom(io::GenericIOBuffer, nshort::Int) = ensureroom(io, UInt(nshort)) -@inline function ensureroom(io::GenericIOBuffer, nshort::UInt) - if !io.writable || (!io.seekable && io.ptr > io.offset+1) || io.reinit - ensureroom_slowpath(io, nshort) - end - n = min((nshort % Int) + (io.append ?
io.size : io.ptr-1) - io.offset, io.maxsize) l = length(io.data) + io.offset if n > l _resize!(io, Int(n)) end return io end -# Get the indices in data which cannot be deleted +function get_used_span(io::GenericIOBuffer) + # A seekable buffer can recover data before ptr + return if io.seekable + get_offset(io) + 1 : io.size + # If non-seekable, the mark can be used to recover data before ptr, + # so data at the mark and after must also be saved + elseif io.mark > -1 + min(io.ptr, translate_seek_position(io, io.mark)) : io.size + else + io.ptr : io.size + end +end + +# Delete any offset, and also compact data if buffer is not seekable. +# Return the number of bytes deleted +function compact!(io::GenericIOBuffer)::Int + offset = get_offset(io) + used_span = get_used_span(io) + deleted = first(used_span) - 1 + compacted = deleted - offset + iszero(deleted) && return 0 + data = io.data + copyto!(data, 1, data, deleted + 1, length(used_span)) + io.offset_or_compacted = -get_compacted(io) - compacted + io.ptr -= deleted + io.size -= deleted + return deleted end eof(io::GenericIOBuffer) = (io.ptr - 1 >= io.size) @@ -439,17 +694,17 @@ function closewrite(io::GenericIOBuffer) end @noinline function close(io::GenericIOBuffer{T}) where T + if io.writable && !io.reinit + _resize!(io, 0, true) + end io.readable = false io.writable = false io.seekable = false io.size = 0 - io.offset = 0 io.maxsize = 0 io.ptr = 1 io.mark = -1 - if io.writable && !io.reinit - io.data = _resize!(io, 0) - end + io.offset_or_compacted = -get_compacted(io) nothing end @@ -472,31 +727,42 @@ julia> String(take!(io)) ``` """ function take!(io::GenericIOBuffer) - ismarked(io) && unmark(io) + io.mark = -1 if io.seekable - nbytes = io.size - io.offset - data = copyto!(StringVector(nbytes), 1, io.data, io.offset + 1, nbytes) + # If the buffer is seekable, then the previously consumed bytes from 1:ptr-1 + # must still be output, as they are not truly gone. + # Hence, we output all bytes from 1:io.size + offset = get_offset(io) + nbytes = io.size - offset + data = copyto!(StringVector(nbytes), 1, io.data, offset + 1, nbytes) else + # Else, if not seekable, bytes from 1:ptr-1 are truly gone and should not + # be output. Hence, we output `bytesavailable`, which is ptr:size + nbytes = bytesavailable(io) data = read!(io, StringVector(nbytes)) end if io.writable + io.reinit = true io.ptr = 1 io.size = 0 - io.offset = 0 + io.offset_or_compacted = 0 end return data end + +# This method is specialized because we know the underlying data is a Memory, so we can +# e.g. wrap directly in an array without copying. Otherwise the logic is the same as +# the generic method function take!(io::IOBuffer) - ismarked(io) && unmark(io) + io.mark = -1 if io.seekable nbytes = filesize(io) if nbytes == 0 || io.reinit data = StringVector(0) elseif io.writable - data = wrap(Array, memoryref(io.data, io.offset + 1), nbytes) + data = wrap(Array, memoryref(io.data, get_offset(io) + 1), nbytes) else - data = copyto!(StringVector(nbytes), 1, io.data, io.offset + 1, nbytes) + data = copyto!(StringVector(nbytes), 1, io.data, get_offset(io) + 1, nbytes) end else nbytes = bytesavailable(io) @@ -512,7 +778,7 @@ function take!(io::IOBuffer) io.reinit = true io.ptr = 1 io.size = 0 - io.offset = 0 + io.offset_or_compacted = 0 end return data end @@ -529,46 +795,79 @@ state. This should only be used internally for performance-critical It might save an allocation compared to `take!` (if the compiler elides the Array allocation), as well as omits some checks.
""" -_unsafe_take!(io::IOBuffer) = - wrap(Array, io.size == io.offset ? - memoryref(Memory{UInt8}()) : - memoryref(io.data, io.offset + 1), - io.size - io.offset) +function _unsafe_take!(io::IOBuffer) + offset = get_offset(io) + mem = if io.size == offset + memoryref(Memory{UInt8}()) + else + memoryref(io.data, offset + 1) + end + wrap(Array, mem, io.size - offset) +end function write(to::IO, from::GenericIOBuffer) - written::Int = bytesavailable(from) + # This would cause an infinite loop, as it should read until the end, but more + # data is being written into it continuously. if to === from - from.ptr = from.size + 1 + throw(ArgumentError("Writing all content fron an IOBuffer into itself in invalid")) else - written = GC.@preserve from unsafe_write(to, pointer(from.data, from.ptr), UInt(written)) - from.ptr += written + available = bytesavailable(from) + written = GC.@preserve from unsafe_write(to, pointer(from.data, from.ptr), UInt(available)) + from.ptr = from.size + 1 end return written end function unsafe_write(to::GenericIOBuffer, p::Ptr{UInt8}, nb::UInt) ensureroom(to, nb) - ptr = (to.append ? to.size+1 : to.ptr) - written = Int(min(nb, Int(length(to.data))::Int - ptr + 1)) - towrite = written - d = to.data - while towrite > 0 - @inbounds d[ptr] = unsafe_load(p) - ptr += 1 + size = to.size + append = to.append + ptr = append ? size+1 : to.ptr + data = to.data + to_write = min(nb, (min(Int(length(data))::Int, to.maxsize + get_offset(to)) - ptr + 1) % UInt) % Int + # Dispatch based on the type of data, to possibly allow using memcpy + _unsafe_write(data, p, ptr, to_write % UInt) + # Update to.size only if the ptr has advanced to higher than + # the previous size. Otherwise, we just overwrote existing data + to.size = max(size, ptr + to_write - 1) + # If to.append, we only update size, not ptr. + if !append + to.ptr = ptr + to_write + end + return to_write +end + +@inline function _unsafe_write(data::AbstractVector{UInt8}, p::Ptr{UInt8}, from::Int, nb::UInt) + for i in 0:nb-1 + data[from + i] = unsafe_load(p) p += 1 - towrite -= 1 end - to.size = max(to.size, ptr - 1) - if !to.append - to.ptr += written +end + +@inline function _unsafe_write(data::MutableDenseArrayType{UInt8}, p::Ptr{UInt8}, from::Int, nb::UInt) + # Calling `unsafe_copyto!` is very efficient for large arrays, but has some overhead + # for small (< 5 bytes) arrays. + # Since a common use case of IOBuffer is to construct strings incrementally, often + # one char at a time, it's crucial to be fast in the case of small arrays. + # This optimization only gives a minor 10% speed boost in the best case. + if nb < 5 + @inbounds for i in UInt(1):nb + data[from + (i % Int) - 1] = unsafe_load(p, i) + end + else + GC.@preserve data begin + ptr = Ptr{UInt8}(pointer(data, from))::Ptr{UInt8} + @inline unsafe_copyto!(ptr, p, nb) + end end - return written end @inline function write(to::GenericIOBuffer, a::UInt8) ensureroom(to, UInt(1)) ptr = (to.append ? to.size+1 : to.ptr) - if ptr > to.maxsize + # We have just ensured there is room for 1 byte, EXCEPT if we were to exceed + # maxsize. So, we just need to check that here. 
+ if ptr > to.maxsize + get_offset(to) return 0 else to.data[ptr] = a @@ -581,31 +880,26 @@ end end readbytes!(io::GenericIOBuffer, b::MutableDenseArrayType{UInt8}, nb=length(b)) = readbytes!(io, b, Int(nb)) + function readbytes!(io::GenericIOBuffer, b::MutableDenseArrayType{UInt8}, nb::Int) - nr = min(nb, bytesavailable(io)) - if length(b) < nr - resize!(b, nr) + io.readable || _throw_not_readable() + to_read = min(nb, bytesavailable(io)) + if length(b) < to_read + resize!(b, to_read) end - read_sub(io, b, 1, nr) - return nr + checkbounds(b, 1:to_read) + GC.@preserve b unsafe_read(io, pointer(b), to_read) + to_read end read(io::GenericIOBuffer) = read!(io, StringVector(bytesavailable(io))) + +# For IO buffers, all the data is immediately available. readavailable(io::GenericIOBuffer) = read(io) -read(io::GenericIOBuffer, nb::Integer) = read!(io, StringVector(min(nb, bytesavailable(io)))) -function occursin(delim::UInt8, buf::IOBuffer) - p = pointer(buf.data, buf.ptr) - q = GC.@preserve buf ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p, delim, bytesavailable(buf)) - return q != C_NULL -end +read(io::GenericIOBuffer, nb::Integer) = read!(io, StringVector(min(nb, bytesavailable(io)))) function occursin(delim::UInt8, buf::GenericIOBuffer) - data = buf.data - for i = buf.ptr:buf.size - @inbounds b = data[i] - b == delim && return true - end - return false + return in(delim, view(buf.data, buf.ptr:buf.size)) end function copyuntil(out::IO, io::GenericIOBuffer, delim::UInt8; keep::Bool=false) @@ -622,21 +916,45 @@ function copyuntil(out::IO, io::GenericIOBuffer, delim::UInt8; keep::Bool=false) end function copyline(out::GenericIOBuffer, s::IO; keep::Bool=false) - copyuntil(out, s, 0x0a, keep=true) - line = out.data - i = out.size # XXX: this is only correct for appended data. if the data was inserted, only ptr should change - if keep || i == out.offset || line[i] != 0x0a + # If the data is copied into the middle of the buffer of `out` instead of appended to the end, + # and !keep, and the line copied ends with \r\n, then the copyuntil (even if keep=false) + # will overwrite one too many bytes with the new \r byte. + # Work around this by making a new temporary buffer. + # Could perhaps be done better + if !out.append && out.ptr < out.size + 1 + newbuf = IOBuffer() + copyuntil(newbuf, s, 0x0a, keep=true) + v = take!(newbuf) + # Remove \r\n or \n if present + if !keep + if length(v) > 1 && last(v) == UInt8('\n') + pop!(v) + end + if length(v) > 1 && last(v) == UInt8('\r') + pop!(v) + end + end + write(out, v) return out - elseif i < 2 || line[i-1] != 0x0d - i -= 1 else - i -= 2 - end - out.size = i - if !out.append - out.ptr = i+1 + # Else, we can just copy the data directly into the buffer, and then + # subtract the last one or two bytes depending on `keep`. + copyuntil(out, s, 0x0a, keep=true) + line = out.data + i = out.size + if keep || i == out.offset_or_compacted || line[i] != 0x0a + return out + elseif i < 2 || line[i-1] != 0x0d + i -= 1 + else + i -= 2 + end + out.size = i + if !out.append + out.ptr = i+1 + end + return out end - return out end function _copyline(out::IO, io::GenericIOBuffer; keep::Bool=false) @@ -644,6 +962,7 @@ function _copyline(out::IO, io::GenericIOBuffer; keep::Bool=false) # note: findfirst + copyto! is much faster than a single loop # except for nout โ‰ฒ 20. A single loop is 2x faster for nout=5. 
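
The two `_unsafe_write` methods above pick between a scalar loop (fewer than 5 bytes) and `unsafe_copyto!`. A hypothetical standalone sketch of that dispatch, not the Base definition itself:

```julia
# Copy nb bytes from src into dst, starting at index `from`.
# Tiny writes take a scalar loop; anything larger defers to memcpy-like copying.
function copy_bytes!(dst::Vector{UInt8}, from::Int, src::Ptr{UInt8}, nb::Int)
    if nb < 5
        @inbounds for i in 1:nb
            dst[from + i - 1] = unsafe_load(src, i)
        end
    else
        GC.@preserve dst unsafe_copyto!(pointer(dst, from), src, nb)
    end
    return dst
end
```

The cutoff matches the comment above: building a string one character at a time issues many 1-4 byte writes, where the plain loop beats the fixed overhead of a `memcpy` call.
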
nout = nread = something(findfirst(==(0x0a), data), length(data))::Int + # Remove the 0x0a (newline) if not keep, and also remove the 0x0d (\r) if it is there if !keep && nout > 0 && data[nout] == 0x0a nout -= 1 nout > 0 && data[nout] == 0x0d && (nout -= 1) @@ -652,6 +971,7 @@ function _copyline(out::IO, io::GenericIOBuffer; keep::Bool=false) io.ptr += nread return out end + copyline(out::IO, io::GenericIOBuffer; keep::Bool=false) = _copyline(out, io; keep) copyline(out::GenericIOBuffer, io::GenericIOBuffer; keep::Bool=false) = _copyline(out, io; keep) diff --git a/base/precompilation.jl b/base/precompilation.jl index 5392e119d25a2..3ab9fcad5aee6 100644 --- a/base/precompilation.jl +++ b/base/precompilation.jl @@ -835,8 +835,8 @@ function _precompilepkgs(pkgs::Vector{String}, # window between print cycles termwidth = displaysize(io)[2] - 4 if !final_loop - str = sprint(io -> show_progress(io, bar; termwidth, carriagereturn=false); context=io) - print(iostr, Base._truncate_at_width_or_chars(true, str, termwidth), "\n") + s = sprint(io -> show_progress(io, bar; termwidth, carriagereturn=false); context=io) + print(iostr, Base._truncate_at_width_or_chars(true, s, termwidth), "\n") end for pkg_config in pkg_queue_show dep, config = pkg_config diff --git a/base/shell.jl b/base/shell.jl index e07fff128acfe..68925cbd5d5af 100644 --- a/base/shell.jl +++ b/base/shell.jl @@ -344,7 +344,7 @@ function shell_escape_csh(io::IO, args::AbstractString...) end shell_escape_csh(args::AbstractString...) = sprint(shell_escape_csh, args...; - sizehint = sum(sizeof.(args)) + length(args) * 3) + sizehint = sum(sizeof, args) + length(args) * 3) """ shell_escape_wincmd(s::AbstractString) @@ -494,4 +494,4 @@ function escape_microsoft_c_args(io::IO, args::AbstractString...) end escape_microsoft_c_args(args::AbstractString...) = sprint(escape_microsoft_c_args, args...; - sizehint = (sum(sizeof.(args)) + 3*length(args))) + sizehint = (sum(sizeof, args) + 3*length(args))) diff --git a/base/stream.jl b/base/stream.jl index 33d884018d5ad..5732a62c2153b 100644 --- a/base/stream.jl +++ b/base/stream.jl @@ -615,9 +615,9 @@ end ## BUFFER ## ## Allocate space in buffer (for immediate use) function alloc_request(buffer::IOBuffer, recommended_size::UInt) - ensureroom(buffer, Int(recommended_size)) + ensureroom(buffer, recommended_size) ptr = buffer.append ? 
buffer.size + 1 : buffer.ptr - nb = min(length(buffer.data)-buffer.offset, buffer.maxsize) + buffer.offset - ptr + 1 + nb = min(length(buffer.data), buffer.maxsize + get_offset(buffer)) - ptr + 1 return (Ptr{Cvoid}(pointer(buffer.data, ptr)), nb) end @@ -942,8 +942,7 @@ function readbytes!(s::LibuvStream, a::Vector{UInt8}, nb::Int) nread = readbytes!(sbuf, a, nb) else initsize = length(a) - newbuf = PipeBuffer(a, maxsize=nb) - newbuf.size = newbuf.offset # reset the write pointer to the beginning + newbuf = _truncated_pipebuffer(a; maxsize=nb) nread = try s.buffer = newbuf write(newbuf, sbuf) @@ -990,8 +989,7 @@ function unsafe_read(s::LibuvStream, p::Ptr{UInt8}, nb::UInt) if bytesavailable(sbuf) >= nb unsafe_read(sbuf, p, nb) else - newbuf = PipeBuffer(unsafe_wrap(Array, p, nb), maxsize=Int(nb)) - newbuf.size = newbuf.offset # reset the write pointer to the beginning + newbuf = _truncated_pipebuffer(unsafe_wrap(Array, p, nb); maxsize=Int(nb)) try s.buffer = newbuf write(newbuf, sbuf) @@ -1599,8 +1597,7 @@ function readbytes!(s::BufferStream, a::Vector{UInt8}, nb::Int) nread = readbytes!(sbuf, a, nb) else initsize = length(a) - newbuf = PipeBuffer(a, maxsize=nb) - newbuf.size = newbuf.offset # reset the write pointer to the beginning + newbuf = _truncated_pipebuffer(a; maxsize=nb) nread = try s.buffer = newbuf write(newbuf, sbuf) diff --git a/base/strings/annotated.jl b/base/strings/annotated.jl index 0dcac0bf2de3b..1fbbdc1dc44e9 100644 --- a/base/strings/annotated.jl +++ b/base/strings/annotated.jl @@ -460,201 +460,109 @@ function annotated_chartransform(f::Function, str::AnnotatedString, state=nothin AnnotatedString(String(take!(outstr)), annots) end -## AnnotatedIOBuffer - -struct AnnotatedIOBuffer <: AbstractPipe - io::IOBuffer - annotations::Vector{RegionAnnotation} -end - -AnnotatedIOBuffer(io::IOBuffer) = AnnotatedIOBuffer(io, Vector{RegionAnnotation}()) -AnnotatedIOBuffer() = AnnotatedIOBuffer(IOBuffer()) - -function show(io::IO, aio::AnnotatedIOBuffer) - show(io, AnnotatedIOBuffer) - size = filesize(aio.io) - print(io, '(', size, " byte", ifelse(size == 1, "", "s"), ", ", - length(aio.annotations), " annotation", ifelse(length(aio.annotations) == 1, "", "s"), ")") +struct RegionIterator{S <: AbstractString} + str::S + regions::Vector{UnitRange{Int}} + annotations::Vector{Vector{Annotation}} end -pipe_reader(io::AnnotatedIOBuffer) = io.io -pipe_writer(io::AnnotatedIOBuffer) = io.io - -# Useful `IOBuffer` methods that we don't get from `AbstractPipe` -position(io::AnnotatedIOBuffer) = position(io.io) -seek(io::AnnotatedIOBuffer, n::Integer) = (seek(io.io, n); io) -seekend(io::AnnotatedIOBuffer) = (seekend(io.io); io) -skip(io::AnnotatedIOBuffer, n::Integer) = (skip(io.io, n); io) -copy(io::AnnotatedIOBuffer) = AnnotatedIOBuffer(copy(io.io), copy(io.annotations)) - -annotations(io::AnnotatedIOBuffer) = io.annotations - -annotate!(io::AnnotatedIOBuffer, range::UnitRange{Int}, label::Symbol, @nospecialize(val::Any)) = - (_annotate!(io.annotations, range, label, val); io) - -function write(io::AnnotatedIOBuffer, astr::Union{AnnotatedString, SubString{<:AnnotatedString}}) - astr = AnnotatedString(astr) - offset = position(io.io) - eof(io) || _clear_annotations_in_region!(io.annotations, offset+1:offset+ncodeunits(astr)) - _insert_annotations!(io, astr.annotations) - write(io.io, String(astr)) -end +Base.length(si::RegionIterator) = length(si.regions) -write(io::AnnotatedIOBuffer, c::AnnotatedChar) = - write(io, AnnotatedString(string(c), [(region=1:ncodeunits(c), a...) 
for a in c.annotations]))
-write(io::AnnotatedIOBuffer, x::AbstractString) = write(io.io, x)
-write(io::AnnotatedIOBuffer, s::Union{SubString{String}, String}) = write(io.io, s)
-write(io::AnnotatedIOBuffer, b::UInt8) = write(io.io, b)
-
-function write(dest::AnnotatedIOBuffer, src::AnnotatedIOBuffer)
-    destpos = position(dest)
-    isappending = eof(dest)
-    srcpos = position(src)
-    nb = write(dest.io, src.io)
-    isappending || _clear_annotations_in_region!(dest.annotations, destpos:destpos+nb)
-    srcannots = [setindex(annot, max(1 + srcpos, first(annot.region)):last(annot.region), :region)
-                 for annot in src.annotations if first(annot.region) >= srcpos]
-    _insert_annotations!(dest, srcannots, destpos - srcpos)
-    nb
+Base.@propagate_inbounds function Base.iterate(si::RegionIterator, i::Integer=1)
+    if i <= length(si.regions)
+        @inbounds ((SubString(si.str, si.regions[i]), si.annotations[i]), i+1)
+    end
 end
 
-# So that read/writes with `IOContext` (and any similar `AbstractPipe` wrappers)
-# work as expected.
-function write(io::AbstractPipe, s::Union{AnnotatedString, SubString{<:AnnotatedString}})
-    if pipe_writer(io) isa AnnotatedIOBuffer
-        write(pipe_writer(io), s)
-    else
-        invoke(write, Tuple{IO, typeof(s)}, io, s)
-    end::Int
-end
-# Can't be part of the `Union` above because it introduces method ambiguities
-function write(io::AbstractPipe, c::AnnotatedChar)
-    if pipe_writer(io) isa AnnotatedIOBuffer
-        write(pipe_writer(io), c)
-    else
-        invoke(write, Tuple{IO, typeof(c)}, io, c)
-    end::Int
-end
+Base.eltype(::RegionIterator{S}) where { S <: AbstractString} =
+    Tuple{SubString{S}, Vector{Annotation}}
 
 """
-    _clear_annotations_in_region!(annotations::Vector{$RegionAnnotation}, span::UnitRange{Int})
+    eachregion(s::AnnotatedString{S})
+    eachregion(s::SubString{AnnotatedString{S}})
 
-Erase the presence of `annotations` within a certain `span`.
+Identify the contiguous substrings of `s` with constant annotations, and return
+an iterator which provides each substring and the applicable annotations as a
+`Tuple{SubString{S}, Vector{$Annotation}}`.
 
-This operates by removing all elements of `annotations` that are entirely
-contained in `span`, truncating ranges that partially overlap, and splitting
-annotations that subsume `span` to just exist either side of `span`.
+# Examples
+
+```jldoctest; setup=:(using Base: AnnotatedString, eachregion)
+julia> collect(eachregion(AnnotatedString(
+           "hey there", [(1:3, :face, :bold),
+                          (5:9, :face, :italic)])))
+3-element Vector{Tuple{SubString{String}, Vector{$Annotation}}}:
+ ("hey", [$Annotation((:face, :bold))])
+ (" ", [])
+ ("there", [$Annotation((:face, :italic))])
+```
 """
-function _clear_annotations_in_region!(annotations::Vector{RegionAnnotation}, span::UnitRange{Int})
-    # Clear out any overlapping pre-existing annotations.
-    filter!(ann -> first(ann.region) < first(span) || last(ann.region) > last(span), annotations)
-    extras = Tuple{Int, RegionAnnotation}[]
-    for i in eachindex(annotations)
-        annot = annotations[i]
-        region = annot.region
-        # Test for partial overlap
-        if first(region) <= first(span) <= last(region) || first(region) <= last(span) <= last(region)
-            annotations[i] =
-                setindex(annot,
-                         if first(region) < first(span)
-                             first(region):first(span)-1
-                         else
-                             last(span)+1:last(region)
-                         end,
-                         :region)
-            # If `span` fits exactly within `region`, then we've only copied over
-            # the beginning overhang, but also need to conserve the end overhang.
-            if first(region) < first(span) && last(span) < last(region)
-                push!(extras, (i, setindex(annot, last(span)+1:last(region), :region)))
-            end
+function eachregion(s::AnnotatedString, subregion::UnitRange{Int}=firstindex(s):lastindex(s))
+    (isempty(s) || isempty(subregion)) &&
+        return RegionIterator(s.string, UnitRange{Int}[], Vector{Annotation}[])
+    events = annotation_events(s, subregion)
+    isempty(events) && return RegionIterator(s.string, [subregion], [Annotation[]])
+    annotvals = Annotation[
+        (; label, value) for (; label, value) in annotations(s)]
+    regions = Vector{UnitRange{Int}}()
+    annots = Vector{Vector{Annotation}}()
+    pos = first(events).pos
+    if pos > first(subregion)
+        push!(regions, thisind(s, first(subregion)):prevind(s, pos))
+        push!(annots, [])
+    end
+    activelist = Int[]
+    for event in events
+        if event.pos != pos
+            push!(regions, pos:prevind(s, event.pos))
+            push!(annots, annotvals[activelist])
+            pos = event.pos
+        end
+        if event.active
+            insert!(activelist, searchsortedfirst(activelist, event.index), event.index)
+        else
+            deleteat!(activelist, searchsortedfirst(activelist, event.index))
         end
     end
-    # Insert any extra entries in the appropriate position
-    for (offset, (i, entry)) in enumerate(extras)
-        insert!(annotations, i + offset, entry)
+    if last(events).pos < nextind(s, last(subregion))
+        push!(regions, last(events).pos:thisind(s, last(subregion)))
+        push!(annots, [])
     end
-    annotations
+    RegionIterator(s.string, regions, annots)
 end
 
-"""
-    _insert_annotations!(io::AnnotatedIOBuffer, annotations::Vector{$RegionAnnotation}, offset::Int = position(io))
+function eachregion(s::SubString{<:AnnotatedString}, pos::UnitRange{Int}=firstindex(s):lastindex(s))
+    if isempty(s)
+        RegionIterator(s.string, Vector{UnitRange{Int}}(), Vector{Vector{Annotation}}())
+    else
+        eachregion(s.string, first(pos)+s.offset:last(pos)+s.offset)
+    end
+end
 
-Register new `annotations` in `io`, applying an `offset` to their regions.
+"""
+    annotation_events(string::AbstractString, annots::Vector{$RegionAnnotation}, subregion::UnitRange{Int})
+    annotation_events(string::AnnotatedString, subregion::UnitRange{Int})
 
-The largely consists of simply shifting the regions of `annotations` by `offset`
-and pushing them onto `io`'s annotations. However, when it is possible to merge
-the new annotations with recent annotations in accordance with the semantics
-outlined in [`AnnotatedString`](@ref), we do so. More specifically, when there
-is a run of the most recent annotations that are also present as the first
-`annotations`, with the same value and adjacent regions, the new annotations are
-merged into the existing recent annotations by simply extending their range.
+Find all annotation "change events" that occur within a `subregion` of `annots`,
+with respect to `string`. When `string` is an `AnnotatedString`, `annots` is inferred.
 
-This is implemented so that one can say write an `AnnotatedString` to an
-`AnnotatedIOBuffer` one character at a time without needlessly producing a
-new annotation for each character.
+Each change event is given in the form of a `@NamedTuple{pos::Int, active::Bool,
+index::Int}` where `pos` is the position of the event, `active` is a boolean
+indicating whether the annotation is being activated or deactivated, and `index`
+is the index of the annotation in question.
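
To make the event stream concrete, here is what the description above implies for the docstring's example string (the tuples are hand-derived from the rules above; `annotation_events` itself is internal):

```julia
using Base: AnnotatedString

s = AnnotatedString("hey there", [(1:3, :face, :bold), (5:9, :face, :italic)])
# Expected events for the full region 1:9, sorted by position:
#   (pos = 1,  active = true,  index = 1)   # :bold opens at 'h'
#   (pos = 4,  active = false, index = 1)   # :bold closes after "hey"
#   (pos = 5,  active = true,  index = 2)   # :italic opens at 't'
#   (pos = 10, active = false, index = 2)   # :italic closes past the last byte
```
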
""" -function _insert_annotations!(io::AnnotatedIOBuffer, annotations::Vector{RegionAnnotation}, offset::Int = position(io)) - run = 0 - if !isempty(io.annotations) && last(last(io.annotations).region) == offset - for i in reverse(axes(annotations, 1)) - annot = annotations[i] - first(annot.region) == 1 || continue - i <= length(io.annotations) || continue - if annot.label == last(io.annotations).label && annot.value == last(io.annotations).value - valid_run = true - for runlen in 1:i - new = annotations[begin+runlen-1] - old = io.annotations[end-i+runlen] - if last(old.region) != offset || first(new.region) != 1 || old.label != new.label || old.value != new.value - valid_run = false - break - end - end - if valid_run - run = i - break - end - end +function annotation_events(s::AbstractString, annots::Vector{RegionAnnotation}, subregion::UnitRange{Int}) + events = Vector{NamedTuple{(:pos, :active, :index), Tuple{Int, Bool, Int}}}() # Position, Active?, Annotation index + for (i, (; region)) in enumerate(annots) + if !isempty(intersect(subregion, region)) + start, stop = max(first(subregion), first(region)), min(last(subregion), last(region)) + start <= stop || continue # Currently can't handle empty regions + push!(events, (pos=thisind(s, start), active=true, index=i)) + push!(events, (pos=nextind(s, stop), active=false, index=i)) end end - for runindex in 0:run-1 - old_index = lastindex(io.annotations) - run + 1 + runindex - old = io.annotations[old_index] - new = annotations[begin+runindex] - io.annotations[old_index] = setindex(old, first(old.region):last(new.region)+offset, :region) - end - for index in run+1:lastindex(annotations) - annot = annotations[index] - start, stop = first(annot.region), last(annot.region) - push!(io.annotations, setindex(annotations[index], start+offset:stop+offset, :region)) - end + sort(events, by=e -> e.pos) end -function read(io::AnnotatedIOBuffer, ::Type{AnnotatedString{T}}) where {T <: AbstractString} - if (start = position(io)) == 0 - AnnotatedString(read(io.io, T), copy(io.annotations)) - else - annots = [setindex(annot, UnitRange{Int}(max(1, first(annot.region) - start), last(annot.region)-start), :region) - for annot in io.annotations if last(annot.region) > start] - AnnotatedString(read(io.io, T), annots) - end -end -read(io::AnnotatedIOBuffer, ::Type{AnnotatedString{AbstractString}}) = read(io, AnnotatedString{String}) -read(io::AnnotatedIOBuffer, ::Type{AnnotatedString}) = read(io, AnnotatedString{String}) - -function read(io::AnnotatedIOBuffer, ::Type{AnnotatedChar{T}}) where {T <: AbstractChar} - pos = position(io) - char = read(io.io, T) - annots = [NamedTuple{(:label, :value)}(annot) for annot in io.annotations if pos+1 in annot.region] - AnnotatedChar(char, annots) -end -read(io::AnnotatedIOBuffer, ::Type{AnnotatedChar{AbstractChar}}) = read(io, AnnotatedChar{Char}) -read(io::AnnotatedIOBuffer, ::Type{AnnotatedChar}) = read(io, AnnotatedChar{Char}) - -function truncate(io::AnnotatedIOBuffer, size::Integer) - truncate(io.io, size) - filter!(ann -> first(ann.region) <= size, io.annotations) - map!(ann -> setindex(ann, first(ann.region):min(size, last(ann.region)), :region), - io.annotations, io.annotations) - io -end +annotation_events(s::AnnotatedString, subregion::UnitRange{Int}) = + annotation_events(s.string, annotations(s), subregion) diff --git a/base/strings/annotated_io.jl b/base/strings/annotated_io.jl new file mode 100644 index 0000000000000..87db57b8030c9 --- /dev/null +++ b/base/strings/annotated_io.jl @@ -0,0 +1,201 @@ +# This 
file is a part of Julia. License is MIT: https://julialang.org/license + +## AnnotatedIOBuffer + +struct AnnotatedIOBuffer <: AbstractPipe + io::IOBuffer + annotations::Vector{RegionAnnotation} +end + +AnnotatedIOBuffer(io::IOBuffer) = AnnotatedIOBuffer(io, Vector{RegionAnnotation}()) +AnnotatedIOBuffer() = AnnotatedIOBuffer(IOBuffer()) + +function show(io::IO, aio::AnnotatedIOBuffer) + show(io, AnnotatedIOBuffer) + size = filesize(aio.io) + print(io, '(', size, " byte", ifelse(size == 1, "", "s"), ", ", + length(aio.annotations), " annotation", ifelse(length(aio.annotations) == 1, "", "s"), ")") +end + +pipe_reader(io::AnnotatedIOBuffer) = io.io +pipe_writer(io::AnnotatedIOBuffer) = io.io + +# Useful `IOBuffer` methods that we don't get from `AbstractPipe` +position(io::AnnotatedIOBuffer) = position(io.io) +seek(io::AnnotatedIOBuffer, n::Integer) = (seek(io.io, n); io) +seekend(io::AnnotatedIOBuffer) = (seekend(io.io); io) +skip(io::AnnotatedIOBuffer, n::Integer) = (skip(io.io, n); io) +copy(io::AnnotatedIOBuffer) = AnnotatedIOBuffer(copy(io.io), copy(io.annotations)) + +annotations(io::AnnotatedIOBuffer) = io.annotations + +annotate!(io::AnnotatedIOBuffer, range::UnitRange{Int}, label::Symbol, @nospecialize(val::Any)) = + (_annotate!(io.annotations, range, label, val); io) + +function write(io::AnnotatedIOBuffer, astr::Union{AnnotatedString, SubString{<:AnnotatedString}}) + astr = AnnotatedString(astr) + offset = position(io.io) + eof(io) || _clear_annotations_in_region!(io.annotations, offset+1:offset+ncodeunits(astr)) + _insert_annotations!(io, astr.annotations) + write(io.io, String(astr)) +end + +write(io::AnnotatedIOBuffer, c::AnnotatedChar) = + write(io, AnnotatedString(string(c), [(region=1:ncodeunits(c), a...) for a in c.annotations])) +write(io::AnnotatedIOBuffer, x::AbstractString) = write(io.io, x) +write(io::AnnotatedIOBuffer, s::Union{SubString{String}, String}) = write(io.io, s) +write(io::AnnotatedIOBuffer, b::UInt8) = write(io.io, b) + +function write(dest::AnnotatedIOBuffer, src::AnnotatedIOBuffer) + destpos = position(dest) + isappending = eof(dest) + srcpos = position(src) + nb = write(dest.io, src.io) + isappending || _clear_annotations_in_region!(dest.annotations, destpos:destpos+nb) + srcannots = [setindex(annot, max(1 + srcpos, first(annot.region)):last(annot.region), :region) + for annot in src.annotations if first(annot.region) >= srcpos] + _insert_annotations!(dest, srcannots, destpos - srcpos) + nb +end + +# So that read/writes with `IOContext` (and any similar `AbstractPipe` wrappers) +# work as expected. 
+function write(io::AbstractPipe, s::Union{AnnotatedString, SubString{<:AnnotatedString}}) + if pipe_writer(io) isa AnnotatedIOBuffer + write(pipe_writer(io), s) + else + invoke(write, Tuple{IO, typeof(s)}, io, s) + end::Int +end + +# Can't be part of the `Union` above because it introduces method ambiguities +function write(io::AbstractPipe, c::AnnotatedChar) + if pipe_writer(io) isa AnnotatedIOBuffer + write(pipe_writer(io), c) + else + invoke(write, Tuple{IO, typeof(c)}, io, c) + end::Int +end + +function read(io::AnnotatedIOBuffer, ::Type{AnnotatedString{T}}) where {T <: AbstractString} + if (start = position(io)) == 0 + AnnotatedString(read(io.io, T), copy(io.annotations)) + else + annots = [setindex(annot, UnitRange{Int}(max(1, first(annot.region) - start), last(annot.region)-start), :region) + for annot in io.annotations if last(annot.region) > start] + AnnotatedString(read(io.io, T), annots) + end +end +read(io::AnnotatedIOBuffer, ::Type{AnnotatedString{AbstractString}}) = read(io, AnnotatedString{String}) +read(io::AnnotatedIOBuffer, ::Type{AnnotatedString}) = read(io, AnnotatedString{String}) + +function read(io::AnnotatedIOBuffer, ::Type{AnnotatedChar{T}}) where {T <: AbstractChar} + pos = position(io) + char = read(io.io, T) + annots = [NamedTuple{(:label, :value)}(annot) for annot in io.annotations if pos+1 in annot.region] + AnnotatedChar(char, annots) +end +read(io::AnnotatedIOBuffer, ::Type{AnnotatedChar{AbstractChar}}) = read(io, AnnotatedChar{Char}) +read(io::AnnotatedIOBuffer, ::Type{AnnotatedChar}) = read(io, AnnotatedChar{Char}) + +function truncate(io::AnnotatedIOBuffer, size::Integer) + truncate(io.io, size) + filter!(ann -> first(ann.region) <= size, io.annotations) + map!(ann -> setindex(ann, first(ann.region):min(size, last(ann.region)), :region), + io.annotations, io.annotations) + io +end + +""" + _clear_annotations_in_region!(annotations::Vector{$RegionAnnotation}, span::UnitRange{Int}) + +Erase the presence of `annotations` within a certain `span`. + +This operates by removing all elements of `annotations` that are entirely +contained in `span`, truncating ranges that partially overlap, and splitting +annotations that subsume `span` to just exist either side of `span`. +""" +function _clear_annotations_in_region!(annotations::Vector{RegionAnnotation}, span::UnitRange{Int}) + # Clear out any overlapping pre-existing annotations. + filter!(ann -> first(ann.region) < first(span) || last(ann.region) > last(span), annotations) + extras = Tuple{Int, RegionAnnotation}[] + for i in eachindex(annotations) + annot = annotations[i] + region = annot.region + # Test for partial overlap + if first(region) <= first(span) <= last(region) || first(region) <= last(span) <= last(region) + annotations[i] = + setindex(annot, + if first(region) < first(span) + first(region):first(span)-1 + else + last(span)+1:last(region) + end, + :region) + # If `span` fits exactly within `region`, then we've only copied over + # the beginning overhang, but also need to conserve the end overhang. 
+            if first(region) < first(span) && last(span) < last(region)
+                push!(extras, (i, setindex(annot, last(span)+1:last(region), :region)))
+            end
+        end
+    end
+    # Insert any extra entries in the appropriate position
+    for (offset, (i, entry)) in enumerate(extras)
+        insert!(annotations, i + offset, entry)
+    end
+    annotations
+end
+
+"""
+    _insert_annotations!(io::AnnotatedIOBuffer, annotations::Vector{$RegionAnnotation}, offset::Int = position(io))
+
+Register new `annotations` in `io`, applying an `offset` to their regions.
+
+This largely consists of simply shifting the regions of `annotations` by `offset`
+and pushing them onto `io`'s annotations. However, when it is possible to merge
+the new annotations with recent annotations in accordance with the semantics
+outlined in [`AnnotatedString`](@ref), we do so. More specifically, when there
+is a run of the most recent annotations that are also present as the first
+`annotations`, with the same value and adjacent regions, the new annotations are
+merged into the existing recent annotations by simply extending their range.
+
+This is implemented so that one can, say, write an `AnnotatedString` to an
+`AnnotatedIOBuffer` one character at a time without needlessly producing a
+new annotation for each character.
+"""
+function _insert_annotations!(io::AnnotatedIOBuffer, annotations::Vector{RegionAnnotation}, offset::Int = position(io))
+    run = 0
+    if !isempty(io.annotations) && last(last(io.annotations).region) == offset
+        for i in reverse(axes(annotations, 1))
+            annot = annotations[i]
+            first(annot.region) == 1 || continue
+            i <= length(io.annotations) || continue
+            if annot.label == last(io.annotations).label && annot.value == last(io.annotations).value
+                valid_run = true
+                for runlen in 1:i
+                    new = annotations[begin+runlen-1]
+                    old = io.annotations[end-i+runlen]
+                    if last(old.region) != offset || first(new.region) != 1 || old.label != new.label || old.value != new.value
+                        valid_run = false
+                        break
+                    end
+                end
+                if valid_run
+                    run = i
+                    break
+                end
+            end
+        end
+    end
+    for runindex in 0:run-1
+        old_index = lastindex(io.annotations) - run + 1 + runindex
+        old = io.annotations[old_index]
+        new = annotations[begin+runindex]
+        io.annotations[old_index] = setindex(old, first(old.region):last(new.region)+offset, :region)
+    end
+    for index in run+1:lastindex(annotations)
+        annot = annotations[index]
+        start, stop = first(annot.region), last(annot.region)
+        push!(io.annotations, setindex(annotations[index], start+offset:stop+offset, :region))
+    end
+end
diff --git a/base/strings/strings.jl b/base/strings/strings.jl
index 8dae311f475b4..32975b6ea3fc7 100644
--- a/base/strings/strings.jl
+++ b/base/strings/strings.jl
@@ -11,3 +11,4 @@ import .Iterators: PartitionIterator
 
 include("strings/util.jl")
 include("strings/io.jl")
+include("strings/annotated_io.jl")
diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index 688aeca1b242c..b1d925d89c7ce 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -675,6 +675,20 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm
     fargs[0] = (jl_value_t*)codeinfos;
     void *data = jl_emit_native(codeinfos, llvmmod, &cgparams, external_linkage);
 
+    // examine everything just emitted and save it to the caches
+    if (!external_linkage) {
+        for (size_t i = 0, l = jl_array_nrows(codeinfos); i < l; i++) {
+            jl_value_t *item = jl_array_ptr_ref(codeinfos, i);
+            if (jl_is_code_instance(item)) {
+                // now add it to our compilation results
+                jl_code_instance_t *codeinst = (jl_code_instance_t*)item;
jl_code_info_t *src = (jl_code_info_t*)jl_array_ptr_ref(codeinfos, ++i); + assert(jl_is_code_info(src)); + jl_add_codeinst_to_cache(codeinst, src); + } + } + } + // move everything inside, now that we've merged everything // (before adding the exported headers) ((jl_native_code_desc_t*)data)->M.withModuleDo([&](Module &M) { diff --git a/src/gc-stock.c b/src/gc-stock.c index 3b49b82caf530..66a1724ecf4ce 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -948,6 +948,7 @@ static void gc_sweep_page(gc_page_profiler_serializer_t *s, jl_gc_pool_t *p, jl_ done: if (re_use_page) { + gc_update_page_fragmentation_data(pg); push_lf_back(allocd, pg); } else { @@ -956,7 +957,6 @@ static void gc_sweep_page(gc_page_profiler_serializer_t *s, jl_gc_pool_t *p, jl_ push_lf_back(&global_page_pool_lazily_freed, pg); } gc_page_profile_write_to_file(s); - gc_update_page_fragmentation_data(pg); gc_time_count_page(freedall, pg_skpd); jl_ptls_t ptls = jl_current_task->ptls; // Note that we aggregate the `pool_live_bytes` over all threads before returning this diff --git a/src/gf.c b/src/gf.c index cc3966da5f393..860e5e5aa1247 100644 --- a/src/gf.c +++ b/src/gf.c @@ -532,7 +532,10 @@ JL_DLLEXPORT jl_value_t *jl_call_in_typeinf_world(jl_value_t **args, int nargs) jl_task_t *ct = jl_current_task; size_t last_age = ct->world_age; ct->world_age = jl_typeinf_world; + int last_pure = ct->ptls->in_pure_callback; + ct->ptls->in_pure_callback = 0; jl_value_t *ret = jl_apply(args, nargs); + ct->ptls->in_pure_callback = last_pure; ct->world_age = last_age; return ret; } @@ -582,8 +585,8 @@ JL_DLLEXPORT int jl_mi_cache_has_ci(jl_method_instance_t *mi, return 0; } -// look for something with an egal ABI and properties that is already in the JIT (compiled=true) or simply in the cache (compiled=false) -JL_DLLEXPORT jl_code_instance_t *jl_get_ci_equiv(jl_code_instance_t *ci JL_PROPAGATES_ROOT, int compiled) JL_NOTSAFEPOINT +// look for something with an egal ABI and properties that is already in the JIT for a whole edge (target_world=0) or can be added to the JIT with new source just for target_world. +JL_DLLEXPORT jl_code_instance_t *jl_get_ci_equiv(jl_code_instance_t *ci JL_PROPAGATES_ROOT, size_t target_world) JL_NOTSAFEPOINT { jl_value_t *def = ci->def; jl_method_instance_t *mi = jl_get_ci_mi(ci); @@ -595,9 +598,9 @@ JL_DLLEXPORT jl_code_instance_t *jl_get_ci_equiv(jl_code_instance_t *ci JL_PROPA while (codeinst) { if (codeinst != ci && jl_atomic_load_relaxed(&codeinst->inferred) != NULL && - (!compiled || jl_atomic_load_relaxed(&codeinst->invoke) != NULL) && - jl_atomic_load_relaxed(&codeinst->min_world) <= min_world && - jl_atomic_load_relaxed(&codeinst->max_world) >= max_world && + (target_world ? 1 : jl_atomic_load_relaxed(&codeinst->invoke) != NULL) && + jl_atomic_load_relaxed(&codeinst->min_world) <= (target_world ? target_world : min_world) && + jl_atomic_load_relaxed(&codeinst->max_world) >= (target_world ? 
target_world : max_world) && jl_egal(codeinst->def, def) && jl_egal(codeinst->owner, owner) && jl_egal(codeinst->rettype, rettype)) { @@ -605,7 +608,7 @@ JL_DLLEXPORT jl_code_instance_t *jl_get_ci_equiv(jl_code_instance_t *ci JL_PROPA } codeinst = jl_atomic_load_relaxed(&codeinst->next); } - return (jl_code_instance_t*)jl_nothing; + return ci; } @@ -2792,10 +2795,9 @@ void jl_read_codeinst_invoke(jl_code_instance_t *ci, uint8_t *specsigflags, jl_c jl_method_instance_t *jl_normalize_to_compilable_mi(jl_method_instance_t *mi JL_PROPAGATES_ROOT); -JL_DLLEXPORT void jl_add_codeinst_to_jit(jl_code_instance_t *codeinst, jl_code_info_t *src) +JL_DLLEXPORT void jl_add_codeinst_to_cache(jl_code_instance_t *codeinst, jl_code_info_t *src) { assert(jl_is_code_info(src)); - jl_emit_codeinst_to_jit(codeinst, src); jl_method_instance_t *mi = jl_get_ci_mi(codeinst); if (jl_generating_output() && jl_is_method(mi->def.method) && jl_atomic_load_relaxed(&codeinst->inferred) == jl_nothing) { jl_value_t *compressed = jl_compress_ir(mi->def.method, src); @@ -2811,6 +2813,14 @@ JL_DLLEXPORT void jl_add_codeinst_to_jit(jl_code_instance_t *codeinst, jl_code_i } } + +JL_DLLEXPORT void jl_add_codeinst_to_jit(jl_code_instance_t *codeinst, jl_code_info_t *src) +{ + assert(jl_is_code_info(src)); + jl_emit_codeinst_to_jit(codeinst, src); + jl_add_codeinst_to_cache(codeinst, src); +} + jl_code_instance_t *jl_compile_method_internal(jl_method_instance_t *mi, size_t world) { // quick check if we already have a compiled result diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp index 695953b602653..b8781d2bfe898 100644 --- a/src/jitlayers.cpp +++ b/src/jitlayers.cpp @@ -383,8 +383,8 @@ static int jl_analyze_workqueue(jl_code_instance_t *callee, jl_codegen_params_t } if (preal_decl.empty()) { // there may be an equivalent method already compiled (or at least registered with the JIT to compile), in which case we should be using that instead - jl_code_instance_t *compiled_ci = jl_get_ci_equiv(codeinst, 1); - if ((jl_value_t*)compiled_ci != jl_nothing) { + jl_code_instance_t *compiled_ci = jl_get_ci_equiv(codeinst, 0); + if (compiled_ci != codeinst) { codeinst = compiled_ci; uint8_t specsigflags; void *fptr; diff --git a/src/julia-syntax.scm b/src/julia-syntax.scm index 739fa45e088ca..b5d90a24ea13f 100644 --- a/src/julia-syntax.scm +++ b/src/julia-syntax.scm @@ -4105,6 +4105,7 @@ f(x) = yt(x) (capt-var-access v fname opaq) v))) cvs))) + (set-car! 
(cdddr (lam:vinfo lam2)) '()) ;; must capture static_parameters as values inside opaque_closure
         `(new_opaque_closure
           ,(cadr e) ,(or (caddr e) '(call (core apply_type) (core Union))) ,(or (cadddr e) '(core Any)) ,allow-partial
           (opaque_closure_method (null) ,nargs ,isva ,functionloc ,(convert-lambda lam2 (car (lam:args lam2)) #f '() (symbol-to-idx-map cvs) parsed-method-stack))
diff --git a/src/julia_internal.h b/src/julia_internal.h
index 479ccbf961e71..bdcc816cbdd1b 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -685,6 +685,7 @@ JL_DLLEXPORT jl_method_instance_t *jl_get_unspecialized(jl_method_t *def JL_PROP
 JL_DLLEXPORT void jl_read_codeinst_invoke(jl_code_instance_t *ci, uint8_t *specsigflags, jl_callptr_t *invoke, void **specptr, int waitcompile);
 JL_DLLEXPORT jl_method_instance_t *jl_method_match_to_mi(jl_method_match_t *match, size_t world, size_t min_valid, size_t max_valid, int mt_cache);
 JL_DLLEXPORT void jl_add_codeinst_to_jit(jl_code_instance_t *codeinst, jl_code_info_t *src);
+JL_DLLEXPORT void jl_add_codeinst_to_cache(jl_code_instance_t *codeinst, jl_code_info_t *src);
 JL_DLLEXPORT jl_code_instance_t *jl_new_codeinst_uninit(jl_method_instance_t *mi, jl_value_t *owner);
 JL_DLLEXPORT jl_code_instance_t *jl_new_codeinst(
@@ -694,7 +695,7 @@ JL_DLLEXPORT jl_code_instance_t *jl_new_codeinst(
     int32_t const_flags, size_t min_world, size_t max_world,
     uint32_t effects, jl_value_t *analysis_results,
     jl_debuginfo_t *di, jl_svec_t *edges /* , int absolute_max*/);
-JL_DLLEXPORT jl_code_instance_t *jl_get_ci_equiv(jl_code_instance_t *ci JL_PROPAGATES_ROOT, int compiled) JL_NOTSAFEPOINT;
+JL_DLLEXPORT jl_code_instance_t *jl_get_ci_equiv(jl_code_instance_t *ci JL_PROPAGATES_ROOT, size_t target_world) JL_NOTSAFEPOINT;
 
 STATIC_INLINE jl_method_instance_t *jl_get_ci_mi(jl_code_instance_t *ci JL_PROPAGATES_ROOT) JL_NOTSAFEPOINT
 {
diff --git a/src/llvm-alloc-helpers.cpp b/src/llvm-alloc-helpers.cpp
index 194c6837860ca..a1ed66a190190 100644
--- a/src/llvm-alloc-helpers.cpp
+++ b/src/llvm-alloc-helpers.cpp
@@ -214,6 +214,7 @@ void jl_alloc::runEscapeAnalysis(llvm::CallInst *I, EscapeAnalysisRequiredArgs r
         }
         if (auto call = dyn_cast<CallInst>(inst)) {
             // TODO handle `memcmp`
+            // TODO handle `memcpy` which is used a lot more often since opaque pointers
             // None of the intrinsics should care if the memory is stack or heap allocated.
            auto callee = call->getCalledOperand();
            if (auto II = dyn_cast<IntrinsicInst>(call)) {
diff --git a/src/llvm-alloc-opt.cpp b/src/llvm-alloc-opt.cpp
index 7dd794a4d8847..ce1d22f42d0ae 100644
--- a/src/llvm-alloc-opt.cpp
+++ b/src/llvm-alloc-opt.cpp
@@ -742,7 +742,9 @@ void Optimizer::moveToStack(CallInst *orig_inst, size_t sz, bool has_ref, AllocF
     auto replace_inst = [&] (Instruction *user) {
         Instruction *orig_i = cur.orig_i;
         Instruction *new_i = cur.new_i;
-        if (isa<LoadInst>(user) || isa<StoreInst>(user)) {
+        if (isa<LoadInst>(user) || isa<StoreInst>(user) ||
+            isa<AtomicCmpXchgInst>(user) || isa<AtomicRMWInst>(user)) {
+            // TODO: these atomics are likely removable if the user is the first argument
             user->replaceUsesOfWith(orig_i, new_i);
         }
         else if (auto call = dyn_cast<CallInst>(user)) {
@@ -1111,6 +1113,7 @@ void Optimizer::splitOnStack(CallInst *orig_inst)
             return;
         }
         else if (isa<LoadInst>(user) || isa<StoreInst>(user)) {
+            // TODO: Downgrade atomics here potentially
             auto slot_idx = find_slot(offset);
             auto &slot = slots[slot_idx];
             assert(slot.offset <= offset && slot.offset + slot.size >= offset);
diff --git a/src/precompile_utils.c b/src/precompile_utils.c
index 281dbe0163586..c602a15c1fb74 100644
--- a/src/precompile_utils.c
+++ b/src/precompile_utils.c
@@ -170,6 +170,10 @@ static void jl_compile_all_defs(jl_array_t *mis, int all)
     size_t i, l = jl_array_nrows(allmeths);
     for (i = 0; i < l; i++) {
         jl_method_t *m = (jl_method_t*)jl_array_ptr_ref(allmeths, i);
+        int is_macro_method = jl_symbol_name(m->name)[0] == '@';
+        if (is_macro_method && !all)
+            continue; // Avoid inference / pre-compilation for macros
+
         if (jl_is_datatype(m->sig) && jl_isa_compileable_sig((jl_tupletype_t*)m->sig, jl_emptysvec, m)) {
             // method has a single compilable specialization, e.g. its definition
             // signature is concrete. in this case we can just hint it.
diff --git a/src/staticdata.c b/src/staticdata.c
index 62f3feeaa2159..b51013e7e0563 100644
--- a/src/staticdata.c
+++ b/src/staticdata.c
@@ -92,6 +92,22 @@ static const size_t WORLD_AGE_REVALIDATION_SENTINEL = 0x1;
 JL_DLLEXPORT size_t jl_require_world = ~(size_t)0;
 JL_DLLEXPORT _Atomic(size_t) jl_first_image_replacement_world = ~(size_t)0;
 
+// This structure is used to store hash tables for the memoization
+// of queries in staticdata.c (currently only `type_in_worklist`).
+typedef struct {
+    htable_t type_in_worklist;
+} jl_query_cache;
+
+static void init_query_cache(jl_query_cache *cache)
+{
+    htable_new(&cache->type_in_worklist, 0);
+}
+
+static void destroy_query_cache(jl_query_cache *cache)
+{
+    htable_free(&cache->type_in_worklist);
+}
+
 #include "staticdata_utils.c"
 #include "precompile_utils.c"
@@ -555,6 +571,7 @@ typedef struct {
     jl_array_t *method_roots_list;
     htable_t method_roots_index;
     uint64_t worklist_key;
+    jl_query_cache *query_cache;
     jl_ptls_t ptls;
     jl_image_t *image;
     int8_t incremental;
@@ -702,14 +719,13 @@ static int jl_needs_serialization(jl_serializer_state *s, jl_value_t *v) JL_NOTS
     return 1;
 }
 
-
-static int caching_tag(jl_value_t *v) JL_NOTSAFEPOINT
+static int caching_tag(jl_value_t *v, jl_query_cache *query_cache) JL_NOTSAFEPOINT
 {
     if (jl_is_method_instance(v)) {
         jl_method_instance_t *mi = (jl_method_instance_t*)v;
         jl_value_t *m = mi->def.value;
         if (jl_is_method(m) && jl_object_in_image(m))
-            return 1 + type_in_worklist(mi->specTypes);
+            return 1 + type_in_worklist(mi->specTypes, query_cache);
     }
     if (jl_is_binding(v)) {
         jl_globalref_t *gr = ((jl_binding_t*)v)->globalref;
@@ -724,24 +740,24 @@ static int caching_tag(jl_value_t *v) JL_NOTSAFEPOINT
         if (jl_is_tuple_type(dt) ?
!dt->isconcretetype : dt->hasfreetypevars) return 0; // aka !is_cacheable from jltypes.c if (jl_object_in_image((jl_value_t*)dt->name)) - return 1 + type_in_worklist(v); + return 1 + type_in_worklist(v, query_cache); } jl_value_t *dtv = jl_typeof(v); if (jl_is_datatype_singleton((jl_datatype_t*)dtv)) { - return 1 - type_in_worklist(dtv); // these are already recached in the datatype in the image + return 1 - type_in_worklist(dtv, query_cache); // these are already recached in the datatype in the image } return 0; } -static int needs_recaching(jl_value_t *v) JL_NOTSAFEPOINT +static int needs_recaching(jl_value_t *v, jl_query_cache *query_cache) JL_NOTSAFEPOINT { - return caching_tag(v) == 2; + return caching_tag(v, query_cache) == 2; } -static int needs_uniquing(jl_value_t *v) JL_NOTSAFEPOINT +static int needs_uniquing(jl_value_t *v, jl_query_cache *query_cache) JL_NOTSAFEPOINT { assert(!jl_object_in_image(v)); - return caching_tag(v) == 1; + return caching_tag(v, query_cache) == 1; } static void record_field_change(jl_value_t **addr, jl_value_t *newval) JL_NOTSAFEPOINT @@ -825,7 +841,6 @@ static void jl_queue_module_for_serialization(jl_serializer_state *s, jl_module_ // ... or point to Base functions accessed by the runtime (m == jl_base_module && (!strcmp(jl_symbol_name(b->globalref->name), "wait") || !strcmp(jl_symbol_name(b->globalref->name), "task_done_hook"))))) { - record_field_change((jl_value_t**)&b->backedges, NULL); jl_queue_for_serialization(s, b); } } @@ -836,8 +851,14 @@ static void jl_queue_module_for_serialization(jl_serializer_state *s, jl_module_ jl_queue_for_serialization(s, module_usings_getmod(m, i)); } - jl_queue_for_serialization(s, m->usings_backedges); - jl_queue_for_serialization(s, m->scanned_methods); + if (jl_options.trim || jl_options.strip_ir) { + record_field_change((jl_value_t**)&m->usings_backedges, jl_nothing); + record_field_change((jl_value_t**)&m->scanned_methods, jl_nothing); + } + else { + jl_queue_for_serialization(s, m->usings_backedges); + jl_queue_for_serialization(s, m->scanned_methods); + } } // Anything that requires uniquing or fixing during deserialization needs to be "toplevel" @@ -861,7 +882,7 @@ static void jl_insert_into_serialization_queue(jl_serializer_state *s, jl_value_ jl_datatype_t *dt = (jl_datatype_t*)v; // ensure all type parameters are recached jl_queue_for_serialization_(s, (jl_value_t*)dt->parameters, 1, 1); - if (jl_is_datatype_singleton(dt) && needs_uniquing(dt->instance)) { + if (jl_is_datatype_singleton(dt) && needs_uniquing(dt->instance, s->query_cache)) { assert(jl_needs_serialization(s, dt->instance)); // should be true, since we visited dt // do not visit dt->instance for our template object as it leads to unwanted cycles here // (it may get serialized from elsewhere though) @@ -872,7 +893,7 @@ static void jl_insert_into_serialization_queue(jl_serializer_state *s, jl_value_ if (s->incremental && jl_is_method_instance(v)) { jl_method_instance_t *mi = (jl_method_instance_t*)v; jl_value_t *def = mi->def.value; - if (needs_uniquing(v)) { + if (needs_uniquing(v, s->query_cache)) { // we only need 3 specific fields of this (the rest are not used) jl_queue_for_serialization(s, mi->def.value); jl_queue_for_serialization(s, mi->specTypes); @@ -887,7 +908,7 @@ static void jl_insert_into_serialization_queue(jl_serializer_state *s, jl_value_ record_field_change((jl_value_t**)&mi->cache, NULL); } else { - assert(!needs_recaching(v)); + assert(!needs_recaching(v, s->query_cache)); } // n.b. 
opaque closures cannot be inspected and relied upon like a // normal method since they can get improperly introduced by generated @@ -897,7 +918,7 @@ static void jl_insert_into_serialization_queue(jl_serializer_state *s, jl_value_ // error now. } if (s->incremental && jl_is_binding(v)) { - if (needs_uniquing(v)) { + if (needs_uniquing(v, s->query_cache)) { jl_binding_t *b = (jl_binding_t*)v; jl_queue_for_serialization(s, b->globalref->mod); jl_queue_for_serialization(s, b->globalref->name); @@ -1065,6 +1086,9 @@ static void jl_insert_into_serialization_queue(jl_serializer_state *s, jl_value_ record_field_change((jl_value_t **)&tn->mt, NULL); } } + else if (jl_is_binding(v)) { + record_field_change((jl_value_t**)&((jl_binding_t*)v)->backedges, NULL); + } } char *data = (char*)jl_data_ptr(v); size_t i, np = layout->npointers; @@ -1121,9 +1145,9 @@ static void jl_queue_for_serialization_(jl_serializer_state *s, jl_value_t *v, i // Items that require postorder traversal must visit their children prior to insertion into // the worklist/serialization_order (and also before their first use) if (s->incremental && !immediate) { - if (jl_is_datatype(t) && needs_uniquing(v)) + if (jl_is_datatype(t) && needs_uniquing(v, s->query_cache)) immediate = 1; - if (jl_is_datatype_singleton((jl_datatype_t*)t) && needs_uniquing(v)) + if (jl_is_datatype_singleton((jl_datatype_t*)t) && needs_uniquing(v, s->query_cache)) immediate = 1; } @@ -1286,7 +1310,7 @@ static uintptr_t _backref_id(jl_serializer_state *s, jl_value_t *v, jl_array_t * static void record_uniquing(jl_serializer_state *s, jl_value_t *fld, uintptr_t offset) JL_NOTSAFEPOINT { - if (s->incremental && jl_needs_serialization(s, fld) && needs_uniquing(fld)) { + if (s->incremental && jl_needs_serialization(s, fld) && needs_uniquing(fld, s->query_cache)) { if (jl_is_datatype(fld) || jl_is_datatype_singleton((jl_datatype_t*)jl_typeof(fld))) arraylist_push(&s->uniquing_types, (void*)(uintptr_t)offset); else if (jl_is_method_instance(fld) || jl_is_binding(fld)) @@ -1349,10 +1373,10 @@ static void jl_write_module(jl_serializer_state *s, uintptr_t item, jl_module_t newm->line = 0; newm->usings_backedges = NULL; arraylist_push(&s->relocs_list, (void*)(reloc_offset + offsetof(jl_module_t, usings_backedges))); - arraylist_push(&s->relocs_list, (void*)backref_id(s, m->usings_backedges, s->link_ids_relocs)); + arraylist_push(&s->relocs_list, (void*)backref_id(s, get_replaceable_field(&m->usings_backedges, 1), s->link_ids_relocs)); newm->scanned_methods = NULL; arraylist_push(&s->relocs_list, (void*)(reloc_offset + offsetof(jl_module_t, scanned_methods))); - arraylist_push(&s->relocs_list, (void*)backref_id(s, m->scanned_methods, s->link_ids_relocs)); + arraylist_push(&s->relocs_list, (void*)backref_id(s, get_replaceable_field(&m->scanned_methods, 1), s->link_ids_relocs)); // After reload, everything that has happened in this process happened semantically at // (for .incremental) or before jl_require_world, so reset this flag. 
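
The `query_cache` threaded through the serializer state above memoizes `type_in_worklist` (the recursive predicate in `staticdata_utils.c`, changed further down). For intuition only, a hypothetical Julia rendering of the same memoization shape, with `pred` standing in for the recursive walk:

```julia
# Identity-keyed memoization, mirroring the ptrhash table in jl_query_cache:
# objects are looked up by pointer identity, so the predicate runs at most
# once per object no matter how often the serializer re-queries it.
const in_worklist_cache = IdDict{Any,Bool}()

function memoized_in_worklist(v, pred)
    get!(in_worklist_cache, v) do
        pred(v)  # the expensive recursive walk in the real code
    end
end
```
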
@@ -1510,7 +1534,7 @@ static void jl_write_values(jl_serializer_state *s) JL_GC_DISABLED // write header if (object_id_expected) write_uint(f, jl_object_id(v)); - if (s->incremental && jl_needs_serialization(s, (jl_value_t*)t) && needs_uniquing((jl_value_t*)t)) + if (s->incremental && jl_needs_serialization(s, (jl_value_t*)t) && needs_uniquing((jl_value_t*)t, s->query_cache)) arraylist_push(&s->uniquing_types, (void*)(uintptr_t)(ios_pos(f)|1)); if (f == s->const_data) write_uint(s->const_data, ((uintptr_t)t->smalltag << 4) | GC_OLD_MARKED | GC_IN_IMAGE); @@ -1521,7 +1545,7 @@ static void jl_write_values(jl_serializer_state *s) JL_GC_DISABLED layout_table.items[item] = (void*)(reloc_offset | (f == s->const_data)); // store the inverse mapping of `serialization_order` (`id` => object-as-streampos) if (s->incremental) { - if (needs_uniquing(v)) { + if (needs_uniquing(v, s->query_cache)) { if (jl_typetagis(v, jl_binding_type)) { jl_binding_t *b = (jl_binding_t*)v; if (b->globalref == NULL) @@ -1550,7 +1574,7 @@ static void jl_write_values(jl_serializer_state *s) JL_GC_DISABLED assert(jl_is_datatype_singleton(t) && "unreachable"); } } - else if (needs_recaching(v)) { + else if (needs_recaching(v, s->query_cache)) { arraylist_push(jl_is_datatype(v) ? &s->fixup_types : &s->fixup_objs, (void*)reloc_offset); } } @@ -1985,7 +2009,7 @@ static void jl_write_values(jl_serializer_state *s) JL_GC_DISABLED } } void *superidx = ptrhash_get(&serialization_order, dt->super); - if (s->incremental && superidx != HT_NOTFOUND && from_seroder_entry(superidx) > item && needs_uniquing((jl_value_t*)dt->super)) + if (s->incremental && superidx != HT_NOTFOUND && from_seroder_entry(superidx) > item && needs_uniquing((jl_value_t*)dt->super, s->query_cache)) arraylist_push(&s->uniquing_super, dt->super); } else if (jl_is_typename(v)) { @@ -2919,13 +2943,14 @@ JL_DLLEXPORT jl_value_t *jl_as_global_root(jl_value_t *val, int insert) static void jl_prepare_serialization_data(jl_array_t *mod_array, jl_array_t *newly_inferred, /* outputs */ jl_array_t **extext_methods JL_REQUIRE_ROOTED_SLOT, jl_array_t **new_ext_cis JL_REQUIRE_ROOTED_SLOT, - jl_array_t **edges JL_REQUIRE_ROOTED_SLOT) + jl_array_t **edges JL_REQUIRE_ROOTED_SLOT, + jl_query_cache *query_cache) { // extext_methods: [method1, ...], worklist-owned "extending external" methods added to functions owned by modules outside the worklist // edges: [caller1, ext_targets, ...] 
for worklist-owned methods calling external methods // Save the inferred code from newly inferred, external methods - *new_ext_cis = queue_external_cis(newly_inferred); + *new_ext_cis = queue_external_cis(newly_inferred, query_cache); // Collect method extensions and edges data *extext_methods = jl_alloc_vec_any(0); @@ -2955,7 +2980,8 @@ static void jl_prepare_serialization_data(jl_array_t *mod_array, jl_array_t *new // In addition to the system image (where `worklist = NULL`), this can also save incremental images with external linkage static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, jl_array_t *worklist, jl_array_t *extext_methods, - jl_array_t *new_ext_cis, jl_array_t *edges) + jl_array_t *new_ext_cis, jl_array_t *edges, + jl_query_cache *query_cache) { htable_new(&field_replace, 0); htable_new(&bits_replace, 0); @@ -3062,6 +3088,7 @@ static void jl_save_system_image_to_stream(ios_t *f, jl_array_t *mod_array, ios_mem(&gvar_record, 0); ios_mem(&fptr_record, 0); jl_serializer_state s = {0}; + s.query_cache = query_cache; s.incremental = !(worklist == NULL); s.s = &sysimg; s.const_data = &const_data; @@ -3422,11 +3449,14 @@ JL_DLLEXPORT void jl_create_system_image(void **_native_data, jl_array_t *workli int64_t datastartpos = 0; JL_GC_PUSH4(&mod_array, &extext_methods, &new_ext_cis, &edges); + jl_query_cache query_cache; + init_query_cache(&query_cache); + if (worklist) { mod_array = jl_get_loaded_modules(); // __toplevel__ modules loaded in this session (from Base.loaded_modules_array) // Generate _native_data` if (_native_data != NULL) { - jl_prepare_serialization_data(mod_array, newly_inferred, &extext_methods, &new_ext_cis, NULL); + jl_prepare_serialization_data(mod_array, newly_inferred, &extext_methods, &new_ext_cis, NULL, &query_cache); jl_precompile_toplevel_module = (jl_module_t*)jl_array_ptr_ref(worklist, jl_array_len(worklist)-1); *_native_data = jl_precompile_worklist(worklist, extext_methods, new_ext_cis); jl_precompile_toplevel_module = NULL; @@ -3457,7 +3487,7 @@ JL_DLLEXPORT void jl_create_system_image(void **_native_data, jl_array_t *workli assert((ct->reentrant_timing & 0b1110) == 0); ct->reentrant_timing |= 0b1000; if (worklist) { - jl_prepare_serialization_data(mod_array, newly_inferred, &extext_methods, &new_ext_cis, &edges); + jl_prepare_serialization_data(mod_array, newly_inferred, &extext_methods, &new_ext_cis, &edges, &query_cache); if (!emit_split) { write_int32(f, 0); // No clone_targets write_padding(f, LLT_ALIGN(ios_pos(f), JL_CACHE_BYTE_ALIGNMENT) - ios_pos(f)); @@ -3469,7 +3499,7 @@ JL_DLLEXPORT void jl_create_system_image(void **_native_data, jl_array_t *workli } if (_native_data != NULL) native_functions = *_native_data; - jl_save_system_image_to_stream(ff, mod_array, worklist, extext_methods, new_ext_cis, edges); + jl_save_system_image_to_stream(ff, mod_array, worklist, extext_methods, new_ext_cis, edges, &query_cache); if (_native_data != NULL) native_functions = NULL; // make sure we don't run any Julia code concurrently before this point @@ -3498,6 +3528,8 @@ JL_DLLEXPORT void jl_create_system_image(void **_native_data, jl_array_t *workli } } + destroy_query_cache(&query_cache); + JL_GC_POP(); *s = f; if (emit_split) @@ -3609,7 +3641,7 @@ static int jl_validate_binding_partition(jl_binding_t *b, jl_binding_partition_t jl_sym_t *name = b->globalref->name; JL_LOCK(&mod->lock); jl_atomic_store_release(&mod->export_set_changed_since_require_world, 1); - if (mod->usings_backedges) { + if (mod->usings_backedges != jl_nothing) { for 
(size_t i = 0; i < jl_array_len(mod->usings_backedges); i++) { jl_module_t *edge = (jl_module_t*)jl_array_ptr_ref(mod->usings_backedges, i); jl_binding_t *importee = jl_get_module_binding(edge, name, 0); diff --git a/src/staticdata_utils.c b/src/staticdata_utils.c index e9f464b64470e..9bfd4c355efe6 100644 --- a/src/staticdata_utils.c +++ b/src/staticdata_utils.c @@ -131,63 +131,81 @@ JL_DLLEXPORT void jl_push_newly_inferred(jl_value_t* ci) JL_UNLOCK(&newly_inferred_mutex); } - // compute whether a type references something internal to worklist // and thus could not have existed before deserialize // and thus does not need delayed unique-ing -static int type_in_worklist(jl_value_t *v) JL_NOTSAFEPOINT +static int type_in_worklist(jl_value_t *v, jl_query_cache *cache) JL_NOTSAFEPOINT { if (jl_object_in_image(v)) return 0; // fast-path for rejection + + void *cached = HT_NOTFOUND; + if (cache != NULL) + cached = ptrhash_get(&cache->type_in_worklist, v); + + // fast-path for memoized results + if (cached != HT_NOTFOUND) + return cached == v; + + int result = 0; if (jl_is_uniontype(v)) { jl_uniontype_t *u = (jl_uniontype_t*)v; - return type_in_worklist(u->a) || - type_in_worklist(u->b); + result = type_in_worklist(u->a, cache) || + type_in_worklist(u->b, cache); } else if (jl_is_unionall(v)) { jl_unionall_t *ua = (jl_unionall_t*)v; - return type_in_worklist((jl_value_t*)ua->var) || - type_in_worklist(ua->body); + result = type_in_worklist((jl_value_t*)ua->var, cache) || + type_in_worklist(ua->body, cache); } else if (jl_is_typevar(v)) { jl_tvar_t *tv = (jl_tvar_t*)v; - return type_in_worklist(tv->lb) || - type_in_worklist(tv->ub); + result = type_in_worklist(tv->lb, cache) || + type_in_worklist(tv->ub, cache); } else if (jl_is_vararg(v)) { jl_vararg_t *tv = (jl_vararg_t*)v; - if (tv->T && type_in_worklist(tv->T)) - return 1; - if (tv->N && type_in_worklist(tv->N)) - return 1; + result = ((tv->T && type_in_worklist(tv->T, cache)) || + (tv->N && type_in_worklist(tv->N, cache))); } else if (jl_is_datatype(v)) { jl_datatype_t *dt = (jl_datatype_t*)v; - if (!jl_object_in_image((jl_value_t*)dt->name)) - return 1; - jl_svec_t *tt = dt->parameters; - size_t i, l = jl_svec_len(tt); - for (i = 0; i < l; i++) - if (type_in_worklist(jl_tparam(dt, i))) - return 1; + if (!jl_object_in_image((jl_value_t*)dt->name)) { + result = 1; + } + else { + jl_svec_t *tt = dt->parameters; + size_t i, l = jl_svec_len(tt); + for (i = 0; i < l; i++) { + if (type_in_worklist(jl_tparam(dt, i), cache)) { + result = 1; + break; + } + } + } } else { - return type_in_worklist(jl_typeof(v)); + return type_in_worklist(jl_typeof(v), cache); } - return 0; + + // Memoize result + if (cache != NULL) + ptrhash_put(&cache->type_in_worklist, (void*)v, result ? (void*)v : NULL); + + return result; } // When we infer external method instances, ensure they link back to the // package. Otherwise they might be, e.g., for external macros. 
// Implements Tarjan's SCC (strongly connected components) algorithm, simplified to remove the count variable -static int has_backedge_to_worklist(jl_method_instance_t *mi, htable_t *visited, arraylist_t *stack) +static int has_backedge_to_worklist(jl_method_instance_t *mi, htable_t *visited, arraylist_t *stack, jl_query_cache *query_cache) { jl_module_t *mod = mi->def.module; if (jl_is_method(mod)) mod = ((jl_method_t*)mod)->module; assert(jl_is_module(mod)); uint8_t is_precompiled = jl_atomic_load_relaxed(&mi->flags) & JL_MI_FLAGS_MASK_PRECOMPILED; - if (is_precompiled || !jl_object_in_image((jl_value_t*)mod) || type_in_worklist(mi->specTypes)) { + if (is_precompiled || !jl_object_in_image((jl_value_t*)mod) || type_in_worklist(mi->specTypes, query_cache)) { return 1; } if (!mi->backedges) { @@ -211,7 +229,7 @@ static int has_backedge_to_worklist(jl_method_instance_t *mi, htable_t *visited, jl_code_instance_t *be; i = get_next_edge(mi->backedges, i, NULL, &be); JL_GC_PROMISE_ROOTED(be); // get_next_edge propagates the edge for us here - int child_found = has_backedge_to_worklist(jl_get_ci_mi(be), visited, stack); + int child_found = has_backedge_to_worklist(jl_get_ci_mi(be), visited, stack, query_cache); if (child_found == 1 || child_found == 2) { // found what we were looking for, so terminate early found = 1; @@ -243,7 +261,7 @@ static int has_backedge_to_worklist(jl_method_instance_t *mi, htable_t *visited, // from the worklist or explicitly added by a `precompile` statement, and // (4) are the most recently computed result for that method. // These will be preserved in the image. -static jl_array_t *queue_external_cis(jl_array_t *list) +static jl_array_t *queue_external_cis(jl_array_t *list, jl_query_cache *query_cache) { if (list == NULL) return NULL; @@ -262,7 +280,7 @@ static jl_array_t *queue_external_cis(jl_array_t *list) jl_method_instance_t *mi = jl_get_ci_mi(ci); jl_method_t *m = mi->def.method; if (ci->owner == jl_nothing && jl_atomic_load_relaxed(&ci->inferred) && jl_is_method(m) && jl_object_in_image((jl_value_t*)m->module)) { - int found = has_backedge_to_worklist(mi, &visited, &stack); + int found = has_backedge_to_worklist(mi, &visited, &stack, query_cache); assert(found == 0 || found == 1 || found == 2); assert(stack.len == 0); if (found == 1 && jl_atomic_load_relaxed(&ci->max_world) == ~(size_t)0) { diff --git a/stdlib/REPL/docs/src/index.md b/stdlib/REPL/docs/src/index.md index eabd7e729280e..ddd0a0953fcfc 100644 --- a/stdlib/REPL/docs/src/index.md +++ b/stdlib/REPL/docs/src/index.md @@ -343,7 +343,15 @@ mapfoldl mapfoldr When a single complete tab-complete result is available at the end of an input line and 2 or more characters have been typed, a hint of the completion will show in a lighter color. -This can be disabled via `Base.active_repl.options.hint_tab_completes = false`. +This can be disabled via `Base.active_repl.options.hint_tab_completes = false` or by adding +``` +atreplinit() do repl + if VERSION >= v"1.11.0-0" + repl.options.hint_tab_completes = false + end +end +``` +to your `~/.julia/config/startup.jl`. !!! 
compat "Julia 1.11" Tab-complete hinting was added in Julia 1.11 diff --git a/stdlib/Random/src/RNGs.jl b/stdlib/Random/src/RNGs.jl index 7782de88ba537..2ea2fb3a684df 100644 --- a/stdlib/Random/src/RNGs.jl +++ b/stdlib/Random/src/RNGs.jl @@ -147,21 +147,26 @@ function show(io::IO, rng::MersenneTwister) end print(io, MersenneTwister, "(", repr(rng.seed), ", (") # state - adv = Integer[rng.adv_jump, rng.adv] + sep = ", " + show(io, rng.adv_jump) + print(io, sep) + show(io, rng.adv) if rng.adv_vals != -1 || rng.adv_ints != -1 - if rng.adv_vals == -1 - @assert rng.idxF == MT_CACHE_F - push!(adv, 0, 0) # "(0, 0)" is nicer on the eyes than (-1, 1002) - else - push!(adv, rng.adv_vals, rng.idxF) - end + # "(0, 0)" is nicer on the eyes than (-1, 1002) + s = rng.adv_vals != -1 + print(io, sep) + show(io, s ? rng.adv_vals : zero(rng.adv_vals)) + print(io, sep) + show(io, s ? rng.idxF : zero(rng.idxF)) end if rng.adv_ints != -1 idxI = (length(rng.ints)*16 - rng.idxI) / 8 # 8 represents one Int64 idxI = Int(idxI) # idxI should always be an integer when using public APIs - push!(adv, rng.adv_ints, idxI) + print(io, sep) + show(io, rng.adv_ints) + print(io, sep) + show(io, idxI) end - join(io, adv, ", ") print(io, "))") end diff --git a/test/atomics.jl b/test/atomics.jl index 7e9f29c23ca10..2d4a713b1d30d 100644 --- a/test/atomics.jl +++ b/test/atomics.jl @@ -1099,3 +1099,14 @@ test_once_undef(Any) test_once_undef(Union{Nothing,Integer}) test_once_undef(UndefComplex{Any}) test_once_undef(UndefComplex{UndefComplex{Any}}) + +mutable struct Atomic57190 + @atomic x::Int +end + + +function add_one57190!() + @atomic (Atomic57190(0).x) += 1 +end + +@test add_one57190!() == 1 diff --git a/test/iobuffer.jl b/test/iobuffer.jl index a9d58f4b7871e..7ed5c1f5b3ed6 100644 --- a/test/iobuffer.jl +++ b/test/iobuffer.jl @@ -6,6 +6,267 @@ ioslength(io::IOBuffer) = (io.seekable ? io.size : bytesavailable(io)) bufcontents(io::Base.GenericIOBuffer) = unsafe_string(pointer(io.data), io.size) +# Julia Base's internals uses the PipeBuffer, which is an unseekable IOBuffer. +# There are no public constructors to build such a buffer, but we need to test +# it anyway. +# I make a new method here such that if the implementation of Base.PipeBuffer +# changes, these tests will still work. 
+new_unseekable_buffer() = Base.GenericIOBuffer(Memory{UInt8}(), true, true, false, true, typemax(Int), false) + +@testset "Basic tests" begin + @test_throws ArgumentError IOBuffer(;maxsize=-1) + @test_throws ArgumentError IOBuffer([0x01]; maxsize=-1) + + # Test that sizehint actually sizehints the vector + v = UInt8[] + buf = IOBuffer(v; sizehint=64, write=true) + @test length(v.ref.mem) >= 64 + + # Test that you can't make an IOBuffer with a maxsize + # smaller than the size you actually give it + @test_throws ArgumentError IOBuffer([0x01, 0x02]; maxsize=1) + @test_throws ArgumentError IOBuffer(b"abcdefghij"; maxsize=8) +end + +@testset "Basic reading" begin + # readavailable is equivalent to read + buf = IOBuffer("abcdef") + @test read(buf, UInt8) == UInt8('a') + @test bytesavailable(buf) == 5 + @test readavailable(buf) == b"bcdef" + + # Reading less than all the bytes + buf = IOBuffer(b"ABCDEFGHIJ") + @test read(buf, 1) == b"A" + @test read(buf, 3) == b"BCD" + + # Reading more bytes than available will not error + @test read(buf, 100) == b"EFGHIJ" + + # Passing truncate=false will still truncate an IOBuffer with no + # initialized data + @test isempty(read(IOBuffer(;sizehint=34, truncate=false))) +end + +@testset "Byte occursin GenericIOBuffer" begin + buf = IOBuffer(@view(collect(0x1f:0x3d)[1:end])) + @test occursin(0x1f, buf) + @test occursin(0x3d, buf) + @test occursin(0x2a, buf) + + @test !occursin(0xff, buf) + @test !occursin(0x00, buf) + + v = Vector{UInt8}("bcdefg") + pushfirst!(v, UInt8('a')) + buf = IOBuffer(v) + @test occursin(UInt8('a'), buf) + read(buf, UInt8) + @test !occursin(UInt8('a'), buf) + @test !occursin(0x00, buf) + + buf = IOBuffer("abcdefg") + @test occursin(UInt8('a'), buf) +end + +@testset "Non-Memory backed IOBuffer" begin + buf = IOBuffer(Test.GenericArray(collect(0x02:0x0d)), read=true) + @test read(buf) == 0x02:0x0d + + buf = IOBuffer(Test.GenericArray(collect(0x02:0x0d)), read=true) + @test read(buf, UInt8) == 0x02 + @test read(buf) == 0x03:0x0d + + v = view(collect(UInt8('a'):UInt8('z')), 4:10) + buf = IOBuffer(v, read=true, write=true) + @test read(buf, UInt8) == UInt8('d') + @test read(buf) == UInt8('e'):UInt8('j') + seekstart(buf) + @test read(buf, UInt8) == UInt8('d') + write(buf, UInt8('x')) + write(buf, "ABC") + seekstart(buf) + @test read(buf) == b"dxABCij" +end + +@testset "Copying" begin + # Test offset is preserved when copying + v = UInt8[] + pushfirst!(v, UInt8('a'), UInt8('b'), UInt8('c')) + buf = IOBuffer(v; write=true, read=true, append=true) + write(buf, "def") + read(buf, UInt16) + buf2 = copy(buf) + @test String(read(buf)) == "cdef" + @test String(read(buf2)) == "cdef" + + # Test copying with non-Memory backed GenericIOBuffer + buf = IOBuffer(Test.GenericArray(collect(0x02:0x0d)), read=true) + @test read(buf, UInt16) == 0x0302 + buf2 = copy(buf) + @test isreadable(buf2) + @test !iswritable(buf2) + @test read(buf2) == 0x04:0x0d + + # Test copying a non-seekable stream + buf = new_unseekable_buffer() + write(buf, "abcdef") + read(buf, UInt16) + mark(buf) + read(buf, UInt16) + buf2 = copy(buf) + @test read(buf2) == b"ef" + reset(buf2) + @test read(buf2) == b"cdef" + + # Test copying seekable stream + buf = IOBuffer() + write(buf, "abcdef") + seekstart(buf) + read(buf) + mark(buf) + buf2 = copy(buf) + @test reset(buf2) == 6 + seekstart(buf2) + @test read(buf2) == b"abcdef" + + # Test copying a taken buffer + buf = IOBuffer() + write(buf, "abcdef") + take!(buf) + buf2 = copy(buf) + @test eof(buf2) + seekstart(buf2) + @test eof(buf2) +end + 
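The tests above and below lean on the semantics of the unseekable helper, so here is a condensed sketch of them (illustrative only, not part of the patch; it assumes the `new_unseekable_buffer` definition at the top of this test file and a `using Test` in scope):

```julia
using Test

# An unseekable buffer behaves like a PipeBuffer: writes append, reads consume
# from the front, and the seek family of operations throws.
buf = new_unseekable_buffer()
write(buf, "hello")
@test read(buf, UInt8) == UInt8('h')      # reading consumes
@test_throws ArgumentError seek(buf, 0)   # unseekable: seeking is an error
@test String(take!(buf)) == "ello"        # take! returns only the unread bytes
```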
+@testset "copyuntil" begin + a = IOBuffer(b"abcdeajdgabdfg") + b = IOBuffer(collect(b"xx"); write=true, read=true, append=true) + copyuntil(b, a, UInt8('a')) + @test read(b) == b"xx" + seekstart(b) + copyuntil(b, a, UInt8('a'); keep=true) + @test read(b) == b"xxbcdea" + seekstart(b) + copyuntil(b, a, UInt('w')) + @test read(b) == b"xxbcdeajdgabdfg" +end + +@testset "copyline" begin + a = IOBuffer(b"abcde\nabc\r\nabc\n\r\nac") + b = IOBuffer() + copyline(b, a) + @test take!(copy(b)) == b"abcde" + copyline(b, a) + @test take!(copy(b)) == b"abcdeabc" + copyline(b, a; keep=true) + @test take!(copy(b)) == b"abcdeabcabc\n" + copyline(b, a; keep=false) + @test take!(copy(b)) == b"abcdeabcabc\n" + copyline(b, a; keep=false) + @test take!(copy(b)) == b"abcdeabcabc\nac" + + # Test a current bug in copyline + a = Base.SecretBuffer("abcde\r\n") + b = IOBuffer() + write(b, "xxxxxxxxxx") + seek(b, 2) + copyline(b, a; keep=false) + Base.shred!(a) + @test take!(b) == b"xxabcdexxx" +end + +@testset "take!" begin + a = IOBuffer("abc") + @test take!(a) == b"abc" + + v = UInt8[] + pushfirst!(v, 0x0a) + buf = IOBuffer(v; write=true, append=true) + write(buf, "def") + @test take!(buf) == b"\ndef" + + v = view(collect(b"abcdefghij"), 3:9) + buf = IOBuffer(v; write=true, read=true) + read(buf, UInt8) + write(buf, "xxy") + @test take!(buf) == b"cxxyghi" + + v = view(collect(b"abcdefghij"), 3:9) + buf = IOBuffer(v; write=true, read=true) + + # Take on unseekable buffer does not return used bytes. + buf = new_unseekable_buffer() + write(buf, 0x61) + write(buf, "bcd") + @test read(buf, UInt8) == 0x61 + @test take!(buf) == b"bcd" + + # Compaction is reset after take! + buf = Base.GenericIOBuffer(Memory{UInt8}(), true, true, false, true, 100, false) + write(buf, rand(UInt8, 50)) + read(buf, 40) + write(buf, rand(UInt8, 100)) + mark(buf) + read(buf, 70) + @test position(buf) == 110 + @test length(buf.data) <= 100 + v = take!(buf) + write(buf, 0xf1) + @test position(buf) == 0 + @test !ismarked(buf) +end + +@testset "maxsize is preserved" begin + # After take! 
+ buf = IOBuffer(; maxsize=3) + print(buf, "abcdef") + @test take!(buf) == b"abc" + print(buf, "abcdef") + @test take!(buf) == b"abc" + + # After resizing + buf = IOBuffer(;maxsize=128) + write(buf, collect(0x00:0x10)) + write(buf, collect(0x11:0x30)) + write(buf, collect(0x31:0x98)) + write(buf, collect(0x99:0xff)) + seekstart(buf) + @test read(buf) == 0x00:UInt8(127) + + # Edge case: when passing a Vector, IOBuffer does not error if the + # underlying mem is larger than maxsize + v = pushfirst!([0x01], 0x02) + io = IOBuffer(v; maxsize=2) + @test read(io) == b"\x02\x01" + + # Buffer will not write past maxsize, even if given a larger buffer + # And also even if the data is taken and replaced + v = sizehint!(UInt8[], 128) + io = IOBuffer(v; write=true, read=true, maxsize=12) + write(io, 0x01:0x0f) + seekstart(io) + @test read(io) == 0x01:0x0c + @test write(io, 0x01) == 0 + @test write(io, "abc") == 0 + @test take!(io).ref.mem === v.ref.mem + write(io, 0x01:0x0f) + @test take!(io) == 0x01:0x0c +end + +@testset "Write to self" begin + buffer = IOBuffer() + @test_throws ArgumentError write(buffer, buffer) + + # Write to another IOBuffer with limited size + to = IOBuffer(;maxsize=4) + from = IOBuffer(collect(b"abcdefghi")) + write(to, from) + @test String(take!(to)) == "abcd" + @test eof(from) +end + @testset "Read/write empty IOBuffer" begin io = IOBuffer() @test eof(io) @@ -33,7 +294,7 @@ bufcontents(io::Base.GenericIOBuffer) = unsafe_string(pointer(io.data), io.size) @test position(io) == 0 truncate(io, 10) @test position(io) == 0 - @test all(io.data .== 0) + @test all(view(io.data, 1:10) .== 0) @test write(io, Int16[1, 2, 3, 4, 5, 6]) === 12 seek(io, 2) truncate(io, 10) @@ -67,22 +328,89 @@ end @test_throws ArgumentError write(io,UInt8[0]) @test String(take!(io)) == "hamster\nguinea pig\nturtle" @test String(take!(io)) == "hamster\nguinea pig\nturtle" #should be unchanged - @test_throws ArgumentError Base.compact(io) # not writeable close(io) end +@testset "Truncate" begin + # Fails for non-writable and non-seekable + @test_throws ArgumentError truncate(PipeBuffer(), 0) + @test_throws ArgumentError truncate(IOBuffer(b"abcde"), 3) + + # Standard use + buf = IOBuffer(collect(b"abcdef"); write=true, read=true) + truncate(buf, 4) + @test read(buf) == b"abcd" + @test take!(buf) == b"abcd" + + # Mark is removed if beyond the new size + buf = IOBuffer() + write(buf, "abcde") + seek(buf, 4) + mark(buf) + truncate(buf, 4) + @test !ismarked(buf) + + # Making it larger + buf = IOBuffer(collect(b"abcdef"); write=true, read=true) + seek(buf, 3) + truncate(buf, 3) + write(buf, 'X') + mark(buf) + truncate(buf, 5) + @test ismarked(buf) + @test reset(buf) == 4 + @test take!(buf) == b"abcX\0" + + # With offset + v = pushfirst!(UInt8[0x62, 0x63, 0x64], 0x61) + buf = IOBuffer(v; write=true, read=true) + seekstart(buf) + read(buf, UInt8) + mark(buf) + truncate(buf, 7) + @test reset(buf) == 1 + @test take!(buf) == b"abcd\0\0\0" +end + +@testset "Position of compactable buffer" begin + # Set maxsize, because otherwise compaction is too hard to reason about, + # and this test would be brittle + io = Base.GenericIOBuffer(Memory{UInt8}(), true, true, false, true, 100, false) + write(io, "abcd") + read(io, UInt16) + @test position(io) == 2 + write(io, "abcde"^80) + @test position(io) == 2 + read(io, 60) + @test position(io) == 62 + mark(io) + # Trigger compaction + write(io, rand(UInt8, 50)) + @test position(io) == 62 + v1 = read(io, 20) + @test position(io) == 82 + @test reset(io) == 62 + @test position(io) == 62 + v2 = read(io,
20) + @test v1 == v2 +end + @testset "PipeBuffer" begin - io = PipeBuffer() + io = new_unseekable_buffer() @test_throws EOFError read(io,UInt8) @test write(io,"pancakes\nwaffles\nblueberries\n") > 0 + + # PipeBuffer is in append mode, so writing to it does not advance the position @test position(io) == 0 @test readline(io) == "pancakes" - Base.compact(io) @test readline(io) == "waffles" @test write(io,"whipped cream\n") > 0 @test readline(io) == "blueberries" + + # PipeBuffers do not support seeking, and therefore do not support truncation. @test_throws ArgumentError seek(io,0) @test_throws ArgumentError truncate(io,0) + @test readline(io) == "whipped cream" @test write(io,"pancakes\nwaffles\nblueberries\n") > 0 @test readlines(io) == String["pancakes", "waffles", "blueberries"] @@ -116,58 +444,6 @@ end end rm(fname) end - - Base.compact(io) - @test position(io) == 0 - @test ioslength(io) == 0 - Base._resize!(io,0) - Base.ensureroom(io,50) - @test position(io) == 0 - @test ioslength(io) == 0 - @test length(io.data) == 50 - Base.ensureroom(io,10) - @test ioslength(io) == 0 - @test length(io.data) == 50 - io.maxsize = 75 - Base.ensureroom(io,100) - @test ioslength(io) == 0 - @test length(io.data) == 75 - seekend(io) - @test ioslength(io) == 0 - @test position(io) == 0 - write(io,zeros(UInt8,200)) - @test ioslength(io) == 75 - @test length(io.data) == 75 - write(io,1) - @test ioslength(io) == 75 - @test length(io.data) == 75 - write(io,[1,2,3]) - @test ioslength(io) == 75 - @test length(io.data) == 75 - skip(io,1) - @test write(io,UInt8(104)) === 1 - skip(io,3) - @test write(io,b"apples") === 3 - skip(io,71) - @test write(io,'y') === 1 - @test read(io, String) == "happy" - @test eof(io) - write(io,zeros(UInt8,73)) - write(io,'a') - write(io,'b') - write(io,'c') - write(io,'d') - write(io,'e') - @test ioslength(io) == 75 - @test length(io.data) == 75 - @test position(io) == 0 - skip(io,72) - @test String(take!(io)) == "\0ab" - @test String(take!(io)) == "" - - # issues 4021 - print(io, true) - close(io) end @testset "issue 5453" begin @@ -248,9 +524,6 @@ end truncate(io2, io2.size - 2) @test read(io2, String) == "goodnightmoonhelloworld" seek(io2, 0) - write(io2, io2) - @test read(io2, String) == "" - @test bufcontents(io2) == "goodnightmoonhelloworld" end # issue #11917 @@ -347,24 +620,42 @@ end @test n == 5 end -@testset "Base.compact" begin - a = Base.GenericIOBuffer(UInt8[], true, true, false, true, typemax(Int)) - mark(a) # mark at position 0 - write(a, "Hello!") - @test Base.compact(a) === nothing # because pointer > mark - close(a) - b = Base.GenericIOBuffer(UInt8[], true, true, false, true, typemax(Int)) - write(b, "Hello!") - read(b) - mark(b) # mark at position 6 - write(b, "Goodbye!") # now pointer is > mark but mark is > 0 - Base.compact(b) - @test readline(b) == "Goodbye!" - close(b) +@testset "Compacting" begin + # Compacting works + buf = Base.GenericIOBuffer(UInt8[], true, true, false, true, 20, false) + mark(buf) + write(buf, "Hello"^5) + reset(buf) + unmark(buf) + read(buf, UInt8) + read(buf, UInt8) + write(buf, "a!") + @test length(buf.data) == 20 + @test String(take!(buf)) == "llo" * "Hello"^3 * "a!"
+ + # Compacting does not do anything when mark == 0 + buf = Base.GenericIOBuffer(UInt8[], true, true, false, true, 5, false) + mark(buf) + write(buf, "Hello") + reset(buf) + mark(buf) + read(buf, UInt8) + read(buf, UInt8) + @test write(buf, "a!") == 0 + @test take!(buf) == b"llo" + + # Compacting without maxsize still works + buf = new_unseekable_buffer() + data = repeat(b"abcdefg", 100) + write(buf, data) + read(buf, 600) + data_len = length(buf.data) + write(buf, view(data, 1:500)) + @test length(buf.data) == data_len end @testset "peek(::GenericIOBuffer)" begin - io = Base.GenericIOBuffer(UInt8[], true, true, false, true, typemax(Int)) + io = Base.GenericIOBuffer(UInt8[], true, true, false, true, typemax(Int), false) write(io, "こんにちは") @test peek(io) == 0xe3 @test peek(io, Char) == 'こ' @@ -381,13 +672,22 @@ end v = @view a[1:2] io = IOBuffer() write(io,1) + write(io,0) seek(io,0) - @test Base.read_sub(io,v,1,1) == [1,0] + @test read!(io, v) == [1, 0] end @testset "with offset" begin b = pushfirst!([0x02], 0x01) @test take!(IOBuffer(b)) == [0x01, 0x02] + + # Read-only buffer does not take control of underlying buffer + v = pushfirst!([0x62, 0x63], 0x61) + buf = IOBuffer(v; write=false) + @test read(buf) == b"abc" + @test v == b"abc" # v is unchanged + + # Truncate end @testset "#54636 reading from non-dense vectors" begin diff --git a/test/llvmpasses/alloc-opt-pass.ll b/test/llvmpasses/alloc-opt-pass.ll index 665687e86835d..83f2118412cc1 100644 --- a/test/llvmpasses/alloc-opt-pass.ll +++ b/test/llvmpasses/alloc-opt-pass.ll @@ -73,6 +73,11 @@ L3: ; preds = %L2, %L1, %0 } ; CHECK-LABEL: }{{$}} +declare void @external_function() + +declare ptr addrspace(10) @external_function2() + + ; CHECK-LABEL: @legal_int_types ; CHECK: alloca [12 x i8] ; CHECK-NOT: alloca i96 @@ -89,21 +94,6 @@ define void @legal_int_types() { } ; CHECK-LABEL: }{{$}} -declare void @external_function() - -declare ptr addrspace(10) @external_function2() - -declare ptr @julia.ptls_states() - -declare ptr @julia.get_pgcstack() - -declare noalias ptr addrspace(10) @julia.gc_alloc_obj(ptr, i64, ptr addrspace(10)) - -declare ptr @julia.pointer_from_objref(ptr addrspace(11)) - -declare token @llvm.julia.gc_preserve_begin(...)
- -declare void @llvm.julia.gc_preserve_end(token) ; CHECK-LABEL: @memref_collision ; OPAQUE: call ptr @julia.ptls_states() @@ -171,13 +161,13 @@ define void @initializers() { %pgcstack = call ptr @julia.get_pgcstack() %ptls = call ptr @julia.ptls_states() %ptls_i8 = bitcast ptr %ptls to ptr - %var1 = call ptr addrspace(10) @julia.gc_alloc_obj(ptr %ptls_i8, i64 1, ptr addrspace(10) @tag) #1 + %var1 = call ptr addrspace(10) @julia.gc_alloc_obj(ptr %ptls_i8, i64 1, ptr addrspace(10) @tag) #4 %var2 = addrspacecast ptr addrspace(10) %var1 to ptr addrspace(11) %var3 = call ptr @julia.pointer_from_objref(ptr addrspace(11) %var2) - %var4 = call ptr addrspace(10) @julia.gc_alloc_obj(ptr %ptls_i8, i64 2, ptr addrspace(10) @tag) #2 + %var4 = call ptr addrspace(10) @julia.gc_alloc_obj(ptr %ptls_i8, i64 2, ptr addrspace(10) @tag) #7 %var5 = addrspacecast ptr addrspace(10) %var4 to ptr addrspace(11) %var6 = call ptr @julia.pointer_from_objref(ptr addrspace(11) %var5) - %var7 = call ptr addrspace(10) @julia.gc_alloc_obj(ptr %ptls_i8, i64 3, ptr addrspace(10) @tag) #3 + %var7 = call ptr addrspace(10) @julia.gc_alloc_obj(ptr %ptls_i8, i64 3, ptr addrspace(10) @tag) #1 %var8 = addrspacecast ptr addrspace(10) %var7 to ptr addrspace(11) %var9 = call ptr @julia.pointer_from_objref(ptr addrspace(11) %var8) ret void @@ -203,14 +193,154 @@ union_move9: ; No predecessors! } ; CHECK-LABEL: }{{$}} +@0 = private unnamed_addr constant ptr inttoptr (i64 4373799056 to ptr), !julia.constgv !0 +@1 = private unnamed_addr constant i64 0, align 8 + +; CHECK-LABEL: @cmpxchg +; CHECK: alloca +; CHECK: alloca +; CHECK: %20 = cmpxchg ptr %2, +define swiftcc i64 @"cmpxchg"(ptr nonnull swiftself %0) #0 { + %2 = alloca i64, align 16 + %3 = call ptr @julia.get_pgcstack() + %4 = getelementptr inbounds i8, ptr %3, i32 -152 + %5 = getelementptr inbounds i8, ptr %4, i32 168 + %6 = load ptr, ptr %5, align 8, !tbaa !4 + %7 = getelementptr inbounds i8, ptr %6, i32 16 + %8 = load ptr, ptr %7, align 8, !tbaa !8, !invariant.load !0 + fence syncscope("singlethread") seq_cst + call void @julia.safepoint(ptr %8) + fence syncscope("singlethread") seq_cst + %9 = load ptr, ptr @0, align 8, !tbaa !8, !invariant.load !0, !alias.scope !10, !noalias !13, !nonnull !0, !dereferenceable !18, !align !19 + %10 = ptrtoint ptr %9 to i64 + %11 = inttoptr i64 %10 to ptr + %12 = getelementptr inbounds i8, ptr %3, i32 -152 + %13 = addrspacecast ptr %11 to ptr addrspace(10) + call void @llvm.lifetime.start.p0(i64 8, ptr %2) + %14 = call noalias nonnull align 8 dereferenceable(8) ptr addrspace(10) @julia.gc_alloc_obj(ptr %12, i64 8, ptr addrspace(10) %13) #7 + %15 = addrspacecast ptr addrspace(10) %14 to ptr addrspace(11) + call void @llvm.memcpy.p11.p0.i64(ptr addrspace(11) align 8 %15, ptr align 8 @1, i64 8, i1 false), !tbaa !20, !alias.scope !23, !noalias !24 + %16 = addrspacecast ptr addrspace(10) %14 to ptr addrspace(11) + %17 = load atomic i64, ptr addrspace(11) %16 monotonic, align 8, !tbaa !25, !alias.scope !23, !noalias !24 + br label %19 + +18: ; preds = %19 + ret i64 %21 + +19: ; preds = %19, %1 + %20 = phi i64 [ %17, %1 ], [ %23, %19 ] + %21 = call swiftcc i64 @"jlsys_+_47"(ptr nonnull swiftself %3, i64 signext %20, i64 signext 1) + %22 = cmpxchg ptr addrspace(11) %16, i64 %20, i64 %21 seq_cst monotonic, align 8, !tbaa !25, !alias.scope !23, !noalias !24 + %23 = extractvalue { i64, i1 } %22, 0 + %24 = extractvalue { i64, i1 } %22, 1 + br i1 %24, label %18, label %19 +} + +; CHECK-LABEL: }{{$}} +; CHECK-LABEL: @atomicrmw +; CHECK: alloca +; CHECK: 
alloca +; CHECK: atomicrmw xchg ptr %2, +define swiftcc i64 @"atomicrmw"(ptr nonnull swiftself %0) #0 { + %2 = alloca i64, align 16 + %3 = call ptr @julia.get_pgcstack() + %4 = getelementptr inbounds i8, ptr %3, i32 -152 + %5 = getelementptr inbounds i8, ptr %4, i32 168 + %6 = load ptr, ptr %5, align 8, !tbaa !4 + %7 = getelementptr inbounds i8, ptr %6, i32 16 + %8 = load ptr, ptr %7, align 8, !tbaa !8, !invariant.load !0 + fence syncscope("singlethread") seq_cst + call void @julia.safepoint(ptr %8) + fence syncscope("singlethread") seq_cst + %9 = load ptr, ptr @0, align 8, !tbaa !8, !invariant.load !0, !alias.scope !10, !noalias !13, !nonnull !0, !dereferenceable !18, !align !19 + %10 = ptrtoint ptr %9 to i64 + %11 = inttoptr i64 %10 to ptr + %12 = getelementptr inbounds i8, ptr %3, i32 -152 + %13 = addrspacecast ptr %11 to ptr addrspace(10) + call void @llvm.lifetime.start.p0(i64 8, ptr %2) + %14 = call noalias nonnull align 8 dereferenceable(8) ptr addrspace(10) @julia.gc_alloc_obj(ptr %12, i64 8, ptr addrspace(10) %13) #7 + %15 = addrspacecast ptr addrspace(10) %14 to ptr addrspace(11) + call void @llvm.memcpy.p11.p0.i64(ptr addrspace(11) align 8 %15, ptr align 8 @1, i64 8, i1 false), !tbaa !20, !alias.scope !23, !noalias !24 + %16 = addrspacecast ptr addrspace(10) %14 to ptr addrspace(11) + %17 = load atomic i64, ptr addrspace(11) %16 monotonic, align 8, !tbaa !25, !alias.scope !23, !noalias !24 + %18 = call swiftcc i64 @"jlsys_+_47"(ptr nonnull swiftself %3, i64 signext %17, i64 signext 1) + %19 = atomicrmw xchg ptr addrspace(11) %16, i64 %18 seq_cst, align 8, !tbaa !25, !alias.scope !23, !noalias !24 + ret i64 %19 +} + +declare ptr @julia.ptls_states() + +declare ptr @julia.pointer_from_objref(ptr addrspace(11)) + +declare token @llvm.julia.gc_preserve_begin(...)
+ +declare void @llvm.julia.gc_preserve_end(token) + +declare ptr @julia.get_pgcstack() + +; Function Attrs: mustprogress nounwind willreturn memory(inaccessiblemem: readwrite) +declare nonnull align 8 dereferenceable(8) ptr addrspace(10) @ijl_box_int64(i64 signext) #2 + +; Function Attrs: memory(argmem: readwrite, inaccessiblemem: readwrite) +declare void @julia.safepoint(ptr) #3 + +; Function Attrs: mustprogress nounwind willreturn allockind("alloc") allocsize(1) memory(argmem: read, inaccessiblemem: readwrite) +declare noalias nonnull ptr addrspace(10) @julia.gc_alloc_obj(ptr, i64, ptr addrspace(10)) #4 + ; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite) -declare void @llvm.memcpy.p11.p0.i64(ptr addrspace(11) noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #0 +declare void @llvm.memcpy.p11.p0.i64(ptr addrspace(11) noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #5 + ; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite) -declare void @llvm.memcpy.p0.p11.i64(ptr noalias nocapture writeonly, ptr addrspace(11) noalias nocapture readonly, i64, i1 immarg) #0 +declare void @llvm.memcpy.p0.p11.i64(ptr noalias nocapture writeonly, ptr addrspace(11) noalias nocapture readonly, i64, i1 immarg) #5 + ; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite) -declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #0 +declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #5 + +declare swiftcc i64 @"jlsys_+_47"(ptr nonnull swiftself, i64 signext, i64 signext) #0 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #6 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #6 + +attributes #0 = { "probe-stack"="inline-asm" } +attributes #1 = { nounwind willreturn allockind("alloc,zeroed") allocsize(1) memory(argmem: read, inaccessiblemem: readwrite) } +attributes #2 = { mustprogress nounwind willreturn memory(inaccessiblemem: readwrite) } +attributes #3 = { memory(argmem: readwrite, inaccessiblemem: readwrite) } +attributes #4 = { mustprogress nounwind willreturn allockind("alloc") allocsize(1) memory(argmem: read, inaccessiblemem: readwrite) } +attributes #5 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +attributes #6 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #7 = { nounwind willreturn allockind("alloc,uninitialized") allocsize(1) memory(argmem: read, inaccessiblemem: readwrite) } +attributes #8 = { nounwind willreturn memory(inaccessiblemem: readwrite) } + +!llvm.module.flags = !{!1, !2, !3} + +!0 = !{} +!1 = !{i32 2, !"Dwarf Version", i32 4} +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 2, !"julia.optlevel", i32 2} +!4 = !{!5, !5, i64 0} +!5 = !{!"jtbaa_gcframe", !6, i64 0} +!6 = !{!"jtbaa", !7, i64 0} +!7 = !{!"jtbaa"} +!8 = !{!9, !9, i64 0, i64 1} +!9 = !{!"jtbaa_const", !6, i64 0} +!10 = !{!11} +!11 = !{!"jnoalias_const", !12} +!12 = !{!"jnoalias"} +!13 = !{!14, !15, !16, !17} +!14 = !{!"jnoalias_gcframe", !12} +!15 = !{!"jnoalias_stack", !12} +!16 = !{!"jnoalias_data", !12} +!17 = !{!"jnoalias_typemd", !12} +!18 = !{i64 56} +!19 = !{i64 16} +!20 = !{!21, 
!21, i64 0} +!21 = !{!"jtbaa_value", !22, i64 0} +!22 = !{!"jtbaa_data", !6, i64 0} +!23 = !{!16} +!24 = !{!14, !15, !17, !11} +!25 = !{!26, !26, i64 0} +!26 = !{!"jtbaa_mutab", !21, i64 0} -attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } -attributes #1 = { allockind("alloc") } -attributes #2 = { allockind("alloc,uninitialized") } -attributes #3 = { allockind("alloc,zeroed") } diff --git a/test/opaque_closure.jl b/test/opaque_closure.jl index 6c988b068a668..7b02578a86621 100644 --- a/test/opaque_closure.jl +++ b/test/opaque_closure.jl @@ -390,3 +390,20 @@ let ir = first(only(Base.code_ircode(sin, (Int,)))) oc = Core.OpaqueClosure(ir; do_compile=false) @test oc(1) == sin(1) end + +function typed_add54236(::Type{T}) where T + return @opaque (x::Int)->T(x) + T(1) +end +let f = typed_add54236(Float64) + @test f isa Core.OpaqueClosure + @test f(32) === 33.0 +end + +f54357(g, ::Type{AT}) where {AT} = Base.Experimental.@opaque AT->_ (args...) -> g((args::AT)...) +let f = f54357(+, Tuple{Int,Int}) + @test f isa Core.OpaqueClosure + @test f(32, 34) === 66 + g = f54357(+, Tuple{Float64,Float64}) + @test g isa Core.OpaqueClosure + @test g(32.0, 34.0) === 66.0 +end diff --git a/test/precompile.jl b/test/precompile.jl index 7c5c63a277e27..f7b31c125014c 100644 --- a/test/precompile.jl +++ b/test/precompile.jl @@ -2416,4 +2416,9 @@ precompile_test_harness("Package top-level load itself") do load_path end end +# Verify that inference / caching was not performed for any macros in the sysimage +let m = only(methods(Base.var"@big_str")) + @test m.specializations === Core.svec() || !isdefined(m.specializations, :cache) +end + finish_precompile_test!() diff --git a/test/strings/annotated.jl b/test/strings/annotated.jl index 8658c1b52a2ab..7f53740b9eec1 100644 --- a/test/strings/annotated.jl +++ b/test/strings/annotated.jl @@ -258,3 +258,51 @@ end write(aio, Base.AnnotatedString("hello", [(1:5, :tag, 1)])) @test sprint(show, aio) == "Base.AnnotatedIOBuffer(5 bytes, 1 annotation)" end + +@testset "Eachregion" begin + annregions(str::String, annots::Vector{<:Tuple{UnitRange{Int}, Symbol, <:Any}}) = + [(s, Tuple.(a)) for (s, a) in Base.eachregion(Base.AnnotatedString(str, annots))] + # Regions that do/don't extend to the left/right edges + @test annregions(" abc ", [(2:4, :face, :bold)]) == + [(" ", []), + ("abc", [(:face, :bold)]), + (" ", [])] + @test annregions(" x ", [(2:2, :face, :bold)]) == + [(" ", []), + ("x", [(:face, :bold)]), + (" ", [])] + @test annregions(" x", [(2:2, :face, :bold)]) == + [(" ", []), + ("x", [(:face, :bold)])] + @test annregions("x ", [(1:1, :face, :bold)]) == + [("x", [(:face, :bold)]), + (" ", [])] + @test annregions("x", [(1:1, :face, :bold)]) == + [("x", [(:face, :bold)])] + # Overlapping/nested regions + @test annregions(" abc ", [(2:4, :face, :bold), (3:3, :face, :italic)]) == + [(" ", []), + ("a", [(:face, :bold)]), + ("b", [(:face, :bold), (:face, :italic)]), + ("c", [(:face, :bold)]), + (" ", [])] + @test annregions("abc-xyz", [(1:7, :face, :bold), (1:3, :face, :green), (4:4, :face, :yellow), (4:7, :face, :italic)]) == + [("abc", [(:face, :bold), (:face, :green)]), + ("-", [(:face, :bold), (:face, :yellow), (:face, :italic)]), + ("xyz", [(:face, :bold), (:face, :italic)])] + # Preserving annotation order + @test annregions("abcd", [(1:3, :face, :red), (2:2, :face, :yellow), (2:3, :face, :green), (2:4, :face, :blue)]) == + [("a", [(:face, :red)]), + ("b", [(:face, :red), (:face, :yellow), (:face, :green), (:face, :blue)]), + ("c", [(:face, 
:red), (:face, :green), (:face, :blue)]), + ("d", [(:face, :blue)])] + @test annregions("abcd", [(2:4, :face, :blue), (1:3, :face, :red), (2:3, :face, :green), (2:2, :face, :yellow)]) == + [("a", [(:face, :red)]), + ("b", [(:face, :blue), (:face, :red), (:face, :green), (:face, :yellow)]), + ("c", [(:face, :blue), (:face, :red), (:face, :green)]), + ("d", [(:face, :blue)])] + # Region starting after a character spanning multiple code units. + @test annregions("🍐x", [(1:4, :face, :red)]) == + [("🍐", [(:face, :red)]), + ("x", [])] +end diff --git a/test/trimming/trimming.jl b/test/trimming/trimming.jl index 0c5226cba01fe..a752c69460ad4 100644 --- a/test/trimming/trimming.jl +++ b/test/trimming/trimming.jl @@ -4,7 +4,7 @@ let exe_suffix = splitext(Base.julia_exename())[2] hello_exe = joinpath(@__DIR__, "hello" * exe_suffix) @test readchomp(`$hello_exe`) == "Hello, world!" - @test filesize(hello_exe) < filesize(unsafe_string(Base.JLOptions().image_file))/10 + @test filesize(hello_exe) < 2000000 basic_jll_exe = joinpath(@__DIR__, "basic_jll" * exe_suffix) lines = split(readchomp(`$basic_jll_exe`), "\n")