diff --git a/base/Makefile b/base/Makefile
index a9fe6ae7fa6c5..ca7eb6e1e2879 100644
--- a/base/Makefile
+++ b/base/Makefile
@@ -17,17 +17,26 @@ else
   PCRE_INCL_PATH := $(build_includedir)/pcre2.h
 endif
 
-define parse_features
-@printf "%s\n" "# $(2) features" >> $@
-@$(call PRINT_PERL, cat $(SRCDIR)/../src/features_$(1).h | perl -lne 'print "const JL_$(2)_$$1 = UInt32($$2)" if /^\s*JL_FEATURE_DEF(?:_NAME)?\(\s*(\w+)\s*,\s*([^,]+)\s*,.*\)\s*(?:\/\/.*)?$$/' >> $@)
+# Extract feature indices from cpufeatures generated headers.
+# The FeatureIndex enum has entries like: FEAT_SSE3 = 108,
+# We convert them to: const JL_X86_sse3 = UInt32(108)
+CPUFEATURES_GENDIR := $(build_includedir)/cpufeatures
+
+# Usage: $(call parse_cpufeatures,<header path>,<arch tag>)
+# Appends one `const JL_<arch tag>_<lowercased NAME> = UInt32(<index>)` line to $@ per FEAT_<NAME> entry.
+define parse_cpufeatures
+@printf "%s\n" "# $(2) features (from cpufeatures)" >> $@
+@$(call PRINT_PERL, perl -lne 'if (/^\s*FEAT_(\w+)\s*=\s*(\d+)/) { my $$n = lc($$1); print "const JL_$(2)_$$n = UInt32($$2)" }' $(1) >> $@)
 @printf "\n" >> $@
 endef
 
-$(BUILDDIR)/features_h.jl: $(SRCDIR)/../src/features_x86.h $(SRCDIR)/../src/features_aarch32.h $(SRCDIR)/../src/features_aarch64.h
+# Name both headers explicitly instead of globbing: a wildcard is expanded when the makefile
+# is parsed, so headers that are not installed yet would leave this rule with no prerequisites
+# and an existing features_h.jl would never be regenerated.
+$(BUILDDIR)/features_h.jl: $(CPUFEATURES_GENDIR)/target_tables_x86_64.h $(CPUFEATURES_GENDIR)/target_tables_aarch64.h
 	@-rm -f $@
-	@$(call parse_features,x86,X86)
-	@$(call parse_features,aarch32,AArch32)
-	@$(call parse_features,aarch64,AArch64)
+	@$(call parse_cpufeatures,$(CPUFEATURES_GENDIR)/target_tables_x86_64.h,X86)
+	@$(call parse_cpufeatures,$(CPUFEATURES_GENDIR)/target_tables_aarch64.h,AArch64)
 
 $(BUILDDIR)/pcre_h.jl: $(PCRE_INCL_PATH)
 	@$(call PRINT_PERL, $(CPP) -D PCRE2_CODE_UNIT_WIDTH=8 -dM $< | perl -nle '/^\s*#define\s+PCRE2_(\w*)\s*\(?($(PCRE_CONST))\)?u?\s*$$/ and print index($$1, "ERROR_") == 0 ? "const $$1 = Cint($$2)" : "const $$1 = UInt32($$2)"' | LC_ALL=C sort > $@)
diff --git a/base/cpuid.jl b/base/cpuid.jl
index 0370bd33b83e5..5f58f596b0af9 100644
--- a/base/cpuid.jl
+++ b/base/cpuid.jl
@@ -10,8 +10,8 @@ export cpu_isa
 A structure which represents the Instruction Set Architecture (ISA) of a
 computer.  It holds the `Set` of features of the CPU.
 
-The numerical values of the features are automatically generated from the C
-source code of Julia and stored in the `features_h.jl` Julia file.
+Feature bit indices come from the cpufeatures library's generated tables
+(extracted from LLVM's TableGen data at build time).
 """
 struct ISA
     features::Set{UInt32}
@@ -23,55 +23,167 @@ Base.isless(a::ISA,  b::ISA) = a < b
 
 include(string(Base.BUILDROOT, "features_h.jl"))  # include($BUILDROOT/base/features_h.jl)
 
-# Keep in sync with `arch_march_isa_mapping`.
+"""
+    _featurebytes_to_isa(buf::Vector{UInt8}) -> ISA
+
+Convert a raw feature byte buffer (from cpufeatures) into an ISA.
+"""
+function _featurebytes_to_isa(buf::Vector{UInt8})
+    features = Set{UInt32}()
+    for byte_idx in 0:length(buf)-1
+        b = buf[byte_idx + 1]
+        b == 0 && continue
+        for bit in 0:7
+            if (b >> bit) & 1 != 0
+                push!(features, UInt32(byte_idx * 8 + bit))
+            end
+        end
+    end
+    return ISA(features)
+end
+
+"""
+    _cross_lookup_cpu(arch::String, name::String) -> ISA
+
+Look up hardware features for a CPU on any architecture using the
+cross-arch tables. Works regardless of host architecture.
+Returns an empty ISA if the CPU or architecture is not found.
+"""
+function _cross_lookup_cpu(arch::String, name::String)
+    nbytes = ccall(:jl_cpufeatures_cross_nbytes, Csize_t, (Cstring,), arch)
+    nbytes == 0 && return ISA(Set{UInt32}())
+    buf = zeros(UInt8, nbytes)  # zero-filled: any byte the callee leaves unwritten decodes as "no feature"
+    written = ccall(:jl_cpufeatures_cross_lookup, Csize_t,
+                    (Cstring, Cstring, Ptr{UInt8}, Csize_t),
+                    arch, name, buf, nbytes)
+    written == 0 && return ISA(Set{UInt32}())  # zero bytes written: treated as "CPU not found"
+    return _featurebytes_to_isa(buf)
+end
+
+"""
+    _build_bit_to_name(arch::String) -> Dict{UInt32, String}
+
+Build a mapping from feature bit index to feature name for an architecture.
+"""
+function _build_bit_to_name(arch::String)
+    nfeats = ccall(:jl_cpufeatures_cross_num_features, UInt32, (Cstring,), arch)
+    result = Dict{UInt32, String}()
+    for i in 0:Int(nfeats)-1  # widen first: with nfeats == 0, UInt32 `nfeats-1` wraps to typemax(UInt32)
+        name_ptr = ccall(:jl_cpufeatures_cross_feature_name, Cstring, (Cstring, UInt32), arch, i)
+        name_ptr == C_NULL && continue  # no name for this table slot
+        bit = ccall(:jl_cpufeatures_cross_feature_bit, Cint, (Cstring, UInt32), arch, i)
+        bit < 0 && continue  # negative bit: skip (cannot be a valid UInt32 index)
+        result[UInt32(bit)] = unsafe_string(name_ptr)
+    end
+    return result
+end
+
+"""
+    feature_names(arch::String, cpu::String) -> Vector{String}
+    feature_names(arch::String, isa::ISA) -> Vector{String}
+    feature_names(isa::ISA) -> Vector{String}
+    feature_names() -> Vector{String}
+
+Return sorted hardware feature names. Can query by CPU name (on any
+architecture) or by ISA. Defaults to the host architecture and CPU.
+
+# Examples
+```julia
+feature_names()                           # host CPU features
+feature_names("x86_64", "haswell")        # haswell's features
+feature_names("aarch64", "cortex-x925")   # cross-arch query
+```
+"""
+feature_names() = feature_names(string(Sys.ARCH), _host_isa())
+feature_names(isa::ISA) = feature_names(string(Sys.ARCH), isa)
+function feature_names(arch::String, cpu::String)
+    isa = _cross_lookup_cpu(arch, cpu)
+    return feature_names(arch, isa)
+end
+function feature_names(arch::String, isa::ISA)
+    mapping = _build_bit_to_name(arch)
+    return sort([get(mapping, bit, "unknown_$bit") for bit in isa.features])
+end
+
+"""
+    _lookup_cpu(name::String) -> ISA
+
+Look up hardware features for the named CPU on the host architecture.
+Returns an empty ISA if the CPU name is not found.
+"""
+function _lookup_cpu(name::String)
+    nbytes = ccall(:jl_cpufeatures_nbytes, Csize_t, ())
+    buf = Vector{UInt8}(undef, nbytes)
+    ret = ccall(:jl_cpufeatures_lookup, Cint, (Cstring, Ptr{UInt8}, Csize_t), name, buf, nbytes)
+    ret != 0 && return ISA(Set{UInt32}())
+    return _featurebytes_to_isa(buf)
+end
+
+"""
+    _host_isa() -> ISA
+
+Get the hardware features of the host CPU from the cpufeatures library.
+"""
+function _host_isa()
+    nbytes = ccall(:jl_cpufeatures_nbytes, Csize_t, ())
+    buf = Vector{UInt8}(undef, nbytes)
+    ccall(:jl_cpufeatures_host, Cvoid, (Ptr{UInt8}, Csize_t), buf, nbytes)
+    return _featurebytes_to_isa(buf)
+end
+
+# Build an ISA list for a given architecture family.
+# Uses cross-arch lookup so it works on any host.
+# Entries with empty cpuname get an empty ISA (generic baseline).
+function _make_isa_list(arch::String, entries::Vector{Pair{String,String}})
+    result = Pair{String,ISA}[]
+    for (label, cpuname) in entries
+        if isempty(cpuname)
+            push!(result, label => ISA(Set{UInt32}()))
+        else
+            push!(result, label => _cross_lookup_cpu(arch, cpuname))
+        end
+    end
+    return result
+end
+
+# ISA definitions per architecture family.
+# CPU names are LLVM names in the cpufeatures database.
+# Keep in sync with `arch_march_isa_mapping` in binaryplatforms.jl.
 const ISAs_by_family = Dict(
-    "i686" => [
-        # Source: https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html.
-        # Implicit in all sets, because always required by Julia: mmx, sse, sse2
-        "pentium4" => ISA(Set{UInt32}()),
-        "prescott" => ISA(Set((JL_X86_sse3,))),
-    ],
-    "x86_64" => [
-        # Source: https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html.
-        # Implicit in all sets, because always required by x86-64 architecture: mmx, sse, sse2
-        "x86_64" => ISA(Set{UInt32}()),
-        "core2" => ISA(Set((JL_X86_sse3, JL_X86_ssse3))),
-        "nehalem" => ISA(Set((JL_X86_sse3, JL_X86_ssse3, JL_X86_sse41, JL_X86_sse42, JL_X86_popcnt))),
-        "sandybridge" => ISA(Set((JL_X86_sse3, JL_X86_ssse3, JL_X86_sse41, JL_X86_sse42, JL_X86_popcnt, JL_X86_avx, JL_X86_aes, JL_X86_pclmul))),
-        "haswell" => ISA(Set((JL_X86_movbe, JL_X86_sse3, JL_X86_ssse3, JL_X86_sse41, JL_X86_sse42, JL_X86_popcnt, JL_X86_avx, JL_X86_avx2, JL_X86_aes, JL_X86_pclmul, JL_X86_fsgsbase, JL_X86_rdrnd, JL_X86_fma, JL_X86_bmi, JL_X86_bmi2, JL_X86_f16c))),
-        "skylake" => ISA(Set((JL_X86_movbe, JL_X86_sse3, JL_X86_ssse3, JL_X86_sse41, JL_X86_sse42, JL_X86_popcnt, JL_X86_avx, JL_X86_avx2, JL_X86_aes, JL_X86_pclmul, JL_X86_fsgsbase, JL_X86_rdrnd, JL_X86_fma, JL_X86_bmi, JL_X86_bmi2, JL_X86_f16c, JL_X86_rdseed, JL_X86_adx, JL_X86_prfchw, JL_X86_clflushopt, JL_X86_xsavec, JL_X86_xsaves))),
-        "skylake_avx512" => ISA(Set((JL_X86_movbe, JL_X86_sse3, JL_X86_ssse3, JL_X86_sse41, JL_X86_sse42, JL_X86_popcnt, JL_X86_pku, JL_X86_avx, JL_X86_avx2, JL_X86_aes, JL_X86_pclmul, JL_X86_fsgsbase, JL_X86_rdrnd, JL_X86_fma, JL_X86_bmi, JL_X86_bmi2, JL_X86_f16c, JL_X86_rdseed, JL_X86_adx, JL_X86_prfchw, JL_X86_clflushopt, JL_X86_xsavec, JL_X86_xsaves, JL_X86_avx512f, JL_X86_clwb, JL_X86_avx512vl, JL_X86_avx512bw, JL_X86_avx512dq, JL_X86_avx512cd))),
-    ],
-    "armv6l" => [
-        # The only armv6l processor we know of that runs Julia on armv6l
-        # We don't have a good way to tell the different armv6l variants apart through features,
-        # and honestly we don't care much since it's basically this one chip that people want to use with Julia.
-        "arm1176jzfs" => ISA(Set{UInt32}()),
-    ],
-    "armv7l" => [
-        "armv7l" => ISA(Set{UInt32}()),
-        "armv7l+neon" => ISA(Set((JL_AArch32_neon,))),
-        "armv7l+neon+vfpv4" => ISA(Set((JL_AArch32_neon, JL_AArch32_vfp4))),
-    ],
-    "aarch64" => [
-        # Implicit in all sets, because always required: fp, asimd
-        "armv8.0-a" => ISA(Set{UInt32}()),
-        "armv8.1-a" => ISA(Set((JL_AArch64_v8_1a, JL_AArch64_lse, JL_AArch64_crc, JL_AArch64_rdm))),
-        "armv8.2-a+crypto" => ISA(Set((JL_AArch64_v8_2a, JL_AArch64_lse, JL_AArch64_crc, JL_AArch64_rdm, JL_AArch64_aes, JL_AArch64_sha2))),
-        "a64fx" => ISA(Set((JL_AArch64_v8_2a, JL_AArch64_lse, JL_AArch64_crc, JL_AArch64_rdm, JL_AArch64_sha2, JL_AArch64_ccpp, JL_AArch64_complxnum, JL_AArch64_fullfp16, JL_AArch64_sve))),
-        "apple_m1" => ISA(Set((JL_AArch64_v8_5a, JL_AArch64_lse, JL_AArch64_crc, JL_AArch64_rdm, JL_AArch64_aes, JL_AArch64_sha2, JL_AArch64_sha3, JL_AArch64_ccpp, JL_AArch64_complxnum, JL_AArch64_fp16fml, JL_AArch64_fullfp16, JL_AArch64_dotprod, JL_AArch64_rcpc, JL_AArch64_altnzcv))),
-    ],
-    "riscv64" => [
-        "riscv64" => ISA(Set{UInt32}()),
-    ],
-    "powerpc64le" => [
-        # We have no way to test powerpc64le features yet, so we're only going to declare the lowest ISA:
-        "power8" => ISA(Set{UInt32}()),
-    ],
-    "riscv64" => [
-        # We have no way to test riscv64 features yet, so we're only going to declare the lowest ISA:
-        "riscv64" => ISA(Set{UInt32}()),
-    ],
+    "i686" => _make_isa_list("x86_64", [
+        "pentium4" => "",
+        "prescott" => "prescott",
+    ]),
+    "x86_64" => _make_isa_list("x86_64", [
+        "x86_64" => "",
+        "core2" => "core2",
+        "nehalem" => "nehalem",
+        "sandybridge" => "sandybridge",
+        "haswell" => "haswell",
+        "skylake" => "skylake",
+        "skylake_avx512" => "skylake-avx512",
+    ]),
+    "aarch64" => _make_isa_list("aarch64", [
+        "armv8.0-a" => "",
+        "armv8.1-a" => "cortex-a76",
+        "armv8.2-a+crypto" => "cortex-a78",
+        "a64fx" => "a64fx",
+        "apple_m1" => "apple-a14",
+    ]),
+    "armv6l" => _make_isa_list("aarch64", [
+        "arm1176jzfs" => "",
+    ]),
+    "armv7l" => _make_isa_list("aarch64", [
+        "armv7l" => "",
+        "armv7l+neon" => "",
+        "armv7l+neon+vfpv4" => "",
+    ]),
+    "riscv64" => _make_isa_list("riscv64", [
+        "riscv64" => "",
+    ]),
+    "powerpc64le" => _make_isa_list("powerpc64le", [
+        "power8" => "",
+    ]),
 )
 
 # Test a CPU feature exists on the currently-running host
@@ -96,27 +208,13 @@ function normalize_arch(arch::String)
     return arch
 end
 
-let
-    # Collect all relevant features for the current architecture, if any.
-    FEATURES = UInt32[]
-    arch = normalize_arch(String(Sys.ARCH))
-    if arch in keys(ISAs_by_family)
-        for isa in ISAs_by_family[arch]
-            unique!(append!(FEATURES, last(isa).features))
-        end
-    end
-
-    # Use `@eval` to inline the list of features.
-    @eval function cpu_isa()
-        return ISA(Set{UInt32}(feat for feat in $(FEATURES) if test_cpu_feature(feat)))
-    end
-end
-
 """
     cpu_isa()
 
 Return the [`ISA`](@ref) (instruction set architecture) of the current CPU.
 """
-cpu_isa
+function cpu_isa()
+    return _host_isa()
+end
 
 end # module CPUID
diff --git a/base/loading.jl b/base/loading.jl
index fa5694227f382..8d0e68ece3b1c 100644
--- a/base/loading.jl
+++ b/base/loading.jl
@@ -1933,21 +1933,25 @@ end
 struct ImageTarget
     name::String
     flags::Int32
+    base::Int32
     ext_features::String
-    features_en::Vector{UInt8}
-    features_dis::Vector{UInt8}
+    features_en::String
+    features_dis::String
 end
 
 function parse_image_target(io::IO)
     flags = read(io, Int32)
-    nfeature = read(io, Int32)
-    feature_en = read(io, 4*nfeature)
-    feature_dis = read(io, 4*nfeature)
+    base = read(io, Int32)
+    nwords = read(io, Int32)  # number of uint64_t feature words
+    feature_en_raw = read(io, 8*nwords)
+    feature_dis_raw = read(io, 8*nwords)
     name_len = read(io, Int32)
     name = String(read(io, name_len))
     ext_features_len = read(io, Int32)
     ext_features = String(read(io, ext_features_len))
-    ImageTarget(name, flags, ext_features, feature_en, feature_dis)
+    features_en = @ccall jl_feature_bits_to_string(feature_en_raw::Ptr{UInt8}, nwords::Int32)::Ref{String}
+    features_dis = @ccall jl_feature_bits_to_string(feature_dis_raw::Ptr{UInt8}, nwords::Int32)::Ref{String}
+    ImageTarget(name, flags, base, ext_features, features_en, features_dis)
 end
 
 function parse_image_targets(targets::Vector{UInt8})
@@ -1965,51 +1969,18 @@ function current_image_targets()
     return parse_image_targets(targets)
 end
 
-struct FeatureName
-    name::Cstring
-    bit::UInt32 # bit index into a `uint32_t` array;
-    llvmver::UInt32 # 0 if it is available on the oldest LLVM version we support
-end
-
-function feature_names()
-    fnames = Ref{Ptr{FeatureName}}()
-    nf = Ref{Csize_t}()
-    @ccall jl_reflect_feature_names(fnames::Ptr{Ptr{FeatureName}}, nf::Ptr{Csize_t})::Cvoid
-    if fnames[] == C_NULL
-        @assert nf[] == 0
-        return Vector{FeatureName}(undef, 0)
-    end
-    Base.unsafe_wrap(Array, fnames[], nf[], own=false)
-end
-
-function test_feature(features::Vector{UInt8}, feat::FeatureName)
-    bitidx = feat.bit
-    u8idx = div(bitidx, 8) + 1
-    bit = bitidx % 8
-    return (features[u8idx] & (1 << bit)) != 0
-end
-
 function show(io::IO, it::ImageTarget)
     print(io, it.name)
     if !isempty(it.ext_features)
         print(io, ",", it.ext_features)
     end
+    if it.base >= 0
+        print(io, "; base=", it.base)
+    end
     print(io, "; flags=", it.flags)
-    print(io, "; features_en=(")
-    first = true
-    for feat in feature_names()
-        if test_feature(it.features_en, feat)
-            name = Base.unsafe_string(feat.name)
-            if first
-                first = false
-                print(io, name)
-            else
-                print(io, ", ", name)
-            end
-        end
+    if !isempty(it.features_en)
+        print(io, "; features_en=(", it.features_en, ")")
     end
-    print(io, ")")
-    # Is feature_dis useful?
 end
 
 # should sync with the types of arguments of `stale_cachefile`
diff --git a/deps/Makefile b/deps/Makefile
index cea1e52c55156..1dd0d5dd9e1e5 100644
--- a/deps/Makefile
+++ b/deps/Makefile
@@ -95,6 +95,8 @@ ifeq ($(USE_SYSTEM_DSFMT), 0)
 DEP_LIBS += dsfmt
 endif
 
+DEP_LIBS += cpufeatures
+
 ifeq ($(USE_SYSTEM_LLVM), 0)
 DEP_LIBS += llvm
 endif
@@ -211,7 +213,7 @@ DEP_LIBS_STAGED_ALL := llvm llvm-tools clang llvmunwind unwind libuv pcre \
 	openlibm dsfmt blastrampoline openblas lapack gmp mpfr patchelf utf8proc \
 	objconv openssl libssh2 nghttp2 curl libgit2 libwhich zlib zstd p7zip csl \
 	sanitizers libsuitesparse lld libtracyclient ittapi nvtx \
-	terminfo mmtk_julia
+	terminfo mmtk_julia cpufeatures
 DEP_LIBS_ALL := $(DEP_LIBS_STAGED_ALL)
 
 ifneq ($(USE_BINARYBUILDER_OPENBLAS),0)
@@ -282,6 +284,7 @@ include $(SRCDIR)/unwind.mk
 include $(SRCDIR)/gmp.mk
 include $(SRCDIR)/mpfr.mk
 include $(SRCDIR)/patchelf.mk
+include $(SRCDIR)/cpufeatures.mk
 include $(SRCDIR)/openssl.mk
 include $(SRCDIR)/libssh2.mk
 include $(SRCDIR)/nghttp2.mk
diff --git a/deps/cpufeatures.mk b/deps/cpufeatures.mk
new file mode 100644
index 0000000000000..0bac863c6704e
--- /dev/null
+++ b/deps/cpufeatures.mk
@@ -0,0 +1,48 @@
+## CPUFEATURES - standalone CPU feature detection library ##
+include $(SRCDIR)/cpufeatures.version
+
+CPUFEATURES_SRC_DIR := $(BUILDDIR)/cpufeatures-$(CPUFEATURES_VER)
+
+$(SRCCACHE)/cpufeatures-$(CPUFEATURES_VER).tar.gz: | $(SRCCACHE)
+	$(JLDOWNLOAD) $@ $(CPUFEATURES_TAR_URL)
+	touch -c $@
+
+$(CPUFEATURES_SRC_DIR)/source-extracted: $(SRCCACHE)/cpufeatures-$(CPUFEATURES_VER).tar.gz
+	rm -rf $(dir $@)
+	mkdir -p $(dir $@)
+	$(TAR) -C $(dir $@) --strip-components 1 -xf $<
+	echo 1 > $@
+
+checksum-cpufeatures: $(SRCCACHE)/cpufeatures-$(CPUFEATURES_VER).tar.gz
+	$(JLCHECKSUM) $<
+
+$(CPUFEATURES_SRC_DIR)/build-compiled: $(CPUFEATURES_SRC_DIR)/source-extracted
+	$(MAKE) -C $(CPUFEATURES_SRC_DIR) lib \
+		CXX="$(CXX)" \
+		CXXFLAGS="$(JCXXFLAGS) -O2" \
+		ARCH=$(ARCH)
+	echo 1 > $@
+
+define CPUFEATURES_INSTALL
+	mkdir -p $2/$$(build_includedir)/cpufeatures
+	mkdir -p $2/$$(build_libdir)
+	cp $1/include/*.h $2/$$(build_includedir)/cpufeatures/
+	cp $1/generated/target_tables_*.h $2/$$(build_includedir)/cpufeatures/
+	cp $1/build/libtarget_parsing.a $2/$$(build_libdir)/
+endef
+$(eval $(call staged-install, \
+	cpufeatures,cpufeatures-$(CPUFEATURES_VER), \
+	CPUFEATURES_INSTALL,,,,))
+
+clean-cpufeatures:
+	-rm -f $(CPUFEATURES_SRC_DIR)/build-compiled
+
+distclean-cpufeatures:
+	rm -rf $(SRCCACHE)/cpufeatures*.tar.gz $(CPUFEATURES_SRC_DIR)
+
+get-cpufeatures: $(SRCCACHE)/cpufeatures-$(CPUFEATURES_VER).tar.gz
+extract-cpufeatures: $(CPUFEATURES_SRC_DIR)/source-extracted
+configure-cpufeatures: extract-cpufeatures
+compile-cpufeatures: $(CPUFEATURES_SRC_DIR)/build-compiled
+fastcheck-cpufeatures: check-cpufeatures
+check-cpufeatures: compile-cpufeatures
diff --git a/deps/cpufeatures.version b/deps/cpufeatures.version
new file mode 100644
index 0000000000000..ced12eccd62f5
--- /dev/null
+++ b/deps/cpufeatures.version
@@ -0,0 +1,7 @@
+# -*- makefile -*-
+
+## source build
+CPUFEATURES_VER := 0.2.0
+CPUFEATURES_GIT_URL := https://github.com/gbaraldi/cpufeatures.git
+CPUFEATURES_TAR_URL := https://github.com/gbaraldi/cpufeatures/archive/e8178f952870a83c506f3f08150e3915193ab862.tar.gz
+CPUFEATURES_SHA := e8178f952870a83c506f3f08150e3915193ab862
diff --git a/src/Makefile b/src/Makefile
index 495a923f372e3..2a6e0a554b757 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -106,7 +106,7 @@ else
 # JULIACODEGEN != LLVM
 endif
 
-RT_LLVM_LIBS := support targetparser
+RT_LLVM_LIBS := support
 
 ifeq ($(OS),WINNT)
 SRCS += win32_ucontext
@@ -203,7 +203,7 @@ LIBJULIA_PATH_REL := libjulia
 endif
 
 COMMON_LIBPATHS := -L$(build_libdir) -L$(build_shlibdir)
-RT_LIBS := $(call whole_archive,$(LIBUV)) $(call whole_archive,$(LIBUTF8PROC)) $(LIBUNWIND) $(RT_LLVMLINK) $(OSLIBS) $(LIBTRACYCLIENT) $(LIBITTAPI) -lzstd
+RT_LIBS := $(call whole_archive,$(LIBUV)) $(call whole_archive,$(LIBUTF8PROC)) $(LIBUNWIND) $(RT_LLVMLINK) $(OSLIBS) $(LIBTRACYCLIENT) $(LIBITTAPI) -lzstd -L$(build_libdir) -ltarget_parsing
 # NB: CG needs uv_mutex_* symbols, but we expect to export them from libjulia-internal
 CG_LIBS := $(LIBUNWIND) $(CG_LLVMLINK) $(OSLIBS) $(LIBTRACYCLIENT) $(LIBITTAPI)
 
@@ -422,7 +422,7 @@ $(BUILDDIR)/llvm-pass-helpers.o $(BUILDDIR)/llvm-pass-helpers.dbg.obj: $(SRCDIR)
 $(BUILDDIR)/llvm-propagate-addrspaces.o $(BUILDDIR)/llvm-propagate-addrspaces.dbg.obj: $(SRCDIR)/llvm-codegen-shared.h
 $(BUILDDIR)/llvm-remove-addrspaces.o $(BUILDDIR)/llvm-remove-addrspaces.dbg.obj: $(SRCDIR)/llvm-codegen-shared.h
 $(BUILDDIR)/llvm-ptls.o $(BUILDDIR)/llvm-ptls.dbg.obj: $(SRCDIR)/llvm-codegen-shared.h
-$(BUILDDIR)/processor.o $(BUILDDIR)/processor.dbg.obj: $(addprefix $(SRCDIR)/,processor_*.cpp processor.h features_*.h)
+$(BUILDDIR)/processor.o $(BUILDDIR)/processor.dbg.obj: $(SRCDIR)/processor.h
 $(BUILDDIR)/signal-handling.o $(BUILDDIR)/signal-handling.dbg.obj: $(addprefix $(SRCDIR)/,signals-*.c)
 $(BUILDDIR)/staticdata.o $(BUILDDIR)/staticdata.dbg.obj: $(SRCDIR)/staticdata_utils.c $(SRCDIR)/precompile_utils.c $(SRCDIR)/processor.h $(SRCDIR)/builtin_proto.h
 $(BUILDDIR)/toplevel.o $(BUILDDIR)/toplevel.dbg.obj: $(SRCDIR)/builtin_proto.h
@@ -577,10 +577,7 @@ INCLUDED_CXX_FILES := \
 	codegen.cpp:abi_x86.cpp \
 	codegen.cpp:cgutils.cpp \
 	codegen.cpp:intrinsics.cpp \
-	codegen.cpp:ccall.cpp \
-	processor.cpp:processor_x86.cpp \
-	processor.cpp:processor_arm.cpp \
-	processor.cpp:processor_fallback.cpp
+	codegen.cpp:ccall.cpp
 
 .PHONY: clean
 clean:
@@ -612,7 +609,7 @@ $(build_shlibdir)/lib%Plugin.$(SHLIB_EXT): $(SRCDIR)/clangsa/%.cpp $(LLVM_CONFIG
 # before attempting this static analysis, so that all necessary headers
 # and dependencies are properly installed:
 #   make -C src install-analysis-deps
-ANALYSIS_DEPS := llvm clang llvm-tools libuv utf8proc zstd
+ANALYSIS_DEPS := llvm clang llvm-tools libuv utf8proc zstd cpufeatures
 ifeq ($(OS),Darwin)
 ANALYSIS_DEPS += llvmunwind
 else ifeq ($(OS),OpenBSD)
diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index 1fbfd459fc2ea..4a7936b697b94 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -2105,8 +2105,6 @@ void jl_dump_native_locked(jl_native_code_desc_t *data, const char *bc_fname,
 
     // Reset the target triple to make sure it matches the new target machine
 
-    bool has_veccall = false;
-
     {
         JL_TIMING(NATIVE_AOT, NATIVE_Setup);
         dataM.setDataLayout(DL);
@@ -2185,7 +2183,6 @@ void jl_dump_native_locked(jl_native_code_desc_t *data, const char *bc_fname,
             }
         }
 
-        has_veccall = !!dataM.getModuleFlag("julia.mv.veccall");
     };
 
     {
@@ -2245,20 +2242,8 @@ void jl_dump_native_locked(jl_native_code_desc_t *data, const char *bc_fname,
             builder.CreateRet(ConstantInt::get(T_int32, 1));
         }
         if (imaging_mode) {
-            auto specs = jl_get_llvm_clone_targets(jl_options.cpu_target);
-            const uint32_t base_flags = has_veccall ? JL_TARGET_VEC_CALL : 0;
-            SmallVector<uint8_t, 0> data;
-            auto push_i32 = [&] (uint32_t v) {
-                uint8_t buff[4];
-                memcpy(buff, &v, 4);
-                data.insert(data.end(), buff, buff + 4);
-            };
-            push_i32(specs.size());
-            for (uint32_t i = 0; i < specs.size(); i++) {
-                push_i32(base_flags | (specs[i].flags & JL_TARGET_UNKNOWN_NAME));
-                auto &specdata = specs[i].data;
-                data.insert(data.end(), specdata.begin(), specdata.end());
-            }
+            auto targets = jl_get_llvm_clone_targets(jl_options.cpu_target);
+            auto &data = targets.data;
             auto value = ConstantDataArray::get(Context, data);
             auto target_ids = new GlobalVariable(metadataM, value->getType(), true,
                                         GlobalVariable::InternalLinkage,
@@ -2274,8 +2259,9 @@ void jl_dump_native_locked(jl_native_code_desc_t *data, const char *bc_fname,
             jl_small_typeof_copy->setVisibility(GlobalValue::HiddenVisibility);
             jl_small_typeof_copy->setDSOLocal(true);
 
-            // Create CPU target string constant
-            auto cpu_target_str = jl_options.cpu_target ? jl_options.cpu_target : "native";
+            // Create CPU target string constant.
+            // Don't store "sysimage" keyword — store the actual resolved target string.
+            std::string cpu_target_str = jl_expand_sysimage_keyword(jl_options.cpu_target);
             auto cpu_target_data = ConstantDataArray::getString(Context, cpu_target_str, true);
             auto cpu_target_global = new GlobalVariable(metadataM, cpu_target_data->getType(), true,
                                                        GlobalVariable::InternalLinkage,
diff --git a/src/clangsa/GCChecker.cpp b/src/clangsa/GCChecker.cpp
index d5e421c8f65d0..04f204b923cf0 100644
--- a/src/clangsa/GCChecker.cpp
+++ b/src/clangsa/GCChecker.cpp
@@ -905,7 +905,7 @@ bool GCChecker::isSafepoint(const CallEvent &Call, CheckerContext &C) const {
     while (DC) {
       // Anything in llvm or std is not a safepoint
       if (const NamespaceDecl *NDC = dyn_cast<NamespaceDecl>(DC))
-        if (NDC->getName() == "llvm" || NDC->getName() == "std")
+        if (NDC->getName() == "llvm" || NDC->getName() == "std" || NDC->getName() == "tp")
           return false;
       DC = DC->getParent();
     }
diff --git a/src/crc32c.c b/src/crc32c.c
index 50d2acc603359..4994015a930e2 100644
--- a/src/crc32c.c
+++ b/src/crc32c.c
@@ -345,7 +345,8 @@ JL_DLLEXPORT uint32_t jl_crc32c(uint32_t crc, const char *buf, size_t len)
 #  elif defined(_OS_LINUX_)
 static crc32c_func_t crc32c_dispatch(unsigned long hwcap)
 {
-    if (hwcap & (1 << JL_AArch64_crc))
+    // HWCAP_CRC32 is bit 7 in the Linux AArch64 HWCAP
+    if (hwcap & (1 << 7))
         return crc32c_armv8;
     return jl_crc32c_sw;
 }
diff --git a/src/init.c b/src/init.c
index 4d7cdf70ef1f8..346e684d29e94 100644
--- a/src/init.c
+++ b/src/init.c
@@ -582,9 +582,14 @@ static NOINLINE void _finish_jl_init_(jl_image_buf_t sysimage, jl_ptls_t ptls, j
 
     if (jl_options.cpu_target == NULL)
         jl_options.cpu_target = "native";
+    if (jl_options.cpu_target[0] == '\0')
+        jl_error("Invalid target option: empty CPU name");
+
+    // Validate CPU target: check for unknown names, multiple targets, clone_all
+    jl_check_cpu_target(jl_options.cpu_target, jl_generating_output());
 
     // Parse image, perform relocations, and init JIT targets, etc.
-    jl_image_t parsed_image = jl_init_processor_sysimg(sysimage, jl_options.cpu_target);
+    jl_image_t parsed_image = jl_load_sysimg(sysimage, jl_options.cpu_target);
 
     jl_init_codegen();
 
diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp
index acfd7de43838e..dce16e3387a20 100644
--- a/src/jitlayers.cpp
+++ b/src/jitlayers.cpp
@@ -1125,38 +1125,13 @@ namespace {
         options.MCOptions.ABIName = "lp64";
 #endif
 #endif
-        uint32_t target_flags = 0;
-        auto target = jl_get_llvm_target(jl_options.cpu_target, jl_generating_output(), target_flags);
-        auto &TheCPU = target.first;
-        SmallVector<std::string, 10> targetFeatures(target.second.begin(), target.second.end());
+        auto [TheCPU, FeaturesStr] = jl_get_llvm_target(jl_options.cpu_target, jl_generating_output());
         std::string errorstr;
         const Target *TheTarget = TargetRegistry::lookupTarget("", TheTriple, errorstr);
         if (!TheTarget) {
             jl_errorf("Internal problem with process triple %s lookup: %s", TheTriple.str().c_str(), errorstr.c_str());
             return nullptr;
         }
-        if (jl_processor_print_help || (target_flags & JL_TARGET_UNKNOWN_NAME)) {
-            std::unique_ptr<MCSubtargetInfo> MSTI(
-                TheTarget->createMCSubtargetInfo(TheTriple.str(), "", ""));
-            if (!MSTI->isCPUStringValid(TheCPU)) {
-                jl_errorf("Invalid CPU name \"%s\".", TheCPU.c_str());
-                return nullptr;
-            }
-            if (jl_processor_print_help) {
-                // This is the only way I can find to print the help message once.
-                // It'll be nice if we can iterate through the features and print our own help
-                // message...
-                MSTI->setDefaultFeatures("help", "", "");
-            }
-        }
-        // Package up features to be passed to target/subtarget
-        std::string FeaturesStr;
-        if (!targetFeatures.empty()) {
-            SubtargetFeatures Features;
-            for (unsigned i = 0; i != targetFeatures.size(); ++i)
-                Features.AddFeature(targetFeatures[i]);
-            FeaturesStr = Features.getString();
-        }
         // Allocate a target...
         std::optional<CodeModel::Model> codemodel =
 #ifdef _P64
diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp
index 55b20479408f9..c9ac19ba616dd 100644
--- a/src/llvm-multiversioning.cpp
+++ b/src/llvm-multiversioning.cpp
@@ -49,9 +49,17 @@ using namespace llvm;
 
 extern std::optional<bool> always_have_fma(Function&, const Triple &TT);
 
+// Per-function clone categories (set by IR analysis)
+enum {
+    JL_CLONE_LOOP     = 1 << 0,
+    JL_CLONE_SIMD     = 1 << 1,
+    JL_CLONE_MATH     = 1 << 2,
+    JL_CLONE_CPU      = 1 << 3,
+    JL_CLONE_FLOAT16  = 1 << 4,
+    JL_CLONE_BFLOAT16 = 1 << 5,
+};
+
 namespace {
-constexpr uint32_t clone_mask =
-    JL_TARGET_CLONE_LOOP | JL_TARGET_CLONE_SIMD | JL_TARGET_CLONE_MATH | JL_TARGET_CLONE_CPU | JL_TARGET_CLONE_FLOAT16 | JL_TARGET_CLONE_BFLOAT16;
 
 // Treat identical mapping as missing and return `def` in that case.
 // We mainly need this to identify cloned function using value map after LLVM cloning
@@ -83,9 +91,9 @@ static uint32_t collect_func_info(Function &F, const Triple &TT, bool &has_vecca
     LoopInfo LI(DT);
     uint32_t flag = 0;
     if (!LI.empty())
-        flag |= JL_TARGET_CLONE_LOOP;
+        flag |= JL_CLONE_LOOP;
     if (is_vector(F.getFunctionType())) {
-        flag |= JL_TARGET_CLONE_SIMD;
+        flag |= JL_CLONE_SIMD;
         has_veccall = true;
     }
     for (auto &bb: F) {
@@ -93,50 +101,47 @@ static uint32_t collect_func_info(Function &F, const Triple &TT, bool &has_vecca
             if (auto call = dyn_cast<CallInst>(&I)) {
                 if (is_vector(call->getFunctionType())) {
                     has_veccall = true;
-                    flag |= JL_TARGET_CLONE_SIMD;
+                    flag |= JL_CLONE_SIMD;
                 }
                 if (auto callee = call->getCalledFunction()) {
                     auto name = callee->getName();
                     if (name.starts_with("llvm.muladd.") || name.starts_with("llvm.fma.")) {
-                        flag |= JL_TARGET_CLONE_MATH;
+                        flag |= JL_CLONE_MATH;
                     }
                     else if (name.starts_with("julia.cpu.")) {
                         if (name.starts_with("julia.cpu.have_fma.")) {
-                            // for some platforms we know they always do (or don't) support
-                            // FMA. in those cases we don't need to clone the function.
-                            // always_have_fma returns an optional<bool>
                             if (!always_have_fma(*callee, TT))
-                                flag |= JL_TARGET_CLONE_CPU;
+                                flag |= JL_CLONE_CPU;
                         } else {
-                            flag |= JL_TARGET_CLONE_CPU;
+                            flag |= JL_CLONE_CPU;
                         }
                     }
                 }
             }
             else if (auto store = dyn_cast<StoreInst>(&I)) {
                 if (store->getValueOperand()->getType()->isVectorTy()) {
-                    flag |= JL_TARGET_CLONE_SIMD;
+                    flag |= JL_CLONE_SIMD;
                 }
             }
             else if (I.getType()->isVectorTy()) {
-                flag |= JL_TARGET_CLONE_SIMD;
+                flag |= JL_CLONE_SIMD;
             }
             if (auto mathOp = dyn_cast<FPMathOperator>(&I)) {
                 if (mathOp->getFastMathFlags().any()) {
-                    flag |= JL_TARGET_CLONE_MATH;
+                    flag |= JL_CLONE_MATH;
                 }
             }
 
             for (size_t i = 0; i < I.getNumOperands(); i++) {
                 if(I.getOperand(i)->getType()->isHalfTy()) {
-                    flag |= JL_TARGET_CLONE_FLOAT16;
+                    flag |= JL_CLONE_FLOAT16;
                 }
                 if(I.getOperand(i)->getType()->isBFloatTy()) {
-                    flag |= JL_TARGET_CLONE_BFLOAT16;
+                    flag |= JL_CLONE_BFLOAT16;
                 }
             }
-            uint32_t veccall_flags = JL_TARGET_CLONE_SIMD | JL_TARGET_CLONE_MATH | JL_TARGET_CLONE_CPU | JL_TARGET_CLONE_FLOAT16 | JL_TARGET_CLONE_BFLOAT16;
-            if (has_veccall && (flag & veccall_flags) == veccall_flags) {
+            constexpr uint32_t all_flags = JL_CLONE_SIMD | JL_CLONE_MATH | JL_CLONE_CPU | JL_CLONE_FLOAT16 | JL_CLONE_BFLOAT16;
+            if (has_veccall && (flag & all_flags) == all_flags) {
                 return flag;
             }
         }
@@ -148,7 +153,20 @@ struct TargetSpec {
     std::string cpu_name;
     std::string cpu_features;
     uint32_t base;
-    uint32_t flags;
+    bool clone_all = false;
+    bool opt_size = false;
+    bool min_size = false;
+    tp::FeatureDiff diff;
+
+    // Mask of per-function JL_CLONE_* categories to clone for this target.
+    uint32_t clone_flags() const {
+        uint32_t mask = JL_CLONE_LOOP | JL_CLONE_CPU; // loop and CPU categories are always in the mask
+        if (diff.has_new_math)     mask |= JL_CLONE_MATH; // the rest are added only when the matching diff.has_new_* field is set
+        if (diff.has_new_simd)     mask |= JL_CLONE_SIMD;
+        if (diff.has_new_float16)  mask |= JL_CLONE_FLOAT16;
+        if (diff.has_new_bfloat16) mask |= JL_CLONE_BFLOAT16;
+        return mask;
+    }
 
     TargetSpec() = default;
 
@@ -157,17 +175,43 @@ struct TargetSpec {
         out.cpu_name = spec.cpu_name;
         out.cpu_features = spec.cpu_features;
         out.base = spec.base;
-        out.flags = spec.flags;
+        out.clone_all = spec.clone_all;
+        out.opt_size = spec.opt_size;
+        out.min_size = spec.min_size;
+        out.diff = spec.diff;
         return out;
     }
 
+    // Pack the boolean settings into a single 32-bit word for LLVM metadata serialization.
+    uint32_t packed_flags() const {
+        uint32_t f = 0; // bit layout must stay in sync with unpack_flags()
+        if (clone_all)             f |= 1 << 0; // bits 0-2: clone_all, opt_size, min_size
+        if (opt_size)              f |= 1 << 1;
+        if (min_size)              f |= 1 << 2;
+        if (diff.has_new_math)     f |= 1 << 3; // bits 3-6: diff.has_new_{math,simd,float16,bfloat16}
+        if (diff.has_new_simd)     f |= 1 << 4;
+        if (diff.has_new_float16)  f |= 1 << 5;
+        if (diff.has_new_bfloat16) f |= 1 << 6;
+        return f;
+    }
+
+    void unpack_flags(uint32_t f) { // inverse of packed_flags(): restores each setting from its bit in `f`
+        clone_all             = f & (1 << 0); // bool member: any nonzero masked value converts to true
+        opt_size              = f & (1 << 1);
+        min_size              = f & (1 << 2);
+        diff.has_new_math     = f & (1 << 3); // bits 3-6 mirror the diff.has_new_* bits written by packed_flags()
+        diff.has_new_simd     = f & (1 << 4);
+        diff.has_new_float16  = f & (1 << 5);
+        diff.has_new_bfloat16 = f & (1 << 6);
+    }
+
     static TargetSpec fromMD(MDTuple *tup) {
         TargetSpec out;
         assert(tup->getNumOperands() == 4);
         out.cpu_name = cast<MDString>(tup->getOperand(0))->getString().str();
         out.cpu_features = cast<MDString>(tup->getOperand(1))->getString().str();
         out.base = cast<ConstantInt>(cast<ConstantAsMetadata>(tup->getOperand(2))->getValue())->getZExtValue();
-        out.flags = cast<ConstantInt>(cast<ConstantAsMetadata>(tup->getOperand(3))->getValue())->getZExtValue();
+        out.unpack_flags(cast<ConstantInt>(cast<ConstantAsMetadata>(tup->getOperand(3))->getValue())->getZExtValue()); // operand 3 is the packed flag word; decode it back into the bool fields
         return out;
     }
 
@@ -176,7 +220,7 @@ struct TargetSpec {
             MDString::get(ctx, cpu_name),
             MDString::get(ctx, cpu_features),
             ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(ctx), base)),
-            ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(ctx), flags))
+            ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(ctx), packed_flags()))
         });
     }
 };
@@ -216,12 +260,14 @@ static void annotate_module_clones(Module &M) {
     if (auto maybe_specs = get_target_specs(M)) {
         specs = std::move(*maybe_specs);
     } else {
-        auto full_specs = jl_get_llvm_clone_targets(jl_options.cpu_target);
-        specs.reserve(full_specs.size());
-        for (auto &spec: full_specs) {
+#ifndef __clang_analyzer__
+        auto full = jl_get_llvm_clone_targets(jl_options.cpu_target);
+        specs.reserve(full.specs.size());
+        for (auto &spec: full.specs) {
             specs.push_back(TargetSpec::fromSpec(spec));
         }
         set_target_specs(M, specs);
+#endif
     }
     SmallVector<APInt, 0> clones(orig_funcs.size(), APInt(specs.size(), 0));
     BitVector subtarget_cloned(orig_funcs.size());
@@ -231,12 +277,12 @@ static void annotate_module_clones(Module &M) {
         func_infos[i] = collect_func_info(*orig_funcs[i], TT, has_veccall);
     }
     for (unsigned i = 1; i < specs.size(); i++) {
-        if (specs[i].flags & JL_TARGET_CLONE_ALL) {
+        if (specs[i].clone_all) {
             for (unsigned j = 0; j < orig_funcs.size(); j++) {
                 clones[j].setBit(i);
             }
         } else {
-            unsigned flag = specs[i].flags & clone_mask;
+            unsigned flag = specs[i].clone_flags();
             std::set<Function*> sets[2];
             for (unsigned j = 0; j < orig_funcs.size(); j++) {
                 if (!(func_infos[j] & flag)) {
@@ -455,7 +501,7 @@ CloneCtx::CloneCtx(Module &M, bool allow_bad_fvars)
     uint32_t ntargets = specs.size();
     for (uint32_t i = 1; i < ntargets; i++) {
         auto &spec = specs[i];
-        if (spec.flags & JL_TARGET_CLONE_ALL) {
+        if (spec.clone_all) {
             group_ids[i] = groups.size();
             groups.emplace_back(i);
         }
@@ -586,7 +632,7 @@ void CloneCtx::clone_decls()
             new_F->setVisibility(F->getVisibility());
             new_F->setDSOLocal(true);
             auto base_func = F;
-            if (!(specs[i].flags & JL_TARGET_CLONE_ALL))
+            if (!(specs[i].clone_all))
                 base_func = static_cast<Group*>(linearized[specs[i].base])->base_func(F);
             (*linearized[i]->vmap)[base_func] = new_F;
         }
@@ -619,10 +665,10 @@ static void add_features(Function *F, TargetSpec &spec)
     }
     F->addFnAttr("target-cpu", spec.cpu_name);
     if (!F->hasFnAttribute(Attribute::OptimizeNone)) {
-        if (spec.flags & JL_TARGET_OPTSIZE) {
+        if (spec.opt_size) {
             F->addFnAttr(Attribute::OptimizeForSize);
         }
-        else if (spec.flags & JL_TARGET_MINSIZE) {
+        else if (spec.min_size) {
             F->addFnAttr(Attribute::MinSize);
         }
     }
@@ -1012,7 +1058,7 @@ void CloneCtx::emit_metadata()
             uint32_t len_idx = idxs.size();
             idxs.push_back(0); // We will fill in the real value later.
             uint32_t count = 0;
-            if (i == 0 || spec.flags & JL_TARGET_CLONE_ALL) {
+            if (i == 0 || spec.clone_all) {
                 auto grp = static_cast<Group*>(tgt);
                 count = jl_sysimg_tag_mask;
                 for (uint32_t j = 0; j < nfvars; j++) {
diff --git a/src/processor.cpp b/src/processor.cpp
index 1a25171082d82..fa15cbe6449f7 100644
--- a/src/processor.cpp
+++ b/src/processor.cpp
@@ -1,15 +1,14 @@
 // This file is a part of Julia. License is MIT: https://julialang.org/license
 
-// Processor feature detection
-
-#include "llvm-version.h"
-#include <llvm/ADT/StringRef.h>
-#include <llvm/ADT/ArrayRef.h>
-#include <llvm/ADT/SmallVector.h>
-#include <llvm/ADT/StringMap.h>
-#include <llvm/TargetParser/Host.h>
-#include <llvm/Support/MathExtras.h>
-#include <llvm/Support/raw_ostream.h>
+// Processor feature detection and dispatch using the cpufeatures library.
+// CPU/feature tables are generated from LLVM's TableGen data and committed
+// to https://github.com/gbaraldi/cpufeatures
+//
+// On LLVM version bump:
+//   1. cd cpufeatures && make -f Makefile.generate LLVM_VER=<new>
+//   2. Review and commit regenerated generated/ headers
+//   3. Update Julia's deps/cpufeatures.version with the new commit hash
+//   4. The static_assert below will catch major version mismatches
 
 #include "processor.h"
 
@@ -17,6 +16,8 @@
 #include "julia_internal.h"
 
 #include <algorithm>
+#include <vector>
+#include <string>
 
 #include "julia_assert.h"
 
@@ -24,617 +25,16 @@
 #include <dlfcn.h>
 #endif
 
-// CPU target string is a list of strings separated by `;` each string starts with a CPU
-// or architecture name and followed by an optional list of features separated by `,`.
-// A "generic" or empty CPU name means the basic required feature set of the target ISA
-// which is at least the architecture the C/C++ runtime is compiled with.
-
-// CPU dispatch needs to determine the version to be used by the sysimg as well as
-// the target and feature used by the JIT. Currently the only limitation on JIT target
-// and feature is matching register size between the sysimg and JIT so that SIMD vectors
-// can be passed correctly. This means disabling AVX and AVX2 if AVX was not enabled
-// in sysimg and disabling AVX512 if it was not enabled in sysimg.
-// This also possibly means that SVE needs to be disabled on AArch64 if sysimg doesn't have it
-// enabled.
-
-// CPU dispatch starts by first deciding the max feature set and CPU requested for JIT.
-// This is the host or the target specified on the command line with features unavailable
-// on the host disabled. All sysimg targets that require features not available in this set
-// will be ignored.
-
-// The next step is matching CPU name.
-// If exact name match with compatible feature set exists, all versions without name match
-// are ignored.
-// This step will query LLVM first so it can accept CPU names that is recognized by LLVM but
-// not by us (yet) when LLVM is enabled.
-
-// If there are still more than one candidates, a feature match is performed.
-// The ones with the largest register size will be used
-// (i.e. AVX512 > AVX2/AVX > SSE, SVE > ASIMD). If there's a tie, the one with the most features
-// enabled will be used. If there's still a tie the one that appears later in the list will be
-// used. (i.e. the order in the version list is significant in this case).
-
-// Features that are not recognized will be passed to LLVM directly during codegen
-// but ignored otherwise.
-
-// A few special features are supported:
-// 1. `clone_all`
-//
-//     This forces the target to have all functions in sysimg cloned.
-//     When used in negative form (i.e. `-clone_all`), this disables full clone that's
-//     enabled by default for certain targets.
-//
-// 2. `base([0-9]*)`
-//
-//     This specifies the (0-based) base target index. The base target is the target
-//     that the current target is based on, i.e. the functions that are not being cloned
-//     will use the version in the base target. This option causes the base target to be
-//     fully cloned (as if `clone_all` is specified for it) if it is not the default target (0).
-//     The index can only be smaller than the current index.
-//
-// 3. `opt_size`
-//
-//     Optimize for size with minimum performance impact. Clang/GCC's `-Os`.
-//
-// 4. `min_size`
-//
-//     Optimize only for size. Clang's `-Oz`.
 
-JL_DLLEXPORT bool jl_processor_print_help = false;
+// Storage for the sysimage CPU target string, plus a forward declaration of jl_set_sysimage_cpu_target
+static std::string sysimage_cpu_target;
+void jl_set_sysimage_cpu_target(const char *cpu_target);
 
 namespace {
 
-// Helper functions to test/set feature bits
-
-template<typename T1, typename T2, typename T3>
-static inline bool test_bits(T1 v, T2 mask, T3 test)
-{
-    return T3(v & mask) == test;
-}
-
-template<typename T1, typename T2>
-static inline bool test_all_bits(T1 v, T2 mask)
-{
-    return test_bits(v, mask, mask);
-}
-
-template<typename T1, typename T2>
-static inline bool test_nbit(const T1 &bits, T2 _bitidx)
-{
-    auto bitidx = static_cast<uint32_t>(_bitidx);
-    auto u32idx = bitidx / 32;
-    auto bit = bitidx % 32;
-    return (bits[u32idx] & (1 << bit)) != 0;
-}
-
-template<typename T>
-static inline void unset_bits(T &bits) JL_NOTSAFEPOINT
-{
-    (void)bits;
-}
-
-template<typename T, typename T1, typename... Rest>
-static inline void unset_bits(T &bits, T1 _bitidx, Rest... rest) JL_NOTSAFEPOINT
-{
-    auto bitidx = static_cast<uint32_t>(_bitidx);
-    auto u32idx = bitidx / 32;
-    auto bit = bitidx % 32;
-    bits[u32idx] = bits[u32idx] & ~uint32_t(1 << bit);
-    unset_bits(bits, rest...);
-}
-
-template<typename T, typename T1>
-static inline void set_bit(T &bits, T1 _bitidx, bool val)
-{
-    auto bitidx = static_cast<uint32_t>(_bitidx);
-    auto u32idx = bitidx / 32;
-    auto bit = bitidx % 32;
-    if (val) {
-        bits[u32idx] = bits[u32idx] | uint32_t(1 << bit);
-    }
-    else {
-        bits[u32idx] = bits[u32idx] & ~uint32_t(1 << bit);
-    }
-}
-
-// Helper functions to create feature masks
-
-// This can be `std::array<uint32_t,n>` on C++14
-template<size_t n>
-struct FeatureList {
-    uint32_t eles[n];
-    uint32_t &operator[](size_t pos) JL_NOTSAFEPOINT
-    {
-        return eles[pos];
-    }
-    constexpr const uint32_t &operator[](size_t pos) const
-    {
-        return eles[pos];
-    }
-    inline int nbits() const
-    {
-        int cnt = 0;
-        for (size_t i = 0; i < n; i++)
-            cnt += llvm::popcount(eles[i]);
-        return cnt;
-    }
-    inline bool empty() const
-    {
-        for (size_t i = 0; i < n; i++) {
-            if (eles[i]) {
-                return false;
-            }
-        }
-        return true;
-    }
-};
-
-static inline constexpr uint32_t add_feature_mask_u32(uint32_t mask, uint32_t u32idx)
-{
-    return mask;
-}
-
-template<typename T, typename... Rest>
-static inline constexpr uint32_t add_feature_mask_u32(uint32_t mask, uint32_t u32idx,
-                                                      T bit, Rest... args)
-{
-    return add_feature_mask_u32(mask | ((int(bit) >= 0 && int(bit) / 32 == (int)u32idx) ?
-                                        (1 << (int(bit) % 32)) : 0),
-                                u32idx, args...);
-}
-
-template<typename... Args>
-static inline constexpr uint32_t get_feature_mask_u32(uint32_t u32idx, Args... args)
-{
-    return add_feature_mask_u32(uint32_t(0), u32idx, args...);
-}
-
-template<uint32_t... Is> struct seq{};
-template<uint32_t N, uint32_t... Is>
-struct gen_seq : gen_seq<N-1, N-1, Is...>{};
-template<uint32_t... Is>
-struct gen_seq<0, Is...> : seq<Is...>{};
-
-template<size_t n, uint32_t... I, typename... Args>
-static inline constexpr FeatureList<n>
-_get_feature_mask(seq<I...>, Args... args)
-{
-    return FeatureList<n>{{get_feature_mask_u32(I, args...)...}};
-}
-
-template<size_t n, typename... Args>
-static inline constexpr FeatureList<n> get_feature_masks(Args... args)
-{
-    return _get_feature_mask<n>(gen_seq<n>(), args...);
-}
-
-template<size_t n, uint32_t... I>
-static inline constexpr FeatureList<n>
-_feature_mask_or(seq<I...>, const FeatureList<n> &a, const FeatureList<n> &b)
-{
-    return FeatureList<n>{{(a[I] | b[I])...}};
-}
-
-template<size_t n>
-static inline constexpr FeatureList<n> operator|(const FeatureList<n> &a, const FeatureList<n> &b)
-{
-    return _feature_mask_or<n>(gen_seq<n>(), a, b);
-}
-
-template<size_t n, uint32_t... I>
-static inline constexpr FeatureList<n>
-_feature_mask_and(seq<I...>, const FeatureList<n> &a, const FeatureList<n> &b)
-{
-    return FeatureList<n>{{(a[I] & b[I])...}};
-}
-
-template<size_t n>
-static inline constexpr FeatureList<n> operator&(const FeatureList<n> &a, const FeatureList<n> &b)
-{
-    return _feature_mask_and<n>(gen_seq<n>(), a, b);
-}
-
-template<size_t n, uint32_t... I>
-static inline constexpr FeatureList<n>
-_feature_mask_not(seq<I...>, const FeatureList<n> &a)
-{
-    return FeatureList<n>{{(~a[I])...}};
-}
-
-template<size_t n>
-static inline constexpr FeatureList<n> operator~(const FeatureList<n> &a)
-{
-    return _feature_mask_not<n>(gen_seq<n>(), a);
-}
-
-template<size_t n>
-static inline void mask_features(const FeatureList<n> masks, uint32_t *features)
-{
-    for (size_t i = 0; i < n; i++) {
-        features[i] = features[i] & masks[i];
-    }
-}
-
-// Turn feature list to a string the LLVM accept
-static inline std::string join_feature_strs(const llvm::ArrayRef<std::string> &strs)
-{
-    size_t nstr = strs.size();
-    if (!nstr)
-        return std::string("");
-    std::string str = strs[0];
-    for (size_t i = 1; i < nstr; i++)
-        str += ',' + strs[i];
-    return str;
-}
-
-static inline void append_ext_features(std::string &features, const std::string &ext_features)
-{
-    if (ext_features.empty())
-        return;
-    if (!features.empty())
-        features.push_back(',');
-    features.append(ext_features);
-}
-
-static inline void append_ext_features(llvm::SmallVectorImpl<std::string> &features,
-                                       const std::string &ext_features)
-{
-    if (ext_features.empty())
-        return;
-    const char *start = ext_features.c_str();
-    const char *p = start;
-    for (; *p; p++) {
-        if (*p == ',') {
-            features.emplace_back(start, p - start);
-            start = p + 1;
-        }
-    }
-    if (p > start) {
-        features.emplace_back(start, p - start);
-    }
-}
-
-/**
- * Target specific type/constant definitions, always enable.
- */
-
-template<typename CPU, size_t n>
-struct CPUSpec {
-    const char *name;
-    CPU cpu;
-    CPU fallback;
-    uint32_t llvmver;
-    FeatureList<n> features;
-};
-
-struct FeatureDep {
-    uint32_t feature;
-    uint32_t dep;
-};
-
-// Recursively enable all features that the current feature set depends on.
-template<size_t n>
-static inline void enable_depends(FeatureList<n> &features, const FeatureDep *deps, size_t ndeps)
-{
-    bool changed = true;
-    while (changed) {
-        changed = false;
-        for (ssize_t i = ndeps - 1; i >= 0; i--) {
-            auto &dep = deps[i];
-            if (!test_nbit(features, dep.feature) || test_nbit(features, dep.dep))
-                continue;
-            set_bit(features, dep.dep, true);
-            changed = true;
-        }
-    }
-}
-
-// Recursively disable all features that the current feature set does not provide.
-template<size_t n>
-static inline void disable_depends(FeatureList<n> &features, const FeatureDep *deps, size_t ndeps)
-{
-    bool changed = true;
-    while (changed) {
-        changed = false;
-        for (ssize_t i = ndeps - 1; i >= 0; i--) {
-            auto &dep = deps[i];
-            if (!test_nbit(features, dep.feature) || test_nbit(features, dep.dep))
-                continue;
-            unset_bits(features, dep.feature);
-            changed = true;
-        }
-    }
-}
-
-template<typename CPU, size_t n>
-static const CPUSpec<CPU,n> *find_cpu(uint32_t cpu, const CPUSpec<CPU,n> *cpus, uint32_t ncpus)
-{
-    for (uint32_t i = 0; i < ncpus; i++) {
-        if (cpu == uint32_t(cpus[i].cpu)) {
-            return &cpus[i];
-        }
-    }
-    return nullptr;
-}
-
-template<typename CPU, size_t n>
-static const CPUSpec<CPU,n> *find_cpu(llvm::StringRef name, const CPUSpec<CPU,n> *cpus,
-                                      uint32_t ncpus)
-{
-    for (uint32_t i = 0; i < ncpus; i++) {
-        if (name == cpus[i].name) {
-            return &cpus[i];
-        }
-    }
-    return nullptr;
-}
-
-template<typename CPU, size_t n>
-static const char *find_cpu_name(uint32_t cpu, const CPUSpec<CPU,n> *cpus, uint32_t ncpus)
-{
-    if (auto *spec = find_cpu(cpu, cpus, ncpus))
-        return spec->name;
-    return "generic";
-}
-
-JL_UNUSED static uint32_t find_feature_bit(const FeatureName *features, size_t nfeatures,
-                                           const char *str, size_t len)
-{
-    for (size_t i = 0; i < nfeatures; i++) {
-        auto &feature = features[i];
-        if (strncmp(feature.name, str, len) == 0 && feature.name[len] == 0) {
-            return feature.bit;
-        }
-    }
-    return UINT32_MAX;
-}
-
-// This is how we save the target identification.
-// CPU name is saved as string instead of binary data like features because
-// 1. CPU ID is less stable (they are not bound to hardware/OS API)
-// 2. We need to support CPU names that are not recognized by us and therefore doesn't have an ID
-// 3. CPU name is trivial to parse
-static inline llvm::SmallVector<uint8_t, 0>
-serialize_target_data(llvm::StringRef name, uint32_t nfeature, const uint32_t *features_en,
-                      const uint32_t *features_dis, llvm::StringRef ext_features)
-{
-    llvm::SmallVector<uint8_t, 0> res;
-    auto add_data = [&] (const void *data, size_t sz) {
-        if (sz == 0)
-            return;
-        size_t old_sz = res.size();
-        res.resize(old_sz + sz);
-        memcpy(&res[old_sz], data, sz);
-    };
-    add_data(&nfeature, 4);
-    add_data(features_en, 4 * nfeature);
-    add_data(features_dis, 4 * nfeature);
-    uint32_t namelen = name.size();
-    add_data(&namelen, 4);
-    add_data(name.data(), namelen);
-    uint32_t ext_features_len = ext_features.size();
-    add_data(&ext_features_len, 4);
-    add_data(ext_features.data(), ext_features_len);
-    return res;
-}
-
-template<size_t n>
-static inline llvm::SmallVector<uint8_t, 0>
-serialize_target_data(llvm::StringRef name, const FeatureList<n> &features_en,
-                      const FeatureList<n> &features_dis, llvm::StringRef ext_features)
-{
-    return serialize_target_data(name, n, &features_en[0], &features_dis[0], ext_features);
-}
-
-template<size_t n>
-struct TargetData {
-    std::string name;
-    std::string ext_features;
-    struct {
-        FeatureList<n> features;
-        uint32_t flags;
-    } en, dis;
-    int base;
-};
-
-// In addition to the serialized data, the first `uint32_t` gives the number of targets saved
-// and each target has a `uint32_t` flag before the serialized target data.
-template<size_t n>
-static inline llvm::SmallVector<TargetData<n>, 0> deserialize_target_data(const uint8_t *data)
-{
-    auto load_data = [&] (void *dest, size_t sz) {
-        memcpy(dest, data, sz);
-        data += sz;
-    };
-    auto load_string = [&] () {
-        uint32_t len;
-        load_data(&len, 4);
-        std::string res((const char*)data, len);
-        data += len;
-        return res;
-    };
-    uint32_t ntarget;
-    load_data(&ntarget, 4);
-    llvm::SmallVector<TargetData<n>, 0> res(ntarget);
-    for (uint32_t i = 0; i < ntarget; i++) {
-        auto &target = res[i];
-        load_data(&target.en.flags, 4);
-        target.dis.flags = 0;
-        // Starting serialized target data
-        uint32_t nfeature;
-        load_data(&nfeature, 4);
-        assert(nfeature == n);
-        load_data(&target.en.features[0], 4 * n);
-        load_data(&target.dis.features[0], 4 * n);
-        target.name = load_string();
-        target.ext_features = load_string();
-        target.base = 0;
-    }
-    return res;
-}
-
-// Try getting clone base argument. Return 1-based index. Return 0 if match failed.
-static inline int get_clone_base(const char *start, const char *end)
-{
-    const char *prefix = "base(";
-    const int prefix_len = strlen(prefix);
-    if (end - start <= prefix_len)
-        return 0;
-    if (memcmp(start, prefix, prefix_len) != 0)
-        return 0;
-    start += prefix_len;
-    if (*start > '9' || *start < '0')
-        return 0;
-    char *digit_end;
-    auto idx = strtol(start, &digit_end, 10);
-    if (idx < 0)
-        return 0;
-    if (*digit_end != ')' || digit_end + 1 != end)
-        return 0;
-    return (int)idx + 1;
-}
-
-// Parse cmdline string. This handles `clone_all` and `base` special features.
-// Other feature names will be passed to `feature_cb` for target dependent parsing.
-template<size_t n, typename F>
-static inline llvm::SmallVector<TargetData<n>, 0>
-parse_cmdline(const char *option, F &&feature_cb)
-{
-    if (!option)
-        abort();
-
-    // Preprocess the option string to expand "sysimage" keyword
-    std::string processed_option;
-    if (strncmp(option, "sysimage", 8) == 0 && (option[8] == '\0' || option[8] == ';')) {
-        // Replace "sysimage" with the actual sysimage CPU target
-        jl_value_t *target_str = jl_get_sysimage_cpu_target();
-        if (target_str != nullptr) {
-            processed_option = std::string(jl_string_data(target_str), jl_string_len(target_str));
-            if (option[8] == ';') {
-                processed_option += option + 8;  // append the rest after "sysimage"
-            }
-            option = processed_option.c_str();
-        }
-    }
-
-    llvm::SmallVector<TargetData<n>, 0> res;
-    TargetData<n> arg{};
-    auto reset_arg = [&] {
-        res.push_back(arg);
-        arg.name.clear();
-        arg.ext_features.clear();
-        memset(&arg.en.features[0], 0, 4 * n);
-        memset(&arg.dis.features[0], 0, 4 * n);
-        arg.en.flags = 0;
-        arg.dis.flags = 0;
-    };
-    const char *start = option;
-    for (const char *p = option; ; p++) {
-        switch (*p) {
-        case ',':
-        case ';':
-        case '\0': {
-            bool done = *p == '\0';
-            bool next_target = *p == ';' || done;
-            if (arg.name.empty()) {
-                if (p == start)
-                    jl_error("Invalid target option: empty CPU name");
-                arg.name.append(start, p - start);
-                if (arg.name == "help") {
-                    arg.name = "native";
-                    jl_processor_print_help = true;
-                }
-                start = p + 1;
-                if (next_target)
-                    reset_arg();
-                if (done)
-                    return res;
-                continue;
-            }
-            bool disable = false;
-            const char *full = start;
-            const char *fname = full;
-            start = p + 1;
-            if (*full == '-') {
-                disable = true;
-                fname++;
-            }
-            else if (*full == '+') {
-                fname++;
-            }
-            if (llvm::StringRef(fname, p - fname) == "clone_all") {
-                if (!disable) {
-                    arg.en.flags |= JL_TARGET_CLONE_ALL;
-                    arg.dis.flags &= ~JL_TARGET_CLONE_ALL;
-                }
-                else {
-                    arg.dis.flags |= JL_TARGET_CLONE_ALL;
-                    arg.en.flags &= ~JL_TARGET_CLONE_ALL;
-                }
-            }
-            else if (llvm::StringRef(fname, p - fname) == "opt_size") {
-                if (disable)
-                    jl_error("Invalid target option: disabled opt_size.");
-                if (arg.en.flags & JL_TARGET_MINSIZE)
-                    jl_error("Conflicting target option: both opt_size and min_size are specified.");
-                arg.en.flags |= JL_TARGET_OPTSIZE;
-            }
-            else if (llvm::StringRef(fname, p - fname) == "min_size") {
-                if (disable)
-                    jl_error("Invalid target option: disabled min_size.");
-                if (arg.en.flags & JL_TARGET_OPTSIZE)
-                    jl_error("Conflicting target option: both opt_size and min_size are specified.");
-                arg.en.flags |= JL_TARGET_MINSIZE;
-            }
-            else if (int base = get_clone_base(fname, p)) {
-                if (disable)
-                    jl_error("Invalid target option: disabled base index.");
-                base -= 1;
-                if (base >= (int)res.size())
-                    jl_error("Invalid target option: base index must refer to a previous target.");
-                if (res[base].dis.flags & JL_TARGET_CLONE_ALL ||
-                    !(res[base].en.flags & JL_TARGET_CLONE_ALL))
-                    jl_error("Invalid target option: base target must be clone_all.");
-                arg.base = base;
-            }
-            else if (llvm::StringRef(fname, p - fname) == "help") {
-                jl_processor_print_help = true;
-            }
-            else {
-                FeatureList<n> &list = disable ? arg.dis.features : arg.en.features;
-                if (!feature_cb(fname, p - fname, list)) {
-                    if (!arg.ext_features.empty())
-                        arg.ext_features += ',';
-                    arg.ext_features += disable ? '-' : '+';
-                    arg.ext_features.append(fname, p - fname);
-                }
-            }
-            if (next_target)
-                reset_arg();
-            if (done) {
-                return res;
-            }
-        }
-            JL_FALLTHROUGH;
-        default:
-            continue;
-        }
-    }
-}
-
-// Cached version of command line parsing
-template<size_t n, typename F>
-static inline llvm::SmallVector<TargetData<n>, 0> &get_cmdline_targets(const char *cpu_target, F &&feature_cb)
-{
-    static llvm::SmallVector<TargetData<n>, 0> targets =
-        parse_cmdline<n>(cpu_target, std::forward<F>(feature_cb));
-    return targets;
-}
-
-// Load sysimg, use the `callback` for dispatch and perform all relocations
-// for the selected target.
+// Load a sysimg/pkgimg, use `callback` for dispatch, and perform all relocations for the selected target.
 template<typename F>
-static inline jl_image_t parse_sysimg(jl_image_buf_t image, F &&callback, void *ctx)
+static inline jl_image_t load_sysimg_target(jl_image_buf_t image, F &&callback, void *ctx)
 {
     JL_TIMING(LOAD_IMAGE, LOAD_Processor);
     jl_image_t res{};
@@ -662,10 +62,10 @@ static inline jl_image_t parse_sysimg(jl_image_buf_t image, F &&callback, void *
         jl_error("Image file is not compatible with this version of Julia");
     }
 
-    llvm::SmallVector<void*, 0> fvars(pointers->header->nfvars);
-    llvm::SmallVector<const char*, 0> gvars(pointers->header->ngvars);
+    std::vector<void*> fvars(pointers->header->nfvars);
+    std::vector<const char*> gvars(pointers->header->ngvars);
 
-    llvm::SmallVector<std::pair<uint32_t, void*>, 0> clones;
+    std::vector<std::pair<uint32_t, void*>> clones;
 
     for (unsigned i = 0; i < pointers->header->nshards; i++) {
         auto shard = pointers->shards[i];
@@ -682,7 +82,7 @@ static inline jl_image_t parse_sysimg(jl_image_buf_t image, F &&callback, void *
         clone_idxs++;
 
         assert(tag_len & jl_sysimg_tag_mask);
-        llvm::SmallVector<void**, 0> base_ptrs(0);
+        std::vector<void**> base_ptrs(0);
         base_ptrs.push_back(fvar_shard);
         // Find target
         for (uint32_t i = 0; i < target_idx; i++) {
@@ -826,188 +226,656 @@ static inline jl_image_t parse_sysimg(jl_image_buf_t image, F &&callback, void *
     return res;
 }
 
-template<typename T>
-static inline void check_cmdline(T &&cmdline, bool imaging)
+} // namespace
+
+
+// This file is a part of Julia. License is MIT: https://julialang.org/license
+
+// Unified processor detection and dispatch using the cpufeatures library.
+// Replaces processor_x86.cpp, processor_arm.cpp, and processor_fallback.cpp.
+// No hand-maintained CPU/feature tables — all data comes from LLVM TableGen
+// via generated headers committed to the cpufeatures repository.
+
+// Include cpufeatures generated tables (defines FeatureBits, feature_table, etc.)
+#if defined(_CPU_X86_64_) || defined(_CPU_X86_)
+#include <cpufeatures/target_tables_x86_64.h>
+#elif defined(_CPU_AARCH64_)
+#include <cpufeatures/target_tables_aarch64.h>
+#elif defined(__riscv) && __riscv_xlen == 64
+#include <cpufeatures/target_tables_riscv64.h>
+#else
+#include <cpufeatures/target_tables_fallback.h>
+#endif
+
+#include <cpufeatures/target_parsing.h>
+#include <cpufeatures/cross_arch.h>
+
+// Verify the cpufeatures tables were generated from a compatible LLVM version.
+#if defined(TARGET_TABLES_LLVM_VERSION_MAJOR) && defined(LLVM_VERSION_MAJOR)
+static_assert(TARGET_TABLES_LLVM_VERSION_MAJOR == LLVM_VERSION_MAJOR,
+    "cpufeatures tables were generated with a different LLVM major version than Julia uses");
+#endif
+
+// ============================================================================
+// Debug output
+// ============================================================================
+
+static bool cpufeatures_debug_enabled() { // true iff JULIA_DEBUG contains "cpufeatures" or "all"
+    static int cached = -1; // -1 until the environment has been inspected, then 0 or 1
+    if (cached != -1)
+        return cached;
+    const char *env = getenv("JULIA_DEBUG");
+    cached = env && (strstr(env, "cpufeatures") || strstr(env, "all")); // substring match, evaluated once
+    return cached;
+}
+
+#define CF_DEBUG(...) do { if (cpufeatures_debug_enabled()) jl_safe_printf(__VA_ARGS__); } while (0)
+
+// ============================================================================
+// Convert feature bits to a comma-separated string of feature names.
+// Called from Julia's loading.jl to display ImageTarget features.
+JL_DLLEXPORT jl_value_t *jl_feature_bits_to_string(const uint8_t *bits, int32_t nwords)
 {
-    assert(cmdline.size() > 0);
-    // It's unclear what does specifying multiple target when not generating
-    // sysimg means. Make it an error for now.
-    if (!imaging) {
-        if (cmdline.size() > 1) {
-            jl_safe_printf("More than one command line CPU targets specified "
-                      "without a `--output-` flag specified");
-            exit(1);
-        }
-        if (cmdline[0].en.flags & JL_TARGET_CLONE_ALL) {
-            jl_safe_printf("\"clone_all\" feature specified "
-                      "without a `--output-` flag specified");
-            exit(1);
-        }
-        if (cmdline[0].en.flags & JL_TARGET_OPTSIZE) {
-            jl_safe_printf("\"opt_size\" feature specified "
-                      "without a `--output-` flag specified");
-            exit(1);
-        }
-        if (cmdline[0].en.flags & JL_TARGET_MINSIZE) {
-            jl_safe_printf("\"min_size\" feature specified "
-                      "without a `--output-` flag specified");
-            exit(1);
+    FeatureBits fb{}; // all bits cleared; words the caller does not supply stay zero
+    int copy_words = nwords < TARGET_FEATURE_WORDS ? nwords : TARGET_FEATURE_WORDS; // extra caller words are ignored
+    if (bits && copy_words > 0) memcpy(fb.bits, bits, (size_t)copy_words * sizeof(uint64_t)); // NULL or non-positive count: copy nothing (a negative count would wrap to a huge length)
+    auto str = tp::build_feature_string(fb);
+    return jl_pchar_to_string(str.data(), str.size());
+}
+
+// ============================================================================
+// Host CPU detection — thin wrappers around cpufeatures library
+// ============================================================================
+
+// Host CPU name as reported by tp::get_host_cpu_name(); handed back by reference, not copied.
+static inline auto host_cpu_name() -> const std::string & {
+    return tp::get_host_cpu_name();
+}
+
+static std::string get_host_feature_string()
+{
+    auto detected = tp::get_host_features(); // feature bits detected for the host
+    return tp::build_feature_string(detected); // rendered as feature names
+}
+
+// ============================================================================
+// JIT target management
+// ============================================================================
+
+static std::vector<tp::LLVMTargetSpec> jit_targets;
+
+// Expand a leading "sysimage" keyword in `cpu_target` into the target string recorded for the
+// loaded sysimage; any other input is returned unchanged (NULL or empty input yields "").
+extern "C" std::string jl_expand_sysimage_keyword(const char *cpu_target) {
+    static const size_t kw_len = 8; // length of "sysimage"
+    const std::string requested(cpu_target && *cpu_target ? cpu_target : "");
+    const bool is_keyword = requested.compare(0, kw_len, "sysimage") == 0 &&
+        (requested.size() == kw_len || requested[kw_len] == ';'); // whole string, or followed by ';'
+    if (is_keyword) {
+        jl_value_t *stored = jl_get_sysimage_cpu_target();
+        if (stored && jl_string_len(stored) > 0) {
+            std::string expanded(jl_string_data(stored), jl_string_len(stored));
+            expanded.append(requested, kw_len, std::string::npos); // keep any trailing ";..." part
+            CF_DEBUG("[cpufeatures] expanded 'sysimage' -> '%s'\n", expanded.c_str());
+            return expanded;
         }
+        CF_DEBUG("[cpufeatures] WARNING: 'sysimage' keyword but no stored target, using 'native'\n");
+        return "native";
+    }
+    return requested;
+}
+
+static void init_jit_targets(const char *cpu_target, bool imaging)
+{
+    // Populate jit_targets from `cpu_target` once; `imaging` is only used in the debug output below.
+    if (!jit_targets.empty())
+        return;
+
+    auto target_str = jl_expand_sysimage_keyword(cpu_target);
+    CF_DEBUG("[cpufeatures] init_jit_targets: '%s' imaging=%d\n",
+             target_str.c_str(), imaging);
+
+    if (target_str.empty()) // a NULL or empty cpu_target expands to ""
+        jl_error("Invalid target option: empty CPU name");
+
+    auto specs = tp::resolve_targets_for_llvm(target_str);
+
+    if (specs.empty())
+        jl_error("No targets specified");
+
+    for (auto &s : specs) { // every resolved spec is appended, in order
+        CF_DEBUG("[cpufeatures]   target: name='%s' base=%d features=%s\n",
+                 s.cpu_name.c_str(), s.base, s.cpu_features.c_str());
+        jit_targets.push_back(std::move(s));
     }
 }
 
-struct SysimgMatch {
-    uint32_t best_idx{UINT32_MAX};
-    int vreg_size{0};
-};
+// ============================================================================
+// Sysimage / pkgimage target matching
+// ============================================================================
 
-// Find the best match in the sysimg.
-// Select the best one based on the largest vector register and largest compatible feature set.
-template<typename S, typename T, typename F>
-static inline SysimgMatch match_sysimg_targets(S &&sysimg, T &&target, F &&max_vector_size, jl_value_t **rejection_reason)
+// Shared: deserialize image targets, match against a resolved target.
+// Returns {target_index, vreg_size} or {UINT32_MAX, 0} on failure.
+static std::pair<uint32_t, int> match_image_targets(
+        const void *id, const tp::LLVMTargetSpec &target, jl_value_t **rejection_reason)
 {
-    SysimgMatch match;
-    bool match_name = false;
-    int feature_size = 0;
-    llvm::SmallVector<const char *, 0> rejection_reasons;
-    rejection_reasons.reserve(sysimg.size());
-    for (uint32_t i = 0; i < sysimg.size(); i++) {
-        auto &imgt = sysimg[i];
-        if (!(imgt.en.features & target.dis.features).empty()) {
-            // Check sysimg enabled features against runtime disabled features
-            // This is valid (and all what we can do)
-            // even if one or both of the targets are unknown.
-            rejection_reasons.push_back("Rejecting this target due to use of runtime-disabled features\n");
-            continue;
-        }
-        if (imgt.name == target.name) {
-            if (!match_name) {
-                match_name = true;
-                match.vreg_size = 0;
-                feature_size = 0;
-            }
-        }
-        else if (match_name) {
-            rejection_reasons.push_back("Rejecting this target since another target has a cpu name match\n");
-            continue;
-        }
-        int new_vsz = max_vector_size(imgt.en.features);
-        if (match.vreg_size > new_vsz) {
-            rejection_reasons.push_back("Rejecting this target since another target has a larger vector register size\n");
-            continue;
-        }
-        int new_feature_size = imgt.en.features.nbits();
-        if (match.vreg_size < new_vsz) {
-            match.best_idx = i;
-            match.vreg_size = new_vsz;
-            feature_size = new_feature_size;
-            rejection_reasons.push_back("Updating best match to this target due to larger vector register size\n");
-            continue;
+    auto image_targets = tp::deserialize_targets((const uint8_t *)id); // `id` is read as the image's serialized target data
+    CF_DEBUG("[cpufeatures]   image has %zu target(s)\n", image_targets.size());
+
+    auto match = tp::match_targets(image_targets, target);
+    if (match.best_idx < 0) { // negative index is treated as "no compatible target"
+        CF_DEBUG("[cpufeatures]   NO compatible target found!\n");
+        if (rejection_reason) { // the reason is optional; it is only written when a slot was provided
+            std::string msg = "Unable to find compatible target in cached code image.";
+            *rejection_reason = jl_pchar_to_string(msg.data(), msg.size());
         }
-        if (new_feature_size < feature_size) {
-            rejection_reasons.push_back("Rejecting this target since another target has a larger feature set\n");
-            continue;
+        return {UINT32_MAX, 0};
+    }
+
+    CF_DEBUG("[cpufeatures]   selected target %d '%s' (vreg_size=%d)\n",
+             match.best_idx, image_targets[match.best_idx].cpu_name.c_str(), match.vreg_size);
+    return {(uint32_t)match.best_idx, match.vreg_size};
+}
+
+static uint32_t match_sysimg_target(void *ctx, const void *id, jl_value_t **rejection_reason)
+{
+    const char *cpu_target = (const char *)ctx; // ctx is the cpu_target string forwarded by jl_load_sysimg
+    CF_DEBUG("[cpufeatures] match_sysimg_target: cpu_target='%s'\n",
+             cpu_target ? cpu_target : "(null)");
+
+    // For multi-target strings (sysimage building), use only the first
+    // target for matching against the image being loaded.
+    auto target_str = jl_expand_sysimage_keyword(cpu_target);
+    auto semi = target_str.find(';');
+    auto first = semi != std::string::npos ? target_str.substr(0, semi) : target_str;
+    auto host_specs = tp::resolve_targets_for_llvm(first);
+    if (host_specs.empty())
+        jl_error("No targets specified");
+
+    auto &target = host_specs[0];
+    CF_DEBUG("[cpufeatures]   JIT target: name='%s'\n", target.cpu_name.c_str());
+
+#if defined(_CPU_X86_64_)
+    // CX16 check: error only when every target in the image enables cx16 and the resolved JIT target does not.
+    {
+        auto sysimg_peek = tp::deserialize_targets((const uint8_t *)id);
+        bool sysimg_allows_no_cx16 = false;
+        for (auto &t : sysimg_peek)
+            sysimg_allows_no_cx16 |= !tp::has_feature(t.en_features, "cx16");
+        if (!sysimg_allows_no_cx16 && !tp::has_feature(target.en_features, "cx16")) {
+            jl_error("Your CPU does not support the CX16 instruction, which is required "
+                     "by this version of Julia!  This is often due to running inside of a "
+                     "virtualized environment.  Please read "
+                     "https://docs.julialang.org/en/v1/devdocs/sysimg/ for more.");
         }
-        match.best_idx = i;
-        feature_size = new_feature_size;
-        rejection_reasons.push_back("Updating best match to this target\n");
     }
-    if (match.best_idx == UINT32_MAX) {
-        // Construct a nice error message for debugging purposes
-        std::string error_msg = "Unable to find compatible target in cached code image.\n";
-        for (size_t i = 0; i < rejection_reasons.size(); i++) {
-            error_msg += "Target ";
-            error_msg += std::to_string(i);
-            error_msg += " (";
-            error_msg += sysimg[i].name;
-            error_msg += "): ";
-            error_msg += rejection_reasons[i];
+#endif
+
+    // Match against image targets
+    auto match_result = match_image_targets(id, target, rejection_reason);
+    if (match_result.first == UINT32_MAX) // no compatible target: jit_targets is left untouched
+        return UINT32_MAX;
+
+    // Clamp JIT vector features to match the sysimage target's vector width.
+    // On x86, AVX/AVX-512 change how VecElement tuples are passed in registers
+    // (FixedVectorType maps to xmm/ymm/zmm), so the JIT must not use wider
+    // vectors than the sysimage clone it calls into.
+    // TODO: aarch64 SVE uses scalable vectors which Julia doesn't generate
+    // (only FixedVectorType/NEON), so SVE clamping is not needed for ABI
+    // correctness. RISC-V V is similar. Revisit if Julia adds scalable vector
+    // support.
+    int matched_vreg = match_result.second; // vreg_size of the selected image target
+    int host_vreg = tp::max_vector_size(target.en_features); // computed from the resolved JIT target's enabled features
+#if defined(_CPU_X86_64_) || defined(_CPU_X86_)
+    if (matched_vreg != host_vreg) {
+        if (matched_vreg < 64) { // presumably bytes (64 = 512-bit registers) - TODO confirm unit
+            static const char *avx512[] = {
+                "avx512f", "avx512dq", "avx512ifma", "avx512cd",
+                "avx512bw", "avx512vl", "avx512vbmi", "avx512vpopcntdq",
+                "avx512vbmi2", "avx512vnni", "avx512bitalg",
+                "avx512vp2intersect", "avx512bf16", "avx512fp16", nullptr
+            };
+            for (const char **f = avx512; *f; f++) { // nullptr-terminated list
+                const FeatureEntry *fe = find_feature(*f);
+                if (fe) feature_clear(&target.en_features, fe->bit); // names missing from the table are skipped
+            }
         }
-        if (rejection_reason)
-            *rejection_reason = jl_pchar_to_string(error_msg.data(), error_msg.size());
+        if (matched_vreg < 32) { // presumably bytes (32 = 256-bit registers) - TODO confirm unit
+            static const char *avx[] = {
+                "avx", "avx2", "fma", "f16c", "fma4", "xop",
+                "vaes", "vpclmulqdq", nullptr
+            };
+            for (const char **f = avx; *f; f++) { // nullptr-terminated list
+                const FeatureEntry *fe = find_feature(*f);
+                if (fe) feature_clear(&target.en_features, fe->bit); // names missing from the table are skipped
+            }
+        }
+        for (int w = 0; w < TARGET_FEATURE_WORDS; w++) // rebuild the disabled set: masked hardware bits that are not enabled
+            target.dis_features.bits[w] = hw_feature_mask.bits[w] & ~target.en_features.bits[w];
+        target.cpu_features = tp::build_llvm_feature_string(target.en_features, target.dis_features); // keep the string in sync with the edited bits
     }
-    return match;
+#else
+    (void)matched_vreg;
+    (void)host_vreg;
+#endif
+
+    jit_targets.push_back(std::move(target));
+    return match_result.first;
 }
 
-// Debug helper
+static uint32_t match_pkgimg_target(void *ctx, const void *id, jl_value_t **rejection_reason)
+{   // Match the image data at `id` against the first JIT target; `ctx` is unused.
+    if (jit_targets.empty()) // front() on an empty vector is undefined behavior
+        jl_error("JIT targets not initialized");
+    return match_image_targets(id, jit_targets.front(), rejection_reason).first; // UINT32_MAX when nothing matches
+}
 
-template<typename CPU, size_t n>
-static inline void dump_cpu_spec(uint32_t cpu, const FeatureList<n> &features,
-                                 const FeatureName *feature_names, uint32_t nfeature_names,
-                                 const CPUSpec<CPU,n> *cpus, uint32_t ncpus)
+// ============================================================================
+// Exported functions
+// ============================================================================
+
+#if defined(_CPU_X86_64_) || defined(_CPU_X86_)
+
+extern "C" JL_DLLEXPORT void jl_cpuid(int32_t CPUInfo[4], int32_t InfoType)
 {
-    bool cpu_found = false;
-    for (uint32_t i = 0;i < ncpus;i++) {
-        if (cpu == uint32_t(cpus[i].cpu)) {
-            cpu_found = true;
-            jl_safe_printf("CPU: %s\n", cpus[i].name);
-            break;
-        }
-    }
-    if (!cpu_found)
-        jl_safe_printf("CPU: generic\n");
+    asm volatile ( // executes cpuid with eax = InfoType; ecx is not supplied as an input here
+#if defined(__i386__) && defined(__PIC__)
+        "xchg %%ebx, %%esi;" // park ebx in esi around cpuid (presumably because ebx is reserved under i386 PIC)
+        "cpuid;"
+        "xchg %%esi, %%ebx;" : // swap back: the ebx result ends up in esi
+        "=S" (CPUInfo[1]),
+#else
+        "cpuid" :
+        "=b" (CPUInfo[1]),
+#endif
+        "=a" (CPUInfo[0]), // outputs: CPUInfo = {eax, ebx, ecx, edx}
+        "=c" (CPUInfo[2]),
+        "=d" (CPUInfo[3]) :
+        "a" (InfoType)
+    );
+}
+
+extern "C" JL_DLLEXPORT void jl_cpuidex(int32_t CPUInfo[4], int32_t InfoType, int32_t subInfoType)
+{
+    asm volatile ( // executes cpuid with eax = InfoType and ecx = subInfoType
+#if defined(__i386__) && defined(__PIC__)
+        "xchg %%ebx, %%esi;" // park ebx in esi around cpuid (presumably because ebx is reserved under i386 PIC)
+        "cpuid;"
+        "xchg %%esi, %%ebx;" : // swap back: the ebx result ends up in esi
+        "=S" (CPUInfo[1]),
+#else
+        "cpuid" :
+        "=b" (CPUInfo[1]),
+#endif
+        "=a" (CPUInfo[0]), // outputs: CPUInfo = {eax, ebx, ecx, edx}
+        "=c" (CPUInfo[2]),
+        "=d" (CPUInfo[3]) :
+        "a" (InfoType),
+        "c" (subInfoType)
+    );
+}
+
+#endif // x86
+
+JL_DLLEXPORT void jl_dump_host_cpu(void)
+{
+    // Print the host CPU name, then the names of the feature_table entries whose bit is set on the host.
+    jl_safe_printf("CPU: %s\n", host_cpu_name().c_str());
     jl_safe_printf("Features:");
+    auto host_feats = tp::get_host_features();
     bool first = true;
-    for (uint32_t i = 0;i < nfeature_names;i++) {
-        if (test_nbit(&features[0], feature_names[i].bit)) {
+    for (uint32_t i = 0; i < num_features; i++) {
+        if (feature_test(&host_feats, feature_table[i].bit)) {
             if (first) {
-                jl_safe_printf(" %s", feature_names[i].name);
+                jl_safe_printf(" %s", feature_table[i].name);
                 first = false;
-            }
-            else {
-                jl_safe_printf(", %s", feature_names[i].name);
+            } else {
+                jl_safe_printf(", %s", feature_table[i].name);
             }
         }
     }
     jl_safe_printf("\n");
 }
 
+// Returns `nothing` when `data` contains a target compatible with the JIT target, else the rejection reason.
+JL_DLLEXPORT jl_value_t *jl_check_pkgimage_clones(char *data)
+{
+    jl_value_t *reason = NULL;
+    JL_GC_PUSH1(&reason); // root the message slot while matching runs
+    const bool matched = match_pkgimg_target(NULL, data, &reason) != UINT32_MAX;
+    JL_GC_POP();
+    // The frame is popped before returning; `reason` is handed straight to the caller.
+    return matched ? jl_nothing : reason;
 }
 
-static std::string jl_get_cpu_name_llvm(void)
+JL_DLLEXPORT jl_value_t *jl_cpu_has_fma(int bits)
 {
-    return llvm::sys::getHostCPUName().str();
+#if defined(_CPU_X86_64_) || defined(_CPU_X86_)
+    if ((bits == 32 || bits == 64) && !jit_targets.empty()) { // with no JIT target recorded yet the answer is false
+        const auto &feats = jit_targets.front().en_features; // decided by the first JIT target's enabled features
+        if (tp::has_feature(feats, "fma") || tp::has_feature(feats, "fma4"))
+            return jl_true;
+    }
+#elif defined(_CPU_AARCH64_)
+    if (bits == 32 || bits == 64) // unconditional for these widths on AArch64
+        return jl_true;
+#endif
+    return jl_false; // any other width, or any other architecture
+}
 
-static std::string jl_get_cpu_features_llvm(void)
+// Validate cpu_target string before any processing.
+// Called from init.c early in startup.
+extern "C" JL_DLLEXPORT void jl_check_cpu_target(const char *cpu_target, int imaging)
 {
-#if JL_LLVM_VERSION >= 190000
-    auto HostFeatures = llvm::sys::getHostCPUFeatures();
-#else
-    llvm::StringMap<bool> HostFeatures;
-    llvm::sys::getHostCPUFeatures(HostFeatures);
+    // Exits the process on "help" (status 0) and on invalid targets (status 1); returns normally otherwise.
+    if (!cpu_target || !*cpu_target)
+        return; // NULL/empty handled elsewhere
+
+    auto target_str = jl_expand_sysimage_keyword(cpu_target);
+    if (target_str.empty())
+        return;
+
+    // Handle "help": print available CPU targets and exit
+    if (target_str == "help" || target_str.find(",help") != std::string::npos) { // note: substring match on ",help"
+        tp::print_cpu_targets();
+        exit(0);
+    }
+
+    auto specs = tp::resolve_targets_for_llvm(target_str);
+
+    for (auto &s : specs) { // any target flagged TF_UNKNOWN_NAME is fatal
+        if (s.flags & tp::TF_UNKNOWN_NAME) {
+            jl_safe_printf("Unknown cpu target: \"%s\"\n", s.cpu_name.c_str());
+            exit(1);
+        }
+    }
+    // NOTE(review): only clone_all is rejected below; confirm opt_size/min_size are meant to be accepted without `--output-`.
+    if (!imaging) {
+        if (specs.size() > 1) {
+            jl_safe_printf("More than one command line CPU targets specified "
+                      "without a `--output-` flag specified");
+            exit(1);
+        }
+        if (!specs.empty() && (specs[0].flags & tp::TF_CLONE_ALL)) {
+            jl_safe_printf("\"clone_all\" feature specified "
+                      "without a `--output-` flag specified");
+            exit(1);
+        }
+    }
+}
+
+// Load the system image. The matching callback records the JIT target, so none may exist beforehand.
+jl_image_t jl_load_sysimg(jl_image_buf_t image, const char *cpu_target)
+{
+    if (!jit_targets.empty())
+        jl_error("JIT targets already initialized");
+    return load_sysimg_target(image, match_sysimg_target, const_cast<char *>(cpu_target));
+}
+
+// Load a package image; exactly one entry must already be present in jit_targets.
+jl_image_t jl_load_pkgimg(jl_image_buf_t image)
+{
+    const size_t ntargets = jit_targets.size();
+    if (ntargets != 1)
+        jl_error(ntargets == 0 ? "JIT targets not initialized" : "Expected only one JIT target");
+    return load_sysimg_target(image, match_pkgimg_target, NULL);
+}
+
+#ifndef __clang_analyzer__
+// Returns {cpu name, feature string} for the first JIT target, initializing jit_targets if it is empty.
+std::pair<std::string, std::string>
+jl_get_llvm_target(const char *cpu_target, bool imaging)
+{
+    init_jit_targets(cpu_target, imaging);
+    const auto &primary = jit_targets.front();
+    std::string feature_str(primary.cpu_features);
+    if (!primary.ext_features.empty()) {
+        if (!feature_str.empty())
+            feature_str.push_back(','); // separator only when there is something to join onto
+        feature_str.append(primary.ext_features);
+    }
+    return {primary.cpu_name, std::move(feature_str)};
+}
 #endif
-    std::string attr;
-    for (auto &ele: HostFeatures) {
-        if (ele.getValue()) {
-            if (!attr.empty()) {
-                attr.append(",+");
-            }
-            else {
-                attr.append("+");
+
+#ifndef __clang_analyzer__
+const std::pair<std::string, std::string> &jl_get_llvm_disasm_target(void)
+{
+    // Use generic CPU with all features enabled so the disassembler
+    // can decode any instruction (including sysimage clones compiled
+    // for targets beyond the current JIT target).
+    static const auto res = [] { // built on first call, then reused
+        std::string features;
+        for (uint32_t i = 0; i < num_features; i++) {
+            if (feature_table[i].is_hw) { // "all features" here means the entries flagged is_hw
+                if (!features.empty()) features += ',';
+                features += '+';
+                features += feature_table[i].name;
             }
-            attr.append(ele.getKey().str());
         }
+        return std::make_pair(std::string("generic"), std::move(features));
+    }();
+    return res;
+}
+#endif
+
+#ifndef __clang_gcanalyzer__
+jl_clone_targets_t jl_get_llvm_clone_targets(const char *cpu_target)
+{
+    // Resolve `cpu_target` into target specs and return both their serialized form (result.data)
+    // and one jl_target_spec_t per target (result.specs). Raises a Julia error when nothing resolves.
+    auto target_str = jl_expand_sysimage_keyword(cpu_target);
+    auto specs = tp::resolve_targets_for_llvm(target_str);
+
+    if (specs.empty())
+        jl_error("No targets specified");
+
+    jl_clone_targets_t result;
+
+    // Serialized blob for sysimage embedding
+    auto blob = tp::serialize_targets(specs);
+    result.data.assign(blob.begin(), blob.end());
+
+    // LLVM specs for codegen
+    for (auto &s : specs) {
+        jl_target_spec_t ele;
+        ele.cpu_name = s.cpu_name;
+        ele.cpu_features = s.cpu_features;
+        if (!s.ext_features.empty()) { // append ext_features, comma-separated when cpu_features is non-empty
+            if (!ele.cpu_features.empty()) ele.cpu_features += ',';
+            ele.cpu_features += s.ext_features;
+        }
+        ele.base = s.base;
+        ele.clone_all = (s.flags & tp::TF_CLONE_ALL) != 0; // unpack flag bits into booleans
+        ele.opt_size = (s.flags & tp::TF_OPTSIZE) != 0;
+        ele.min_size = (s.flags & tp::TF_MINSIZE) != 0;
+        ele.diff = s.diff;
+        result.specs.push_back(std::move(ele));
     }
-    return attr;
+    return result;
 }
+#endif
 
-#if defined(_CPU_X86_) || defined(_CPU_X86_64_)
+// Test bit `feature` of the host feature set; indices at or beyond TARGET_FEATURE_WORDS * 64 report 0.
+extern "C" int jl_test_cpu_feature(jl_cpu_feature_t feature)
+{
+    auto detected = tp::get_host_features();
+    const bool in_range = feature < TARGET_FEATURE_WORDS * 64;
+    return in_range ? feature_test(&detected, feature) : 0;
+}
 
-#include "processor_x86.cpp"
+// ============================================================================
+// Cross-architecture CPU/feature queries
+// ============================================================================
 
-#elif defined(_CPU_AARCH64_) || defined(_CPU_ARM_)
+extern "C" JL_DLLEXPORT size_t jl_cpufeatures_nbytes(void)
+{
+    return sizeof(FeatureBits); // the minimum bufsize accepted by jl_cpufeatures_lookup and jl_cpufeatures_host
+}
 
-#include "processor_arm.cpp"
+extern "C" JL_DLLEXPORT int jl_cpufeatures_lookup(const char *cpu_name, // CPU to look up; NULL is rejected
+                                                    uint8_t *features_out, // receives sizeof(FeatureBits) bytes
+                                                    size_t bufsize) // capacity of features_out in bytes
+{
+    if (!cpu_name || !features_out || bufsize < sizeof(FeatureBits))
+        return -1; // bad arguments: reject instead of dereferencing NULL or overrunning the buffer
+    const CPUEntry *entry = find_cpu(cpu_name);
+    if (!entry)
+        return -1; // unknown CPU name
+    FeatureBits hw{}; // value-initialized so no indeterminate bytes are copied out
+    for (int i = 0; i < TARGET_FEATURE_WORDS; i++)
+        hw.bits[i] = entry->features.bits[i] & hw_feature_mask.bits[i]; // keep only bits in hw_feature_mask
+    memcpy(features_out, &hw, sizeof(FeatureBits));
+    return 0; // success
+}
+
+extern "C" JL_DLLEXPORT void jl_cpufeatures_host(uint8_t *features_out, size_t bufsize)
+{
+    if (bufsize >= sizeof(FeatureBits)) { // a too-small buffer is left untouched, with no error signal
+        auto detected = tp::get_host_features();
+        for (int w = 0; w != TARGET_FEATURE_WORDS; ++w)
+            detected.bits[w] = detected.bits[w] & hw_feature_mask.bits[w]; // keep only bits in hw_feature_mask
+        memcpy(features_out, &detected, sizeof(FeatureBits));
+    }
+}
+
+extern "C" JL_DLLEXPORT size_t jl_cpufeatures_cross_lookup(
+        const char *arch, const char *cpu_name,
+        uint8_t *features_out, size_t bufsize)
+{
+    tp::CrossFeatureBits fb;
+    if (!tp::cross_lookup_cpu(arch, cpu_name, fb))
+        return 0; // lookup failed
+    size_t nbytes = fb.num_words * sizeof(uint64_t); // size depends on the looked-up architecture's word count
+    if (bufsize < nbytes)
+        return 0; // buffer too small; indistinguishable from a failed lookup for the caller
+    memcpy(features_out, fb.bits, nbytes);
+    return nbytes; // number of bytes written
+}
+
+extern "C" JL_DLLEXPORT size_t jl_cpufeatures_cross_nbytes(const char *arch)
+{
+    return tp::cross_feature_words(arch) * sizeof(uint64_t); // 64-bit word count for `arch`, scaled to bytes
+}
+
+extern "C" JL_DLLEXPORT unsigned jl_cpufeatures_cross_num_features(const char *arch)
+{
+    return tp::cross_num_features(arch); // C-linkage forwarder to the cpufeatures cross-arch query
+}
+
+extern "C" JL_DLLEXPORT unsigned jl_cpufeatures_cross_num_cpus(const char *arch)
+{
+    return tp::cross_num_cpus(arch); // C-linkage forwarder to the cpufeatures cross-arch query
+}
+
+extern "C" JL_DLLEXPORT const char *jl_cpufeatures_cross_feature_name(const char *arch, unsigned idx)
+{
+    return tp::cross_feature_name(arch, idx); // forwarded unchanged; `idx` is not range-checked here
+}
+
+extern "C" JL_DLLEXPORT int jl_cpufeatures_cross_feature_bit(const char *arch, unsigned idx)
+{
+    return tp::cross_feature_bit_at(arch, idx); // forwarded unchanged; `idx` is not range-checked here
+}
+
+extern "C" JL_DLLEXPORT const char *jl_cpufeatures_cross_cpu_name(const char *arch, unsigned idx)
+{
+    return tp::cross_cpu_name(arch, idx); // forwarded unchanged; `idx` is not range-checked here
+}
+
+// ============================================================================
+// FPU control
+// ============================================================================
+
+#if defined(_CPU_X86_64_) || defined(_CPU_X86_)
+
+#include <xmmintrin.h>
+
+static uint32_t subnormal_flags = [] { // MXCSR bits toggled by jl_set_zero_subnormals; computed during static initialization
+    int32_t info[4];
+    jl_cpuid(info, 0); // leaf 0: info[0] (eax) is the highest supported basic leaf
+    if (info[0] >= 1) {
+        jl_cpuid(info, 1); // leaf 1: info[3] (edx) holds feature flags
+        if (info[3] & (1 << 26)) { // edx bit 26: SSE2
+            return 0x00008040u; // MXCSR FTZ (bit 15) | DAZ (bit 6)
+        }
+        else if (info[3] & (1 << 25)) { // edx bit 25: SSE
+            return 0x00008000u; // MXCSR FTZ (bit 15) only
+        }
+    }
+    return 0u; // no control bits available
+}();
+
+extern "C" JL_DLLEXPORT int32_t jl_get_zero_subnormals(void)
+{
+    return _mm_getcsr() & subnormal_flags; // non-zero when any of the tracked MXCSR bits is set (raw masked bits, not 0/1)
+}
+
+// Set (isZero != 0) or clear the MXCSR bits listed in subnormal_flags.
+// Returns 0 after updating MXCSR; when no bits are available nothing is written and isZero is returned.
+extern "C" JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero)
+{
+    const uint32_t mask = subnormal_flags;
+    if (!mask)
+        return isZero;
+    const uint32_t csr = _mm_getcsr();
+    const uint32_t updated = isZero ? (csr | mask) : (csr & ~mask);
+    _mm_setcsr(updated);
+    return 0;
+}
+
+extern "C" JL_DLLEXPORT int32_t jl_get_default_nans(void) { return 0; } // x86 branch: always reports 0
+extern "C" JL_DLLEXPORT int32_t jl_set_default_nans(int8_t isDefault) { return isDefault; } // x86 branch: no state changed; echoes the request
+
+#elif defined(_CPU_AARCH64_)
+
+extern "C" JL_DLLEXPORT int32_t jl_get_zero_subnormals(void)
+{
+    uint64_t fpcr;
+    asm volatile ("mrs %0, fpcr" : "=r"(fpcr)); // read the floating-point control register
+    return (fpcr & (1 << 24)) != 0; // FPCR bit 24 (FZ, flush-to-zero), reported as 0 or 1
+}
+
+extern "C" JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero)
+{
+    uint64_t fpcr;
+    asm volatile ("mrs %0, fpcr" : "=r"(fpcr)); // read-modify-write so the other FPCR bits are preserved
+    if (isZero) fpcr |= (1 << 24); // FPCR bit 24 (FZ, flush-to-zero)
+    else fpcr &= ~(uint64_t)(1 << 24);
+    asm volatile ("msr fpcr, %0" :: "r"(fpcr));
+    return 0; // always reports success
+}
+
+extern "C" JL_DLLEXPORT int32_t jl_get_default_nans(void)
+{
+    uint64_t fpcr;
+    asm volatile ("mrs %0, fpcr" : "=r"(fpcr)); // read the floating-point control register
+    return (fpcr & (1 << 25)) != 0; // FPCR bit 25 (DN, default NaN), reported as 0 or 1
+}
+
+extern "C" JL_DLLEXPORT int32_t jl_set_default_nans(int8_t isDefault)
+{
+    uint64_t fpcr;
+    asm volatile ("mrs %0, fpcr" : "=r"(fpcr)); // read-modify-write so the other FPCR bits are preserved
+    if (isDefault) fpcr |= (1 << 25); // FPCR bit 25 (DN, default NaN)
+    else fpcr &= ~(uint64_t)(1 << 25);
+    asm volatile ("msr fpcr, %0" :: "r"(fpcr));
+    return 0; // always reports success
+}
 
 #else
 
-#include "processor_fallback.cpp"
+extern "C" JL_DLLEXPORT int32_t jl_get_zero_subnormals(void) { return 0; } // fallback branch: always reports 0
+extern "C" JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero) { return isZero; } // no state changed; echoes the request
+extern "C" JL_DLLEXPORT int32_t jl_get_default_nans(void) { return 0; } // fallback branch: always reports 0
+extern "C" JL_DLLEXPORT int32_t jl_set_default_nans(int8_t isDefault) { return isDefault; } // no state changed; echoes the request
 
 #endif
 
-// Global variable to store the CPU target string used for the sysimage
-static std::string sysimage_cpu_target;
+
+// ============================================================================
+// Global exports (defined after backend)
+// ============================================================================
 
 JL_DLLEXPORT jl_value_t *jl_get_cpu_name(void)
 {
@@ -1016,35 +884,19 @@ JL_DLLEXPORT jl_value_t *jl_get_cpu_name(void)
 
 JL_DLLEXPORT jl_value_t *jl_get_cpu_features(void)
 {
-    return jl_cstr_to_string(jl_get_cpu_features_llvm().c_str());
+    return jl_cstr_to_string(get_host_feature_string().c_str());
 }
 
+#ifndef __clang_analyzer__
 extern "C" JL_DLLEXPORT jl_value_t* jl_reflect_clone_targets() {
-    auto specs = jl_get_llvm_clone_targets(jl_options.cpu_target);
-    const uint32_t base_flags = 0;
-    llvm::SmallVector<uint8_t, 0> data;
-    auto push_i32 = [&] (uint32_t v) {
-        uint8_t buff[4];
-        memcpy(buff, &v, 4);
-        data.insert(data.end(), buff, buff + 4);
-    };
-    push_i32(specs.size());
-    for (uint32_t i = 0; i < specs.size(); i++) {
-        push_i32(base_flags | (specs[i].flags & JL_TARGET_UNKNOWN_NAME));
-        auto &specdata = specs[i].data;
-        data.insert(data.end(), specdata.begin(), specdata.end());
-    }
-
+    auto targets = jl_get_llvm_clone_targets(jl_options.cpu_target); // resolve the targets named by the cpu_target option
+    auto &data = targets.data; // the serialized target blob; copied into a UInt8 array below
     jl_value_t *arr = (jl_value_t*)jl_alloc_array_1d(jl_array_uint8_type, data.size());
     uint8_t *out = jl_array_data(arr, uint8_t);
     memcpy(out, data.data(), data.size());
     return arr;
 }
-
-extern "C" JL_DLLEXPORT void jl_reflect_feature_names(const FeatureName **fnames, size_t *nf) {
-    *fnames = feature_names;
-    *nf = nfeature_names;
-}
+#endif
 
 extern "C" JL_DLLEXPORT jl_value_t *jl_get_sysimage_cpu_target(void) {
     if (sysimage_cpu_target.empty()) {
@@ -1053,7 +905,6 @@ extern "C" JL_DLLEXPORT jl_value_t *jl_get_sysimage_cpu_target(void) {
     return jl_cstr_to_string(sysimage_cpu_target.c_str());
 }
 
-// Function to set the sysimage CPU target (called during initialization)
 void jl_set_sysimage_cpu_target(const char *cpu_target) {
     if (cpu_target) {
         sysimage_cpu_target = cpu_target;
diff --git a/src/processor.h b/src/processor.h
index 091defadd4951..6567dcea01031 100644
--- a/src/processor.h
+++ b/src/processor.h
@@ -18,46 +18,9 @@ extern "C" {
 // Every image exports a `jl_image_pointers_t` as a global symbol `jl_image_pointers`.
 // This symbol acts as a root for all other code-related symbols in the image.
 
-enum {
-    JL_TARGET_VEC_CALL = 1 << 0,
-    // Clone all functions
-    JL_TARGET_CLONE_ALL = 1 << 1,
-    // Clone when there's scalar math operations that can benefit from target-specific
-    // optimizations. This includes `muladd`, `fma`, `fast`/`contract` flags.
-    JL_TARGET_CLONE_MATH = 1 << 2,
-    // Clone when the function has a loop
-    JL_TARGET_CLONE_LOOP = 1 << 3,
-    // Clone when the function uses any vectors
-    // When this is specified, the cloning pass should also record if any of the cloned functions
-    // used this in any function call (including the signature of the function itself)
-    JL_TARGET_CLONE_SIMD = 1 << 4,
-    // The CPU name is unknown
-    JL_TARGET_UNKNOWN_NAME = 1 << 5,
-    // Optimize for size for this target
-    JL_TARGET_OPTSIZE = 1 << 6,
-    // Only optimize for size for this target
-    JL_TARGET_MINSIZE = 1 << 7,
-    // Clone when the function queries CPU features
-    JL_TARGET_CLONE_CPU = 1 << 8,
-    // Clone when the function uses fp16
-    JL_TARGET_CLONE_FLOAT16 = 1 << 9,
-    // Clone when the function uses bf16
-    JL_TARGET_CLONE_BFLOAT16 = 1 << 10,
-};
-
-#define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) JL_FEATURE_DEF(name, bit, llvmver)
-typedef enum {
-#define JL_FEATURE_DEF(name, bit, llvmver) JL_X86_##name = bit,
-#include "features_x86.h"
-#undef JL_FEATURE_DEF
-#define JL_FEATURE_DEF(name, bit, llvmver) JL_AArch32_##name = bit,
-#include "features_aarch32.h"
-#undef JL_FEATURE_DEF
-#define JL_FEATURE_DEF(name, bit, llvmver) JL_AArch64_##name = bit,
-#include "features_aarch64.h"
-#undef JL_FEATURE_DEF
-} jl_cpu_feature_t;
-#undef JL_FEATURE_DEF_NAME
+// Feature indices come from the cpufeatures library's generated tables.
+// The actual constants are defined in base/features_h.jl (auto-generated).
+typedef uint32_t jl_cpu_feature_t;
 
 JL_DLLEXPORT int jl_test_cpu_feature(jl_cpu_feature_t feature);
 
@@ -209,8 +172,9 @@ typedef struct {
  *
  * Return the data about the function pointers selected.
  */
-jl_image_t jl_init_processor_sysimg(jl_image_buf_t image, const char *cpu_target);
-jl_image_t jl_init_processor_pkgimg(jl_image_buf_t image);
+jl_image_t jl_load_sysimg(jl_image_buf_t image, const char *cpu_target);
+jl_image_t jl_load_pkgimg(jl_image_buf_t image);
+void jl_check_cpu_target(const char *cpu_target, int imaging);
 
 // Internal function to set the sysimage CPU target during initialization
 void jl_set_sysimage_cpu_target(const char *cpu_target);
@@ -250,7 +214,17 @@ extern jl_image_unpack_func_t *jl_image_unpack;
 #include <string>
 #include <vector>
 
-extern JL_DLLEXPORT bool jl_processor_print_help;
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
+#include <cpufeatures/target_tables_x86_64.h>
+#elif defined(__aarch64__) || defined(_M_ARM64)
+#include <cpufeatures/target_tables_aarch64.h>
+#elif defined(__riscv) && __riscv_xlen == 64
+#include <cpufeatures/target_tables_riscv64.h>
+#else
+#include <cpufeatures/target_tables_fallback.h>
+#endif
+#include <cpufeatures/target_parsing.h>
+
 // NOLINTBEGIN(clang-diagnostic-return-type-c-linkage)
 /**
  * Returns the CPU name and feature string to be used by LLVM JIT.
@@ -258,7 +232,7 @@ extern JL_DLLEXPORT bool jl_processor_print_help;
  * If the detected/specified CPU name is not available on the LLVM version specified,
  * a fallback CPU name will be used. Unsupported features will be ignored.
  */
-extern "C" JL_DLLEXPORT std::pair<std::string,llvm::SmallVector<std::string, 0>> jl_get_llvm_target(const char *cpu_target, bool imaging, uint32_t &flags) JL_NOTSAFEPOINT;
+extern "C" JL_DLLEXPORT std::pair<std::string,std::string> jl_get_llvm_target(const char *cpu_target, bool imaging) JL_NOTSAFEPOINT;
 
 /**
  * Returns the CPU name and feature string to be used by LLVM disassembler.
@@ -268,30 +242,28 @@ extern "C" JL_DLLEXPORT std::pair<std::string,llvm::SmallVector<std::string, 0>>
 extern "C" JL_DLLEXPORT const std::pair<std::string,std::string> &jl_get_llvm_disasm_target(void) JL_NOTSAFEPOINT;
 
 struct jl_target_spec_t {
-    // LLVM target name
     std::string cpu_name;
-    // LLVM feature string
     std::string cpu_features;
-    // serialized identification data
-    llvm::SmallVector<uint8_t, 0> data;
-    // Clone condition.
-    uint32_t flags;
-    // Base target index.
     int base;
+    bool clone_all = false; // presumably replaces the JL_TARGET_CLONE_ALL bit of the removed `flags` (clone all functions) -- confirm
+    bool opt_size = false; // presumably replaces JL_TARGET_OPTSIZE (optimize for size for this target) -- confirm
+    bool min_size = false; // presumably replaces JL_TARGET_MINSIZE (only optimize for size for this target) -- confirm
+    tp::FeatureDiff diff; // NOTE(review): presumably declared in <cpufeatures/target_parsing.h>; semantics not visible here -- confirm and document
 };
+
+struct jl_clone_targets_t {
+    std::vector<jl_target_spec_t> specs;
+    std::vector<uint8_t> data; // serialized target identification blob
+};
+
 /**
- * Return the list of targets to clone
+ * Return the list of targets to clone and their serialized identification data
  */
-extern "C" JL_DLLEXPORT llvm::SmallVector<jl_target_spec_t, 0> jl_get_llvm_clone_targets(const char *cpu_target) JL_NOTSAFEPOINT;
+extern "C" JL_DLLEXPORT jl_clone_targets_t jl_get_llvm_clone_targets(const char *cpu_target) JL_NOTSAFEPOINT;
+extern "C" JL_DLLEXPORT std::string jl_expand_sysimage_keyword(const char *cpu_target);
 // NOLINTEND(clang-diagnostic-return-type-c-linkage)
-struct FeatureName {
-    const char *name;
-    uint32_t bit; // bit index into a `uint32_t` array;
-    uint32_t llvmver; // 0 if it is available on the oldest LLVM version we support
-};
-
 extern "C" JL_DLLEXPORT jl_value_t* jl_reflect_clone_targets();
-extern "C" JL_DLLEXPORT void jl_reflect_feature_names(const FeatureName **feature_names, size_t *nfeatures);
+extern "C" JL_DLLEXPORT jl_value_t *jl_feature_bits_to_string(const uint8_t *bits, int32_t nwords);
 #endif
 
 #endif
diff --git a/src/processor_arm.cpp b/src/processor_arm.cpp
deleted file mode 100644
index 0fba135c0b17e..0000000000000
--- a/src/processor_arm.cpp
+++ /dev/null
@@ -1,2085 +0,0 @@
-// This file is a part of Julia. License is MIT: https://julialang.org/license
-
-// ARM (AArch32/AArch64) specific processor detection and dispatch
-
-#include <sys/stat.h>
-#include <sys/utsname.h>
-#include <fcntl.h>
-#include <set>
-#include <fstream>
-#include <algorithm>
-
-// This nesting is required to allow compilation on musl
-#define USE_DYN_GETAUXVAL
-#if (defined(_OS_LINUX_) || defined(_OS_FREEBSD_)) && defined(_CPU_AARCH64_)
-#  undef USE_DYN_GETAUXVAL
-#  include <sys/auxv.h>
-#elif defined(__GLIBC_PREREQ)
-#  if __GLIBC_PREREQ(2, 16)
-#    undef USE_DYN_GETAUXVAL
-#    include <sys/auxv.h>
-#  endif
-#elif defined _CPU_AARCH64_ && defined _OS_DARWIN_
-#include <sys/sysctl.h>
-#include <string.h>
-#endif
-
-namespace ARM {
-enum class CPU : uint32_t {
-    generic = 0,
-
-    // Architecture targets
-    armv7_a,
-    armv7_m,
-    armv7e_m,
-    armv7_r,
-    armv8_a,
-    armv8_m_base,
-    armv8_m_main,
-    armv8_r,
-    armv8_1_a,
-    armv8_2_a,
-    armv8_3_a,
-    armv8_4_a,
-    armv8_5_a,
-    armv8_6_a,
-
-    // ARM
-    // armv6l
-    arm_mpcore,
-    arm_1136jf_s,
-    arm_1156t2f_s,
-    arm_1176jzf_s,
-    arm_cortex_m0,
-    arm_cortex_m1,
-    // armv7ml
-    arm_cortex_m3,
-    arm_cortex_m4,
-    arm_cortex_m7,
-    // armv7l
-    arm_cortex_a5,
-    arm_cortex_a7,
-    arm_cortex_a8,
-    arm_cortex_a9,
-    arm_cortex_a12,
-    arm_cortex_a15,
-    arm_cortex_a17,
-    arm_cortex_r4,
-    arm_cortex_r5,
-    arm_cortex_r7,
-    arm_cortex_r8,
-    // armv8ml
-    arm_cortex_m23,
-    arm_cortex_m33,
-    // armv8l
-    arm_cortex_a32,
-    arm_cortex_r52,
-    // aarch64
-    arm_cortex_a34,
-    arm_cortex_a35,
-    arm_cortex_a53,
-    arm_cortex_a55,
-    arm_cortex_a57,
-    arm_cortex_a65,
-    arm_cortex_a65ae,
-    arm_cortex_a72,
-    arm_cortex_a73,
-    arm_cortex_a75,
-    arm_cortex_a76,
-    arm_cortex_a76ae,
-    arm_cortex_a77,
-    arm_cortex_a78,
-    arm_cortex_x1,
-    arm_neoverse_e1,
-    arm_neoverse_n1,
-    arm_neoverse_v1,
-    arm_neoverse_n2,
-
-    // Cavium
-    // aarch64
-    cavium_thunderx,
-    cavium_thunderx88,
-    cavium_thunderx88p1,
-    cavium_thunderx81,
-    cavium_thunderx83,
-    cavium_thunderx2t99,
-    cavium_thunderx2t99p1,
-    cavium_octeontx2,
-    cavium_octeontx2t98,
-    cavium_octeontx2t96,
-    cavium_octeontx2f95,
-    cavium_octeontx2f95n,
-    cavium_octeontx2f95mm,
-
-    // Fujitsu
-    // aarch64
-    fujitsu_a64fx,
-
-    // HiSilicon
-    // aarch64
-    hisilicon_tsv110,
-
-    // Huaxingtong
-    // aarch64
-    hxt_phecda,
-
-    // NVIDIA
-    // aarch64
-    nvidia_denver1,
-    nvidia_denver2,
-    nvidia_carmel,
-
-    // AppliedMicro
-    // aarch64
-    apm_xgene1,
-    apm_xgene2,
-    apm_xgene3,
-
-    // Qualcomm
-    // armv7l
-    qualcomm_scorpion,
-    qualcomm_krait,
-    // aarch64
-    qualcomm_kyro,
-    qualcomm_falkor,
-    qualcomm_saphira,
-
-    // Samsung
-    // aarch64
-    samsung_exynos_m1,
-    samsung_exynos_m2,
-    samsung_exynos_m3,
-    samsung_exynos_m4,
-    samsung_exynos_m5,
-
-    // Apple
-    // armv7l
-    apple_swift,
-    // aarch64
-    apple_a7, // cyclone
-    apple_a8, // typhoon
-    apple_a9, // twister
-    apple_a10, // hurricane
-    apple_a11,
-    apple_a12,
-    apple_a13,
-    apple_a14,
-    apple_a15,
-    apple_a16,
-    apple_a17,
-    apple_m1,
-    apple_m2,
-    apple_m3,
-    apple_m4,
-    apple_s4,
-    apple_s5,
-
-    // Marvell
-    // armv7l
-    marvell_pj4,
-    // aarch64
-    marvell_thunderx3t110,
-
-    // Intel
-    // armv7l
-    intel_3735d,
-};
-
-#ifdef _CPU_AARCH64_
-static constexpr size_t feature_sz = 3;
-static constexpr FeatureName feature_names[] = {
-#define JL_FEATURE_DEF(name, bit, llvmver) {#name, bit, llvmver},
-#define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) {str, bit, llvmver},
-#include "features_aarch64.h"
-#undef JL_FEATURE_DEF
-#undef JL_FEATURE_DEF_NAME
-};
-static constexpr uint32_t nfeature_names = sizeof(feature_names) / sizeof(FeatureName);
-
-template<typename... Args>
-static inline constexpr FeatureList<feature_sz> get_feature_masks(Args... args)
-{
-    return ::get_feature_masks<feature_sz>(args...);
-}
-
-#define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) JL_FEATURE_DEF(name, bit, llvmver)
-static constexpr auto feature_masks = get_feature_masks(
-#define JL_FEATURE_DEF(name, bit, llvmver) bit,
-#include "features_aarch64.h"
-#undef JL_FEATURE_DEF
-    -1);
-static const auto real_feature_masks =
-    feature_masks & FeatureList<feature_sz>{{UINT32_MAX, UINT32_MAX, 0}};
-
-namespace Feature {
-enum : uint32_t {
-#define JL_FEATURE_DEF(name, bit, llvmver) name = bit,
-#include "features_aarch64.h"
-#undef JL_FEATURE_DEF
-};
-#undef JL_FEATURE_DEF_NAME
-// This does not cover all dependencies (e.g. the ones that depends on arm versions)
-static constexpr FeatureDep deps[] = {
-    {rcpc_immo, rcpc},
-    {sha3, sha2},
-    // {sha512, sha3},
-    {ccdp, ccpp},
-    {sve, fullfp16},
-    {fp16fml, fullfp16},
-    {altnzcv, flagm},
-    {sve2, sve},
-    {sve2_aes, sve2},
-    {sve2_aes, aes},
-    {sve2_bitperm, sve2},
-    {sve2_sha3, sve2},
-    {sve2_sha3, sha3},
-    {sve2_sm4, sve2},
-    {sve2_sm4, sm4},
-    {f32mm, sve},
-    {f64mm, sve},
-};
-
-constexpr auto generic = get_feature_masks();
-constexpr auto armv8a_crc = get_feature_masks(crc);
-constexpr auto armv8a_crc_crypto = armv8a_crc | get_feature_masks(aes, sha2);
-constexpr auto armv8_1a = armv8a_crc | get_feature_masks(v8_1a, lse, rdm); // lor
-constexpr auto armv8_1a_crypto = armv8_1a | get_feature_masks(aes, sha2);
-constexpr auto armv8_2a = armv8_1a | get_feature_masks(v8_2a, ccpp);
-constexpr auto armv8_2a_crypto = armv8_2a | get_feature_masks(aes, sha2);
-constexpr auto armv8_3a = armv8_2a | get_feature_masks(v8_3a, jsconv, complxnum, rcpc);
-constexpr auto armv8_3a_crypto = armv8_3a | get_feature_masks(aes, sha2);
-constexpr auto armv8_4a = armv8_3a | get_feature_masks(v8_4a, dit, rcpc_immo, flagm);
-constexpr auto armv8_4a_crypto = armv8_4a | get_feature_masks(aes, sha2);
-constexpr auto armv8_5a = armv8_4a | get_feature_masks(v8_5a, sb, ccdp, altnzcv, fptoint);
-constexpr auto armv8_5a_crypto = armv8_5a | get_feature_masks(aes, sha2);
-constexpr auto armv8_6a = armv8_5a | get_feature_masks(v8_6a, i8mm, bf16);
-
-// For ARM cores, the features required can be found in the technical reference manual
-// The relevant register values and the features they are related to are:
-// ID_AA64ISAR0_EL1:
-//     .AES: aes, pmull
-//     .SHA1: sha1
-//     .SHA2: sha2, sha512
-//     .CRC32: crc
-//     .Atomic: les
-//     .RDM: rdm
-//     .SHA3: sha3
-//     .SM3: sm3 (sm4)
-//     .SM4: sm4
-//     .DP: dotprod
-//     .FHM: fp16fml
-//     .TS: flagm, altnzcz
-//     .RNDR: rand
-
-// ID_AA64ISAR1_EL1
-//     .JSCVT: jsconv
-//     .FCMA: complxnum
-//     .LRCPC: rcpc, rcpc_immo
-//     .DPB: ccpp, ccdp
-//     .SB: sb
-//     .APA/.API: paca (pa)
-//     .GPA/.GPI: paga (pa)
-//     .FRINTTS: fptoint
-//     .I8MM: i8mm
-//     .BF16: bf16
-//     .DGH: dgh
-
-// ID_AA64PFR0_EL1
-//     .FP: fullfp16
-//     .SVE: sve
-//     .DIT: dit
-//     .BT: bti
-
-// ID_AA64PFR1_EL1
-//     .SSBS: ssbs
-//     .MTE: mte
-
-// ID_AA64MMFR2_EL1.AT: uscat
-
-// ID_AA64ZFR0_EL1
-//     .SVEVer: sve2
-//     .AES: sve2-aes, sve2-pmull
-//     .BitPerm: sve2-bitperm
-//     .SHA3: sve2-sha3
-//     .SM4: sve2-sm4
-//     .F32MM: f32mm
-//     .F64MM: f64mm
-
-constexpr auto arm_cortex_a34 = armv8a_crc;
-constexpr auto arm_cortex_a35 = armv8a_crc;
-constexpr auto arm_cortex_a53 = armv8a_crc;
-constexpr auto arm_cortex_a55 = armv8_2a | get_feature_masks(dotprod, rcpc, fullfp16, ssbs);
-constexpr auto arm_cortex_a57 = armv8a_crc;
-constexpr auto arm_cortex_a65 = armv8_2a | get_feature_masks(rcpc, fullfp16, ssbs);
-constexpr auto arm_cortex_a72 = armv8a_crc;
-constexpr auto arm_cortex_a73 = armv8a_crc;
-constexpr auto arm_cortex_a75 = armv8_2a | get_feature_masks(dotprod, rcpc, fullfp16);
-constexpr auto arm_cortex_a76 = armv8_2a | get_feature_masks(dotprod, rcpc, fullfp16, ssbs);
-constexpr auto arm_cortex_a77 = armv8_2a | get_feature_masks(dotprod, rcpc, fullfp16, ssbs);
-constexpr auto arm_cortex_a78 = armv8_2a | get_feature_masks(dotprod, rcpc, fullfp16, ssbs); // spe
-constexpr auto arm_cortex_x1 = armv8_2a | get_feature_masks(dotprod, rcpc, fullfp16, ssbs); // spe
-constexpr auto arm_neoverse_e1 = armv8_2a | get_feature_masks(rcpc, fullfp16, ssbs);
-constexpr auto arm_neoverse_n1 = armv8_2a | get_feature_masks(dotprod, rcpc, fullfp16, ssbs);
-constexpr auto arm_neoverse_v1 = armv8_4a | get_feature_masks(sve, i8mm, bf16, fullfp16, ssbs, rand);
-constexpr auto arm_neoverse_n2 = armv8_5a | get_feature_masks(sve, i8mm, bf16, fullfp16, sve2,
-                                                              sve2_bitperm, rand, mte);
-constexpr auto cavium_thunderx = armv8a_crc_crypto;
-constexpr auto cavium_thunderx88 = armv8a_crc_crypto;
-constexpr auto cavium_thunderx88p1 = armv8a_crc_crypto;
-constexpr auto cavium_thunderx81 = armv8a_crc_crypto;
-constexpr auto cavium_thunderx83 = armv8a_crc_crypto;
-constexpr auto cavium_thunderx2t99 = armv8_1a_crypto;
-constexpr auto cavium_thunderx2t99p1 = cavium_thunderx2t99;
-constexpr auto cavium_octeontx2 = armv8_2a_crypto;
-constexpr auto fujitsu_a64fx = armv8_2a | get_feature_masks(sha2, fullfp16, sve, complxnum);
-constexpr auto hisilicon_tsv110 = armv8_2a_crypto | get_feature_masks(dotprod, fullfp16);
-constexpr auto hxt_phecda = armv8a_crc_crypto;
-constexpr auto marvell_thunderx3t110 = armv8_3a_crypto;
-constexpr auto nvidia_denver1 = generic; // TODO? (crc, crypto)
-constexpr auto nvidia_denver2 = armv8a_crc_crypto;
-constexpr auto nvidia_carmel = armv8_2a_crypto | get_feature_masks(fullfp16);
-constexpr auto apm_xgene1 = generic;
-constexpr auto apm_xgene2 = generic; // TODO?
-constexpr auto apm_xgene3 = generic; // TODO?
-constexpr auto qualcomm_kyro = armv8a_crc_crypto;
-constexpr auto qualcomm_falkor = armv8a_crc_crypto | get_feature_masks(rdm);
-constexpr auto qualcomm_saphira = armv8_4a_crypto;
-constexpr auto samsung_exynos_m1 = armv8a_crc_crypto;
-constexpr auto samsung_exynos_m2 = armv8a_crc_crypto;
-constexpr auto samsung_exynos_m3 = armv8a_crc_crypto;
-constexpr auto samsung_exynos_m4 = armv8_2a_crypto | get_feature_masks(dotprod, fullfp16);
-constexpr auto samsung_exynos_m5 = samsung_exynos_m4;
-constexpr auto apple_a7 = armv8a_crc_crypto;
-constexpr auto apple_a10 = armv8a_crc_crypto | get_feature_masks(rdm);
-constexpr auto apple_a11 = armv8_2a_crypto | get_feature_masks(fullfp16);
-constexpr auto apple_a12 = armv8_3a_crypto | get_feature_masks(fullfp16);
-constexpr auto apple_a13 = armv8_4a_crypto | get_feature_masks(fp16fml, fullfp16, sha3);
-constexpr auto apple_a14 = armv8_5a_crypto | get_feature_masks(dotprod,fp16fml, fullfp16, sha3);
-constexpr auto apple_a15 = armv8_5a_crypto | get_feature_masks(dotprod,fp16fml, fullfp16, sha3, i8mm, bf16);
-constexpr auto apple_a16 = armv8_5a_crypto | get_feature_masks(dotprod,fp16fml, fullfp16, sha3, i8mm, bf16);
-constexpr auto apple_a17 = armv8_5a_crypto | get_feature_masks(dotprod,fp16fml, fullfp16, sha3, i8mm, bf16);
-constexpr auto apple_m1 = armv8_5a_crypto | get_feature_masks(dotprod,fp16fml, fullfp16, sha3);
-constexpr auto apple_m2 = armv8_5a_crypto | get_feature_masks(dotprod,fp16fml, fullfp16, sha3, i8mm, bf16);
-constexpr auto apple_m3 = armv8_5a_crypto | get_feature_masks(dotprod,fp16fml, fullfp16, sha3, i8mm, bf16);
-constexpr auto apple_m4 = armv8_5a_crypto | get_feature_masks(dotprod,fp16fml, fullfp16, sha3, i8mm, bf16);
-// Features based on https://github.com/llvm/llvm-project/blob/82507f1798768280cf5d5aab95caaafbc7fe6f47/llvm/include/llvm/Support/AArch64TargetParser.def
-// and sysctl -a hw.optional
-constexpr auto apple_s4 = apple_a12;
-constexpr auto apple_s5 = apple_a12;
-
-}
-
-static constexpr CPUSpec<CPU, feature_sz> cpus[] = {
-    {"generic", CPU::generic, CPU::generic, 0, Feature::generic},
-    {"armv8.1-a", CPU::armv8_1_a, CPU::generic, 0, Feature::armv8_1a},
-    {"armv8.2-a", CPU::armv8_2_a, CPU::generic, 0, Feature::armv8_2a},
-    {"armv8.3_a", CPU::armv8_3_a, CPU::generic, 0, Feature::armv8_3a},
-    {"armv8.4-a", CPU::armv8_4_a, CPU::generic, 0, Feature::armv8_4a},
-    {"armv8.5-a", CPU::armv8_5_a, CPU::generic, 0, Feature::armv8_5a},
-    {"armv8.6_a", CPU::armv8_6_a, CPU::generic, 0, Feature::armv8_6a},
-    {"cortex-a34", CPU::arm_cortex_a34, CPU::arm_cortex_a35, 110000, Feature::arm_cortex_a34},
-    {"cortex-a35", CPU::arm_cortex_a35, CPU::generic, 0, Feature::arm_cortex_a35},
-    {"cortex-a53", CPU::arm_cortex_a53, CPU::generic, 0, Feature::arm_cortex_a53},
-    {"cortex-a55", CPU::arm_cortex_a55, CPU::generic, 0, Feature::arm_cortex_a55},
-    {"cortex-a57", CPU::arm_cortex_a57, CPU::generic, 0, Feature::arm_cortex_a57},
-    {"cortex-a65", CPU::arm_cortex_a65, CPU::arm_cortex_a75, 100000, Feature::arm_cortex_a65},
-    {"cortex-a65ae", CPU::arm_cortex_a65ae, CPU::arm_cortex_a75, 100000, Feature::arm_cortex_a65},
-    {"cortex-a72", CPU::arm_cortex_a72, CPU::generic, 0, Feature::arm_cortex_a72},
-    {"cortex-a73", CPU::arm_cortex_a73, CPU::generic, 0, Feature::arm_cortex_a73},
-    {"cortex-a75", CPU::arm_cortex_a75, CPU::generic, 0, Feature::arm_cortex_a75},
-    {"cortex-a76", CPU::arm_cortex_a76, CPU::generic, 0, Feature::arm_cortex_a76},
-    {"cortex-a76ae", CPU::arm_cortex_a76ae, CPU::generic, 0, Feature::arm_cortex_a76},
-    {"cortex-a77", CPU::arm_cortex_a77, CPU::arm_cortex_a76, 110000, Feature::arm_cortex_a77},
-    {"cortex-a78", CPU::arm_cortex_a78, CPU::arm_cortex_a77, 110000, Feature::arm_cortex_a78},
-    {"cortex-x1", CPU::arm_cortex_x1, CPU::arm_cortex_a78, 110000, Feature::arm_cortex_x1},
-    {"neoverse-e1", CPU::arm_neoverse_e1, CPU::arm_cortex_a76, 100000, Feature::arm_neoverse_e1},
-    {"neoverse-n1", CPU::arm_neoverse_n1, CPU::arm_cortex_a76, 100000, Feature::arm_neoverse_n1},
-    {"neoverse-v1", CPU::arm_neoverse_v1, CPU::arm_neoverse_n1, UINT32_MAX, Feature::arm_neoverse_v1},
-    {"neoverse-n2", CPU::arm_neoverse_n2, CPU::arm_neoverse_n1, UINT32_MAX, Feature::arm_neoverse_n2},
-    {"thunderx", CPU::cavium_thunderx, CPU::generic, 0, Feature::cavium_thunderx},
-    {"thunderxt88", CPU::cavium_thunderx88, CPU::generic, 0, Feature::cavium_thunderx88},
-    {"thunderxt88p1", CPU::cavium_thunderx88p1, CPU::cavium_thunderx88, UINT32_MAX,
-     Feature::cavium_thunderx88p1},
-    {"thunderxt81", CPU::cavium_thunderx81, CPU::generic, 0, Feature::cavium_thunderx81},
-    {"thunderxt83", CPU::cavium_thunderx83, CPU::generic, 0, Feature::cavium_thunderx83},
-    {"thunderx2t99", CPU::cavium_thunderx2t99, CPU::generic, 0, Feature::cavium_thunderx2t99},
-    {"thunderx2t99p1", CPU::cavium_thunderx2t99p1, CPU::cavium_thunderx2t99, UINT32_MAX,
-     Feature::cavium_thunderx2t99p1},
-    {"octeontx2", CPU::cavium_octeontx2, CPU::arm_cortex_a57, UINT32_MAX,
-     Feature::cavium_octeontx2},
-    {"octeontx2t98", CPU::cavium_octeontx2t98, CPU::arm_cortex_a57, UINT32_MAX,
-     Feature::cavium_octeontx2},
-    {"octeontx2t96", CPU::cavium_octeontx2t96, CPU::arm_cortex_a57, UINT32_MAX,
-     Feature::cavium_octeontx2},
-    {"octeontx2f95", CPU::cavium_octeontx2f95, CPU::arm_cortex_a57, UINT32_MAX,
-     Feature::cavium_octeontx2},
-    {"octeontx2f95n", CPU::cavium_octeontx2f95n, CPU::arm_cortex_a57, UINT32_MAX,
-     Feature::cavium_octeontx2},
-    {"octeontx2f95mm", CPU::cavium_octeontx2f95mm, CPU::arm_cortex_a57, UINT32_MAX,
-     Feature::cavium_octeontx2},
-    {"a64fx", CPU::fujitsu_a64fx, CPU::generic, 110000, Feature::fujitsu_a64fx},
-    {"tsv110", CPU::hisilicon_tsv110, CPU::generic, 0, Feature::hisilicon_tsv110},
-    {"phecda", CPU::hxt_phecda, CPU::qualcomm_falkor, UINT32_MAX, Feature::hxt_phecda},
-    {"denver1", CPU::nvidia_denver1, CPU::generic, UINT32_MAX, Feature::nvidia_denver1},
-    {"denver2", CPU::nvidia_denver2, CPU::generic, UINT32_MAX, Feature::nvidia_denver2},
-    {"carmel", CPU::nvidia_carmel, CPU::generic, 110000, Feature::nvidia_carmel},
-    {"xgene1", CPU::apm_xgene1, CPU::generic, UINT32_MAX, Feature::apm_xgene1},
-    {"xgene2", CPU::apm_xgene2, CPU::generic, UINT32_MAX, Feature::apm_xgene2},
-    {"xgene3", CPU::apm_xgene3, CPU::generic, UINT32_MAX, Feature::apm_xgene3},
-    {"kyro", CPU::qualcomm_kyro, CPU::generic, 0, Feature::qualcomm_kyro},
-    {"falkor", CPU::qualcomm_falkor, CPU::generic, 0, Feature::qualcomm_falkor},
-    {"saphira", CPU::qualcomm_saphira, CPU::generic, 0, Feature::qualcomm_saphira},
-    {"exynos-m1", CPU::samsung_exynos_m1, CPU::generic, UINT32_MAX, Feature::samsung_exynos_m1},
-    {"exynos-m2", CPU::samsung_exynos_m2, CPU::generic, UINT32_MAX, Feature::samsung_exynos_m2},
-    {"exynos-m3", CPU::samsung_exynos_m3, CPU::generic, 0, Feature::samsung_exynos_m3},
-    {"exynos-m4", CPU::samsung_exynos_m4, CPU::generic, 0, Feature::samsung_exynos_m4},
-    {"exynos-m5", CPU::samsung_exynos_m5, CPU::samsung_exynos_m4, 110000,
-     Feature::samsung_exynos_m5},
-    {"apple-a7", CPU::apple_a7, CPU::generic, 100000, Feature::apple_a7},
-    {"apple-a8", CPU::apple_a8, CPU::generic, 100000, Feature::apple_a7},
-    {"apple-a9", CPU::apple_a9, CPU::generic, 100000, Feature::apple_a7},
-    {"apple-a10", CPU::apple_a10, CPU::generic, 100000, Feature::apple_a10},
-    {"apple-a11", CPU::apple_a11, CPU::generic, 100000, Feature::apple_a11},
-    {"apple-a12", CPU::apple_a12, CPU::generic, 100000, Feature::apple_a12},
-    {"apple-a13", CPU::apple_a13, CPU::generic, 100000, Feature::apple_a13},
-    {"apple-a14", CPU::apple_a14, CPU::apple_a13, 120000, Feature::apple_a14},
-    {"apple-a15", CPU::apple_a15, CPU::apple_a14, 160000, Feature::apple_a15},
-    {"apple-a16", CPU::apple_a16, CPU::apple_a14, 160000, Feature::apple_a16},
-    {"apple-a17", CPU::apple_a17, CPU::apple_a16, 190000, Feature::apple_a17},
-    {"apple-m1", CPU::apple_m1, CPU::apple_a14, 130000, Feature::apple_m1},
-    {"apple-m2", CPU::apple_m2, CPU::apple_m1, 160000, Feature::apple_m2},
-    {"apple-m3", CPU::apple_m3, CPU::apple_m2, 180000, Feature::apple_m3},
-    {"apple-m4", CPU::apple_m4, CPU::apple_m3, 190000, Feature::apple_m4},
-    {"apple-s4", CPU::apple_s4, CPU::generic, 100000, Feature::apple_s4},
-    {"apple-s5", CPU::apple_s5, CPU::generic, 100000, Feature::apple_s5},
-    {"thunderx3t110", CPU::marvell_thunderx3t110, CPU::cavium_thunderx2t99, 110000,
-     Feature::marvell_thunderx3t110},
-};
-#else
-static constexpr size_t feature_sz = 3;
-static constexpr FeatureName feature_names[] = {
-#define JL_FEATURE_DEF(name, bit, llvmver) {#name, bit, llvmver},
-#define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) {str, bit, llvmver},
-#include "features_aarch32.h"
-#undef JL_FEATURE_DEF
-#undef JL_FEATURE_DEF_NAME
-};
-static constexpr uint32_t nfeature_names = sizeof(feature_names) / sizeof(FeatureName);
-
-template<typename... Args>
-static inline constexpr FeatureList<feature_sz> get_feature_masks(Args... args)
-{
-    return ::get_feature_masks<feature_sz>(args...);
-}
-
-#define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) JL_FEATURE_DEF(name, bit, llvmver)
-static constexpr auto feature_masks = get_feature_masks(
-#define JL_FEATURE_DEF(name, bit, llvmver) bit,
-#include "features_aarch32.h"
-#undef JL_FEATURE_DEF
-    -1);
-static const auto real_feature_masks =
-    feature_masks & FeatureList<feature_sz>{{UINT32_MAX, UINT32_MAX, 0}};
-
-namespace Feature {
-enum : uint32_t {
-#define JL_FEATURE_DEF(name, bit, llvmver) name = bit,
-#include "features_aarch32.h"
-#undef JL_FEATURE_DEF
-};
-#undef JL_FEATURE_DEF_NAME
-// This does not cover all dependencies (e.g. the ones that depends on arm versions)
-static constexpr FeatureDep deps[] = {
-    {neon, vfp3},
-    {vfp4, vfp3},
-    {crypto, neon},
-};
-
-// These are the real base requirements of the specific architectures
-constexpr auto _armv7m = get_feature_masks(v7, mclass, hwdiv);
-constexpr auto _armv7a = get_feature_masks(v7, aclass);
-constexpr auto _armv7r = get_feature_masks(v7, rclass);
-constexpr auto _armv8m = get_feature_masks(v7, v8, mclass, hwdiv);
-constexpr auto _armv8a = get_feature_masks(v7, v8, aclass, neon, vfp3, vfp4, d32,
-                                           hwdiv, hwdiv_arm);
-constexpr auto _armv8r = get_feature_masks(v7, v8, rclass, neon, vfp3, vfp4, d32,
-                                           hwdiv, hwdiv_arm);
-
-// Set `generic` to match the feature requirement of the `C` code.
-// we'll require at least these when compiling the sysimg.
-#if __ARM_ARCH >= 8
-#  if !defined(__ARM_ARCH_PROFILE)
-constexpr auto generic = get_feature_masks(v7, v8, hwdiv);
-#  elif __ARM_ARCH_PROFILE == 'A'
-constexpr auto generic = _armv8a;
-#  elif __ARM_ARCH_PROFILE == 'R'
-constexpr auto generic = _armv8r;
-#  elif __ARM_ARCH_PROFILE == 'M'
-constexpr auto generic = _armv8m;
-#  else
-constexpr auto generic = get_feature_masks(v7, v8, hwdiv);
-#  endif
-#elif __ARM_ARCH == 7
-#  if !defined(__ARM_ARCH_PROFILE)
-constexpr auto generic = get_feature_masks(v7);
-#  elif __ARM_ARCH_PROFILE == 'A'
-constexpr auto generic = _armv7a;
-#  elif __ARM_ARCH_PROFILE == 'R'
-constexpr auto generic = _armv7r;
-#  elif __ARM_ARCH_PROFILE == 'M'
-constexpr auto generic = _armv7m;
-#  else
-constexpr auto generic = get_feature_masks(v7);
-#  endif
-#else
-constexpr auto generic = get_feature_masks();
-#endif
-
-// All feature sets below should use or be or'ed with one of these (or generic).
-// This makes sure that, for example, the `generic` target on `armv7-a` binary is equivalent
-// to the `armv7-a` target.
-constexpr auto armv7m = generic | _armv7m;
-constexpr auto armv7a = generic | _armv7a;
-constexpr auto armv7r = generic | _armv7r;
-constexpr auto armv8m = generic | _armv8m;
-constexpr auto armv8a = generic | _armv8a;
-constexpr auto armv8r = generic | _armv8r;
-
-// armv7l
-constexpr auto arm_cortex_a5 = armv7a;
-constexpr auto arm_cortex_a7 = armv7a | get_feature_masks(vfp3, vfp4, neon);
-constexpr auto arm_cortex_a8 = armv7a | get_feature_masks(d32, vfp3, neon);
-constexpr auto arm_cortex_a9 = armv7a;
-constexpr auto arm_cortex_a12 = armv7a | get_feature_masks(d32, vfp3, vfp4, neon);
-constexpr auto arm_cortex_a15 = armv7a | get_feature_masks(d32, vfp3, vfp4, neon);
-constexpr auto arm_cortex_a17 = armv7a | get_feature_masks(d32, vfp3, vfp4, neon);
-constexpr auto arm_cortex_r4 = armv7r | get_feature_masks(vfp3, hwdiv);
-constexpr auto arm_cortex_r5 = armv7r | get_feature_masks(vfp3, hwdiv, hwdiv_arm);
-constexpr auto arm_cortex_r7 = armv7r | get_feature_masks(vfp3, hwdiv, hwdiv_arm);
-constexpr auto arm_cortex_r8 = armv7r | get_feature_masks(vfp3, hwdiv, hwdiv_arm);
-constexpr auto qualcomm_scorpion = armv7a | get_feature_masks(v7, aclass, vfp3, neon);
-constexpr auto qualcomm_krait = armv7a | get_feature_masks(vfp3, vfp4, neon, hwdiv, hwdiv_arm);
-constexpr auto apple_swift = armv7a | get_feature_masks(d32, vfp3, vfp4, neon, hwdiv, hwdiv_arm);
-constexpr auto marvell_pj4 = armv7a | get_feature_masks(vfp3);
-constexpr auto intel_3735d = armv7a | get_feature_masks(vfp3, neon);
-// armv8ml
-constexpr auto arm_cortex_m23 = armv8m; // unsupported
-constexpr auto arm_cortex_m33 = armv8m | get_feature_masks(v8_m_main); // unsupported
-// armv8l
-constexpr auto armv8a_crc = armv8a | get_feature_masks(crc);
-constexpr auto armv8_1a = armv8a_crc | get_feature_masks(v8_1a);
-constexpr auto armv8_2a = armv8_1a | get_feature_masks(v8_2a);
-constexpr auto armv8a_crc_crypto = armv8a_crc | get_feature_masks(crypto);
-constexpr auto armv8_2a_crypto = armv8_2a | get_feature_masks(crypto);
-constexpr auto armv8_3a = armv8_2a | get_feature_masks(v8_3a);
-constexpr auto armv8_3a_crypto = armv8_3a | get_feature_masks(crypto);
-constexpr auto armv8_4a = armv8_3a | get_feature_masks(v8_4a);
-constexpr auto armv8_4a_crypto = armv8_4a | get_feature_masks(crypto);
-constexpr auto armv8_5a = armv8_4a | get_feature_masks(v8_5a);
-constexpr auto armv8_5a_crypto = armv8_5a | get_feature_masks(crypto);
-constexpr auto armv8_6a = armv8_5a | get_feature_masks(v8_6a);
-constexpr auto armv8_6a_crypto = armv8_6a | get_feature_masks(crypto);
-
-constexpr auto arm_cortex_a32 = armv8a_crc;
-constexpr auto arm_cortex_r52 = armv8a_crc;
-constexpr auto arm_cortex_a35 = armv8a_crc;
-constexpr auto arm_cortex_a53 = armv8a_crc;
-constexpr auto arm_cortex_a55 = armv8_2a;
-constexpr auto arm_cortex_a57 = armv8a_crc;
-constexpr auto arm_cortex_a72 = armv8a_crc;
-constexpr auto arm_cortex_a73 = armv8a_crc;
-constexpr auto arm_cortex_a75 = armv8_2a;
-constexpr auto arm_cortex_a76 = armv8_2a;
-constexpr auto arm_cortex_a77 = armv8_2a;
-constexpr auto arm_cortex_a78 = armv8_2a;
-constexpr auto arm_cortex_x1 = armv8_2a;
-constexpr auto arm_neoverse_n1 = armv8_2a;
-constexpr auto arm_neoverse_v1 = armv8_4a;
-constexpr auto arm_neoverse_n2 = armv8_5a;
-constexpr auto nvidia_denver1 = armv8a; // TODO? (crc, crypto)
-constexpr auto nvidia_denver2 = armv8a_crc_crypto;
-constexpr auto apm_xgene1 = armv8a;
-constexpr auto apm_xgene2 = armv8a; // TODO?
-constexpr auto apm_xgene3 = armv8a; // TODO?
-constexpr auto qualcomm_kyro = armv8a_crc_crypto;
-constexpr auto qualcomm_falkor = armv8a_crc_crypto;
-constexpr auto qualcomm_saphira = armv8_3a_crypto;
-constexpr auto samsung_exynos_m1 = armv8a_crc_crypto;
-constexpr auto samsung_exynos_m2 = armv8a_crc_crypto;
-constexpr auto samsung_exynos_m3 = armv8a_crc_crypto;
-constexpr auto samsung_exynos_m4 = armv8_2a_crypto;
-constexpr auto samsung_exynos_m5 = samsung_exynos_m4;
-constexpr auto apple_a7 = armv8a_crc_crypto;
-
-}
-
-static constexpr CPUSpec<CPU, feature_sz> cpus[] = {
-    {"generic", CPU::generic, CPU::generic, 0, Feature::generic},
-    // armv6
-    {"mpcore", CPU::arm_mpcore, CPU::generic, 0, Feature::generic},
-    {"arm1136jf-s", CPU::arm_1136jf_s, CPU::generic, 0, Feature::generic},
-    {"arm1156t2f-s", CPU::arm_1156t2f_s, CPU::generic, 0, Feature::generic},
-    {"arm1176jzf-s", CPU::arm_1176jzf_s, CPU::generic, 0, Feature::generic},
-    {"cortex-m0", CPU::arm_cortex_m0, CPU::generic, 0, Feature::generic},
-    {"cortex-m1", CPU::arm_cortex_m1, CPU::generic, 0, Feature::generic},
-    // armv7ml
-    {"armv7-m", CPU::armv7_m, CPU::generic, 0, Feature::armv7m},
-    {"armv7e-m", CPU::armv7e_m, CPU::generic, 0, Feature::armv7m},
-    {"cortex-m3", CPU::arm_cortex_m3, CPU::generic, 0, Feature::armv7m},
-    {"cortex-m4", CPU::arm_cortex_m4, CPU::generic, 0, Feature::armv7m},
-    {"cortex-m7", CPU::arm_cortex_m7, CPU::generic, 0, Feature::armv7m},
-    // armv7l
-    {"armv7-a", CPU::armv7_a, CPU::generic, 0, Feature::armv7a},
-    {"armv7-r", CPU::armv7_r, CPU::generic, 0, Feature::armv7r},
-    {"cortex-a5", CPU::arm_cortex_a5, CPU::generic, 0, Feature::arm_cortex_a5},
-    {"cortex-a7", CPU::arm_cortex_a7, CPU::generic, 0, Feature::arm_cortex_a7},
-    {"cortex-a8", CPU::arm_cortex_a8, CPU::generic, 0, Feature::arm_cortex_a8},
-    {"cortex-a9", CPU::arm_cortex_a9, CPU::generic, 0, Feature::arm_cortex_a9},
-    {"cortex-a12", CPU::arm_cortex_a12, CPU::generic, 0, Feature::arm_cortex_a12},
-    {"cortex-a15", CPU::arm_cortex_a15, CPU::generic, 0, Feature::arm_cortex_a15},
-    {"cortex-a17", CPU::arm_cortex_a17, CPU::generic, 0, Feature::arm_cortex_a17},
-    {"cortex-r4", CPU::arm_cortex_r4, CPU::generic, 0, Feature::arm_cortex_r4},
-    {"cortex-r5", CPU::arm_cortex_r5, CPU::generic, 0, Feature::arm_cortex_r5},
-    {"cortex-r7", CPU::arm_cortex_r7, CPU::generic, 0, Feature::arm_cortex_r7},
-    {"cortex-r8", CPU::arm_cortex_r8, CPU::generic, 0, Feature::arm_cortex_r8},
-    {"scorpion", CPU::qualcomm_scorpion, CPU::armv7_a, UINT32_MAX, Feature::qualcomm_scorpion},
-    {"krait", CPU::qualcomm_krait, CPU::generic, 0, Feature::qualcomm_krait},
-    {"swift", CPU::apple_swift, CPU::generic, 0, Feature::apple_swift},
-    {"pj4", CPU::marvell_pj4, CPU::armv7_a, UINT32_MAX, Feature::marvell_pj4},
-    {"3735d", CPU::intel_3735d, CPU::armv7_a, UINT32_MAX, Feature::intel_3735d},
-
-    // armv8ml
-    {"armv8-m.base", CPU::armv8_m_base, CPU::generic, 0, Feature::armv8m},
-    {"armv8-m.main", CPU::armv8_m_main, CPU::generic, 0, Feature::armv8m},
-    {"cortex-m23", CPU::arm_cortex_m23, CPU::armv8_m_base, 0, Feature::arm_cortex_m23},
-    {"cortex-m33", CPU::arm_cortex_m33, CPU::armv8_m_main, 0, Feature::arm_cortex_m33},
-
-    // armv8l
-    {"armv8-a", CPU::armv8_a, CPU::generic, 0, Feature::armv8a},
-    {"armv8-r", CPU::armv8_r, CPU::generic, 0, Feature::armv8r},
-    {"armv8.1-a", CPU::armv8_1_a, CPU::generic, 0, Feature::armv8_1a},
-    {"armv8.2-a", CPU::armv8_2_a, CPU::generic, 0, Feature::armv8_2a},
-    {"armv8.3-a", CPU::armv8_3_a, CPU::generic, 0, Feature::armv8_3a},
-    {"armv8.4-a", CPU::armv8_4_a, CPU::generic, 0, Feature::armv8_4a},
-    {"armv8.5-a", CPU::armv8_5_a, CPU::generic, 0, Feature::armv8_5a},
-    {"armv8.6_a", CPU::armv8_6_a, CPU::generic, 0, Feature::armv8_6a},
-    {"cortex-a32", CPU::arm_cortex_a32, CPU::generic, 0, Feature::arm_cortex_a32},
-    {"cortex-r52", CPU::arm_cortex_r52, CPU::generic, 0, Feature::arm_cortex_r52},
-    {"cortex-a35", CPU::arm_cortex_a35, CPU::generic, 0, Feature::arm_cortex_a35},
-    {"cortex-a53", CPU::arm_cortex_a53, CPU::generic, 0, Feature::arm_cortex_a53},
-    {"cortex-a55", CPU::arm_cortex_a55, CPU::generic, 0, Feature::arm_cortex_a55},
-    {"cortex-a57", CPU::arm_cortex_a57, CPU::generic, 0, Feature::arm_cortex_a57},
-    {"cortex-a72", CPU::arm_cortex_a72, CPU::generic, 0, Feature::arm_cortex_a72},
-    {"cortex-a73", CPU::arm_cortex_a73, CPU::generic, 0, Feature::arm_cortex_a73},
-    {"cortex-a75", CPU::arm_cortex_a75, CPU::generic, 0, Feature::arm_cortex_a75},
-    {"cortex-a76", CPU::arm_cortex_a76, CPU::generic, 0, Feature::arm_cortex_a76},
-    {"cortex-a76ae", CPU::arm_cortex_a76ae, CPU::generic, 0, Feature::arm_cortex_a76},
-    {"cortex-a77", CPU::arm_cortex_a77, CPU::arm_cortex_a76, 110000, Feature::arm_cortex_a77},
-    {"cortex-a78", CPU::arm_cortex_a78, CPU::arm_cortex_a77, 110000, Feature::arm_cortex_a78},
-    {"cortex-x1", CPU::arm_cortex_x1, CPU::arm_cortex_a78, 110000, Feature::arm_cortex_x1},
-    {"neoverse-n1", CPU::arm_neoverse_n1, CPU::arm_cortex_a76, 100000, Feature::arm_neoverse_n1},
-    {"neoverse-v1", CPU::arm_neoverse_v1, CPU::arm_neoverse_n1, UINT32_MAX, Feature::arm_neoverse_v1},
-    {"neoverse-n2", CPU::arm_neoverse_n2, CPU::arm_neoverse_n1, UINT32_MAX, Feature::arm_neoverse_n2},
-    {"denver1", CPU::nvidia_denver1, CPU::arm_cortex_a53, UINT32_MAX, Feature::nvidia_denver1},
-    {"denver2", CPU::nvidia_denver2, CPU::arm_cortex_a57, UINT32_MAX, Feature::nvidia_denver2},
-    {"xgene1", CPU::apm_xgene1, CPU::armv8_a, UINT32_MAX, Feature::apm_xgene1},
-    {"xgene2", CPU::apm_xgene2, CPU::armv8_a, UINT32_MAX, Feature::apm_xgene2},
-    {"xgene3", CPU::apm_xgene3, CPU::armv8_a, UINT32_MAX, Feature::apm_xgene3},
-    {"kyro", CPU::qualcomm_kyro, CPU::armv8_a, UINT32_MAX, Feature::qualcomm_kyro},
-    {"falkor", CPU::qualcomm_falkor, CPU::armv8_a, UINT32_MAX, Feature::qualcomm_falkor},
-    {"saphira", CPU::qualcomm_saphira, CPU::armv8_a, UINT32_MAX, Feature::qualcomm_saphira},
-    {"exynos-m1", CPU::samsung_exynos_m1, CPU::generic, UINT32_MAX, Feature::samsung_exynos_m1},
-    {"exynos-m2", CPU::samsung_exynos_m2, CPU::generic, UINT32_MAX, Feature::samsung_exynos_m2},
-    {"exynos-m3", CPU::samsung_exynos_m3, CPU::generic, 0, Feature::samsung_exynos_m3},
-    {"exynos-m4", CPU::samsung_exynos_m4, CPU::generic, 0, Feature::samsung_exynos_m4},
-    {"exynos-m5", CPU::samsung_exynos_m5, CPU::samsung_exynos_m4, 110000, Feature::samsung_exynos_m5},
-    {"apple-a7", CPU::apple_a7, CPU::generic, 0, Feature::apple_a7},
-};
-#endif
-static constexpr size_t ncpu_names = sizeof(cpus) / sizeof(cpus[0]);
-
-static inline const CPUSpec<CPU,feature_sz> *find_cpu(uint32_t cpu)
-{
-    return ::find_cpu(cpu, cpus, ncpu_names);
-}
-
-static inline const CPUSpec<CPU,feature_sz> *find_cpu(llvm::StringRef name)
-{
-    return ::find_cpu(name, cpus, ncpu_names);
-}
-
-static inline const char *find_cpu_name(uint32_t cpu)
-{
-    return ::find_cpu_name(cpu, cpus, ncpu_names);
-}
-
-#if defined _CPU_AARCH64_ && defined _OS_DARWIN_
-
-static NOINLINE std::pair<uint32_t,FeatureList<feature_sz>> _get_host_cpu()
-{
-    using namespace llvm;
-    char buffer[128];
-    size_t bufferlen = 128;
-    sysctlbyname("machdep.cpu.brand_string",&buffer,&bufferlen,NULL,0);
-    StringRef cpu_name(buffer);
-    if (cpu_name.find("M1") != StringRef ::npos)
-        return std::make_pair((uint32_t)CPU::apple_m1, Feature::apple_m1);
-    else if (cpu_name.find("M2") != StringRef ::npos)
-        return std::make_pair((uint32_t)CPU::apple_m2, Feature::apple_m2);
-    else if (cpu_name.find("M3") != StringRef ::npos)
-        return std::make_pair((uint32_t)CPU::apple_m3, Feature::apple_m3);
-    else if (cpu_name.find("M4") != StringRef ::npos)
-        return std::make_pair((uint32_t)CPU::apple_m4, Feature::apple_m4);
-    else
-        return std::make_pair((uint32_t)CPU::apple_m1, Feature::apple_m1);
-}
-
-#else
-
-// auxval reader
-
-#ifndef AT_HWCAP
-#  define AT_HWCAP 16
-#endif
-#ifndef AT_HWCAP2
-#  define AT_HWCAP2 26
-#endif
-
-#if defined(_OS_FREEBSD_)
-static inline unsigned long jl_getauxval(unsigned long type)
-{
-    unsigned long val;
-    if (elf_aux_info((int)type, &val, sizeof(val)) != 0) {
-        return 0;
-    }
-    return val;
-}
-#elif defined(USE_DYN_GETAUXVAL)
-static unsigned long getauxval_procfs(unsigned long type)
-{
-    int fd = open("/proc/self/auxv", O_RDONLY);
-    if (fd == -1)
-        return 0;
-    unsigned long val = 0;
-    unsigned long buff[2];
-    while (read(fd, buff, sizeof(buff)) == sizeof(buff)) {
-        if (buff[0] == 0)
-            break;
-        if (buff[0] == type) {
-            val = buff[1];
-            break;
-        }
-    }
-    close(fd);
-    return val;
-}
-
-static inline unsigned long jl_getauxval(unsigned long type)
-{
-    // First, try resolving getauxval in libc
-    auto libc = jl_dlopen(nullptr, JL_RTLD_LOCAL);
-    static unsigned long (*getauxval_p)(unsigned long) = NULL;
-    if (getauxval_p == NULL && jl_dlsym(libc, "getauxval", (void **)&getauxval_p, 0, 0)) {
-        return getauxval_p(type);
-    }
-
-    // If we couldn't resolve it, use procfs.
-    return getauxval_procfs(type);
-}
-#else
-static inline unsigned long jl_getauxval(unsigned long type)
-{
-    return getauxval(type);
-}
-#endif
-
-struct CPUID {
-    uint8_t implementer;
-    uint8_t variant;
-    uint16_t part;
-    bool operator<(const CPUID &right) const
-    {
-        if (implementer < right.implementer)
-            return true;
-        if (implementer > right.implementer)
-            return false;
-        if (part < right.part)
-            return true;
-        if (part > right.part)
-            return false;
-        return variant < right.variant;
-    }
-};
-
-// /sys/devices/system/cpu/cpu<n>/regs/identification/midr_el1 reader
-static inline void get_cpuinfo_sysfs(std::set<CPUID> &res)
-{
-    // This only works on a 64bit 4.7+ kernel
-    auto dir = opendir("/sys/devices/system/cpu");
-    if (!dir)
-        return;
-    while (auto entry = readdir(dir)) {
-        if (entry->d_type != DT_DIR)
-            continue;
-        if (strncmp(entry->d_name, "cpu", 3) != 0)
-            continue;
-        std::string stm;
-        llvm::raw_string_ostream(stm) << "/sys/devices/system/cpu/" << entry->d_name << "/regs/identification/midr_el1";
-        std::ifstream file(stm);
-        if (!file)
-            continue;
-        uint64_t val = 0;
-        file >> std::hex >> val;
-        if (!file)
-            continue;
-        CPUID cpuid = {
-            uint8_t(val >> 24),
-            uint8_t((val >> 20) & 0xf),
-            uint16_t((val >> 4) & 0xfff)
-        };
-        res.insert(cpuid);
-    }
-    closedir(dir);
-}
-
-// Use an external template since lambda's can't be templated in C++11
-template<typename T, typename F>
-static inline bool try_read_procfs_line(llvm::StringRef line, const char *prefix, T &out,
-                                        bool &flag, F &&reset)
-{
-    if (!line.starts_with(prefix))
-        return false;
-    if (flag)
-        reset();
-    flag = line.substr(strlen(prefix)).ltrim("\t :").getAsInteger(0, out);
-    return true;
-}
-
-// /proc/cpuinfo reader
-static inline void get_cpuinfo_procfs(std::set<CPUID> &res)
-{
-    std::ifstream file("/proc/cpuinfo");
-    CPUID cpuid = {0, 0, 0};
-    bool impl = false;
-    bool part = false;
-    bool var = false;
-    auto reset = [&] () {
-        if (impl && part)
-            res.insert(cpuid);
-        impl = false;
-        part = false;
-        var = false;
-        memset(&cpuid, 0, sizeof(cpuid));
-    };
-    for (std::string line; std::getline(file, line);) {
-        if (line.empty()) {
-            reset();
-            continue;
-        }
-        try_read_procfs_line(line, "CPU implementer", cpuid.implementer, impl, reset) ||
-            try_read_procfs_line(line, "CPU variant", cpuid.variant, var, reset) ||
-            try_read_procfs_line(line, "CPU part", cpuid.part, part, reset);
-    }
-    reset();
-}
-
-static std::set<CPUID> get_cpuinfo(void)
-{
-    std::set<CPUID> res;
-    get_cpuinfo_sysfs(res);
-    if (res.empty())
-        get_cpuinfo_procfs(res);
-    return res;
-}
-
-static CPU get_cpu_name(CPUID cpuid)
-{
-    switch (cpuid.implementer) {
-    case 0x41: // 'A': ARM
-        switch (cpuid.part) {
-        case 0xb02: return CPU::arm_mpcore;
-        case 0xb36: return CPU::arm_1136jf_s;
-        case 0xb56: return CPU::arm_1156t2f_s;
-        case 0xb76: return CPU::arm_1176jzf_s;
-        case 0xc05: return CPU::arm_cortex_a5;
-        case 0xc07: return CPU::arm_cortex_a7;
-        case 0xc08: return CPU::arm_cortex_a8;
-        case 0xc09: return CPU::arm_cortex_a9;
-        case 0xc0d: return CPU::arm_cortex_a12;
-        case 0xc0f: return CPU::arm_cortex_a15;
-        case 0xc0e: return CPU::arm_cortex_a17;
-        case 0xc14: return CPU::arm_cortex_r4;
-        case 0xc15: return CPU::arm_cortex_r5;
-        case 0xc17: return CPU::arm_cortex_r7;
-        case 0xc18: return CPU::arm_cortex_r8;
-        case 0xc20: return CPU::arm_cortex_m0;
-        case 0xc21: return CPU::arm_cortex_m1;
-        case 0xc23: return CPU::arm_cortex_m3;
-        case 0xc24: return CPU::arm_cortex_m4;
-        case 0xc27: return CPU::arm_cortex_m7;
-        case 0xd01: return CPU::arm_cortex_a32;
-        case 0xd02: return CPU::arm_cortex_a34;
-        case 0xd03: return CPU::arm_cortex_a53;
-        case 0xd04: return CPU::arm_cortex_a35;
-        case 0xd05: return CPU::arm_cortex_a55;
-        case 0xd06: return CPU::arm_cortex_a65;
-        case 0xd07: return CPU::arm_cortex_a57;
-        case 0xd08: return CPU::arm_cortex_a72;
-        case 0xd09: return CPU::arm_cortex_a73;
-        case 0xd0a: return CPU::arm_cortex_a75;
-        case 0xd0b: return CPU::arm_cortex_a76;
-        case 0xd0c: return CPU::arm_neoverse_n1;
-        case 0xd0d: return CPU::arm_cortex_a77;
-        case 0xd0e: return CPU::arm_cortex_a76ae;
-        case 0xd13: return CPU::arm_cortex_r52;
-        case 0xd20: return CPU::arm_cortex_m23;
-        case 0xd21: return CPU::arm_cortex_m33;
-            // case 0xd22: return CPU::arm_cortex_m55;
-        case 0xd40: return CPU::arm_neoverse_v1;
-        case 0xd41: return CPU::arm_cortex_a78;
-        case 0xd43: return CPU::arm_cortex_a65ae;
-        case 0xd44: return CPU::arm_cortex_x1;
-        case 0xd49: return CPU::arm_neoverse_n2;
-        case 0xd4a: return CPU::arm_neoverse_e1;
-        default: return CPU::generic;
-        }
-    case 0x42: // 'B': Broadcom (Cavium)
-        switch (cpuid.part) {
-            // case 0x00f: return CPU::broadcom_brahma_b15;
-            // case 0x100: return CPU::broadcom_brahma_b53;
-        case 0x516: return CPU::cavium_thunderx2t99p1;
-        default: return CPU::generic;
-        }
-    case 0x43: // 'C': Cavium
-        switch (cpuid.part) {
-        case 0xa0: return CPU::cavium_thunderx;
-        case 0xa1:
-            if (cpuid.variant == 0)
-                return CPU::cavium_thunderx88p1;
-            return CPU::cavium_thunderx88;
-        case 0xa2: return CPU::cavium_thunderx81;
-        case 0xa3: return CPU::cavium_thunderx83;
-        case 0xaf: return CPU::cavium_thunderx2t99;
-        case 0xb0: return CPU::cavium_octeontx2;
-        case 0xb1: return CPU::cavium_octeontx2t98;
-        case 0xb2: return CPU::cavium_octeontx2t96;
-        case 0xb3: return CPU::cavium_octeontx2f95;
-        case 0xb4: return CPU::cavium_octeontx2f95n;
-        case 0xb5: return CPU::cavium_octeontx2f95mm;
-        case 0xb8: return CPU::marvell_thunderx3t110;
-        default: return CPU::generic;
-        }
-    case 0x46: // 'F': Fujitsu
-        switch (cpuid.part) {
-        case 0x1: return CPU::fujitsu_a64fx;
-        default: return CPU::generic;
-        }
-    case 0x48: // 'H': HiSilicon
-        switch (cpuid.part) {
-        case 0xd01: return CPU::hisilicon_tsv110;
-        case 0xd40: return CPU::arm_cortex_a76; // Kirin 980
-        default: return CPU::generic;
-        }
-    case 0x4e: // 'N': NVIDIA
-        switch (cpuid.part) {
-        case 0x000: return CPU::nvidia_denver1;
-        case 0x003: return CPU::nvidia_denver2;
-        case 0x004: return CPU::nvidia_carmel;
-        default: return CPU::generic;
-        }
-    case 0x50: // 'P': AppliedMicro
-        // x-gene 2
-        // x-gene 3
-        switch (cpuid.part) {
-        case 0x000: return CPU::apm_xgene1;
-        default: return CPU::generic;
-        }
-    case 0x51: // 'Q': Qualcomm
-        switch (cpuid.part) {
-        case 0x00f:
-        case 0x02d:
-            return CPU::qualcomm_scorpion;
-        case 0x04d:
-        case 0x06f:
-            return CPU::qualcomm_krait;
-        case 0x201: // silver
-        case 0x205: // gold
-        case 0x211: // silver
-            return CPU::qualcomm_kyro;
-            // kryo 2xx
-        case 0x800: // gold
-            return CPU::arm_cortex_a73;
-        case 0x801: // silver
-            return CPU::arm_cortex_a53;
-            // kryo 3xx
-        case 0x802: // gold
-            return CPU::arm_cortex_a75;
-        case 0x803: // silver
-            return CPU::arm_cortex_a55;
-            // kryo 4xx
-        case 0x804: // gold
-            return CPU::arm_cortex_a76;
-        case 0x805: // silver
-            return CPU::arm_cortex_a55;
-            // kryo 5xx seems to be using ID for cortex-a77 directly
-        case 0xc00:
-            return CPU::qualcomm_falkor;
-        case 0xc01:
-            return CPU::qualcomm_saphira;
-        default: return CPU::generic;
-        }
-    case 0x53: // 'S': Samsung
-        if (cpuid.part == 1) {
-            if (cpuid.variant == 4)
-                return CPU::samsung_exynos_m2;
-            return CPU::samsung_exynos_m1;
-        }
-        if (cpuid.variant != 1)
-            return CPU::generic;
-        switch (cpuid.part) {
-        case 0x2: return CPU::samsung_exynos_m3;
-        case 0x3: return CPU::samsung_exynos_m4;
-        case 0x4: return CPU::samsung_exynos_m5;
-        default: return CPU::generic;
-        }
-    case 0x56: // 'V': Marvell
-        switch (cpuid.part) {
-        case 0x581:
-        case 0x584:
-            return CPU::marvell_pj4;
-        default: return CPU::generic;
-        }
-    case 0x61: // 'a': Apple
-        // Data here is partially based on these sources:
-        // https://github.com/apple-oss-distributions/xnu/blob/main/osfmk/arm/cpuid.h
-        // https://asahilinux.org/docs/hw/soc/soc-codenames/#socs
-        // https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/AArch64/AArch64Processors.td
-        switch (cpuid.part) {
-        case 0x0: // Swift
-            return CPU::apple_swift;
-        case 0x1: // Cyclone
-            return CPU::apple_a7;
-        case 0x2: // Typhoon
-        case 0x3: // Typhoo/Capri
-            return CPU::apple_a8;
-        case 0x4: // Twister
-        case 0x5: // Twister/Elba/Malta
-            return CPU::apple_a9;
-        case 0x6: // Hurricane
-        case 0x7: // Hurricane/Myst
-            return CPU::apple_a10;
-        case 0x8: // Monsoon
-        case 0x9: // Mistral
-            return CPU::apple_a11;
-        case 0xB: // Vortex
-        case 0xC: // Tempest
-        case 0x10: // A12X, Vortex Aruba
-        case 0x11: // A12X, Tempest Aruba
-            return CPU::apple_a12;
-        case 0xF: // Tempest M9
-            return CPU::apple_s4;
-        case 0x12: // H12 Cebu p-Core "Lightning"
-        case 0x13: // H12 Cebu e-Core "Thunder"
-            return CPU::apple_a13;
-        case 0x20: // H13 Sicily e-Core "Icestorm"
-        case 0x21: // H13 Sicily p-Core "Firestorm"
-            return CPU::apple_a14;
-        case 0x22: // H13G Tonga e-Core "Icestorm" used in Apple M1
-        case 0x23: // H13G Tonga p-Core "Firestorm" used in Apple M1
-        case 0x24: // H13J Jade Chop e-Core "Icestorm" used in Apple M1 Pro
-        case 0x25: // H13J Jade Chop p-Core "Firestorm" used in Apple M1 Pro
-        case 0x28: // H13J Jade Die e-Core "Icestorm" used in Apple M1 Max / Ultra
-        case 0x29: // H13J Jade Die p-Core "Firestorm" used in Apple M1 Max / Ultra
-            return CPU::apple_m1;
-        case 0x30: // H14 Ellis e-Core "Blizzard" used in Apple A15
-        case 0x31: // H14 Ellis p-Core "Avalanche" used in Apple A15
-            return CPU::apple_a15;
-        case 0x32: // H14G Staten e-Core "Blizzard" used in Apple M2
-        case 0x33: // H14G Staten p-Core "Avalanche" used in Apple M2
-        case 0x34: // H14S Rhodes Chop e-Core "Blizzard" used in Apple M2 Pro
-        case 0x35: // H14S Rhodes Chop p-Core "Avalanche" used in Apple M2 Pro
-        case 0x38: // H14C Rhodes Die e-Core "Blizzard" used in Apple M2 Max / Ultra
-        case 0x39: // H14C Rhodes Die p-Core "Avalanche" used in Apple M2 Max / Ultra
-            return CPU::apple_m2;
-        case 0x40: // H15 Crete e-Core "Sawtooth" used in Apple A16
-        case 0x41: // H15 Crete p-Core "Everest" used in Apple A16
-            return CPU::apple_a16;
-        case 0x42: // H15 Ibiza e-Core "Sawtooth" used in Apple M3
-        case 0x43: // H15 Ibiza p-Core "Everest" used in Apple M3
-        case 0x44: // H15 Lobos e-Core "Sawtooth" used in Apple M3 Pro
-        case 0x45: // H15 Lobos p-Core "Everest" used in Apple M3 Pro
-        case 0x49: // H15 Palma e-Core "Sawtooth" used in Apple M3 Max
-        case 0x48: // H15 Palma p-Core "Everest" used in Apple M3 Max
-            return CPU::apple_m3;
-        //case 0x46: // M11 e-Core "Sawtooth" used in Apple S9
-        //case 0x47:  does not exist
-            //return CPU::apple_s9;
-        case 0x50: // H15 Coll e-Core "Sawtooth" used in Apple A17 Pro
-        case 0x51: // H15 Coll p-Core "Everest" used in Apple A17 Pro
-            return CPU::apple_a17;
-        case 0x52: // H16G Donan e-Core used in Apple M4
-        case 0x53: // H16H Donan p-Core used in Apple M4
-        case 0x54: // H16S Brava S e-Core used in Apple M4 Pro
-        case 0x55: // H16S Brava S p-Core used in Apple M4 Pro
-        case 0x58: // H16C Brava C e-Core used in Apple M4 Max
-        case 0x59: // H16C Brava C p-Core used in Apple M4 Max
-            return CPU::apple_m4;
-        //case 0x60: // H17P Tahiti e-Core used in Apple A18 Pro
-        //case 0x61: // H17P Tahiti p-Core used in Apple A18 Pro
-        //case 0x6a: // H17A Tupai e-Core used in Apple A18
-        //case 0x6b: // H17A Tupai p-Core used in Apple A18
-            //return CPU::apple_a18;
-        default: return CPU::generic;
-        }
-    case 0x68: // 'h': Huaxintong Semiconductor
-        switch (cpuid.part) {
-        case 0x0: return CPU::hxt_phecda;
-        default: return CPU::generic;
-        }
-    case 0x69: // 'i': Intel
-        switch (cpuid.part) {
-        case 0x001: return CPU::intel_3735d;
-        default: return CPU::generic;
-        }
-    default:
-        return CPU::generic;
-    }
-}
-
-
-
-
-namespace {
-
-struct arm_arch {
-    int version;
-    char klass;
-    constexpr bool mclass() const { return klass == 'M'; }
-};
-
-}
-
-static arm_arch get_elf_arch(void)
-{
-#ifdef _CPU_AARCH64_
-    return {8, 'A'};
-#else
-    int ver = 0;
-    char profile = 0;
-    struct utsname name;
-    if (uname(&name) >= 0) {
-        // name.machine is the elf_platform in the kernel.
-        if (strcmp(name.machine, "armv6l") == 0) {
-            ver = 6;
-        }
-        else if (strcmp(name.machine, "armv7l") == 0) {
-            ver = 7;
-        }
-        else if (strcmp(name.machine, "armv7ml") == 0) {
-            ver = 7;
-            profile = 'M';
-        }
-        else if (strcmp(name.machine, "armv8l") == 0 || strcmp(name.machine, "aarch64") == 0) {
-            ver = 8;
-        }
-    }
-    if (__ARM_ARCH > ver)
-        ver = __ARM_ARCH;
-#  if __ARM_ARCH > 6 && defined(__ARM_ARCH_PROFILE)
-    profile = __ARM_ARCH_PROFILE;
-#  endif
-    return {ver, profile};
-#endif
-}
-
-static arm_arch feature_arch_version(const FeatureList<feature_sz> &feature)
-{
-#ifdef _CPU_AARCH64_
-    return {8, 'A'};
-#else
-    int ver;
-    if (test_nbit(feature, Feature::v8)) {
-        ver = 8;
-    }
-    else if (test_nbit(feature, Feature::v7)) {
-        ver = 7;
-    }
-    else {
-        return {6, 0};
-    }
-    if (test_nbit(feature, Feature::mclass)) {
-        return {ver, 'M'};
-    }
-    else if (test_nbit(feature, Feature::rclass)) {
-        return {ver, 'R'};
-    }
-    else if (test_nbit(feature, Feature::aclass)) {
-        return {ver, 'A'};
-    }
-    return {ver, 0};
-#endif
-}
-
-static CPU generic_for_arch(arm_arch arch)
-{
-#ifdef _CPU_AARCH64_
-    return CPU::generic;
-#else
-#  if defined(__ARM_ARCH_PROFILE)
-    char klass = __ARM_ARCH_PROFILE;
-#  else
-    char klass = arch.klass;
-#  endif
-    if (arch.version >= 8) {
-        if (klass == 'M') {
-            return CPU::armv8_m_base;
-        }
-        else if (klass == 'R') {
-            return CPU::armv8_r;
-        }
-        else {
-            return CPU::armv8_a;
-        }
-    }
-    else if (arch.version == 7) {
-        if (klass == 'M') {
-            return CPU::armv7_m;
-        }
-        else if (klass == 'R') {
-            return CPU::armv7_r;
-        }
-        else {
-            return CPU::armv7_a;
-        }
-    }
-    return CPU::generic;
-#endif
-}
-
-static bool check_cpu_arch_ver(uint32_t cpu, arm_arch arch)
-{
-    auto spec = find_cpu(cpu);
-    // This happens on AArch64 and indicates that the cpu name isn't a valid aarch64 CPU
-    if (!spec)
-        return false;
-    auto feature_arch = feature_arch_version(spec->features);
-    if (arch.mclass() != feature_arch.mclass())
-        return false;
-    if (arch.version > feature_arch.version)
-        return false;
-    return true;
-}
-
-static void shrink_big_little(llvm::SmallVectorImpl<std::pair<uint32_t,CPUID>> &list,
-                              const CPU *cpus, uint32_t ncpu)
-{
-    auto find = [&] (uint32_t name) {
-        for (uint32_t i = 0; i < ncpu; i++) {
-            if (cpus[i] == CPU(name)) {
-                return (int)i;
-            }
-        }
-        return -1;
-    };
-    int maxidx = -1;
-    for (auto &ele: list) {
-        int idx = find(ele.first);
-        if (idx > maxidx) {
-            maxidx = idx;
-        }
-    }
-    if (maxidx >= 0) {
-        list.erase(std::remove_if(list.begin(), list.end(), [&] (std::pair<uint32_t,CPUID> &ele) {
-                    int idx = find(ele.first);
-                    return idx != -1 && idx < maxidx;
-                }), list.end());
-    }
-}
-
-static NOINLINE std::pair<uint32_t,FeatureList<feature_sz>> _get_host_cpu()
-{
-    FeatureList<feature_sz> features = {};
-    // Here we assume that only the lower 32bit are used on aarch64
-    // Change the cast here when that's not the case anymore (and when there's features in the
-    // high bits that we want to detect).
-    features[0] = (uint32_t)jl_getauxval(AT_HWCAP);
-    features[1] = (uint32_t)jl_getauxval(AT_HWCAP2);
-#ifdef _CPU_AARCH64_
-    if (test_nbit(features, 31)) // HWCAP_PACG
-        set_bit(features, Feature::pauth, true);
-#endif
-    auto cpuinfo = get_cpuinfo();
-    auto arch = get_elf_arch();
-#ifdef _CPU_ARM_
-    if (arch.version >= 7) {
-        if (arch.klass == 'M') {
-            set_bit(features, Feature::mclass, true);
-        }
-        else if (arch.klass == 'R') {
-            set_bit(features, Feature::rclass, true);
-        }
-        else if (arch.klass == 'A') {
-            set_bit(features, Feature::aclass, true);
-        }
-    }
-    switch (arch.version) {
-    case 8:
-        set_bit(features, Feature::v8, true);
-        JL_FALLTHROUGH;
-    case 7:
-        set_bit(features, Feature::v7, true);
-        break;
-    default:
-        break;
-    }
-#endif
-
-    std::set<uint32_t> cpus;
-    llvm::SmallVector<std::pair<uint32_t,CPUID>, 0> list;
-    // Ideally the feature detection above should be enough.
-    // However depending on the kernel version not all features are available
-    // and it's also impossible to detect the ISA version which contains
-    // some features not yet exposed by the kernel.
-    // We therefore try to get a more complete feature list from the CPU name.
-    // Since it is possible to pair cores that have different feature set
-    // (Observed for exynos 9810 with exynos-m3 + cortex-a55) we'll compute
-    // an intersection of the known features from each core.
-    // If there's a core that we don't recognize, treat it as generic.
-    bool extra_initialized = false;
-    FeatureList<feature_sz> extra_features = {};
-    for (auto info: cpuinfo) {
-        auto name = (uint32_t)get_cpu_name(info);
-        if (name == 0) {
-            // no need to clear the feature set if it wasn't initialized
-            if (extra_initialized)
-                extra_features = FeatureList<feature_sz>{};
-            extra_initialized = true;
-            continue;
-        }
-        if (!check_cpu_arch_ver(name, arch))
-            continue;
-        if (cpus.insert(name).second) {
-            if (extra_initialized) {
-                extra_features = extra_features & find_cpu(name)->features;
-            }
-            else {
-                extra_initialized = true;
-                extra_features = find_cpu(name)->features;
-            }
-            list.emplace_back(name, info);
-        }
-    }
-    features = features | extra_features;
-
-    // Not all elements/pairs are valid
-    static constexpr CPU v8order[] = {
-        CPU::arm_cortex_a35,
-        CPU::arm_cortex_a53,
-        CPU::arm_cortex_a55,
-        CPU::arm_cortex_a57,
-        CPU::arm_cortex_a72,
-        CPU::arm_cortex_a73,
-        CPU::arm_cortex_a75,
-        CPU::arm_cortex_a76,
-        CPU::arm_neoverse_n1,
-        CPU::arm_neoverse_n2,
-        CPU::arm_neoverse_v1,
-        CPU::nvidia_denver2,
-        CPU::nvidia_carmel,
-        CPU::samsung_exynos_m1,
-        CPU::samsung_exynos_m2,
-        CPU::samsung_exynos_m3,
-        CPU::samsung_exynos_m4,
-        CPU::samsung_exynos_m5,
-    };
-    shrink_big_little(list, v8order, sizeof(v8order) / sizeof(CPU));
-#ifdef _CPU_ARM_
-    // Not all elements/pairs are valid
-    static constexpr CPU v7order[] = {
-        CPU::arm_cortex_a5,
-        CPU::arm_cortex_a7,
-        CPU::arm_cortex_a8,
-        CPU::arm_cortex_a9,
-        CPU::arm_cortex_a12,
-        CPU::arm_cortex_a15,
-        CPU::arm_cortex_a17
-    };
-    shrink_big_little(list, v7order, sizeof(v7order) / sizeof(CPU));
-#endif
-    uint32_t cpu = 0;
-    if (list.empty()) {
-        cpu = (uint32_t)generic_for_arch(arch);
-    }
-    else {
-        // This also covers `list.size() > 1` case which means there's a unknown combination
-        // consists of CPU's we know. Unclear what else we could try so just randomly return
-        // one...
-        cpu = list[0].first;
-    }
-    // Ignore feature bits that we are not interested in.
-    mask_features(feature_masks, &features[0]);
-    return std::make_pair(cpu, features);
-}
-#endif
-
-static inline const std::pair<uint32_t,FeatureList<feature_sz>> &get_host_cpu()
-{
-    static auto host_cpu = _get_host_cpu();
-    return host_cpu;
-}
-
-static bool is_generic_cpu_name(uint32_t cpu)
-{
-    switch ((CPU)cpu) {
-    case CPU::generic:
-    case CPU::armv7_a:
-    case CPU::armv7_m:
-    case CPU::armv7e_m:
-    case CPU::armv7_r:
-    case CPU::armv8_a:
-    case CPU::armv8_m_base:
-    case CPU::armv8_m_main:
-    case CPU::armv8_r:
-    case CPU::armv8_1_a:
-    case CPU::armv8_2_a:
-    case CPU::armv8_3_a:
-    case CPU::armv8_4_a:
-    case CPU::armv8_5_a:
-    case CPU::armv8_6_a:
-        return true;
-    default:
-        return false;
-    }
-}
-
-static inline const std::string &host_cpu_name()
-{
-    static std::string name = [] {
-        if (is_generic_cpu_name(get_host_cpu().first)) {
-            auto llvm_name = jl_get_cpu_name_llvm();
-            if (llvm_name != "generic") {
-                return llvm_name;
-            }
-        }
-        return std::string(find_cpu_name(get_host_cpu().first));
-    }();
-    return name;
-}
-
-static inline const char *normalize_cpu_name(llvm::StringRef name)
-{
-    if (name == "ares")
-        return "neoverse-n1";
-    if (name == "zeus")
-        return "neoverse-v1";
-    if (name == "cyclone")
-        return "apple-a7";
-    if (name == "typhoon")
-        return "apple-a8";
-    if (name == "twister")
-        return "apple-a9";
-    if (name == "hurricane")
-        return "apple-a10";
-    return nullptr;
-}
-
-template<size_t n>
-static inline void enable_depends(FeatureList<n> &features)
-{
-    if (test_nbit(features, Feature::v8_6a))
-        set_bit(features, Feature::v8_5a, true);
-    if (test_nbit(features, Feature::v8_5a))
-        set_bit(features, Feature::v8_4a, true);
-    if (test_nbit(features, Feature::v8_4a))
-        set_bit(features, Feature::v8_3a, true);
-    if (test_nbit(features, Feature::v8_3a))
-        set_bit(features, Feature::v8_2a, true);
-    if (test_nbit(features, Feature::v8_2a))
-        set_bit(features, Feature::v8_1a, true);
-    if (test_nbit(features, Feature::v8_1a))
-        set_bit(features, Feature::crc, true);
-#ifdef _CPU_ARM_
-    if (test_nbit(features, Feature::v8_1a)) {
-        set_bit(features, Feature::v8, true);
-        set_bit(features, Feature::aclass, true);
-    }
-    if (test_nbit(features, Feature::v8_m_main)) {
-        set_bit(features, Feature::v8, true);
-        set_bit(features, Feature::mclass, true);
-    }
-    if (test_nbit(features, Feature::v8)) {
-        set_bit(features, Feature::v7, true);
-        if (test_nbit(features, Feature::aclass)) {
-            set_bit(features, Feature::neon, true);
-            set_bit(features, Feature::vfp3, true);
-            set_bit(features, Feature::vfp4, true);
-            set_bit(features, Feature::hwdiv_arm, true);
-            set_bit(features, Feature::hwdiv, true);
-            set_bit(features, Feature::d32, true);
-        }
-    }
-#else
-    if (test_nbit(features, Feature::v8_1a)) {
-        set_bit(features, Feature::lse, true);
-        set_bit(features, Feature::rdm, true);
-    }
-    if (test_nbit(features, Feature::v8_2a)) {
-        set_bit(features, Feature::ccpp, true);
-    }
-    if (test_nbit(features, Feature::v8_3a)) {
-        set_bit(features, Feature::jsconv, true);
-        set_bit(features, Feature::complxnum, true);
-        set_bit(features, Feature::rcpc, true);
-    }
-    if (test_nbit(features, Feature::v8_4a)) {
-        set_bit(features, Feature::dit, true);
-        set_bit(features, Feature::rcpc_immo, true);
-        set_bit(features, Feature::flagm, true);
-    }
-    if (test_nbit(features, Feature::v8_5a)) {
-        set_bit(features, Feature::sb, true);
-        set_bit(features, Feature::ccdp, true);
-        set_bit(features, Feature::altnzcv, true);
-        set_bit(features, Feature::fptoint, true);
-    }
-    if (test_nbit(features, Feature::v8_6a)) {
-        set_bit(features, Feature::i8mm, true);
-        set_bit(features, Feature::bf16, true);
-    }
-#endif
-    ::enable_depends(features, Feature::deps, sizeof(Feature::deps) / sizeof(FeatureDep));
-}
-
-template<size_t n>
-static inline void disable_depends(FeatureList<n> &features)
-{
-    ::disable_depends(features, Feature::deps, sizeof(Feature::deps) / sizeof(FeatureDep));
-}
-
-static const llvm::SmallVector<TargetData<feature_sz>, 0> &get_cmdline_targets(const char *cpu_target)
-{
-    auto feature_cb = [] (const char *str, size_t len, FeatureList<feature_sz> &list) {
-#ifdef _CPU_AARCH64_
-        // On AArch64, treat `crypto` as an alias of aes + sha2 just like LLVM
-        if (llvm::StringRef(str, len) == "crypto") {
-            set_bit(list, Feature::aes, true);
-            set_bit(list, Feature::sha2, true);
-            return true;
-        }
-#endif
-        auto fbit = find_feature_bit(feature_names, nfeature_names, str, len);
-        if (fbit == UINT32_MAX)
-            return false;
-        set_bit(list, fbit, true);
-        return true;
-    };
-    auto &targets = ::get_cmdline_targets<feature_sz>(cpu_target, feature_cb);
-    for (auto &t: targets) {
-        if (auto nname = normalize_cpu_name(t.name)) {
-            t.name = nname;
-        }
-    }
-    return targets;
-}
-
-static llvm::SmallVector<TargetData<feature_sz>, 0> jit_targets;
-
-static TargetData<feature_sz> arg_target_data(const TargetData<feature_sz> &arg, bool require_host)
-{
-    TargetData<feature_sz> res = arg;
-    const FeatureList<feature_sz> *cpu_features = nullptr;
-    if (res.name == "native") {
-        res.name = host_cpu_name();
-        cpu_features = &get_host_cpu().second;
-    }
-    else if (auto spec = find_cpu(res.name)) {
-        cpu_features = &spec->features;
-    }
-    else {
-        res.en.flags |= JL_TARGET_UNKNOWN_NAME;
-    }
-    if (cpu_features) {
-        for (size_t i = 0; i < feature_sz; i++) {
-            res.en.features[i] |= (*cpu_features)[i];
-        }
-    }
-    enable_depends(res.en.features);
-    for (size_t i = 0; i < feature_sz; i++)
-        res.en.features[i] &= ~res.dis.features[i];
-    if (require_host) {
-        for (size_t i = 0; i < feature_sz; i++) {
-            res.en.features[i] &= get_host_cpu().second[i];
-        }
-    }
-    disable_depends(res.en.features);
-    if (cpu_features) {
-        // If the base feature if known, fill in the disable features
-        for (size_t i = 0; i < feature_sz; i++) {
-            res.dis.features[i] = feature_masks[i] & ~res.en.features[i];
-        }
-    }
-    return res;
-}
-
-static int max_vector_size(const FeatureList<feature_sz> &features)
-{
-#ifdef _CPU_ARM_
-    if (test_nbit(features, Feature::neon))
-        return 16;
-    return 8;
-#else
-    if (test_nbit(features, Feature::sve2))
-        return 256;
-    if (test_nbit(features, Feature::sve))
-        return 128;
-    return 16;
-#endif
-}
-
-static uint32_t sysimg_init_cb(void *ctx, const void *id, jl_value_t **rejection_reason)
-{
-    // First see what target is requested for the JIT.
-    const char *cpu_target = (const char *)ctx;
-    auto &cmdline = get_cmdline_targets(cpu_target);
-    TargetData<feature_sz> target = arg_target_data(cmdline[0], true);
-    // Then find the best match in the sysimg
-    auto sysimg = deserialize_target_data<feature_sz>((const uint8_t*)id);
-    for (auto &t: sysimg) {
-        if (auto nname = normalize_cpu_name(t.name)) {
-            t.name = nname;
-        }
-    }
-    auto match = match_sysimg_targets(sysimg, target, max_vector_size, rejection_reason);
-    if (match.best_idx == UINT32_MAX)
-        return match.best_idx;
-    // Now we've decided on which sysimg version to use.
-    // Make sure the JIT target is compatible with it and save the JIT target.
-    if (match.vreg_size != max_vector_size(target.en.features) &&
-        (sysimg[match.best_idx].en.flags & JL_TARGET_VEC_CALL)) {
-#ifdef _CPU_ARM_
-        unset_bits(target.en.features, Feature::neon);
-#endif
-    }
-    jit_targets.push_back(std::move(target));
-    return match.best_idx;
-}
-
-static uint32_t pkgimg_init_cb(void *ctx, const void *id, jl_value_t **rejection_reason JL_REQUIRE_ROOTED_SLOT)
-{
-    TargetData<feature_sz> target = jit_targets.front();
-    auto pkgimg = deserialize_target_data<feature_sz>((const uint8_t*)id);
-    for (auto &t: pkgimg) {
-        if (auto nname = normalize_cpu_name(t.name)) {
-            t.name = nname;
-        }
-    }
-    auto match = match_sysimg_targets(pkgimg, target, max_vector_size, rejection_reason);
-    return match.best_idx;
-}
-
-static void ensure_jit_target(const char *cpu_target, bool imaging)
-{
-    auto &cmdline = get_cmdline_targets(cpu_target);
-    check_cmdline(cmdline, imaging);
-    if (!jit_targets.empty())
-        return;
-    for (auto &arg: cmdline) {
-        auto data = arg_target_data(arg, jit_targets.empty());
-        jit_targets.push_back(std::move(data));
-    }
-    auto ntargets = jit_targets.size();
-    // Now decide the clone condition.
-    for (size_t i = 1; i < ntargets; i++) {
-        auto &t = jit_targets[i];
-        if (t.en.flags & JL_TARGET_CLONE_ALL)
-            continue;
-        auto &features0 = jit_targets[t.base].en.features;
-        // Always clone when code checks CPU features
-        t.en.flags |= JL_TARGET_CLONE_CPU;
-        static constexpr uint32_t clone_fp16[] = {Feature::fp16fml,Feature::fullfp16};
-        for (auto fe: clone_fp16) {
-            if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) {
-                t.en.flags |= JL_TARGET_CLONE_FLOAT16;
-                break;
-            }
-        }
-        // The most useful one in general...
-        t.en.flags |= JL_TARGET_CLONE_LOOP;
-#ifdef _CPU_ARM_
-        static constexpr uint32_t clone_math[] = {Feature::vfp3, Feature::vfp4, Feature::neon};
-        for (auto fe: clone_math) {
-            if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) {
-                t.en.flags |= JL_TARGET_CLONE_MATH;
-                break;
-            }
-        }
-        static constexpr uint32_t clone_simd[] = {Feature::neon};
-        for (auto fe: clone_simd) {
-            if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) {
-                t.en.flags |= JL_TARGET_CLONE_SIMD;
-                break;
-            }
-        }
-#endif
-    }
-}
-
-static std::pair<std::string,llvm::SmallVector<std::string, 0>>
-get_llvm_target_noext(const TargetData<feature_sz> &data)
-{
-    std::string name = data.name;
-    auto *spec = find_cpu(name);
-    while (spec) {
-        if (spec->llvmver <= JL_LLVM_VERSION)
-            break;
-        spec = find_cpu((uint32_t)spec->fallback);
-        name = spec->name;
-    }
-    auto features = data.en.features;
-    if (spec) {
-        if (is_generic_cpu_name((uint32_t)spec->cpu)) {
-            features = features | spec->features;
-            name = "generic";
-        }
-    }
-#ifdef _CPU_ARM_
-    // We use the name on aarch64 internally but the LLVM ARM backend still use the old name...
-    if (name == "apple-a7")
-        name = "cyclone";
-#endif
-    llvm::SmallVector<std::string, 0> feature_strs;
-    for (auto &fename: feature_names) {
-        if (fename.llvmver > JL_LLVM_VERSION)
-            continue;
-        if (fename.bit >= 32 * 2)
-            break;
-        const char *fename_str = fename.name;
-        bool enable = test_nbit(features, fename.bit);
-        bool disable = test_nbit(data.dis.features, fename.bit);
-        if (enable) {
-            feature_strs.insert(feature_strs.begin(), std::string("+") + fename_str);
-        }
-        else if (disable) {
-            feature_strs.push_back(std::string("-") + fename_str);
-        }
-    }
-    if (test_nbit(features, Feature::v8_6a))
-        feature_strs.push_back("+v8.6a");
-    if (test_nbit(features, Feature::v8_5a))
-        feature_strs.push_back("+v8.5a");
-    if (test_nbit(features, Feature::v8_4a))
-        feature_strs.push_back("+v8.4a");
-    if (test_nbit(features, Feature::v8_3a))
-        feature_strs.push_back("+v8.3a");
-    if (test_nbit(features, Feature::v8_2a))
-        feature_strs.push_back("+v8.2a");
-    if (test_nbit(features, Feature::v8_1a))
-        feature_strs.push_back("+v8.1a");
-#ifdef _CPU_ARM_
-    if (test_nbit(features, Feature::v8_m_main)) {
-        feature_strs.push_back("+v8m.main");
-        feature_strs.push_back("+armv8-m.main");
-    }
-    if (test_nbit(features, Feature::aclass))
-        feature_strs.push_back("+aclass");
-    if (test_nbit(features, Feature::rclass))
-        feature_strs.push_back("+rclass");
-    if (test_nbit(features, Feature::mclass))
-        feature_strs.push_back("+mclass");
-    if (test_nbit(features, Feature::v8)) {
-        feature_strs.push_back("+v8");
-        if (test_nbit(features, Feature::aclass))
-            feature_strs.push_back("+armv8-a");
-        if (test_nbit(features, Feature::rclass))
-            feature_strs.push_back("+armv8-r");
-        if (test_nbit(features, Feature::mclass)) {
-            feature_strs.push_back("+v8m");
-            feature_strs.push_back("+armv8-m.base");
-        }
-    }
-    if (test_nbit(features, Feature::v7)) {
-        feature_strs.push_back("+v7");
-        if (test_nbit(features, Feature::aclass))
-            feature_strs.push_back("+armv7-a");
-        if (test_nbit(features, Feature::rclass))
-            feature_strs.push_back("+armv7-r");
-        if (test_nbit(features, Feature::mclass))
-            feature_strs.push_back("+armv7-m");
-    }
-    feature_strs.push_back("+v6");
-    feature_strs.push_back("+vfp2");
-#else
-    feature_strs.push_back("+neon");
-    feature_strs.push_back("+fp-armv8");
-#endif
-    return std::make_pair(std::move(name), std::move(feature_strs));
-}
-
-static std::pair<std::string,llvm::SmallVector<std::string, 0>>
-get_llvm_target_vec(const TargetData<feature_sz> &data)
-{
-    auto res0 = get_llvm_target_noext(data);
-    append_ext_features(res0.second, data.ext_features);
-    return res0;
-}
-
-static std::pair<std::string,std::string>
-get_llvm_target_str(const TargetData<feature_sz> &data)
-{
-    auto res0 = get_llvm_target_noext(data);
-    auto features = join_feature_strs(res0.second);
-    append_ext_features(features, data.ext_features);
-    return std::make_pair(std::move(res0.first), std::move(features));
-}
-
-static FeatureList<feature_sz> get_max_feature(void)
-{
-#ifdef _CPU_ARM_
-    auto arch = get_elf_arch();
-    auto features = real_feature_masks;
-    if (arch.klass == 0)
-        arch.klass = 'A';
-    set_bit(features, Feature::v7, true);
-    set_bit(features, Feature::v8, true);
-    if (arch.klass == 'M') {
-        set_bit(features, Feature::mclass, true);
-        set_bit(features, Feature::v8_m_main, true);
-    }
-    else if (arch.klass == 'R') {
-        set_bit(features, Feature::rclass, true);
-    }
-    else if (arch.klass == 'A') {
-        set_bit(features, Feature::aclass, true);
-        set_bit(features, Feature::v8_1a, true);
-        set_bit(features, Feature::v8_2a, true);
-        set_bit(features, Feature::v8_3a, true);
-        set_bit(features, Feature::v8_4a, true);
-        set_bit(features, Feature::v8_5a, true);
-        set_bit(features, Feature::v8_6a, true);
-    }
-    return features;
-#else
-    // There isn't currently any conflicting features on AArch64
-    return feature_masks;
-#endif
-}
-
-}
-
-using namespace ARM;
-
-JL_DLLEXPORT void jl_dump_host_cpu(void)
-{
-    dump_cpu_spec(get_host_cpu().first, get_host_cpu().second, feature_names, nfeature_names,
-                  cpus, ncpu_names);
-}
-
-JL_DLLEXPORT jl_value_t *jl_cpu_has_fma(int bits)
-{
-#ifdef _CPU_AARCH64_
-    return jl_true;
-#else
-    TargetData<feature_sz> target = jit_targets.front();
-    FeatureList<feature_sz> features = target.en.features;
-    if (bits == 32 && test_nbit(features, Feature::vfp4sp))
-        return jl_true;
-    else if ((bits == 64 || bits == 32) && test_nbit(features, Feature::vfp4))
-        return jl_true;
-    else
-        return jl_false;
-#endif
-}
-
-jl_image_t jl_init_processor_sysimg(jl_image_buf_t image, const char *cpu_target)
-{
-    if (!jit_targets.empty())
-        jl_error("JIT targets already initialized");
-    return parse_sysimg(image, sysimg_init_cb, (void *)cpu_target);
-}
-
-jl_image_t jl_init_processor_pkgimg(jl_image_buf_t image)
-{
-    if (jit_targets.empty())
-        jl_error("JIT targets not initialized");
-    if (jit_targets.size() > 1)
-        jl_error("Expected only one JIT target");
-    return parse_sysimg(image, pkgimg_init_cb, NULL);
-}
-
-JL_DLLEXPORT jl_value_t* jl_check_pkgimage_clones(char *data)
-{
-    jl_value_t *rejection_reason = NULL;
-    JL_GC_PUSH1(&rejection_reason);
-    uint32_t match_idx = pkgimg_init_cb(NULL, data, &rejection_reason);
-    JL_GC_POP();
-    if (match_idx == UINT32_MAX)
-        return rejection_reason;
-    return jl_nothing;
-}
-
-std::pair<std::string,llvm::SmallVector<std::string, 0>> jl_get_llvm_target(const char *cpu_target, bool imaging, uint32_t &flags)
-{
-    ensure_jit_target(cpu_target, imaging);
-    flags = jit_targets[0].en.flags;
-    return get_llvm_target_vec(jit_targets[0]);
-}
-
-const std::pair<std::string,std::string> &jl_get_llvm_disasm_target(void)
-{
-    auto max_feature = get_max_feature();
-    static const auto res = get_llvm_target_str(TargetData<feature_sz>{host_cpu_name(),
-#ifdef _CPU_AARCH64_
-                "+ecv,+tme,+am,+specrestrict,+predres,+lor,+perfmon,+spe,+tracev8.4",
-#else
-                "+dotprod",
-#endif
-                {max_feature, 0}, {feature_masks & ~max_feature, 0}, 0});
-    return res;
-}
-
-#ifndef __clang_gcanalyzer__
-llvm::SmallVector<jl_target_spec_t, 0> jl_get_llvm_clone_targets(const char *cpu_target)
-{
-
-    auto &cmdline = get_cmdline_targets(cpu_target);
-    check_cmdline(cmdline, true);
-    llvm::SmallVector<TargetData<feature_sz>, 0> image_targets;
-    for (auto &arg: cmdline) {
-        auto data = arg_target_data(arg, image_targets.empty());
-        image_targets.push_back(std::move(data));
-    }
-    auto ntargets = image_targets.size();
-    if (image_targets.empty())
-        jl_error("No targets specified");
-    llvm::SmallVector<jl_target_spec_t, 0> res;
-    // Now decide the clone condition.
-    for (size_t i = 1; i < ntargets; i++) {
-        auto &t = image_targets[i];
-        if (t.en.flags & JL_TARGET_CLONE_ALL)
-            continue;
-        auto &features0 = image_targets[t.base].en.features;
-        // Always clone when code checks CPU features
-        t.en.flags |= JL_TARGET_CLONE_CPU;
-        static constexpr uint32_t clone_fp16[] = {Feature::fp16fml,Feature::fullfp16};
-        for (auto fe: clone_fp16) {
-            if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) {
-                t.en.flags |= JL_TARGET_CLONE_FLOAT16;
-                break;
-            }
-        }
-        // The most useful one in general...
-        t.en.flags |= JL_TARGET_CLONE_LOOP;
-#ifdef _CPU_ARM_
-        static constexpr uint32_t clone_math[] = {Feature::vfp3, Feature::vfp4, Feature::neon};
-        for (auto fe: clone_math) {
-            if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) {
-                t.en.flags |= JL_TARGET_CLONE_MATH;
-                break;
-            }
-        }
-        static constexpr uint32_t clone_simd[] = {Feature::neon};
-        for (auto fe: clone_simd) {
-            if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) {
-                t.en.flags |= JL_TARGET_CLONE_SIMD;
-                break;
-            }
-        }
-#endif
-    }
-    for (auto &target: image_targets) {
-        auto features_en = target.en.features;
-        auto features_dis = target.dis.features;
-        for (auto &fename: feature_names) {
-            if (fename.llvmver > JL_LLVM_VERSION) {
-                unset_bits(features_en, fename.bit);
-                unset_bits(features_dis, fename.bit);
-            }
-        }
-        ARM::disable_depends(features_en);
-        jl_target_spec_t ele;
-        std::tie(ele.cpu_name, ele.cpu_features) = get_llvm_target_str(target);
-        ele.data = serialize_target_data(target.name, features_en, features_dis,
-                                         target.ext_features);
-        ele.flags = target.en.flags;
-        ele.base = target.base;
-        res.push_back(ele);
-    }
-    return res;
-}
-
-#endif
-
-extern "C" int jl_test_cpu_feature(jl_cpu_feature_t feature)
-{
-    if (feature >= 32 * feature_sz)
-        return 0;
-    return test_nbit(&get_host_cpu().second[0], feature);
-}
-
-#ifdef _CPU_AARCH64_
-// FPCR FZ, bit [24]
-static constexpr uint64_t fpcr_fz_mask = 1 << 24;
-// FPCR FZ16, bit [19]
-static constexpr uint64_t fpcr_fz16_mask = 1 << 19;
-// FPCR DN, bit [25]
-static constexpr uint64_t fpcr_dn_mask = 1 << 25;
-
-static inline uint64_t get_fpcr_aarch64(void)
-{
-    uint64_t fpcr;
-    asm volatile("mrs %0, fpcr" : "=r"(fpcr));
-    return fpcr;
-}
-
-static inline void set_fpcr_aarch64(uint64_t fpcr)
-{
-    asm volatile("msr fpcr, %0" :: "r"(fpcr));
-}
-
-extern "C" JL_DLLEXPORT int32_t jl_get_zero_subnormals(void)
-{
-    return (get_fpcr_aarch64() & fpcr_fz_mask) != 0;
-}
-
-extern "C" JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero)
-{
-    uint64_t fpcr = get_fpcr_aarch64();
-    static uint64_t mask = fpcr_fz_mask | (jl_test_cpu_feature(JL_AArch64_fullfp16) ? fpcr_fz16_mask : 0);
-    fpcr = isZero ? (fpcr | mask) : (fpcr & ~mask);
-    set_fpcr_aarch64(fpcr);
-    return 0;
-}
-
-extern "C" JL_DLLEXPORT int32_t jl_get_default_nans(void)
-{
-    return (get_fpcr_aarch64() & fpcr_dn_mask) != 0;
-}
-
-extern "C" JL_DLLEXPORT int32_t jl_set_default_nans(int8_t isDefault)
-{
-    uint64_t fpcr = get_fpcr_aarch64();
-    fpcr = isDefault ? (fpcr | fpcr_dn_mask) : (fpcr & ~fpcr_dn_mask);
-    set_fpcr_aarch64(fpcr);
-    return 0;
-}
-#else
-extern "C" JL_DLLEXPORT int32_t jl_get_zero_subnormals(void)
-{
-    return 0;
-}
-
-extern "C" JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero)
-{
-    return isZero;
-}
-
-extern "C" JL_DLLEXPORT int32_t jl_get_default_nans(void)
-{
-    return 0;
-}
-
-extern "C" JL_DLLEXPORT int32_t jl_set_default_nans(int8_t isDefault)
-{
-    return isDefault;
-}
-#endif
diff --git a/src/processor_fallback.cpp b/src/processor_fallback.cpp
deleted file mode 100644
index c8c8feb072345..0000000000000
--- a/src/processor_fallback.cpp
+++ /dev/null
@@ -1,226 +0,0 @@
-// This file is a part of Julia. License is MIT: https://julialang.org/license
-
-// Fallback processor detection and dispatch
-
-static constexpr FeatureName *feature_names = nullptr;
-static constexpr uint32_t nfeature_names = 0;
-
-namespace Fallback {
-
-static inline const std::string &host_cpu_name()
-{
-    static std::string name = jl_get_cpu_name_llvm();
-    return name;
-}
-
-static const llvm::SmallVector<TargetData<1>, 0> &get_cmdline_targets(const char *cpu_target)
-{
-    auto feature_cb = [] (const char*, size_t, FeatureList<1>&) {
-        return false;
-    };
-    return ::get_cmdline_targets<1>(cpu_target, feature_cb);
-}
-
-static llvm::SmallVector<TargetData<1>, 0> jit_targets;
-
-static TargetData<1> arg_target_data(const TargetData<1> &arg, bool require_host)
-{
-    TargetData<1> res = arg;
-    if (res.name == "native") {
-        res.name = host_cpu_name();
-        append_ext_features(res.ext_features, jl_get_cpu_features_llvm());
-    }
-    else {
-        res.en.flags |= JL_TARGET_UNKNOWN_NAME;
-    }
-    return res;
-}
-
-static uint32_t sysimg_init_cb(void *ctx, const void *id, jl_value_t **rejection_reason)
-{
-    // First see what target is requested for the JIT.
-    const char *cpu_target = (const char *)ctx;
-    auto &cmdline = get_cmdline_targets(cpu_target);
-    TargetData<1> target = arg_target_data(cmdline[0], true);
-    // Find the last name match or use the default one.
-    uint32_t best_idx = 0;
-    auto sysimg = deserialize_target_data<1>((const uint8_t*)id);
-    for (uint32_t i = 0; i < sysimg.size(); i++) {
-        auto &imgt = sysimg[i];
-        if (imgt.name == target.name) {
-            best_idx = i;
-        }
-    }
-    jit_targets.push_back(std::move(target));
-    return best_idx;
-}
-
-static uint32_t pkgimg_init_cb(void *ctx, const void *id, jl_value_t **rejection_reason)
-{
-    TargetData<1> target = jit_targets.front();
-    // Find the last name match or use the default one.
-    uint32_t best_idx = 0;
-    auto pkgimg = deserialize_target_data<1>((const uint8_t*)id);
-    for (uint32_t i = 0; i < pkgimg.size(); i++) {
-        auto &imgt = pkgimg[i];
-        if (imgt.name == target.name) {
-            best_idx = i;
-        }
-    }
-
-    return best_idx;
-}
-
-static void ensure_jit_target(const char *cpu_target, bool imaging)
-{
-    auto &cmdline = get_cmdline_targets(cpu_target);
-    check_cmdline(cmdline, imaging);
-    if (!jit_targets.empty())
-        return;
-    for (auto &arg: cmdline) {
-        auto data = arg_target_data(arg, jit_targets.empty());
-        jit_targets.push_back(std::move(data));
-    }
-    auto ntargets = jit_targets.size();
-    // Now decide the clone condition.
-    for (size_t i = 1; i < ntargets; i++) {
-        auto &t = jit_targets[i];
-        t.en.flags |= JL_TARGET_CLONE_ALL;
-    }
-}
-
-static std::pair<std::string,llvm::SmallVector<std::string, 0>>
-get_llvm_target_noext(const TargetData<1> &data)
-{
-    return std::make_pair(data.name, llvm::SmallVector<std::string, 0>{});
-}
-
-static std::pair<std::string,llvm::SmallVector<std::string, 0>>
-get_llvm_target_vec(const TargetData<1> &data)
-{
-    auto res0 = get_llvm_target_noext(data);
-    append_ext_features(res0.second, data.ext_features);
-    return res0;
-}
-
-static std::pair<std::string,std::string>
-get_llvm_target_str(const TargetData<1> &data)
-{
-    auto res0 = get_llvm_target_noext(data);
-    auto features = join_feature_strs(res0.second);
-    append_ext_features(features, data.ext_features);
-    return std::make_pair(std::move(res0.first), std::move(features));
-}
-
-}
-
-using namespace Fallback;
-
-jl_image_t jl_init_processor_sysimg(jl_image_buf_t image, const char *cpu_target)
-{
-    if (!jit_targets.empty())
-        jl_error("JIT targets already initialized");
-    return parse_sysimg(image, sysimg_init_cb, (void *)cpu_target);
-}
-
-jl_image_t jl_init_processor_pkgimg(jl_image_buf_t image)
-{
-    if (jit_targets.empty())
-        jl_error("JIT targets not initialized");
-    if (jit_targets.size() > 1)
-        jl_error("Expected only one JIT target");
-    return parse_sysimg(image, pkgimg_init_cb, NULL);
-}
-
-std::pair<std::string,llvm::SmallVector<std::string, 0>> jl_get_llvm_target(const char *cpu_target, bool imaging, uint32_t &flags)
-{
-    ensure_jit_target(cpu_target, imaging);
-    flags = jit_targets[0].en.flags;
-    return get_llvm_target_vec(jit_targets[0]);
-}
-
-const std::pair<std::string,std::string> &jl_get_llvm_disasm_target(void)
-{
-    static const auto res = get_llvm_target_str(TargetData<1>{host_cpu_name(),
-                jl_get_cpu_features_llvm(), {{}, 0}, {{}, 0}, 0});
-    return res;
-}
-#ifndef __clang_gcanalyzer__
-llvm::SmallVector<jl_target_spec_t, 0> jl_get_llvm_clone_targets(const char *cpu_target)
-{
-
-    auto &cmdline = get_cmdline_targets(cpu_target);
-    check_cmdline(cmdline, true);
-    llvm::SmallVector<TargetData<1>, 0> image_targets;
-    for (auto &arg: cmdline) {
-        auto data = arg_target_data(arg, image_targets.empty());
-        image_targets.push_back(std::move(data));
-    }
-    auto ntargets = image_targets.size();
-    // Now decide the clone condition.
-    for (size_t i = 1; i < ntargets; i++) {
-        auto &t = image_targets[i];
-        t.en.flags |= JL_TARGET_CLONE_ALL;
-    }
-    if (image_targets.empty())
-        jl_error("No image targets found");
-    llvm::SmallVector<jl_target_spec_t, 0> res;
-    for (auto &target: image_targets) {
-        jl_target_spec_t ele;
-        std::tie(ele.cpu_name, ele.cpu_features) = get_llvm_target_str(target);
-        ele.data = serialize_target_data(target.name, target.en.features,
-                                         target.dis.features, target.ext_features);
-        ele.flags = target.en.flags;
-        ele.base = 0;
-        res.push_back(ele);
-    }
-    return res;
-}
-#endif
-
-JL_DLLEXPORT jl_value_t *jl_cpu_has_fma(int bits)
-{
-    return jl_false; // Match behaviour of have_fma in src/llvm-cpufeatures.cpp (assume false)
-}
-
-JL_DLLEXPORT void jl_dump_host_cpu(void)
-{
-    jl_safe_printf("CPU: %s\n", host_cpu_name().c_str());
-    jl_safe_printf("Features: %s\n", jl_get_cpu_features_llvm().c_str());
-}
-
-JL_DLLEXPORT jl_value_t* jl_check_pkgimage_clones(char *data)
-{
-    jl_value_t *rejection_reason = NULL;
-    JL_GC_PUSH1(&rejection_reason);
-    uint32_t match_idx = pkgimg_init_cb(NULL, data, &rejection_reason);
-    JL_GC_POP();
-    if (match_idx == UINT32_MAX)
-        return rejection_reason;
-    return jl_nothing;
-}
-
-extern "C" int jl_test_cpu_feature(jl_cpu_feature_t)
-{
-    return 0;
-}
-
-extern "C" JL_DLLEXPORT int32_t jl_get_zero_subnormals(void)
-{
-    return 0;
-}
-
-extern "C" JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero)
-{
-    return isZero;
-}
-
-extern "C" JL_DLLEXPORT int32_t jl_get_default_nans(void)
-{
-    return 0;
-}
-
-extern "C" JL_DLLEXPORT int32_t jl_set_default_nans(int8_t isDefault)
-{
-    return isDefault;
-}
diff --git a/src/processor_x86.cpp b/src/processor_x86.cpp
deleted file mode 100644
index bd624943083ae..0000000000000
--- a/src/processor_x86.cpp
+++ /dev/null
@@ -1,1292 +0,0 @@
-// This file is a part of Julia. License is MIT: https://julialang.org/license
-
-// X86 specific processor detection and dispatch
-
-// CPUID
-
-#include "julia.h"
-extern "C" JL_DLLEXPORT void jl_cpuid(int32_t CPUInfo[4], int32_t InfoType)
-{
-    asm volatile (
-#if defined(__i386__) && defined(__PIC__)
-        "xchg %%ebx, %%esi;"
-        "cpuid;"
-        "xchg %%esi, %%ebx;" :
-        "=S" (CPUInfo[1]),
-#else
-        "cpuid" :
-        "=b" (CPUInfo[1]),
-#endif
-        "=a" (CPUInfo[0]),
-        "=c" (CPUInfo[2]),
-        "=d" (CPUInfo[3]) :
-        "a" (InfoType)
-        );
-}
-
-extern "C" JL_DLLEXPORT void jl_cpuidex(int32_t CPUInfo[4], int32_t InfoType, int32_t subInfoType)
-{
-    asm volatile (
-#if defined(__i386__) && defined(__PIC__)
-        "xchg %%ebx, %%esi;"
-        "cpuid;"
-        "xchg %%esi, %%ebx;" :
-        "=S" (CPUInfo[1]),
-#else
-        "cpuid" :
-        "=b" (CPUInfo[1]),
-#endif
-        "=a" (CPUInfo[0]),
-        "=c" (CPUInfo[2]),
-        "=d" (CPUInfo[3]) :
-        "a" (InfoType),
-        "c" (subInfoType)
-        );
-}
-
-namespace X86 {
-
-enum class CPU : uint32_t {
-    generic = 0,
-    intel_nocona,
-    intel_prescott,
-    intel_atom_bonnell,
-    intel_atom_silvermont,
-    intel_atom_goldmont,
-    intel_atom_goldmont_plus,
-    intel_atom_tremont,
-    intel_core2,
-    intel_core2_penryn,
-    intel_yonah,
-    intel_corei7_nehalem,
-    intel_corei7_westmere,
-    intel_corei7_sandybridge,
-    intel_corei7_ivybridge,
-    intel_corei7_haswell,
-    intel_corei7_broadwell,
-    intel_corei7_skylake,
-    intel_corei7_skylake_avx512,
-    intel_corei7_cascadelake,
-    intel_corei7_cooperlake,
-    intel_corei7_cannonlake,
-    intel_corei7_icelake_client,
-    intel_corei7_icelake_server,
-    intel_corei7_tigerlake,
-    intel_corei7_alderlake,
-    intel_corei7_sapphirerapids,
-    intel_knights_landing,
-    intel_knights_mill,
-
-    amd_fam10h,
-    amd_athlon_fx,
-    amd_athlon_64,
-    amd_athlon_64_sse3,
-    amd_bdver1,
-    amd_bdver2,
-    amd_bdver3,
-    amd_bdver4,
-    amd_btver1,
-    amd_btver2,
-    amd_k8,
-    amd_k8_sse3,
-    amd_opteron,
-    amd_opteron_sse3,
-    amd_barcelona,
-    amd_znver1,
-    amd_znver2,
-    amd_znver3,
-    amd_znver4,
-    amd_znver5,
-};
-
-static constexpr size_t feature_sz = 12;
-static constexpr FeatureName feature_names[] = {
-#define JL_FEATURE_DEF(name, bit, llvmver) {#name, bit, llvmver},
-#define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) {str, bit, llvmver},
-#include "features_x86.h"
-#undef JL_FEATURE_DEF
-#undef JL_FEATURE_DEF_NAME
-};
-static constexpr uint32_t nfeature_names = sizeof(feature_names) / sizeof(FeatureName);
-
-template<typename... Args>
-static inline constexpr FeatureList<feature_sz> get_feature_masks(Args... args)
-{
-    return ::get_feature_masks<feature_sz>(args...);
-}
-
-#define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) JL_FEATURE_DEF(name, bit, llvmver)
-static constexpr auto feature_masks = get_feature_masks(
-#define JL_FEATURE_DEF(name, bit, llvmver) bit,
-#include "features_x86.h"
-#undef JL_FEATURE_DEF
-    -1);
-
-namespace Feature {
-enum : uint32_t {
-#define JL_FEATURE_DEF(name, bit, llvmver) name = bit,
-#include "features_x86.h"
-#undef JL_FEATURE_DEF
-};
-#undef JL_FEATURE_DEF_NAME
-static constexpr FeatureDep deps[] = {
-    {ssse3, sse3},
-    {fma, avx},
-    {sse41, ssse3},
-    {sse42, sse41},
-    {avx, sse42},
-    {f16c, avx},
-    {avx2, avx},
-    {vaes, avx},
-    {vaes, aes},
-    {vpclmulqdq, avx},
-    {vpclmulqdq, pclmul},
-    {avxvnni, avx2},
-    {avxvnniint8, avx2},
-    {avxvnniint16, avx2},
-    {avxifma, avx2},
-    {avxneconvert, avx2},
-    {avx512f, avx2},
-    {avx512dq, avx512f},
-    {avx512ifma, avx512f},
-    {avx512cd, avx512f},
-    {avx512bw, avx512f},
-    {avx512bf16, avx512bw},
-    {avx512bitalg, avx512bw},
-    {avx512vl, avx512f},
-    {avx512vbmi, avx512bw},
-    {avx512vbmi2, avx512bw},
-    {avx512vnni, avx512f},
-    {avx512vp2intersect, avx512f},
-    {avx512vpopcntdq, avx512f},
-    {avx512fp16, avx512bw},
-    {avx512fp16, avx512dq},
-    {avx512fp16, avx512vl},
-    {amx_int8, amx_tile},
-    {amx_bf16, amx_tile},
-    {amx_fp16, amx_tile},
-    {amx_complex, amx_tile},
-    {sse4a, sse3},
-    {xop, fma4},
-    {fma4, avx},
-    {fma4, sse4a},
-    {xsaveopt, xsave},
-    {xsavec, xsave},
-    {xsaves, xsave},
-    {sha512, avx2},
-    {sm3, avx},
-    {sm4, avx2},
-};
-
-// We require cx16 on 64bit by default. This can be overwritten with `-cx16`
-// This isn't really compatible with 32bit but we mask it off there with required LLVM version
-constexpr auto generic = get_feature_masks(cx16);
-constexpr auto bonnell = get_feature_masks(sse3, ssse3, cx16, movbe, sahf);
-constexpr auto silvermont = bonnell | get_feature_masks(sse41, sse42, popcnt,
-                                                        pclmul, prfchw, rdrnd);
-constexpr auto goldmont = silvermont | get_feature_masks(aes, sha, rdseed, xsave, xsaveopt,
-                                                         xsavec, xsaves, clflushopt, fsgsbase);
-constexpr auto goldmont_plus = goldmont | get_feature_masks(ptwrite, rdpid); // sgx
-constexpr auto tremont = goldmont_plus | get_feature_masks(clwb, gfni);
-constexpr auto knl = get_feature_masks(sse3, ssse3, sse41, sse42, cx16, sahf, popcnt,
-                                       aes, pclmul, avx, xsave, xsaveopt, rdrnd, f16c, fsgsbase,
-                                       avx2, bmi, bmi2, fma, lzcnt, movbe, adx, rdseed, prfchw,
-                                       avx512f, avx512cd);
-constexpr auto knm = knl | get_feature_masks(avx512vpopcntdq);
-constexpr auto yonah = get_feature_masks(sse3);
-constexpr auto prescott = yonah;
-constexpr auto core2 = get_feature_masks(sse3, ssse3, cx16, sahf);
-constexpr auto nocona = get_feature_masks(sse3, cx16);
-constexpr auto penryn = nocona | get_feature_masks(ssse3, sse41, sahf);
-constexpr auto nehalem = penryn | get_feature_masks(sse42, popcnt);
-constexpr auto westmere = nehalem | get_feature_masks(pclmul);
-constexpr auto sandybridge = westmere | get_feature_masks(avx, xsave, xsaveopt);
-constexpr auto ivybridge = sandybridge | get_feature_masks(rdrnd, f16c, fsgsbase);
-constexpr auto haswell = ivybridge | get_feature_masks(avx2, bmi, bmi2, fma, lzcnt, movbe);
-constexpr auto broadwell = haswell | get_feature_masks(adx, rdseed, prfchw);
-constexpr auto skylake = broadwell | get_feature_masks(aes, xsavec, xsaves, clflushopt); // sgx
-constexpr auto skx = skylake | get_feature_masks(avx512f, avx512cd, avx512dq, avx512bw, avx512vl,
-                                                 pku, clwb);
-constexpr auto cascadelake = skx | get_feature_masks(avx512vnni);
-constexpr auto cooperlake = cascadelake | get_feature_masks(avx512bf16);
-constexpr auto cannonlake = skylake | get_feature_masks(avx512f, avx512cd, avx512dq, avx512bw,
-                                                        avx512vl, pku, avx512vbmi, avx512ifma,
-                                                        sha); // sgx
-constexpr auto icelake = cannonlake | get_feature_masks(avx512bitalg, vaes, avx512vbmi2,
-                                                        vpclmulqdq, avx512vpopcntdq,
-                                                        gfni, clwb, rdpid);
-constexpr auto icelake_server = icelake | get_feature_masks(pconfig, wbnoinvd);
-constexpr auto tigerlake = icelake | get_feature_masks(avx512vp2intersect, movdiri,
-                                                       movdir64b, shstk);
-constexpr auto alderlake = skylake | get_feature_masks(clwb, sha, waitpkg, shstk, gfni, vaes, vpclmulqdq, pconfig,
-                                                       rdpid, movdiri, pku, movdir64b, serialize, ptwrite, avxvnni);
-constexpr auto sapphirerapids = icelake_server |
-    get_feature_masks(amx_tile, amx_int8, amx_bf16, avx512bf16, avx512fp16, serialize, cldemote, waitpkg,
-                      avxvnni, uintr, ptwrite, tsxldtrk, enqcmd, shstk, avx512vp2intersect, movdiri, movdir64b);
-
-constexpr auto k8_sse3 = get_feature_masks(sse3, cx16);
-constexpr auto amdfam10 = k8_sse3 | get_feature_masks(sse4a, lzcnt, popcnt, sahf);
-
-constexpr auto btver1 = amdfam10 | get_feature_masks(ssse3, prfchw);
-constexpr auto btver2 = btver1 | get_feature_masks(sse41, sse42, avx, aes, pclmul, bmi, f16c,
-                                                   movbe, xsave, xsaveopt);
-
-constexpr auto bdver1 = amdfam10 | get_feature_masks(xop, fma4, avx, ssse3, sse41, sse42, aes,
-                                                     prfchw, pclmul, xsave);
-constexpr auto bdver2 = bdver1 | get_feature_masks(f16c, bmi, tbm, fma);
-constexpr auto bdver3 = bdver2 | get_feature_masks(xsaveopt, fsgsbase);
-constexpr auto bdver4 = bdver3 | get_feature_masks(avx2, bmi2, mwaitx, movbe, rdrnd);
-
-// technically xsaves is part of znver1, znver2, and znver3
-// Disabled due to Erratum 1386
-// See: https://github.com/JuliaLang/julia/issues/50102
-constexpr auto znver1 = haswell | get_feature_masks(adx, aes, clflushopt, clzero, mwaitx, prfchw,
-                                                    rdseed, sha, sse4a, xsavec);
-constexpr auto znver2 = znver1 | get_feature_masks(clwb, rdpid, wbnoinvd);
-constexpr auto znver3 = znver2 | get_feature_masks(shstk, pku, vaes, vpclmulqdq);
-constexpr auto znver4 = znver3 | get_feature_masks(avx512f, avx512cd, avx512dq, avx512bw, avx512vl, avx512ifma, avx512vbmi,
-                                                   avx512vbmi2, avx512vnni, avx512bitalg, avx512vpopcntdq, avx512bf16, gfni, shstk, xsaves);
-constexpr auto znver5 = znver4 | get_feature_masks(avxvnni, movdiri, movdir64b, avx512vp2intersect, prefetchi, avxvnni);
-
-}
-
-static constexpr CPUSpec<CPU, feature_sz> cpus[] = {
-    {"generic", CPU::generic, CPU::generic, 0, Feature::generic},
-    {"bonnell", CPU::intel_atom_bonnell, CPU::generic, 0, Feature::bonnell},
-    {"silvermont", CPU::intel_atom_silvermont, CPU::generic, 0, Feature::silvermont},
-    {"goldmont", CPU::intel_atom_goldmont, CPU::generic, 0, Feature::goldmont},
-    {"goldmont-plus", CPU::intel_atom_goldmont_plus, CPU::generic, 0, Feature::goldmont_plus},
-    {"tremont", CPU::intel_atom_tremont, CPU::generic, 0, Feature::tremont},
-    {"core2", CPU::intel_core2, CPU::generic, 0, Feature::core2},
-    {"yonah", CPU::intel_yonah, CPU::generic, 0, Feature::yonah},
-    {"prescott", CPU::intel_prescott, CPU::generic, 0, Feature::prescott},
-    {"nocona", CPU::intel_nocona, CPU::generic, 0, Feature::nocona},
-    {"penryn", CPU::intel_core2_penryn, CPU::generic, 0, Feature::penryn},
-    {"nehalem", CPU::intel_corei7_nehalem, CPU::generic, 0, Feature::nehalem},
-    {"westmere", CPU::intel_corei7_westmere, CPU::generic, 0, Feature::westmere},
-    {"sandybridge", CPU::intel_corei7_sandybridge, CPU::generic, 0, Feature::sandybridge},
-    {"ivybridge", CPU::intel_corei7_ivybridge, CPU::generic, 0, Feature::ivybridge},
-    {"haswell", CPU::intel_corei7_haswell, CPU::generic, 0, Feature::haswell},
-    {"broadwell", CPU::intel_corei7_broadwell, CPU::generic, 0, Feature::broadwell},
-    {"skylake", CPU::intel_corei7_skylake, CPU::generic, 0, Feature::skylake},
-    {"knl", CPU::intel_knights_landing, CPU::generic, 0, Feature::knl},
-    {"knm", CPU::intel_knights_mill, CPU::generic, 0, Feature::knm},
-    {"skylake-avx512", CPU::intel_corei7_skylake_avx512, CPU::generic, 0, Feature::skx},
-    {"cascadelake", CPU::intel_corei7_cascadelake, CPU::generic, 0, Feature::cascadelake},
-    {"cooperlake", CPU::intel_corei7_cooperlake, CPU::generic, 0, Feature::cooperlake},
-    {"cannonlake", CPU::intel_corei7_cannonlake, CPU::generic, 0, Feature::cannonlake},
-    {"icelake-client", CPU::intel_corei7_icelake_client, CPU::generic, 0, Feature::icelake},
-    {"icelake-server", CPU::intel_corei7_icelake_server, CPU::generic, 0,
-     Feature::icelake_server},
-    {"tigerlake", CPU::intel_corei7_tigerlake, CPU::intel_corei7_icelake_client, 100000,
-     Feature::tigerlake},
-    {"alderlake", CPU::intel_corei7_alderlake, CPU::intel_corei7_skylake, 120000,
-     Feature::alderlake},
-    {"sapphirerapids", CPU::intel_corei7_sapphirerapids, CPU::intel_corei7_icelake_server, 120000,
-     Feature::sapphirerapids},
-
-    {"athlon64", CPU::amd_athlon_64, CPU::generic, 0, Feature::generic},
-    {"athlon-fx", CPU::amd_athlon_fx, CPU::generic, 0, Feature::generic},
-    {"k8", CPU::amd_k8, CPU::generic, 0, Feature::generic},
-    {"opteron", CPU::amd_opteron, CPU::generic, 0, Feature::generic},
-
-    {"athlon64-sse3", CPU::amd_athlon_64_sse3, CPU::generic, 0, Feature::k8_sse3},
-    {"k8-sse3", CPU::amd_k8_sse3, CPU::generic, 0, Feature::k8_sse3},
-    {"opteron-sse3", CPU::amd_opteron_sse3, CPU::generic, 0, Feature::k8_sse3},
-
-    {"amdfam10", CPU::amd_fam10h, CPU::generic, 0, Feature::amdfam10},
-    {"barcelona", CPU::amd_barcelona, CPU::generic, 0, Feature::amdfam10},
-
-    {"btver1", CPU::amd_btver1, CPU::generic, 0, Feature::btver1},
-    {"btver2", CPU::amd_btver2, CPU::generic, 0, Feature::btver2},
-
-    {"bdver1", CPU::amd_bdver1, CPU::generic, 0, Feature::bdver1},
-    {"bdver2", CPU::amd_bdver2, CPU::generic, 0, Feature::bdver2},
-    {"bdver3", CPU::amd_bdver3, CPU::generic, 0, Feature::bdver3},
-    {"bdver4", CPU::amd_bdver4, CPU::generic, 0, Feature::bdver4},
-
-    {"znver1", CPU::amd_znver1, CPU::generic, 0, Feature::znver1},
-    {"znver2", CPU::amd_znver2, CPU::generic, 0, Feature::znver2},
-    {"znver3", CPU::amd_znver3, CPU::amd_znver2, 120000, Feature::znver3},
-    {"znver4", CPU::amd_znver4, CPU::amd_znver3, 160000, Feature::znver4},
-    {"znver5", CPU::amd_znver5, CPU::amd_znver4, 190000, Feature::znver5},
-};
-static constexpr size_t ncpu_names = sizeof(cpus) / sizeof(cpus[0]);
-
-// For CPU model and feature detection on X86
-
-const int SIG_INTEL = 0x756e6547; // Genu
-const int SIG_AMD = 0x68747541; // Auth
-
-static uint64_t get_xcr0(void)
-{
-    uint32_t eax, edx;
-    asm volatile ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0));
-    return (uint64_t(edx) << 32) | eax;
-}
-
-static CPU get_intel_processor_name(uint32_t family, uint32_t model, uint32_t brand_id,
-                                    const uint32_t *features)
-{
-    if (brand_id != 0)
-        return CPU::generic;
-    switch (family) {
-    case 3:
-    case 4:
-    case 5:
-        return CPU::generic;
-    case 6:
-        switch (model) {
-        case 0x01: // Pentium Pro processor
-        case 0x03: // Intel Pentium II OverDrive processor, Pentium II processor, model 03
-        case 0x05: // Pentium II processor, model 05, Pentium II Xeon processor,
-            // model 05, and Intel Celeron processor, model 05
-        case 0x06: // Celeron processor, model 06
-        case 0x07: // Pentium III processor, model 07, and Pentium III Xeon processor, model 07
-        case 0x08: // Pentium III processor, model 08, Pentium III Xeon processor,
-            // model 08, and Celeron processor, model 08
-        case 0x0a: // Pentium III Xeon processor, model 0Ah
-        case 0x0b: // Pentium III processor, model 0Bh
-        case 0x09: // Intel Pentium M processor, Intel Celeron M processor model 09.
-        case 0x0d: // Intel Pentium M processor, Intel Celeron M processor, model
-            // 0Dh. All processors are manufactured using the 90 nm process.
-        case 0x15: // Intel EP80579 Integrated Processor and Intel EP80579
-            // Integrated Processor with Intel QuickAssist Technology
-            return CPU::generic;
-        case 0x0e: // Intel Core Duo processor, Intel Core Solo processor, model
-            // 0Eh. All processors are manufactured using the 65 nm process.
-            return CPU::intel_yonah;
-        case 0x0f: // Intel Core 2 Duo processor, Intel Core 2 Duo mobile
-            // processor, Intel Core 2 Quad processor, Intel Core 2 Quad
-            // mobile processor, Intel Core 2 Extreme processor, Intel
-            // Pentium Dual-Core processor, Intel Xeon processor, model
-            // 0Fh. All processors are manufactured using the 65 nm process.
-        case 0x16: // Intel Celeron processor model 16h. All processors are
-            // manufactured using the 65 nm process
-            return CPU::intel_core2;
-        case 0x17: // Intel Core 2 Extreme processor, Intel Xeon processor, model
-            // 17h. All processors are manufactured using the 45 nm process.
-            //
-            // 45nm: Penryn , Wolfdale, Yorkfield (XE)
-        case 0x1d: // Intel Xeon processor MP. All processors are manufactured using
-            // the 45 nm process.
-            return CPU::intel_core2_penryn;
-        case 0x1a: // Intel Core i7 processor and Intel Xeon processor. All
-            // processors are manufactured using the 45 nm process.
-        case 0x1e: // Intel(R) Core(TM) i7 CPU         870  @ 2.93GHz.
-            // As found in a Summer 2010 model iMac.
-        case 0x1f:
-        case 0x2e: // Nehalem EX
-            return CPU::intel_corei7_nehalem;
-        case 0x25: // Intel Core i7, laptop version.
-        case 0x2c: // Intel Core i7 processor and Intel Xeon processor. All
-            // processors are manufactured using the 32 nm process.
-        case 0x2f: // Westmere EX
-            return CPU::intel_corei7_westmere;
-        case 0x2a: // Intel Core i7 processor. All processors are manufactured
-            // using the 32 nm process.
-        case 0x2d:
-            return CPU::intel_corei7_sandybridge;
-        case 0x3a:
-        case 0x3e: // Ivy Bridge EP
-            return CPU::intel_corei7_ivybridge;
-
-            // Haswell:
-        case 0x3c:
-        case 0x3f:
-        case 0x45:
-        case 0x46:
-            return CPU::intel_corei7_haswell;
-
-            // Broadwell:
-        case 0x3d:
-        case 0x47:
-        case 0x4f:
-        case 0x56:
-            return CPU::intel_corei7_broadwell;
-
-            // Skylake:
-        case 0x4e: // Skylake mobile
-        case 0x5e: // Skylake desktop
-        case 0x8e: // Kaby Lake mobile
-        case 0x9e: // Kaby Lake desktop
-        case 0xa5: // Comet Lake-H/S
-        case 0xa6: // Comet Lake-U
-            return CPU::intel_corei7_skylake;
-
-            // Skylake Xeon:
-        case 0x55:
-            if (test_nbit(features, Feature::avx512bf16))
-                return CPU::intel_corei7_cooperlake;
-            if (test_nbit(features, Feature::avx512vnni))
-                return CPU::intel_corei7_cascadelake;
-            return CPU::intel_corei7_skylake_avx512;
-
-            // Cannonlake:
-        case 0x66:
-            return CPU::intel_corei7_cannonlake;
-
-            // Icelake:
-        case 0x7d:
-        case 0x7e:
-        case 0x9d:
-            return CPU::intel_corei7_icelake_client;
-
-            // Icelake Xeon:
-        case 0x6a:
-        case 0x6c:
-            return CPU::intel_corei7_icelake_server;
-
-            // Tiger Lake
-        case 0x8c:
-        case 0x8d:
-            return CPU::intel_corei7_tigerlake;
-            //Alder Lake
-        case 0x97:
-        case 0x9a:
-            return CPU::intel_corei7_alderlake;
-
-            // Sapphire Rapids
-        case 0x8f:
-            return CPU::intel_corei7_sapphirerapids;
-
-        case 0x1c: // Most 45 nm Intel Atom processors
-        case 0x26: // 45 nm Atom Lincroft
-        case 0x27: // 32 nm Atom Medfield
-        case 0x35: // 32 nm Atom Midview
-        case 0x36: // 32 nm Atom Midview
-            return CPU::intel_atom_bonnell;
-
-            // Atom Silvermont codes from the Intel software optimization guide.
-        case 0x37:
-        case 0x4a:
-        case 0x4d:
-        case 0x5d:
-            // Airmont
-        case 0x4c:
-        case 0x5a:
-        case 0x75:
-            return CPU::intel_atom_silvermont;
-
-            // Goldmont:
-        case 0x5c:
-        case 0x5f:
-            return CPU::intel_atom_goldmont;
-        case 0x7a:
-            return CPU::intel_atom_goldmont_plus;
-        case 0x86:
-        case 0x96:
-        case 0x9c:
-            return CPU::intel_atom_tremont;
-
-        case 0x57:
-            return CPU::intel_knights_landing;
-
-        case 0x85:
-            return CPU::intel_knights_mill;
-
-        default:
-            return CPU::generic;
-        }
-        break;
-    case 15: {
-        switch (model) {
-        case 0: // Pentium 4 processor, Intel Xeon processor. All processors are
-            // model 00h and manufactured using the 0.18 micron process.
-        case 1: // Pentium 4 processor, Intel Xeon processor, Intel Xeon
-            // processor MP, and Intel Celeron processor. All processors are
-            // model 01h and manufactured using the 0.18 micron process.
-        case 2: // Pentium 4 processor, Mobile Intel Pentium 4 processor - M,
-            // Intel Xeon processor, Intel Xeon processor MP, Intel Celeron
-            // processor, and Mobile Intel Celeron processor. All processors
-            // are model 02h and manufactured using the 0.13 micron process.
-        default:
-            return CPU::generic;
-
-        case 3: // Pentium 4 processor, Intel Xeon processor, Intel Celeron D
-            // processor. All processors are model 03h and manufactured using
-            // the 90 nm process.
-        case 4: // Pentium 4 processor, Pentium 4 processor Extreme Edition,
-            // Pentium D processor, Intel Xeon processor, Intel Xeon
-            // processor MP, Intel Celeron D processor. All processors are
-            // model 04h and manufactured using the 90 nm process.
-        case 6: // Pentium 4 processor, Pentium D processor, Pentium processor
-            // Extreme Edition, Intel Xeon processor, Intel Xeon processor
-            // MP, Intel Celeron D processor. All processors are model 06h
-            // and manufactured using the 65 nm process.
-#ifdef _CPU_X86_64_
-            return CPU::intel_nocona;
-#else
-            return CPU::intel_prescott;
-#endif
-        }
-    }
-    default:
-        break; /*"generic"*/
-    }
-    return CPU::generic;
-}
-
-static CPU get_amd_processor_name(uint32_t family, uint32_t model, const uint32_t *features)
-{
-    switch (family) {
-    case 4:
-    case 5:
-    case 6:
-    default:
-        return CPU::generic;
-    case 15:
-        if (test_nbit(features, Feature::sse3))
-            return CPU::amd_k8_sse3;
-        switch (model) {
-        case 1:
-            return CPU::amd_opteron;
-        case 5:
-            return CPU::amd_athlon_fx;
-        default:
-            return CPU::amd_athlon_64;
-        }
-    case 16:
-        switch (model) {
-        case 2:
-            return CPU::amd_barcelona;
-        case 4:
-        case 8:
-        default:
-            return CPU::amd_fam10h;
-        }
-    case 20:
-        return CPU::amd_btver1;
-    case 21:
-        if (model >= 0x50 && model <= 0x6f)
-            return CPU::amd_bdver4;
-        if (model >= 0x30 && model <= 0x3f)
-            return CPU::amd_bdver3;
-        if (model >= 0x10 && model <= 0x1f)
-            return CPU::amd_bdver2;
-        if (model <= 0x0f)
-            return CPU::amd_bdver1;
-        return CPU::amd_btver1; // fallback
-    case 22:
-        return CPU::amd_btver2;
-    case 23:
-        // Known models:
-        // Zen: 1, 17
-        // Zen+: 8, 24
-        // Zen2: 96, 113
-        if (model >= 0x30)
-            return CPU::amd_znver2;
-        return CPU::amd_znver1;
-    case 25:  // AMD Family 19h
-        if (model <= 0x0f || (model >= 0x20 && model <= 0x5f))
-            return CPU::amd_znver3;  // 00h-0Fh, 21h: Zen3
-        if ((model >= 0x10 && model <= 0x1f) ||
-            (model >= 0x60 && model <= 0x74) ||
-            (model >= 0x78 && model <= 0x7b) ||
-            (model >= 0xA0 && model <= 0xAf)) {
-                return CPU::amd_znver4;
-            }
-        return CPU::amd_znver3; // fallback
-    case 26:
-        // if (model <= 0x77)
-        return CPU::amd_znver5;
-    }
-}
-
-template<typename T>
-static inline void features_disable_avx512(T &features)
-{
-    using namespace Feature;
-    unset_bits(features, avx512f, avx512dq, avx512ifma, avx512cd,
-               avx512bw, avx512vl, avx512vbmi, avx512vpopcntdq, avx512vbmi2, avx512vnni,
-               avx512bitalg, avx512vp2intersect, avx512bf16);
-}
-
-template<typename T>
-static inline void features_disable_avx(T &features)
-{
-    using namespace Feature;
-    unset_bits(features, avx, Feature::fma, f16c, xsave, avx2, xop, fma4,
-               xsaveopt, xsavec, xsaves, vaes, vpclmulqdq);
-}
-
-template<typename T>
-static inline void features_disable_amx(T &features)
-{
-    using namespace Feature;
-    unset_bits(features, amx_bf16, amx_tile, amx_int8);
-}
-
-static NOINLINE std::pair<uint32_t,FeatureList<feature_sz>> _get_host_cpu(void)
-{
-    FeatureList<feature_sz> features = {};
-
-    int32_t info0[4];
-    jl_cpuid(info0, 0);
-    uint32_t maxleaf = info0[0];
-    if (maxleaf < 1)
-        return std::make_pair(uint32_t(CPU::generic), features);
-    int32_t info1[4];
-    jl_cpuid(info1, 1);
-
-    auto vendor = info0[1];
-    auto brand_id = info1[1] & 0xff;
-
-    auto family = (info1[0] >> 8) & 0xf; // Bits 8 - 11
-    auto model = (info1[0] >> 4) & 0xf;  // Bits 4 - 7
-    if (family == 6 || family == 0xf) {
-        if (family == 0xf)
-            // Examine extended family ID if family ID is F.
-            family += (info1[0] >> 20) & 0xff; // Bits 20 - 27
-        // Examine extended model ID if family ID is 6 or F.
-        model += ((info1[0] >> 16) & 0xf) << 4; // Bits 16 - 19
-    }
-
-    // Fill in the features
-    features[0] = info1[2];
-    features[1] = info1[3];
-    if (maxleaf >= 7) {
-        int32_t info7[4];
-        jl_cpuidex(info7, 7, 0);
-        features[2] = info7[1];
-        features[3] = info7[2];
-        features[4] = info7[3];
-    }
-    int32_t infoex0[4];
-    jl_cpuid(infoex0, 0x80000000);
-    uint32_t maxexleaf = infoex0[0];
-    if (maxexleaf >= 0x80000001) {
-        int32_t infoex1[4];
-        jl_cpuid(infoex1, 0x80000001);
-        features[5] = infoex1[2];
-        features[6] = infoex1[3];
-    }
-    if (maxleaf >= 0xd) {
-        int32_t infod[4];
-        jl_cpuidex(infod, 0xd, 0x1);
-        features[7] = infod[0];
-    }
-    if (maxexleaf >= 0x80000008) {
-        int32_t infoex8[4];
-        jl_cpuidex(infoex8, 0x80000008, 0);
-        features[8] = infoex8[1];
-    }
-    if (maxleaf >= 7) {
-        int32_t info7[4];
-        jl_cpuidex(info7, 7, 1);
-        features[9] = info7[0];
-        features[10] = info7[1];
-    }
-    if (maxleaf >= 0x14) {
-        int32_t info14[4];
-        jl_cpuidex(info14, 0x14, 0);
-        features[11] = info14[1];
-    }
-
-    // Fix up AVX bits to account for OS support and match LLVM model
-    uint64_t xcr0 = 0;
-    bool hasxsave = test_all_bits(features[0], 1 << 27);
-    if (hasxsave) {
-        xcr0 = get_xcr0();
-        hasxsave = test_all_bits(xcr0, 0x6);
-    }
-    bool hasavx = hasxsave && test_all_bits(features[0], 1 << 28);
-    unset_bits(features, 32 + 27);
-    if (!hasavx)
-        features_disable_avx(features);
-#ifdef _OS_DARWIN_
-    // See https://github.com/llvm/llvm-project/commit/82921bf2baed96b700f90b090d5dc2530223d9c0
-    // and https://github.com/apple/darwin-xnu/blob/a449c6a3b8014d9406c2ddbdc81795da24aa7443/osfmk/i386/fpu.c#L174
-    // Darwin lazily saves the AVX512 context on first use
-    bool hasavx512save = hasavx;
-#else
-    bool hasavx512save = hasavx && test_all_bits(xcr0, 0xe0);
-#endif
-    if (!hasavx512save)
-        features_disable_avx512(features);
-    // AMX requires additional context to be saved by the OS.
-    bool hasamxsave = hasxsave && test_all_bits(xcr0, (1 << 17) | (1 << 18));
-    if (!hasamxsave)
-        features_disable_amx(features);
-    // Ignore feature bits that we are not interested in.
-    mask_features(feature_masks, &features[0]);
-
-    uint32_t cpu;
-    if (vendor == SIG_INTEL) {
-        cpu = uint32_t(get_intel_processor_name(family, model, brand_id, &features[0]));
-    }
-    else if (vendor == SIG_AMD) {
-        cpu = uint32_t(get_amd_processor_name(family, model, &features[0]));
-    }
-    else {
-        cpu = uint32_t(CPU::generic);
-    }
-    /* Feature bits to register map
-    feature[0] = ecx
-    feature[1] = edx
-    feature[2] = leaf 7 ebx
-    feature[3] = leaf 7 ecx
-    feature[4] = leaf 7 edx
-    feature[5] = leaf 0x80000001 ecx
-    feature[6] = leaf 0x80000001 edx
-    feature[7] = leaf 0xd subleaf 1 eax
-    feature[8] = leaf 0x80000008 ebx
-    feature[9] = leaf 7 ebx subleaf 1 eax
-    feature[10] = leaf 7 ebx subleaf 1 ebx
-    feature[11] = leaf 0x14 ebx
-    */
-    return std::make_pair(cpu, features);
-}
-
-static inline const std::pair<uint32_t,FeatureList<feature_sz>> &get_host_cpu()
-{
-    static auto host_cpu = _get_host_cpu();
-    return host_cpu;
-}
-
-static inline const CPUSpec<CPU,feature_sz> *find_cpu(uint32_t cpu)
-{
-    return ::find_cpu(cpu, cpus, ncpu_names);
-}
-
-static inline const CPUSpec<CPU,feature_sz> *find_cpu(llvm::StringRef name)
-{
-    return ::find_cpu(name, cpus, ncpu_names);
-}
-
-static inline const char *find_cpu_name(uint32_t cpu)
-{
-    return ::find_cpu_name(cpu, cpus, ncpu_names);
-}
-
-static inline const std::string &host_cpu_name()
-{
-    static std::string name =
-        (CPU)get_host_cpu().first != CPU::generic ?
-        std::string(find_cpu_name(get_host_cpu().first)) :
-        jl_get_cpu_name_llvm();
-    return name;
-}
-
-static inline const char *normalize_cpu_name(llvm::StringRef name)
-{
-    if (name == "atom")
-        return "bonnell";
-    if (name == "slm")
-        return "silvermont";
-    if (name == "glm")
-        return "goldmont";
-    if (name == "corei7")
-        return "nehalem";
-    if (name == "corei7-avx")
-        return "sandybridge";
-    if (name == "core-avx-i")
-        return "ivybridge";
-    if (name == "core-avx2")
-        return "haswell";
-    if (name == "skx")
-        return "skylake-avx512";
-#ifdef _CPU_X86_
-    // i686 isn't a supported target but it's a common default one so just make it mean pentium4.
-    if (name == "pentium4" || name == "i686")
-        return "generic";
-#else
-    if (name == "x86-64" || name == "x86_64")
-        return "generic";
-#endif
-    return nullptr;
-}
-
-template<size_t n>
-static inline void enable_depends(FeatureList<n> &features)
-{
-    ::enable_depends(features, Feature::deps, sizeof(Feature::deps) / sizeof(FeatureDep));
-}
-
-template<size_t n>
-static inline void disable_depends(FeatureList<n> &features)
-{
-    ::disable_depends(features, Feature::deps, sizeof(Feature::deps) / sizeof(FeatureDep));
-}
-
-static const llvm::SmallVector<TargetData<feature_sz>, 0> &get_cmdline_targets(const char *cpu_target)
-{
-    auto feature_cb = [] (const char *str, size_t len, FeatureList<feature_sz> &list) {
-        auto fbit = find_feature_bit(feature_names, nfeature_names, str, len);
-        if (fbit == UINT32_MAX)
-            return false;
-        set_bit(list, fbit, true);
-        return true;
-    };
-    auto &targets = ::get_cmdline_targets<feature_sz>(cpu_target, feature_cb);
-    for (auto &t: targets) {
-        if (auto nname = normalize_cpu_name(t.name)) {
-            t.name = nname;
-        }
-    }
-    return targets;
-}
-
-static llvm::SmallVector<TargetData<feature_sz>, 0> jit_targets;
-
-static TargetData<feature_sz> arg_target_data(const TargetData<feature_sz> &arg, bool require_host)
-{
-    TargetData<feature_sz> res = arg;
-    const FeatureList<feature_sz> *cpu_features = nullptr;
-    if (res.name == "native") {
-        res.name = host_cpu_name();
-        cpu_features = &get_host_cpu().second;
-    }
-    else if (auto spec = find_cpu(res.name)) {
-        cpu_features = &spec->features;
-    }
-    else {
-        res.en.flags |= JL_TARGET_UNKNOWN_NAME;
-    }
-    if (cpu_features) {
-        for (size_t i = 0; i < feature_sz; i++) {
-            res.en.features[i] |= (*cpu_features)[i];
-        }
-    }
-    enable_depends(res.en.features);
-    // Mask our rdrand/rdseed/rtm/xsaveopt features that LLVM doesn't use and rr disables
-    unset_bits(res.en.features, Feature::rdrnd, Feature::rdseed, Feature::rtm, Feature::xsaveopt);
-    for (size_t i = 0; i < feature_sz; i++)
-        res.en.features[i] &= ~res.dis.features[i];
-    if (require_host) {
-        for (size_t i = 0; i < feature_sz; i++) {
-            res.en.features[i] &= get_host_cpu().second[i];
-        }
-    }
-    disable_depends(res.en.features);
-    if (cpu_features) {
-        // If the base feature if known, fill in the disable features
-        for (size_t i = 0; i < feature_sz; i++) {
-            res.dis.features[i] = feature_masks[i] & ~res.en.features[i];
-        }
-    }
-    return res;
-}
-
-static int max_vector_size(const FeatureList<feature_sz> &features)
-{
-    if (test_nbit(features, Feature::avx512f))
-        return 64;
-    if (test_nbit(features, Feature::avx))
-        return 32;
-    // SSE is required
-    return 16;
-}
-
-static uint32_t sysimg_init_cb(void *ctx, const void *id, jl_value_t** rejection_reason)
-{
-    // First see what target is requested for the JIT.
-    const char *cpu_target = (const char *)ctx;
-    auto &cmdline = get_cmdline_targets(cpu_target);
-    TargetData<feature_sz> target = arg_target_data(cmdline[0], true);
-    // Then find the best match in the sysimg
-    auto sysimg = deserialize_target_data<feature_sz>((const uint8_t*)id);
-    // We translate `generic` to `pentium4` or `x86-64` before sending it to LLVM
-    // (see `get_llvm_target_noext`) which will be serialized into the sysimg target data.
-    // Translate them back so we can actually match them.
-    // We also track to see if the sysimg allows -cx16, however if the user does
-    // something silly like add +cx16 on a 32bit target, we want to disable this
-    // check, hence the pointer size check.
-    bool sysimg_allows_no_cx16 = sizeof(void *) == 4;;
-    for (auto &t: sysimg) {
-        if (auto nname = normalize_cpu_name(t.name)) {
-            t.name = nname;
-        }
-
-        // Take note to see if the sysimg explicitly allows an architecture without cx16
-        sysimg_allows_no_cx16 |= !test_nbit(t.en.features, Feature::cx16);
-    }
-    if (!sysimg_allows_no_cx16 && !test_nbit(target.en.features, Feature::cx16)) {
-        jl_error("Your CPU does not support the CX16 instruction, which is required "
-                 "by this version of Julia!  This is often due to running inside of a "
-                 "virtualized environment.  Please read "
-                 "https://docs.julialang.org/en/v1/devdocs/sysimg/ for more.");
-    }
-    auto match = match_sysimg_targets(sysimg, target, max_vector_size, rejection_reason);
-    if (match.best_idx == UINT32_MAX)
-        return match.best_idx;
-    // Now we've decided on which sysimg version to use.
-    // Make sure the JIT target is compatible with it and save the JIT target.
-    if (match.vreg_size != max_vector_size(target.en.features) &&
-        (sysimg[match.best_idx].en.flags & JL_TARGET_VEC_CALL)) {
-        if (match.vreg_size < 64) {
-            features_disable_avx512(target.en.features);
-        }
-        if (match.vreg_size < 32) {
-            features_disable_avx(target.en.features);
-        }
-    }
-    jit_targets.push_back(std::move(target));
-    return match.best_idx;
-}
-
-static uint32_t pkgimg_init_cb(void *ctx, const void *id, jl_value_t **rejection_reason)
-{
-    TargetData<feature_sz> target = jit_targets.front();
-    auto pkgimg = deserialize_target_data<feature_sz>((const uint8_t*)id);
-    for (auto &t: pkgimg) {
-        if (auto nname = normalize_cpu_name(t.name)) {
-            t.name = nname;
-        }
-    }
-    auto match = match_sysimg_targets(pkgimg, target, max_vector_size, rejection_reason);
-    return match.best_idx;
-}
-
-//This function serves as a fallback during bootstrapping, at that point we don't have a sysimage with native code
-// so we won't call sysimg_init_cb, else this function shouldn't do anything.
-static void ensure_jit_target(const char *cpu_target, bool imaging)
-{
-    auto &cmdline = get_cmdline_targets(cpu_target);
-    check_cmdline(cmdline, imaging);
-    if (!jit_targets.empty())
-        return;
-    for (auto &arg: cmdline) {
-        auto data = arg_target_data(arg, jit_targets.empty());
-        jit_targets.push_back(std::move(data));
-    }
-    auto ntargets = jit_targets.size();
-    // Now decide the clone condition.
-    for (size_t i = 1; i < ntargets; i++) {
-        auto &t = jit_targets[i];
-        if (t.en.flags & JL_TARGET_CLONE_ALL)
-            continue;
-        // Always clone when code checks CPU features
-        t.en.flags |= JL_TARGET_CLONE_CPU;
-        // The most useful one in general...
-        t.en.flags |= JL_TARGET_CLONE_LOOP;
-        auto &features0 = jit_targets[t.base].en.features;
-        // Special case for KNL/KNM since they're so different
-        if (!(t.dis.flags & JL_TARGET_CLONE_ALL)) {
-            if ((t.name == "knl" || t.name == "knm") &&
-                jit_targets[t.base].name != "knl" && jit_targets[t.base].name != "knm") {
-                t.en.flags |= JL_TARGET_CLONE_ALL;
-                break;
-            }
-        }
-        static constexpr uint32_t clone_math[] = {Feature::fma, Feature::fma4};
-        static constexpr uint32_t clone_simd[] = {Feature::sse3, Feature::ssse3,
-                                                  Feature::sse41, Feature::sse42,
-                                                  Feature::avx, Feature::avx2,
-                                                  Feature::vaes, Feature::vpclmulqdq,
-                                                  Feature::sse4a, Feature::avx512f,
-                                                  Feature::avx512dq, Feature::avx512ifma,
-                                                  Feature::avx512cd, Feature::avx512bw,
-                                                  Feature::avx512vl, Feature::avx512vbmi,
-                                                  Feature::avx512vpopcntdq, Feature::avxvnni,
-                                                  Feature::avx512vbmi2, Feature::avx512vnni,
-                                                  Feature::avx512bitalg, Feature::avx512bf16,
-                                                  Feature::avx512vp2intersect, Feature::avx512fp16};
-        for (auto fe: clone_math) {
-            if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) {
-                t.en.flags |= JL_TARGET_CLONE_MATH;
-                break;
-            }
-        }
-        for (auto fe: clone_simd) {
-            if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) {
-                t.en.flags |= JL_TARGET_CLONE_SIMD;
-                break;
-            }
-        }
-        static constexpr uint32_t clone_fp16[] = {Feature::avx512fp16};
-        for (auto fe: clone_fp16) {
-            if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) {
-                t.en.flags |= JL_TARGET_CLONE_FLOAT16;
-                break;
-            }
-        }
-        static constexpr uint32_t clone_bf16[] = {Feature::avx512bf16};
-        for (auto fe: clone_bf16) {
-            if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) {
-                t.en.flags |= JL_TARGET_CLONE_BFLOAT16;
-                break;
-            }
-        }
-    }
-}
-
-static std::pair<std::string,llvm::SmallVector<std::string, 0>>
-get_llvm_target_noext(const TargetData<feature_sz> &data)
-{
-    std::string name = data.name;
-    auto *spec = find_cpu(name);
-    while (spec) {
-        if (spec->llvmver <= JL_LLVM_VERSION)
-            break;
-        spec = find_cpu((uint32_t)spec->fallback);
-        name = spec->name;
-    }
-    if (name == "generic") {
-        // Use translate `generic` into what we actually require
-#ifdef _CPU_X86_
-        name = "pentium4";
-#else
-        name = "x86-64";
-#endif
-    }
-    llvm::SmallVector<std::string, 0> features;
-    for (auto &fename: feature_names) {
-        if (fename.llvmver > JL_LLVM_VERSION)
-            continue;
-        if (test_nbit(data.en.features, fename.bit)) {
-            features.insert(features.begin(), std::string("+") + fename.name);
-        }
-        else if (test_nbit(data.dis.features, fename.bit)) {
-            features.push_back(std::string("-") + fename.name);
-        }
-    }
-    features.push_back("+sse2");
-    features.push_back("+mmx");
-    features.push_back("+fxsr");
-#ifdef _CPU_X86_64_
-    // This is required to make LLVM happy if LLVM's feature based CPU arch guess
-    // returns a value that may not have 64bit support.
-    // This can happen with virtualization.
-    features.push_back("+64bit");
-#endif
-    features.push_back("+cx8");
-    return std::make_pair(std::move(name), std::move(features));
-}
-
-static std::pair<std::string,llvm::SmallVector<std::string, 0>>
-get_llvm_target_vec(const TargetData<feature_sz> &data)
-{
-    auto res0 = get_llvm_target_noext(data);
-    append_ext_features(res0.second, data.ext_features);
-    return res0;
-}
-
-static std::pair<std::string,std::string>
-get_llvm_target_str(const TargetData<feature_sz> &data)
-{
-    auto res0 = get_llvm_target_noext(data);
-    auto features = join_feature_strs(res0.second);
-    append_ext_features(features, data.ext_features);
-    return std::make_pair(std::move(res0.first), std::move(features));
-}
-
-}
-
-using namespace X86;
-
-JL_DLLEXPORT void jl_dump_host_cpu(void)
-{
-    dump_cpu_spec(get_host_cpu().first, get_host_cpu().second, feature_names, nfeature_names,
-                  cpus, ncpu_names);
-}
-
-JL_DLLEXPORT jl_value_t* jl_check_pkgimage_clones(char *data)
-{
-    jl_value_t *rejection_reason = NULL;
-    JL_GC_PUSH1(&rejection_reason);
-    uint32_t match_idx = pkgimg_init_cb(NULL, data, &rejection_reason);
-    JL_GC_POP();
-    if (match_idx == UINT32_MAX)
-        return rejection_reason;
-    return jl_nothing;
-}
-
-JL_DLLEXPORT jl_value_t *jl_cpu_has_fma(int bits)
-{
-    TargetData<feature_sz> target = jit_targets.front();
-    FeatureList<feature_sz> features = target.en.features;
-    if ((bits == 32 || bits == 64) && (test_nbit(features, Feature::fma) || test_nbit(features, Feature::fma4)))
-        return jl_true;
-    else
-        return jl_false;
-}
-
-jl_image_t jl_init_processor_sysimg(jl_image_buf_t image, const char *cpu_target)
-{
-    if (!jit_targets.empty())
-        jl_error("JIT targets already initialized");
-    return parse_sysimg(image, sysimg_init_cb, (void *)cpu_target);
-}
-
-jl_image_t jl_init_processor_pkgimg(jl_image_buf_t image)
-{
-    if (jit_targets.empty())
-        jl_error("JIT targets not initialized");
-    if (jit_targets.size() > 1)
-        jl_error("Expected only one JIT target");
-    return parse_sysimg(image, pkgimg_init_cb, NULL);
-}
-
-std::pair<std::string,llvm::SmallVector<std::string, 0>> jl_get_llvm_target(const char *cpu_target, bool imaging, uint32_t &flags)
-{
-    ensure_jit_target(cpu_target, imaging);
-    flags = jit_targets[0].en.flags;
-    return get_llvm_target_vec(jit_targets[0]);
-}
-
-const std::pair<std::string,std::string> &jl_get_llvm_disasm_target(void)
-{
-    static const auto res = get_llvm_target_str(TargetData<feature_sz>{"generic", "",
-            {feature_masks, 0}, {{}, 0}, 0});
-    return res;
-}
-//This function parses the -C command line to figure out which targets to multiversion to.
-#ifndef __clang_gcanalyzer__
-llvm::SmallVector<jl_target_spec_t, 0> jl_get_llvm_clone_targets(const char *cpu_target)
-{
-
-    auto &cmdline = get_cmdline_targets(cpu_target);
-    check_cmdline(cmdline, true);
-    llvm::SmallVector<TargetData<feature_sz>, 0> image_targets;
-    for (auto &arg: cmdline) {
-        auto data = arg_target_data(arg, image_targets.empty());
-        image_targets.push_back(std::move(data));
-    }
-
-    auto ntargets = image_targets.size();
-    // Now decide the clone condition.
-    for (size_t i = 1; i < ntargets; i++) {
-        auto &t = image_targets[i];
-        if (t.en.flags & JL_TARGET_CLONE_ALL)
-            continue;
-        // Always clone when code checks CPU features
-        t.en.flags |= JL_TARGET_CLONE_CPU;
-        // The most useful one in general...
-        t.en.flags |= JL_TARGET_CLONE_LOOP;
-        auto &features0 = image_targets[t.base].en.features;
-        // Special case for KNL/KNM since they're so different
-        if (!(t.dis.flags & JL_TARGET_CLONE_ALL)) {
-            if ((t.name == "knl" || t.name == "knm") &&
-                image_targets[t.base].name != "knl" && image_targets[t.base].name != "knm") {
-                t.en.flags |= JL_TARGET_CLONE_ALL;
-                break;
-            }
-        }
-        static constexpr uint32_t clone_math[] = {Feature::fma, Feature::fma4};
-        static constexpr uint32_t clone_simd[] = {Feature::sse3, Feature::ssse3,
-                                                  Feature::sse41, Feature::sse42,
-                                                  Feature::avx, Feature::avx2,
-                                                  Feature::vaes, Feature::vpclmulqdq,
-                                                  Feature::sse4a, Feature::avx512f,
-                                                  Feature::avx512dq, Feature::avx512ifma,
-                                                  Feature::avx512cd, Feature::avx512bw,
-                                                  Feature::avx512vl, Feature::avx512vbmi,
-                                                  Feature::avx512vpopcntdq, Feature::avxvnni,
-                                                  Feature::avx512vbmi2, Feature::avx512vnni,
-                                                  Feature::avx512bitalg, Feature::avx512bf16,
-                                                  Feature::avx512vp2intersect, Feature::avx512fp16};
-        for (auto fe: clone_math) {
-            if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) {
-                t.en.flags |= JL_TARGET_CLONE_MATH;
-                break;
-            }
-        }
-        for (auto fe: clone_simd) {
-            if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) {
-                t.en.flags |= JL_TARGET_CLONE_SIMD;
-                break;
-            }
-        }
-        static constexpr uint32_t clone_fp16[] = {Feature::avx512fp16};
-        for (auto fe: clone_fp16) {
-            if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) {
-                t.en.flags |= JL_TARGET_CLONE_FLOAT16;
-                break;
-            }
-        }
-        static constexpr uint32_t clone_bf16[] = {Feature::avx512bf16};
-        for (auto fe: clone_bf16) {
-            if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) {
-                t.en.flags |= JL_TARGET_CLONE_BFLOAT16;
-                break;
-            }
-        }
-    }
-    if (image_targets.empty())
-        jl_error("No targets specified");
-    llvm::SmallVector<jl_target_spec_t, 0> res;
-    for (auto &target: image_targets) {
-        auto features_en = target.en.features;
-        auto features_dis = target.dis.features;
-        for (auto &fename: feature_names) {
-            if (fename.llvmver > JL_LLVM_VERSION) {
-                unset_bits(features_en, fename.bit);
-                unset_bits(features_dis, fename.bit);
-            }
-        }
-        X86::disable_depends(features_en);
-        jl_target_spec_t ele;
-        std::tie(ele.cpu_name, ele.cpu_features) = get_llvm_target_str(target);
-        ele.data = serialize_target_data(target.name, features_en, features_dis,
-                                         target.ext_features);
-        ele.flags = target.en.flags;
-        ele.base = target.base;
-        res.push_back(ele);
-    }
-    return res;
-}
-#endif
-
-extern "C" int jl_test_cpu_feature(jl_cpu_feature_t feature)
-{
-    if (feature >= 32 * feature_sz)
-        return 0;
-    return test_nbit(&get_host_cpu().second[0], feature);
-}
-
-// -- set/clear the FZ/DAZ flags on x86 & x86-64 --
-
-// Cache of information recovered from `cpuid` since executing `cpuid` it at runtime is slow.
-static uint32_t subnormal_flags = [] {
-    int32_t info[4];
-    jl_cpuid(info, 0);
-    if (info[0] >= 1) {
-        jl_cpuid(info, 1);
-        if (info[3] & (1 << 26)) {
-            // SSE2 supports both FZ and DAZ
-            return 0x00008040;
-        }
-        else if (info[3] & (1 << 25)) {
-            // SSE supports only the FZ flag
-            return 0x00008000;
-        }
-    }
-    return 0;
-}();
-
-// Returns non-zero if subnormals go to 0; zero otherwise.
-extern "C" JL_DLLEXPORT int32_t jl_get_zero_subnormals(void)
-{
-    return _mm_getcsr() & subnormal_flags;
-}
-
-// Return zero on success, non-zero on failure.
-extern "C" JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero)
-{
-    uint32_t flags = subnormal_flags;
-    if (flags) {
-        uint32_t state = _mm_getcsr();
-        if (isZero)
-            state |= flags;
-        else
-            state &= ~flags;
-        _mm_setcsr(state);
-        return 0;
-    }
-    else {
-        // Report a failure only if user is trying to enable FTZ/DAZ.
-        return isZero;
-    }
-}
-
-// X86 does not support default NaNs
-extern "C" JL_DLLEXPORT int32_t jl_get_default_nans(void)
-{
-    return 0;
-}
-
-extern "C" JL_DLLEXPORT int32_t jl_set_default_nans(int8_t isDefault)
-{
-    return isDefault;
-}
diff --git a/src/staticdata.c b/src/staticdata.c
index 82e903fdfd05a..2b471f41fc77d 100644
--- a/src/staticdata.c
+++ b/src/staticdata.c
@@ -4401,7 +4401,7 @@ JL_DLLEXPORT void jl_restore_system_image(jl_image_t *image, jl_image_buf_t buf)
         return;
 
     if (buf.kind == JL_IMAGE_KIND_SO)
-        assert(image->fptrs.ptrs); // jl_init_processor_sysimg should already be run
+        assert(image->fptrs.ptrs); // jl_load_sysimg should already be run
 
     JL_SIGATOMIC_BEGIN();
     ios_static_buffer(&f, (char *)buf.data, buf.size);
@@ -4433,7 +4433,7 @@ JL_DLLEXPORT jl_value_t *jl_restore_package_image_from_file(const char *fname, j
     jl_gc_notify_image_load(buf.data, buf.size);
 
     // Despite the name, this function actually parses the pkgimage
-    jl_image_t pkgimage = jl_init_processor_pkgimg(buf);
+    jl_image_t pkgimage = jl_load_pkgimg(buf);
 
     if (ignore_native) {
         // Must disable using native code in possible downstream users of this code:
diff --git a/test/binaryplatforms.jl b/test/binaryplatforms.jl
index 8de522e9c6c8b..81ff9b42a9249 100644
--- a/test/binaryplatforms.jl
+++ b/test/binaryplatforms.jl
@@ -5,6 +5,7 @@ using Test, Base.BinaryPlatforms, Base.BinaryPlatforms.CPUID
 @testset "CPUID" begin
     @test CPUID.cpu_isa() isa CPUID.ISA
 
+    # x86_64 tiers form a strict subset chain
     get_x86_64(n) = (CPUID.ISAs_by_family["x86_64"][n].second)
     @test get_x86_64(2) <  get_x86_64(4)
     @test get_x86_64(5) <= get_x86_64(5)
@@ -12,6 +13,83 @@ using Test, Base.BinaryPlatforms, Base.BinaryPlatforms.CPUID
     @test get_x86_64(7) >= get_x86_64(1)
     @test sort([get_x86_64(6), get_x86_64(4), get_x86_64(2), get_x86_64(4)]) ==
         [get_x86_64(2), get_x86_64(4), get_x86_64(4), get_x86_64(6)]
+
+    # Cross-arch queries return real feature data
+    @test length(CPUID._cross_lookup_cpu("x86_64", "haswell").features) > 10
+    @test length(CPUID._cross_lookup_cpu("aarch64", "cortex-a78").features) > 10
+    @test length(CPUID._cross_lookup_cpu("riscv64", "sifive-u74").features) > 0
+    @test isempty(CPUID._cross_lookup_cpu("x86_64", "nonexistent").features)
+    @test isempty(CPUID._cross_lookup_cpu("badarch", "haswell").features)
+
+    # Apple M-series aliases resolve to their A-series equivalents
+    let m1 = CPUID._cross_lookup_cpu("aarch64", "apple-m1"),
+        a14 = CPUID._cross_lookup_cpu("aarch64", "apple-a14")
+        @test m1.features == a14.features
+    end
+    @test !isempty(CPUID._cross_lookup_cpu("aarch64", "apple-m2").features)
+
+    # Arch name normalization (i686 → x86_64, arm64 → aarch64)
+    @test CPUID._cross_lookup_cpu("i686", "haswell").features ==
+          CPUID._cross_lookup_cpu("x86_64", "haswell").features
+    @test CPUID._cross_lookup_cpu("arm64", "cortex-a78").features ==
+          CPUID._cross_lookup_cpu("aarch64", "cortex-a78").features
+
+    # All families have non-empty ISA data (cross-arch works)
+    for (arch, isas) in CPUID.ISAs_by_family
+        @test length(isas) >= 1
+    end
+
+    # feature_names(arch, cpu) — query by CPU name
+    hsw = CPUID.feature_names("x86_64", "haswell")
+    @test "avx2" in hsw
+    @test "fma" in hsw
+    @test "sse4.2" in hsw
+    @test !("avx512f" in hsw)  # haswell doesn't have avx512
+
+    skx = CPUID.feature_names("x86_64", "skylake-avx512")
+    @test "avx512f" in skx
+    @test "avx512bw" in skx
+
+    # aarch64 cross-arch feature names
+    a78 = CPUID.feature_names("aarch64", "cortex-a78")
+    @test "lse" in a78
+    @test "neon" in a78
+
+    x925 = CPUID.feature_names("aarch64", "cortex-x925")
+    @test "sve2" in x925
+    @test "bf16" in x925
+    @test "dotprod" in x925
+
+    # Architecture version features present for ARM cores
+    @test "v8.1a" in x925
+    @test "v9a" in x925
+
+    # Unknown CPU returns empty
+    @test isempty(CPUID.feature_names("x86_64", "nonexistent"))
+
+    # feature_names(arch, isa) — query by ISA struct
+    names_from_isa = CPUID.feature_names("x86_64", get_x86_64(5))
+    @test "avx" in names_from_isa
+    @test "sse4.2" in names_from_isa
+
+    # feature_names(isa) — host arch default
+    host_names = CPUID.feature_names(CPUID.cpu_isa())
+    @test length(host_names) > 5
+
+    # feature_names() — full default (host arch + host ISA)
+    default_names = CPUID.feature_names()
+    @test default_names == host_names
+
+    # _build_bit_to_name returns a non-empty mapping with known features
+    mapping = CPUID._build_bit_to_name("x86_64")
+    @test length(mapping) > 50
+    @test "avx2" in values(mapping)
+    @test "sse4.2" in values(mapping)
+
+    mapping_aarch64 = CPUID._build_bit_to_name("aarch64")
+    @test length(mapping_aarch64) > 50
+    @test "neon" in values(mapping_aarch64)
+    @test "sve" in values(mapping_aarch64)
 end
 
 # Helper constructor to create a Platform with `validate_strict` set to `true`.
diff --git a/test/cmdlineargs.jl b/test/cmdlineargs.jl
index 273a9ee8e26f7..25eb3e571480e 100644
--- a/test/cmdlineargs.jl
+++ b/test/cmdlineargs.jl
@@ -197,6 +197,13 @@ end
     wait(p)
     @test p.exitcode == 1
     @test occursin("empty CPU name", String(take!(io)))
+
+    # Test --cpu-target=help prints available targets and exits cleanly
+    let v = readchomperrors(`$(Base.julia_cmd(; cpu_target="help"))`)
+        @test v[1] == true  # exits with 0
+        @test occursin("Available CPU targets:", v[2])
+        @test occursin("Host CPU:", v[2])
+    end
 end
 
 let exename = `$(Base.julia_cmd()) --startup-file=no --color=no`
diff --git a/test/llvmpasses/multiversioning-annotate-only.ll b/test/llvmpasses/multiversioning-annotate-only.ll
index 849cf57c78aa3..48322690b509e 100644
--- a/test/llvmpasses/multiversioning-annotate-only.ll
+++ b/test/llvmpasses/multiversioning-annotate-only.ll
@@ -5,29 +5,17 @@
 ; COM: This test checks that multiversioning correctly picks up on features that should trigger cloning
 ; COM: Note that for annotations alone, we don't need jl_fvars or jl_gvars
 
-; COM: Copied from src/processor.h
-; COM:    JL_TARGET_VEC_CALL = 1 << 0,
-; COM:    // Clone all functions
-; COM:    JL_TARGET_CLONE_ALL = 1 << 1,
-; COM:    // Clone when there's scalar math operations that can benefit from target-specific
-; COM:    // optimizations. This includes `muladd`, `fma`, `fast`/`contract` flags.
-; COM:    JL_TARGET_CLONE_MATH = 1 << 2,
-; COM:    // Clone when the function has a loop
-; COM:    JL_TARGET_CLONE_LOOP = 1 << 3,
-; COM:    // Clone when the function uses any vectors
-; COM:    // When this is specified, the cloning pass should also record if any of the cloned functions
-; COM:    // used this in any function call (including the signature of the function itself)
-; COM:    JL_TARGET_CLONE_SIMD = 1 << 4,
-; COM:    // The CPU name is unknown
-; COM:    JL_TARGET_UNKNOWN_NAME = 1 << 5,
-; COM:    // Optimize for size for this target
-; COM:    JL_TARGET_OPTSIZE = 1 << 6,
-; COM:    // Only optimize for size for this target
-; COM:    JL_TARGET_MINSIZE = 1 << 7,
-; COM:    // Clone when the function queries CPU features
-; COM:    JL_TARGET_CLONE_CPU = 1 << 8,
-; COM:    // Clone when the function uses fp16
-; COM:    JL_TARGET_CLONE_FLOAT16 = 1 << 9,
+; COM: Target spec packed_flags() encoding (from llvm-multiversioning.cpp):
+; COM:    clone_all       = 1 << 0
+; COM:    opt_size        = 1 << 1
+; COM:    min_size        = 1 << 2
+; COM:    has_new_math    = 1 << 3
+; COM:    has_new_simd    = 1 << 4
+; COM:    has_new_float16 = 1 << 5
+; COM:    has_new_bfloat16 = 1 << 6
+; COM:
+; COM: clone_flags() always includes LOOP and CPU categories.
+; COM: Additionally includes MATH if has_new_math, SIMD if has_new_simd, etc.
 
 ; COM: start with the basics, just one feature per function
 
@@ -78,7 +66,7 @@ define noundef float @simd_fastmath_test(<4 x float> noundef %0) {
   ret float %4
 }
 
-; CHECK: @loop_fastmath_test{{.*}}#[[LOOP_FASTMATH_TEST_ATTRS:[0-9]+]]
+; CHECK: @loop_fastmath_test{{.*}}#[[LOOP_TEST_ATTRS]]
 define noundef i32 @loop_fastmath_test(i32 noundef %0) {
   %2 = icmp sgt i32 %0, 0
   br i1 %2, label %7, label %5
@@ -102,7 +90,7 @@ define noundef i32 @loop_fastmath_test(i32 noundef %0) {
   br i1 %14, label %3, label %7, !llvm.loop !9
 }
 
-; CHECK: @simd_loop_test{{.*}}#[[SIMD_LOOP_TEST_ATTRS:[0-9]+]]
+; CHECK: @simd_loop_test{{.*}}#[[LOOP_TEST_ATTRS]]
 define dso_local noundef i32 @simd_loop_test(<4 x i32> noundef %0) {
   %2 = extractelement <4 x i32> %0, i64 0
   %3 = icmp sgt i32 %2, 0
@@ -122,7 +110,7 @@ define dso_local noundef i32 @simd_loop_test(<4 x i32> noundef %0) {
   br i1 %12, label %4, label %6, !llvm.loop !9
 }
 
-; CHECK: @simd_loop_fastmath_test{{.*}}#[[SIMD_LOOP_FASTMATH_TEST_ATTRS:[0-9]+]]
+; CHECK: @simd_loop_fastmath_test{{.*}}#[[LOOP_TEST_ATTRS]]
 define noundef i32 @simd_loop_fastmath_test(<4 x i32> noundef %0) {
   %2 = extractelement <4 x i32> %0, i64 0
   %3 = icmp sgt i32 %2, 0
@@ -180,12 +168,9 @@ define noundef i32 @uncloned(i32 noundef %0) {
 ; COM: Note that these strings are hex-encoded bits of the target indices that will be cloned
 ; CHECK-DAG: attributes #[[BORING_ATTRS]] = { "julia.mv.clones"="2" }
 ; CHECK-DAG: attributes #[[FASTMATH_TEST_ATTRS]] = { "julia.mv.clones"="6" }
-; CHECK-DAG: attributes #[[LOOP_TEST_ATTRS]] = { "julia.mv.clones"="A" }
+; CHECK-DAG: attributes #[[LOOP_TEST_ATTRS]] = { "julia.mv.clones"="1E" }
 ; CHECK-DAG: attributes #[[SIMD_TEST_ATTRS]] = { "julia.mv.clones"="12" }
 ; CHECK-DAG: attributes #[[SIMD_FASTMATH_TEST_ATTRS]] = { "julia.mv.clones"="16" }
-; CHECK-DAG: attributes #[[LOOP_FASTMATH_TEST_ATTRS]] = { "julia.mv.clones"="E" }
-; CHECK-DAG: attributes #[[SIMD_LOOP_TEST_ATTRS]] = { "julia.mv.clones"="1A" }
-; CHECK-DAG: attributes #[[SIMD_LOOP_FASTMATH_TEST_ATTRS]] = { "julia.mv.clones"="1E" }
 ; CHECK-DAG: attributes #[[FUNC_IN_GV_ATTRS]]
 ; CHECK-SAME: "julia.mv.clones"="2"
 ; CHECK-SAME: "julia.mv.fvar"
@@ -210,9 +195,9 @@ define noundef i32 @uncloned(i32 noundef %0) {
 !1 = !{i32 1, !"julia.mv.skipcloning", i32 1}
 !2 = !{i32 1, !"julia.mv.specs", !3}
 !3 = !{!4, !5, !6, !7, !8}
-!4 = !{!"cpubase", !"nofeatures", i32 0, i32 2}
-!5 = !{!"cpucloneall", !"cloneall", i32 0, i32 2}
-!6 = !{!"cpufastmath", !"fastmathclone", i32 0, i32 4}
-!7 = !{!"cpuloop", !"loopclone", i32 0, i32 8}
+!4 = !{!"cpubase", !"nofeatures", i32 0, i32 0}
+!5 = !{!"cpucloneall", !"cloneall", i32 0, i32 1}
+!6 = !{!"cpufastmath", !"fastmathclone", i32 0, i32 8}
+!7 = !{!"cpuloop", !"loopclone", i32 0, i32 0}
 !8 = !{!"cpusimd", !"simdclone", i32 0, i32 16}
 !9 = !{!9}
diff --git a/test/llvmpasses/multiversioning-clone-only.ll b/test/llvmpasses/multiversioning-clone-only.ll
index c4f5257a59988..aff71f4c87c47 100644
--- a/test/llvmpasses/multiversioning-clone-only.ll
+++ b/test/llvmpasses/multiversioning-clone-only.ll
@@ -210,10 +210,10 @@ attributes #3 = {"julia.mv.clones"="6"}
 !1 = !{i32 1, !"julia.mv.annotated", i32 1}
 !2 = !{i32 1, !"julia.mv.specs", !3}
 !3 = !{!4, !5, !6, !7, !8}
-!4 = !{!"cpubase", !"nofeatures", i32 0, i32 2}
-!5 = !{!"cpucloneall", !"cloneall", i32 0, i32 2}
-!6 = !{!"cpufastmath", !"fastmathclone", i32 0, i32 4}
-!7 = !{!"cpuloop", !"loopclone", i32 0, i32 8}
+!4 = !{!"cpubase", !"nofeatures", i32 0, i32 0}
+!5 = !{!"cpucloneall", !"cloneall", i32 0, i32 1}
+!6 = !{!"cpufastmath", !"fastmathclone", i32 0, i32 8}
+!7 = !{!"cpuloop", !"loopclone", i32 0, i32 0}
 !8 = !{!"cpusimd", !"simdclone", i32 0, i32 16}
 ; CHECK-DAG: ![[TBAA_CONST_METADATA]] = !{![[JTBAA_CONST_METADATA:[0-9]+]], ![[JTBAA_CONST_METADATA]]
 ; CHECK-DAG: ![[JTBAA_CONST_METADATA]] = !{!"jtbaa_const"
diff --git a/test/llvmpasses/multiversioning-x86.ll b/test/llvmpasses/multiversioning-x86.ll
index e2918d0c20eec..7ce50a3397127 100644
--- a/test/llvmpasses/multiversioning-x86.ll
+++ b/test/llvmpasses/multiversioning-x86.ll
@@ -119,5 +119,5 @@ define noundef i32 @simd_test_call(<4 x i32> noundef %0) {
 !2 = !{i32 1, !"julia.mv.specs", !3}
 !3 = !{!4, !5, !6}
 !4 = !{!"x86-64", !"+cx16,-sse3,-pclmul,-ssse3,-fma,-sse4.1,-sse4.2,-movbe,-popcnt,-aes,-xsave,-avx,-f16c,-rdrnd,-fsgsbase,-bmi,-avx2,-bmi2,-rtm,-avx512f,-avx512dq,-rdseed,-adx,-avx512ifma,-clflushopt,-clwb,-avx512cd,-sha,-avx512bw,-avx512vl,-avx512vbmi,-pku,-waitpkg,-avx512vbmi2,-shstk,-gfni,-vaes,-vpclmulqdq,-avx512vnni,-avx512bitalg,-avx512vpopcntdq,-rdpid,-cldemote,-movdiri,-movdir64b,-enqcmd,-uintr,-avx512vp2intersect,-serialize,-tsxldtrk,-pconfig,-amx-bf16,-avx512fp16,-amx-tile,-amx-int8,-sahf,-lzcnt,-sse4a,-prfchw,-xop,-fma4,-tbm,-mwaitx,-xsaveopt,-xsavec,-xsaves,-clzero,-wbnoinvd,-avxvnni,-avx512bf16,-ptwrite,+sse2,+mmx,+fxsr,+64bit,+cx8", i32 0, i32 0}
-!5 = !{!"sandybridge", !"+sahf,+avx,+xsave,+popcnt,+sse4.2,+sse4.1,+cx16,+ssse3,+pclmul,+sse3,-fma,-movbe,-aes,-f16c,-rdrnd,-fsgsbase,-bmi,-avx2,-bmi2,-rtm,-avx512f,-avx512dq,-rdseed,-adx,-avx512ifma,-clflushopt,-clwb,-avx512cd,-sha,-avx512bw,-avx512vl,-avx512vbmi,-pku,-waitpkg,-avx512vbmi2,-shstk,-gfni,-vaes,-vpclmulqdq,-avx512vnni,-avx512bitalg,-avx512vpopcntdq,-rdpid,-cldemote,-movdiri,-movdir64b,-enqcmd,-uintr,-avx512vp2intersect,-serialize,-tsxldtrk,-pconfig,-amx-bf16,-avx512fp16,-amx-tile,-amx-int8,-lzcnt,-sse4a,-prfchw,-xop,-fma4,-tbm,-mwaitx,-xsaveopt,-xsavec,-xsaves,-clzero,-wbnoinvd,-avxvnni,-avx512bf16,-ptwrite,+sse2,+mmx,+fxsr,+64bit,+cx8", i32 0, i32 2}
-!6 = !{!"haswell", !"+lzcnt,+sahf,+bmi2,+avx2,+bmi,+fsgsbase,+f16c,+avx,+xsave,+popcnt,+movbe,+sse4.2,+sse4.1,+cx16,+fma,+ssse3,+pclmul,+sse3,-aes,-rdrnd,-rtm,-avx512f,-avx512dq,-rdseed,-adx,-avx512ifma,-clflushopt,-clwb,-avx512cd,-sha,-avx512bw,-avx512vl,-avx512vbmi,-pku,-waitpkg,-avx512vbmi2,-shstk,-gfni,-vaes,-vpclmulqdq,-avx512vnni,-avx512bitalg,-avx512vpopcntdq,-rdpid,-cldemote,-movdiri,-movdir64b,-enqcmd,-uintr,-avx512vp2intersect,-serialize,-tsxldtrk,-pconfig,-amx-bf16,-avx512fp16,-amx-tile,-amx-int8,-sse4a,-prfchw,-xop,-fma4,-tbm,-mwaitx,-xsaveopt,-xsavec,-xsaves,-clzero,-wbnoinvd,-avxvnni,-avx512bf16,-ptwrite,+sse2,+mmx,+fxsr,+64bit,+cx8", i32 1, i32 284}
+!5 = !{!"sandybridge", !"+sahf,+avx,+xsave,+popcnt,+sse4.2,+sse4.1,+cx16,+ssse3,+pclmul,+sse3,-fma,-movbe,-aes,-f16c,-rdrnd,-fsgsbase,-bmi,-avx2,-bmi2,-rtm,-avx512f,-avx512dq,-rdseed,-adx,-avx512ifma,-clflushopt,-clwb,-avx512cd,-sha,-avx512bw,-avx512vl,-avx512vbmi,-pku,-waitpkg,-avx512vbmi2,-shstk,-gfni,-vaes,-vpclmulqdq,-avx512vnni,-avx512bitalg,-avx512vpopcntdq,-rdpid,-cldemote,-movdiri,-movdir64b,-enqcmd,-uintr,-avx512vp2intersect,-serialize,-tsxldtrk,-pconfig,-amx-bf16,-avx512fp16,-amx-tile,-amx-int8,-lzcnt,-sse4a,-prfchw,-xop,-fma4,-tbm,-mwaitx,-xsaveopt,-xsavec,-xsaves,-clzero,-wbnoinvd,-avxvnni,-avx512bf16,-ptwrite,+sse2,+mmx,+fxsr,+64bit,+cx8", i32 0, i32 1}
+!6 = !{!"haswell", !"+lzcnt,+sahf,+bmi2,+avx2,+bmi,+fsgsbase,+f16c,+avx,+xsave,+popcnt,+movbe,+sse4.2,+sse4.1,+cx16,+fma,+ssse3,+pclmul,+sse3,-aes,-rdrnd,-rtm,-avx512f,-avx512dq,-rdseed,-adx,-avx512ifma,-clflushopt,-clwb,-avx512cd,-sha,-avx512bw,-avx512vl,-avx512vbmi,-pku,-waitpkg,-avx512vbmi2,-shstk,-gfni,-vaes,-vpclmulqdq,-avx512vnni,-avx512bitalg,-avx512vpopcntdq,-rdpid,-cldemote,-movdiri,-movdir64b,-enqcmd,-uintr,-avx512vp2intersect,-serialize,-tsxldtrk,-pconfig,-amx-bf16,-avx512fp16,-amx-tile,-amx-int8,-sse4a,-prfchw,-xop,-fma4,-tbm,-mwaitx,-xsaveopt,-xsavec,-xsaves,-clzero,-wbnoinvd,-avxvnni,-avx512bf16,-ptwrite,+sse2,+mmx,+fxsr,+64bit,+cx8", i32 1, i32 24}