diff --git a/base/Makefile b/base/Makefile index a9fe6ae7fa6c5..ca7eb6e1e2879 100644 --- a/base/Makefile +++ b/base/Makefile @@ -17,17 +17,22 @@ else PCRE_INCL_PATH := $(build_includedir)/pcre2.h endif -define parse_features -@printf "%s\n" "# $(2) features" >> $@ -@$(call PRINT_PERL, cat $(SRCDIR)/../src/features_$(1).h | perl -lne 'print "const JL_$(2)_$$1 = UInt32($$2)" if /^\s*JL_FEATURE_DEF(?:_NAME)?\(\s*(\w+)\s*,\s*([^,]+)\s*,.*\)\s*(?:\/\/.*)?$$/' >> $@) +# Extract feature indices from cpufeatures generated headers. +# The FeatureIndex enum has entries like: FEAT_SSE3 = 108, +# We convert them to: const JL_X86_sse3 = UInt32(108) +CPUFEATURES_GENDIR := $(build_includedir)/cpufeatures + +define parse_cpufeatures +@printf "%s\n" "# $(2) features (from cpufeatures)" >> $@ +@$(call PRINT_PERL, perl -lne 'if (/^\s*FEAT_(\w+)\s*=\s*(\d+)/) { my $$n = lc($$1); print "const JL_$(2)_$$n = UInt32($$2)" }' $(1) >> $@) @printf "\n" >> $@ endef -$(BUILDDIR)/features_h.jl: $(SRCDIR)/../src/features_x86.h $(SRCDIR)/../src/features_aarch32.h $(SRCDIR)/../src/features_aarch64.h +# List the parsed headers explicitly: a parse-time wildcard silently yields no prerequisites if they are not installed yet. +$(BUILDDIR)/features_h.jl: $(CPUFEATURES_GENDIR)/target_tables_x86_64.h $(CPUFEATURES_GENDIR)/target_tables_aarch64.h @-rm -f $@ - @$(call parse_features,x86,X86) - @$(call parse_features,aarch32,AArch32) - @$(call parse_features,aarch64,AArch64) + @$(call parse_cpufeatures,$(CPUFEATURES_GENDIR)/target_tables_x86_64.h,X86) + @$(call parse_cpufeatures,$(CPUFEATURES_GENDIR)/target_tables_aarch64.h,AArch64) $(BUILDDIR)/pcre_h.jl: $(PCRE_INCL_PATH) @$(call PRINT_PERL, $(CPP) -D PCRE2_CODE_UNIT_WIDTH=8 -dM $< | perl -nle '/^\s*#define\s+PCRE2_(\w*)\s*\(?($(PCRE_CONST))\)?u?\s*$$/ and print index($$1, "ERROR_") == 0 ? "const $$1 = Cint($$2)" : "const $$1 = UInt32($$2)"' | LC_ALL=C sort > $@) diff --git a/base/cpuid.jl b/base/cpuid.jl index 0370bd33b83e5..5f58f596b0af9 100644 --- a/base/cpuid.jl +++ b/base/cpuid.jl @@ -10,8 +10,8 @@ export cpu_isa A structure which represents the Instruction Set Architecture (ISA) of a computer. 
It holds the `Set` of features of the CPU. -The numerical values of the features are automatically generated from the C -source code of Julia and stored in the `features_h.jl` Julia file. +Feature bit indices come from the cpufeatures library's generated tables +(extracted from LLVM's TableGen data at build time). """ struct ISA features::Set{UInt32} @@ -23,55 +23,167 @@ Base.isless(a::ISA, b::ISA) = a < b include(string(Base.BUILDROOT, "features_h.jl")) # include($BUILDROOT/base/features_h.jl) -# Keep in sync with `arch_march_isa_mapping`. +""" + _featurebytes_to_isa(buf::Vector{UInt8}) -> ISA + +Convert a raw feature byte buffer (from cpufeatures) into an ISA. +""" +function _featurebytes_to_isa(buf::Vector{UInt8}) + features = Set{UInt32}() + for byte_idx in 0:length(buf)-1 + b = buf[byte_idx + 1] + b == 0 && continue + for bit in 0:7 + if (b >> bit) & 1 != 0 + push!(features, UInt32(byte_idx * 8 + bit)) + end + end + end + return ISA(features) +end + +""" + _cross_lookup_cpu(arch::String, name::String) -> ISA + +Look up hardware features for a CPU on any architecture using the +cross-arch tables. Works regardless of host architecture. +Returns an empty ISA if the CPU or architecture is not found. +""" +function _cross_lookup_cpu(arch::String, name::String) + nbytes = ccall(:jl_cpufeatures_cross_nbytes, Csize_t, (Cstring,), arch) + nbytes == 0 && return ISA(Set{UInt32}()) + buf = Vector{UInt8}(undef, nbytes) + written = ccall(:jl_cpufeatures_cross_lookup, Csize_t, + (Cstring, Cstring, Ptr{UInt8}, Csize_t), + arch, name, buf, nbytes) + written == 0 && return ISA(Set{UInt32}()) + return _featurebytes_to_isa(buf) +end + +""" + _build_bit_to_name(arch::String) -> Dict{UInt32, String} + +Build a mapping from feature bit index to feature name for an architecture. 
+""" +function _build_bit_to_name(arch::String) + nfeats = ccall(:jl_cpufeatures_cross_num_features, UInt32, (Cstring,), arch) + result = Dict{UInt32, String}() + for i in 0:Int(nfeats)-1 # widen first: on 32-bit hosts UInt32(0)-1 wraps to 0xffffffff + name_ptr = ccall(:jl_cpufeatures_cross_feature_name, Cstring, (Cstring, UInt32), arch, i) + name_ptr == C_NULL && continue + bit = ccall(:jl_cpufeatures_cross_feature_bit, Cint, (Cstring, UInt32), arch, i) + bit < 0 && continue + result[UInt32(bit)] = unsafe_string(name_ptr) + end + return result +end + +""" + feature_names(arch::String, cpu::String) -> Vector{String} + feature_names(arch::String, isa::ISA) -> Vector{String} + feature_names(isa::ISA) -> Vector{String} + feature_names() -> Vector{String} + +Return sorted hardware feature names. Can query by CPU name (on any +architecture) or by ISA. Defaults to the host architecture and CPU. + +# Examples +```julia +feature_names() # host CPU features +feature_names("x86_64", "haswell") # haswell's features +feature_names("aarch64", "cortex-x925") # cross-arch query +``` +""" +feature_names() = feature_names(string(Sys.ARCH), _host_isa()) +feature_names(isa::ISA) = feature_names(string(Sys.ARCH), isa) +function feature_names(arch::String, cpu::String) + isa = _cross_lookup_cpu(arch, cpu) + return feature_names(arch, isa) +end +function feature_names(arch::String, isa::ISA) + mapping = _build_bit_to_name(arch) + return sort([get(mapping, bit, "unknown_$bit") for bit in isa.features]) +end + +""" + _lookup_cpu(name::String) -> ISA + +Look up hardware features for the named CPU on the host architecture. +Returns an empty ISA if the CPU name is not found. 
+""" +function _lookup_cpu(name::String) + nbytes = ccall(:jl_cpufeatures_nbytes, Csize_t, ()) + buf = Vector{UInt8}(undef, nbytes) + ret = ccall(:jl_cpufeatures_lookup, Cint, (Cstring, Ptr{UInt8}, Csize_t), name, buf, nbytes) + ret != 0 && return ISA(Set{UInt32}()) + return _featurebytes_to_isa(buf) +end + +""" + _host_isa() -> ISA + +Get the hardware features of the host CPU from the cpufeatures library. +""" +function _host_isa() + nbytes = ccall(:jl_cpufeatures_nbytes, Csize_t, ()) + buf = Vector{UInt8}(undef, nbytes) + ccall(:jl_cpufeatures_host, Cvoid, (Ptr{UInt8}, Csize_t), buf, nbytes) + return _featurebytes_to_isa(buf) +end + +# Build an ISA list for a given architecture family. +# Uses cross-arch lookup so it works on any host. +# Entries with empty cpuname get an empty ISA (generic baseline). +function _make_isa_list(arch::String, entries::Vector{Pair{String,String}}) + result = Pair{String,ISA}[] + for (label, cpuname) in entries + if isempty(cpuname) + push!(result, label => ISA(Set{UInt32}())) + else + push!(result, label => _cross_lookup_cpu(arch, cpuname)) + end + end + return result +end + +# ISA definitions per architecture family. +# CPU names are LLVM names in the cpufeatures database. +# Keep in sync with `arch_march_isa_mapping` in binaryplatforms.jl. const ISAs_by_family = Dict( - "i686" => [ - # Source: https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html. - # Implicit in all sets, because always required by Julia: mmx, sse, sse2 - "pentium4" => ISA(Set{UInt32}()), - "prescott" => ISA(Set((JL_X86_sse3,))), - ], - "x86_64" => [ - # Source: https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html. 
- # Implicit in all sets, because always required by x86-64 architecture: mmx, sse, sse2 - "x86_64" => ISA(Set{UInt32}()), - "core2" => ISA(Set((JL_X86_sse3, JL_X86_ssse3))), - "nehalem" => ISA(Set((JL_X86_sse3, JL_X86_ssse3, JL_X86_sse41, JL_X86_sse42, JL_X86_popcnt))), - "sandybridge" => ISA(Set((JL_X86_sse3, JL_X86_ssse3, JL_X86_sse41, JL_X86_sse42, JL_X86_popcnt, JL_X86_avx, JL_X86_aes, JL_X86_pclmul))), - "haswell" => ISA(Set((JL_X86_movbe, JL_X86_sse3, JL_X86_ssse3, JL_X86_sse41, JL_X86_sse42, JL_X86_popcnt, JL_X86_avx, JL_X86_avx2, JL_X86_aes, JL_X86_pclmul, JL_X86_fsgsbase, JL_X86_rdrnd, JL_X86_fma, JL_X86_bmi, JL_X86_bmi2, JL_X86_f16c))), - "skylake" => ISA(Set((JL_X86_movbe, JL_X86_sse3, JL_X86_ssse3, JL_X86_sse41, JL_X86_sse42, JL_X86_popcnt, JL_X86_avx, JL_X86_avx2, JL_X86_aes, JL_X86_pclmul, JL_X86_fsgsbase, JL_X86_rdrnd, JL_X86_fma, JL_X86_bmi, JL_X86_bmi2, JL_X86_f16c, JL_X86_rdseed, JL_X86_adx, JL_X86_prfchw, JL_X86_clflushopt, JL_X86_xsavec, JL_X86_xsaves))), - "skylake_avx512" => ISA(Set((JL_X86_movbe, JL_X86_sse3, JL_X86_ssse3, JL_X86_sse41, JL_X86_sse42, JL_X86_popcnt, JL_X86_pku, JL_X86_avx, JL_X86_avx2, JL_X86_aes, JL_X86_pclmul, JL_X86_fsgsbase, JL_X86_rdrnd, JL_X86_fma, JL_X86_bmi, JL_X86_bmi2, JL_X86_f16c, JL_X86_rdseed, JL_X86_adx, JL_X86_prfchw, JL_X86_clflushopt, JL_X86_xsavec, JL_X86_xsaves, JL_X86_avx512f, JL_X86_clwb, JL_X86_avx512vl, JL_X86_avx512bw, JL_X86_avx512dq, JL_X86_avx512cd))), - ], - "armv6l" => [ - # The only armv6l processor we know of that runs Julia on armv6l - # We don't have a good way to tell the different armv6l variants apart through features, - # and honestly we don't care much since it's basically this one chip that people want to use with Julia. 
- "arm1176jzfs" => ISA(Set{UInt32}()), - ], - "armv7l" => [ - "armv7l" => ISA(Set{UInt32}()), - "armv7l+neon" => ISA(Set((JL_AArch32_neon,))), - "armv7l+neon+vfpv4" => ISA(Set((JL_AArch32_neon, JL_AArch32_vfp4))), - ], - "aarch64" => [ - # Implicit in all sets, because always required: fp, asimd - "armv8.0-a" => ISA(Set{UInt32}()), - "armv8.1-a" => ISA(Set((JL_AArch64_v8_1a, JL_AArch64_lse, JL_AArch64_crc, JL_AArch64_rdm))), - "armv8.2-a+crypto" => ISA(Set((JL_AArch64_v8_2a, JL_AArch64_lse, JL_AArch64_crc, JL_AArch64_rdm, JL_AArch64_aes, JL_AArch64_sha2))), - "a64fx" => ISA(Set((JL_AArch64_v8_2a, JL_AArch64_lse, JL_AArch64_crc, JL_AArch64_rdm, JL_AArch64_sha2, JL_AArch64_ccpp, JL_AArch64_complxnum, JL_AArch64_fullfp16, JL_AArch64_sve))), - "apple_m1" => ISA(Set((JL_AArch64_v8_5a, JL_AArch64_lse, JL_AArch64_crc, JL_AArch64_rdm, JL_AArch64_aes, JL_AArch64_sha2, JL_AArch64_sha3, JL_AArch64_ccpp, JL_AArch64_complxnum, JL_AArch64_fp16fml, JL_AArch64_fullfp16, JL_AArch64_dotprod, JL_AArch64_rcpc, JL_AArch64_altnzcv))), - ], - "riscv64" => [ - "riscv64" => ISA(Set{UInt32}()), - ], - "powerpc64le" => [ - # We have no way to test powerpc64le features yet, so we're only going to declare the lowest ISA: - "power8" => ISA(Set{UInt32}()), - ], - "riscv64" => [ - # We have no way to test riscv64 features yet, so we're only going to declare the lowest ISA: - "riscv64" => ISA(Set{UInt32}()), - ], + "i686" => _make_isa_list("x86_64", [ + "pentium4" => "", + "prescott" => "prescott", + ]), + "x86_64" => _make_isa_list("x86_64", [ + "x86_64" => "", + "core2" => "core2", + "nehalem" => "nehalem", + "sandybridge" => "sandybridge", + "haswell" => "haswell", + "skylake" => "skylake", + "skylake_avx512" => "skylake-avx512", + ]), + "aarch64" => _make_isa_list("aarch64", [ + "armv8.0-a" => "", + "armv8.1-a" => "cortex-a76", + "armv8.2-a+crypto" => "cortex-a78", + "a64fx" => "a64fx", + "apple_m1" => "apple-a14", + ]), + "armv6l" => _make_isa_list("aarch64", [ + "arm1176jzfs" => "", + ]), + 
"armv7l" => _make_isa_list("aarch64", [ + "armv7l" => "", + "armv7l+neon" => "", + "armv7l+neon+vfpv4" => "", + ]), + "riscv64" => _make_isa_list("riscv64", [ + "riscv64" => "", + ]), + "powerpc64le" => _make_isa_list("powerpc64le", [ + "power8" => "", + ]), ) # Test a CPU feature exists on the currently-running host @@ -96,27 +208,13 @@ function normalize_arch(arch::String) return arch end -let - # Collect all relevant features for the current architecture, if any. - FEATURES = UInt32[] - arch = normalize_arch(String(Sys.ARCH)) - if arch in keys(ISAs_by_family) - for isa in ISAs_by_family[arch] - unique!(append!(FEATURES, last(isa).features)) - end - end - - # Use `@eval` to inline the list of features. - @eval function cpu_isa() - return ISA(Set{UInt32}(feat for feat in $(FEATURES) if test_cpu_feature(feat))) - end -end - """ cpu_isa() Return the [`ISA`](@ref) (instruction set architecture) of the current CPU. """ -cpu_isa +function cpu_isa() + return _host_isa() +end end # module CPUID diff --git a/base/loading.jl b/base/loading.jl index fa5694227f382..8d0e68ece3b1c 100644 --- a/base/loading.jl +++ b/base/loading.jl @@ -1933,21 +1933,25 @@ end struct ImageTarget name::String flags::Int32 + base::Int32 ext_features::String - features_en::Vector{UInt8} - features_dis::Vector{UInt8} + features_en::String + features_dis::String end function parse_image_target(io::IO) flags = read(io, Int32) - nfeature = read(io, Int32) - feature_en = read(io, 4*nfeature) - feature_dis = read(io, 4*nfeature) + base = read(io, Int32) + nwords = read(io, Int32) # number of uint64_t feature words + feature_en_raw = read(io, 8*nwords) + feature_dis_raw = read(io, 8*nwords) name_len = read(io, Int32) name = String(read(io, name_len)) ext_features_len = read(io, Int32) ext_features = String(read(io, ext_features_len)) - ImageTarget(name, flags, ext_features, feature_en, feature_dis) + features_en = @ccall jl_feature_bits_to_string(feature_en_raw::Ptr{UInt8}, nwords::Int32)::Ref{String} + 
features_dis = @ccall jl_feature_bits_to_string(feature_dis_raw::Ptr{UInt8}, nwords::Int32)::Ref{String} + ImageTarget(name, flags, base, ext_features, features_en, features_dis) end function parse_image_targets(targets::Vector{UInt8}) @@ -1965,51 +1969,18 @@ function current_image_targets() return parse_image_targets(targets) end -struct FeatureName - name::Cstring - bit::UInt32 # bit index into a `uint32_t` array; - llvmver::UInt32 # 0 if it is available on the oldest LLVM version we support -end - -function feature_names() - fnames = Ref{Ptr{FeatureName}}() - nf = Ref{Csize_t}() - @ccall jl_reflect_feature_names(fnames::Ptr{Ptr{FeatureName}}, nf::Ptr{Csize_t})::Cvoid - if fnames[] == C_NULL - @assert nf[] == 0 - return Vector{FeatureName}(undef, 0) - end - Base.unsafe_wrap(Array, fnames[], nf[], own=false) -end - -function test_feature(features::Vector{UInt8}, feat::FeatureName) - bitidx = feat.bit - u8idx = div(bitidx, 8) + 1 - bit = bitidx % 8 - return (features[u8idx] & (1 << bit)) != 0 -end - function show(io::IO, it::ImageTarget) print(io, it.name) if !isempty(it.ext_features) print(io, ",", it.ext_features) end + if it.base >= 0 + print(io, "; base=", it.base) + end print(io, "; flags=", it.flags) - print(io, "; features_en=(") - first = true - for feat in feature_names() - if test_feature(it.features_en, feat) - name = Base.unsafe_string(feat.name) - if first - first = false - print(io, name) - else - print(io, ", ", name) - end - end + if !isempty(it.features_en) + print(io, "; features_en=(", it.features_en, ")") end - print(io, ")") - # Is feature_dis useful? 
end # should sync with the types of arguments of `stale_cachefile` diff --git a/deps/Makefile b/deps/Makefile index cea1e52c55156..1dd0d5dd9e1e5 100644 --- a/deps/Makefile +++ b/deps/Makefile @@ -95,6 +95,8 @@ ifeq ($(USE_SYSTEM_DSFMT), 0) DEP_LIBS += dsfmt endif +DEP_LIBS += cpufeatures + ifeq ($(USE_SYSTEM_LLVM), 0) DEP_LIBS += llvm endif @@ -211,7 +213,7 @@ DEP_LIBS_STAGED_ALL := llvm llvm-tools clang llvmunwind unwind libuv pcre \ openlibm dsfmt blastrampoline openblas lapack gmp mpfr patchelf utf8proc \ objconv openssl libssh2 nghttp2 curl libgit2 libwhich zlib zstd p7zip csl \ sanitizers libsuitesparse lld libtracyclient ittapi nvtx \ - terminfo mmtk_julia + terminfo mmtk_julia cpufeatures DEP_LIBS_ALL := $(DEP_LIBS_STAGED_ALL) ifneq ($(USE_BINARYBUILDER_OPENBLAS),0) @@ -282,6 +284,7 @@ include $(SRCDIR)/unwind.mk include $(SRCDIR)/gmp.mk include $(SRCDIR)/mpfr.mk include $(SRCDIR)/patchelf.mk +include $(SRCDIR)/cpufeatures.mk include $(SRCDIR)/openssl.mk include $(SRCDIR)/libssh2.mk include $(SRCDIR)/nghttp2.mk diff --git a/deps/cpufeatures.mk b/deps/cpufeatures.mk new file mode 100644 index 0000000000000..0bac863c6704e --- /dev/null +++ b/deps/cpufeatures.mk @@ -0,0 +1,48 @@ +## CPUFEATURES - standalone CPU feature detection library ## +include $(SRCDIR)/cpufeatures.version + +CPUFEATURES_SRC_DIR := $(BUILDDIR)/cpufeatures-$(CPUFEATURES_VER) + +$(SRCCACHE)/cpufeatures-$(CPUFEATURES_VER).tar.gz: | $(SRCCACHE) + $(JLDOWNLOAD) $@ $(CPUFEATURES_TAR_URL) + touch -c $@ + +$(CPUFEATURES_SRC_DIR)/source-extracted: $(SRCCACHE)/cpufeatures-$(CPUFEATURES_VER).tar.gz + rm -rf $(dir $@) + mkdir -p $(dir $@) + $(TAR) -C $(dir $@) --strip-components 1 -xf $< + echo 1 > $@ + +checksum-cpufeatures: $(SRCCACHE)/cpufeatures-$(CPUFEATURES_VER).tar.gz + $(JLCHECKSUM) $< + +$(CPUFEATURES_SRC_DIR)/build-compiled: $(CPUFEATURES_SRC_DIR)/source-extracted + $(MAKE) -C $(CPUFEATURES_SRC_DIR) lib \ + CXX="$(CXX)" \ + CXXFLAGS="$(JCXXFLAGS) -O2" \ + ARCH=$(ARCH) + echo 1 > $@ + +define 
CPUFEATURES_INSTALL + mkdir -p $2/$$(build_includedir)/cpufeatures + mkdir -p $2/$$(build_libdir) + cp $1/include/*.h $2/$$(build_includedir)/cpufeatures/ + cp $1/generated/target_tables_*.h $2/$$(build_includedir)/cpufeatures/ + cp $1/build/libtarget_parsing.a $2/$$(build_libdir)/ +endef +$(eval $(call staged-install, \ + cpufeatures,cpufeatures-$(CPUFEATURES_VER), \ + CPUFEATURES_INSTALL,,,,)) + +clean-cpufeatures: + -rm -f $(CPUFEATURES_SRC_DIR)/build-compiled + +distclean-cpufeatures: + rm -rf $(SRCCACHE)/cpufeatures*.tar.gz $(CPUFEATURES_SRC_DIR) + +get-cpufeatures: $(SRCCACHE)/cpufeatures-$(CPUFEATURES_VER).tar.gz +extract-cpufeatures: $(CPUFEATURES_SRC_DIR)/source-extracted +configure-cpufeatures: extract-cpufeatures +compile-cpufeatures: $(CPUFEATURES_SRC_DIR)/build-compiled +fastcheck-cpufeatures: check-cpufeatures +check-cpufeatures: compile-cpufeatures diff --git a/deps/cpufeatures.version b/deps/cpufeatures.version new file mode 100644 index 0000000000000..ced12eccd62f5 --- /dev/null +++ b/deps/cpufeatures.version @@ -0,0 +1,7 @@ +# -*- makefile -*- + +## source build +CPUFEATURES_VER := 0.2.0 +CPUFEATURES_GIT_URL := https://github.com/gbaraldi/cpufeatures.git +CPUFEATURES_TAR_URL := https://github.com/gbaraldi/cpufeatures/archive/e8178f952870a83c506f3f08150e3915193ab862.tar.gz +CPUFEATURES_SHA := e8178f952870a83c506f3f08150e3915193ab862 diff --git a/src/Makefile b/src/Makefile index 495a923f372e3..2a6e0a554b757 100644 --- a/src/Makefile +++ b/src/Makefile @@ -106,7 +106,7 @@ else # JULIACODEGEN != LLVM endif -RT_LLVM_LIBS := support targetparser +RT_LLVM_LIBS := support ifeq ($(OS),WINNT) SRCS += win32_ucontext @@ -203,7 +203,7 @@ LIBJULIA_PATH_REL := libjulia endif COMMON_LIBPATHS := -L$(build_libdir) -L$(build_shlibdir) -RT_LIBS := $(call whole_archive,$(LIBUV)) $(call whole_archive,$(LIBUTF8PROC)) $(LIBUNWIND) $(RT_LLVMLINK) $(OSLIBS) $(LIBTRACYCLIENT) $(LIBITTAPI) -lzstd +RT_LIBS := $(call whole_archive,$(LIBUV)) $(call 
whole_archive,$(LIBUTF8PROC)) $(LIBUNWIND) $(RT_LLVMLINK) $(OSLIBS) $(LIBTRACYCLIENT) $(LIBITTAPI) -lzstd -L$(build_libdir) -ltarget_parsing # NB: CG needs uv_mutex_* symbols, but we expect to export them from libjulia-internal CG_LIBS := $(LIBUNWIND) $(CG_LLVMLINK) $(OSLIBS) $(LIBTRACYCLIENT) $(LIBITTAPI) @@ -422,7 +422,7 @@ $(BUILDDIR)/llvm-pass-helpers.o $(BUILDDIR)/llvm-pass-helpers.dbg.obj: $(SRCDIR) $(BUILDDIR)/llvm-propagate-addrspaces.o $(BUILDDIR)/llvm-propagate-addrspaces.dbg.obj: $(SRCDIR)/llvm-codegen-shared.h $(BUILDDIR)/llvm-remove-addrspaces.o $(BUILDDIR)/llvm-remove-addrspaces.dbg.obj: $(SRCDIR)/llvm-codegen-shared.h $(BUILDDIR)/llvm-ptls.o $(BUILDDIR)/llvm-ptls.dbg.obj: $(SRCDIR)/llvm-codegen-shared.h -$(BUILDDIR)/processor.o $(BUILDDIR)/processor.dbg.obj: $(addprefix $(SRCDIR)/,processor_*.cpp processor.h features_*.h) +$(BUILDDIR)/processor.o $(BUILDDIR)/processor.dbg.obj: $(SRCDIR)/processor.h $(BUILDDIR)/signal-handling.o $(BUILDDIR)/signal-handling.dbg.obj: $(addprefix $(SRCDIR)/,signals-*.c) $(BUILDDIR)/staticdata.o $(BUILDDIR)/staticdata.dbg.obj: $(SRCDIR)/staticdata_utils.c $(SRCDIR)/precompile_utils.c $(SRCDIR)/processor.h $(SRCDIR)/builtin_proto.h $(BUILDDIR)/toplevel.o $(BUILDDIR)/toplevel.dbg.obj: $(SRCDIR)/builtin_proto.h @@ -577,10 +577,7 @@ INCLUDED_CXX_FILES := \ codegen.cpp:abi_x86.cpp \ codegen.cpp:cgutils.cpp \ codegen.cpp:intrinsics.cpp \ - codegen.cpp:ccall.cpp \ - processor.cpp:processor_x86.cpp \ - processor.cpp:processor_arm.cpp \ - processor.cpp:processor_fallback.cpp + codegen.cpp:ccall.cpp .PHONY: clean clean: @@ -612,7 +609,7 @@ $(build_shlibdir)/lib%Plugin.$(SHLIB_EXT): $(SRCDIR)/clangsa/%.cpp $(LLVM_CONFIG # before attempting this static analysis, so that all necessary headers # and dependencies are properly installed: # make -C src install-analysis-deps -ANALYSIS_DEPS := llvm clang llvm-tools libuv utf8proc zstd +ANALYSIS_DEPS := llvm clang llvm-tools libuv utf8proc zstd cpufeatures ifeq ($(OS),Darwin) ANALYSIS_DEPS 
+= llvmunwind else ifeq ($(OS),OpenBSD) diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 1fbfd459fc2ea..4a7936b697b94 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -2105,8 +2105,6 @@ void jl_dump_native_locked(jl_native_code_desc_t *data, const char *bc_fname, // Reset the target triple to make sure it matches the new target machine - bool has_veccall = false; - { JL_TIMING(NATIVE_AOT, NATIVE_Setup); dataM.setDataLayout(DL); @@ -2185,7 +2183,6 @@ void jl_dump_native_locked(jl_native_code_desc_t *data, const char *bc_fname, } } - has_veccall = !!dataM.getModuleFlag("julia.mv.veccall"); }; { @@ -2245,20 +2242,8 @@ void jl_dump_native_locked(jl_native_code_desc_t *data, const char *bc_fname, builder.CreateRet(ConstantInt::get(T_int32, 1)); } if (imaging_mode) { - auto specs = jl_get_llvm_clone_targets(jl_options.cpu_target); - const uint32_t base_flags = has_veccall ? JL_TARGET_VEC_CALL : 0; - SmallVector data; - auto push_i32 = [&] (uint32_t v) { - uint8_t buff[4]; - memcpy(buff, &v, 4); - data.insert(data.end(), buff, buff + 4); - }; - push_i32(specs.size()); - for (uint32_t i = 0; i < specs.size(); i++) { - push_i32(base_flags | (specs[i].flags & JL_TARGET_UNKNOWN_NAME)); - auto &specdata = specs[i].data; - data.insert(data.end(), specdata.begin(), specdata.end()); - } + auto targets = jl_get_llvm_clone_targets(jl_options.cpu_target); + auto &data = targets.data; auto value = ConstantDataArray::get(Context, data); auto target_ids = new GlobalVariable(metadataM, value->getType(), true, GlobalVariable::InternalLinkage, @@ -2274,8 +2259,9 @@ void jl_dump_native_locked(jl_native_code_desc_t *data, const char *bc_fname, jl_small_typeof_copy->setVisibility(GlobalValue::HiddenVisibility); jl_small_typeof_copy->setDSOLocal(true); - // Create CPU target string constant - auto cpu_target_str = jl_options.cpu_target ? jl_options.cpu_target : "native"; + // Create CPU target string constant. 
+ // Don't store "sysimage" keyword — store the actual resolved target string. + std::string cpu_target_str = jl_expand_sysimage_keyword(jl_options.cpu_target); auto cpu_target_data = ConstantDataArray::getString(Context, cpu_target_str, true); auto cpu_target_global = new GlobalVariable(metadataM, cpu_target_data->getType(), true, GlobalVariable::InternalLinkage, diff --git a/src/clangsa/GCChecker.cpp b/src/clangsa/GCChecker.cpp index d5e421c8f65d0..04f204b923cf0 100644 --- a/src/clangsa/GCChecker.cpp +++ b/src/clangsa/GCChecker.cpp @@ -905,7 +905,7 @@ bool GCChecker::isSafepoint(const CallEvent &Call, CheckerContext &C) const { while (DC) { // Anything in llvm or std is not a safepoint if (const NamespaceDecl *NDC = dyn_cast(DC)) - if (NDC->getName() == "llvm" || NDC->getName() == "std") + if (NDC->getName() == "llvm" || NDC->getName() == "std" || NDC->getName() == "tp") return false; DC = DC->getParent(); } diff --git a/src/crc32c.c b/src/crc32c.c index 50d2acc603359..4994015a930e2 100644 --- a/src/crc32c.c +++ b/src/crc32c.c @@ -345,7 +345,8 @@ JL_DLLEXPORT uint32_t jl_crc32c(uint32_t crc, const char *buf, size_t len) # elif defined(_OS_LINUX_) static crc32c_func_t crc32c_dispatch(unsigned long hwcap) { - if (hwcap & (1 << JL_AArch64_crc)) + // HWCAP_CRC32 is bit 7 in the Linux AArch64 HWCAP + if (hwcap & (1 << 7)) return crc32c_armv8; return jl_crc32c_sw; } diff --git a/src/init.c b/src/init.c index 4d7cdf70ef1f8..346e684d29e94 100644 --- a/src/init.c +++ b/src/init.c @@ -582,9 +582,14 @@ static NOINLINE void _finish_jl_init_(jl_image_buf_t sysimage, jl_ptls_t ptls, j if (jl_options.cpu_target == NULL) jl_options.cpu_target = "native"; + if (jl_options.cpu_target[0] == '\0') + jl_error("Invalid target option: empty CPU name"); + + // Validate CPU target: check for unknown names, multiple targets, clone_all + jl_check_cpu_target(jl_options.cpu_target, jl_generating_output()); // Parse image, perform relocations, and init JIT targets, etc. 
- jl_image_t parsed_image = jl_init_processor_sysimg(sysimage, jl_options.cpu_target); + jl_image_t parsed_image = jl_load_sysimg(sysimage, jl_options.cpu_target); jl_init_codegen(); diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp index acfd7de43838e..dce16e3387a20 100644 --- a/src/jitlayers.cpp +++ b/src/jitlayers.cpp @@ -1125,38 +1125,13 @@ namespace { options.MCOptions.ABIName = "lp64"; #endif #endif - uint32_t target_flags = 0; - auto target = jl_get_llvm_target(jl_options.cpu_target, jl_generating_output(), target_flags); - auto &TheCPU = target.first; - SmallVector targetFeatures(target.second.begin(), target.second.end()); + auto [TheCPU, FeaturesStr] = jl_get_llvm_target(jl_options.cpu_target, jl_generating_output()); std::string errorstr; const Target *TheTarget = TargetRegistry::lookupTarget("", TheTriple, errorstr); if (!TheTarget) { jl_errorf("Internal problem with process triple %s lookup: %s", TheTriple.str().c_str(), errorstr.c_str()); return nullptr; } - if (jl_processor_print_help || (target_flags & JL_TARGET_UNKNOWN_NAME)) { - std::unique_ptr MSTI( - TheTarget->createMCSubtargetInfo(TheTriple.str(), "", "")); - if (!MSTI->isCPUStringValid(TheCPU)) { - jl_errorf("Invalid CPU name \"%s\".", TheCPU.c_str()); - return nullptr; - } - if (jl_processor_print_help) { - // This is the only way I can find to print the help message once. - // It'll be nice if we can iterate through the features and print our own help - // message... - MSTI->setDefaultFeatures("help", "", ""); - } - } - // Package up features to be passed to target/subtarget - std::string FeaturesStr; - if (!targetFeatures.empty()) { - SubtargetFeatures Features; - for (unsigned i = 0; i != targetFeatures.size(); ++i) - Features.AddFeature(targetFeatures[i]); - FeaturesStr = Features.getString(); - } // Allocate a target... 
std::optional codemodel = #ifdef _P64 diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp index 55b20479408f9..c9ac19ba616dd 100644 --- a/src/llvm-multiversioning.cpp +++ b/src/llvm-multiversioning.cpp @@ -49,9 +49,17 @@ using namespace llvm; extern std::optional always_have_fma(Function&, const Triple &TT); +// Per-function clone categories (set by IR analysis) +enum { + JL_CLONE_LOOP = 1 << 0, + JL_CLONE_SIMD = 1 << 1, + JL_CLONE_MATH = 1 << 2, + JL_CLONE_CPU = 1 << 3, + JL_CLONE_FLOAT16 = 1 << 4, + JL_CLONE_BFLOAT16 = 1 << 5, +}; + namespace { -constexpr uint32_t clone_mask = - JL_TARGET_CLONE_LOOP | JL_TARGET_CLONE_SIMD | JL_TARGET_CLONE_MATH | JL_TARGET_CLONE_CPU | JL_TARGET_CLONE_FLOAT16 | JL_TARGET_CLONE_BFLOAT16; // Treat identical mapping as missing and return `def` in that case. // We mainly need this to identify cloned function using value map after LLVM cloning @@ -83,9 +91,9 @@ static uint32_t collect_func_info(Function &F, const Triple &TT, bool &has_vecca LoopInfo LI(DT); uint32_t flag = 0; if (!LI.empty()) - flag |= JL_TARGET_CLONE_LOOP; + flag |= JL_CLONE_LOOP; if (is_vector(F.getFunctionType())) { - flag |= JL_TARGET_CLONE_SIMD; + flag |= JL_CLONE_SIMD; has_veccall = true; } for (auto &bb: F) { @@ -93,50 +101,47 @@ static uint32_t collect_func_info(Function &F, const Triple &TT, bool &has_vecca if (auto call = dyn_cast(&I)) { if (is_vector(call->getFunctionType())) { has_veccall = true; - flag |= JL_TARGET_CLONE_SIMD; + flag |= JL_CLONE_SIMD; } if (auto callee = call->getCalledFunction()) { auto name = callee->getName(); if (name.starts_with("llvm.muladd.") || name.starts_with("llvm.fma.")) { - flag |= JL_TARGET_CLONE_MATH; + flag |= JL_CLONE_MATH; } else if (name.starts_with("julia.cpu.")) { if (name.starts_with("julia.cpu.have_fma.")) { - // for some platforms we know they always do (or don't) support - // FMA. in those cases we don't need to clone the function. 
- // always_have_fma returns an optional if (!always_have_fma(*callee, TT)) - flag |= JL_TARGET_CLONE_CPU; + flag |= JL_CLONE_CPU; } else { - flag |= JL_TARGET_CLONE_CPU; + flag |= JL_CLONE_CPU; } } } } else if (auto store = dyn_cast(&I)) { if (store->getValueOperand()->getType()->isVectorTy()) { - flag |= JL_TARGET_CLONE_SIMD; + flag |= JL_CLONE_SIMD; } } else if (I.getType()->isVectorTy()) { - flag |= JL_TARGET_CLONE_SIMD; + flag |= JL_CLONE_SIMD; } if (auto mathOp = dyn_cast(&I)) { if (mathOp->getFastMathFlags().any()) { - flag |= JL_TARGET_CLONE_MATH; + flag |= JL_CLONE_MATH; } } for (size_t i = 0; i < I.getNumOperands(); i++) { if(I.getOperand(i)->getType()->isHalfTy()) { - flag |= JL_TARGET_CLONE_FLOAT16; + flag |= JL_CLONE_FLOAT16; } if(I.getOperand(i)->getType()->isBFloatTy()) { - flag |= JL_TARGET_CLONE_BFLOAT16; + flag |= JL_CLONE_BFLOAT16; } } - uint32_t veccall_flags = JL_TARGET_CLONE_SIMD | JL_TARGET_CLONE_MATH | JL_TARGET_CLONE_CPU | JL_TARGET_CLONE_FLOAT16 | JL_TARGET_CLONE_BFLOAT16; - if (has_veccall && (flag & veccall_flags) == veccall_flags) { + constexpr uint32_t all_flags = JL_CLONE_SIMD | JL_CLONE_MATH | JL_CLONE_CPU | JL_CLONE_FLOAT16 | JL_CLONE_BFLOAT16; + if (has_veccall && (flag & all_flags) == all_flags) { return flag; } } @@ -148,7 +153,20 @@ struct TargetSpec { std::string cpu_name; std::string cpu_features; uint32_t base; - uint32_t flags; + bool clone_all = false; + bool opt_size = false; + bool min_size = false; + tp::FeatureDiff diff; + + // Which per-function categories to clone for this target + uint32_t clone_flags() const { + uint32_t mask = JL_CLONE_LOOP | JL_CLONE_CPU; + if (diff.has_new_math) mask |= JL_CLONE_MATH; + if (diff.has_new_simd) mask |= JL_CLONE_SIMD; + if (diff.has_new_float16) mask |= JL_CLONE_FLOAT16; + if (diff.has_new_bfloat16) mask |= JL_CLONE_BFLOAT16; + return mask; + } TargetSpec() = default; @@ -157,17 +175,43 @@ struct TargetSpec { out.cpu_name = spec.cpu_name; out.cpu_features = spec.cpu_features; 
out.base = spec.base; - out.flags = spec.flags; + out.clone_all = spec.clone_all; + out.opt_size = spec.opt_size; + out.min_size = spec.min_size; + out.diff = spec.diff; return out; } + // Pack/unpack for LLVM metadata serialization + uint32_t packed_flags() const { + uint32_t f = 0; + if (clone_all) f |= 1 << 0; + if (opt_size) f |= 1 << 1; + if (min_size) f |= 1 << 2; + if (diff.has_new_math) f |= 1 << 3; + if (diff.has_new_simd) f |= 1 << 4; + if (diff.has_new_float16) f |= 1 << 5; + if (diff.has_new_bfloat16) f |= 1 << 6; + return f; + } + + void unpack_flags(uint32_t f) { + clone_all = f & (1 << 0); + opt_size = f & (1 << 1); + min_size = f & (1 << 2); + diff.has_new_math = f & (1 << 3); + diff.has_new_simd = f & (1 << 4); + diff.has_new_float16 = f & (1 << 5); + diff.has_new_bfloat16 = f & (1 << 6); + } + static TargetSpec fromMD(MDTuple *tup) { TargetSpec out; assert(tup->getNumOperands() == 4); out.cpu_name = cast(tup->getOperand(0))->getString().str(); out.cpu_features = cast(tup->getOperand(1))->getString().str(); out.base = cast(cast(tup->getOperand(2))->getValue())->getZExtValue(); - out.flags = cast(cast(tup->getOperand(3))->getValue())->getZExtValue(); + out.unpack_flags(cast(cast(tup->getOperand(3))->getValue())->getZExtValue()); return out; } @@ -176,7 +220,7 @@ struct TargetSpec { MDString::get(ctx, cpu_name), MDString::get(ctx, cpu_features), ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(ctx), base)), - ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(ctx), flags)) + ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(ctx), packed_flags())) }); } }; @@ -216,12 +260,14 @@ static void annotate_module_clones(Module &M) { if (auto maybe_specs = get_target_specs(M)) { specs = std::move(*maybe_specs); } else { - auto full_specs = jl_get_llvm_clone_targets(jl_options.cpu_target); - specs.reserve(full_specs.size()); - for (auto &spec: full_specs) { +#ifndef __clang_analyzer__ + auto full = 
jl_get_llvm_clone_targets(jl_options.cpu_target); + specs.reserve(full.specs.size()); + for (auto &spec: full.specs) { specs.push_back(TargetSpec::fromSpec(spec)); } set_target_specs(M, specs); +#endif } SmallVector clones(orig_funcs.size(), APInt(specs.size(), 0)); BitVector subtarget_cloned(orig_funcs.size()); @@ -231,12 +277,12 @@ static void annotate_module_clones(Module &M) { func_infos[i] = collect_func_info(*orig_funcs[i], TT, has_veccall); } for (unsigned i = 1; i < specs.size(); i++) { - if (specs[i].flags & JL_TARGET_CLONE_ALL) { + if (specs[i].clone_all) { for (unsigned j = 0; j < orig_funcs.size(); j++) { clones[j].setBit(i); } } else { - unsigned flag = specs[i].flags & clone_mask; + unsigned flag = specs[i].clone_flags(); std::set sets[2]; for (unsigned j = 0; j < orig_funcs.size(); j++) { if (!(func_infos[j] & flag)) { @@ -455,7 +501,7 @@ CloneCtx::CloneCtx(Module &M, bool allow_bad_fvars) uint32_t ntargets = specs.size(); for (uint32_t i = 1; i < ntargets; i++) { auto &spec = specs[i]; - if (spec.flags & JL_TARGET_CLONE_ALL) { + if (spec.clone_all) { group_ids[i] = groups.size(); groups.emplace_back(i); } @@ -586,7 +632,7 @@ void CloneCtx::clone_decls() new_F->setVisibility(F->getVisibility()); new_F->setDSOLocal(true); auto base_func = F; - if (!(specs[i].flags & JL_TARGET_CLONE_ALL)) + if (!(specs[i].clone_all)) base_func = static_cast(linearized[specs[i].base])->base_func(F); (*linearized[i]->vmap)[base_func] = new_F; } @@ -619,10 +665,10 @@ static void add_features(Function *F, TargetSpec &spec) } F->addFnAttr("target-cpu", spec.cpu_name); if (!F->hasFnAttribute(Attribute::OptimizeNone)) { - if (spec.flags & JL_TARGET_OPTSIZE) { + if (spec.opt_size) { F->addFnAttr(Attribute::OptimizeForSize); } - else if (spec.flags & JL_TARGET_MINSIZE) { + else if (spec.min_size) { F->addFnAttr(Attribute::MinSize); } } @@ -1012,7 +1058,7 @@ void CloneCtx::emit_metadata() uint32_t len_idx = idxs.size(); idxs.push_back(0); // We will fill in the real value later. 
uint32_t count = 0; - if (i == 0 || spec.flags & JL_TARGET_CLONE_ALL) { + if (i == 0 || spec.clone_all) { auto grp = static_cast(tgt); count = jl_sysimg_tag_mask; for (uint32_t j = 0; j < nfvars; j++) { diff --git a/src/processor.cpp b/src/processor.cpp index 1a25171082d82..fa15cbe6449f7 100644 --- a/src/processor.cpp +++ b/src/processor.cpp @@ -1,15 +1,14 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license -// Processor feature detection - -#include "llvm-version.h" -#include -#include -#include -#include -#include -#include -#include +// Processor feature detection and dispatch using the cpufeatures library. +// CPU/feature tables are generated from LLVM's TableGen data and committed +// to https://github.com/gbaraldi/cpufeatures +// +// On LLVM version bump: +// 1. cd cpufeatures && make -f Makefile.generate LLVM_VER= +// 2. Review and commit regenerated generated/ headers +// 3. Update Julia's deps/cpufeatures.version with the new commit hash +// 4. The static_assert below will catch major version mismatches #include "processor.h" @@ -17,6 +16,8 @@ #include "julia_internal.h" #include +#include +#include #include "julia_assert.h" @@ -24,617 +25,16 @@ #include #endif -// CPU target string is a list of strings separated by `;` each string starts with a CPU -// or architecture name and followed by an optional list of features separated by `,`. -// A "generic" or empty CPU name means the basic required feature set of the target ISA -// which is at least the architecture the C/C++ runtime is compiled with. - -// CPU dispatch needs to determine the version to be used by the sysimg as well as -// the target and feature used by the JIT. Currently the only limitation on JIT target -// and feature is matching register size between the sysimg and JIT so that SIMD vectors -// can be passed correctly. This means disabling AVX and AVX2 if AVX was not enabled -// in sysimg and disabling AVX512 if it was not enabled in sysimg. 
-// This also possibly means that SVE needs to be disabled on AArch64 if sysimg doesn't have it -// enabled. - -// CPU dispatch starts by first deciding the max feature set and CPU requested for JIT. -// This is the host or the target specified on the command line with features unavailable -// on the host disabled. All sysimg targets that require features not available in this set -// will be ignored. - -// The next step is matching CPU name. -// If exact name match with compatible feature set exists, all versions without name match -// are ignored. -// This step will query LLVM first so it can accept CPU names that is recognized by LLVM but -// not by us (yet) when LLVM is enabled. - -// If there are still more than one candidates, a feature match is performed. -// The ones with the largest register size will be used -// (i.e. AVX512 > AVX2/AVX > SSE, SVE > ASIMD). If there's a tie, the one with the most features -// enabled will be used. If there's still a tie the one that appears later in the list will be -// used. (i.e. the order in the version list is significant in this case). - -// Features that are not recognized will be passed to LLVM directly during codegen -// but ignored otherwise. - -// A few special features are supported: -// 1. `clone_all` -// -// This forces the target to have all functions in sysimg cloned. -// When used in negative form (i.e. `-clone_all`), this disables full clone that's -// enabled by default for certain targets. -// -// 2. `base([0-9]*)` -// -// This specifies the (0-based) base target index. The base target is the target -// that the current target is based on, i.e. the functions that are not being cloned -// will use the version in the base target. This option causes the base target to be -// fully cloned (as if `clone_all` is specified for it) if it is not the default target (0). -// The index can only be smaller than the current index. -// -// 3. `opt_size` -// -// Optimize for size with minimum performance impact. 
Clang/GCC's `-Os`. -// -// 4. `min_size` -// -// Optimize only for size. Clang's `-Oz`. -JL_DLLEXPORT bool jl_processor_print_help = false; +// Forward declarations for sysimage CPU target storage +static std::string sysimage_cpu_target; +void jl_set_sysimage_cpu_target(const char *cpu_target); namespace { -// Helper functions to test/set feature bits - -template -static inline bool test_bits(T1 v, T2 mask, T3 test) -{ - return T3(v & mask) == test; -} - -template -static inline bool test_all_bits(T1 v, T2 mask) -{ - return test_bits(v, mask, mask); -} - -template -static inline bool test_nbit(const T1 &bits, T2 _bitidx) -{ - auto bitidx = static_cast(_bitidx); - auto u32idx = bitidx / 32; - auto bit = bitidx % 32; - return (bits[u32idx] & (1 << bit)) != 0; -} - -template -static inline void unset_bits(T &bits) JL_NOTSAFEPOINT -{ - (void)bits; -} - -template -static inline void unset_bits(T &bits, T1 _bitidx, Rest... rest) JL_NOTSAFEPOINT -{ - auto bitidx = static_cast(_bitidx); - auto u32idx = bitidx / 32; - auto bit = bitidx % 32; - bits[u32idx] = bits[u32idx] & ~uint32_t(1 << bit); - unset_bits(bits, rest...); -} - -template -static inline void set_bit(T &bits, T1 _bitidx, bool val) -{ - auto bitidx = static_cast(_bitidx); - auto u32idx = bitidx / 32; - auto bit = bitidx % 32; - if (val) { - bits[u32idx] = bits[u32idx] | uint32_t(1 << bit); - } - else { - bits[u32idx] = bits[u32idx] & ~uint32_t(1 << bit); - } -} - -// Helper functions to create feature masks - -// This can be `std::array` on C++14 -template -struct FeatureList { - uint32_t eles[n]; - uint32_t &operator[](size_t pos) JL_NOTSAFEPOINT - { - return eles[pos]; - } - constexpr const uint32_t &operator[](size_t pos) const - { - return eles[pos]; - } - inline int nbits() const - { - int cnt = 0; - for (size_t i = 0; i < n; i++) - cnt += llvm::popcount(eles[i]); - return cnt; - } - inline bool empty() const - { - for (size_t i = 0; i < n; i++) { - if (eles[i]) { - return false; - } - } - return true; - } 
-}; - -static inline constexpr uint32_t add_feature_mask_u32(uint32_t mask, uint32_t u32idx) -{ - return mask; -} - -template -static inline constexpr uint32_t add_feature_mask_u32(uint32_t mask, uint32_t u32idx, - T bit, Rest... args) -{ - return add_feature_mask_u32(mask | ((int(bit) >= 0 && int(bit) / 32 == (int)u32idx) ? - (1 << (int(bit) % 32)) : 0), - u32idx, args...); -} - -template -static inline constexpr uint32_t get_feature_mask_u32(uint32_t u32idx, Args... args) -{ - return add_feature_mask_u32(uint32_t(0), u32idx, args...); -} - -template struct seq{}; -template -struct gen_seq : gen_seq{}; -template -struct gen_seq<0, Is...> : seq{}; - -template -static inline constexpr FeatureList -_get_feature_mask(seq, Args... args) -{ - return FeatureList{{get_feature_mask_u32(I, args...)...}}; -} - -template -static inline constexpr FeatureList get_feature_masks(Args... args) -{ - return _get_feature_mask(gen_seq(), args...); -} - -template -static inline constexpr FeatureList -_feature_mask_or(seq, const FeatureList &a, const FeatureList &b) -{ - return FeatureList{{(a[I] | b[I])...}}; -} - -template -static inline constexpr FeatureList operator|(const FeatureList &a, const FeatureList &b) -{ - return _feature_mask_or(gen_seq(), a, b); -} - -template -static inline constexpr FeatureList -_feature_mask_and(seq, const FeatureList &a, const FeatureList &b) -{ - return FeatureList{{(a[I] & b[I])...}}; -} - -template -static inline constexpr FeatureList operator&(const FeatureList &a, const FeatureList &b) -{ - return _feature_mask_and(gen_seq(), a, b); -} - -template -static inline constexpr FeatureList -_feature_mask_not(seq, const FeatureList &a) -{ - return FeatureList{{(~a[I])...}}; -} - -template -static inline constexpr FeatureList operator~(const FeatureList &a) -{ - return _feature_mask_not(gen_seq(), a); -} - -template -static inline void mask_features(const FeatureList masks, uint32_t *features) -{ - for (size_t i = 0; i < n; i++) { - features[i] = 
features[i] & masks[i]; - } -} - -// Turn feature list to a string the LLVM accept -static inline std::string join_feature_strs(const llvm::ArrayRef &strs) -{ - size_t nstr = strs.size(); - if (!nstr) - return std::string(""); - std::string str = strs[0]; - for (size_t i = 1; i < nstr; i++) - str += ',' + strs[i]; - return str; -} - -static inline void append_ext_features(std::string &features, const std::string &ext_features) -{ - if (ext_features.empty()) - return; - if (!features.empty()) - features.push_back(','); - features.append(ext_features); -} - -static inline void append_ext_features(llvm::SmallVectorImpl &features, - const std::string &ext_features) -{ - if (ext_features.empty()) - return; - const char *start = ext_features.c_str(); - const char *p = start; - for (; *p; p++) { - if (*p == ',') { - features.emplace_back(start, p - start); - start = p + 1; - } - } - if (p > start) { - features.emplace_back(start, p - start); - } -} - -/** - * Target specific type/constant definitions, always enable. - */ - -template -struct CPUSpec { - const char *name; - CPU cpu; - CPU fallback; - uint32_t llvmver; - FeatureList features; -}; - -struct FeatureDep { - uint32_t feature; - uint32_t dep; -}; - -// Recursively enable all features that the current feature set depends on. -template -static inline void enable_depends(FeatureList &features, const FeatureDep *deps, size_t ndeps) -{ - bool changed = true; - while (changed) { - changed = false; - for (ssize_t i = ndeps - 1; i >= 0; i--) { - auto &dep = deps[i]; - if (!test_nbit(features, dep.feature) || test_nbit(features, dep.dep)) - continue; - set_bit(features, dep.dep, true); - changed = true; - } - } -} - -// Recursively disable all features that the current feature set does not provide. 
-template -static inline void disable_depends(FeatureList &features, const FeatureDep *deps, size_t ndeps) -{ - bool changed = true; - while (changed) { - changed = false; - for (ssize_t i = ndeps - 1; i >= 0; i--) { - auto &dep = deps[i]; - if (!test_nbit(features, dep.feature) || test_nbit(features, dep.dep)) - continue; - unset_bits(features, dep.feature); - changed = true; - } - } -} - -template -static const CPUSpec *find_cpu(uint32_t cpu, const CPUSpec *cpus, uint32_t ncpus) -{ - for (uint32_t i = 0; i < ncpus; i++) { - if (cpu == uint32_t(cpus[i].cpu)) { - return &cpus[i]; - } - } - return nullptr; -} - -template -static const CPUSpec *find_cpu(llvm::StringRef name, const CPUSpec *cpus, - uint32_t ncpus) -{ - for (uint32_t i = 0; i < ncpus; i++) { - if (name == cpus[i].name) { - return &cpus[i]; - } - } - return nullptr; -} - -template -static const char *find_cpu_name(uint32_t cpu, const CPUSpec *cpus, uint32_t ncpus) -{ - if (auto *spec = find_cpu(cpu, cpus, ncpus)) - return spec->name; - return "generic"; -} - -JL_UNUSED static uint32_t find_feature_bit(const FeatureName *features, size_t nfeatures, - const char *str, size_t len) -{ - for (size_t i = 0; i < nfeatures; i++) { - auto &feature = features[i]; - if (strncmp(feature.name, str, len) == 0 && feature.name[len] == 0) { - return feature.bit; - } - } - return UINT32_MAX; -} - -// This is how we save the target identification. -// CPU name is saved as string instead of binary data like features because -// 1. CPU ID is less stable (they are not bound to hardware/OS API) -// 2. We need to support CPU names that are not recognized by us and therefore doesn't have an ID -// 3. 
CPU name is trivial to parse -static inline llvm::SmallVector -serialize_target_data(llvm::StringRef name, uint32_t nfeature, const uint32_t *features_en, - const uint32_t *features_dis, llvm::StringRef ext_features) -{ - llvm::SmallVector res; - auto add_data = [&] (const void *data, size_t sz) { - if (sz == 0) - return; - size_t old_sz = res.size(); - res.resize(old_sz + sz); - memcpy(&res[old_sz], data, sz); - }; - add_data(&nfeature, 4); - add_data(features_en, 4 * nfeature); - add_data(features_dis, 4 * nfeature); - uint32_t namelen = name.size(); - add_data(&namelen, 4); - add_data(name.data(), namelen); - uint32_t ext_features_len = ext_features.size(); - add_data(&ext_features_len, 4); - add_data(ext_features.data(), ext_features_len); - return res; -} - -template -static inline llvm::SmallVector -serialize_target_data(llvm::StringRef name, const FeatureList &features_en, - const FeatureList &features_dis, llvm::StringRef ext_features) -{ - return serialize_target_data(name, n, &features_en[0], &features_dis[0], ext_features); -} - -template -struct TargetData { - std::string name; - std::string ext_features; - struct { - FeatureList features; - uint32_t flags; - } en, dis; - int base; -}; - -// In addition to the serialized data, the first `uint32_t` gives the number of targets saved -// and each target has a `uint32_t` flag before the serialized target data. 
-template -static inline llvm::SmallVector, 0> deserialize_target_data(const uint8_t *data) -{ - auto load_data = [&] (void *dest, size_t sz) { - memcpy(dest, data, sz); - data += sz; - }; - auto load_string = [&] () { - uint32_t len; - load_data(&len, 4); - std::string res((const char*)data, len); - data += len; - return res; - }; - uint32_t ntarget; - load_data(&ntarget, 4); - llvm::SmallVector, 0> res(ntarget); - for (uint32_t i = 0; i < ntarget; i++) { - auto &target = res[i]; - load_data(&target.en.flags, 4); - target.dis.flags = 0; - // Starting serialized target data - uint32_t nfeature; - load_data(&nfeature, 4); - assert(nfeature == n); - load_data(&target.en.features[0], 4 * n); - load_data(&target.dis.features[0], 4 * n); - target.name = load_string(); - target.ext_features = load_string(); - target.base = 0; - } - return res; -} - -// Try getting clone base argument. Return 1-based index. Return 0 if match failed. -static inline int get_clone_base(const char *start, const char *end) -{ - const char *prefix = "base("; - const int prefix_len = strlen(prefix); - if (end - start <= prefix_len) - return 0; - if (memcmp(start, prefix, prefix_len) != 0) - return 0; - start += prefix_len; - if (*start > '9' || *start < '0') - return 0; - char *digit_end; - auto idx = strtol(start, &digit_end, 10); - if (idx < 0) - return 0; - if (*digit_end != ')' || digit_end + 1 != end) - return 0; - return (int)idx + 1; -} - -// Parse cmdline string. This handles `clone_all` and `base` special features. -// Other feature names will be passed to `feature_cb` for target dependent parsing. 
-template -static inline llvm::SmallVector, 0> -parse_cmdline(const char *option, F &&feature_cb) -{ - if (!option) - abort(); - - // Preprocess the option string to expand "sysimage" keyword - std::string processed_option; - if (strncmp(option, "sysimage", 8) == 0 && (option[8] == '\0' || option[8] == ';')) { - // Replace "sysimage" with the actual sysimage CPU target - jl_value_t *target_str = jl_get_sysimage_cpu_target(); - if (target_str != nullptr) { - processed_option = std::string(jl_string_data(target_str), jl_string_len(target_str)); - if (option[8] == ';') { - processed_option += option + 8; // append the rest after "sysimage" - } - option = processed_option.c_str(); - } - } - - llvm::SmallVector, 0> res; - TargetData arg{}; - auto reset_arg = [&] { - res.push_back(arg); - arg.name.clear(); - arg.ext_features.clear(); - memset(&arg.en.features[0], 0, 4 * n); - memset(&arg.dis.features[0], 0, 4 * n); - arg.en.flags = 0; - arg.dis.flags = 0; - }; - const char *start = option; - for (const char *p = option; ; p++) { - switch (*p) { - case ',': - case ';': - case '\0': { - bool done = *p == '\0'; - bool next_target = *p == ';' || done; - if (arg.name.empty()) { - if (p == start) - jl_error("Invalid target option: empty CPU name"); - arg.name.append(start, p - start); - if (arg.name == "help") { - arg.name = "native"; - jl_processor_print_help = true; - } - start = p + 1; - if (next_target) - reset_arg(); - if (done) - return res; - continue; - } - bool disable = false; - const char *full = start; - const char *fname = full; - start = p + 1; - if (*full == '-') { - disable = true; - fname++; - } - else if (*full == '+') { - fname++; - } - if (llvm::StringRef(fname, p - fname) == "clone_all") { - if (!disable) { - arg.en.flags |= JL_TARGET_CLONE_ALL; - arg.dis.flags &= ~JL_TARGET_CLONE_ALL; - } - else { - arg.dis.flags |= JL_TARGET_CLONE_ALL; - arg.en.flags &= ~JL_TARGET_CLONE_ALL; - } - } - else if (llvm::StringRef(fname, p - fname) == "opt_size") { - if 
(disable) - jl_error("Invalid target option: disabled opt_size."); - if (arg.en.flags & JL_TARGET_MINSIZE) - jl_error("Conflicting target option: both opt_size and min_size are specified."); - arg.en.flags |= JL_TARGET_OPTSIZE; - } - else if (llvm::StringRef(fname, p - fname) == "min_size") { - if (disable) - jl_error("Invalid target option: disabled min_size."); - if (arg.en.flags & JL_TARGET_OPTSIZE) - jl_error("Conflicting target option: both opt_size and min_size are specified."); - arg.en.flags |= JL_TARGET_MINSIZE; - } - else if (int base = get_clone_base(fname, p)) { - if (disable) - jl_error("Invalid target option: disabled base index."); - base -= 1; - if (base >= (int)res.size()) - jl_error("Invalid target option: base index must refer to a previous target."); - if (res[base].dis.flags & JL_TARGET_CLONE_ALL || - !(res[base].en.flags & JL_TARGET_CLONE_ALL)) - jl_error("Invalid target option: base target must be clone_all."); - arg.base = base; - } - else if (llvm::StringRef(fname, p - fname) == "help") { - jl_processor_print_help = true; - } - else { - FeatureList &list = disable ? arg.dis.features : arg.en.features; - if (!feature_cb(fname, p - fname, list)) { - if (!arg.ext_features.empty()) - arg.ext_features += ','; - arg.ext_features += disable ? '-' : '+'; - arg.ext_features.append(fname, p - fname); - } - } - if (next_target) - reset_arg(); - if (done) { - return res; - } - } - JL_FALLTHROUGH; - default: - continue; - } - } -} - -// Cached version of command line parsing -template -static inline llvm::SmallVector, 0> &get_cmdline_targets(const char *cpu_target, F &&feature_cb) -{ - static llvm::SmallVector, 0> targets = - parse_cmdline(cpu_target, std::forward(feature_cb)); - return targets; -} - -// Load sysimg, use the `callback` for dispatch and perform all relocations -// for the selected target. 
+// Load sysimg/pkgimg, use the callback for dispatch and perform all relocations template -static inline jl_image_t parse_sysimg(jl_image_buf_t image, F &&callback, void *ctx) +static inline jl_image_t load_sysimg_target(jl_image_buf_t image, F &&callback, void *ctx) { JL_TIMING(LOAD_IMAGE, LOAD_Processor); jl_image_t res{}; @@ -662,10 +62,10 @@ static inline jl_image_t parse_sysimg(jl_image_buf_t image, F &&callback, void * jl_error("Image file is not compatible with this version of Julia"); } - llvm::SmallVector fvars(pointers->header->nfvars); - llvm::SmallVector gvars(pointers->header->ngvars); + std::vector fvars(pointers->header->nfvars); + std::vector gvars(pointers->header->ngvars); - llvm::SmallVector, 0> clones; + std::vector> clones; for (unsigned i = 0; i < pointers->header->nshards; i++) { auto shard = pointers->shards[i]; @@ -682,7 +82,7 @@ static inline jl_image_t parse_sysimg(jl_image_buf_t image, F &&callback, void * clone_idxs++; assert(tag_len & jl_sysimg_tag_mask); - llvm::SmallVector base_ptrs(0); + std::vector base_ptrs(0); base_ptrs.push_back(fvar_shard); // Find target for (uint32_t i = 0; i < target_idx; i++) { @@ -826,188 +226,656 @@ static inline jl_image_t parse_sysimg(jl_image_buf_t image, F &&callback, void * return res; } -template -static inline void check_cmdline(T &&cmdline, bool imaging) +} // namespace + + +// This file is a part of Julia. License is MIT: https://julialang.org/license + +// Unified processor detection and dispatch using the cpufeatures library. +// Replaces processor_x86.cpp, processor_arm.cpp, and processor_fallback.cpp. +// No hand-maintained CPU/feature tables — all data comes from LLVM TableGen +// via generated headers committed to the cpufeatures repository. + +// Include cpufeatures generated tables (defines FeatureBits, feature_table, etc.) 
+#if defined(_CPU_X86_64_) || defined(_CPU_X86_) +#include +#elif defined(_CPU_AARCH64_) +#include +#elif defined(__riscv) && __riscv_xlen == 64 +#include +#else +#include +#endif + +#include +#include + +// Verify the cpufeatures tables were generated from a compatible LLVM version. +#if defined(TARGET_TABLES_LLVM_VERSION_MAJOR) && defined(LLVM_VERSION_MAJOR) +static_assert(TARGET_TABLES_LLVM_VERSION_MAJOR == LLVM_VERSION_MAJOR, + "cpufeatures tables were generated with a different LLVM major version than Julia uses"); +#endif + +// ============================================================================ +// Debug output +// ============================================================================ + +static bool cpufeatures_debug_enabled() { + static int enabled = -1; + if (enabled == -1) { + const char *debug_env = getenv("JULIA_DEBUG"); + enabled = debug_env && (strstr(debug_env, "cpufeatures") || strstr(debug_env, "all")); + } + return enabled; +} + +#define CF_DEBUG(...) do { if (cpufeatures_debug_enabled()) jl_safe_printf(__VA_ARGS__); } while (0) + +// ============================================================================ +// Convert feature bits to a comma-separated string of feature names. +// Called from Julia's loading.jl to display ImageTarget features. +JL_DLLEXPORT jl_value_t *jl_feature_bits_to_string(const uint8_t *bits, int32_t nwords) { - assert(cmdline.size() > 0); - // It's unclear what does specifying multiple target when not generating - // sysimg means. Make it an error for now. 
- if (!imaging) { - if (cmdline.size() > 1) { - jl_safe_printf("More than one command line CPU targets specified " - "without a `--output-` flag specified"); - exit(1); - } - if (cmdline[0].en.flags & JL_TARGET_CLONE_ALL) { - jl_safe_printf("\"clone_all\" feature specified " - "without a `--output-` flag specified"); - exit(1); - } - if (cmdline[0].en.flags & JL_TARGET_OPTSIZE) { - jl_safe_printf("\"opt_size\" feature specified " - "without a `--output-` flag specified"); - exit(1); - } - if (cmdline[0].en.flags & JL_TARGET_MINSIZE) { - jl_safe_printf("\"min_size\" feature specified " - "without a `--output-` flag specified"); - exit(1); + FeatureBits fb{}; + int copy_words = nwords < TARGET_FEATURE_WORDS ? nwords : TARGET_FEATURE_WORDS; + memcpy(fb.bits, bits, copy_words * sizeof(uint64_t)); + auto str = tp::build_feature_string(fb); + return jl_pchar_to_string(str.data(), str.size()); +} + +// ============================================================================ +// Host CPU detection — thin wrappers around cpufeatures library +// ============================================================================ + +static inline const std::string &host_cpu_name() +{ + return tp::get_host_cpu_name(); +} + +static std::string get_host_feature_string() +{ + auto fb = tp::get_host_features(); + return tp::build_feature_string(fb); +} + +// ============================================================================ +// JIT target management +// ============================================================================ + +static std::vector jit_targets; + +// If cpu_target starts with "sysimage", replace it with the target string +// stored in the loaded sysimage. Otherwise return as-is. 
+extern "C" std::string jl_expand_sysimage_keyword(const char *cpu_target) { + if (!cpu_target || !*cpu_target) + return ""; + std::string option(cpu_target); + if (option.substr(0, 8) == "sysimage" && (option.size() == 8 || option[8] == ';')) { + jl_value_t *target_str = jl_get_sysimage_cpu_target(); + if (target_str && jl_string_len(target_str) > 0) { + std::string expanded(jl_string_data(target_str), jl_string_len(target_str)); + if (option.size() > 8) + expanded += option.substr(8); + CF_DEBUG("[cpufeatures] expanded 'sysimage' -> '%s'\n", expanded.c_str()); + return expanded; } + CF_DEBUG("[cpufeatures] WARNING: 'sysimage' keyword but no stored target, using 'native'\n"); + return "native"; + } + return option; +} + +static void init_jit_targets(const char *cpu_target, bool imaging) +{ + + if (!jit_targets.empty()) + return; + + auto target_str = jl_expand_sysimage_keyword(cpu_target); + CF_DEBUG("[cpufeatures] init_jit_targets: '%s' imaging=%d\n", + target_str.c_str(), imaging); + + if (target_str.empty()) + jl_error("Invalid target option: empty CPU name"); + + auto specs = tp::resolve_targets_for_llvm(target_str); + + if (specs.empty()) + jl_error("No targets specified"); + + for (auto &s : specs) { + CF_DEBUG("[cpufeatures] target: name='%s' base=%d features=%s\n", + s.cpu_name.c_str(), s.base, s.cpu_features.c_str()); + jit_targets.push_back(std::move(s)); } } -struct SysimgMatch { - uint32_t best_idx{UINT32_MAX}; - int vreg_size{0}; -}; +// ============================================================================ +// Sysimage / pkgimage target matching +// ============================================================================ -// Find the best match in the sysimg. -// Select the best one based on the largest vector register and largest compatible feature set. 
-template -static inline SysimgMatch match_sysimg_targets(S &&sysimg, T &&target, F &&max_vector_size, jl_value_t **rejection_reason) +// Shared: deserialize image targets, match against a resolved target. +// Returns {target_index, vreg_size} or {UINT32_MAX, 0} on failure. +static std::pair match_image_targets( + const void *id, const tp::LLVMTargetSpec &target, jl_value_t **rejection_reason) { - SysimgMatch match; - bool match_name = false; - int feature_size = 0; - llvm::SmallVector rejection_reasons; - rejection_reasons.reserve(sysimg.size()); - for (uint32_t i = 0; i < sysimg.size(); i++) { - auto &imgt = sysimg[i]; - if (!(imgt.en.features & target.dis.features).empty()) { - // Check sysimg enabled features against runtime disabled features - // This is valid (and all what we can do) - // even if one or both of the targets are unknown. - rejection_reasons.push_back("Rejecting this target due to use of runtime-disabled features\n"); - continue; - } - if (imgt.name == target.name) { - if (!match_name) { - match_name = true; - match.vreg_size = 0; - feature_size = 0; - } - } - else if (match_name) { - rejection_reasons.push_back("Rejecting this target since another target has a cpu name match\n"); - continue; - } - int new_vsz = max_vector_size(imgt.en.features); - if (match.vreg_size > new_vsz) { - rejection_reasons.push_back("Rejecting this target since another target has a larger vector register size\n"); - continue; - } - int new_feature_size = imgt.en.features.nbits(); - if (match.vreg_size < new_vsz) { - match.best_idx = i; - match.vreg_size = new_vsz; - feature_size = new_feature_size; - rejection_reasons.push_back("Updating best match to this target due to larger vector register size\n"); - continue; + auto image_targets = tp::deserialize_targets((const uint8_t *)id); + CF_DEBUG("[cpufeatures] image has %zu target(s)\n", image_targets.size()); + + auto match = tp::match_targets(image_targets, target); + if (match.best_idx < 0) { + CF_DEBUG("[cpufeatures] 
NO compatible target found!\n"); + if (rejection_reason) { + std::string msg = "Unable to find compatible target in cached code image."; + *rejection_reason = jl_pchar_to_string(msg.data(), msg.size()); } - if (new_feature_size < feature_size) { - rejection_reasons.push_back("Rejecting this target since another target has a larger feature set\n"); - continue; + return {UINT32_MAX, 0}; + } + + CF_DEBUG("[cpufeatures] selected target %d '%s' (vreg_size=%d)\n", + match.best_idx, image_targets[match.best_idx].cpu_name.c_str(), match.vreg_size); + return {(uint32_t)match.best_idx, match.vreg_size}; +} + +static uint32_t match_sysimg_target(void *ctx, const void *id, jl_value_t **rejection_reason) +{ + const char *cpu_target = (const char *)ctx; + CF_DEBUG("[cpufeatures] match_sysimg_target: cpu_target='%s'\n", + cpu_target ? cpu_target : "(null)"); + + // For multi-target strings (sysimage building), use only the first + // target for matching against the image being loaded. + auto target_str = jl_expand_sysimage_keyword(cpu_target); + auto semi = target_str.find(';'); + auto first = semi != std::string::npos ? target_str.substr(0, semi) : target_str; + auto host_specs = tp::resolve_targets_for_llvm(first); + if (host_specs.empty()) + jl_error("No targets specified"); + + auto &target = host_specs[0]; + CF_DEBUG("[cpufeatures] JIT target: name='%s'\n", target.cpu_name.c_str()); + +#if defined(_CPU_X86_64_) + // CX16 check: only error if sysimage requires it and host doesn't have it + { + auto sysimg_peek = tp::deserialize_targets((const uint8_t *)id); + bool sysimg_allows_no_cx16 = false; + for (auto &t : sysimg_peek) + sysimg_allows_no_cx16 |= !tp::has_feature(t.en_features, "cx16"); + if (!sysimg_allows_no_cx16 && !tp::has_feature(target.en_features, "cx16")) { + jl_error("Your CPU does not support the CX16 instruction, which is required " + "by this version of Julia! This is often due to running inside of a " + "virtualized environment. 
Please read " + "https://docs.julialang.org/en/v1/devdocs/sysimg/ for more."); } - match.best_idx = i; - feature_size = new_feature_size; - rejection_reasons.push_back("Updating best match to this target\n"); } - if (match.best_idx == UINT32_MAX) { - // Construct a nice error message for debugging purposes - std::string error_msg = "Unable to find compatible target in cached code image.\n"; - for (size_t i = 0; i < rejection_reasons.size(); i++) { - error_msg += "Target "; - error_msg += std::to_string(i); - error_msg += " ("; - error_msg += sysimg[i].name; - error_msg += "): "; - error_msg += rejection_reasons[i]; +#endif + + // Match against image targets + auto match_result = match_image_targets(id, target, rejection_reason); + if (match_result.first == UINT32_MAX) + return UINT32_MAX; + + // Clamp JIT vector features to match the sysimage target's vector width. + // On x86, AVX/AVX-512 change how VecElement tuples are passed in registers + // (FixedVectorType maps to xmm/ymm/zmm), so the JIT must not use wider + // vectors than the sysimage clone it calls into. + // TODO: aarch64 SVE uses scalable vectors which Julia doesn't generate + // (only FixedVectorType/NEON), so SVE clamping is not needed for ABI + // correctness. RISC-V V is similar. Revisit if Julia adds scalable vector + // support. 
+ int matched_vreg = match_result.second; + int host_vreg = tp::max_vector_size(target.en_features); +#if defined(_CPU_X86_64_) || defined(_CPU_X86_) + if (matched_vreg != host_vreg) { + if (matched_vreg < 64) { + static const char *avx512[] = { + "avx512f", "avx512dq", "avx512ifma", "avx512cd", + "avx512bw", "avx512vl", "avx512vbmi", "avx512vpopcntdq", + "avx512vbmi2", "avx512vnni", "avx512bitalg", + "avx512vp2intersect", "avx512bf16", "avx512fp16", nullptr + }; + for (const char **f = avx512; *f; f++) { + const FeatureEntry *fe = find_feature(*f); + if (fe) feature_clear(&target.en_features, fe->bit); + } } - if (rejection_reason) - *rejection_reason = jl_pchar_to_string(error_msg.data(), error_msg.size()); + if (matched_vreg < 32) { + static const char *avx[] = { + "avx", "avx2", "fma", "f16c", "fma4", "xop", + "vaes", "vpclmulqdq", nullptr + }; + for (const char **f = avx; *f; f++) { + const FeatureEntry *fe = find_feature(*f); + if (fe) feature_clear(&target.en_features, fe->bit); + } + } + for (int w = 0; w < TARGET_FEATURE_WORDS; w++) + target.dis_features.bits[w] = hw_feature_mask.bits[w] & ~target.en_features.bits[w]; + target.cpu_features = tp::build_llvm_feature_string(target.en_features, target.dis_features); } - return match; +#else + (void)matched_vreg; + (void)host_vreg; +#endif + + jit_targets.push_back(std::move(target)); + return match_result.first; } -// Debug helper +static uint32_t match_pkgimg_target(void *ctx, const void *id, jl_value_t **rejection_reason) +{ + auto &target = jit_targets.front(); + auto result = match_image_targets(id, target, rejection_reason); + return result.first; +} -template -static inline void dump_cpu_spec(uint32_t cpu, const FeatureList &features, - const FeatureName *feature_names, uint32_t nfeature_names, - const CPUSpec *cpus, uint32_t ncpus) +// ============================================================================ +// Exported functions +// 
============================================================================ + +#if defined(_CPU_X86_64_) || defined(_CPU_X86_) + +extern "C" JL_DLLEXPORT void jl_cpuid(int32_t CPUInfo[4], int32_t InfoType) { - bool cpu_found = false; - for (uint32_t i = 0;i < ncpus;i++) { - if (cpu == uint32_t(cpus[i].cpu)) { - cpu_found = true; - jl_safe_printf("CPU: %s\n", cpus[i].name); - break; - } - } - if (!cpu_found) - jl_safe_printf("CPU: generic\n"); + asm volatile ( +#if defined(__i386__) && defined(__PIC__) + "xchg %%ebx, %%esi;" + "cpuid;" + "xchg %%esi, %%ebx;" : + "=S" (CPUInfo[1]), +#else + "cpuid" : + "=b" (CPUInfo[1]), +#endif + "=a" (CPUInfo[0]), + "=c" (CPUInfo[2]), + "=d" (CPUInfo[3]) : + "a" (InfoType) + ); +} + +extern "C" JL_DLLEXPORT void jl_cpuidex(int32_t CPUInfo[4], int32_t InfoType, int32_t subInfoType) +{ + asm volatile ( +#if defined(__i386__) && defined(__PIC__) + "xchg %%ebx, %%esi;" + "cpuid;" + "xchg %%esi, %%ebx;" : + "=S" (CPUInfo[1]), +#else + "cpuid" : + "=b" (CPUInfo[1]), +#endif + "=a" (CPUInfo[0]), + "=c" (CPUInfo[2]), + "=d" (CPUInfo[3]) : + "a" (InfoType), + "c" (subInfoType) + ); +} + +#endif // x86 + +JL_DLLEXPORT void jl_dump_host_cpu(void) +{ + + jl_safe_printf("CPU: %s\n", host_cpu_name().c_str()); jl_safe_printf("Features:"); + auto host_feats = tp::get_host_features(); bool first = true; - for (uint32_t i = 0;i < nfeature_names;i++) { - if (test_nbit(&features[0], feature_names[i].bit)) { + for (uint32_t i = 0; i < num_features; i++) { + if (feature_test(&host_feats, feature_table[i].bit)) { if (first) { - jl_safe_printf(" %s", feature_names[i].name); + jl_safe_printf(" %s", feature_table[i].name); first = false; - } - else { - jl_safe_printf(", %s", feature_names[i].name); + } else { + jl_safe_printf(", %s", feature_table[i].name); } } } jl_safe_printf("\n"); } +JL_DLLEXPORT jl_value_t *jl_check_pkgimage_clones(char *data) +{ + jl_value_t *rejection_reason = NULL; + JL_GC_PUSH1(&rejection_reason); + uint32_t match_idx = 
match_pkgimg_target(NULL, data, &rejection_reason); + JL_GC_POP(); + if (match_idx == UINT32_MAX) + return rejection_reason; + return jl_nothing; } -static std::string jl_get_cpu_name_llvm(void) +JL_DLLEXPORT jl_value_t *jl_cpu_has_fma(int bits) { - return llvm::sys::getHostCPUName().str(); +#if defined(_CPU_X86_64_) || defined(_CPU_X86_) + if ((bits == 32 || bits == 64) && !jit_targets.empty()) { + const auto &feats = jit_targets.front().en_features; + if (tp::has_feature(feats, "fma") || tp::has_feature(feats, "fma4")) + return jl_true; + } +#elif defined(_CPU_AARCH64_) + if (bits == 32 || bits == 64) + return jl_true; +#endif + return jl_false; } -static std::string jl_get_cpu_features_llvm(void) +// Validate cpu_target string before any processing. +// Called from init.c early in startup. +extern "C" JL_DLLEXPORT void jl_check_cpu_target(const char *cpu_target, int imaging) { -#if JL_LLVM_VERSION >= 190000 - auto HostFeatures = llvm::sys::getHostCPUFeatures(); -#else - llvm::StringMap HostFeatures; - llvm::sys::getHostCPUFeatures(HostFeatures); + + if (!cpu_target || !*cpu_target) + return; // NULL/empty handled elsewhere + + auto target_str = jl_expand_sysimage_keyword(cpu_target); + if (target_str.empty()) + return; + + // Handle "help": print available CPU targets and exit + if (target_str == "help" || target_str.find(",help") != std::string::npos) { + tp::print_cpu_targets(); + exit(0); + } + + auto specs = tp::resolve_targets_for_llvm(target_str); + + for (auto &s : specs) { + if (s.flags & tp::TF_UNKNOWN_NAME) { + jl_safe_printf("Unknown cpu target: \"%s\"\n", s.cpu_name.c_str()); + exit(1); + } + } + + if (!imaging) { + if (specs.size() > 1) { + jl_safe_printf("More than one command line CPU targets specified " + "without a `--output-` flag specified"); + exit(1); + } + if (!specs.empty() && (specs[0].flags & tp::TF_CLONE_ALL)) { + jl_safe_printf("\"clone_all\" feature specified " + "without a `--output-` flag specified"); + exit(1); + } + } +} + 
+jl_image_t jl_load_sysimg(jl_image_buf_t image, const char *cpu_target) +{ + + if (!jit_targets.empty()) + jl_error("JIT targets already initialized"); + return load_sysimg_target(image, match_sysimg_target, (void *)cpu_target); +} + +jl_image_t jl_load_pkgimg(jl_image_buf_t image) +{ + if (jit_targets.empty()) + jl_error("JIT targets not initialized"); + if (jit_targets.size() > 1) + jl_error("Expected only one JIT target"); + return load_sysimg_target(image, match_pkgimg_target, NULL); +} + +#ifndef __clang_analyzer__ +std::pair +jl_get_llvm_target(const char *cpu_target, bool imaging) +{ + init_jit_targets(cpu_target, imaging); + auto &spec = jit_targets[0]; + + std::string features = spec.cpu_features; + if (!spec.ext_features.empty()) { + if (!features.empty()) features += ','; + features += spec.ext_features; + } + + return {spec.cpu_name, std::move(features)}; +} #endif - std::string attr; - for (auto &ele: HostFeatures) { - if (ele.getValue()) { - if (!attr.empty()) { - attr.append(",+"); - } - else { - attr.append("+"); + +#ifndef __clang_analyzer__ +const std::pair &jl_get_llvm_disasm_target(void) +{ + // Use generic CPU with all features enabled so the disassembler + // can decode any instruction (including sysimage clones compiled + // for targets beyond the current JIT target). 
+ static const auto res = [] { + std::string features; + for (uint32_t i = 0; i < num_features; i++) { + if (feature_table[i].is_hw) { + if (!features.empty()) features += ','; + features += '+'; + features += feature_table[i].name; } - attr.append(ele.getKey().str()); } + return std::make_pair(std::string("generic"), std::move(features)); + }(); + return res; +} +#endif + +#ifndef __clang_gcanalyzer__ +jl_clone_targets_t jl_get_llvm_clone_targets(const char *cpu_target) +{ + + + auto target_str = jl_expand_sysimage_keyword(cpu_target); + auto specs = tp::resolve_targets_for_llvm(target_str); + + if (specs.empty()) + jl_error("No targets specified"); + + jl_clone_targets_t result; + + // Serialized blob for sysimage embedding + auto blob = tp::serialize_targets(specs); + result.data.assign(blob.begin(), blob.end()); + + // LLVM specs for codegen + for (auto &s : specs) { + jl_target_spec_t ele; + ele.cpu_name = s.cpu_name; + ele.cpu_features = s.cpu_features; + if (!s.ext_features.empty()) { + if (!ele.cpu_features.empty()) ele.cpu_features += ','; + ele.cpu_features += s.ext_features; + } + ele.base = s.base; + ele.clone_all = (s.flags & tp::TF_CLONE_ALL) != 0; + ele.opt_size = (s.flags & tp::TF_OPTSIZE) != 0; + ele.min_size = (s.flags & tp::TF_MINSIZE) != 0; + ele.diff = s.diff; + result.specs.push_back(std::move(ele)); } - return attr; + return result; } +#endif -#if defined(_CPU_X86_) || defined(_CPU_X86_64_) +extern "C" int jl_test_cpu_feature(jl_cpu_feature_t feature) +{ + auto host_feats = tp::get_host_features(); + if (feature >= TARGET_FEATURE_WORDS * 64) + return 0; + return feature_test(&host_feats, feature); +} -#include "processor_x86.cpp" +// ============================================================================ +// Cross-architecture CPU/feature queries +// ============================================================================ -#elif defined(_CPU_AARCH64_) || defined(_CPU_ARM_) +extern "C" JL_DLLEXPORT size_t jl_cpufeatures_nbytes(void) 
+{ + return sizeof(FeatureBits); +} -#include "processor_arm.cpp" +extern "C" JL_DLLEXPORT int jl_cpufeatures_lookup(const char *cpu_name, + uint8_t *features_out, + size_t bufsize) +{ + if (bufsize < sizeof(FeatureBits)) + return -1; + const CPUEntry *entry = find_cpu(cpu_name); + if (!entry) + return -1; + FeatureBits hw; + for (int i = 0; i < TARGET_FEATURE_WORDS; i++) + hw.bits[i] = entry->features.bits[i] & hw_feature_mask.bits[i]; + memcpy(features_out, &hw, sizeof(FeatureBits)); + return 0; +} + +extern "C" JL_DLLEXPORT void jl_cpufeatures_host(uint8_t *features_out, size_t bufsize) +{ + if (bufsize < sizeof(FeatureBits)) + return; + auto fb = tp::get_host_features(); + for (int i = 0; i < TARGET_FEATURE_WORDS; i++) + fb.bits[i] &= hw_feature_mask.bits[i]; + memcpy(features_out, &fb, sizeof(FeatureBits)); +} + +extern "C" JL_DLLEXPORT size_t jl_cpufeatures_cross_lookup( + const char *arch, const char *cpu_name, + uint8_t *features_out, size_t bufsize) +{ + tp::CrossFeatureBits fb; + if (!tp::cross_lookup_cpu(arch, cpu_name, fb)) + return 0; + size_t nbytes = fb.num_words * sizeof(uint64_t); + if (bufsize < nbytes) + return 0; + memcpy(features_out, fb.bits, nbytes); + return nbytes; +} + +extern "C" JL_DLLEXPORT size_t jl_cpufeatures_cross_nbytes(const char *arch) +{ + return tp::cross_feature_words(arch) * sizeof(uint64_t); +} + +extern "C" JL_DLLEXPORT unsigned jl_cpufeatures_cross_num_features(const char *arch) +{ + return tp::cross_num_features(arch); +} + +extern "C" JL_DLLEXPORT unsigned jl_cpufeatures_cross_num_cpus(const char *arch) +{ + return tp::cross_num_cpus(arch); +} + +extern "C" JL_DLLEXPORT const char *jl_cpufeatures_cross_feature_name(const char *arch, unsigned idx) +{ + return tp::cross_feature_name(arch, idx); +} + +extern "C" JL_DLLEXPORT int jl_cpufeatures_cross_feature_bit(const char *arch, unsigned idx) +{ + return tp::cross_feature_bit_at(arch, idx); +} + +extern "C" JL_DLLEXPORT const char *jl_cpufeatures_cross_cpu_name(const char 
*arch, unsigned idx) +{ + return tp::cross_cpu_name(arch, idx); +} + +// ============================================================================ +// FPU control +// ============================================================================ + +#if defined(_CPU_X86_64_) || defined(_CPU_X86_) + +#include + +static uint32_t subnormal_flags = [] { + int32_t info[4]; + jl_cpuid(info, 0); + if (info[0] >= 1) { + jl_cpuid(info, 1); + if (info[3] & (1 << 26)) { + return 0x00008040u; + } + else if (info[3] & (1 << 25)) { + return 0x00008000u; + } + } + return 0u; +}(); + +extern "C" JL_DLLEXPORT int32_t jl_get_zero_subnormals(void) +{ + return _mm_getcsr() & subnormal_flags; +} + +extern "C" JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero) +{ + uint32_t flags = subnormal_flags; + if (flags) { + uint32_t state = _mm_getcsr(); + if (isZero) state |= flags; + else state &= ~flags; + _mm_setcsr(state); + return 0; + } + return isZero; +} + +extern "C" JL_DLLEXPORT int32_t jl_get_default_nans(void) { return 0; } +extern "C" JL_DLLEXPORT int32_t jl_set_default_nans(int8_t isDefault) { return isDefault; } + +#elif defined(_CPU_AARCH64_) + +extern "C" JL_DLLEXPORT int32_t jl_get_zero_subnormals(void) +{ + uint64_t fpcr; + asm volatile ("mrs %0, fpcr" : "=r"(fpcr)); + return (fpcr & (1 << 24)) != 0; +} + +extern "C" JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero) +{ + uint64_t fpcr; + asm volatile ("mrs %0, fpcr" : "=r"(fpcr)); + if (isZero) fpcr |= (1 << 24); + else fpcr &= ~(uint64_t)(1 << 24); + asm volatile ("msr fpcr, %0" :: "r"(fpcr)); + return 0; +} + +extern "C" JL_DLLEXPORT int32_t jl_get_default_nans(void) +{ + uint64_t fpcr; + asm volatile ("mrs %0, fpcr" : "=r"(fpcr)); + return (fpcr & (1 << 25)) != 0; +} + +extern "C" JL_DLLEXPORT int32_t jl_set_default_nans(int8_t isDefault) +{ + uint64_t fpcr; + asm volatile ("mrs %0, fpcr" : "=r"(fpcr)); + if (isDefault) fpcr |= (1 << 25); + else fpcr &= ~(uint64_t)(1 << 25); + asm volatile ("msr fpcr, %0" 
:: "r"(fpcr)); + return 0; +} #else -#include "processor_fallback.cpp" +extern "C" JL_DLLEXPORT int32_t jl_get_zero_subnormals(void) { return 0; } +extern "C" JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero) { return isZero; } +extern "C" JL_DLLEXPORT int32_t jl_get_default_nans(void) { return 0; } +extern "C" JL_DLLEXPORT int32_t jl_set_default_nans(int8_t isDefault) { return isDefault; } #endif -// Global variable to store the CPU target string used for the sysimage -static std::string sysimage_cpu_target; + +// ============================================================================ +// Global exports (defined after backend) +// ============================================================================ JL_DLLEXPORT jl_value_t *jl_get_cpu_name(void) { @@ -1016,35 +884,19 @@ JL_DLLEXPORT jl_value_t *jl_get_cpu_name(void) JL_DLLEXPORT jl_value_t *jl_get_cpu_features(void) { - return jl_cstr_to_string(jl_get_cpu_features_llvm().c_str()); + return jl_cstr_to_string(get_host_feature_string().c_str()); } +#ifndef __clang_analyzer__ extern "C" JL_DLLEXPORT jl_value_t* jl_reflect_clone_targets() { - auto specs = jl_get_llvm_clone_targets(jl_options.cpu_target); - const uint32_t base_flags = 0; - llvm::SmallVector data; - auto push_i32 = [&] (uint32_t v) { - uint8_t buff[4]; - memcpy(buff, &v, 4); - data.insert(data.end(), buff, buff + 4); - }; - push_i32(specs.size()); - for (uint32_t i = 0; i < specs.size(); i++) { - push_i32(base_flags | (specs[i].flags & JL_TARGET_UNKNOWN_NAME)); - auto &specdata = specs[i].data; - data.insert(data.end(), specdata.begin(), specdata.end()); - } - + auto targets = jl_get_llvm_clone_targets(jl_options.cpu_target); + auto &data = targets.data; jl_value_t *arr = (jl_value_t*)jl_alloc_array_1d(jl_array_uint8_type, data.size()); uint8_t *out = jl_array_data(arr, uint8_t); memcpy(out, data.data(), data.size()); return arr; } - -extern "C" JL_DLLEXPORT void jl_reflect_feature_names(const FeatureName **fnames, size_t *nf) { - 
*fnames = feature_names; - *nf = nfeature_names; -} +#endif extern "C" JL_DLLEXPORT jl_value_t *jl_get_sysimage_cpu_target(void) { if (sysimage_cpu_target.empty()) { @@ -1053,7 +905,6 @@ extern "C" JL_DLLEXPORT jl_value_t *jl_get_sysimage_cpu_target(void) { return jl_cstr_to_string(sysimage_cpu_target.c_str()); } -// Function to set the sysimage CPU target (called during initialization) void jl_set_sysimage_cpu_target(const char *cpu_target) { if (cpu_target) { sysimage_cpu_target = cpu_target; diff --git a/src/processor.h b/src/processor.h index 091defadd4951..6567dcea01031 100644 --- a/src/processor.h +++ b/src/processor.h @@ -18,46 +18,9 @@ extern "C" { // Every image exports a `jl_image_pointers_t` as a global symbol `jl_image_pointers`. // This symbol acts as a root for all other code-related symbols in the image. -enum { - JL_TARGET_VEC_CALL = 1 << 0, - // Clone all functions - JL_TARGET_CLONE_ALL = 1 << 1, - // Clone when there's scalar math operations that can benefit from target-specific - // optimizations. This includes `muladd`, `fma`, `fast`/`contract` flags. 
- JL_TARGET_CLONE_MATH = 1 << 2, - // Clone when the function has a loop - JL_TARGET_CLONE_LOOP = 1 << 3, - // Clone when the function uses any vectors - // When this is specified, the cloning pass should also record if any of the cloned functions - // used this in any function call (including the signature of the function itself) - JL_TARGET_CLONE_SIMD = 1 << 4, - // The CPU name is unknown - JL_TARGET_UNKNOWN_NAME = 1 << 5, - // Optimize for size for this target - JL_TARGET_OPTSIZE = 1 << 6, - // Only optimize for size for this target - JL_TARGET_MINSIZE = 1 << 7, - // Clone when the function queries CPU features - JL_TARGET_CLONE_CPU = 1 << 8, - // Clone when the function uses fp16 - JL_TARGET_CLONE_FLOAT16 = 1 << 9, - // Clone when the function uses bf16 - JL_TARGET_CLONE_BFLOAT16 = 1 << 10, -}; - -#define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) JL_FEATURE_DEF(name, bit, llvmver) -typedef enum { -#define JL_FEATURE_DEF(name, bit, llvmver) JL_X86_##name = bit, -#include "features_x86.h" -#undef JL_FEATURE_DEF -#define JL_FEATURE_DEF(name, bit, llvmver) JL_AArch32_##name = bit, -#include "features_aarch32.h" -#undef JL_FEATURE_DEF -#define JL_FEATURE_DEF(name, bit, llvmver) JL_AArch64_##name = bit, -#include "features_aarch64.h" -#undef JL_FEATURE_DEF -} jl_cpu_feature_t; -#undef JL_FEATURE_DEF_NAME +// Feature indices come from the cpufeatures library's generated tables. +// The actual constants are defined in base/features_h.jl (auto-generated). +typedef uint32_t jl_cpu_feature_t; JL_DLLEXPORT int jl_test_cpu_feature(jl_cpu_feature_t feature); @@ -209,8 +172,9 @@ typedef struct { * * Return the data about the function pointers selected. 
*/ -jl_image_t jl_init_processor_sysimg(jl_image_buf_t image, const char *cpu_target); -jl_image_t jl_init_processor_pkgimg(jl_image_buf_t image); +void jl_check_cpu_target(const char *cpu_target, int imaging); +jl_image_t jl_load_sysimg(jl_image_buf_t image, const char *cpu_target); +jl_image_t jl_load_pkgimg(jl_image_buf_t image); // Internal function to set the sysimage CPU target during initialization void jl_set_sysimage_cpu_target(const char *cpu_target); @@ -250,7 +214,17 @@ extern jl_image_unpack_func_t *jl_image_unpack; #include #include -extern JL_DLLEXPORT bool jl_processor_print_help; +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86) +#include +#elif defined(__aarch64__) || defined(_M_ARM64) +#include +#elif defined(__riscv) && __riscv_xlen == 64 +#include +#else +#include +#endif +#include + // NOLINTBEGIN(clang-diagnostic-return-type-c-linkage) /** * Returns the CPU name and feature string to be used by LLVM JIT. @@ -258,7 +232,7 @@ extern JL_DLLEXPORT bool jl_processor_print_help; * If the detected/specified CPU name is not available on the LLVM version specified, * a fallback CPU name will be used. Unsupported features will be ignored. */ -extern "C" JL_DLLEXPORT std::pair> jl_get_llvm_target(const char *cpu_target, bool imaging, uint32_t &flags) JL_NOTSAFEPOINT; +extern "C" JL_DLLEXPORT std::pair jl_get_llvm_target(const char *cpu_target, bool imaging) JL_NOTSAFEPOINT; /** * Returns the CPU name and feature string to be used by LLVM disassembler. @@ -268,30 +242,28 @@ extern "C" JL_DLLEXPORT std::pair> extern "C" JL_DLLEXPORT const std::pair &jl_get_llvm_disasm_target(void) JL_NOTSAFEPOINT; struct jl_target_spec_t { - // LLVM target name std::string cpu_name; - // LLVM feature string std::string cpu_features; - // serialized identification data - llvm::SmallVector data; - // Clone condition. - uint32_t flags; - // Base target index. 
int base; + bool clone_all = false; + bool opt_size = false; + bool min_size = false; + tp::FeatureDiff diff; }; + +struct jl_clone_targets_t { + std::vector specs; + std::vector data; // serialized target identification blob +}; + /** - * Return the list of targets to clone + * Return the list of targets to clone and their serialized identification data */ -extern "C" JL_DLLEXPORT llvm::SmallVector jl_get_llvm_clone_targets(const char *cpu_target) JL_NOTSAFEPOINT; +extern "C" JL_DLLEXPORT jl_clone_targets_t jl_get_llvm_clone_targets(const char *cpu_target) JL_NOTSAFEPOINT; // NOLINTEND(clang-diagnostic-return-type-c-linkage) -struct FeatureName { - const char *name; - uint32_t bit; // bit index into a `uint32_t` array; - uint32_t llvmver; // 0 if it is available on the oldest LLVM version we support -}; - extern "C" JL_DLLEXPORT jl_value_t* jl_reflect_clone_targets(); -extern "C" JL_DLLEXPORT void jl_reflect_feature_names(const FeatureName **feature_names, size_t *nfeatures); +extern "C" JL_DLLEXPORT jl_value_t *jl_feature_bits_to_string(const uint8_t *bits, int32_t nwords); +extern "C" JL_DLLEXPORT std::string jl_expand_sysimage_keyword(const char *cpu_target); #endif #endif diff --git a/src/processor_arm.cpp b/src/processor_arm.cpp deleted file mode 100644 index 0fba135c0b17e..0000000000000 --- a/src/processor_arm.cpp +++ /dev/null @@ -1,2085 +0,0 @@ -// This file is a part of Julia. 
License is MIT: https://julialang.org/license - -// ARM (AArch32/AArch64) specific processor detection and dispatch - -#include -#include -#include -#include -#include -#include - -// This nesting is required to allow compilation on musl -#define USE_DYN_GETAUXVAL -#if (defined(_OS_LINUX_) || defined(_OS_FREEBSD_)) && defined(_CPU_AARCH64_) -# undef USE_DYN_GETAUXVAL -# include -#elif defined(__GLIBC_PREREQ) -# if __GLIBC_PREREQ(2, 16) -# undef USE_DYN_GETAUXVAL -# include -# endif -#elif defined _CPU_AARCH64_ && defined _OS_DARWIN_ -#include -#include -#endif - -namespace ARM { -enum class CPU : uint32_t { - generic = 0, - - // Architecture targets - armv7_a, - armv7_m, - armv7e_m, - armv7_r, - armv8_a, - armv8_m_base, - armv8_m_main, - armv8_r, - armv8_1_a, - armv8_2_a, - armv8_3_a, - armv8_4_a, - armv8_5_a, - armv8_6_a, - - // ARM - // armv6l - arm_mpcore, - arm_1136jf_s, - arm_1156t2f_s, - arm_1176jzf_s, - arm_cortex_m0, - arm_cortex_m1, - // armv7ml - arm_cortex_m3, - arm_cortex_m4, - arm_cortex_m7, - // armv7l - arm_cortex_a5, - arm_cortex_a7, - arm_cortex_a8, - arm_cortex_a9, - arm_cortex_a12, - arm_cortex_a15, - arm_cortex_a17, - arm_cortex_r4, - arm_cortex_r5, - arm_cortex_r7, - arm_cortex_r8, - // armv8ml - arm_cortex_m23, - arm_cortex_m33, - // armv8l - arm_cortex_a32, - arm_cortex_r52, - // aarch64 - arm_cortex_a34, - arm_cortex_a35, - arm_cortex_a53, - arm_cortex_a55, - arm_cortex_a57, - arm_cortex_a65, - arm_cortex_a65ae, - arm_cortex_a72, - arm_cortex_a73, - arm_cortex_a75, - arm_cortex_a76, - arm_cortex_a76ae, - arm_cortex_a77, - arm_cortex_a78, - arm_cortex_x1, - arm_neoverse_e1, - arm_neoverse_n1, - arm_neoverse_v1, - arm_neoverse_n2, - - // Cavium - // aarch64 - cavium_thunderx, - cavium_thunderx88, - cavium_thunderx88p1, - cavium_thunderx81, - cavium_thunderx83, - cavium_thunderx2t99, - cavium_thunderx2t99p1, - cavium_octeontx2, - cavium_octeontx2t98, - cavium_octeontx2t96, - cavium_octeontx2f95, - cavium_octeontx2f95n, - cavium_octeontx2f95mm, 
- - // Fujitsu - // aarch64 - fujitsu_a64fx, - - // HiSilicon - // aarch64 - hisilicon_tsv110, - - // Huaxingtong - // aarch64 - hxt_phecda, - - // NVIDIA - // aarch64 - nvidia_denver1, - nvidia_denver2, - nvidia_carmel, - - // AppliedMicro - // aarch64 - apm_xgene1, - apm_xgene2, - apm_xgene3, - - // Qualcomm - // armv7l - qualcomm_scorpion, - qualcomm_krait, - // aarch64 - qualcomm_kyro, - qualcomm_falkor, - qualcomm_saphira, - - // Samsung - // aarch64 - samsung_exynos_m1, - samsung_exynos_m2, - samsung_exynos_m3, - samsung_exynos_m4, - samsung_exynos_m5, - - // Apple - // armv7l - apple_swift, - // aarch64 - apple_a7, // cyclone - apple_a8, // typhoon - apple_a9, // twister - apple_a10, // hurricane - apple_a11, - apple_a12, - apple_a13, - apple_a14, - apple_a15, - apple_a16, - apple_a17, - apple_m1, - apple_m2, - apple_m3, - apple_m4, - apple_s4, - apple_s5, - - // Marvell - // armv7l - marvell_pj4, - // aarch64 - marvell_thunderx3t110, - - // Intel - // armv7l - intel_3735d, -}; - -#ifdef _CPU_AARCH64_ -static constexpr size_t feature_sz = 3; -static constexpr FeatureName feature_names[] = { -#define JL_FEATURE_DEF(name, bit, llvmver) {#name, bit, llvmver}, -#define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) {str, bit, llvmver}, -#include "features_aarch64.h" -#undef JL_FEATURE_DEF -#undef JL_FEATURE_DEF_NAME -}; -static constexpr uint32_t nfeature_names = sizeof(feature_names) / sizeof(FeatureName); - -template -static inline constexpr FeatureList get_feature_masks(Args... 
args) -{ - return ::get_feature_masks(args...); -} - -#define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) JL_FEATURE_DEF(name, bit, llvmver) -static constexpr auto feature_masks = get_feature_masks( -#define JL_FEATURE_DEF(name, bit, llvmver) bit, -#include "features_aarch64.h" -#undef JL_FEATURE_DEF - -1); -static const auto real_feature_masks = - feature_masks & FeatureList{{UINT32_MAX, UINT32_MAX, 0}}; - -namespace Feature { -enum : uint32_t { -#define JL_FEATURE_DEF(name, bit, llvmver) name = bit, -#include "features_aarch64.h" -#undef JL_FEATURE_DEF -}; -#undef JL_FEATURE_DEF_NAME -// This does not cover all dependencies (e.g. the ones that depends on arm versions) -static constexpr FeatureDep deps[] = { - {rcpc_immo, rcpc}, - {sha3, sha2}, - // {sha512, sha3}, - {ccdp, ccpp}, - {sve, fullfp16}, - {fp16fml, fullfp16}, - {altnzcv, flagm}, - {sve2, sve}, - {sve2_aes, sve2}, - {sve2_aes, aes}, - {sve2_bitperm, sve2}, - {sve2_sha3, sve2}, - {sve2_sha3, sha3}, - {sve2_sm4, sve2}, - {sve2_sm4, sm4}, - {f32mm, sve}, - {f64mm, sve}, -}; - -constexpr auto generic = get_feature_masks(); -constexpr auto armv8a_crc = get_feature_masks(crc); -constexpr auto armv8a_crc_crypto = armv8a_crc | get_feature_masks(aes, sha2); -constexpr auto armv8_1a = armv8a_crc | get_feature_masks(v8_1a, lse, rdm); // lor -constexpr auto armv8_1a_crypto = armv8_1a | get_feature_masks(aes, sha2); -constexpr auto armv8_2a = armv8_1a | get_feature_masks(v8_2a, ccpp); -constexpr auto armv8_2a_crypto = armv8_2a | get_feature_masks(aes, sha2); -constexpr auto armv8_3a = armv8_2a | get_feature_masks(v8_3a, jsconv, complxnum, rcpc); -constexpr auto armv8_3a_crypto = armv8_3a | get_feature_masks(aes, sha2); -constexpr auto armv8_4a = armv8_3a | get_feature_masks(v8_4a, dit, rcpc_immo, flagm); -constexpr auto armv8_4a_crypto = armv8_4a | get_feature_masks(aes, sha2); -constexpr auto armv8_5a = armv8_4a | get_feature_masks(v8_5a, sb, ccdp, altnzcv, fptoint); -constexpr auto armv8_5a_crypto = armv8_5a | 
get_feature_masks(aes, sha2); -constexpr auto armv8_6a = armv8_5a | get_feature_masks(v8_6a, i8mm, bf16); - -// For ARM cores, the features required can be found in the technical reference manual -// The relevant register values and the features they are related to are: -// ID_AA64ISAR0_EL1: -// .AES: aes, pmull -// .SHA1: sha1 -// .SHA2: sha2, sha512 -// .CRC32: crc -// .Atomic: les -// .RDM: rdm -// .SHA3: sha3 -// .SM3: sm3 (sm4) -// .SM4: sm4 -// .DP: dotprod -// .FHM: fp16fml -// .TS: flagm, altnzcz -// .RNDR: rand - -// ID_AA64ISAR1_EL1 -// .JSCVT: jsconv -// .FCMA: complxnum -// .LRCPC: rcpc, rcpc_immo -// .DPB: ccpp, ccdp -// .SB: sb -// .APA/.API: paca (pa) -// .GPA/.GPI: paga (pa) -// .FRINTTS: fptoint -// .I8MM: i8mm -// .BF16: bf16 -// .DGH: dgh - -// ID_AA64PFR0_EL1 -// .FP: fullfp16 -// .SVE: sve -// .DIT: dit -// .BT: bti - -// ID_AA64PFR1_EL1 -// .SSBS: ssbs -// .MTE: mte - -// ID_AA64MMFR2_EL1.AT: uscat - -// ID_AA64ZFR0_EL1 -// .SVEVer: sve2 -// .AES: sve2-aes, sve2-pmull -// .BitPerm: sve2-bitperm -// .SHA3: sve2-sha3 -// .SM4: sve2-sm4 -// .F32MM: f32mm -// .F64MM: f64mm - -constexpr auto arm_cortex_a34 = armv8a_crc; -constexpr auto arm_cortex_a35 = armv8a_crc; -constexpr auto arm_cortex_a53 = armv8a_crc; -constexpr auto arm_cortex_a55 = armv8_2a | get_feature_masks(dotprod, rcpc, fullfp16, ssbs); -constexpr auto arm_cortex_a57 = armv8a_crc; -constexpr auto arm_cortex_a65 = armv8_2a | get_feature_masks(rcpc, fullfp16, ssbs); -constexpr auto arm_cortex_a72 = armv8a_crc; -constexpr auto arm_cortex_a73 = armv8a_crc; -constexpr auto arm_cortex_a75 = armv8_2a | get_feature_masks(dotprod, rcpc, fullfp16); -constexpr auto arm_cortex_a76 = armv8_2a | get_feature_masks(dotprod, rcpc, fullfp16, ssbs); -constexpr auto arm_cortex_a77 = armv8_2a | get_feature_masks(dotprod, rcpc, fullfp16, ssbs); -constexpr auto arm_cortex_a78 = armv8_2a | get_feature_masks(dotprod, rcpc, fullfp16, ssbs); // spe -constexpr auto arm_cortex_x1 = armv8_2a | 
get_feature_masks(dotprod, rcpc, fullfp16, ssbs); // spe -constexpr auto arm_neoverse_e1 = armv8_2a | get_feature_masks(rcpc, fullfp16, ssbs); -constexpr auto arm_neoverse_n1 = armv8_2a | get_feature_masks(dotprod, rcpc, fullfp16, ssbs); -constexpr auto arm_neoverse_v1 = armv8_4a | get_feature_masks(sve, i8mm, bf16, fullfp16, ssbs, rand); -constexpr auto arm_neoverse_n2 = armv8_5a | get_feature_masks(sve, i8mm, bf16, fullfp16, sve2, - sve2_bitperm, rand, mte); -constexpr auto cavium_thunderx = armv8a_crc_crypto; -constexpr auto cavium_thunderx88 = armv8a_crc_crypto; -constexpr auto cavium_thunderx88p1 = armv8a_crc_crypto; -constexpr auto cavium_thunderx81 = armv8a_crc_crypto; -constexpr auto cavium_thunderx83 = armv8a_crc_crypto; -constexpr auto cavium_thunderx2t99 = armv8_1a_crypto; -constexpr auto cavium_thunderx2t99p1 = cavium_thunderx2t99; -constexpr auto cavium_octeontx2 = armv8_2a_crypto; -constexpr auto fujitsu_a64fx = armv8_2a | get_feature_masks(sha2, fullfp16, sve, complxnum); -constexpr auto hisilicon_tsv110 = armv8_2a_crypto | get_feature_masks(dotprod, fullfp16); -constexpr auto hxt_phecda = armv8a_crc_crypto; -constexpr auto marvell_thunderx3t110 = armv8_3a_crypto; -constexpr auto nvidia_denver1 = generic; // TODO? (crc, crypto) -constexpr auto nvidia_denver2 = armv8a_crc_crypto; -constexpr auto nvidia_carmel = armv8_2a_crypto | get_feature_masks(fullfp16); -constexpr auto apm_xgene1 = generic; -constexpr auto apm_xgene2 = generic; // TODO? -constexpr auto apm_xgene3 = generic; // TODO? 
-constexpr auto qualcomm_kyro = armv8a_crc_crypto; -constexpr auto qualcomm_falkor = armv8a_crc_crypto | get_feature_masks(rdm); -constexpr auto qualcomm_saphira = armv8_4a_crypto; -constexpr auto samsung_exynos_m1 = armv8a_crc_crypto; -constexpr auto samsung_exynos_m2 = armv8a_crc_crypto; -constexpr auto samsung_exynos_m3 = armv8a_crc_crypto; -constexpr auto samsung_exynos_m4 = armv8_2a_crypto | get_feature_masks(dotprod, fullfp16); -constexpr auto samsung_exynos_m5 = samsung_exynos_m4; -constexpr auto apple_a7 = armv8a_crc_crypto; -constexpr auto apple_a10 = armv8a_crc_crypto | get_feature_masks(rdm); -constexpr auto apple_a11 = armv8_2a_crypto | get_feature_masks(fullfp16); -constexpr auto apple_a12 = armv8_3a_crypto | get_feature_masks(fullfp16); -constexpr auto apple_a13 = armv8_4a_crypto | get_feature_masks(fp16fml, fullfp16, sha3); -constexpr auto apple_a14 = armv8_5a_crypto | get_feature_masks(dotprod,fp16fml, fullfp16, sha3); -constexpr auto apple_a15 = armv8_5a_crypto | get_feature_masks(dotprod,fp16fml, fullfp16, sha3, i8mm, bf16); -constexpr auto apple_a16 = armv8_5a_crypto | get_feature_masks(dotprod,fp16fml, fullfp16, sha3, i8mm, bf16); -constexpr auto apple_a17 = armv8_5a_crypto | get_feature_masks(dotprod,fp16fml, fullfp16, sha3, i8mm, bf16); -constexpr auto apple_m1 = armv8_5a_crypto | get_feature_masks(dotprod,fp16fml, fullfp16, sha3); -constexpr auto apple_m2 = armv8_5a_crypto | get_feature_masks(dotprod,fp16fml, fullfp16, sha3, i8mm, bf16); -constexpr auto apple_m3 = armv8_5a_crypto | get_feature_masks(dotprod,fp16fml, fullfp16, sha3, i8mm, bf16); -constexpr auto apple_m4 = armv8_5a_crypto | get_feature_masks(dotprod,fp16fml, fullfp16, sha3, i8mm, bf16); -// Features based on https://github.com/llvm/llvm-project/blob/82507f1798768280cf5d5aab95caaafbc7fe6f47/llvm/include/llvm/Support/AArch64TargetParser.def -// and sysctl -a hw.optional -constexpr auto apple_s4 = apple_a12; -constexpr auto apple_s5 = apple_a12; - -} - -static constexpr CPUSpec 
cpus[] = { - {"generic", CPU::generic, CPU::generic, 0, Feature::generic}, - {"armv8.1-a", CPU::armv8_1_a, CPU::generic, 0, Feature::armv8_1a}, - {"armv8.2-a", CPU::armv8_2_a, CPU::generic, 0, Feature::armv8_2a}, - {"armv8.3_a", CPU::armv8_3_a, CPU::generic, 0, Feature::armv8_3a}, - {"armv8.4-a", CPU::armv8_4_a, CPU::generic, 0, Feature::armv8_4a}, - {"armv8.5-a", CPU::armv8_5_a, CPU::generic, 0, Feature::armv8_5a}, - {"armv8.6_a", CPU::armv8_6_a, CPU::generic, 0, Feature::armv8_6a}, - {"cortex-a34", CPU::arm_cortex_a34, CPU::arm_cortex_a35, 110000, Feature::arm_cortex_a34}, - {"cortex-a35", CPU::arm_cortex_a35, CPU::generic, 0, Feature::arm_cortex_a35}, - {"cortex-a53", CPU::arm_cortex_a53, CPU::generic, 0, Feature::arm_cortex_a53}, - {"cortex-a55", CPU::arm_cortex_a55, CPU::generic, 0, Feature::arm_cortex_a55}, - {"cortex-a57", CPU::arm_cortex_a57, CPU::generic, 0, Feature::arm_cortex_a57}, - {"cortex-a65", CPU::arm_cortex_a65, CPU::arm_cortex_a75, 100000, Feature::arm_cortex_a65}, - {"cortex-a65ae", CPU::arm_cortex_a65ae, CPU::arm_cortex_a75, 100000, Feature::arm_cortex_a65}, - {"cortex-a72", CPU::arm_cortex_a72, CPU::generic, 0, Feature::arm_cortex_a72}, - {"cortex-a73", CPU::arm_cortex_a73, CPU::generic, 0, Feature::arm_cortex_a73}, - {"cortex-a75", CPU::arm_cortex_a75, CPU::generic, 0, Feature::arm_cortex_a75}, - {"cortex-a76", CPU::arm_cortex_a76, CPU::generic, 0, Feature::arm_cortex_a76}, - {"cortex-a76ae", CPU::arm_cortex_a76ae, CPU::generic, 0, Feature::arm_cortex_a76}, - {"cortex-a77", CPU::arm_cortex_a77, CPU::arm_cortex_a76, 110000, Feature::arm_cortex_a77}, - {"cortex-a78", CPU::arm_cortex_a78, CPU::arm_cortex_a77, 110000, Feature::arm_cortex_a78}, - {"cortex-x1", CPU::arm_cortex_x1, CPU::arm_cortex_a78, 110000, Feature::arm_cortex_x1}, - {"neoverse-e1", CPU::arm_neoverse_e1, CPU::arm_cortex_a76, 100000, Feature::arm_neoverse_e1}, - {"neoverse-n1", CPU::arm_neoverse_n1, CPU::arm_cortex_a76, 100000, Feature::arm_neoverse_n1}, - {"neoverse-v1", 
CPU::arm_neoverse_v1, CPU::arm_neoverse_n1, UINT32_MAX, Feature::arm_neoverse_v1}, - {"neoverse-n2", CPU::arm_neoverse_n2, CPU::arm_neoverse_n1, UINT32_MAX, Feature::arm_neoverse_n2}, - {"thunderx", CPU::cavium_thunderx, CPU::generic, 0, Feature::cavium_thunderx}, - {"thunderxt88", CPU::cavium_thunderx88, CPU::generic, 0, Feature::cavium_thunderx88}, - {"thunderxt88p1", CPU::cavium_thunderx88p1, CPU::cavium_thunderx88, UINT32_MAX, - Feature::cavium_thunderx88p1}, - {"thunderxt81", CPU::cavium_thunderx81, CPU::generic, 0, Feature::cavium_thunderx81}, - {"thunderxt83", CPU::cavium_thunderx83, CPU::generic, 0, Feature::cavium_thunderx83}, - {"thunderx2t99", CPU::cavium_thunderx2t99, CPU::generic, 0, Feature::cavium_thunderx2t99}, - {"thunderx2t99p1", CPU::cavium_thunderx2t99p1, CPU::cavium_thunderx2t99, UINT32_MAX, - Feature::cavium_thunderx2t99p1}, - {"octeontx2", CPU::cavium_octeontx2, CPU::arm_cortex_a57, UINT32_MAX, - Feature::cavium_octeontx2}, - {"octeontx2t98", CPU::cavium_octeontx2t98, CPU::arm_cortex_a57, UINT32_MAX, - Feature::cavium_octeontx2}, - {"octeontx2t96", CPU::cavium_octeontx2t96, CPU::arm_cortex_a57, UINT32_MAX, - Feature::cavium_octeontx2}, - {"octeontx2f95", CPU::cavium_octeontx2f95, CPU::arm_cortex_a57, UINT32_MAX, - Feature::cavium_octeontx2}, - {"octeontx2f95n", CPU::cavium_octeontx2f95n, CPU::arm_cortex_a57, UINT32_MAX, - Feature::cavium_octeontx2}, - {"octeontx2f95mm", CPU::cavium_octeontx2f95mm, CPU::arm_cortex_a57, UINT32_MAX, - Feature::cavium_octeontx2}, - {"a64fx", CPU::fujitsu_a64fx, CPU::generic, 110000, Feature::fujitsu_a64fx}, - {"tsv110", CPU::hisilicon_tsv110, CPU::generic, 0, Feature::hisilicon_tsv110}, - {"phecda", CPU::hxt_phecda, CPU::qualcomm_falkor, UINT32_MAX, Feature::hxt_phecda}, - {"denver1", CPU::nvidia_denver1, CPU::generic, UINT32_MAX, Feature::nvidia_denver1}, - {"denver2", CPU::nvidia_denver2, CPU::generic, UINT32_MAX, Feature::nvidia_denver2}, - {"carmel", CPU::nvidia_carmel, CPU::generic, 110000, 
Feature::nvidia_carmel}, - {"xgene1", CPU::apm_xgene1, CPU::generic, UINT32_MAX, Feature::apm_xgene1}, - {"xgene2", CPU::apm_xgene2, CPU::generic, UINT32_MAX, Feature::apm_xgene2}, - {"xgene3", CPU::apm_xgene3, CPU::generic, UINT32_MAX, Feature::apm_xgene3}, - {"kyro", CPU::qualcomm_kyro, CPU::generic, 0, Feature::qualcomm_kyro}, - {"falkor", CPU::qualcomm_falkor, CPU::generic, 0, Feature::qualcomm_falkor}, - {"saphira", CPU::qualcomm_saphira, CPU::generic, 0, Feature::qualcomm_saphira}, - {"exynos-m1", CPU::samsung_exynos_m1, CPU::generic, UINT32_MAX, Feature::samsung_exynos_m1}, - {"exynos-m2", CPU::samsung_exynos_m2, CPU::generic, UINT32_MAX, Feature::samsung_exynos_m2}, - {"exynos-m3", CPU::samsung_exynos_m3, CPU::generic, 0, Feature::samsung_exynos_m3}, - {"exynos-m4", CPU::samsung_exynos_m4, CPU::generic, 0, Feature::samsung_exynos_m4}, - {"exynos-m5", CPU::samsung_exynos_m5, CPU::samsung_exynos_m4, 110000, - Feature::samsung_exynos_m5}, - {"apple-a7", CPU::apple_a7, CPU::generic, 100000, Feature::apple_a7}, - {"apple-a8", CPU::apple_a8, CPU::generic, 100000, Feature::apple_a7}, - {"apple-a9", CPU::apple_a9, CPU::generic, 100000, Feature::apple_a7}, - {"apple-a10", CPU::apple_a10, CPU::generic, 100000, Feature::apple_a10}, - {"apple-a11", CPU::apple_a11, CPU::generic, 100000, Feature::apple_a11}, - {"apple-a12", CPU::apple_a12, CPU::generic, 100000, Feature::apple_a12}, - {"apple-a13", CPU::apple_a13, CPU::generic, 100000, Feature::apple_a13}, - {"apple-a14", CPU::apple_a14, CPU::apple_a13, 120000, Feature::apple_a14}, - {"apple-a15", CPU::apple_a15, CPU::apple_a14, 160000, Feature::apple_a15}, - {"apple-a16", CPU::apple_a16, CPU::apple_a14, 160000, Feature::apple_a16}, - {"apple-a17", CPU::apple_a17, CPU::apple_a16, 190000, Feature::apple_a17}, - {"apple-m1", CPU::apple_m1, CPU::apple_a14, 130000, Feature::apple_m1}, - {"apple-m2", CPU::apple_m2, CPU::apple_m1, 160000, Feature::apple_m2}, - {"apple-m3", CPU::apple_m3, CPU::apple_m2, 180000, 
Feature::apple_m3}, - {"apple-m4", CPU::apple_m4, CPU::apple_m3, 190000, Feature::apple_m4}, - {"apple-s4", CPU::apple_s4, CPU::generic, 100000, Feature::apple_s4}, - {"apple-s5", CPU::apple_s5, CPU::generic, 100000, Feature::apple_s5}, - {"thunderx3t110", CPU::marvell_thunderx3t110, CPU::cavium_thunderx2t99, 110000, - Feature::marvell_thunderx3t110}, -}; -#else -static constexpr size_t feature_sz = 3; -static constexpr FeatureName feature_names[] = { -#define JL_FEATURE_DEF(name, bit, llvmver) {#name, bit, llvmver}, -#define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) {str, bit, llvmver}, -#include "features_aarch32.h" -#undef JL_FEATURE_DEF -#undef JL_FEATURE_DEF_NAME -}; -static constexpr uint32_t nfeature_names = sizeof(feature_names) / sizeof(FeatureName); - -template -static inline constexpr FeatureList get_feature_masks(Args... args) -{ - return ::get_feature_masks(args...); -} - -#define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) JL_FEATURE_DEF(name, bit, llvmver) -static constexpr auto feature_masks = get_feature_masks( -#define JL_FEATURE_DEF(name, bit, llvmver) bit, -#include "features_aarch32.h" -#undef JL_FEATURE_DEF - -1); -static const auto real_feature_masks = - feature_masks & FeatureList{{UINT32_MAX, UINT32_MAX, 0}}; - -namespace Feature { -enum : uint32_t { -#define JL_FEATURE_DEF(name, bit, llvmver) name = bit, -#include "features_aarch32.h" -#undef JL_FEATURE_DEF -}; -#undef JL_FEATURE_DEF_NAME -// This does not cover all dependencies (e.g. 
the ones that depends on arm versions) -static constexpr FeatureDep deps[] = { - {neon, vfp3}, - {vfp4, vfp3}, - {crypto, neon}, -}; - -// These are the real base requirements of the specific architectures -constexpr auto _armv7m = get_feature_masks(v7, mclass, hwdiv); -constexpr auto _armv7a = get_feature_masks(v7, aclass); -constexpr auto _armv7r = get_feature_masks(v7, rclass); -constexpr auto _armv8m = get_feature_masks(v7, v8, mclass, hwdiv); -constexpr auto _armv8a = get_feature_masks(v7, v8, aclass, neon, vfp3, vfp4, d32, - hwdiv, hwdiv_arm); -constexpr auto _armv8r = get_feature_masks(v7, v8, rclass, neon, vfp3, vfp4, d32, - hwdiv, hwdiv_arm); - -// Set `generic` to match the feature requirement of the `C` code. -// we'll require at least these when compiling the sysimg. -#if __ARM_ARCH >= 8 -# if !defined(__ARM_ARCH_PROFILE) -constexpr auto generic = get_feature_masks(v7, v8, hwdiv); -# elif __ARM_ARCH_PROFILE == 'A' -constexpr auto generic = _armv8a; -# elif __ARM_ARCH_PROFILE == 'R' -constexpr auto generic = _armv8r; -# elif __ARM_ARCH_PROFILE == 'M' -constexpr auto generic = _armv8m; -# else -constexpr auto generic = get_feature_masks(v7, v8, hwdiv); -# endif -#elif __ARM_ARCH == 7 -# if !defined(__ARM_ARCH_PROFILE) -constexpr auto generic = get_feature_masks(v7); -# elif __ARM_ARCH_PROFILE == 'A' -constexpr auto generic = _armv7a; -# elif __ARM_ARCH_PROFILE == 'R' -constexpr auto generic = _armv7r; -# elif __ARM_ARCH_PROFILE == 'M' -constexpr auto generic = _armv7m; -# else -constexpr auto generic = get_feature_masks(v7); -# endif -#else -constexpr auto generic = get_feature_masks(); -#endif - -// All feature sets below should use or be or'ed with one of these (or generic). -// This makes sure that, for example, the `generic` target on `armv7-a` binary is equivalent -// to the `armv7-a` target. 
-constexpr auto armv7m = generic | _armv7m; -constexpr auto armv7a = generic | _armv7a; -constexpr auto armv7r = generic | _armv7r; -constexpr auto armv8m = generic | _armv8m; -constexpr auto armv8a = generic | _armv8a; -constexpr auto armv8r = generic | _armv8r; - -// armv7l -constexpr auto arm_cortex_a5 = armv7a; -constexpr auto arm_cortex_a7 = armv7a | get_feature_masks(vfp3, vfp4, neon); -constexpr auto arm_cortex_a8 = armv7a | get_feature_masks(d32, vfp3, neon); -constexpr auto arm_cortex_a9 = armv7a; -constexpr auto arm_cortex_a12 = armv7a | get_feature_masks(d32, vfp3, vfp4, neon); -constexpr auto arm_cortex_a15 = armv7a | get_feature_masks(d32, vfp3, vfp4, neon); -constexpr auto arm_cortex_a17 = armv7a | get_feature_masks(d32, vfp3, vfp4, neon); -constexpr auto arm_cortex_r4 = armv7r | get_feature_masks(vfp3, hwdiv); -constexpr auto arm_cortex_r5 = armv7r | get_feature_masks(vfp3, hwdiv, hwdiv_arm); -constexpr auto arm_cortex_r7 = armv7r | get_feature_masks(vfp3, hwdiv, hwdiv_arm); -constexpr auto arm_cortex_r8 = armv7r | get_feature_masks(vfp3, hwdiv, hwdiv_arm); -constexpr auto qualcomm_scorpion = armv7a | get_feature_masks(v7, aclass, vfp3, neon); -constexpr auto qualcomm_krait = armv7a | get_feature_masks(vfp3, vfp4, neon, hwdiv, hwdiv_arm); -constexpr auto apple_swift = armv7a | get_feature_masks(d32, vfp3, vfp4, neon, hwdiv, hwdiv_arm); -constexpr auto marvell_pj4 = armv7a | get_feature_masks(vfp3); -constexpr auto intel_3735d = armv7a | get_feature_masks(vfp3, neon); -// armv8ml -constexpr auto arm_cortex_m23 = armv8m; // unsupported -constexpr auto arm_cortex_m33 = armv8m | get_feature_masks(v8_m_main); // unsupported -// armv8l -constexpr auto armv8a_crc = armv8a | get_feature_masks(crc); -constexpr auto armv8_1a = armv8a_crc | get_feature_masks(v8_1a); -constexpr auto armv8_2a = armv8_1a | get_feature_masks(v8_2a); -constexpr auto armv8a_crc_crypto = armv8a_crc | get_feature_masks(crypto); -constexpr auto armv8_2a_crypto = armv8_2a | 
get_feature_masks(crypto); -constexpr auto armv8_3a = armv8_2a | get_feature_masks(v8_3a); -constexpr auto armv8_3a_crypto = armv8_3a | get_feature_masks(crypto); -constexpr auto armv8_4a = armv8_3a | get_feature_masks(v8_4a); -constexpr auto armv8_4a_crypto = armv8_4a | get_feature_masks(crypto); -constexpr auto armv8_5a = armv8_4a | get_feature_masks(v8_5a); -constexpr auto armv8_5a_crypto = armv8_5a | get_feature_masks(crypto); -constexpr auto armv8_6a = armv8_5a | get_feature_masks(v8_6a); -constexpr auto armv8_6a_crypto = armv8_6a | get_feature_masks(crypto); - -constexpr auto arm_cortex_a32 = armv8a_crc; -constexpr auto arm_cortex_r52 = armv8a_crc; -constexpr auto arm_cortex_a35 = armv8a_crc; -constexpr auto arm_cortex_a53 = armv8a_crc; -constexpr auto arm_cortex_a55 = armv8_2a; -constexpr auto arm_cortex_a57 = armv8a_crc; -constexpr auto arm_cortex_a72 = armv8a_crc; -constexpr auto arm_cortex_a73 = armv8a_crc; -constexpr auto arm_cortex_a75 = armv8_2a; -constexpr auto arm_cortex_a76 = armv8_2a; -constexpr auto arm_cortex_a77 = armv8_2a; -constexpr auto arm_cortex_a78 = armv8_2a; -constexpr auto arm_cortex_x1 = armv8_2a; -constexpr auto arm_neoverse_n1 = armv8_2a; -constexpr auto arm_neoverse_v1 = armv8_4a; -constexpr auto arm_neoverse_n2 = armv8_5a; -constexpr auto nvidia_denver1 = armv8a; // TODO? (crc, crypto) -constexpr auto nvidia_denver2 = armv8a_crc_crypto; -constexpr auto apm_xgene1 = armv8a; -constexpr auto apm_xgene2 = armv8a; // TODO? -constexpr auto apm_xgene3 = armv8a; // TODO? 
-constexpr auto qualcomm_kyro = armv8a_crc_crypto; -constexpr auto qualcomm_falkor = armv8a_crc_crypto; -constexpr auto qualcomm_saphira = armv8_3a_crypto; -constexpr auto samsung_exynos_m1 = armv8a_crc_crypto; -constexpr auto samsung_exynos_m2 = armv8a_crc_crypto; -constexpr auto samsung_exynos_m3 = armv8a_crc_crypto; -constexpr auto samsung_exynos_m4 = armv8_2a_crypto; -constexpr auto samsung_exynos_m5 = samsung_exynos_m4; -constexpr auto apple_a7 = armv8a_crc_crypto; - -} - -static constexpr CPUSpec cpus[] = { - {"generic", CPU::generic, CPU::generic, 0, Feature::generic}, - // armv6 - {"mpcore", CPU::arm_mpcore, CPU::generic, 0, Feature::generic}, - {"arm1136jf-s", CPU::arm_1136jf_s, CPU::generic, 0, Feature::generic}, - {"arm1156t2f-s", CPU::arm_1156t2f_s, CPU::generic, 0, Feature::generic}, - {"arm1176jzf-s", CPU::arm_1176jzf_s, CPU::generic, 0, Feature::generic}, - {"cortex-m0", CPU::arm_cortex_m0, CPU::generic, 0, Feature::generic}, - {"cortex-m1", CPU::arm_cortex_m1, CPU::generic, 0, Feature::generic}, - // armv7ml - {"armv7-m", CPU::armv7_m, CPU::generic, 0, Feature::armv7m}, - {"armv7e-m", CPU::armv7e_m, CPU::generic, 0, Feature::armv7m}, - {"cortex-m3", CPU::arm_cortex_m3, CPU::generic, 0, Feature::armv7m}, - {"cortex-m4", CPU::arm_cortex_m4, CPU::generic, 0, Feature::armv7m}, - {"cortex-m7", CPU::arm_cortex_m7, CPU::generic, 0, Feature::armv7m}, - // armv7l - {"armv7-a", CPU::armv7_a, CPU::generic, 0, Feature::armv7a}, - {"armv7-r", CPU::armv7_r, CPU::generic, 0, Feature::armv7r}, - {"cortex-a5", CPU::arm_cortex_a5, CPU::generic, 0, Feature::arm_cortex_a5}, - {"cortex-a7", CPU::arm_cortex_a7, CPU::generic, 0, Feature::arm_cortex_a7}, - {"cortex-a8", CPU::arm_cortex_a8, CPU::generic, 0, Feature::arm_cortex_a8}, - {"cortex-a9", CPU::arm_cortex_a9, CPU::generic, 0, Feature::arm_cortex_a9}, - {"cortex-a12", CPU::arm_cortex_a12, CPU::generic, 0, Feature::arm_cortex_a12}, - {"cortex-a15", CPU::arm_cortex_a15, CPU::generic, 0, Feature::arm_cortex_a15}, - 
{"cortex-a17", CPU::arm_cortex_a17, CPU::generic, 0, Feature::arm_cortex_a17}, - {"cortex-r4", CPU::arm_cortex_r4, CPU::generic, 0, Feature::arm_cortex_r4}, - {"cortex-r5", CPU::arm_cortex_r5, CPU::generic, 0, Feature::arm_cortex_r5}, - {"cortex-r7", CPU::arm_cortex_r7, CPU::generic, 0, Feature::arm_cortex_r7}, - {"cortex-r8", CPU::arm_cortex_r8, CPU::generic, 0, Feature::arm_cortex_r8}, - {"scorpion", CPU::qualcomm_scorpion, CPU::armv7_a, UINT32_MAX, Feature::qualcomm_scorpion}, - {"krait", CPU::qualcomm_krait, CPU::generic, 0, Feature::qualcomm_krait}, - {"swift", CPU::apple_swift, CPU::generic, 0, Feature::apple_swift}, - {"pj4", CPU::marvell_pj4, CPU::armv7_a, UINT32_MAX, Feature::marvell_pj4}, - {"3735d", CPU::intel_3735d, CPU::armv7_a, UINT32_MAX, Feature::intel_3735d}, - - // armv8ml - {"armv8-m.base", CPU::armv8_m_base, CPU::generic, 0, Feature::armv8m}, - {"armv8-m.main", CPU::armv8_m_main, CPU::generic, 0, Feature::armv8m}, - {"cortex-m23", CPU::arm_cortex_m23, CPU::armv8_m_base, 0, Feature::arm_cortex_m23}, - {"cortex-m33", CPU::arm_cortex_m33, CPU::armv8_m_main, 0, Feature::arm_cortex_m33}, - - // armv8l - {"armv8-a", CPU::armv8_a, CPU::generic, 0, Feature::armv8a}, - {"armv8-r", CPU::armv8_r, CPU::generic, 0, Feature::armv8r}, - {"armv8.1-a", CPU::armv8_1_a, CPU::generic, 0, Feature::armv8_1a}, - {"armv8.2-a", CPU::armv8_2_a, CPU::generic, 0, Feature::armv8_2a}, - {"armv8.3-a", CPU::armv8_3_a, CPU::generic, 0, Feature::armv8_3a}, - {"armv8.4-a", CPU::armv8_4_a, CPU::generic, 0, Feature::armv8_4a}, - {"armv8.5-a", CPU::armv8_5_a, CPU::generic, 0, Feature::armv8_5a}, - {"armv8.6_a", CPU::armv8_6_a, CPU::generic, 0, Feature::armv8_6a}, - {"cortex-a32", CPU::arm_cortex_a32, CPU::generic, 0, Feature::arm_cortex_a32}, - {"cortex-r52", CPU::arm_cortex_r52, CPU::generic, 0, Feature::arm_cortex_r52}, - {"cortex-a35", CPU::arm_cortex_a35, CPU::generic, 0, Feature::arm_cortex_a35}, - {"cortex-a53", CPU::arm_cortex_a53, CPU::generic, 0, Feature::arm_cortex_a53}, - 
{"cortex-a55", CPU::arm_cortex_a55, CPU::generic, 0, Feature::arm_cortex_a55}, - {"cortex-a57", CPU::arm_cortex_a57, CPU::generic, 0, Feature::arm_cortex_a57}, - {"cortex-a72", CPU::arm_cortex_a72, CPU::generic, 0, Feature::arm_cortex_a72}, - {"cortex-a73", CPU::arm_cortex_a73, CPU::generic, 0, Feature::arm_cortex_a73}, - {"cortex-a75", CPU::arm_cortex_a75, CPU::generic, 0, Feature::arm_cortex_a75}, - {"cortex-a76", CPU::arm_cortex_a76, CPU::generic, 0, Feature::arm_cortex_a76}, - {"cortex-a76ae", CPU::arm_cortex_a76ae, CPU::generic, 0, Feature::arm_cortex_a76}, - {"cortex-a77", CPU::arm_cortex_a77, CPU::arm_cortex_a76, 110000, Feature::arm_cortex_a77}, - {"cortex-a78", CPU::arm_cortex_a78, CPU::arm_cortex_a77, 110000, Feature::arm_cortex_a78}, - {"cortex-x1", CPU::arm_cortex_x1, CPU::arm_cortex_a78, 110000, Feature::arm_cortex_x1}, - {"neoverse-n1", CPU::arm_neoverse_n1, CPU::arm_cortex_a76, 100000, Feature::arm_neoverse_n1}, - {"neoverse-v1", CPU::arm_neoverse_v1, CPU::arm_neoverse_n1, UINT32_MAX, Feature::arm_neoverse_v1}, - {"neoverse-n2", CPU::arm_neoverse_n2, CPU::arm_neoverse_n1, UINT32_MAX, Feature::arm_neoverse_n2}, - {"denver1", CPU::nvidia_denver1, CPU::arm_cortex_a53, UINT32_MAX, Feature::nvidia_denver1}, - {"denver2", CPU::nvidia_denver2, CPU::arm_cortex_a57, UINT32_MAX, Feature::nvidia_denver2}, - {"xgene1", CPU::apm_xgene1, CPU::armv8_a, UINT32_MAX, Feature::apm_xgene1}, - {"xgene2", CPU::apm_xgene2, CPU::armv8_a, UINT32_MAX, Feature::apm_xgene2}, - {"xgene3", CPU::apm_xgene3, CPU::armv8_a, UINT32_MAX, Feature::apm_xgene3}, - {"kyro", CPU::qualcomm_kyro, CPU::armv8_a, UINT32_MAX, Feature::qualcomm_kyro}, - {"falkor", CPU::qualcomm_falkor, CPU::armv8_a, UINT32_MAX, Feature::qualcomm_falkor}, - {"saphira", CPU::qualcomm_saphira, CPU::armv8_a, UINT32_MAX, Feature::qualcomm_saphira}, - {"exynos-m1", CPU::samsung_exynos_m1, CPU::generic, UINT32_MAX, Feature::samsung_exynos_m1}, - {"exynos-m2", CPU::samsung_exynos_m2, CPU::generic, UINT32_MAX, 
Feature::samsung_exynos_m2}, - {"exynos-m3", CPU::samsung_exynos_m3, CPU::generic, 0, Feature::samsung_exynos_m3}, - {"exynos-m4", CPU::samsung_exynos_m4, CPU::generic, 0, Feature::samsung_exynos_m4}, - {"exynos-m5", CPU::samsung_exynos_m5, CPU::samsung_exynos_m4, 110000, Feature::samsung_exynos_m5}, - {"apple-a7", CPU::apple_a7, CPU::generic, 0, Feature::apple_a7}, -}; -#endif -static constexpr size_t ncpu_names = sizeof(cpus) / sizeof(cpus[0]); - -static inline const CPUSpec *find_cpu(uint32_t cpu) -{ - return ::find_cpu(cpu, cpus, ncpu_names); -} - -static inline const CPUSpec *find_cpu(llvm::StringRef name) -{ - return ::find_cpu(name, cpus, ncpu_names); -} - -static inline const char *find_cpu_name(uint32_t cpu) -{ - return ::find_cpu_name(cpu, cpus, ncpu_names); -} - -#if defined _CPU_AARCH64_ && defined _OS_DARWIN_ - -static NOINLINE std::pair> _get_host_cpu() -{ - using namespace llvm; - char buffer[128]; - size_t bufferlen = 128; - sysctlbyname("machdep.cpu.brand_string",&buffer,&bufferlen,NULL,0); - StringRef cpu_name(buffer); - if (cpu_name.find("M1") != StringRef ::npos) - return std::make_pair((uint32_t)CPU::apple_m1, Feature::apple_m1); - else if (cpu_name.find("M2") != StringRef ::npos) - return std::make_pair((uint32_t)CPU::apple_m2, Feature::apple_m2); - else if (cpu_name.find("M3") != StringRef ::npos) - return std::make_pair((uint32_t)CPU::apple_m3, Feature::apple_m3); - else if (cpu_name.find("M4") != StringRef ::npos) - return std::make_pair((uint32_t)CPU::apple_m4, Feature::apple_m4); - else - return std::make_pair((uint32_t)CPU::apple_m1, Feature::apple_m1); -} - -#else - -// auxval reader - -#ifndef AT_HWCAP -# define AT_HWCAP 16 -#endif -#ifndef AT_HWCAP2 -# define AT_HWCAP2 26 -#endif - -#if defined(_OS_FREEBSD_) -static inline unsigned long jl_getauxval(unsigned long type) -{ - unsigned long val; - if (elf_aux_info((int)type, &val, sizeof(val)) != 0) { - return 0; - } - return val; -} -#elif defined(USE_DYN_GETAUXVAL) -static unsigned 
long getauxval_procfs(unsigned long type) -{ - int fd = open("/proc/self/auxv", O_RDONLY); - if (fd == -1) - return 0; - unsigned long val = 0; - unsigned long buff[2]; - while (read(fd, buff, sizeof(buff)) == sizeof(buff)) { - if (buff[0] == 0) - break; - if (buff[0] == type) { - val = buff[1]; - break; - } - } - close(fd); - return val; -} - -static inline unsigned long jl_getauxval(unsigned long type) -{ - // First, try resolving getauxval in libc - auto libc = jl_dlopen(nullptr, JL_RTLD_LOCAL); - static unsigned long (*getauxval_p)(unsigned long) = NULL; - if (getauxval_p == NULL && jl_dlsym(libc, "getauxval", (void **)&getauxval_p, 0, 0)) { - return getauxval_p(type); - } - - // If we couldn't resolve it, use procfs. - return getauxval_procfs(type); -} -#else -static inline unsigned long jl_getauxval(unsigned long type) -{ - return getauxval(type); -} -#endif - -struct CPUID { - uint8_t implementer; - uint8_t variant; - uint16_t part; - bool operator<(const CPUID &right) const - { - if (implementer < right.implementer) - return true; - if (implementer > right.implementer) - return false; - if (part < right.part) - return true; - if (part > right.part) - return false; - return variant < right.variant; - } -}; - -// /sys/devices/system/cpu/cpu/regs/identification/midr_el1 reader -static inline void get_cpuinfo_sysfs(std::set &res) -{ - // This only works on a 64bit 4.7+ kernel - auto dir = opendir("/sys/devices/system/cpu"); - if (!dir) - return; - while (auto entry = readdir(dir)) { - if (entry->d_type != DT_DIR) - continue; - if (strncmp(entry->d_name, "cpu", 3) != 0) - continue; - std::string stm; - llvm::raw_string_ostream(stm) << "/sys/devices/system/cpu/" << entry->d_name << "/regs/identification/midr_el1"; - std::ifstream file(stm); - if (!file) - continue; - uint64_t val = 0; - file >> std::hex >> val; - if (!file) - continue; - CPUID cpuid = { - uint8_t(val >> 24), - uint8_t((val >> 20) & 0xf), - uint16_t((val >> 4) & 0xfff) - }; - res.insert(cpuid); - 
} - closedir(dir); -} - -// Use an external template since lambda's can't be templated in C++11 -template -static inline bool try_read_procfs_line(llvm::StringRef line, const char *prefix, T &out, - bool &flag, F &&reset) -{ - if (!line.starts_with(prefix)) - return false; - if (flag) - reset(); - flag = line.substr(strlen(prefix)).ltrim("\t :").getAsInteger(0, out); - return true; -} - -// /proc/cpuinfo reader -static inline void get_cpuinfo_procfs(std::set &res) -{ - std::ifstream file("/proc/cpuinfo"); - CPUID cpuid = {0, 0, 0}; - bool impl = false; - bool part = false; - bool var = false; - auto reset = [&] () { - if (impl && part) - res.insert(cpuid); - impl = false; - part = false; - var = false; - memset(&cpuid, 0, sizeof(cpuid)); - }; - for (std::string line; std::getline(file, line);) { - if (line.empty()) { - reset(); - continue; - } - try_read_procfs_line(line, "CPU implementer", cpuid.implementer, impl, reset) || - try_read_procfs_line(line, "CPU variant", cpuid.variant, var, reset) || - try_read_procfs_line(line, "CPU part", cpuid.part, part, reset); - } - reset(); -} - -static std::set get_cpuinfo(void) -{ - std::set res; - get_cpuinfo_sysfs(res); - if (res.empty()) - get_cpuinfo_procfs(res); - return res; -} - -static CPU get_cpu_name(CPUID cpuid) -{ - switch (cpuid.implementer) { - case 0x41: // 'A': ARM - switch (cpuid.part) { - case 0xb02: return CPU::arm_mpcore; - case 0xb36: return CPU::arm_1136jf_s; - case 0xb56: return CPU::arm_1156t2f_s; - case 0xb76: return CPU::arm_1176jzf_s; - case 0xc05: return CPU::arm_cortex_a5; - case 0xc07: return CPU::arm_cortex_a7; - case 0xc08: return CPU::arm_cortex_a8; - case 0xc09: return CPU::arm_cortex_a9; - case 0xc0d: return CPU::arm_cortex_a12; - case 0xc0f: return CPU::arm_cortex_a15; - case 0xc0e: return CPU::arm_cortex_a17; - case 0xc14: return CPU::arm_cortex_r4; - case 0xc15: return CPU::arm_cortex_r5; - case 0xc17: return CPU::arm_cortex_r7; - case 0xc18: return CPU::arm_cortex_r8; - case 0xc20: 
return CPU::arm_cortex_m0; - case 0xc21: return CPU::arm_cortex_m1; - case 0xc23: return CPU::arm_cortex_m3; - case 0xc24: return CPU::arm_cortex_m4; - case 0xc27: return CPU::arm_cortex_m7; - case 0xd01: return CPU::arm_cortex_a32; - case 0xd02: return CPU::arm_cortex_a34; - case 0xd03: return CPU::arm_cortex_a53; - case 0xd04: return CPU::arm_cortex_a35; - case 0xd05: return CPU::arm_cortex_a55; - case 0xd06: return CPU::arm_cortex_a65; - case 0xd07: return CPU::arm_cortex_a57; - case 0xd08: return CPU::arm_cortex_a72; - case 0xd09: return CPU::arm_cortex_a73; - case 0xd0a: return CPU::arm_cortex_a75; - case 0xd0b: return CPU::arm_cortex_a76; - case 0xd0c: return CPU::arm_neoverse_n1; - case 0xd0d: return CPU::arm_cortex_a77; - case 0xd0e: return CPU::arm_cortex_a76ae; - case 0xd13: return CPU::arm_cortex_r52; - case 0xd20: return CPU::arm_cortex_m23; - case 0xd21: return CPU::arm_cortex_m33; - // case 0xd22: return CPU::arm_cortex_m55; - case 0xd40: return CPU::arm_neoverse_v1; - case 0xd41: return CPU::arm_cortex_a78; - case 0xd43: return CPU::arm_cortex_a65ae; - case 0xd44: return CPU::arm_cortex_x1; - case 0xd49: return CPU::arm_neoverse_n2; - case 0xd4a: return CPU::arm_neoverse_e1; - default: return CPU::generic; - } - case 0x42: // 'B': Broadcom (Cavium) - switch (cpuid.part) { - // case 0x00f: return CPU::broadcom_brahma_b15; - // case 0x100: return CPU::broadcom_brahma_b53; - case 0x516: return CPU::cavium_thunderx2t99p1; - default: return CPU::generic; - } - case 0x43: // 'C': Cavium - switch (cpuid.part) { - case 0xa0: return CPU::cavium_thunderx; - case 0xa1: - if (cpuid.variant == 0) - return CPU::cavium_thunderx88p1; - return CPU::cavium_thunderx88; - case 0xa2: return CPU::cavium_thunderx81; - case 0xa3: return CPU::cavium_thunderx83; - case 0xaf: return CPU::cavium_thunderx2t99; - case 0xb0: return CPU::cavium_octeontx2; - case 0xb1: return CPU::cavium_octeontx2t98; - case 0xb2: return CPU::cavium_octeontx2t96; - case 0xb3: return 
CPU::cavium_octeontx2f95; - case 0xb4: return CPU::cavium_octeontx2f95n; - case 0xb5: return CPU::cavium_octeontx2f95mm; - case 0xb8: return CPU::marvell_thunderx3t110; - default: return CPU::generic; - } - case 0x46: // 'F': Fujitsu - switch (cpuid.part) { - case 0x1: return CPU::fujitsu_a64fx; - default: return CPU::generic; - } - case 0x48: // 'H': HiSilicon - switch (cpuid.part) { - case 0xd01: return CPU::hisilicon_tsv110; - case 0xd40: return CPU::arm_cortex_a76; // Kirin 980 - default: return CPU::generic; - } - case 0x4e: // 'N': NVIDIA - switch (cpuid.part) { - case 0x000: return CPU::nvidia_denver1; - case 0x003: return CPU::nvidia_denver2; - case 0x004: return CPU::nvidia_carmel; - default: return CPU::generic; - } - case 0x50: // 'P': AppliedMicro - // x-gene 2 - // x-gene 3 - switch (cpuid.part) { - case 0x000: return CPU::apm_xgene1; - default: return CPU::generic; - } - case 0x51: // 'Q': Qualcomm - switch (cpuid.part) { - case 0x00f: - case 0x02d: - return CPU::qualcomm_scorpion; - case 0x04d: - case 0x06f: - return CPU::qualcomm_krait; - case 0x201: // silver - case 0x205: // gold - case 0x211: // silver - return CPU::qualcomm_kyro; - // kryo 2xx - case 0x800: // gold - return CPU::arm_cortex_a73; - case 0x801: // silver - return CPU::arm_cortex_a53; - // kryo 3xx - case 0x802: // gold - return CPU::arm_cortex_a75; - case 0x803: // silver - return CPU::arm_cortex_a55; - // kryo 4xx - case 0x804: // gold - return CPU::arm_cortex_a76; - case 0x805: // silver - return CPU::arm_cortex_a55; - // kryo 5xx seems to be using ID for cortex-a77 directly - case 0xc00: - return CPU::qualcomm_falkor; - case 0xc01: - return CPU::qualcomm_saphira; - default: return CPU::generic; - } - case 0x53: // 'S': Samsung - if (cpuid.part == 1) { - if (cpuid.variant == 4) - return CPU::samsung_exynos_m2; - return CPU::samsung_exynos_m1; - } - if (cpuid.variant != 1) - return CPU::generic; - switch (cpuid.part) { - case 0x2: return CPU::samsung_exynos_m3; - case 0x3: return 
CPU::samsung_exynos_m4; - case 0x4: return CPU::samsung_exynos_m5; - default: return CPU::generic; - } - case 0x56: // 'V': Marvell - switch (cpuid.part) { - case 0x581: - case 0x584: - return CPU::marvell_pj4; - default: return CPU::generic; - } - case 0x61: // 'a': Apple - // Data here is partially based on these sources: - // https://github.com/apple-oss-distributions/xnu/blob/main/osfmk/arm/cpuid.h - // https://asahilinux.org/docs/hw/soc/soc-codenames/#socs - // https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/AArch64/AArch64Processors.td - switch (cpuid.part) { - case 0x0: // Swift - return CPU::apple_swift; - case 0x1: // Cyclone - return CPU::apple_a7; - case 0x2: // Typhoon - case 0x3: // Typhoo/Capri - return CPU::apple_a8; - case 0x4: // Twister - case 0x5: // Twister/Elba/Malta - return CPU::apple_a9; - case 0x6: // Hurricane - case 0x7: // Hurricane/Myst - return CPU::apple_a10; - case 0x8: // Monsoon - case 0x9: // Mistral - return CPU::apple_a11; - case 0xB: // Vortex - case 0xC: // Tempest - case 0x10: // A12X, Vortex Aruba - case 0x11: // A12X, Tempest Aruba - return CPU::apple_a12; - case 0xF: // Tempest M9 - return CPU::apple_s4; - case 0x12: // H12 Cebu p-Core "Lightning" - case 0x13: // H12 Cebu e-Core "Thunder" - return CPU::apple_a13; - case 0x20: // H13 Sicily e-Core "Icestorm" - case 0x21: // H13 Sicily p-Core "Firestorm" - return CPU::apple_a14; - case 0x22: // H13G Tonga e-Core "Icestorm" used in Apple M1 - case 0x23: // H13G Tonga p-Core "Firestorm" used in Apple M1 - case 0x24: // H13J Jade Chop e-Core "Icestorm" used in Apple M1 Pro - case 0x25: // H13J Jade Chop p-Core "Firestorm" used in Apple M1 Pro - case 0x28: // H13J Jade Die e-Core "Icestorm" used in Apple M1 Max / Ultra - case 0x29: // H13J Jade Die p-Core "Firestorm" used in Apple M1 Max / Ultra - return CPU::apple_m1; - case 0x30: // H14 Ellis e-Core "Blizzard" used in Apple A15 - case 0x31: // H14 Ellis p-Core "Avalanche" used in Apple A15 - return 
CPU::apple_a15; - case 0x32: // H14G Staten e-Core "Blizzard" used in Apple M2 - case 0x33: // H14G Staten p-Core "Avalanche" used in Apple M2 - case 0x34: // H14S Rhodes Chop e-Core "Blizzard" used in Apple M2 Pro - case 0x35: // H14S Rhodes Chop p-Core "Avalanche" used in Apple M2 Pro - case 0x38: // H14C Rhodes Die e-Core "Blizzard" used in Apple M2 Max / Ultra - case 0x39: // H14C Rhodes Die p-Core "Avalanche" used in Apple M2 Max / Ultra - return CPU::apple_m2; - case 0x40: // H15 Crete e-Core "Sawtooth" used in Apple A16 - case 0x41: // H15 Crete p-Core "Everest" used in Apple A16 - return CPU::apple_a16; - case 0x42: // H15 Ibiza e-Core "Sawtooth" used in Apple M3 - case 0x43: // H15 Ibiza p-Core "Everest" used in Apple M3 - case 0x44: // H15 Lobos e-Core "Sawtooth" used in Apple M3 Pro - case 0x45: // H15 Lobos p-Core "Everest" used in Apple M3 Pro - case 0x49: // H15 Palma e-Core "Sawtooth" used in Apple M3 Max - case 0x48: // H15 Palma p-Core "Everest" used in Apple M3 Max - return CPU::apple_m3; - //case 0x46: // M11 e-Core "Sawtooth" used in Apple S9 - //case 0x47: does not exist - //return CPU::apple_s9; - case 0x50: // H15 Coll e-Core "Sawtooth" used in Apple A17 Pro - case 0x51: // H15 Coll p-Core "Everest" used in Apple A17 Pro - return CPU::apple_a17; - case 0x52: // H16G Donan e-Core used in Apple M4 - case 0x53: // H16H Donan p-Core used in Apple M4 - case 0x54: // H16S Brava S e-Core used in Apple M4 Pro - case 0x55: // H16S Brava S p-Core used in Apple M4 Pro - case 0x58: // H16C Brava C e-Core used in Apple M4 Max - case 0x59: // H16C Brava C p-Core used in Apple M4 Max - return CPU::apple_m4; - //case 0x60: // H17P Tahiti e-Core used in Apple A18 Pro - //case 0x61: // H17P Tahiti p-Core used in Apple A18 Pro - //case 0x6a: // H17A Tupai e-Core used in Apple A18 - //case 0x6b: // H17A Tupai p-Core used in Apple A18 - //return CPU::apple_a18; - default: return CPU::generic; - } - case 0x68: // 'h': Huaxintong Semiconductor - switch (cpuid.part) 
{ - case 0x0: return CPU::hxt_phecda; - default: return CPU::generic; - } - case 0x69: // 'i': Intel - switch (cpuid.part) { - case 0x001: return CPU::intel_3735d; - default: return CPU::generic; - } - default: - return CPU::generic; - } -} - - - - -namespace { - -struct arm_arch { - int version; - char klass; - constexpr bool mclass() const { return klass == 'M'; } -}; - -} - -static arm_arch get_elf_arch(void) -{ -#ifdef _CPU_AARCH64_ - return {8, 'A'}; -#else - int ver = 0; - char profile = 0; - struct utsname name; - if (uname(&name) >= 0) { - // name.machine is the elf_platform in the kernel. - if (strcmp(name.machine, "armv6l") == 0) { - ver = 6; - } - else if (strcmp(name.machine, "armv7l") == 0) { - ver = 7; - } - else if (strcmp(name.machine, "armv7ml") == 0) { - ver = 7; - profile = 'M'; - } - else if (strcmp(name.machine, "armv8l") == 0 || strcmp(name.machine, "aarch64") == 0) { - ver = 8; - } - } - if (__ARM_ARCH > ver) - ver = __ARM_ARCH; -# if __ARM_ARCH > 6 && defined(__ARM_ARCH_PROFILE) - profile = __ARM_ARCH_PROFILE; -# endif - return {ver, profile}; -#endif -} - -static arm_arch feature_arch_version(const FeatureList &feature) -{ -#ifdef _CPU_AARCH64_ - return {8, 'A'}; -#else - int ver; - if (test_nbit(feature, Feature::v8)) { - ver = 8; - } - else if (test_nbit(feature, Feature::v7)) { - ver = 7; - } - else { - return {6, 0}; - } - if (test_nbit(feature, Feature::mclass)) { - return {ver, 'M'}; - } - else if (test_nbit(feature, Feature::rclass)) { - return {ver, 'R'}; - } - else if (test_nbit(feature, Feature::aclass)) { - return {ver, 'A'}; - } - return {ver, 0}; -#endif -} - -static CPU generic_for_arch(arm_arch arch) -{ -#ifdef _CPU_AARCH64_ - return CPU::generic; -#else -# if defined(__ARM_ARCH_PROFILE) - char klass = __ARM_ARCH_PROFILE; -# else - char klass = arch.klass; -# endif - if (arch.version >= 8) { - if (klass == 'M') { - return CPU::armv8_m_base; - } - else if (klass == 'R') { - return CPU::armv8_r; - } - else { - return 
CPU::armv8_a; - } - } - else if (arch.version == 7) { - if (klass == 'M') { - return CPU::armv7_m; - } - else if (klass == 'R') { - return CPU::armv7_r; - } - else { - return CPU::armv7_a; - } - } - return CPU::generic; -#endif -} - -static bool check_cpu_arch_ver(uint32_t cpu, arm_arch arch) -{ - auto spec = find_cpu(cpu); - // This happens on AArch64 and indicates that the cpu name isn't a valid aarch64 CPU - if (!spec) - return false; - auto feature_arch = feature_arch_version(spec->features); - if (arch.mclass() != feature_arch.mclass()) - return false; - if (arch.version > feature_arch.version) - return false; - return true; -} - -static void shrink_big_little(llvm::SmallVectorImpl> &list, - const CPU *cpus, uint32_t ncpu) -{ - auto find = [&] (uint32_t name) { - for (uint32_t i = 0; i < ncpu; i++) { - if (cpus[i] == CPU(name)) { - return (int)i; - } - } - return -1; - }; - int maxidx = -1; - for (auto &ele: list) { - int idx = find(ele.first); - if (idx > maxidx) { - maxidx = idx; - } - } - if (maxidx >= 0) { - list.erase(std::remove_if(list.begin(), list.end(), [&] (std::pair &ele) { - int idx = find(ele.first); - return idx != -1 && idx < maxidx; - }), list.end()); - } -} - -static NOINLINE std::pair> _get_host_cpu() -{ - FeatureList features = {}; - // Here we assume that only the lower 32bit are used on aarch64 - // Change the cast here when that's not the case anymore (and when there's features in the - // high bits that we want to detect). 
- features[0] = (uint32_t)jl_getauxval(AT_HWCAP); - features[1] = (uint32_t)jl_getauxval(AT_HWCAP2); -#ifdef _CPU_AARCH64_ - if (test_nbit(features, 31)) // HWCAP_PACG - set_bit(features, Feature::pauth, true); -#endif - auto cpuinfo = get_cpuinfo(); - auto arch = get_elf_arch(); -#ifdef _CPU_ARM_ - if (arch.version >= 7) { - if (arch.klass == 'M') { - set_bit(features, Feature::mclass, true); - } - else if (arch.klass == 'R') { - set_bit(features, Feature::rclass, true); - } - else if (arch.klass == 'A') { - set_bit(features, Feature::aclass, true); - } - } - switch (arch.version) { - case 8: - set_bit(features, Feature::v8, true); - JL_FALLTHROUGH; - case 7: - set_bit(features, Feature::v7, true); - break; - default: - break; - } -#endif - - std::set cpus; - llvm::SmallVector, 0> list; - // Ideally the feature detection above should be enough. - // However depending on the kernel version not all features are available - // and it's also impossible to detect the ISA version which contains - // some features not yet exposed by the kernel. - // We therefore try to get a more complete feature list from the CPU name. - // Since it is possible to pair cores that have different feature set - // (Observed for exynos 9810 with exynos-m3 + cortex-a55) we'll compute - // an intersection of the known features from each core. - // If there's a core that we don't recognize, treat it as generic. 
- bool extra_initialized = false; - FeatureList extra_features = {}; - for (auto info: cpuinfo) { - auto name = (uint32_t)get_cpu_name(info); - if (name == 0) { - // no need to clear the feature set if it wasn't initialized - if (extra_initialized) - extra_features = FeatureList{}; - extra_initialized = true; - continue; - } - if (!check_cpu_arch_ver(name, arch)) - continue; - if (cpus.insert(name).second) { - if (extra_initialized) { - extra_features = extra_features & find_cpu(name)->features; - } - else { - extra_initialized = true; - extra_features = find_cpu(name)->features; - } - list.emplace_back(name, info); - } - } - features = features | extra_features; - - // Not all elements/pairs are valid - static constexpr CPU v8order[] = { - CPU::arm_cortex_a35, - CPU::arm_cortex_a53, - CPU::arm_cortex_a55, - CPU::arm_cortex_a57, - CPU::arm_cortex_a72, - CPU::arm_cortex_a73, - CPU::arm_cortex_a75, - CPU::arm_cortex_a76, - CPU::arm_neoverse_n1, - CPU::arm_neoverse_n2, - CPU::arm_neoverse_v1, - CPU::nvidia_denver2, - CPU::nvidia_carmel, - CPU::samsung_exynos_m1, - CPU::samsung_exynos_m2, - CPU::samsung_exynos_m3, - CPU::samsung_exynos_m4, - CPU::samsung_exynos_m5, - }; - shrink_big_little(list, v8order, sizeof(v8order) / sizeof(CPU)); -#ifdef _CPU_ARM_ - // Not all elements/pairs are valid - static constexpr CPU v7order[] = { - CPU::arm_cortex_a5, - CPU::arm_cortex_a7, - CPU::arm_cortex_a8, - CPU::arm_cortex_a9, - CPU::arm_cortex_a12, - CPU::arm_cortex_a15, - CPU::arm_cortex_a17 - }; - shrink_big_little(list, v7order, sizeof(v7order) / sizeof(CPU)); -#endif - uint32_t cpu = 0; - if (list.empty()) { - cpu = (uint32_t)generic_for_arch(arch); - } - else { - // This also covers `list.size() > 1` case which means there's a unknown combination - // consists of CPU's we know. Unclear what else we could try so just randomly return - // one... - cpu = list[0].first; - } - // Ignore feature bits that we are not interested in. 
- mask_features(feature_masks, &features[0]); - return std::make_pair(cpu, features); -} -#endif - -static inline const std::pair> &get_host_cpu() -{ - static auto host_cpu = _get_host_cpu(); - return host_cpu; -} - -static bool is_generic_cpu_name(uint32_t cpu) -{ - switch ((CPU)cpu) { - case CPU::generic: - case CPU::armv7_a: - case CPU::armv7_m: - case CPU::armv7e_m: - case CPU::armv7_r: - case CPU::armv8_a: - case CPU::armv8_m_base: - case CPU::armv8_m_main: - case CPU::armv8_r: - case CPU::armv8_1_a: - case CPU::armv8_2_a: - case CPU::armv8_3_a: - case CPU::armv8_4_a: - case CPU::armv8_5_a: - case CPU::armv8_6_a: - return true; - default: - return false; - } -} - -static inline const std::string &host_cpu_name() -{ - static std::string name = [] { - if (is_generic_cpu_name(get_host_cpu().first)) { - auto llvm_name = jl_get_cpu_name_llvm(); - if (llvm_name != "generic") { - return llvm_name; - } - } - return std::string(find_cpu_name(get_host_cpu().first)); - }(); - return name; -} - -static inline const char *normalize_cpu_name(llvm::StringRef name) -{ - if (name == "ares") - return "neoverse-n1"; - if (name == "zeus") - return "neoverse-v1"; - if (name == "cyclone") - return "apple-a7"; - if (name == "typhoon") - return "apple-a8"; - if (name == "twister") - return "apple-a9"; - if (name == "hurricane") - return "apple-a10"; - return nullptr; -} - -template -static inline void enable_depends(FeatureList &features) -{ - if (test_nbit(features, Feature::v8_6a)) - set_bit(features, Feature::v8_5a, true); - if (test_nbit(features, Feature::v8_5a)) - set_bit(features, Feature::v8_4a, true); - if (test_nbit(features, Feature::v8_4a)) - set_bit(features, Feature::v8_3a, true); - if (test_nbit(features, Feature::v8_3a)) - set_bit(features, Feature::v8_2a, true); - if (test_nbit(features, Feature::v8_2a)) - set_bit(features, Feature::v8_1a, true); - if (test_nbit(features, Feature::v8_1a)) - set_bit(features, Feature::crc, true); -#ifdef _CPU_ARM_ - if 
(test_nbit(features, Feature::v8_1a)) { - set_bit(features, Feature::v8, true); - set_bit(features, Feature::aclass, true); - } - if (test_nbit(features, Feature::v8_m_main)) { - set_bit(features, Feature::v8, true); - set_bit(features, Feature::mclass, true); - } - if (test_nbit(features, Feature::v8)) { - set_bit(features, Feature::v7, true); - if (test_nbit(features, Feature::aclass)) { - set_bit(features, Feature::neon, true); - set_bit(features, Feature::vfp3, true); - set_bit(features, Feature::vfp4, true); - set_bit(features, Feature::hwdiv_arm, true); - set_bit(features, Feature::hwdiv, true); - set_bit(features, Feature::d32, true); - } - } -#else - if (test_nbit(features, Feature::v8_1a)) { - set_bit(features, Feature::lse, true); - set_bit(features, Feature::rdm, true); - } - if (test_nbit(features, Feature::v8_2a)) { - set_bit(features, Feature::ccpp, true); - } - if (test_nbit(features, Feature::v8_3a)) { - set_bit(features, Feature::jsconv, true); - set_bit(features, Feature::complxnum, true); - set_bit(features, Feature::rcpc, true); - } - if (test_nbit(features, Feature::v8_4a)) { - set_bit(features, Feature::dit, true); - set_bit(features, Feature::rcpc_immo, true); - set_bit(features, Feature::flagm, true); - } - if (test_nbit(features, Feature::v8_5a)) { - set_bit(features, Feature::sb, true); - set_bit(features, Feature::ccdp, true); - set_bit(features, Feature::altnzcv, true); - set_bit(features, Feature::fptoint, true); - } - if (test_nbit(features, Feature::v8_6a)) { - set_bit(features, Feature::i8mm, true); - set_bit(features, Feature::bf16, true); - } -#endif - ::enable_depends(features, Feature::deps, sizeof(Feature::deps) / sizeof(FeatureDep)); -} - -template -static inline void disable_depends(FeatureList &features) -{ - ::disable_depends(features, Feature::deps, sizeof(Feature::deps) / sizeof(FeatureDep)); -} - -static const llvm::SmallVector, 0> &get_cmdline_targets(const char *cpu_target) -{ - auto feature_cb = [] (const char *str, 
size_t len, FeatureList &list) { -#ifdef _CPU_AARCH64_ - // On AArch64, treat `crypto` as an alias of aes + sha2 just like LLVM - if (llvm::StringRef(str, len) == "crypto") { - set_bit(list, Feature::aes, true); - set_bit(list, Feature::sha2, true); - return true; - } -#endif - auto fbit = find_feature_bit(feature_names, nfeature_names, str, len); - if (fbit == UINT32_MAX) - return false; - set_bit(list, fbit, true); - return true; - }; - auto &targets = ::get_cmdline_targets(cpu_target, feature_cb); - for (auto &t: targets) { - if (auto nname = normalize_cpu_name(t.name)) { - t.name = nname; - } - } - return targets; -} - -static llvm::SmallVector, 0> jit_targets; - -static TargetData arg_target_data(const TargetData &arg, bool require_host) -{ - TargetData res = arg; - const FeatureList *cpu_features = nullptr; - if (res.name == "native") { - res.name = host_cpu_name(); - cpu_features = &get_host_cpu().second; - } - else if (auto spec = find_cpu(res.name)) { - cpu_features = &spec->features; - } - else { - res.en.flags |= JL_TARGET_UNKNOWN_NAME; - } - if (cpu_features) { - for (size_t i = 0; i < feature_sz; i++) { - res.en.features[i] |= (*cpu_features)[i]; - } - } - enable_depends(res.en.features); - for (size_t i = 0; i < feature_sz; i++) - res.en.features[i] &= ~res.dis.features[i]; - if (require_host) { - for (size_t i = 0; i < feature_sz; i++) { - res.en.features[i] &= get_host_cpu().second[i]; - } - } - disable_depends(res.en.features); - if (cpu_features) { - // If the base feature if known, fill in the disable features - for (size_t i = 0; i < feature_sz; i++) { - res.dis.features[i] = feature_masks[i] & ~res.en.features[i]; - } - } - return res; -} - -static int max_vector_size(const FeatureList &features) -{ -#ifdef _CPU_ARM_ - if (test_nbit(features, Feature::neon)) - return 16; - return 8; -#else - if (test_nbit(features, Feature::sve2)) - return 256; - if (test_nbit(features, Feature::sve)) - return 128; - return 16; -#endif -} - -static uint32_t 
sysimg_init_cb(void *ctx, const void *id, jl_value_t **rejection_reason) -{ - // First see what target is requested for the JIT. - const char *cpu_target = (const char *)ctx; - auto &cmdline = get_cmdline_targets(cpu_target); - TargetData target = arg_target_data(cmdline[0], true); - // Then find the best match in the sysimg - auto sysimg = deserialize_target_data((const uint8_t*)id); - for (auto &t: sysimg) { - if (auto nname = normalize_cpu_name(t.name)) { - t.name = nname; - } - } - auto match = match_sysimg_targets(sysimg, target, max_vector_size, rejection_reason); - if (match.best_idx == UINT32_MAX) - return match.best_idx; - // Now we've decided on which sysimg version to use. - // Make sure the JIT target is compatible with it and save the JIT target. - if (match.vreg_size != max_vector_size(target.en.features) && - (sysimg[match.best_idx].en.flags & JL_TARGET_VEC_CALL)) { -#ifdef _CPU_ARM_ - unset_bits(target.en.features, Feature::neon); -#endif - } - jit_targets.push_back(std::move(target)); - return match.best_idx; -} - -static uint32_t pkgimg_init_cb(void *ctx, const void *id, jl_value_t **rejection_reason JL_REQUIRE_ROOTED_SLOT) -{ - TargetData target = jit_targets.front(); - auto pkgimg = deserialize_target_data((const uint8_t*)id); - for (auto &t: pkgimg) { - if (auto nname = normalize_cpu_name(t.name)) { - t.name = nname; - } - } - auto match = match_sysimg_targets(pkgimg, target, max_vector_size, rejection_reason); - return match.best_idx; -} - -static void ensure_jit_target(const char *cpu_target, bool imaging) -{ - auto &cmdline = get_cmdline_targets(cpu_target); - check_cmdline(cmdline, imaging); - if (!jit_targets.empty()) - return; - for (auto &arg: cmdline) { - auto data = arg_target_data(arg, jit_targets.empty()); - jit_targets.push_back(std::move(data)); - } - auto ntargets = jit_targets.size(); - // Now decide the clone condition. 
- for (size_t i = 1; i < ntargets; i++) { - auto &t = jit_targets[i]; - if (t.en.flags & JL_TARGET_CLONE_ALL) - continue; - auto &features0 = jit_targets[t.base].en.features; - // Always clone when code checks CPU features - t.en.flags |= JL_TARGET_CLONE_CPU; - static constexpr uint32_t clone_fp16[] = {Feature::fp16fml,Feature::fullfp16}; - for (auto fe: clone_fp16) { - if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { - t.en.flags |= JL_TARGET_CLONE_FLOAT16; - break; - } - } - // The most useful one in general... - t.en.flags |= JL_TARGET_CLONE_LOOP; -#ifdef _CPU_ARM_ - static constexpr uint32_t clone_math[] = {Feature::vfp3, Feature::vfp4, Feature::neon}; - for (auto fe: clone_math) { - if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { - t.en.flags |= JL_TARGET_CLONE_MATH; - break; - } - } - static constexpr uint32_t clone_simd[] = {Feature::neon}; - for (auto fe: clone_simd) { - if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { - t.en.flags |= JL_TARGET_CLONE_SIMD; - break; - } - } -#endif - } -} - -static std::pair> -get_llvm_target_noext(const TargetData &data) -{ - std::string name = data.name; - auto *spec = find_cpu(name); - while (spec) { - if (spec->llvmver <= JL_LLVM_VERSION) - break; - spec = find_cpu((uint32_t)spec->fallback); - name = spec->name; - } - auto features = data.en.features; - if (spec) { - if (is_generic_cpu_name((uint32_t)spec->cpu)) { - features = features | spec->features; - name = "generic"; - } - } -#ifdef _CPU_ARM_ - // We use the name on aarch64 internally but the LLVM ARM backend still use the old name... 
- if (name == "apple-a7") - name = "cyclone"; -#endif - llvm::SmallVector feature_strs; - for (auto &fename: feature_names) { - if (fename.llvmver > JL_LLVM_VERSION) - continue; - if (fename.bit >= 32 * 2) - break; - const char *fename_str = fename.name; - bool enable = test_nbit(features, fename.bit); - bool disable = test_nbit(data.dis.features, fename.bit); - if (enable) { - feature_strs.insert(feature_strs.begin(), std::string("+") + fename_str); - } - else if (disable) { - feature_strs.push_back(std::string("-") + fename_str); - } - } - if (test_nbit(features, Feature::v8_6a)) - feature_strs.push_back("+v8.6a"); - if (test_nbit(features, Feature::v8_5a)) - feature_strs.push_back("+v8.5a"); - if (test_nbit(features, Feature::v8_4a)) - feature_strs.push_back("+v8.4a"); - if (test_nbit(features, Feature::v8_3a)) - feature_strs.push_back("+v8.3a"); - if (test_nbit(features, Feature::v8_2a)) - feature_strs.push_back("+v8.2a"); - if (test_nbit(features, Feature::v8_1a)) - feature_strs.push_back("+v8.1a"); -#ifdef _CPU_ARM_ - if (test_nbit(features, Feature::v8_m_main)) { - feature_strs.push_back("+v8m.main"); - feature_strs.push_back("+armv8-m.main"); - } - if (test_nbit(features, Feature::aclass)) - feature_strs.push_back("+aclass"); - if (test_nbit(features, Feature::rclass)) - feature_strs.push_back("+rclass"); - if (test_nbit(features, Feature::mclass)) - feature_strs.push_back("+mclass"); - if (test_nbit(features, Feature::v8)) { - feature_strs.push_back("+v8"); - if (test_nbit(features, Feature::aclass)) - feature_strs.push_back("+armv8-a"); - if (test_nbit(features, Feature::rclass)) - feature_strs.push_back("+armv8-r"); - if (test_nbit(features, Feature::mclass)) { - feature_strs.push_back("+v8m"); - feature_strs.push_back("+armv8-m.base"); - } - } - if (test_nbit(features, Feature::v7)) { - feature_strs.push_back("+v7"); - if (test_nbit(features, Feature::aclass)) - feature_strs.push_back("+armv7-a"); - if (test_nbit(features, Feature::rclass)) - 
feature_strs.push_back("+armv7-r"); - if (test_nbit(features, Feature::mclass)) - feature_strs.push_back("+armv7-m"); - } - feature_strs.push_back("+v6"); - feature_strs.push_back("+vfp2"); -#else - feature_strs.push_back("+neon"); - feature_strs.push_back("+fp-armv8"); -#endif - return std::make_pair(std::move(name), std::move(feature_strs)); -} - -static std::pair> -get_llvm_target_vec(const TargetData &data) -{ - auto res0 = get_llvm_target_noext(data); - append_ext_features(res0.second, data.ext_features); - return res0; -} - -static std::pair -get_llvm_target_str(const TargetData &data) -{ - auto res0 = get_llvm_target_noext(data); - auto features = join_feature_strs(res0.second); - append_ext_features(features, data.ext_features); - return std::make_pair(std::move(res0.first), std::move(features)); -} - -static FeatureList get_max_feature(void) -{ -#ifdef _CPU_ARM_ - auto arch = get_elf_arch(); - auto features = real_feature_masks; - if (arch.klass == 0) - arch.klass = 'A'; - set_bit(features, Feature::v7, true); - set_bit(features, Feature::v8, true); - if (arch.klass == 'M') { - set_bit(features, Feature::mclass, true); - set_bit(features, Feature::v8_m_main, true); - } - else if (arch.klass == 'R') { - set_bit(features, Feature::rclass, true); - } - else if (arch.klass == 'A') { - set_bit(features, Feature::aclass, true); - set_bit(features, Feature::v8_1a, true); - set_bit(features, Feature::v8_2a, true); - set_bit(features, Feature::v8_3a, true); - set_bit(features, Feature::v8_4a, true); - set_bit(features, Feature::v8_5a, true); - set_bit(features, Feature::v8_6a, true); - } - return features; -#else - // There isn't currently any conflicting features on AArch64 - return feature_masks; -#endif -} - -} - -using namespace ARM; - -JL_DLLEXPORT void jl_dump_host_cpu(void) -{ - dump_cpu_spec(get_host_cpu().first, get_host_cpu().second, feature_names, nfeature_names, - cpus, ncpu_names); -} - -JL_DLLEXPORT jl_value_t *jl_cpu_has_fma(int bits) -{ -#ifdef 
_CPU_AARCH64_ - return jl_true; -#else - TargetData target = jit_targets.front(); - FeatureList features = target.en.features; - if (bits == 32 && test_nbit(features, Feature::vfp4sp)) - return jl_true; - else if ((bits == 64 || bits == 32) && test_nbit(features, Feature::vfp4)) - return jl_true; - else - return jl_false; -#endif -} - -jl_image_t jl_init_processor_sysimg(jl_image_buf_t image, const char *cpu_target) -{ - if (!jit_targets.empty()) - jl_error("JIT targets already initialized"); - return parse_sysimg(image, sysimg_init_cb, (void *)cpu_target); -} - -jl_image_t jl_init_processor_pkgimg(jl_image_buf_t image) -{ - if (jit_targets.empty()) - jl_error("JIT targets not initialized"); - if (jit_targets.size() > 1) - jl_error("Expected only one JIT target"); - return parse_sysimg(image, pkgimg_init_cb, NULL); -} - -JL_DLLEXPORT jl_value_t* jl_check_pkgimage_clones(char *data) -{ - jl_value_t *rejection_reason = NULL; - JL_GC_PUSH1(&rejection_reason); - uint32_t match_idx = pkgimg_init_cb(NULL, data, &rejection_reason); - JL_GC_POP(); - if (match_idx == UINT32_MAX) - return rejection_reason; - return jl_nothing; -} - -std::pair> jl_get_llvm_target(const char *cpu_target, bool imaging, uint32_t &flags) -{ - ensure_jit_target(cpu_target, imaging); - flags = jit_targets[0].en.flags; - return get_llvm_target_vec(jit_targets[0]); -} - -const std::pair &jl_get_llvm_disasm_target(void) -{ - auto max_feature = get_max_feature(); - static const auto res = get_llvm_target_str(TargetData{host_cpu_name(), -#ifdef _CPU_AARCH64_ - "+ecv,+tme,+am,+specrestrict,+predres,+lor,+perfmon,+spe,+tracev8.4", -#else - "+dotprod", -#endif - {max_feature, 0}, {feature_masks & ~max_feature, 0}, 0}); - return res; -} - -#ifndef __clang_gcanalyzer__ -llvm::SmallVector jl_get_llvm_clone_targets(const char *cpu_target) -{ - - auto &cmdline = get_cmdline_targets(cpu_target); - check_cmdline(cmdline, true); - llvm::SmallVector, 0> image_targets; - for (auto &arg: cmdline) { - auto data = 
arg_target_data(arg, image_targets.empty()); - image_targets.push_back(std::move(data)); - } - auto ntargets = image_targets.size(); - if (image_targets.empty()) - jl_error("No targets specified"); - llvm::SmallVector res; - // Now decide the clone condition. - for (size_t i = 1; i < ntargets; i++) { - auto &t = image_targets[i]; - if (t.en.flags & JL_TARGET_CLONE_ALL) - continue; - auto &features0 = image_targets[t.base].en.features; - // Always clone when code checks CPU features - t.en.flags |= JL_TARGET_CLONE_CPU; - static constexpr uint32_t clone_fp16[] = {Feature::fp16fml,Feature::fullfp16}; - for (auto fe: clone_fp16) { - if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { - t.en.flags |= JL_TARGET_CLONE_FLOAT16; - break; - } - } - // The most useful one in general... - t.en.flags |= JL_TARGET_CLONE_LOOP; -#ifdef _CPU_ARM_ - static constexpr uint32_t clone_math[] = {Feature::vfp3, Feature::vfp4, Feature::neon}; - for (auto fe: clone_math) { - if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { - t.en.flags |= JL_TARGET_CLONE_MATH; - break; - } - } - static constexpr uint32_t clone_simd[] = {Feature::neon}; - for (auto fe: clone_simd) { - if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { - t.en.flags |= JL_TARGET_CLONE_SIMD; - break; - } - } -#endif - } - for (auto &target: image_targets) { - auto features_en = target.en.features; - auto features_dis = target.dis.features; - for (auto &fename: feature_names) { - if (fename.llvmver > JL_LLVM_VERSION) { - unset_bits(features_en, fename.bit); - unset_bits(features_dis, fename.bit); - } - } - ARM::disable_depends(features_en); - jl_target_spec_t ele; - std::tie(ele.cpu_name, ele.cpu_features) = get_llvm_target_str(target); - ele.data = serialize_target_data(target.name, features_en, features_dis, - target.ext_features); - ele.flags = target.en.flags; - ele.base = target.base; - res.push_back(ele); - } - return res; -} - -#endif - -extern "C" int 
jl_test_cpu_feature(jl_cpu_feature_t feature) -{ - if (feature >= 32 * feature_sz) - return 0; - return test_nbit(&get_host_cpu().second[0], feature); -} - -#ifdef _CPU_AARCH64_ -// FPCR FZ, bit [24] -static constexpr uint64_t fpcr_fz_mask = 1 << 24; -// FPCR FZ16, bit [19] -static constexpr uint64_t fpcr_fz16_mask = 1 << 19; -// FPCR DN, bit [25] -static constexpr uint64_t fpcr_dn_mask = 1 << 25; - -static inline uint64_t get_fpcr_aarch64(void) -{ - uint64_t fpcr; - asm volatile("mrs %0, fpcr" : "=r"(fpcr)); - return fpcr; -} - -static inline void set_fpcr_aarch64(uint64_t fpcr) -{ - asm volatile("msr fpcr, %0" :: "r"(fpcr)); -} - -extern "C" JL_DLLEXPORT int32_t jl_get_zero_subnormals(void) -{ - return (get_fpcr_aarch64() & fpcr_fz_mask) != 0; -} - -extern "C" JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero) -{ - uint64_t fpcr = get_fpcr_aarch64(); - static uint64_t mask = fpcr_fz_mask | (jl_test_cpu_feature(JL_AArch64_fullfp16) ? fpcr_fz16_mask : 0); - fpcr = isZero ? (fpcr | mask) : (fpcr & ~mask); - set_fpcr_aarch64(fpcr); - return 0; -} - -extern "C" JL_DLLEXPORT int32_t jl_get_default_nans(void) -{ - return (get_fpcr_aarch64() & fpcr_dn_mask) != 0; -} - -extern "C" JL_DLLEXPORT int32_t jl_set_default_nans(int8_t isDefault) -{ - uint64_t fpcr = get_fpcr_aarch64(); - fpcr = isDefault ? 
(fpcr | fpcr_dn_mask) : (fpcr & ~fpcr_dn_mask); - set_fpcr_aarch64(fpcr); - return 0; -} -#else -extern "C" JL_DLLEXPORT int32_t jl_get_zero_subnormals(void) -{ - return 0; -} - -extern "C" JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero) -{ - return isZero; -} - -extern "C" JL_DLLEXPORT int32_t jl_get_default_nans(void) -{ - return 0; -} - -extern "C" JL_DLLEXPORT int32_t jl_set_default_nans(int8_t isDefault) -{ - return isDefault; -} -#endif diff --git a/src/processor_fallback.cpp b/src/processor_fallback.cpp deleted file mode 100644 index c8c8feb072345..0000000000000 --- a/src/processor_fallback.cpp +++ /dev/null @@ -1,226 +0,0 @@ -// This file is a part of Julia. License is MIT: https://julialang.org/license - -// Fallback processor detection and dispatch - -static constexpr FeatureName *feature_names = nullptr; -static constexpr uint32_t nfeature_names = 0; - -namespace Fallback { - -static inline const std::string &host_cpu_name() -{ - static std::string name = jl_get_cpu_name_llvm(); - return name; -} - -static const llvm::SmallVector, 0> &get_cmdline_targets(const char *cpu_target) -{ - auto feature_cb = [] (const char*, size_t, FeatureList<1>&) { - return false; - }; - return ::get_cmdline_targets<1>(cpu_target, feature_cb); -} - -static llvm::SmallVector, 0> jit_targets; - -static TargetData<1> arg_target_data(const TargetData<1> &arg, bool require_host) -{ - TargetData<1> res = arg; - if (res.name == "native") { - res.name = host_cpu_name(); - append_ext_features(res.ext_features, jl_get_cpu_features_llvm()); - } - else { - res.en.flags |= JL_TARGET_UNKNOWN_NAME; - } - return res; -} - -static uint32_t sysimg_init_cb(void *ctx, const void *id, jl_value_t **rejection_reason) -{ - // First see what target is requested for the JIT. - const char *cpu_target = (const char *)ctx; - auto &cmdline = get_cmdline_targets(cpu_target); - TargetData<1> target = arg_target_data(cmdline[0], true); - // Find the last name match or use the default one. 
- uint32_t best_idx = 0; - auto sysimg = deserialize_target_data<1>((const uint8_t*)id); - for (uint32_t i = 0; i < sysimg.size(); i++) { - auto &imgt = sysimg[i]; - if (imgt.name == target.name) { - best_idx = i; - } - } - jit_targets.push_back(std::move(target)); - return best_idx; -} - -static uint32_t pkgimg_init_cb(void *ctx, const void *id, jl_value_t **rejection_reason) -{ - TargetData<1> target = jit_targets.front(); - // Find the last name match or use the default one. - uint32_t best_idx = 0; - auto pkgimg = deserialize_target_data<1>((const uint8_t*)id); - for (uint32_t i = 0; i < pkgimg.size(); i++) { - auto &imgt = pkgimg[i]; - if (imgt.name == target.name) { - best_idx = i; - } - } - - return best_idx; -} - -static void ensure_jit_target(const char *cpu_target, bool imaging) -{ - auto &cmdline = get_cmdline_targets(cpu_target); - check_cmdline(cmdline, imaging); - if (!jit_targets.empty()) - return; - for (auto &arg: cmdline) { - auto data = arg_target_data(arg, jit_targets.empty()); - jit_targets.push_back(std::move(data)); - } - auto ntargets = jit_targets.size(); - // Now decide the clone condition. 
- for (size_t i = 1; i < ntargets; i++) { - auto &t = jit_targets[i]; - t.en.flags |= JL_TARGET_CLONE_ALL; - } -} - -static std::pair> -get_llvm_target_noext(const TargetData<1> &data) -{ - return std::make_pair(data.name, llvm::SmallVector{}); -} - -static std::pair> -get_llvm_target_vec(const TargetData<1> &data) -{ - auto res0 = get_llvm_target_noext(data); - append_ext_features(res0.second, data.ext_features); - return res0; -} - -static std::pair -get_llvm_target_str(const TargetData<1> &data) -{ - auto res0 = get_llvm_target_noext(data); - auto features = join_feature_strs(res0.second); - append_ext_features(features, data.ext_features); - return std::make_pair(std::move(res0.first), std::move(features)); -} - -} - -using namespace Fallback; - -jl_image_t jl_init_processor_sysimg(jl_image_buf_t image, const char *cpu_target) -{ - if (!jit_targets.empty()) - jl_error("JIT targets already initialized"); - return parse_sysimg(image, sysimg_init_cb, (void *)cpu_target); -} - -jl_image_t jl_init_processor_pkgimg(jl_image_buf_t image) -{ - if (jit_targets.empty()) - jl_error("JIT targets not initialized"); - if (jit_targets.size() > 1) - jl_error("Expected only one JIT target"); - return parse_sysimg(image, pkgimg_init_cb, NULL); -} - -std::pair> jl_get_llvm_target(const char *cpu_target, bool imaging, uint32_t &flags) -{ - ensure_jit_target(cpu_target, imaging); - flags = jit_targets[0].en.flags; - return get_llvm_target_vec(jit_targets[0]); -} - -const std::pair &jl_get_llvm_disasm_target(void) -{ - static const auto res = get_llvm_target_str(TargetData<1>{host_cpu_name(), - jl_get_cpu_features_llvm(), {{}, 0}, {{}, 0}, 0}); - return res; -} -#ifndef __clang_gcanalyzer__ -llvm::SmallVector jl_get_llvm_clone_targets(const char *cpu_target) -{ - - auto &cmdline = get_cmdline_targets(cpu_target); - check_cmdline(cmdline, true); - llvm::SmallVector, 0> image_targets; - for (auto &arg: cmdline) { - auto data = arg_target_data(arg, image_targets.empty()); - 
image_targets.push_back(std::move(data)); - } - auto ntargets = image_targets.size(); - // Now decide the clone condition. - for (size_t i = 1; i < ntargets; i++) { - auto &t = image_targets[i]; - t.en.flags |= JL_TARGET_CLONE_ALL; - } - if (image_targets.empty()) - jl_error("No image targets found"); - llvm::SmallVector res; - for (auto &target: image_targets) { - jl_target_spec_t ele; - std::tie(ele.cpu_name, ele.cpu_features) = get_llvm_target_str(target); - ele.data = serialize_target_data(target.name, target.en.features, - target.dis.features, target.ext_features); - ele.flags = target.en.flags; - ele.base = 0; - res.push_back(ele); - } - return res; -} -#endif - -JL_DLLEXPORT jl_value_t *jl_cpu_has_fma(int bits) -{ - return jl_false; // Match behaviour of have_fma in src/llvm-cpufeatures.cpp (assume false) -} - -JL_DLLEXPORT void jl_dump_host_cpu(void) -{ - jl_safe_printf("CPU: %s\n", host_cpu_name().c_str()); - jl_safe_printf("Features: %s\n", jl_get_cpu_features_llvm().c_str()); -} - -JL_DLLEXPORT jl_value_t* jl_check_pkgimage_clones(char *data) -{ - jl_value_t *rejection_reason = NULL; - JL_GC_PUSH1(&rejection_reason); - uint32_t match_idx = pkgimg_init_cb(NULL, data, &rejection_reason); - JL_GC_POP(); - if (match_idx == UINT32_MAX) - return rejection_reason; - return jl_nothing; -} - -extern "C" int jl_test_cpu_feature(jl_cpu_feature_t) -{ - return 0; -} - -extern "C" JL_DLLEXPORT int32_t jl_get_zero_subnormals(void) -{ - return 0; -} - -extern "C" JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero) -{ - return isZero; -} - -extern "C" JL_DLLEXPORT int32_t jl_get_default_nans(void) -{ - return 0; -} - -extern "C" JL_DLLEXPORT int32_t jl_set_default_nans(int8_t isDefault) -{ - return isDefault; -} diff --git a/src/processor_x86.cpp b/src/processor_x86.cpp deleted file mode 100644 index bd624943083ae..0000000000000 --- a/src/processor_x86.cpp +++ /dev/null @@ -1,1292 +0,0 @@ -// This file is a part of Julia. 
License is MIT: https://julialang.org/license - -// X86 specific processor detection and dispatch - -// CPUID - -#include "julia.h" -extern "C" JL_DLLEXPORT void jl_cpuid(int32_t CPUInfo[4], int32_t InfoType) -{ - asm volatile ( -#if defined(__i386__) && defined(__PIC__) - "xchg %%ebx, %%esi;" - "cpuid;" - "xchg %%esi, %%ebx;" : - "=S" (CPUInfo[1]), -#else - "cpuid" : - "=b" (CPUInfo[1]), -#endif - "=a" (CPUInfo[0]), - "=c" (CPUInfo[2]), - "=d" (CPUInfo[3]) : - "a" (InfoType) - ); -} - -extern "C" JL_DLLEXPORT void jl_cpuidex(int32_t CPUInfo[4], int32_t InfoType, int32_t subInfoType) -{ - asm volatile ( -#if defined(__i386__) && defined(__PIC__) - "xchg %%ebx, %%esi;" - "cpuid;" - "xchg %%esi, %%ebx;" : - "=S" (CPUInfo[1]), -#else - "cpuid" : - "=b" (CPUInfo[1]), -#endif - "=a" (CPUInfo[0]), - "=c" (CPUInfo[2]), - "=d" (CPUInfo[3]) : - "a" (InfoType), - "c" (subInfoType) - ); -} - -namespace X86 { - -enum class CPU : uint32_t { - generic = 0, - intel_nocona, - intel_prescott, - intel_atom_bonnell, - intel_atom_silvermont, - intel_atom_goldmont, - intel_atom_goldmont_plus, - intel_atom_tremont, - intel_core2, - intel_core2_penryn, - intel_yonah, - intel_corei7_nehalem, - intel_corei7_westmere, - intel_corei7_sandybridge, - intel_corei7_ivybridge, - intel_corei7_haswell, - intel_corei7_broadwell, - intel_corei7_skylake, - intel_corei7_skylake_avx512, - intel_corei7_cascadelake, - intel_corei7_cooperlake, - intel_corei7_cannonlake, - intel_corei7_icelake_client, - intel_corei7_icelake_server, - intel_corei7_tigerlake, - intel_corei7_alderlake, - intel_corei7_sapphirerapids, - intel_knights_landing, - intel_knights_mill, - - amd_fam10h, - amd_athlon_fx, - amd_athlon_64, - amd_athlon_64_sse3, - amd_bdver1, - amd_bdver2, - amd_bdver3, - amd_bdver4, - amd_btver1, - amd_btver2, - amd_k8, - amd_k8_sse3, - amd_opteron, - amd_opteron_sse3, - amd_barcelona, - amd_znver1, - amd_znver2, - amd_znver3, - amd_znver4, - amd_znver5, -}; - -static constexpr size_t feature_sz = 12; 
-static constexpr FeatureName feature_names[] = { -#define JL_FEATURE_DEF(name, bit, llvmver) {#name, bit, llvmver}, -#define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) {str, bit, llvmver}, -#include "features_x86.h" -#undef JL_FEATURE_DEF -#undef JL_FEATURE_DEF_NAME -}; -static constexpr uint32_t nfeature_names = sizeof(feature_names) / sizeof(FeatureName); - -template -static inline constexpr FeatureList get_feature_masks(Args... args) -{ - return ::get_feature_masks(args...); -} - -#define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) JL_FEATURE_DEF(name, bit, llvmver) -static constexpr auto feature_masks = get_feature_masks( -#define JL_FEATURE_DEF(name, bit, llvmver) bit, -#include "features_x86.h" -#undef JL_FEATURE_DEF - -1); - -namespace Feature { -enum : uint32_t { -#define JL_FEATURE_DEF(name, bit, llvmver) name = bit, -#include "features_x86.h" -#undef JL_FEATURE_DEF -}; -#undef JL_FEATURE_DEF_NAME -static constexpr FeatureDep deps[] = { - {ssse3, sse3}, - {fma, avx}, - {sse41, ssse3}, - {sse42, sse41}, - {avx, sse42}, - {f16c, avx}, - {avx2, avx}, - {vaes, avx}, - {vaes, aes}, - {vpclmulqdq, avx}, - {vpclmulqdq, pclmul}, - {avxvnni, avx2}, - {avxvnniint8, avx2}, - {avxvnniint16, avx2}, - {avxifma, avx2}, - {avxneconvert, avx2}, - {avx512f, avx2}, - {avx512dq, avx512f}, - {avx512ifma, avx512f}, - {avx512cd, avx512f}, - {avx512bw, avx512f}, - {avx512bf16, avx512bw}, - {avx512bitalg, avx512bw}, - {avx512vl, avx512f}, - {avx512vbmi, avx512bw}, - {avx512vbmi2, avx512bw}, - {avx512vnni, avx512f}, - {avx512vp2intersect, avx512f}, - {avx512vpopcntdq, avx512f}, - {avx512fp16, avx512bw}, - {avx512fp16, avx512dq}, - {avx512fp16, avx512vl}, - {amx_int8, amx_tile}, - {amx_bf16, amx_tile}, - {amx_fp16, amx_tile}, - {amx_complex, amx_tile}, - {sse4a, sse3}, - {xop, fma4}, - {fma4, avx}, - {fma4, sse4a}, - {xsaveopt, xsave}, - {xsavec, xsave}, - {xsaves, xsave}, - {sha512, avx2}, - {sm3, avx}, - {sm4, avx2}, -}; - -// We require cx16 on 64bit by default. 
This can be overwritten with `-cx16` -// This isn't really compatible with 32bit but we mask it off there with required LLVM version -constexpr auto generic = get_feature_masks(cx16); -constexpr auto bonnell = get_feature_masks(sse3, ssse3, cx16, movbe, sahf); -constexpr auto silvermont = bonnell | get_feature_masks(sse41, sse42, popcnt, - pclmul, prfchw, rdrnd); -constexpr auto goldmont = silvermont | get_feature_masks(aes, sha, rdseed, xsave, xsaveopt, - xsavec, xsaves, clflushopt, fsgsbase); -constexpr auto goldmont_plus = goldmont | get_feature_masks(ptwrite, rdpid); // sgx -constexpr auto tremont = goldmont_plus | get_feature_masks(clwb, gfni); -constexpr auto knl = get_feature_masks(sse3, ssse3, sse41, sse42, cx16, sahf, popcnt, - aes, pclmul, avx, xsave, xsaveopt, rdrnd, f16c, fsgsbase, - avx2, bmi, bmi2, fma, lzcnt, movbe, adx, rdseed, prfchw, - avx512f, avx512cd); -constexpr auto knm = knl | get_feature_masks(avx512vpopcntdq); -constexpr auto yonah = get_feature_masks(sse3); -constexpr auto prescott = yonah; -constexpr auto core2 = get_feature_masks(sse3, ssse3, cx16, sahf); -constexpr auto nocona = get_feature_masks(sse3, cx16); -constexpr auto penryn = nocona | get_feature_masks(ssse3, sse41, sahf); -constexpr auto nehalem = penryn | get_feature_masks(sse42, popcnt); -constexpr auto westmere = nehalem | get_feature_masks(pclmul); -constexpr auto sandybridge = westmere | get_feature_masks(avx, xsave, xsaveopt); -constexpr auto ivybridge = sandybridge | get_feature_masks(rdrnd, f16c, fsgsbase); -constexpr auto haswell = ivybridge | get_feature_masks(avx2, bmi, bmi2, fma, lzcnt, movbe); -constexpr auto broadwell = haswell | get_feature_masks(adx, rdseed, prfchw); -constexpr auto skylake = broadwell | get_feature_masks(aes, xsavec, xsaves, clflushopt); // sgx -constexpr auto skx = skylake | get_feature_masks(avx512f, avx512cd, avx512dq, avx512bw, avx512vl, - pku, clwb); -constexpr auto cascadelake = skx | get_feature_masks(avx512vnni); -constexpr auto 
cooperlake = cascadelake | get_feature_masks(avx512bf16); -constexpr auto cannonlake = skylake | get_feature_masks(avx512f, avx512cd, avx512dq, avx512bw, - avx512vl, pku, avx512vbmi, avx512ifma, - sha); // sgx -constexpr auto icelake = cannonlake | get_feature_masks(avx512bitalg, vaes, avx512vbmi2, - vpclmulqdq, avx512vpopcntdq, - gfni, clwb, rdpid); -constexpr auto icelake_server = icelake | get_feature_masks(pconfig, wbnoinvd); -constexpr auto tigerlake = icelake | get_feature_masks(avx512vp2intersect, movdiri, - movdir64b, shstk); -constexpr auto alderlake = skylake | get_feature_masks(clwb, sha, waitpkg, shstk, gfni, vaes, vpclmulqdq, pconfig, - rdpid, movdiri, pku, movdir64b, serialize, ptwrite, avxvnni); -constexpr auto sapphirerapids = icelake_server | - get_feature_masks(amx_tile, amx_int8, amx_bf16, avx512bf16, avx512fp16, serialize, cldemote, waitpkg, - avxvnni, uintr, ptwrite, tsxldtrk, enqcmd, shstk, avx512vp2intersect, movdiri, movdir64b); - -constexpr auto k8_sse3 = get_feature_masks(sse3, cx16); -constexpr auto amdfam10 = k8_sse3 | get_feature_masks(sse4a, lzcnt, popcnt, sahf); - -constexpr auto btver1 = amdfam10 | get_feature_masks(ssse3, prfchw); -constexpr auto btver2 = btver1 | get_feature_masks(sse41, sse42, avx, aes, pclmul, bmi, f16c, - movbe, xsave, xsaveopt); - -constexpr auto bdver1 = amdfam10 | get_feature_masks(xop, fma4, avx, ssse3, sse41, sse42, aes, - prfchw, pclmul, xsave); -constexpr auto bdver2 = bdver1 | get_feature_masks(f16c, bmi, tbm, fma); -constexpr auto bdver3 = bdver2 | get_feature_masks(xsaveopt, fsgsbase); -constexpr auto bdver4 = bdver3 | get_feature_masks(avx2, bmi2, mwaitx, movbe, rdrnd); - -// technically xsaves is part of znver1, znver2, and znver3 -// Disabled due to Erratum 1386 -// See: https://github.com/JuliaLang/julia/issues/50102 -constexpr auto znver1 = haswell | get_feature_masks(adx, aes, clflushopt, clzero, mwaitx, prfchw, - rdseed, sha, sse4a, xsavec); -constexpr auto znver2 = znver1 | 
get_feature_masks(clwb, rdpid, wbnoinvd); -constexpr auto znver3 = znver2 | get_feature_masks(shstk, pku, vaes, vpclmulqdq); -constexpr auto znver4 = znver3 | get_feature_masks(avx512f, avx512cd, avx512dq, avx512bw, avx512vl, avx512ifma, avx512vbmi, - avx512vbmi2, avx512vnni, avx512bitalg, avx512vpopcntdq, avx512bf16, gfni, shstk, xsaves); -constexpr auto znver5 = znver4 | get_feature_masks(avxvnni, movdiri, movdir64b, avx512vp2intersect, prefetchi, avxvnni); - -} - -static constexpr CPUSpec cpus[] = { - {"generic", CPU::generic, CPU::generic, 0, Feature::generic}, - {"bonnell", CPU::intel_atom_bonnell, CPU::generic, 0, Feature::bonnell}, - {"silvermont", CPU::intel_atom_silvermont, CPU::generic, 0, Feature::silvermont}, - {"goldmont", CPU::intel_atom_goldmont, CPU::generic, 0, Feature::goldmont}, - {"goldmont-plus", CPU::intel_atom_goldmont_plus, CPU::generic, 0, Feature::goldmont_plus}, - {"tremont", CPU::intel_atom_tremont, CPU::generic, 0, Feature::tremont}, - {"core2", CPU::intel_core2, CPU::generic, 0, Feature::core2}, - {"yonah", CPU::intel_yonah, CPU::generic, 0, Feature::yonah}, - {"prescott", CPU::intel_prescott, CPU::generic, 0, Feature::prescott}, - {"nocona", CPU::intel_nocona, CPU::generic, 0, Feature::nocona}, - {"penryn", CPU::intel_core2_penryn, CPU::generic, 0, Feature::penryn}, - {"nehalem", CPU::intel_corei7_nehalem, CPU::generic, 0, Feature::nehalem}, - {"westmere", CPU::intel_corei7_westmere, CPU::generic, 0, Feature::westmere}, - {"sandybridge", CPU::intel_corei7_sandybridge, CPU::generic, 0, Feature::sandybridge}, - {"ivybridge", CPU::intel_corei7_ivybridge, CPU::generic, 0, Feature::ivybridge}, - {"haswell", CPU::intel_corei7_haswell, CPU::generic, 0, Feature::haswell}, - {"broadwell", CPU::intel_corei7_broadwell, CPU::generic, 0, Feature::broadwell}, - {"skylake", CPU::intel_corei7_skylake, CPU::generic, 0, Feature::skylake}, - {"knl", CPU::intel_knights_landing, CPU::generic, 0, Feature::knl}, - {"knm", CPU::intel_knights_mill, 
CPU::generic, 0, Feature::knm}, - {"skylake-avx512", CPU::intel_corei7_skylake_avx512, CPU::generic, 0, Feature::skx}, - {"cascadelake", CPU::intel_corei7_cascadelake, CPU::generic, 0, Feature::cascadelake}, - {"cooperlake", CPU::intel_corei7_cooperlake, CPU::generic, 0, Feature::cooperlake}, - {"cannonlake", CPU::intel_corei7_cannonlake, CPU::generic, 0, Feature::cannonlake}, - {"icelake-client", CPU::intel_corei7_icelake_client, CPU::generic, 0, Feature::icelake}, - {"icelake-server", CPU::intel_corei7_icelake_server, CPU::generic, 0, - Feature::icelake_server}, - {"tigerlake", CPU::intel_corei7_tigerlake, CPU::intel_corei7_icelake_client, 100000, - Feature::tigerlake}, - {"alderlake", CPU::intel_corei7_alderlake, CPU::intel_corei7_skylake, 120000, - Feature::alderlake}, - {"sapphirerapids", CPU::intel_corei7_sapphirerapids, CPU::intel_corei7_icelake_server, 120000, - Feature::sapphirerapids}, - - {"athlon64", CPU::amd_athlon_64, CPU::generic, 0, Feature::generic}, - {"athlon-fx", CPU::amd_athlon_fx, CPU::generic, 0, Feature::generic}, - {"k8", CPU::amd_k8, CPU::generic, 0, Feature::generic}, - {"opteron", CPU::amd_opteron, CPU::generic, 0, Feature::generic}, - - {"athlon64-sse3", CPU::amd_athlon_64_sse3, CPU::generic, 0, Feature::k8_sse3}, - {"k8-sse3", CPU::amd_k8_sse3, CPU::generic, 0, Feature::k8_sse3}, - {"opteron-sse3", CPU::amd_opteron_sse3, CPU::generic, 0, Feature::k8_sse3}, - - {"amdfam10", CPU::amd_fam10h, CPU::generic, 0, Feature::amdfam10}, - {"barcelona", CPU::amd_barcelona, CPU::generic, 0, Feature::amdfam10}, - - {"btver1", CPU::amd_btver1, CPU::generic, 0, Feature::btver1}, - {"btver2", CPU::amd_btver2, CPU::generic, 0, Feature::btver2}, - - {"bdver1", CPU::amd_bdver1, CPU::generic, 0, Feature::bdver1}, - {"bdver2", CPU::amd_bdver2, CPU::generic, 0, Feature::bdver2}, - {"bdver3", CPU::amd_bdver3, CPU::generic, 0, Feature::bdver3}, - {"bdver4", CPU::amd_bdver4, CPU::generic, 0, Feature::bdver4}, - - {"znver1", CPU::amd_znver1, CPU::generic, 0, 
Feature::znver1}, - {"znver2", CPU::amd_znver2, CPU::generic, 0, Feature::znver2}, - {"znver3", CPU::amd_znver3, CPU::amd_znver2, 120000, Feature::znver3}, - {"znver4", CPU::amd_znver4, CPU::amd_znver3, 160000, Feature::znver4}, - {"znver5", CPU::amd_znver5, CPU::amd_znver4, 190000, Feature::znver5}, -}; -static constexpr size_t ncpu_names = sizeof(cpus) / sizeof(cpus[0]); - -// For CPU model and feature detection on X86 - -const int SIG_INTEL = 0x756e6547; // Genu -const int SIG_AMD = 0x68747541; // Auth - -static uint64_t get_xcr0(void) -{ - uint32_t eax, edx; - asm volatile ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0)); - return (uint64_t(edx) << 32) | eax; -} - -static CPU get_intel_processor_name(uint32_t family, uint32_t model, uint32_t brand_id, - const uint32_t *features) -{ - if (brand_id != 0) - return CPU::generic; - switch (family) { - case 3: - case 4: - case 5: - return CPU::generic; - case 6: - switch (model) { - case 0x01: // Pentium Pro processor - case 0x03: // Intel Pentium II OverDrive processor, Pentium II processor, model 03 - case 0x05: // Pentium II processor, model 05, Pentium II Xeon processor, - // model 05, and Intel Celeron processor, model 05 - case 0x06: // Celeron processor, model 06 - case 0x07: // Pentium III processor, model 07, and Pentium III Xeon processor, model 07 - case 0x08: // Pentium III processor, model 08, Pentium III Xeon processor, - // model 08, and Celeron processor, model 08 - case 0x0a: // Pentium III Xeon processor, model 0Ah - case 0x0b: // Pentium III processor, model 0Bh - case 0x09: // Intel Pentium M processor, Intel Celeron M processor model 09. - case 0x0d: // Intel Pentium M processor, Intel Celeron M processor, model - // 0Dh. All processors are manufactured using the 90 nm process. 
- case 0x15: // Intel EP80579 Integrated Processor and Intel EP80579 - // Integrated Processor with Intel QuickAssist Technology - return CPU::generic; - case 0x0e: // Intel Core Duo processor, Intel Core Solo processor, model - // 0Eh. All processors are manufactured using the 65 nm process. - return CPU::intel_yonah; - case 0x0f: // Intel Core 2 Duo processor, Intel Core 2 Duo mobile - // processor, Intel Core 2 Quad processor, Intel Core 2 Quad - // mobile processor, Intel Core 2 Extreme processor, Intel - // Pentium Dual-Core processor, Intel Xeon processor, model - // 0Fh. All processors are manufactured using the 65 nm process. - case 0x16: // Intel Celeron processor model 16h. All processors are - // manufactured using the 65 nm process - return CPU::intel_core2; - case 0x17: // Intel Core 2 Extreme processor, Intel Xeon processor, model - // 17h. All processors are manufactured using the 45 nm process. - // - // 45nm: Penryn , Wolfdale, Yorkfield (XE) - case 0x1d: // Intel Xeon processor MP. All processors are manufactured using - // the 45 nm process. - return CPU::intel_core2_penryn; - case 0x1a: // Intel Core i7 processor and Intel Xeon processor. All - // processors are manufactured using the 45 nm process. - case 0x1e: // Intel(R) Core(TM) i7 CPU 870 @ 2.93GHz. - // As found in a Summer 2010 model iMac. - case 0x1f: - case 0x2e: // Nehalem EX - return CPU::intel_corei7_nehalem; - case 0x25: // Intel Core i7, laptop version. - case 0x2c: // Intel Core i7 processor and Intel Xeon processor. All - // processors are manufactured using the 32 nm process. - case 0x2f: // Westmere EX - return CPU::intel_corei7_westmere; - case 0x2a: // Intel Core i7 processor. All processors are manufactured - // using the 32 nm process. 
- case 0x2d: - return CPU::intel_corei7_sandybridge; - case 0x3a: - case 0x3e: // Ivy Bridge EP - return CPU::intel_corei7_ivybridge; - - // Haswell: - case 0x3c: - case 0x3f: - case 0x45: - case 0x46: - return CPU::intel_corei7_haswell; - - // Broadwell: - case 0x3d: - case 0x47: - case 0x4f: - case 0x56: - return CPU::intel_corei7_broadwell; - - // Skylake: - case 0x4e: // Skylake mobile - case 0x5e: // Skylake desktop - case 0x8e: // Kaby Lake mobile - case 0x9e: // Kaby Lake desktop - case 0xa5: // Comet Lake-H/S - case 0xa6: // Comet Lake-U - return CPU::intel_corei7_skylake; - - // Skylake Xeon: - case 0x55: - if (test_nbit(features, Feature::avx512bf16)) - return CPU::intel_corei7_cooperlake; - if (test_nbit(features, Feature::avx512vnni)) - return CPU::intel_corei7_cascadelake; - return CPU::intel_corei7_skylake_avx512; - - // Cannonlake: - case 0x66: - return CPU::intel_corei7_cannonlake; - - // Icelake: - case 0x7d: - case 0x7e: - case 0x9d: - return CPU::intel_corei7_icelake_client; - - // Icelake Xeon: - case 0x6a: - case 0x6c: - return CPU::intel_corei7_icelake_server; - - // Tiger Lake - case 0x8c: - case 0x8d: - return CPU::intel_corei7_tigerlake; - //Alder Lake - case 0x97: - case 0x9a: - return CPU::intel_corei7_alderlake; - - // Sapphire Rapids - case 0x8f: - return CPU::intel_corei7_sapphirerapids; - - case 0x1c: // Most 45 nm Intel Atom processors - case 0x26: // 45 nm Atom Lincroft - case 0x27: // 32 nm Atom Medfield - case 0x35: // 32 nm Atom Midview - case 0x36: // 32 nm Atom Midview - return CPU::intel_atom_bonnell; - - // Atom Silvermont codes from the Intel software optimization guide. 
- case 0x37: - case 0x4a: - case 0x4d: - case 0x5d: - // Airmont - case 0x4c: - case 0x5a: - case 0x75: - return CPU::intel_atom_silvermont; - - // Goldmont: - case 0x5c: - case 0x5f: - return CPU::intel_atom_goldmont; - case 0x7a: - return CPU::intel_atom_goldmont_plus; - case 0x86: - case 0x96: - case 0x9c: - return CPU::intel_atom_tremont; - - case 0x57: - return CPU::intel_knights_landing; - - case 0x85: - return CPU::intel_knights_mill; - - default: - return CPU::generic; - } - break; - case 15: { - switch (model) { - case 0: // Pentium 4 processor, Intel Xeon processor. All processors are - // model 00h and manufactured using the 0.18 micron process. - case 1: // Pentium 4 processor, Intel Xeon processor, Intel Xeon - // processor MP, and Intel Celeron processor. All processors are - // model 01h and manufactured using the 0.18 micron process. - case 2: // Pentium 4 processor, Mobile Intel Pentium 4 processor - M, - // Intel Xeon processor, Intel Xeon processor MP, Intel Celeron - // processor, and Mobile Intel Celeron processor. All processors - // are model 02h and manufactured using the 0.13 micron process. - default: - return CPU::generic; - - case 3: // Pentium 4 processor, Intel Xeon processor, Intel Celeron D - // processor. All processors are model 03h and manufactured using - // the 90 nm process. - case 4: // Pentium 4 processor, Pentium 4 processor Extreme Edition, - // Pentium D processor, Intel Xeon processor, Intel Xeon - // processor MP, Intel Celeron D processor. All processors are - // model 04h and manufactured using the 90 nm process. - case 6: // Pentium 4 processor, Pentium D processor, Pentium processor - // Extreme Edition, Intel Xeon processor, Intel Xeon processor - // MP, Intel Celeron D processor. All processors are model 06h - // and manufactured using the 65 nm process. 
-#ifdef _CPU_X86_64_ - return CPU::intel_nocona; -#else - return CPU::intel_prescott; -#endif - } - } - default: - break; /*"generic"*/ - } - return CPU::generic; -} - -static CPU get_amd_processor_name(uint32_t family, uint32_t model, const uint32_t *features) -{ - switch (family) { - case 4: - case 5: - case 6: - default: - return CPU::generic; - case 15: - if (test_nbit(features, Feature::sse3)) - return CPU::amd_k8_sse3; - switch (model) { - case 1: - return CPU::amd_opteron; - case 5: - return CPU::amd_athlon_fx; - default: - return CPU::amd_athlon_64; - } - case 16: - switch (model) { - case 2: - return CPU::amd_barcelona; - case 4: - case 8: - default: - return CPU::amd_fam10h; - } - case 20: - return CPU::amd_btver1; - case 21: - if (model >= 0x50 && model <= 0x6f) - return CPU::amd_bdver4; - if (model >= 0x30 && model <= 0x3f) - return CPU::amd_bdver3; - if (model >= 0x10 && model <= 0x1f) - return CPU::amd_bdver2; - if (model <= 0x0f) - return CPU::amd_bdver1; - return CPU::amd_btver1; // fallback - case 22: - return CPU::amd_btver2; - case 23: - // Known models: - // Zen: 1, 17 - // Zen+: 8, 24 - // Zen2: 96, 113 - if (model >= 0x30) - return CPU::amd_znver2; - return CPU::amd_znver1; - case 25: // AMD Family 19h - if (model <= 0x0f || (model >= 0x20 && model <= 0x5f)) - return CPU::amd_znver3; // 00h-0Fh, 21h: Zen3 - if ((model >= 0x10 && model <= 0x1f) || - (model >= 0x60 && model <= 0x74) || - (model >= 0x78 && model <= 0x7b) || - (model >= 0xA0 && model <= 0xAf)) { - return CPU::amd_znver4; - } - return CPU::amd_znver3; // fallback - case 26: - // if (model <= 0x77) - return CPU::amd_znver5; - } -} - -template -static inline void features_disable_avx512(T &features) -{ - using namespace Feature; - unset_bits(features, avx512f, avx512dq, avx512ifma, avx512cd, - avx512bw, avx512vl, avx512vbmi, avx512vpopcntdq, avx512vbmi2, avx512vnni, - avx512bitalg, avx512vp2intersect, avx512bf16); -} - -template -static inline void features_disable_avx(T &features) 
-{ - using namespace Feature; - unset_bits(features, avx, Feature::fma, f16c, xsave, avx2, xop, fma4, - xsaveopt, xsavec, xsaves, vaes, vpclmulqdq); -} - -template -static inline void features_disable_amx(T &features) -{ - using namespace Feature; - unset_bits(features, amx_bf16, amx_tile, amx_int8); -} - -static NOINLINE std::pair> _get_host_cpu(void) -{ - FeatureList features = {}; - - int32_t info0[4]; - jl_cpuid(info0, 0); - uint32_t maxleaf = info0[0]; - if (maxleaf < 1) - return std::make_pair(uint32_t(CPU::generic), features); - int32_t info1[4]; - jl_cpuid(info1, 1); - - auto vendor = info0[1]; - auto brand_id = info1[1] & 0xff; - - auto family = (info1[0] >> 8) & 0xf; // Bits 8 - 11 - auto model = (info1[0] >> 4) & 0xf; // Bits 4 - 7 - if (family == 6 || family == 0xf) { - if (family == 0xf) - // Examine extended family ID if family ID is F. - family += (info1[0] >> 20) & 0xff; // Bits 20 - 27 - // Examine extended model ID if family ID is 6 or F. - model += ((info1[0] >> 16) & 0xf) << 4; // Bits 16 - 19 - } - - // Fill in the features - features[0] = info1[2]; - features[1] = info1[3]; - if (maxleaf >= 7) { - int32_t info7[4]; - jl_cpuidex(info7, 7, 0); - features[2] = info7[1]; - features[3] = info7[2]; - features[4] = info7[3]; - } - int32_t infoex0[4]; - jl_cpuid(infoex0, 0x80000000); - uint32_t maxexleaf = infoex0[0]; - if (maxexleaf >= 0x80000001) { - int32_t infoex1[4]; - jl_cpuid(infoex1, 0x80000001); - features[5] = infoex1[2]; - features[6] = infoex1[3]; - } - if (maxleaf >= 0xd) { - int32_t infod[4]; - jl_cpuidex(infod, 0xd, 0x1); - features[7] = infod[0]; - } - if (maxexleaf >= 0x80000008) { - int32_t infoex8[4]; - jl_cpuidex(infoex8, 0x80000008, 0); - features[8] = infoex8[1]; - } - if (maxleaf >= 7) { - int32_t info7[4]; - jl_cpuidex(info7, 7, 1); - features[9] = info7[0]; - features[10] = info7[1]; - } - if (maxleaf >= 0x14) { - int32_t info14[4]; - jl_cpuidex(info14, 0x14, 0); - features[11] = info14[1]; - } - - // Fix up AVX bits to 
account for OS support and match LLVM model - uint64_t xcr0 = 0; - bool hasxsave = test_all_bits(features[0], 1 << 27); - if (hasxsave) { - xcr0 = get_xcr0(); - hasxsave = test_all_bits(xcr0, 0x6); - } - bool hasavx = hasxsave && test_all_bits(features[0], 1 << 28); - unset_bits(features, 32 + 27); - if (!hasavx) - features_disable_avx(features); -#ifdef _OS_DARWIN_ - // See https://github.com/llvm/llvm-project/commit/82921bf2baed96b700f90b090d5dc2530223d9c0 - // and https://github.com/apple/darwin-xnu/blob/a449c6a3b8014d9406c2ddbdc81795da24aa7443/osfmk/i386/fpu.c#L174 - // Darwin lazily saves the AVX512 context on first use - bool hasavx512save = hasavx; -#else - bool hasavx512save = hasavx && test_all_bits(xcr0, 0xe0); -#endif - if (!hasavx512save) - features_disable_avx512(features); - // AMX requires additional context to be saved by the OS. - bool hasamxsave = hasxsave && test_all_bits(xcr0, (1 << 17) | (1 << 18)); - if (!hasamxsave) - features_disable_amx(features); - // Ignore feature bits that we are not interested in. 
- mask_features(feature_masks, &features[0]); - - uint32_t cpu; - if (vendor == SIG_INTEL) { - cpu = uint32_t(get_intel_processor_name(family, model, brand_id, &features[0])); - } - else if (vendor == SIG_AMD) { - cpu = uint32_t(get_amd_processor_name(family, model, &features[0])); - } - else { - cpu = uint32_t(CPU::generic); - } - /* Feature bits to register map - feature[0] = ecx - feature[1] = edx - feature[2] = leaf 7 ebx - feature[3] = leaf 7 ecx - feature[4] = leaf 7 edx - feature[5] = leaf 0x80000001 ecx - feature[6] = leaf 0x80000001 edx - feature[7] = leaf 0xd subleaf 1 eax - feature[8] = leaf 0x80000008 ebx - feature[9] = leaf 7 ebx subleaf 1 eax - feature[10] = leaf 7 ebx subleaf 1 ebx - feature[11] = leaf 0x14 ebx - */ - return std::make_pair(cpu, features); -} - -static inline const std::pair> &get_host_cpu() -{ - static auto host_cpu = _get_host_cpu(); - return host_cpu; -} - -static inline const CPUSpec *find_cpu(uint32_t cpu) -{ - return ::find_cpu(cpu, cpus, ncpu_names); -} - -static inline const CPUSpec *find_cpu(llvm::StringRef name) -{ - return ::find_cpu(name, cpus, ncpu_names); -} - -static inline const char *find_cpu_name(uint32_t cpu) -{ - return ::find_cpu_name(cpu, cpus, ncpu_names); -} - -static inline const std::string &host_cpu_name() -{ - static std::string name = - (CPU)get_host_cpu().first != CPU::generic ? 
- std::string(find_cpu_name(get_host_cpu().first)) : - jl_get_cpu_name_llvm(); - return name; -} - -static inline const char *normalize_cpu_name(llvm::StringRef name) -{ - if (name == "atom") - return "bonnell"; - if (name == "slm") - return "silvermont"; - if (name == "glm") - return "goldmont"; - if (name == "corei7") - return "nehalem"; - if (name == "corei7-avx") - return "sandybridge"; - if (name == "core-avx-i") - return "ivybridge"; - if (name == "core-avx2") - return "haswell"; - if (name == "skx") - return "skylake-avx512"; -#ifdef _CPU_X86_ - // i686 isn't a supported target but it's a common default one so just make it mean pentium4. - if (name == "pentium4" || name == "i686") - return "generic"; -#else - if (name == "x86-64" || name == "x86_64") - return "generic"; -#endif - return nullptr; -} - -template -static inline void enable_depends(FeatureList &features) -{ - ::enable_depends(features, Feature::deps, sizeof(Feature::deps) / sizeof(FeatureDep)); -} - -template -static inline void disable_depends(FeatureList &features) -{ - ::disable_depends(features, Feature::deps, sizeof(Feature::deps) / sizeof(FeatureDep)); -} - -static const llvm::SmallVector, 0> &get_cmdline_targets(const char *cpu_target) -{ - auto feature_cb = [] (const char *str, size_t len, FeatureList &list) { - auto fbit = find_feature_bit(feature_names, nfeature_names, str, len); - if (fbit == UINT32_MAX) - return false; - set_bit(list, fbit, true); - return true; - }; - auto &targets = ::get_cmdline_targets(cpu_target, feature_cb); - for (auto &t: targets) { - if (auto nname = normalize_cpu_name(t.name)) { - t.name = nname; - } - } - return targets; -} - -static llvm::SmallVector, 0> jit_targets; - -static TargetData arg_target_data(const TargetData &arg, bool require_host) -{ - TargetData res = arg; - const FeatureList *cpu_features = nullptr; - if (res.name == "native") { - res.name = host_cpu_name(); - cpu_features = &get_host_cpu().second; - } - else if (auto spec = 
find_cpu(res.name)) { - cpu_features = &spec->features; - } - else { - res.en.flags |= JL_TARGET_UNKNOWN_NAME; - } - if (cpu_features) { - for (size_t i = 0; i < feature_sz; i++) { - res.en.features[i] |= (*cpu_features)[i]; - } - } - enable_depends(res.en.features); - // Mask our rdrand/rdseed/rtm/xsaveopt features that LLVM doesn't use and rr disables - unset_bits(res.en.features, Feature::rdrnd, Feature::rdseed, Feature::rtm, Feature::xsaveopt); - for (size_t i = 0; i < feature_sz; i++) - res.en.features[i] &= ~res.dis.features[i]; - if (require_host) { - for (size_t i = 0; i < feature_sz; i++) { - res.en.features[i] &= get_host_cpu().second[i]; - } - } - disable_depends(res.en.features); - if (cpu_features) { - // If the base feature if known, fill in the disable features - for (size_t i = 0; i < feature_sz; i++) { - res.dis.features[i] = feature_masks[i] & ~res.en.features[i]; - } - } - return res; -} - -static int max_vector_size(const FeatureList &features) -{ - if (test_nbit(features, Feature::avx512f)) - return 64; - if (test_nbit(features, Feature::avx)) - return 32; - // SSE is required - return 16; -} - -static uint32_t sysimg_init_cb(void *ctx, const void *id, jl_value_t** rejection_reason) -{ - // First see what target is requested for the JIT. - const char *cpu_target = (const char *)ctx; - auto &cmdline = get_cmdline_targets(cpu_target); - TargetData target = arg_target_data(cmdline[0], true); - // Then find the best match in the sysimg - auto sysimg = deserialize_target_data((const uint8_t*)id); - // We translate `generic` to `pentium4` or `x86-64` before sending it to LLVM - // (see `get_llvm_target_noext`) which will be serialized into the sysimg target data. - // Translate them back so we can actually match them. - // We also track to see if the sysimg allows -cx16, however if the user does - // something silly like add +cx16 on a 32bit target, we want to disable this - // check, hence the pointer size check. 
- bool sysimg_allows_no_cx16 = sizeof(void *) == 4;; - for (auto &t: sysimg) { - if (auto nname = normalize_cpu_name(t.name)) { - t.name = nname; - } - - // Take note to see if the sysimg explicitly allows an architecture without cx16 - sysimg_allows_no_cx16 |= !test_nbit(t.en.features, Feature::cx16); - } - if (!sysimg_allows_no_cx16 && !test_nbit(target.en.features, Feature::cx16)) { - jl_error("Your CPU does not support the CX16 instruction, which is required " - "by this version of Julia! This is often due to running inside of a " - "virtualized environment. Please read " - "https://docs.julialang.org/en/v1/devdocs/sysimg/ for more."); - } - auto match = match_sysimg_targets(sysimg, target, max_vector_size, rejection_reason); - if (match.best_idx == UINT32_MAX) - return match.best_idx; - // Now we've decided on which sysimg version to use. - // Make sure the JIT target is compatible with it and save the JIT target. - if (match.vreg_size != max_vector_size(target.en.features) && - (sysimg[match.best_idx].en.flags & JL_TARGET_VEC_CALL)) { - if (match.vreg_size < 64) { - features_disable_avx512(target.en.features); - } - if (match.vreg_size < 32) { - features_disable_avx(target.en.features); - } - } - jit_targets.push_back(std::move(target)); - return match.best_idx; -} - -static uint32_t pkgimg_init_cb(void *ctx, const void *id, jl_value_t **rejection_reason) -{ - TargetData target = jit_targets.front(); - auto pkgimg = deserialize_target_data((const uint8_t*)id); - for (auto &t: pkgimg) { - if (auto nname = normalize_cpu_name(t.name)) { - t.name = nname; - } - } - auto match = match_sysimg_targets(pkgimg, target, max_vector_size, rejection_reason); - return match.best_idx; -} - -//This function serves as a fallback during bootstrapping, at that point we don't have a sysimage with native code -// so we won't call sysimg_init_cb, else this function shouldn't do anything. 
-static void ensure_jit_target(const char *cpu_target, bool imaging) -{ - auto &cmdline = get_cmdline_targets(cpu_target); - check_cmdline(cmdline, imaging); - if (!jit_targets.empty()) - return; - for (auto &arg: cmdline) { - auto data = arg_target_data(arg, jit_targets.empty()); - jit_targets.push_back(std::move(data)); - } - auto ntargets = jit_targets.size(); - // Now decide the clone condition. - for (size_t i = 1; i < ntargets; i++) { - auto &t = jit_targets[i]; - if (t.en.flags & JL_TARGET_CLONE_ALL) - continue; - // Always clone when code checks CPU features - t.en.flags |= JL_TARGET_CLONE_CPU; - // The most useful one in general... - t.en.flags |= JL_TARGET_CLONE_LOOP; - auto &features0 = jit_targets[t.base].en.features; - // Special case for KNL/KNM since they're so different - if (!(t.dis.flags & JL_TARGET_CLONE_ALL)) { - if ((t.name == "knl" || t.name == "knm") && - jit_targets[t.base].name != "knl" && jit_targets[t.base].name != "knm") { - t.en.flags |= JL_TARGET_CLONE_ALL; - break; - } - } - static constexpr uint32_t clone_math[] = {Feature::fma, Feature::fma4}; - static constexpr uint32_t clone_simd[] = {Feature::sse3, Feature::ssse3, - Feature::sse41, Feature::sse42, - Feature::avx, Feature::avx2, - Feature::vaes, Feature::vpclmulqdq, - Feature::sse4a, Feature::avx512f, - Feature::avx512dq, Feature::avx512ifma, - Feature::avx512cd, Feature::avx512bw, - Feature::avx512vl, Feature::avx512vbmi, - Feature::avx512vpopcntdq, Feature::avxvnni, - Feature::avx512vbmi2, Feature::avx512vnni, - Feature::avx512bitalg, Feature::avx512bf16, - Feature::avx512vp2intersect, Feature::avx512fp16}; - for (auto fe: clone_math) { - if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { - t.en.flags |= JL_TARGET_CLONE_MATH; - break; - } - } - for (auto fe: clone_simd) { - if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { - t.en.flags |= JL_TARGET_CLONE_SIMD; - break; - } - } - static constexpr uint32_t clone_fp16[] = {Feature::avx512fp16}; - for 
(auto fe: clone_fp16) { - if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { - t.en.flags |= JL_TARGET_CLONE_FLOAT16; - break; - } - } - static constexpr uint32_t clone_bf16[] = {Feature::avx512bf16}; - for (auto fe: clone_bf16) { - if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { - t.en.flags |= JL_TARGET_CLONE_BFLOAT16; - break; - } - } - } -} - -static std::pair> -get_llvm_target_noext(const TargetData &data) -{ - std::string name = data.name; - auto *spec = find_cpu(name); - while (spec) { - if (spec->llvmver <= JL_LLVM_VERSION) - break; - spec = find_cpu((uint32_t)spec->fallback); - name = spec->name; - } - if (name == "generic") { - // Use translate `generic` into what we actually require -#ifdef _CPU_X86_ - name = "pentium4"; -#else - name = "x86-64"; -#endif - } - llvm::SmallVector features; - for (auto &fename: feature_names) { - if (fename.llvmver > JL_LLVM_VERSION) - continue; - if (test_nbit(data.en.features, fename.bit)) { - features.insert(features.begin(), std::string("+") + fename.name); - } - else if (test_nbit(data.dis.features, fename.bit)) { - features.push_back(std::string("-") + fename.name); - } - } - features.push_back("+sse2"); - features.push_back("+mmx"); - features.push_back("+fxsr"); -#ifdef _CPU_X86_64_ - // This is required to make LLVM happy if LLVM's feature based CPU arch guess - // returns a value that may not have 64bit support. - // This can happen with virtualization. 
- features.push_back("+64bit"); -#endif - features.push_back("+cx8"); - return std::make_pair(std::move(name), std::move(features)); -} - -static std::pair> -get_llvm_target_vec(const TargetData &data) -{ - auto res0 = get_llvm_target_noext(data); - append_ext_features(res0.second, data.ext_features); - return res0; -} - -static std::pair -get_llvm_target_str(const TargetData &data) -{ - auto res0 = get_llvm_target_noext(data); - auto features = join_feature_strs(res0.second); - append_ext_features(features, data.ext_features); - return std::make_pair(std::move(res0.first), std::move(features)); -} - -} - -using namespace X86; - -JL_DLLEXPORT void jl_dump_host_cpu(void) -{ - dump_cpu_spec(get_host_cpu().first, get_host_cpu().second, feature_names, nfeature_names, - cpus, ncpu_names); -} - -JL_DLLEXPORT jl_value_t* jl_check_pkgimage_clones(char *data) -{ - jl_value_t *rejection_reason = NULL; - JL_GC_PUSH1(&rejection_reason); - uint32_t match_idx = pkgimg_init_cb(NULL, data, &rejection_reason); - JL_GC_POP(); - if (match_idx == UINT32_MAX) - return rejection_reason; - return jl_nothing; -} - -JL_DLLEXPORT jl_value_t *jl_cpu_has_fma(int bits) -{ - TargetData target = jit_targets.front(); - FeatureList features = target.en.features; - if ((bits == 32 || bits == 64) && (test_nbit(features, Feature::fma) || test_nbit(features, Feature::fma4))) - return jl_true; - else - return jl_false; -} - -jl_image_t jl_init_processor_sysimg(jl_image_buf_t image, const char *cpu_target) -{ - if (!jit_targets.empty()) - jl_error("JIT targets already initialized"); - return parse_sysimg(image, sysimg_init_cb, (void *)cpu_target); -} - -jl_image_t jl_init_processor_pkgimg(jl_image_buf_t image) -{ - if (jit_targets.empty()) - jl_error("JIT targets not initialized"); - if (jit_targets.size() > 1) - jl_error("Expected only one JIT target"); - return parse_sysimg(image, pkgimg_init_cb, NULL); -} - -std::pair> jl_get_llvm_target(const char *cpu_target, bool imaging, uint32_t &flags) -{ - 
ensure_jit_target(cpu_target, imaging); - flags = jit_targets[0].en.flags; - return get_llvm_target_vec(jit_targets[0]); -} - -const std::pair &jl_get_llvm_disasm_target(void) -{ - static const auto res = get_llvm_target_str(TargetData{"generic", "", - {feature_masks, 0}, {{}, 0}, 0}); - return res; -} -//This function parses the -C command line to figure out which targets to multiversion to. -#ifndef __clang_gcanalyzer__ -llvm::SmallVector jl_get_llvm_clone_targets(const char *cpu_target) -{ - - auto &cmdline = get_cmdline_targets(cpu_target); - check_cmdline(cmdline, true); - llvm::SmallVector, 0> image_targets; - for (auto &arg: cmdline) { - auto data = arg_target_data(arg, image_targets.empty()); - image_targets.push_back(std::move(data)); - } - - auto ntargets = image_targets.size(); - // Now decide the clone condition. - for (size_t i = 1; i < ntargets; i++) { - auto &t = image_targets[i]; - if (t.en.flags & JL_TARGET_CLONE_ALL) - continue; - // Always clone when code checks CPU features - t.en.flags |= JL_TARGET_CLONE_CPU; - // The most useful one in general... 
- t.en.flags |= JL_TARGET_CLONE_LOOP; - auto &features0 = image_targets[t.base].en.features; - // Special case for KNL/KNM since they're so different - if (!(t.dis.flags & JL_TARGET_CLONE_ALL)) { - if ((t.name == "knl" || t.name == "knm") && - image_targets[t.base].name != "knl" && image_targets[t.base].name != "knm") { - t.en.flags |= JL_TARGET_CLONE_ALL; - break; - } - } - static constexpr uint32_t clone_math[] = {Feature::fma, Feature::fma4}; - static constexpr uint32_t clone_simd[] = {Feature::sse3, Feature::ssse3, - Feature::sse41, Feature::sse42, - Feature::avx, Feature::avx2, - Feature::vaes, Feature::vpclmulqdq, - Feature::sse4a, Feature::avx512f, - Feature::avx512dq, Feature::avx512ifma, - Feature::avx512cd, Feature::avx512bw, - Feature::avx512vl, Feature::avx512vbmi, - Feature::avx512vpopcntdq, Feature::avxvnni, - Feature::avx512vbmi2, Feature::avx512vnni, - Feature::avx512bitalg, Feature::avx512bf16, - Feature::avx512vp2intersect, Feature::avx512fp16}; - for (auto fe: clone_math) { - if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { - t.en.flags |= JL_TARGET_CLONE_MATH; - break; - } - } - for (auto fe: clone_simd) { - if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { - t.en.flags |= JL_TARGET_CLONE_SIMD; - break; - } - } - static constexpr uint32_t clone_fp16[] = {Feature::avx512fp16}; - for (auto fe: clone_fp16) { - if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { - t.en.flags |= JL_TARGET_CLONE_FLOAT16; - break; - } - } - static constexpr uint32_t clone_bf16[] = {Feature::avx512bf16}; - for (auto fe: clone_bf16) { - if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) { - t.en.flags |= JL_TARGET_CLONE_BFLOAT16; - break; - } - } - } - if (image_targets.empty()) - jl_error("No targets specified"); - llvm::SmallVector res; - for (auto &target: image_targets) { - auto features_en = target.en.features; - auto features_dis = target.dis.features; - for (auto &fename: feature_names) { - if (fename.llvmver 
> JL_LLVM_VERSION) { - unset_bits(features_en, fename.bit); - unset_bits(features_dis, fename.bit); - } - } - X86::disable_depends(features_en); - jl_target_spec_t ele; - std::tie(ele.cpu_name, ele.cpu_features) = get_llvm_target_str(target); - ele.data = serialize_target_data(target.name, features_en, features_dis, - target.ext_features); - ele.flags = target.en.flags; - ele.base = target.base; - res.push_back(ele); - } - return res; -} -#endif - -extern "C" int jl_test_cpu_feature(jl_cpu_feature_t feature) -{ - if (feature >= 32 * feature_sz) - return 0; - return test_nbit(&get_host_cpu().second[0], feature); -} - -// -- set/clear the FZ/DAZ flags on x86 & x86-64 -- - -// Cache of information recovered from `cpuid` since executing `cpuid` it at runtime is slow. -static uint32_t subnormal_flags = [] { - int32_t info[4]; - jl_cpuid(info, 0); - if (info[0] >= 1) { - jl_cpuid(info, 1); - if (info[3] & (1 << 26)) { - // SSE2 supports both FZ and DAZ - return 0x00008040; - } - else if (info[3] & (1 << 25)) { - // SSE supports only the FZ flag - return 0x00008000; - } - } - return 0; -}(); - -// Returns non-zero if subnormals go to 0; zero otherwise. -extern "C" JL_DLLEXPORT int32_t jl_get_zero_subnormals(void) -{ - return _mm_getcsr() & subnormal_flags; -} - -// Return zero on success, non-zero on failure. -extern "C" JL_DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero) -{ - uint32_t flags = subnormal_flags; - if (flags) { - uint32_t state = _mm_getcsr(); - if (isZero) - state |= flags; - else - state &= ~flags; - _mm_setcsr(state); - return 0; - } - else { - // Report a failure only if user is trying to enable FTZ/DAZ. 
- return isZero; - } -} - -// X86 does not support default NaNs -extern "C" JL_DLLEXPORT int32_t jl_get_default_nans(void) -{ - return 0; -} - -extern "C" JL_DLLEXPORT int32_t jl_set_default_nans(int8_t isDefault) -{ - return isDefault; -} diff --git a/src/staticdata.c b/src/staticdata.c index 82e903fdfd05a..2b471f41fc77d 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -4401,7 +4401,7 @@ JL_DLLEXPORT void jl_restore_system_image(jl_image_t *image, jl_image_buf_t buf) return; if (buf.kind == JL_IMAGE_KIND_SO) - assert(image->fptrs.ptrs); // jl_init_processor_sysimg should already be run + assert(image->fptrs.ptrs); // jl_load_sysimg should already be run JL_SIGATOMIC_BEGIN(); ios_static_buffer(&f, (char *)buf.data, buf.size); @@ -4433,7 +4433,7 @@ JL_DLLEXPORT jl_value_t *jl_restore_package_image_from_file(const char *fname, j jl_gc_notify_image_load(buf.data, buf.size); // Despite the name, this function actually parses the pkgimage - jl_image_t pkgimage = jl_init_processor_pkgimg(buf); + jl_image_t pkgimage = jl_load_pkgimg(buf); if (ignore_native) { // Must disable using native code in possible downstream users of this code: diff --git a/test/binaryplatforms.jl b/test/binaryplatforms.jl index 8de522e9c6c8b..81ff9b42a9249 100644 --- a/test/binaryplatforms.jl +++ b/test/binaryplatforms.jl @@ -5,6 +5,7 @@ using Test, Base.BinaryPlatforms, Base.BinaryPlatforms.CPUID @testset "CPUID" begin @test CPUID.cpu_isa() isa CPUID.ISA + # x86_64 tiers form a strict subset chain get_x86_64(n) = (CPUID.ISAs_by_family["x86_64"][n].second) @test get_x86_64(2) < get_x86_64(4) @test get_x86_64(5) <= get_x86_64(5) @@ -12,6 +13,83 @@ using Test, Base.BinaryPlatforms, Base.BinaryPlatforms.CPUID @test get_x86_64(7) >= get_x86_64(1) @test sort([get_x86_64(6), get_x86_64(4), get_x86_64(2), get_x86_64(4)]) == [get_x86_64(2), get_x86_64(4), get_x86_64(4), get_x86_64(6)] + + # Cross-arch queries return real feature data + @test length(CPUID._cross_lookup_cpu("x86_64", 
"haswell").features) > 10 + @test length(CPUID._cross_lookup_cpu("aarch64", "cortex-a78").features) > 10 + @test length(CPUID._cross_lookup_cpu("riscv64", "sifive-u74").features) > 0 + @test isempty(CPUID._cross_lookup_cpu("x86_64", "nonexistent").features) + @test isempty(CPUID._cross_lookup_cpu("badarch", "haswell").features) + + # Apple M-series aliases resolve to their A-series equivalents + let m1 = CPUID._cross_lookup_cpu("aarch64", "apple-m1"), + a14 = CPUID._cross_lookup_cpu("aarch64", "apple-a14") + @test m1.features == a14.features + end + @test !isempty(CPUID._cross_lookup_cpu("aarch64", "apple-m2").features) + + # Arch name normalization (i686 → x86_64, arm64 → aarch64) + @test CPUID._cross_lookup_cpu("i686", "haswell").features == + CPUID._cross_lookup_cpu("x86_64", "haswell").features + @test CPUID._cross_lookup_cpu("arm64", "cortex-a78").features == + CPUID._cross_lookup_cpu("aarch64", "cortex-a78").features + + # All families have non-empty ISA data (cross-arch works) + for (arch, isas) in CPUID.ISAs_by_family + @test length(isas) >= 1 + end + + # feature_names(arch, cpu) — query by CPU name + hsw = CPUID.feature_names("x86_64", "haswell") + @test "avx2" in hsw + @test "fma" in hsw + @test "sse4.2" in hsw + @test !("avx512f" in hsw) # haswell doesn't have avx512 + + skx = CPUID.feature_names("x86_64", "skylake-avx512") + @test "avx512f" in skx + @test "avx512bw" in skx + + # aarch64 cross-arch feature names + a78 = CPUID.feature_names("aarch64", "cortex-a78") + @test "lse" in a78 + @test "neon" in a78 + + x925 = CPUID.feature_names("aarch64", "cortex-x925") + @test "sve2" in x925 + @test "bf16" in x925 + @test "dotprod" in x925 + + # Architecture version features present for ARM cores + @test "v8.1a" in x925 + @test "v9a" in x925 + + # Unknown CPU returns empty + @test isempty(CPUID.feature_names("x86_64", "nonexistent")) + + # feature_names(arch, isa) — query by ISA struct + names_from_isa = CPUID.feature_names("x86_64", get_x86_64(5)) + @test 
"avx" in names_from_isa + @test "sse4.2" in names_from_isa + + # feature_names(isa) — host arch default + host_names = CPUID.feature_names(CPUID.cpu_isa()) + @test length(host_names) > 5 + + # feature_names() — full default (host arch + host ISA) + default_names = CPUID.feature_names() + @test default_names == host_names + + # _build_bit_to_name returns a non-empty mapping with known features + mapping = CPUID._build_bit_to_name("x86_64") + @test length(mapping) > 50 + @test "avx2" in values(mapping) + @test "sse4.2" in values(mapping) + + mapping_aarch64 = CPUID._build_bit_to_name("aarch64") + @test length(mapping_aarch64) > 50 + @test "neon" in values(mapping_aarch64) + @test "sve" in values(mapping_aarch64) end # Helper constructor to create a Platform with `validate_strict` set to `true`. diff --git a/test/cmdlineargs.jl b/test/cmdlineargs.jl index 273a9ee8e26f7..25eb3e571480e 100644 --- a/test/cmdlineargs.jl +++ b/test/cmdlineargs.jl @@ -197,6 +197,13 @@ end wait(p) @test p.exitcode == 1 @test occursin("empty CPU name", String(take!(io))) + + # Test --cpu-target=help prints available targets and exits cleanly + let v = readchomperrors(`$(Base.julia_cmd(; cpu_target="help"))`) + @test v[1] == true # exits with 0 + @test occursin("Available CPU targets:", v[2]) + @test occursin("Host CPU:", v[2]) + end end let exename = `$(Base.julia_cmd()) --startup-file=no --color=no` diff --git a/test/llvmpasses/multiversioning-annotate-only.ll b/test/llvmpasses/multiversioning-annotate-only.ll index 849cf57c78aa3..48322690b509e 100644 --- a/test/llvmpasses/multiversioning-annotate-only.ll +++ b/test/llvmpasses/multiversioning-annotate-only.ll @@ -5,29 +5,17 @@ ; COM: This test checks that multiversioning correctly picks up on features that should trigger cloning ; COM: Note that for annotations alone, we don't need jl_fvars or jl_gvars -; COM: Copied from src/processor.h -; COM: JL_TARGET_VEC_CALL = 1 << 0, -; COM: // Clone all functions -; COM: JL_TARGET_CLONE_ALL = 1 << 1, 
-; COM: // Clone when there's scalar math operations that can benefit from target-specific -; COM: // optimizations. This includes `muladd`, `fma`, `fast`/`contract` flags. -; COM: JL_TARGET_CLONE_MATH = 1 << 2, -; COM: // Clone when the function has a loop -; COM: JL_TARGET_CLONE_LOOP = 1 << 3, -; COM: // Clone when the function uses any vectors -; COM: // When this is specified, the cloning pass should also record if any of the cloned functions -; COM: // used this in any function call (including the signature of the function itself) -; COM: JL_TARGET_CLONE_SIMD = 1 << 4, -; COM: // The CPU name is unknown -; COM: JL_TARGET_UNKNOWN_NAME = 1 << 5, -; COM: // Optimize for size for this target -; COM: JL_TARGET_OPTSIZE = 1 << 6, -; COM: // Only optimize for size for this target -; COM: JL_TARGET_MINSIZE = 1 << 7, -; COM: // Clone when the function queries CPU features -; COM: JL_TARGET_CLONE_CPU = 1 << 8, -; COM: // Clone when the function uses fp16 -; COM: JL_TARGET_CLONE_FLOAT16 = 1 << 9, +; COM: Target spec packed_flags() encoding (from llvm-multiversioning.cpp): +; COM: clone_all = 1 << 0 +; COM: opt_size = 1 << 1 +; COM: min_size = 1 << 2 +; COM: has_new_math = 1 << 3 +; COM: has_new_simd = 1 << 4 +; COM: has_new_float16 = 1 << 5 +; COM: has_new_bfloat16 = 1 << 6 +; COM: +; COM: clone_flags() always includes LOOP and CPU categories. +; COM: Additionally includes MATH if has_new_math, SIMD if has_new_simd, etc. 
; COM: start with the basics, just one feature per function @@ -78,7 +66,7 @@ define noundef float @simd_fastmath_test(<4 x float> noundef %0) { ret float %4 } -; CHECK: @loop_fastmath_test{{.*}}#[[LOOP_FASTMATH_TEST_ATTRS:[0-9]+]] +; CHECK: @loop_fastmath_test{{.*}}#[[LOOP_TEST_ATTRS]] define noundef i32 @loop_fastmath_test(i32 noundef %0) { %2 = icmp sgt i32 %0, 0 br i1 %2, label %7, label %5 @@ -102,7 +90,7 @@ define noundef i32 @loop_fastmath_test(i32 noundef %0) { br i1 %14, label %3, label %7, !llvm.loop !9 } -; CHECK: @simd_loop_test{{.*}}#[[SIMD_LOOP_TEST_ATTRS:[0-9]+]] +; CHECK: @simd_loop_test{{.*}}#[[LOOP_TEST_ATTRS]] define dso_local noundef i32 @simd_loop_test(<4 x i32> noundef %0) { %2 = extractelement <4 x i32> %0, i64 0 %3 = icmp sgt i32 %2, 0 @@ -122,7 +110,7 @@ define dso_local noundef i32 @simd_loop_test(<4 x i32> noundef %0) { br i1 %12, label %4, label %6, !llvm.loop !9 } -; CHECK: @simd_loop_fastmath_test{{.*}}#[[SIMD_LOOP_FASTMATH_TEST_ATTRS:[0-9]+]] +; CHECK: @simd_loop_fastmath_test{{.*}}#[[LOOP_TEST_ATTRS]] define noundef i32 @simd_loop_fastmath_test(<4 x i32> noundef %0) { %2 = extractelement <4 x i32> %0, i64 0 %3 = icmp sgt i32 %2, 0 @@ -180,12 +168,9 @@ define noundef i32 @uncloned(i32 noundef %0) { ; COM: Note that these strings are hex-encoded bits of the target indices that will be cloned ; CHECK-DAG: attributes #[[BORING_ATTRS]] = { "julia.mv.clones"="2" } ; CHECK-DAG: attributes #[[FASTMATH_TEST_ATTRS]] = { "julia.mv.clones"="6" } -; CHECK-DAG: attributes #[[LOOP_TEST_ATTRS]] = { "julia.mv.clones"="A" } +; CHECK-DAG: attributes #[[LOOP_TEST_ATTRS]] = { "julia.mv.clones"="1E" } ; CHECK-DAG: attributes #[[SIMD_TEST_ATTRS]] = { "julia.mv.clones"="12" } ; CHECK-DAG: attributes #[[SIMD_FASTMATH_TEST_ATTRS]] = { "julia.mv.clones"="16" } -; CHECK-DAG: attributes #[[LOOP_FASTMATH_TEST_ATTRS]] = { "julia.mv.clones"="E" } -; CHECK-DAG: attributes #[[SIMD_LOOP_TEST_ATTRS]] = { "julia.mv.clones"="1A" } -; CHECK-DAG: attributes 
#[[SIMD_LOOP_FASTMATH_TEST_ATTRS]] = { "julia.mv.clones"="1E" } ; CHECK-DAG: attributes #[[FUNC_IN_GV_ATTRS]] ; CHECK-SAME: "julia.mv.clones"="2" ; CHECK-SAME: "julia.mv.fvar" @@ -210,9 +195,9 @@ define noundef i32 @uncloned(i32 noundef %0) { !1 = !{i32 1, !"julia.mv.skipcloning", i32 1} !2 = !{i32 1, !"julia.mv.specs", !3} !3 = !{!4, !5, !6, !7, !8} -!4 = !{!"cpubase", !"nofeatures", i32 0, i32 2} -!5 = !{!"cpucloneall", !"cloneall", i32 0, i32 2} -!6 = !{!"cpufastmath", !"fastmathclone", i32 0, i32 4} -!7 = !{!"cpuloop", !"loopclone", i32 0, i32 8} +!4 = !{!"cpubase", !"nofeatures", i32 0, i32 0} +!5 = !{!"cpucloneall", !"cloneall", i32 0, i32 1} +!6 = !{!"cpufastmath", !"fastmathclone", i32 0, i32 8} +!7 = !{!"cpuloop", !"loopclone", i32 0, i32 0} !8 = !{!"cpusimd", !"simdclone", i32 0, i32 16} !9 = !{!9} diff --git a/test/llvmpasses/multiversioning-clone-only.ll b/test/llvmpasses/multiversioning-clone-only.ll index c4f5257a59988..aff71f4c87c47 100644 --- a/test/llvmpasses/multiversioning-clone-only.ll +++ b/test/llvmpasses/multiversioning-clone-only.ll @@ -210,10 +210,10 @@ attributes #3 = {"julia.mv.clones"="6"} !1 = !{i32 1, !"julia.mv.annotated", i32 1} !2 = !{i32 1, !"julia.mv.specs", !3} !3 = !{!4, !5, !6, !7, !8} -!4 = !{!"cpubase", !"nofeatures", i32 0, i32 2} -!5 = !{!"cpucloneall", !"cloneall", i32 0, i32 2} -!6 = !{!"cpufastmath", !"fastmathclone", i32 0, i32 4} -!7 = !{!"cpuloop", !"loopclone", i32 0, i32 8} +!4 = !{!"cpubase", !"nofeatures", i32 0, i32 0} +!5 = !{!"cpucloneall", !"cloneall", i32 0, i32 1} +!6 = !{!"cpufastmath", !"fastmathclone", i32 0, i32 8} +!7 = !{!"cpuloop", !"loopclone", i32 0, i32 0} !8 = !{!"cpusimd", !"simdclone", i32 0, i32 16} ; CHECK-DAG: ![[TBAA_CONST_METADATA]] = !{![[JTBAA_CONST_METADATA:[0-9]+]], ![[JTBAA_CONST_METADATA]] ; CHECK-DAG: ![[JTBAA_CONST_METADATA]] = !{!"jtbaa_const" diff --git a/test/llvmpasses/multiversioning-x86.ll b/test/llvmpasses/multiversioning-x86.ll index e2918d0c20eec..7ce50a3397127 100644 --- 
a/test/llvmpasses/multiversioning-x86.ll +++ b/test/llvmpasses/multiversioning-x86.ll @@ -119,5 +119,5 @@ define noundef i32 @simd_test_call(<4 x i32> noundef %0) { !2 = !{i32 1, !"julia.mv.specs", !3} !3 = !{!4, !5, !6} !4 = !{!"x86-64", !"+cx16,-sse3,-pclmul,-ssse3,-fma,-sse4.1,-sse4.2,-movbe,-popcnt,-aes,-xsave,-avx,-f16c,-rdrnd,-fsgsbase,-bmi,-avx2,-bmi2,-rtm,-avx512f,-avx512dq,-rdseed,-adx,-avx512ifma,-clflushopt,-clwb,-avx512cd,-sha,-avx512bw,-avx512vl,-avx512vbmi,-pku,-waitpkg,-avx512vbmi2,-shstk,-gfni,-vaes,-vpclmulqdq,-avx512vnni,-avx512bitalg,-avx512vpopcntdq,-rdpid,-cldemote,-movdiri,-movdir64b,-enqcmd,-uintr,-avx512vp2intersect,-serialize,-tsxldtrk,-pconfig,-amx-bf16,-avx512fp16,-amx-tile,-amx-int8,-sahf,-lzcnt,-sse4a,-prfchw,-xop,-fma4,-tbm,-mwaitx,-xsaveopt,-xsavec,-xsaves,-clzero,-wbnoinvd,-avxvnni,-avx512bf16,-ptwrite,+sse2,+mmx,+fxsr,+64bit,+cx8", i32 0, i32 0} -!5 = !{!"sandybridge", !"+sahf,+avx,+xsave,+popcnt,+sse4.2,+sse4.1,+cx16,+ssse3,+pclmul,+sse3,-fma,-movbe,-aes,-f16c,-rdrnd,-fsgsbase,-bmi,-avx2,-bmi2,-rtm,-avx512f,-avx512dq,-rdseed,-adx,-avx512ifma,-clflushopt,-clwb,-avx512cd,-sha,-avx512bw,-avx512vl,-avx512vbmi,-pku,-waitpkg,-avx512vbmi2,-shstk,-gfni,-vaes,-vpclmulqdq,-avx512vnni,-avx512bitalg,-avx512vpopcntdq,-rdpid,-cldemote,-movdiri,-movdir64b,-enqcmd,-uintr,-avx512vp2intersect,-serialize,-tsxldtrk,-pconfig,-amx-bf16,-avx512fp16,-amx-tile,-amx-int8,-lzcnt,-sse4a,-prfchw,-xop,-fma4,-tbm,-mwaitx,-xsaveopt,-xsavec,-xsaves,-clzero,-wbnoinvd,-avxvnni,-avx512bf16,-ptwrite,+sse2,+mmx,+fxsr,+64bit,+cx8", i32 0, i32 2} -!6 = !{!"haswell", 
!"+lzcnt,+sahf,+bmi2,+avx2,+bmi,+fsgsbase,+f16c,+avx,+xsave,+popcnt,+movbe,+sse4.2,+sse4.1,+cx16,+fma,+ssse3,+pclmul,+sse3,-aes,-rdrnd,-rtm,-avx512f,-avx512dq,-rdseed,-adx,-avx512ifma,-clflushopt,-clwb,-avx512cd,-sha,-avx512bw,-avx512vl,-avx512vbmi,-pku,-waitpkg,-avx512vbmi2,-shstk,-gfni,-vaes,-vpclmulqdq,-avx512vnni,-avx512bitalg,-avx512vpopcntdq,-rdpid,-cldemote,-movdiri,-movdir64b,-enqcmd,-uintr,-avx512vp2intersect,-serialize,-tsxldtrk,-pconfig,-amx-bf16,-avx512fp16,-amx-tile,-amx-int8,-sse4a,-prfchw,-xop,-fma4,-tbm,-mwaitx,-xsaveopt,-xsavec,-xsaves,-clzero,-wbnoinvd,-avxvnni,-avx512bf16,-ptwrite,+sse2,+mmx,+fxsr,+64bit,+cx8", i32 1, i32 284} +!5 = !{!"sandybridge", !"+sahf,+avx,+xsave,+popcnt,+sse4.2,+sse4.1,+cx16,+ssse3,+pclmul,+sse3,-fma,-movbe,-aes,-f16c,-rdrnd,-fsgsbase,-bmi,-avx2,-bmi2,-rtm,-avx512f,-avx512dq,-rdseed,-adx,-avx512ifma,-clflushopt,-clwb,-avx512cd,-sha,-avx512bw,-avx512vl,-avx512vbmi,-pku,-waitpkg,-avx512vbmi2,-shstk,-gfni,-vaes,-vpclmulqdq,-avx512vnni,-avx512bitalg,-avx512vpopcntdq,-rdpid,-cldemote,-movdiri,-movdir64b,-enqcmd,-uintr,-avx512vp2intersect,-serialize,-tsxldtrk,-pconfig,-amx-bf16,-avx512fp16,-amx-tile,-amx-int8,-lzcnt,-sse4a,-prfchw,-xop,-fma4,-tbm,-mwaitx,-xsaveopt,-xsavec,-xsaves,-clzero,-wbnoinvd,-avxvnni,-avx512bf16,-ptwrite,+sse2,+mmx,+fxsr,+64bit,+cx8", i32 0, i32 1} +!6 = !{!"haswell", !"+lzcnt,+sahf,+bmi2,+avx2,+bmi,+fsgsbase,+f16c,+avx,+xsave,+popcnt,+movbe,+sse4.2,+sse4.1,+cx16,+fma,+ssse3,+pclmul,+sse3,-aes,-rdrnd,-rtm,-avx512f,-avx512dq,-rdseed,-adx,-avx512ifma,-clflushopt,-clwb,-avx512cd,-sha,-avx512bw,-avx512vl,-avx512vbmi,-pku,-waitpkg,-avx512vbmi2,-shstk,-gfni,-vaes,-vpclmulqdq,-avx512vnni,-avx512bitalg,-avx512vpopcntdq,-rdpid,-cldemote,-movdiri,-movdir64b,-enqcmd,-uintr,-avx512vp2intersect,-serialize,-tsxldtrk,-pconfig,-amx-bf16,-avx512fp16,-amx-tile,-amx-int8,-sse4a,-prfchw,-xop,-fma4,-tbm,-mwaitx,-xsaveopt,-xsavec,-xsaves,-clzero,-wbnoinvd,-avxvnni,-avx512bf16,-ptwrite,+sse2,+mmx,+fxsr,+64bit,+cx8", i32 1, i32 
24}