diff --git a/.github/workflows/build_test.yml b/.github/workflows/build_test.yml index da0a32f1f0..6a84ccbe3e 100644 --- a/.github/workflows/build_test.yml +++ b/.github/workflows/build_test.yml @@ -252,6 +252,14 @@ jobs: cxx_flags: -ftrapv cxx_standard: 23 + - name: Header only (GCC-14 C++23) + extra_deps: g++-14 + c_compiler: gcc-14 + cxx_compiler: g++-14 + cxx_flags: -ftrapv + cxx_standard: 23 + extra_cmake_flags: -DHWY_CMAKE_HEADER_ONLY=ON + steps: - name: Harden Runner uses: step-security/harden-runner@20cf305ff2072d973412fa9b1e3a4f227bda3c76 # v2.14.0 diff --git a/CMakeLists.txt b/CMakeLists.txt index 575ef13920..0c8a1d2f43 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -285,6 +285,7 @@ set(HWY_SOURCES hwy/x86_cpuid.h ) + if (NOT HWY_CMAKE_HEADER_ONLY) list(APPEND HWY_SOURCES hwy/abort.cc @@ -500,10 +501,6 @@ if (HWY_DISABLE_FUTEX) list(APPEND HWY_FLAGS -DHWY_DISABLE_FUTEX) endif() -if (HWY_CMAKE_HEADER_ONLY) - list(APPEND HWY_FLAGS -DHWY_HEADER_ONLY) -endif() - include(CheckIncludeFile) check_include_file(sys/auxv.h HAVE_SYS_AUXV_H) check_include_file(asm/hwcap.h HAVE_ASM_HWCAP_H) @@ -531,33 +528,78 @@ else() set(DLLEXPORT_TO_DEFINE "HWY_STATIC_DEFINE") endif() -add_library(hwy ${HWY_LIBRARY_TYPE} ${HWY_SOURCES}) -if(NOT HAVE_SYS_AUXV_H) - target_compile_definitions(hwy PUBLIC TOOLCHAIN_MISS_SYS_AUXV_H) -endif() -if(NOT HAVE_ASM_HWCAP_H) - target_compile_definitions(hwy PUBLIC TOOLCHAIN_MISS_ASM_HWCAP_H) -endif() -target_compile_definitions(hwy PUBLIC "${DLLEXPORT_TO_DEFINE}") -target_compile_options(hwy PRIVATE ${HWY_FLAGS}) -set_property(TARGET hwy PROPERTY POSITION_INDEPENDENT_CODE ON) -set_target_properties(hwy PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION}) -target_include_directories(hwy PUBLIC - $ - $) -target_compile_features(hwy PUBLIC cxx_std_11) -if (NOT HWY_CXX_STD_TGT_COMPILE_FEATURE STREQUAL "cxx_std_11") - target_compile_features(hwy PRIVATE ${HWY_CXX_STD_TGT_COMPILE_FEATURE}) -endif() -set_target_properties(hwy PROPERTIES - LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version) -# For GCC __atomic_store_8, see #887 -target_link_libraries(hwy PRIVATE ${ATOMICS_LIBRARIES}) -# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations) -if(UNIX AND NOT APPLE) - set_property(TARGET hwy APPEND_STRING PROPERTY - LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version") -endif() +if (NOT HWY_CMAKE_HEADER_ONLY) + add_library(hwy ${HWY_LIBRARY_TYPE} ${HWY_SOURCES}) + if (NOT HAVE_SYS_AUXV_H) + target_compile_definitions(hwy PUBLIC TOOLCHAIN_MISS_SYS_AUXV_H) + endif () + if (NOT HAVE_ASM_HWCAP_H) + target_compile_definitions(hwy PUBLIC TOOLCHAIN_MISS_ASM_HWCAP_H) + endif () + target_compile_definitions(hwy PUBLIC "${DLLEXPORT_TO_DEFINE}") + target_compile_definitions(hwy PUBLIC -DHWY_HEADER_ONLY=0) + target_compile_options(hwy PRIVATE ${HWY_FLAGS}) + set_property(TARGET hwy PROPERTY POSITION_INDEPENDENT_CODE ON) + set_target_properties(hwy PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION}) + target_include_directories(hwy PUBLIC + $ + $) + target_compile_features(hwy PUBLIC cxx_std_11) + if (NOT HWY_CXX_STD_TGT_COMPILE_FEATURE STREQUAL "cxx_std_11") + target_compile_features(hwy PRIVATE ${HWY_CXX_STD_TGT_COMPILE_FEATURE}) + endif () + set_target_properties(hwy PROPERTIES + LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version) + # For GCC __atomic_store_8, see #887 + target_link_libraries(hwy PRIVATE ${ATOMICS_LIBRARIES}) + # not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations) + if (UNIX AND NOT APPLE) + set_property(TARGET hwy APPEND_STRING PROPERTY + LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version") + endif () +else() + # to uncomment this, finally +# add_library(hwy INTERFACE) +# target_compile_definitions(hwy INTERFACE -DHWY_HEADER_ONLY=1) +# if (HWY_WARNINGS_ARE_ERRORS) +# target_include_directories(hwy INTERFACE +# $ +# $) +# else() +# target_include_directories(hwy SYSTEM INTERFACE +# $ +# $) +# endif () +# target_link_libraries(hwy INTERFACE ${ATOMICS_LIBRARIES}) + + add_library(hwy STATIC) + target_compile_features(hwy PUBLIC cxx_std_11) + if (NOT HWY_CXX_STD_TGT_COMPILE_FEATURE STREQUAL "cxx_std_11") + target_compile_features(hwy PRIVATE ${HWY_CXX_STD_TGT_COMPILE_FEATURE}) + endif () + target_compile_definitions(hwy PUBLIC -DHWY_HEADER_ONLY=1) + if (HWY_WARNINGS_ARE_ERRORS) + target_include_directories(hwy PUBLIC + $ + $) + else () + target_include_directories(hwy SYSTEM PUBLIC + $ + $) + endif () + target_link_libraries(hwy PRIVATE ${ATOMICS_LIBRARIES}) + target_sources(hwy PRIVATE +# hwy/abort.cc +# hwy/aligned_allocator.cc +# hwy/nanobenchmark.cc + hwy/per_target.cc +# hwy/perf_counters.cc +# hwy/print.cc +# hwy/profiler.cc +# hwy/targets.cc +# hwy/timer.cc + ) +endif () if (HWY_ENABLE_CONTRIB) add_library(hwy_contrib ${HWY_LIBRARY_TYPE} ${HWY_CONTRIB_SOURCES}) diff --git a/hwy/abort.cc b/hwy/abort.cc index a67819bbd3..0cf8508687 100644 --- a/hwy/abort.cc +++ b/hwy/abort.cc @@ -20,19 +20,19 @@ namespace hwy { -namespace { +namespace detail { -std::atomic& AtomicWarnFunc() { +HWY_HEADER_ONLY_FUNC std::atomic& AtomicWarnFunc() { static std::atomic func; return func; } -std::atomic& AtomicAbortFunc() { +HWY_HEADER_ONLY_FUNC std::atomic& AtomicAbortFunc() { static std::atomic func; return func; } -std::string GetBaseName(std::string const& file_name) { +HWY_HEADER_ONLY_FUNC std::string GetBaseName(std::string const& file_name) { auto last_slash = file_name.find_last_of("/\\"); return file_name.substr(last_slash + 1); } @@ -43,27 +43,29 @@ std::string GetBaseName(std::string const& file_name) { // is required to safely implement `SetWarnFunc`. As a workaround, we store a // copy here, update it when called, and return a reference to the copy. This // has the added benefit of protecting the actual pointer from modification. -HWY_DLLEXPORT WarnFunc& GetWarnFunc() { +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT WarnFunc& GetWarnFunc() { static WarnFunc func; - func = AtomicWarnFunc().load(); + func = detail::AtomicWarnFunc().load(); return func; } -HWY_DLLEXPORT AbortFunc& GetAbortFunc() { +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT AbortFunc& GetAbortFunc() { static AbortFunc func; - func = AtomicAbortFunc().load(); + func = detail::AtomicAbortFunc().load(); return func; } +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT WarnFunc SetWarnFunc(WarnFunc func) { - return AtomicWarnFunc().exchange(func); + return detail::AtomicWarnFunc().exchange(func); } +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT AbortFunc SetAbortFunc(AbortFunc func) { - return AtomicAbortFunc().exchange(func); + return detail::AtomicAbortFunc().exchange(func); } -HWY_DLLEXPORT void HWY_FORMAT(3, 4) +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT void HWY_FORMAT(3, 4) Warn(const char* file, int line, const char* format, ...) { char buf[800]; va_list args; @@ -71,15 +73,15 @@ HWY_DLLEXPORT void HWY_FORMAT(3, 4) vsnprintf(buf, sizeof(buf), format, args); va_end(args); - WarnFunc handler = AtomicWarnFunc().load(); + WarnFunc handler = detail::AtomicWarnFunc().load(); if (handler != nullptr) { handler(file, line, buf); } else { - fprintf(stderr, "Warn at %s:%d: %s\n", GetBaseName(file).data(), line, buf); + fprintf(stderr, "Warn at %s:%d: %s\n", detail::GetBaseName(file).data(), line, buf); } } -HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4) +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4) Abort(const char* file, int line, const char* format, ...) { char buf[800]; va_list args; @@ -87,11 +89,11 @@ HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4) vsnprintf(buf, sizeof(buf), format, args); va_end(args); - AbortFunc handler = AtomicAbortFunc().load(); + AbortFunc handler = detail::AtomicAbortFunc().load(); if (handler != nullptr) { handler(file, line, buf); } else { - fprintf(stderr, "Abort at %s:%d: %s\n", GetBaseName(file).data(), line, + fprintf(stderr, "Abort at %s:%d: %s\n", detail::GetBaseName(file).data(), line, buf); } diff --git a/hwy/aligned_allocator.cc b/hwy/aligned_allocator.cc index e857b2288f..5d8acc70bc 100644 --- a/hwy/aligned_allocator.cc +++ b/hwy/aligned_allocator.cc @@ -25,7 +25,7 @@ #include "hwy/base.h" namespace hwy { -namespace { +namespace detail { #if HWY_ARCH_RISCV && defined(__riscv_v_intrinsic) && \ __riscv_v_intrinsic >= 11000 @@ -54,6 +54,7 @@ struct AllocationHeader { #pragma pack(pop) // Returns a 'random' (cyclical) offset for AllocateAlignedBytes. +HWY_HEADER_ONLY_FUNC size_t NextAlignedOffset() { static std::atomic next{0}; static_assert(kAlias % kAlignment == 0, "kAlias must be a multiple"); @@ -66,8 +67,10 @@ size_t NextAlignedOffset() { } // namespace +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT void* AllocateAlignedBytes(const size_t payload_size, AllocPtr alloc_ptr, void* opaque_ptr) { + using namespace hwy::detail; HWY_ASSERT(payload_size != 0); // likely a bug in caller if (payload_size >= std::numeric_limits::max() / 2) { HWY_DASSERT(false && "payload_size too large"); @@ -114,8 +117,10 @@ HWY_DLLEXPORT void* AllocateAlignedBytes(const size_t payload_size, return HWY_ASSUME_ALIGNED(reinterpret_cast(payload), kAlignment); } +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT void FreeAlignedBytes(const void* aligned_pointer, FreePtr free_ptr, void* opaque_ptr) { + using namespace hwy::detail; if (aligned_pointer == nullptr) return; const uintptr_t payload = reinterpret_cast(aligned_pointer); @@ -131,10 +136,12 @@ HWY_DLLEXPORT void FreeAlignedBytes(const void* aligned_pointer, } // static +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT void AlignedDeleter::DeleteAlignedArray(void* aligned_pointer, FreePtr free_ptr, void* opaque_ptr, ArrayDeleter deleter) { + using namespace hwy::detail; if (aligned_pointer == nullptr) return; const uintptr_t payload = reinterpret_cast(aligned_pointer); diff --git a/hwy/aligned_allocator.h b/hwy/aligned_allocator.h index 76c7156377..365f7037a8 100644 --- a/hwy/aligned_allocator.h +++ b/hwy/aligned_allocator.h @@ -424,4 +424,8 @@ class AlignedNDArray { }; } // namespace hwy + +#if HWY_HEADER_ONLY +#include "hwy/aligned_allocator.cc" +#endif // HWY_HEADER_ONLY #endif // HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_ diff --git a/hwy/auto_tune.h b/hwy/auto_tune.h index 72d154e5ad..4f5328db98 100644 --- a/hwy/auto_tune.h +++ b/hwy/auto_tune.h @@ -28,7 +28,7 @@ // configuration to allow auto_tune to use std::sort instead of VQSort // (also enabled in header only mode). -#if defined(HWY_HEADER_ONLY) +#if HWY_HEADER_ONLY #define HWY_AUTOTUNE_STDSORT #endif diff --git a/hwy/base.h b/hwy/base.h index ef920cf88a..f6f8cd1e81 100644 --- a/hwy/base.h +++ b/hwy/base.h @@ -21,9 +21,13 @@ // IWYU pragma: begin_exports #include #include -#if defined(HWY_HEADER_ONLY) +#if HWY_HEADER_ONLY #include #include + +#define HWY_HEADER_ONLY_FUNC inline +#else +#define HWY_HEADER_ONLY_FUNC #endif #if !defined(HWY_NO_LIBCXX) @@ -284,38 +288,6 @@ namespace hwy { //------------------------------------------------------------------------------ // Abort / Warn -#if defined(HWY_HEADER_ONLY) -HWY_DLLEXPORT inline void HWY_FORMAT(3, 4) - Warn(const char* file, int line, const char* format, ...) { - char buf[800]; - va_list args; - va_start(args, format); - vsnprintf(buf, sizeof(buf), format, args); - va_end(args); - - fprintf(stderr, "Warn at %s:%d: %s\n", file, line, buf); -} - -HWY_DLLEXPORT HWY_NORETURN inline void HWY_FORMAT(3, 4) - Abort(const char* file, int line, const char* format, ...) { - char buf[800]; - va_list args; - va_start(args, format); - vsnprintf(buf, sizeof(buf), format, args); - va_end(args); - - fprintf(stderr, "Abort at %s:%d: %s\n", file, line, buf); - - fflush(stderr); - -// Now terminate the program: -#if HWY_ARCH_RISCV - exit(1); // trap/abort just freeze Spike. -#else - abort(); // Compile error without this due to HWY_NORETURN. -#endif -} -#else // !HWY_HEADER_ONLY // Interfaces for custom Warn/Abort handlers. typedef void (*WarnFunc)(const char* file, int line, const char* message); @@ -350,8 +322,6 @@ HWY_DLLEXPORT void HWY_FORMAT(3, 4) HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4) Abort(const char* file, int line, const char* format, ...); -#endif // HWY_HEADER_ONLY - #define HWY_WARN(format, ...) \ ::hwy::Warn(__FILE__, __LINE__, format, ##__VA_ARGS__) @@ -3290,4 +3260,7 @@ HWY_API void PreventElision(T&& output) { } // namespace hwy +#if HWY_HEADER_ONLY +#include "hwy/abort.cc" +#endif // HWY_HEADER_ONLY #endif // HIGHWAY_HWY_BASE_H_ diff --git a/hwy/nanobenchmark.cc b/hwy/nanobenchmark.cc index 0a885d2d09..77f0764415 100644 --- a/hwy/nanobenchmark.cc +++ b/hwy/nanobenchmark.cc @@ -24,12 +24,11 @@ #include #include -#include "hwy/base.h" #include "hwy/robust_statistics.h" -#include "hwy/timer.h" namespace hwy { -namespace { +namespace detail { +HWY_HEADER_ONLY_FUNC const timer::Ticks& GetTimerResolution() { static const timer::Ticks timer_resolution = platform::TimerResolution(); return timer_resolution; @@ -107,6 +106,7 @@ timer::Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad, using InputVec = std::vector; // Returns vector of unique input values. +HWY_HEADER_ONLY_FUNC InputVec UniqueInputs(const FuncInput* inputs, const size_t num_inputs) { InputVec unique(inputs, inputs + num_inputs); std::sort(unique.begin(), unique.end()); @@ -115,6 +115,7 @@ InputVec UniqueInputs(const FuncInput* inputs, const size_t num_inputs) { } // Returns how often we need to call func for sufficient precision. +HWY_HEADER_ONLY_FUNC size_t NumSkip(const Func func, const uint8_t* arg, const InputVec& unique, const Params& p) { // Min elapsed ticks for any input. @@ -144,6 +145,7 @@ size_t NumSkip(const Func func, const uint8_t* arg, const InputVec& unique, } // Replicates inputs until we can omit "num_skip" occurrences of an input. +HWY_HEADER_ONLY_FUNC InputVec ReplicateInputs(const FuncInput* inputs, const size_t num_inputs, const size_t num_unique, const size_t num_skip, const Params& p) { @@ -164,6 +166,7 @@ InputVec ReplicateInputs(const FuncInput* inputs, const size_t num_inputs, // Copies the "full" to "subset" in the same order, but with "num_skip" // randomly selected occurrences of "input_to_skip" removed. +HWY_HEADER_ONLY_FUNC void FillSubset(const InputVec& full, const FuncInput input_to_skip, const size_t num_skip, InputVec* subset) { const size_t count = @@ -203,6 +206,7 @@ void FillSubset(const InputVec& full, const FuncInput input_to_skip, } // Returns total ticks elapsed for all inputs. +HWY_HEADER_ONLY_FUNC timer::Ticks TotalDuration(const Func func, const uint8_t* arg, const InputVec* inputs, const Params& p, double* max_rel_mad) { @@ -218,12 +222,13 @@ timer::Ticks TotalDuration(const Func func, const uint8_t* arg, } // (Nearly) empty Func for measuring timer overhead/resolution. -HWY_NOINLINE FuncOutput EmptyFunc(const void* /*arg*/, const FuncInput input) { +static HWY_NOINLINE FuncOutput EmptyFunc(const void* /*arg*/, const FuncInput input) { return input; } // Returns overhead of accessing inputs[] and calling a function; this will // be deducted from future TotalDuration return values. +HWY_HEADER_ONLY_FUNC timer::Ticks Overhead(const uint8_t* arg, const InputVec* inputs, const Params& p) { double rel_mad; @@ -235,10 +240,12 @@ timer::Ticks Overhead(const uint8_t* arg, const InputVec* inputs, }); } -} // namespace +} // namespace detail +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT int Unpredictable1() { return timer::Start() != ~0ULL; } +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs, const size_t num_inputs, Result* results, const Params& p) { @@ -250,19 +257,19 @@ HWY_DLLEXPORT size_t Measure(const Func func, const uint8_t* arg, return 0; } - const InputVec& unique = UniqueInputs(inputs, num_inputs); + const detail::InputVec& unique = detail::UniqueInputs(inputs, num_inputs); - const size_t num_skip = NumSkip(func, arg, unique, p); // never 0 + const size_t num_skip = detail::NumSkip(func, arg, unique, p); // never 0 if (num_skip == 0) return 0; // NumSkip already printed error message // (slightly less work on x86 to cast from signed integer) const float mul = 1.0f / static_cast(static_cast(num_skip)); - const InputVec& full = - ReplicateInputs(inputs, num_inputs, unique.size(), num_skip, p); - InputVec subset(full.size() - num_skip); + const detail::InputVec& full = + detail::ReplicateInputs(inputs, num_inputs, unique.size(), num_skip, p); + detail::InputVec subset(full.size() - num_skip); - const timer::Ticks overhead = Overhead(arg, &full, p); - const timer::Ticks overhead_skip = Overhead(arg, &subset, p); + const timer::Ticks overhead = detail::Overhead(arg, &full, p); + const timer::Ticks overhead_skip = detail::Overhead(arg, &subset, p); if (overhead < overhead_skip) { HWY_WARN("Measurement failed: overhead %d < %d\n", static_cast(overhead), static_cast(overhead_skip)); @@ -276,12 +283,12 @@ HWY_DLLEXPORT size_t Measure(const Func func, const uint8_t* arg, } double max_rel_mad = 0.0; - const timer::Ticks total = TotalDuration(func, arg, &full, p, &max_rel_mad); + const timer::Ticks total = detail::TotalDuration(func, arg, &full, p, &max_rel_mad); for (size_t i = 0; i < unique.size(); ++i) { - FillSubset(full, unique[i], num_skip, &subset); + detail::FillSubset(full, unique[i], num_skip, &subset); const timer::Ticks total_skip = - TotalDuration(func, arg, &subset, p, &max_rel_mad); + detail::TotalDuration(func, arg, &subset, p, &max_rel_mad); if (total < total_skip) { HWY_WARN("Measurement failed: total %f < %f\n", diff --git a/hwy/nanobenchmark.h b/hwy/nanobenchmark.h index 5ca03aca84..c0ef9248d5 100644 --- a/hwy/nanobenchmark.h +++ b/hwy/nanobenchmark.h @@ -150,4 +150,7 @@ static inline size_t MeasureClosure(const Closure& closure, } // namespace hwy +#if HWY_HEADER_ONLY +#include "hwy/nanobenchmark.cc" +#endif #endif // HIGHWAY_HWY_NANOBENCHMARK_H_ diff --git a/hwy/perf_counters.cc b/hwy/perf_counters.cc index 4cad466d67..c838a1ed9e 100644 --- a/hwy/perf_counters.cc +++ b/hwy/perf_counters.cc @@ -46,8 +46,9 @@ namespace platform { #if HWY_OS_LINUX || HWY_IDE -namespace { +namespace detail { +HWY_HEADER_ONLY_FUNC bool PerfCountersSupported() { // This is the documented way. struct stat s; @@ -57,6 +58,7 @@ bool PerfCountersSupported() { // If we detect Linux < 6.9 and AMD EPYC, use cycles instead of ref-cycles // because the latter is not supported and returns 0, see // https://lwn.net/Articles/967791/. +HWY_HEADER_ONLY_FUNC uint64_t RefCyclesOrCycles() { const uint32_t ref_cycles = PERF_COUNT_HW_REF_CPU_CYCLES; @@ -81,6 +83,7 @@ struct CounterConfig { // for perf_event_open PerfCounters::Counter c; }; +HWY_HEADER_ONLY_FUNC std::vector AllCounterConfigs() { constexpr uint32_t kHW = PERF_TYPE_HARDWARE; constexpr uint32_t kSW = PERF_TYPE_SOFTWARE; @@ -107,6 +110,7 @@ std::vector AllCounterConfigs() { {PERF_COUNT_HW_CACHE_MISSES, kHW, PerfCounters::kCacheMisses}}; } +HWY_HEADER_ONLY_FUNC size_t& PackedIdx(PerfCounters::Counter c) { static size_t packed_idx[64]; return packed_idx[static_cast(c)]; @@ -342,6 +346,7 @@ class PMU { }; // Monostate, see header. +HWY_HEADER_ONLY_FUNC PMU& GetPMU() { static PMU& pmu = *new PMU(); // avoids exit-dtor warning (no dtor required) return pmu; @@ -349,27 +354,37 @@ PMU& GetPMU() { } // namespace -HWY_DLLEXPORT bool PerfCounters::Init() { return GetPMU().Init(); } -HWY_DLLEXPORT bool PerfCounters::StartAll() { return GetPMU().StartAll(); } +HWY_HEADER_ONLY_FUNC +HWY_DLLEXPORT bool PerfCounters::Init() { return detail::GetPMU().Init(); } +HWY_HEADER_ONLY_FUNC +HWY_DLLEXPORT bool PerfCounters::StartAll() { return detail::GetPMU().StartAll(); } +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT void PerfCounters::StopAllAndReset() { - GetPMU().StopAllAndReset(); + detail::GetPMU().StopAllAndReset(); } +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT PerfCounters::PerfCounters() { - if (HWY_UNLIKELY(!GetPMU().Read(valid_, max_extrapolate_, values_))) { + if (HWY_UNLIKELY(!detail::GetPMU().Read(valid_, max_extrapolate_, values_))) { valid_ = BitSet64(); max_extrapolate_ = 0.0; hwy::ZeroBytes(values_, sizeof(values_)); } } +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT size_t PerfCounters::IndexForCounter(Counter c) { - return PackedIdx(c); + return detail::PackedIdx(c); } #else +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT bool PerfCounters::Init() { return false; } +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT bool PerfCounters::StartAll() { return false; } +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT void PerfCounters::StopAllAndReset() {} +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT PerfCounters::PerfCounters() : max_extrapolate_(1.0), values_{0.0} {} +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT size_t PerfCounters::IndexForCounter(Counter) { return 0; } #endif // HWY_OS_LINUX || HWY_IDE diff --git a/hwy/perf_counters.h b/hwy/perf_counters.h index 7485116247..1fc40e8abd 100644 --- a/hwy/perf_counters.h +++ b/hwy/perf_counters.h @@ -153,4 +153,7 @@ class PerfCounters { } // namespace platform } // namespace hwy +#if HWY_HEADER_ONLY +#include "hwy/perf_counters.cc" +#endif #endif // HIGHWAY_HWY_PERF_COUNTERS_H_ diff --git a/hwy/print.cc b/hwy/print.cc index cea41042b6..ca74674ed2 100644 --- a/hwy/print.cc +++ b/hwy/print.cc @@ -23,6 +23,7 @@ namespace hwy { namespace detail { +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT void TypeName(const TypeInfo& info, size_t N, char* string100) { const char prefix = info.is_float ? 'f' : (info.is_signed ? 'i' : 'u'); // Omit the xN suffix for scalars. @@ -39,6 +40,7 @@ HWY_DLLEXPORT void TypeName(const TypeInfo& info, size_t N, char* string100) { // The NOLINT are to suppress the warning about passing 100 instead of // `sizeof(string100)`, which is a pointer. +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT void ToString(const TypeInfo& info, const void* ptr, char* string100) { if (info.sizeof_t == 1) { @@ -111,6 +113,7 @@ HWY_DLLEXPORT void ToString(const TypeInfo& info, const void* ptr, } } +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT void PrintArray(const TypeInfo& info, const char* caption, const void* array_void, size_t N, size_t lane_u, size_t max_lanes) { diff --git a/hwy/print.h b/hwy/print.h index e61631e650..e6c8b5e849 100644 --- a/hwy/print.h +++ b/hwy/print.h @@ -72,4 +72,7 @@ HWY_NOINLINE void PrintArray(const T* value, size_t count) { } // namespace hwy +#if HWY_HEADER_ONLY +#include "hwy/print.cc" +#endif #endif // HWY_PRINT_H_ diff --git a/hwy/profiler.cc b/hwy/profiler.cc index 8855d48366..253c856b17 100644 --- a/hwy/profiler.cc +++ b/hwy/profiler.cc @@ -33,14 +33,12 @@ namespace hwy { #if PROFILER_ENABLED -static constexpr bool kPrintOverhead = true; - -// Must zero-init because `ThreadFunc` calls `SetGlobalIdx()` potentially after -// this is first used in the `pool::Worker` ctor. -/*static*/ thread_local size_t Profiler::s_global_idx = 0; +namespace detail { +HWY_INLINE_VAR constexpr bool kPrintOverhead = true; // Detects duration of a zero-length zone: timer plus packet overhead. -static uint64_t DetectSelfOverhead(Profiler& profiler, size_t global_idx) { +HWY_HEADER_ONLY_FUNC +uint64_t DetectSelfOverhead(Profiler& profiler, size_t global_idx) { static const profiler::ZoneHandle zone = profiler.AddZone("DetectSelf"); profiler::Results results; const size_t kNumSamples = 25; @@ -56,7 +54,7 @@ static uint64_t DetectSelfOverhead(Profiler& profiler, size_t global_idx) { } durations[idx_duration] = static_cast(profiler.GetFirstDurationAndReset(global_idx)); - } + } samples[idx_sample] = robust_statistics::Mode(durations, kNumDurations); } return robust_statistics::Mode(samples, kNumSamples); @@ -65,7 +63,8 @@ static uint64_t DetectSelfOverhead(Profiler& profiler, size_t global_idx) { // Detects average duration of a zero-length zone, after deducting self // overhead. This accounts for the delay before/after capturing start/end // timestamps, for example due to fence instructions in timer::Start/Stop. -static uint64_t DetectChildOverhead(Profiler& profiler, size_t global_idx, +HWY_HEADER_ONLY_FUNC +uint64_t DetectChildOverhead(Profiler& profiler, size_t global_idx, uint64_t self_overhead) { static const profiler::ZoneHandle zone = profiler.AddZone("DetectChild"); // Enough for stable measurements, but only about 50 ms startup cost. @@ -100,7 +99,18 @@ static uint64_t DetectChildOverhead(Profiler& profiler, size_t global_idx, } return num_samples == 0 ? 0 : robust_statistics::Mode(samples, num_samples); } +// Must zero-init because `ThreadFunc` calls `SetGlobalIdx()` potentially after +// this is first used in the `pool::Worker` ctor. +HWY_HEADER_ONLY_FUNC +size_t& GetProfilerGlobalIdx() { + thread_local size_t s_global_idx = 0; + return s_global_idx; +} + +} // namespace detail + +HWY_HEADER_ONLY_FUNC Profiler::Profiler() { const uint64_t t0 = timer::Start(); @@ -120,13 +130,13 @@ Profiler::Profiler() { profiler::Overheads overheads; // WARNING: must pass in `*this` and use `PROFILER_ZONE3` to avoid calling // `Profiler::Get()`, because that would re-enter the magic static init. - overheads.self = DetectSelfOverhead(*this, kMain); - overheads.child = DetectChildOverhead(*this, kMain, overheads.self); + overheads.self = detail::DetectSelfOverhead(*this, kMain); + overheads.child = detail::DetectChildOverhead(*this, kMain, overheads.self); for (size_t worker = 0; worker < profiler::kMaxWorkers; ++worker) { workers_[worker].SetOverheads(overheads); } - HWY_IF_CONSTEXPR(kPrintOverhead) { + HWY_IF_CONSTEXPR(detail::kPrintOverhead) { printf("Self overhead: %.0f; child: %.0f; elapsed %.1f ms\n", static_cast(overheads.self), static_cast(overheads.child), @@ -135,9 +145,17 @@ Profiler::Profiler() { } } +HWY_HEADER_ONLY_FUNC +size_t Profiler::Thread() { return detail::GetProfilerGlobalIdx(); } +HWY_HEADER_ONLY_FUNC +size_t Profiler::GlobalIdx() { return detail::GetProfilerGlobalIdx(); } +HWY_HEADER_ONLY_FUNC +void Profiler::SetGlobalIdx(size_t global_idx) { detail::GetProfilerGlobalIdx() = global_idx; } + #endif // PROFILER_ENABLED // Even if disabled, we want to export the symbol. +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT Profiler& Profiler::Get() { static Profiler* profiler = new Profiler(); return *profiler; diff --git a/hwy/profiler.h b/hwy/profiler.h index 10ad48a85a..9ce7262672 100644 --- a/hwy/profiler.h +++ b/hwy/profiler.h @@ -613,11 +613,11 @@ class Profiler { // global_idx from `ThreadPool::Run` (if constructed with non-default // `PoolWorkerMapping`) to `PROFILER_ZONE2/PROFILER_ZONE3`. // DEPRECATED: use `GlobalIdx` instead. - static size_t Thread() { return s_global_idx; } - static size_t GlobalIdx() { return s_global_idx; } + static size_t Thread(); + static size_t GlobalIdx(); // Must be called from all worker threads, and once also on the main thread, // before any use of `PROFILER_ZONE/PROFILER_FUNC`. - static void SetGlobalIdx(size_t global_idx) { s_global_idx = global_idx; } + static void SetGlobalIdx(size_t global_idx); void ReserveWorker(size_t global_idx) { HWY_ASSERT(!workers_reserved_.Get(global_idx)); @@ -733,8 +733,6 @@ class Profiler { }); } - static thread_local size_t s_global_idx; - // These are atomic because `ThreadFunc` reserves its slot(s) and even // `ThreadPool::ThreadPool` may be called concurrently. Both have bit `i` set // between calls to `Reserve*(i)` and `Free*(i)`. They are consulted in @@ -870,4 +868,7 @@ struct Zone { #define PROFILER_END_ROOT_RUN() hwy::Profiler::Get().EndRootRun() #define PROFILER_PRINT_RESULTS() hwy::Profiler::Get().PrintResults() +#if HWY_HEADER_ONLY +#include "hwy/profiler.cc" +#endif #endif // HIGHWAY_HWY_PROFILER_H_ diff --git a/hwy/targets.cc b/hwy/targets.cc index 58025dc677..250f0651c9 100644 --- a/hwy/targets.cc +++ b/hwy/targets.cc @@ -49,16 +49,18 @@ namespace hwy { #if HWY_OS_APPLE -static HWY_INLINE HWY_MAYBE_UNUSED bool HasCpuFeature( - const char* feature_name) { +namespace detail { +// HWY_HEADER_ONLY_FUNC +HWY_INLINE HWY_MAYBE_UNUSED bool HasCpuFeature(const char* feature_name) { int result = 0; size_t len = sizeof(int); return (sysctlbyname(feature_name, &result, &len, nullptr, 0) == 0 && result != 0); } -static HWY_INLINE HWY_MAYBE_UNUSED bool ParseU32(const char*& ptr, - uint32_t& parsed_val) { +// HWY_HEADER_ONLY_FUNC +HWY_INLINE HWY_MAYBE_UNUSED bool ParseU32(const char*& ptr, + uint32_t& parsed_val) { uint64_t parsed_u64 = 0; const char* start_ptr = ptr; @@ -79,7 +81,8 @@ static HWY_INLINE HWY_MAYBE_UNUSED bool ParseU32(const char*& ptr, return (ptr != start_ptr); } -static HWY_INLINE HWY_MAYBE_UNUSED bool IsMacOs12_2OrLater() { +// HWY_HEADER_ONLY_FUNC +HWY_INLINE HWY_MAYBE_UNUSED bool IsMacOs12_2OrLater() { utsname uname_buf; ZeroBytes(&uname_buf, sizeof(utsname)); @@ -111,6 +114,7 @@ static HWY_INLINE HWY_MAYBE_UNUSED bool IsMacOs12_2OrLater() { // or later return (major > 21 || (major == 21 && minor >= 3)); } +} // namespace detail #endif // HWY_OS_APPLE #if HWY_ARCH_X86 && HWY_HAVE_RUNTIME_DISPATCH @@ -118,7 +122,8 @@ namespace x86 { // Returns the lower 32 bits of extended control register 0. // Requires CPU support for "OSXSAVE" (see below). -static uint32_t ReadXCR0() { +HWY_HEADER_ONLY_FUNC +uint32_t ReadXCR0() { #if HWY_COMPILER_MSVC return static_cast(_xgetbv(0)); #else // HWY_COMPILER_MSVC @@ -177,12 +182,13 @@ enum class FeatureIndex : uint32_t { static_assert(static_cast(FeatureIndex::kSentinel) < 64, "Too many bits for u64"); -static HWY_INLINE constexpr uint64_t Bit(FeatureIndex index) { +HWY_INLINE constexpr uint64_t Bit(FeatureIndex index) { return 1ull << static_cast(index); } // Returns bit array of FeatureIndex from CPUID feature flags. -static uint64_t FlagsFromCPUID() { +HWY_HEADER_ONLY_FUNC +uint64_t FlagsFromCPUID() { uint64_t flags = 0; // return value uint32_t abcd[4]; Cpuid(0, 0, abcd); @@ -240,17 +246,17 @@ static uint64_t FlagsFromCPUID() { } // Each Highway target requires a 'group' of multiple features/flags. -static constexpr uint64_t kGroupSSE2 = +HWY_INLINE_VAR constexpr uint64_t kGroupSSE2 = Bit(FeatureIndex::kSSE) | Bit(FeatureIndex::kSSE2); -static constexpr uint64_t kGroupSSSE3 = +HWY_INLINE_VAR constexpr uint64_t kGroupSSSE3 = Bit(FeatureIndex::kSSE3) | Bit(FeatureIndex::kSSSE3) | kGroupSSE2; #ifdef HWY_DISABLE_PCLMUL_AES -static constexpr uint64_t kGroupSSE4 = +HWY_INLINE_VAR constexpr uint64_t kGroupSSE4 = Bit(FeatureIndex::kSSE41) | Bit(FeatureIndex::kSSE42) | kGroupSSSE3; #else -static constexpr uint64_t kGroupSSE4 = +HWY_INLINE_VAR constexpr uint64_t kGroupSSE4 = Bit(FeatureIndex::kSSE41) | Bit(FeatureIndex::kSSE42) | Bit(FeatureIndex::kCLMUL) | Bit(FeatureIndex::kAES) | kGroupSSSE3; #endif // HWY_DISABLE_PCLMUL_AES @@ -260,46 +266,47 @@ static constexpr uint64_t kGroupSSE4 = // [https://www.virtualbox.org/ticket/15471]. Thus we provide the option of // avoiding using and requiring these so AVX2 can still be used. #ifdef HWY_DISABLE_BMI2_FMA -static constexpr uint64_t kGroupBMI2_FMA = 0; +HWY_INLINE_VAR constexpr uint64_t kGroupBMI2_FMA = 0; #else -static constexpr uint64_t kGroupBMI2_FMA = Bit(FeatureIndex::kBMI) | - Bit(FeatureIndex::kBMI2) | - Bit(FeatureIndex::kFMA); +HWY_INLINE_VAR constexpr uint64_t kGroupBMI2_FMA = Bit(FeatureIndex::kBMI) | + Bit(FeatureIndex::kBMI2) | + Bit(FeatureIndex::kFMA); #endif #ifdef HWY_DISABLE_F16C -static constexpr uint64_t kGroupF16C = 0; +HWY_INLINE_VAR constexpr uint64_t kGroupF16C = 0; #else -static constexpr uint64_t kGroupF16C = Bit(FeatureIndex::kF16C); +HWY_INLINE_VAR constexpr uint64_t kGroupF16C = Bit(FeatureIndex::kF16C); #endif -static constexpr uint64_t kGroupAVX2 = +HWY_INLINE_VAR constexpr uint64_t kGroupAVX2 = Bit(FeatureIndex::kAVX) | Bit(FeatureIndex::kAVX2) | Bit(FeatureIndex::kLZCNT) | kGroupBMI2_FMA | kGroupF16C | kGroupSSE4; -static constexpr uint64_t kGroupAVX3 = +HWY_INLINE_VAR constexpr uint64_t kGroupAVX3 = Bit(FeatureIndex::kAVX512F) | Bit(FeatureIndex::kAVX512VL) | Bit(FeatureIndex::kAVX512DQ) | Bit(FeatureIndex::kAVX512BW) | Bit(FeatureIndex::kAVX512CD) | kGroupAVX2; -static constexpr uint64_t kGroupAVX3_DL = +HWY_INLINE_VAR constexpr uint64_t kGroupAVX3_DL = Bit(FeatureIndex::kVNNI) | Bit(FeatureIndex::kVPCLMULQDQ) | Bit(FeatureIndex::kVBMI) | Bit(FeatureIndex::kVBMI2) | Bit(FeatureIndex::kVAES) | Bit(FeatureIndex::kPOPCNTDQ) | Bit(FeatureIndex::kBITALG) | Bit(FeatureIndex::kGFNI) | kGroupAVX3; -static constexpr uint64_t kGroupAVX3_ZEN4 = +HWY_INLINE_VAR constexpr uint64_t kGroupAVX3_ZEN4 = Bit(FeatureIndex::kAVX512BF16) | kGroupAVX3_DL; -static constexpr uint64_t kGroupAVX3_SPR = +HWY_INLINE_VAR constexpr uint64_t kGroupAVX3_SPR = Bit(FeatureIndex::kAVX512FP16) | kGroupAVX3_ZEN4; -static constexpr uint64_t kGroupAVX10 = +HWY_INLINE_VAR constexpr uint64_t kGroupAVX10 = Bit(FeatureIndex::kAVX10) | Bit(FeatureIndex::kAPX) | Bit(FeatureIndex::kVPCLMULQDQ) | Bit(FeatureIndex::kVAES) | Bit(FeatureIndex::kGFNI) | kGroupAVX2; -static int64_t DetectTargets() { +HWY_HEADER_ONLY_FUNC +int64_t DetectTargets() { int64_t bits = 0; // return value of supported targets. HWY_IF_CONSTEXPR(HWY_ARCH_X86_64) { bits |= HWY_SSE2; // always present in x64 @@ -405,7 +412,8 @@ static int64_t DetectTargets() { // HasCpuFeature("hw.optional.avx512f") avoids false negative results // on x86_64 CPU's that have AVX3 support. const bool have_avx3_xsave_support = - IsMacOs12_2OrLater() && HasCpuFeature("hw.optional.avx512f"); + detail::IsMacOs12_2OrLater() && + detail::HasCpuFeature("hw.optional.avx512f"); #endif const uint32_t xcr0 = ReadXCR0(); @@ -450,7 +458,8 @@ namespace arm { (HWY_COMPILER_GCC || HWY_COMPILER_CLANG) && \ ((HWY_TARGETS & HWY_ALL_SVE) != 0) HWY_PUSH_ATTRIBUTES("+sve") -static int64_t DetectAdditionalSveTargets(int64_t detected_targets) { +HWY_HEADER_ONLY_FUNC +int64_t DetectAdditionalSveTargets(int64_t detected_targets) { uint64_t sve_vec_len; // Use inline assembly instead of svcntb_pat(SV_ALL) as GCC or Clang might @@ -467,7 +476,8 @@ static int64_t DetectAdditionalSveTargets(int64_t detected_targets) { HWY_POP_ATTRIBUTES #endif -static int64_t DetectTargets() { +HWY_HEADER_ONLY_FUNC +int64_t DetectTargets() { int64_t bits = 0; // return value of supported targets. using CapBits = unsigned long; // NOLINT @@ -483,6 +493,7 @@ static int64_t DetectTargets() { bits |= HWY_NEON_WITHOUT_AES; // aarch64 always has NEON and VFPv4.. #if HWY_OS_APPLE + using namespace detail; if (HasCpuFeature("hw.optional.arm.FEAT_AES")) { bits |= HWY_NEON; @@ -588,19 +599,18 @@ namespace ppc { using CapBits = unsigned long; // NOLINT // For AT_HWCAP, the others are for AT_HWCAP2 -static constexpr CapBits kGroupVSX = - PPC_FEATURE_HAS_ALTIVEC | PPC_FEATURE_HAS_VSX; +HWY_INLINE_VAR constexpr CapBits kGroupVSX = PPC_FEATURE_HAS_ALTIVEC | PPC_FEATURE_HAS_VSX; #if defined(HWY_DISABLE_PPC8_CRYPTO) -static constexpr CapBits kGroupPPC8 = PPC_FEATURE2_ARCH_2_07; +HWY_INLINE_VAR constexpr CapBits kGroupPPC8 = PPC_FEATURE2_ARCH_2_07; #else -static constexpr CapBits kGroupPPC8 = - PPC_FEATURE2_ARCH_2_07 | PPC_FEATURE2_VEC_CRYPTO; +HWY_INLINE_VAR constexpr CapBits kGroupPPC8 = PPC_FEATURE2_ARCH_2_07 | PPC_FEATURE2_VEC_CRYPTO; #endif -static constexpr CapBits kGroupPPC9 = kGroupPPC8 | PPC_FEATURE2_ARCH_3_00; -static constexpr CapBits kGroupPPC10 = kGroupPPC9 | PPC_FEATURE2_ARCH_3_1; +HWY_INLINE_VAR constexpr CapBits kGroupPPC9 = kGroupPPC8 | PPC_FEATURE2_ARCH_3_00; +HWY_INLINE_VAR constexpr CapBits kGroupPPC10 = kGroupPPC9 | PPC_FEATURE2_ARCH_3_1; -static int64_t DetectTargets() { +HWY_HEADER_ONLY_FUNC +int64_t DetectTargets() { int64_t bits = 0; // return value of supported targets. #if defined(AT_HWCAP) && defined(AT_HWCAP2) @@ -640,11 +650,12 @@ namespace s390x { using CapBits = unsigned long; // NOLINT -static constexpr CapBits kGroupZ14 = HWCAP_S390_VX | HWCAP_S390_VXE; -static constexpr CapBits kGroupZ15 = +HWY_INLINE_VAR constexpr CapBits kGroupZ14 = HWCAP_S390_VX | HWCAP_S390_VXE; +HWY_INLINE_VAR constexpr CapBits kGroupZ15 = HWCAP_S390_VX | HWCAP_S390_VXE | HWCAP_S390_VXRS_EXT2; -static int64_t DetectTargets() { +HWY_HEADER_ONLY_FUNC +int64_t DetectTargets() { int64_t bits = 0; #if defined(AT_HWCAP) @@ -671,7 +682,8 @@ namespace rvv { using CapBits = unsigned long; // NOLINT -static int64_t DetectTargets() { +HWY_HEADER_ONLY_FUNC +int64_t DetectTargets() { int64_t bits = 0; const CapBits hw = getauxval(AT_HWCAP); @@ -720,7 +732,8 @@ namespace loongarch { using CapBits = unsigned long; // NOLINT -static int64_t DetectTargets() { +HWY_HEADER_ONLY_FUNC +int64_t DetectTargets() { int64_t bits = 0; const CapBits hw = getauxval(AT_HWCAP); if (hw & LA_HWCAP_LSX) bits |= HWY_LSX; @@ -733,7 +746,8 @@ static int64_t DetectTargets() { // Returns targets supported by the CPU, independently of DisableTargets. // Factored out of SupportedTargets to make its structure more obvious. Note // that x86 CPUID may take several hundred cycles. -static int64_t DetectTargets() { +HWY_HEADER_ONLY_FUNC +int64_t DetectTargets() { // Apps will use only one of these (the default is EMU128), but compile flags // for this TU may differ from that of the app, so allow both. int64_t bits = HWY_SCALAR | HWY_EMU128; @@ -772,15 +786,24 @@ static int64_t DetectTargets() { return bits; } +namespace detail { // When running tests, this value can be set to the mocked supported targets // mask. Only written to from a single thread before the test starts. -static int64_t supported_targets_for_test_ = 0; +inline int64_t& SupportedTargetsForTest() { + static int64_t supported_targets_for_test_ = 0; + return supported_targets_for_test_; +} // Mask of targets disabled at runtime with DisableTargets. -static int64_t supported_mask_ = LimitsMax(); +inline int64_t& SupportedMask() { + static auto supported_mask_ = LimitsMax(); + return supported_mask_; +} +} // namespace detail +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets) { - supported_mask_ = static_cast(~disabled_targets); + detail::SupportedMask() = static_cast(~disabled_targets); // This will take effect on the next call to SupportedTargets, which is // called right before GetChosenTarget::Update. However, calling Update here // would make it appear that HWY_DYNAMIC_DISPATCH was called, which we want @@ -789,13 +812,15 @@ HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets) { GetChosenTarget().DeInit(); } +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets) { - supported_targets_for_test_ = targets; + detail::SupportedTargetsForTest() = targets; GetChosenTarget().DeInit(); // see comment above } +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT int64_t SupportedTargets() { - int64_t targets = supported_targets_for_test_; + int64_t targets = detail::SupportedTargetsForTest(); if (HWY_LIKELY(targets == 0)) { // Mock not active. Re-detect instead of caching just in case we're on a // heterogeneous ISA (also requires some app support to pin threads). This @@ -809,10 +834,11 @@ HWY_DLLEXPORT int64_t SupportedTargets() { GetChosenTarget().Update(targets); } - targets &= supported_mask_; + targets &= detail::SupportedMask(); return targets == 0 ? HWY_STATIC_TARGET : targets; } +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT ChosenTarget& GetChosenTarget() { static ChosenTarget chosen_target; return chosen_target; diff --git a/hwy/targets.h b/hwy/targets.h index b4ea2850a2..64b884123c 100644 --- a/hwy/targets.h +++ b/hwy/targets.h @@ -385,4 +385,7 @@ HWY_DLLEXPORT ChosenTarget& GetChosenTarget(); } // namespace hwy +#if HWY_HEADER_ONLY +#include "hwy/targets.cc" +#endif #endif // HIGHWAY_HWY_TARGETS_H_ diff --git a/hwy/timer.cc b/hwy/timer.cc index eac90566c8..d56c23f523 100644 --- a/hwy/timer.cc +++ b/hwy/timer.cc @@ -26,15 +26,17 @@ namespace hwy { +namespace detail { #if HWY_ARCH_X86 namespace x86 { -static bool HasRDTSCP() { +HWY_HEADER_ONLY_FUNC +bool HasRDTSCP() { uint32_t abcd[4]; - Cpuid(0x80000001U, 0, abcd); // Extended feature flags + hwy::x86::Cpuid(0x80000001U, 0, abcd); // Extended feature flags if ((abcd[3] & (1u << 27)) == 0) return false; // RDTSCP - Cpuid(0x80000007U, 0, abcd); + hwy::x86::Cpuid(0x80000007U, 0, abcd); if ((abcd[3] & (1u << 8)) == 0) { HWY_WARN("TSC not constant/invariant, may vary frequency or jump."); } @@ -48,7 +50,8 @@ static bool HasRDTSCP() { // frequency encoded in x86 GetCpuString because it is misleading on M1 Rosetta, // and not reported by AMD. CPUID 0x15 is also not yet widely supported. Also // used on RISC-V and aarch64. -static HWY_MAYBE_UNUSED double MeasureNominalClockRate() { +HWY_HEADER_ONLY_FUNC +HWY_MAYBE_UNUSED double MeasureNominalClockRate() { double max_ticks_per_sec = 0.0; // Arbitrary, enough to ignore 2 outliers without excessive init time. for (int rep = 0; rep < 3; ++rep) { @@ -79,7 +82,8 @@ static HWY_MAYBE_UNUSED double MeasureNominalClockRate() { #if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__) namespace ppc { -static HWY_INLINE double GetTimebaseFreq() { +HWY_HEADER_ONLY_FUNC +HWY_INLINE double GetTimebaseFreq() { const auto timebase_freq = __ppc_get_timebase_freq(); // If timebase_freq is greater than 0, then return timebase_freq. @@ -93,21 +97,24 @@ static HWY_INLINE double GetTimebaseFreq() { } // namespace ppc #endif +} // namespace detail + namespace platform { +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT bool GetCpuString(char* cpu100) { #if HWY_ARCH_X86 uint32_t abcd[4]; // Check if brand string is supported (it is on all reasonable Intel/AMD) - x86::Cpuid(0x80000000U, 0, abcd); + hwy::x86::Cpuid(0x80000000U, 0, abcd); if (abcd[0] < 0x80000004U) { cpu100[0] = '\0'; return false; } for (size_t i = 0; i < 3; ++i) { - x86::Cpuid(static_cast(0x80000002U + i), 0, abcd); + hwy::x86::Cpuid(static_cast(0x80000002U + i), 0, abcd); CopyBytes(&abcd[0], cpu100 + i * 16); // not same size } cpu100[48] = '\0'; @@ -119,14 +126,16 @@ HWY_DLLEXPORT bool GetCpuString(char* cpu100) { #endif } +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT double Now() { static const double mul = 1.0 / InvariantTicksPerSecond(); return static_cast(timer::Start()) * mul; } +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT bool HaveTimerStop(char* cpu100) { #if HWY_ARCH_X86 - if (!x86::HasRDTSCP()) { + if (!detail::x86::HasRDTSCP()) { (void)GetCpuString(cpu100); return false; } @@ -135,13 +144,14 @@ HWY_DLLEXPORT bool HaveTimerStop(char* cpu100) { return true; } +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT double InvariantTicksPerSecond() { #if HWY_ARCH_PPC && defined(__GLIBC__) && defined(__powerpc64__) - static const double freq = ppc::GetTimebaseFreq(); + static const double freq = detail::ppc::GetTimebaseFreq(); return freq; #elif HWY_ARCH_X86 || HWY_ARCH_RISCV || (HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC) // We assume the x86 TSC is invariant; it is on all recent Intel/AMD CPUs. - static const double freq = MeasureNominalClockRate(); + static const double freq = detail::MeasureNominalClockRate(); return freq; #elif defined(_WIN32) || defined(_WIN64) LARGE_INTEGER freq; @@ -157,6 +167,7 @@ HWY_DLLEXPORT double InvariantTicksPerSecond() { #endif } +HWY_HEADER_ONLY_FUNC HWY_DLLEXPORT uint64_t TimerResolution() { char cpu100[100]; bool can_use_stop = HaveTimerStop(cpu100); diff --git a/hwy/timer.h b/hwy/timer.h index 0873ac2cf1..915ded51b0 100644 --- a/hwy/timer.h +++ b/hwy/timer.h @@ -278,4 +278,8 @@ class Stopwatch { } // namespace hwy +#if HWY_HEADER_ONLY +#include "hwy/timer.cc" +#endif + #endif // HIGHWAY_HWY_TIMER_H_ diff --git a/hwy/x86_cpuid.h b/hwy/x86_cpuid.h index 60bd51a022..8515a60cd7 100644 --- a/hwy/x86_cpuid.h +++ b/hwy/x86_cpuid.h @@ -35,7 +35,7 @@ namespace x86 { // Calls CPUID instruction with eax=level and ecx=count and returns the result // in abcd array where abcd = {eax, ebx, ecx, edx} (hence the name abcd). -static inline void Cpuid(const uint32_t level, const uint32_t count, +inline void Cpuid(const uint32_t level, const uint32_t count, uint32_t* HWY_RESTRICT abcd) { #if HWY_COMPILER_MSVC || HWY_COMPILER_CLANGCL int regs[4]; @@ -56,17 +56,17 @@ static inline void Cpuid(const uint32_t level, const uint32_t count, #endif // HWY_COMPILER_MSVC || HWY_COMPILER_CLANGCL } -static inline bool IsBitSet(const uint32_t reg, const int index) { +inline bool IsBitSet(const uint32_t reg, const int index) { return (reg & (1U << index)) != 0; } -static inline uint32_t MaxLevel() { +inline uint32_t MaxLevel() { uint32_t abcd[4]; Cpuid(0, 0, abcd); return abcd[0]; } -static inline bool IsAMD() { +inline bool IsAMD() { uint32_t abcd[4]; Cpuid(0, 0, abcd); const uint32_t max_level = abcd[0];