Skip to content

Commit 0f53ca5

Browse files
authored
Add comprehensive logging system and exception disabling support (#1080)
* Add comprehensive logging system and exception disabling support

  Enhances MatX's observability and error handling capabilities by adding extensive logging throughout the codebase and providing an option to disable exceptions.

  Logging enhancements:
  - Added TRACE-level logging to all operator and generator constructors
  - Log operator name via str() method and relevant constructor parameters
  - Consolidated log.h include in base_operator.h to reduce duplication
  - Added DEBUG-level logging for cache operations
  - Log cache hits and misses in LookupAndExec with cache ID, device, and thread
  - Log transform-specific cache attempts with descriptive names (FFT, MatMul, SVD, QR, LU, Eigenvalue, Inverse, CUB, Einsum, Solve, Sparse conversions, Filter, Covariance)
  - Added DEBUG-level logging for kernel launches
  - Log kernel parameters in CUDA executor
  - Added DEBUG-level logging for memory operations
  - Log all tensor allocations and deallocations with pointer and size info
  - Log all make_tensor() calls with signature, shape, pointer, and memory kind
  - Converted all printf/fprintf calls in error.h to use MatX logger
  - Error messages now use MATX_LOG_ERROR/MATX_LOG_FATAL consistently
  - Changed default log level from OFF to ERROR
  - Ensures error messages are visible by default
  - Users can override via MATX_LOG_LEVEL environment variable

  Exception handling improvements:
  - Added MATX_DISABLE_EXCEPTIONS CMake option
  - When enabled, MATX_THROW logs fatal error and calls abort() instead of throwing
  - Provides exception-free operation for environments that don't support them
  - All error handling macros automatically adapt to exception-disabled mode
  - Fixed macro parameter naming to avoid preprocessor conflicts

  These changes enable detailed runtime diagnostics for debugging performance issues, cache behavior, and memory usage while maintaining zero overhead when logging is disabled.

* fix format
1 parent 758d3e8 commit 0f53ca5

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

143 files changed

+606
-110
lines changed

CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ option(MATX_EN_CUDA_LINEINFO "Enable line information for CUDA kernels via -line
8181
option(MATX_EN_EXTENDED_LAMBDA "Enable extended lambda support for device/host lambdas" ON)
8282
option(MATX_EN_MATHDX "Enable MathDx support for kernel fusion" OFF)
8383
option(MATX_EN_UNSAFE_ALIAS_DETECTION "Enable aliased memory detection" OFF)
84+
option(MATX_DISABLE_EXCEPTIONS "Disable C++ exceptions and log errors instead" OFF)
8485

8586
set(MATX_EN_PYBIND11 OFF CACHE BOOL "Enable pybind11 support")
8687

@@ -206,6 +207,9 @@ if (MATX_NVTX_FLAGS)
206207
add_definitions(-DMATX_NVTX_FLAGS)
207208
target_compile_definitions(matx INTERFACE MATX_NVTX_FLAGS)
208209
endif()
210+
if (MATX_DISABLE_EXCEPTIONS)
211+
target_compile_definitions(matx INTERFACE MATX_DISABLE_EXCEPTIONS)
212+
endif()
209213
if (MATX_BUILD_32_BIT)
210214
set(MATX_NVPL_INT_TYPE "lp64")
211215
target_compile_definitions(matx INTERFACE MATX_INDEX_32_BIT)

docs_input/build.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,8 @@ By default, all of these options are OFF.
201201
- ``-DMATX_EN_MATHDX=ON``
202202
* - Enable pybind11 Support. This option is usually not explicitly set, but is enabled by other options.
203203
- ``-DMATX_EN_PYBIND11=ON``
204+
* - Disable Exceptions
205+
- ``-DMATX_DISABLE_EXCEPTIONS=ON``
204206

205207

206208
NVTX Flags

examples/black_scholes.cu

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -150,16 +150,15 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
150150
index_t input_size = 100'000'000;
151151
constexpr uint32_t num_iterations = 100;
152152
float time_ms;
153-
154-
tensor_t<dtype, 1> K_tensor{{input_size}};
155-
tensor_t<dtype, 1> S_tensor{{input_size}};
156-
tensor_t<dtype, 1> V_tensor{{input_size}};
157-
tensor_t<dtype, 1> r_tensor{{input_size}};
158-
tensor_t<dtype, 1> T_tensor{{input_size}};
159-
tensor_t<dtype, 1> output_tensor{{input_size}};
160-
tensor_t<dtype, 1> output_tensor2{{input_size}};
161-
tensor_t<dtype, 1> output_tensor3{{input_size}};
162-
tensor_t<dtype, 1> output_tensor4{{input_size}};
153+
auto K_tensor = make_tensor<dtype>({input_size});
154+
auto S_tensor = make_tensor<dtype>({input_size});
155+
auto V_tensor = make_tensor<dtype>({input_size});
156+
auto r_tensor = make_tensor<dtype>({input_size});
157+
auto T_tensor = make_tensor<dtype>({input_size});
158+
auto output_tensor = make_tensor<dtype>({input_size});
159+
auto output_tensor2 = make_tensor<dtype>({input_size});
160+
auto output_tensor3 = make_tensor<dtype>({input_size});
161+
auto output_tensor4 = make_tensor<dtype>({input_size});
163162

164163
(K_tensor = random<float>({input_size}, UNIFORM)).run();
165164
(S_tensor = random<float>({input_size}, UNIFORM)).run();
@@ -171,8 +170,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
171170
cudaStreamCreate(&stream);
172171
cudaExecutor exec{stream};
173172

174-
//compute_black_scholes_matx(K_tensor, S_tensor, V_tensor, r_tensor, T_tensor, output_tensor, exec);
175-
176173
cudaEvent_t start, stop;
177174
cudaEventCreate(&start);
178175
cudaEventCreate(&stop);

include/matx/core/allocator.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343

4444
#include "matx/core/error.h"
4545
#include "matx/core/nvtx.h"
46+
#include "matx/core/log.h"
4647
#include <cuda/std/functional>
4748
#include <cuda/std/__algorithm/max.h>
4849

@@ -122,6 +123,9 @@ struct MemTracker {
122123

123124
size_t bytes = iter->second.size;
124125

126+
MATX_LOG_DEBUG("Deallocating memory: ptr={}, {} bytes, space={}, remaining={} bytes",
127+
ptr, bytes, static_cast<int>(iter->second.kind), matxMemoryStats.currentBytesAllocated - bytes);
128+
125129
matxMemoryStats.currentBytesAllocated -= bytes;
126130

127131
switch (iter->second.kind) {
@@ -187,6 +191,8 @@ struct MemTracker {
187191
}
188192
}
189193

194+
MATX_LOG_DEBUG("Allocating memory: {} bytes, space={}, stream={}", bytes, static_cast<int>(space), reinterpret_cast<void*>(stream));
195+
190196
switch (space) {
191197
case MATX_MANAGED_MEMORY:
192198
err = cudaMallocManaged(ptr, bytes);
@@ -214,6 +220,8 @@ struct MemTracker {
214220
MATX_THROW(matxOutOfMemory, "Failed to allocate memory");
215221
}
216222

223+
MATX_LOG_DEBUG("Allocated memory: ptr={}, {} bytes, total_current={} bytes", *ptr, bytes, matxMemoryStats.currentBytesAllocated + bytes);
224+
217225
[[maybe_unused]] std::unique_lock lck(memory_mtx);
218226
matxMemoryStats.currentBytesAllocated += bytes;
219227
matxMemoryStats.totalBytesAllocated += bytes;

include/matx/core/cache.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,11 +192,15 @@ class matxCache_t {
192192
auto &common_params_cache = rmap[key];
193193
auto cache_el = common_params_cache.find(params);
194194
if (cache_el == common_params_cache.end()) {
195+
MATX_LOG_DEBUG("Cache MISS for transform: id={}, device={}, thread={}",
196+
id, key.device_id, reinterpret_cast<void*>(std::hash<std::thread::id>{}(key.thread_id)));
195197
std::any tmp = mfun();
196198
common_params_cache.insert({params, tmp});
197199
efun(std::any_cast<decltype(mfun())>(tmp));
198200
}
199201
else {
202+
MATX_LOG_DEBUG("Cache HIT for transform: id={}, device={}, thread={}",
203+
id, key.device_id, reinterpret_cast<void*>(std::hash<std::thread::id>{}(key.thread_id)));
200204
efun(std::any_cast<decltype(mfun())>(cache_el->second));
201205
}
202206
}

include/matx/core/error.h

Lines changed: 32 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
#endif
4343

4444
#include "matx/core/stacktrace.h"
45+
#include "matx/core/log.h"
4546
#endif
4647

4748
namespace matx
@@ -150,6 +151,23 @@ namespace matx
150151
};
151152
}
152153

154+
#ifdef MATX_DISABLE_EXCEPTIONS
155+
156+
#define MATX_ENTER_HANDLER() {
157+
#define MATX_EXIT_HANDLER() }
158+
159+
/*
 * Exception-free variant of MATX_THROW (active when MATX_DISABLE_EXCEPTIONS is defined).
 *
 * Instead of throwing, this logs the error name, the caller's message and the
 * source location at FATAL level, logs a captured stack trace, and then
 * terminates the process with std::abort().
 *
 *   e        error code passed to matx::matxErrorString()
 *   str_arg  message logged after the error name
 *
 * All MatX symbols are fully qualified so the macro expands correctly from any
 * namespace, matching the qualification used by the exception-enabled handler
 * macros in this header.
 */
#define MATX_THROW(e, str_arg) \
  do { \
    MATX_LOG_FATAL("matxException ({}: {}) - {}:{}", matx::matxErrorString(e), str_arg, __FILE__, __LINE__); \
    std::stringstream matx_stack_trace; \
    matx::detail::printStackTrace(matx_stack_trace); \
    std::string matx_stack_str = matx_stack_trace.str(); \
    MATX_LOG_FATAL("Stack Trace:\n{}", matx_stack_str); \
    std::abort(); \
  } while(0)
168+
169+
#else
170+
153171
#define MATX_ENTER_HANDLER() \
154172
try \
155173
{
@@ -158,8 +176,8 @@ namespace matx
158176
} \
159177
catch (matx::detail::matxException & e) \
160178
{ \
161-
fprintf(stderr, "%s\n", e.what()); \
162-
fprintf(stderr, "Stack Trace:\n%s", e.stack.str().c_str()); \
179+
MATX_LOG_FATAL("{}", e.what()); \
180+
MATX_LOG_FATAL("Stack Trace:\n{}", e.stack.str()); \
163181
exit(1); \
164182
}
165183

@@ -168,6 +186,8 @@ namespace matx
168186
throw matx::detail::matxException(e, str, __FILE__, __LINE__); \
169187
}
170188

189+
#endif
190+
171191
#if !defined(NDEBUG) && !defined(__CUDA_ARCH__)
172192
#define MATX_ASSERT(a, error) \
173193
{ \
@@ -190,7 +210,7 @@ namespace matx
190210
auto tmp = a; \
191211
if ((tmp != expected)) \
192212
{ \
193-
std::cout << #a ": " << str << "(" << tmp << " != " << expected << ")\n";\
213+
MATX_LOG_ERROR("{}: {} ({} != {})", #a, str, static_cast<int>(tmp), static_cast<int>(expected)); \
194214
MATX_THROW(error, ""); \
195215
} \
196216
}
@@ -217,7 +237,7 @@ namespace matx
217237
const auto e_ = (e); \
218238
if (e_ != cudaSuccess) \
219239
{ \
220-
fprintf(stderr, "%s:%d CUDA Error: %s (%d)\n", __FILE__,__LINE__, cudaGetErrorString(e_), e_); \
240+
MATX_LOG_ERROR("{}:{} CUDA Error: {} ({})", __FILE__, __LINE__, cudaGetErrorString(e_), static_cast<int>(e_)); \
221241
MATX_THROW(matx::matxCudaError, cudaGetErrorString(e_)); \
222242
} \
223243
} while (0)
@@ -239,21 +259,22 @@ namespace matx
239259
compatible = (size == 0 || size == Size(i)); \
240260
} \
241261
if (!compatible) { \
242-
std::cerr << "Incompatible operator sizes: ("; \
262+
std::string msg = "Incompatible operator sizes: ("; \
243263
for (int32_t i = 0; i < Rank(); i++) { \
244-
std::cerr << Size(i); \
264+
msg += std::to_string(Size(i)); \
245265
if (i != Rank() - 1) { \
246-
std::cerr << ","; \
266+
msg += ","; \
247267
} \
248268
} \
249-
std::cerr << ") not compatible with ("; \
269+
msg += ") not compatible with ("; \
250270
for (int32_t i = 0; i < Rank(); i++) { \
251-
std::cerr << matx::detail::get_expanded_size<Rank()>(op, i); \
271+
msg += std::to_string(matx::detail::get_expanded_size<Rank()>(op, i)); \
252272
if (i != Rank() - 1) { \
253-
std::cerr << ","; \
273+
msg += ","; \
254274
} \
255275
} \
256-
std::cerr << ")" << std::endl; \
276+
msg += ")"; \
277+
MATX_LOG_ERROR("{}", msg); \
257278
MATX_THROW(matxInvalidSize, "Incompatible operator sizes"); \
258279
} \
259280
}

include/matx/core/log.h

Lines changed: 78 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,83 @@
4646
#include <memory>
4747
#include <mutex>
4848

49+
// Include MatX type traits and complex types for formatting support
50+
#include "matx/core/half.h"
51+
#include "matx/core/half_complex.h"
52+
#include <complex>
53+
#include <cuda/std/complex>
54+
55+
// Helper for formatting complex types
56+
namespace matx::detail {

/**
 * Build the textual form of a complex-like value.
 *
 * Works with any type exposing real() and imag() whose results can be
 * static_cast to double. The output has the shape "(<re><sign><im>j)", with
 * both parts printed using the 'g' presentation and the imaginary part always
 * carrying an explicit sign.
 *
 * @param c  Complex-like value to render
 * @return   Formatted string
 */
template <typename ComplexType>
inline std::string format_complex(const ComplexType& c) {
  const double real_part = static_cast<double>(c.real());
  const double imag_part = static_cast<double>(c.imag());
  return std::format("({:g}{:+g}j)", real_part, imag_part);
}

} // namespace matx::detail
67+
68+
// Formatter specializations for all types supported by MatX
69+
namespace std {

// std::formatter specializations so MatX value types can be passed to
// std::format-style logging calls. None of them accept a format spec: parse()
// returns the start of the spec unchanged, and format() writes a fixed
// representation.
//
// NOTE(review): std::complex<T> is itself a standard-library type; confirm the
// first specialization below satisfies the [namespace.std] rules for adding
// specializations to namespace std.

/// std::complex<Scalar>: rendered through matx::detail::format_complex.
template <typename Scalar>
struct formatter<std::complex<Scalar>> {
  constexpr auto parse(format_parse_context& parse_ctx) { return parse_ctx.begin(); }

  template <typename Ctx>
  auto format(const std::complex<Scalar>& value, Ctx& fmt_ctx) const {
    const std::string text = matx::detail::format_complex(value);
    return std::format_to(fmt_ctx.out(), "{}", text);
  }
};

/// cuda::std::complex<Scalar>: rendered through matx::detail::format_complex.
template <typename Scalar>
struct formatter<cuda::std::complex<Scalar>> {
  constexpr auto parse(format_parse_context& parse_ctx) { return parse_ctx.begin(); }

  template <typename Ctx>
  auto format(const cuda::std::complex<Scalar>& value, Ctx& fmt_ctx) const {
    const std::string text = matx::detail::format_complex(value);
    return std::format_to(fmt_ctx.out(), "{}", text);
  }
};

/// matx::matxHalfComplex<Half>: rendered through matx::detail::format_complex.
template <typename Half>
struct formatter<matx::matxHalfComplex<Half>> {
  constexpr auto parse(format_parse_context& parse_ctx) { return parse_ctx.begin(); }

  template <typename Ctx>
  auto format(const matx::matxHalfComplex<Half>& value, Ctx& fmt_ctx) const {
    const std::string text = matx::detail::format_complex(value);
    return std::format_to(fmt_ctx.out(), "{}", text);
  }
};

/// matx::matxFp16: converted to float and printed with the 'g' presentation.
template <>
struct formatter<matx::matxFp16> {
  constexpr auto parse(format_parse_context& parse_ctx) { return parse_ctx.begin(); }

  template <typename Ctx>
  auto format(const matx::matxFp16& val, Ctx& fmt_ctx) const {
    const float widened = static_cast<float>(val);
    return std::format_to(fmt_ctx.out(), "{:g}", widened);
  }
};

/// matx::matxBf16: converted to float and printed with the 'g' presentation.
template <>
struct formatter<matx::matxBf16> {
  constexpr auto parse(format_parse_context& parse_ctx) { return parse_ctx.begin(); }

  template <typename Ctx>
  auto format(const matx::matxBf16& val, Ctx& fmt_ctx) const {
    const float widened = static_cast<float>(val);
    return std::format_to(fmt_ctx.out(), "{:g}", widened);
  }
};

} // namespace std
125+
49126
namespace matx {
50127
namespace detail {
51128

@@ -139,7 +216,7 @@ class Logger {
139216
std::mutex mutex_;
140217
bool show_function_;
141218

142-
Logger() : min_level_(LogLevel::OFF), output_stream_(&std::cout), show_function_(false) {
219+
Logger() : min_level_(LogLevel::ERROR), output_stream_(&std::cout), show_function_(false) {
143220
// Read log level from environment
144221
const char* level_env = std::getenv("MATX_LOG_LEVEL");
145222
if (level_env) {

0 commit comments

Comments
 (0)