tracel-ai
diff --git a/‎crates/cubecl-cpu/src/compute/server.rs‎
Lines changed: 2 additions & 2 deletions b/‎crates/cubecl-cpu/src/compute/server.rs‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎crates/cubecl-cpu/src/runtime.rs‎
Lines changed: 13 additions & 4 deletions b/‎crates/cubecl-cpu/src/runtime.rs‎
Lines changed: 13 additions & 4 deletions
diff --git a/‎crates/cubecl-cuda/src/compute/server.rs‎
Lines changed: 3 additions & 2 deletions b/‎crates/cubecl-cuda/src/compute/server.rs‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎crates/cubecl-cuda/src/compute/stream.rs‎
Lines changed: 16 additions & 3 deletions b/‎crates/cubecl-cuda/src/compute/stream.rs‎
Lines changed: 16 additions & 3 deletions
diff --git a/‎crates/cubecl-hip/src/compute/server.rs‎
Lines changed: 3 additions & 2 deletions b/‎crates/cubecl-hip/src/compute/server.rs‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎crates/cubecl-hip/src/compute/stream.rs‎
Lines changed: 16 additions & 3 deletions b/‎crates/cubecl-hip/src/compute/stream.rs‎
Lines changed: 16 additions & 3 deletions
diff --git a/‎crates/cubecl-runtime/benches/dynamic.rs‎
Lines changed: 13 additions & 3 deletions b/‎crates/cubecl-runtime/benches/dynamic.rs‎
Lines changed: 13 additions & 3 deletions
diff --git a/‎crates/cubecl-runtime/src/client.rs‎
Lines changed: 6 additions & 7 deletions b/‎crates/cubecl-runtime/src/client.rs‎
Lines changed: 6 additions & 7 deletions
diff --git a/‎crates/cubecl-runtime/src/config/base.rs‎
Lines changed: 5 additions & 0 deletions b/‎crates/cubecl-runtime/src/config/base.rs‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎crates/cubecl-runtime/src/config/logger.rs‎
Lines changed: 33 additions & 2 deletions b/‎crates/cubecl-runtime/src/config/logger.rs‎
Lines changed: 33 additions & 2 deletions
@@ -29,9 +29,9 @@ pub struct CpuServer {
 }
 
 impl CpuServer {
-    pub fn new(ctx: CpuContext) -> Self {
+    pub fn new(ctx: CpuContext, logger: Arc<ServerLogger>) -> Self {
         Self {
-            logger: Arc::new(ServerLogger::default()),
+            logger,
             scheduler: Scheduler::default(),
             ctx,
         }
 
@@ -7,7 +7,10 @@ use cubecl_core::{
 };
 use cubecl_runtime::{
     ComputeRuntime, DeviceProperties,
-    memory_management::{HardwareProperties, MemoryDeviceProperties, MemoryManagement},
+    logging::ServerLogger,
+    memory_management::{
+        HardwareProperties, MemoryDeviceProperties, MemoryManagement, MemoryManagementOptions,
+    },
     storage::BytesStorage,
 };
 use cubecl_std::tensor::is_contiguous;
@@ -45,6 +48,7 @@ fn create_client(options: RuntimeOptions) -> ComputeClient<Server, Channel> {
         .cgroup_limits()
         .map(|g| g.total_memory)
         .unwrap_or(system.total_memory()) as usize;
+    let logger = cubecl_common::stub::Arc::new(ServerLogger::default());
 
     let topology = HardwareProperties {
         plane_size_min: 1,
@@ -66,8 +70,13 @@ fn create_client(options: RuntimeOptions) -> ComputeClient<Server, Channel> {
         alignment: ALIGNMENT,
     };
 
-    let memory_management =
-        MemoryManagement::from_configuration(storage, &mem_properties, options.memory_config);
+    let memory_management = MemoryManagement::from_configuration(
+        storage,
+        &mem_properties,
+        options.memory_config,
+        logger.clone(),
+        MemoryManagementOptions::new("test"),
+    );
     let mut device_props = DeviceProperties::new(
         Default::default(),
         mem_properties,
@@ -77,7 +86,7 @@ fn create_client(options: RuntimeOptions) -> ComputeClient<Server, Channel> {
     register_supported_types(&mut device_props);
 
     let ctx = CpuContext::new(memory_management);
-    let server = CpuServer::new(ctx);
+    let server = CpuServer::new(ctx, logger);
     ComputeClient::new(Channel::new(server), device_props, ())
 }
 
 
@@ -464,13 +464,14 @@ impl CudaServer {
             log::info!("Peer data transfer not available for device {device_id}");
         }
 
+        let logger = Arc::new(ServerLogger::default());
         Self {
             mem_alignment,
             ctx,
             peer_activated,
             streams: MultiStream::new(
-                Arc::new(ServerLogger::default()),
-                CudaStreamBackend::new(mem_props, mem_config, mem_alignment),
+                logger.clone(),
+                CudaStreamBackend::new(mem_props, mem_config, mem_alignment, logger),
                 max_streams,
             ),
         }
 
@@ -1,3 +1,5 @@
+use std::sync::Arc;
+
 use crate::compute::{
     storage::{
         cpu::{PINNED_MEMORY_ALIGNMENT, PinnedMemoryStorage},
@@ -7,7 +9,10 @@ use crate::compute::{
 };
 use cubecl_core::MemoryConfiguration;
 use cubecl_runtime::{
-    memory_management::{MemoryDeviceProperties, MemoryManagement},
+    logging::ServerLogger,
+    memory_management::{
+        MemoryAllocationMode, MemoryDeviceProperties, MemoryManagement, MemoryManagementOptions,
+    },
     stream::EventStreamBackend,
 };
 
@@ -23,6 +28,7 @@ pub struct CudaStreamBackend {
     mem_props: MemoryDeviceProperties,
     mem_config: MemoryConfiguration,
     mem_alignment: usize,
+    logger: Arc<ServerLogger>,
 }
 
 impl EventStreamBackend for CudaStreamBackend {
@@ -36,8 +42,13 @@ impl EventStreamBackend for CudaStreamBackend {
         .expect("Can create a new stream.");
 
         let storage = GpuStorage::new(self.mem_alignment, stream);
-        let memory_management_gpu =
-            MemoryManagement::from_configuration(storage, &self.mem_props, self.mem_config.clone());
+        let memory_management_gpu = MemoryManagement::from_configuration(
+            storage,
+            &self.mem_props,
+            self.mem_config.clone(),
+            self.logger.clone(),
+            MemoryManagementOptions::new("Main GPU Memory"),
+        );
         // We use the same page size and memory pools configuration for CPU pinned memory, since we
         // expect the CPU to have at least the same amount of RAM as GPU memory.
         let memory_management_cpu = MemoryManagement::from_configuration(
@@ -47,6 +58,8 @@ impl EventStreamBackend for CudaStreamBackend {
                 alignment: PINNED_MEMORY_ALIGNMENT as u64,
             },
             self.mem_config.clone(),
+            self.logger.clone(),
+            MemoryManagementOptions::new("Pinned CPU Memory").mode(MemoryAllocationMode::Auto),
         );
 
         Stream {
 
@@ -252,12 +252,13 @@ impl HipServer {
         let config = GlobalConfig::get();
         let max_streams = config.streaming.max_streams;
 
+        let logger = Arc::new(ServerLogger::default());
         Self {
             ctx,
             mem_alignment,
             streams: MultiStream::new(
-                Arc::new(ServerLogger::default()),
-                HipStreamBackend::new(mem_props, mem_config, mem_alignment),
+                logger.clone(),
+                HipStreamBackend::new(mem_props, mem_config, mem_alignment, logger),
                 max_streams,
             ),
         }
 
@@ -1,7 +1,12 @@
+use std::sync::Arc;
+
 use cubecl_core::MemoryConfiguration;
 use cubecl_hip_sys::HIP_SUCCESS;
 use cubecl_runtime::{
-    memory_management::{MemoryDeviceProperties, MemoryManagement},
+    logging::ServerLogger,
+    memory_management::{
+        MemoryAllocationMode, MemoryDeviceProperties, MemoryManagement, MemoryManagementOptions,
+    },
     stream::EventStreamBackend,
 };
 
@@ -23,6 +28,7 @@ pub struct HipStreamBackend {
     mem_props: MemoryDeviceProperties,
     mem_config: MemoryConfiguration,
     mem_alignment: usize,
+    logger: Arc<ServerLogger>,
 }
 
 impl EventStreamBackend for HipStreamBackend {
@@ -37,8 +43,13 @@ impl EventStreamBackend for HipStreamBackend {
             stream
         };
         let storage = GpuStorage::new(self.mem_alignment);
-        let memory_management_gpu =
-            MemoryManagement::from_configuration(storage, &self.mem_props, self.mem_config.clone());
+        let memory_management_gpu = MemoryManagement::from_configuration(
+            storage,
+            &self.mem_props,
+            self.mem_config.clone(),
+            self.logger.clone(),
+            MemoryManagementOptions::new("Main GPU Memory"),
+        );
         // We use the same page size and memory pools configuration for CPU pinned memory, since we
         // expect the CPU to have at least the same amount of RAM as GPU memory.
         let memory_management_cpu = MemoryManagement::from_configuration(
@@ -48,6 +59,8 @@ impl EventStreamBackend for HipStreamBackend {
                 alignment: PINNED_MEMORY_ALIGNMENT as u64,
             },
             self.mem_config.clone(),
+            self.logger.clone(),
+            MemoryManagementOptions::new("Pinned CPU Memory").mode(MemoryAllocationMode::Auto),
         );
 
         Stream {
 
@@ -1,7 +1,10 @@
-use std::collections::LinkedList;
+use std::{collections::LinkedList, sync::Arc};
 
 use cubecl_runtime::{
-    memory_management::{MemoryConfiguration, MemoryDeviceProperties, MemoryManagement},
+    logging::ServerLogger,
+    memory_management::{
+        MemoryConfiguration, MemoryDeviceProperties, MemoryManagement, MemoryManagementOptions,
+    },
     storage::BytesStorage,
 };
 
@@ -15,7 +18,14 @@ fn main() {
         max_page_size: 2048 * MB,
         alignment: 32,
     };
-    let mut mm = MemoryManagement::from_configuration(storage, &mem_props, config);
+    let logger = Arc::new(ServerLogger::default());
+    let mut mm = MemoryManagement::from_configuration(
+        storage,
+        &mem_props,
+        config,
+        logger,
+        MemoryManagementOptions::new("test"),
+    );
     let mut handles = LinkedList::new();
     for _ in 0..100 * 2048 {
         if handles.len() >= 4000 {
 
@@ -508,27 +508,26 @@ where
         self.channel.allocation_mode(mode, self.stream_id())
     }
 
-    /// Use a static memory strategy to execute the provided function.
+    /// Use a persistent memory strategy to execute the provided function.
     ///
     /// # Notes
     ///
-    /// Using that memory strategy is beneficial for weights loading and similar workflows.
-    /// However make sure to call [Self::memory_cleanup()] if you want to free the allocated
-    /// memory.
-    pub fn memory_static_allocation<Input, Output, Func: Fn(Input) -> Output>(
+    /// - Using this memory strategy is beneficial for storing model parameters and similar workflows.
+    /// - You can call [Self::memory_cleanup()] if you want to free persistent memory.
+    pub fn memory_persistent_allocation<Input, Output, Func: Fn(Input) -> Output>(
         &self,
         input: Input,
         func: Func,
     ) -> Output {
         // We use the same profiling lock to make sure no other task is currently using the current
-        // device. Meaning that the current static memory strategy will only be used for the
+        // device. Meaning that the current persistent memory strategy will only be used for the
         // provided function.
 
         #[cfg(multi_threading)]
         let stream_id = self.profile_acquire();
 
         self.channel
-            .allocation_mode(MemoryAllocationMode::Static, self.stream_id());
+            .allocation_mode(MemoryAllocationMode::Persistent, self.stream_id());
         let output = func(input);
         self.channel
             .allocation_mode(MemoryAllocationMode::Auto, self.stream_id());
 
@@ -1,3 +1,4 @@
+use crate::config::memory::MemoryConfig;
 use crate::config::streaming::StreamingConfig;
 
 use super::{autotune::AutotuneConfig, compilation::CompilationConfig, profiling::ProfilingConfig};
@@ -26,6 +27,10 @@ pub struct GlobalConfig {
     /// Configuration for streaming settings.
     #[serde(default)]
     pub streaming: StreamingConfig,
+
+    /// Configuration for memory settings.
+    #[serde(default)]
+    pub memory: MemoryConfig,
 }
 
 impl GlobalConfig {
 
@@ -1,7 +1,7 @@
 use super::GlobalConfig;
 use crate::config::{
-    autotune::AutotuneLogLevel, compilation::CompilationLogLevel, profiling::ProfilingLogLevel,
-    streaming::StreamingLogLevel,
+    autotune::AutotuneLogLevel, compilation::CompilationLogLevel, memory::MemoryLogLevel,
+    profiling::ProfilingLogLevel, streaming::StreamingLogLevel,
 };
 use alloc::{string::ToString, sync::Arc, vec::Vec};
 use core::fmt::Display;
@@ -118,6 +118,9 @@ pub struct Logger {
     /// Indices of loggers used for streaming logging.
     streaming_index: Vec<usize>,
 
+    /// Indices of loggers used for memory logging.
+    memory_index: Vec<usize>,
+
     /// Global configuration for logging settings.
     pub config: Arc<GlobalConfig>,
 }
@@ -142,6 +145,7 @@ impl Logger {
         let mut profiling_index = Vec::new();
         let mut autotune_index = Vec::new();
         let mut streaming_index = Vec::new();
+        let mut memory_index = Vec::new();
 
         #[derive(Hash, PartialEq, Eq)]
         enum LoggerId {
@@ -281,12 +285,25 @@ impl Logger {
             )
         }
 
+        if let MemoryLogLevel::Disabled = config.memory.logger.level {
+        } else {
+            register_logger(
+                &config.memory.logger,
+                config.memory.logger.append,
+                config.memory.logger.log,
+                &mut memory_index,
+                &mut loggers,
+                &mut logger2index,
+            )
+        }
+
         Self {
             loggers,
             compilation_index,
             profiling_index,
             autotune_index,
             streaming_index,
+            memory_index,
             config,
         }
     }
@@ -305,6 +322,20 @@ impl Logger {
         }
     }
 
+    /// Logs a message for memory, directing it to all configured memory loggers.
+    pub fn log_memory<S: Display>(&mut self, msg: &S) {
+        let length = self.memory_index.len();
+        if length > 1 {
+            let msg = msg.to_string();
+            for i in 0..length {
+                let index = self.memory_index[i];
+                self.log(&msg, index)
+            }
+        } else if let Some(index) = self.memory_index.first() {
+            self.log(&msg, *index)
+        }
+    }
+
     /// Logs a message for compilation, directing it to all configured compilation loggers.
     pub fn log_compilation<S: Display>(&mut self, msg: &S) {
         let length = self.compilation_index.len();
Original file line number	Diff line number	Diff line change
`@@ -29,9 +29,9 @@ pub struct CpuServer {`
`29`	`29`	`}`
`30`	`30`
`31`	`31`	`impl CpuServer {`
`32`		`- pub fn new(ctx: CpuContext) -> Self {`
	`32`	`+ pub fn new(ctx: CpuContext, logger: Arc<ServerLogger>) -> Self {`
`33`	`33`	`Self {`
`34`		`- logger: Arc::new(ServerLogger::default()),`
	`34`	`+ logger,`
`35`	`35`	`scheduler: Scheduler::default(),`
`36`	`36`	`ctx,`
`37`	`37`	`}`