@@ -119,6 +119,8 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
     hp_n_ctx_train = model->hparams.n_ctx_train;
     hp_n_expert    = model->hparams.n_expert;
 
+    llama_memory_breakdown_print(ctx); // goes to debug log
+
     llama_free(ctx);
     llama_model_free(model);
     llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
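The breakdown added above is only emitted at debug level, and the surrounding helper temporarily swaps the logger (note the llama_log_set call that restores it). A minimal sketch, not part of this diff, of a logger callback that keeps debug output visible; it relies only on the public llama_log_set/ggml_log_level API, and the pass-everything policy is illustrative:

```cpp
#include <cstdio>

#include "llama.h"

// Illustrative callback: forward every message, including GGML_LOG_LEVEL_DEBUG,
// so that debug-only output such as the memory breakdown is not dropped.
static void verbose_logger(ggml_log_level level, const char * text, void * /*user_data*/) {
    fprintf(stderr, "[%d] %s", (int) level, text); // most llama.cpp log lines already end in '\n'
}

int main() {
    llama_log_set(verbose_logger, /*user_data =*/ nullptr);
    // ... load a model / create a context as usual; debug messages now reach stderr ...
    return 0;
}
```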
@@ -142,6 +144,7 @@ bool llama_params_fit(
 
     // step 1: get data for default parameters and check whether any changes are necessary in the first place
 
+    LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__);
     const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
     const size_t nd = devs.size(); // number of devices
     if (nd == 0) {
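The debug statements added throughout this patch splice fixed-width format macros from <cinttypes> (PRIu32, PRId64, and similar) into their format strings through adjacent string-literal concatenation. A standalone sketch of that idiom with plain printf, independent of llama.cpp:

```cpp
#include <cinttypes> // PRIu32, PRId64
#include <cstdio>

int main() {
    const uint32_t n_layers = 48;
    const int64_t  n_bytes  = 3221225472LL;
    // Adjacent string literals are merged at compile time, so "%" PRIu32 expands
    // to the correct conversion specifier for uint32_t on every platform.
    printf("layers=%" PRIu32 ", size=%" PRId64 " MiB\n", n_layers, n_bytes / (1024 * 1024));
    return 0;
}
```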
@@ -316,6 +319,7 @@ bool llama_params_fit(
         tensor_buft_overides[1] = {nullptr, nullptr};
         mparams->tensor_buft_overrides = tensor_buft_overides;
 
+        LLAMA_LOG_DEBUG("%s: getting device memory data for all MoE tensors in system memory:\n", __func__);
         const dmds_t dmds_cpu_moe = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
         int64_t global_surplus = 0;
         for (const llama_device_memory_data & dmd : dmds_cpu_moe) {
@@ -328,34 +332,59 @@ bool llama_params_fit(
             // step 3: for MoE models, if at least the dense tensors can be fit, try fitting as many full layers as possible
 
             const uint32_t nl_scaling = hp_ngl / nd;
-            const std::vector<memory_scaling> spl_part = get_memory_scaling( // size per device and per partial == MoE-only layer
-                get_memory_for_const_layer(1), get_memory_for_const_layer(nl_scaling), nl_scaling);
+            std::vector<memory_scaling> spl_part; // size per device and per partial == MoE-only layer
+            {
+                LLAMA_LOG_DEBUG("%s: getting device memory data for 1 layer + all MoE tensors in system memory:\n", __func__);
+                auto tmp1 = get_memory_for_const_layer(1);
+                LLAMA_LOG_DEBUG("%s: getting device memory data for %" PRIu32 " layers + all MoE tensors in system memory:\n", __func__, nl_scaling);
+                auto tmpn = get_memory_for_const_layer(nl_scaling);
+                spl_part = get_memory_scaling(tmp1, tmpn, nl_scaling);
+            }
+            for (size_t id = 0; id < nd; id++) {
+                LLAMA_LOG_DEBUG("%s: spl_part[%zu]: base=%" PRId64 " MiB, per_layer=%" PRId64 " MiB\n",
+                    __func__, id, spl_part[id].base/MiB, spl_part[id].per_layer/MiB);
+            }
 
             // for spl_part all MoE tensors were still on CPU, reset the TBOs so that all tensors are on the devices again
             tensor_buft_overides[0] = {nullptr, nullptr};
             mparams->tensor_buft_overrides = tensor_buft_overides;
 
-            const std::vector<memory_scaling> spl_full = get_memory_scaling( // size per device and per full layer
-                get_memory_for_const_layer(1), get_memory_for_const_layer(nl_scaling), nl_scaling);
+            std::vector<memory_scaling> spl_full; // size per device and per full layer
+            {
+                LLAMA_LOG_DEBUG("%s: getting device memory data for 1 layer + all tensors in device memory:\n", __func__);
+                auto tmp1 = get_memory_for_const_layer(1);
+                LLAMA_LOG_DEBUG("%s: getting device memory data for %" PRIu32 " layers + all tensors in device memory:\n", __func__, nl_scaling);
+                auto tmpn = get_memory_for_const_layer(nl_scaling);
+                spl_full = get_memory_scaling(tmp1, tmpn, nl_scaling);
+            }
+            for (size_t id = 0; id < nd; id++) {
+                LLAMA_LOG_DEBUG("%s: spl_full[%zu]: base=%" PRId64 " MiB, per_layer=%" PRId64 " MiB\n",
+                    __func__, id, spl_full[id].base/MiB, spl_full[id].per_layer/MiB);
+            }
 
             // the non-repeating tensors (e.g. output matrix) are difficult to quantify,
             //    get memory use with all tensors on the last device and use that as the starting point for the last device only
             for (size_t id = 0; id < nd - 1; id++) {
                 tensor_split[id] = 0.0f;
             }
             tensor_split[nd - 1] = 1.0f;
+            LLAMA_LOG_DEBUG("%s: getting device memory data with entire model on last device:\n", __func__);
             const dmds_t dmds_last = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
             tensor_split[nd - 1] = 0.0f;
 
             struct ngl {
                 uint32_t part = 0;
                 uint32_t full = 0;
+
+                explicit operator std::string() const {
+                    return "[" + std::to_string(part) + ", " + std::to_string(full) + "]";
+                }
             };
 
             // utility function that distributes layers to devices and returns whether the memory margin can be met on all devices
             //   - ngl_per_device: resulting distribution of dense-only/full layers across devices
             //   - global_ngl_part: total number of dense-only layers
-            auto distribute_layers = [&](std::vector<ngl> & ngl_per_device, const uint32_t global_ngl_part) -> bool {
+            auto distribute_layers = [&](std::vector<ngl> & ngl_per_device, std::vector<int64_t> & usable_memory, const uint32_t global_ngl_part) -> bool {
                 // reset result to initial state, initially put entire model on the last device
                 for (size_t id = 0; id < nd - 1; id++) {
                     ngl_per_device[id] = {0, 0};
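get_memory_scaling itself is outside this hunk, but the call pattern above (one measurement with a single layer, one with nl_scaling layers, yielding a base and a per_layer figure per device) suggests a two-point linear fit. A minimal sketch under that assumption; the struct and field names mirror the diff, while the function name and implementation here are illustrative rather than the PR's:

```cpp
#include <cstdint>
#include <vector>

struct memory_scaling {
    int64_t base      = 0; // memory that does not scale with the number of layers
    int64_t per_layer = 0; // additional memory per layer
};

// mem1[id]: measured memory on device id with 1 layer, memn[id]: with nl layers (nl > 1).
// Solving mem = base + per_layer*n for the two measurement points on each device:
static std::vector<memory_scaling> estimate_memory_scaling(
        const std::vector<int64_t> & mem1, const std::vector<int64_t> & memn, const uint32_t nl) {
    std::vector<memory_scaling> result(mem1.size());
    for (size_t id = 0; id < mem1.size(); id++) {
        result[id].per_layer = (memn[id] - mem1[id]) / int64_t(nl - 1);
        result[id].base      = mem1[id] - result[id].per_layer;
    }
    return result;
}
```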
@@ -364,16 +393,14 @@ bool llama_params_fit(
                 ngl_per_device.back().full = hp_ngl + 1;
 
                 // usable_memory: free memory above margin that can be used for further allocations
-                std::vector<int64_t> usable_memory;
-                usable_memory.reserve(nd);
                 for (size_t id = 0; id < nd - 1; id++) {
                     int64_t um = dmds_last[id].free - margin - spl_full[id].base;
                     um = std::max(um, int64_t(0));
-                    usable_memory.push_back(um);
+                    usable_memory[id] = um;
                 }
                 {
                     const llama_memory_breakdown_data & mb = dmds_last.back().mb;
-                    usable_memory.push_back(dmds_last.back().free - int64_t(mb.model + mb.context + mb.context) - margin);
+                    usable_memory.back() = dmds_last.back().free - int64_t(mb.model + mb.context + mb.context) - margin;
                 }
 
                 // convert some layers on the last device from full layers to dense-only layers
@@ -425,9 +452,21 @@ bool llama_params_fit(
 
             // iteratively increase the number of partial layers until the memory consumption is low enough
             std::vector<ngl> ngl_per_device(nd);
-            for (uint32_t global_ngl_part = 0; global_ngl_part < hp_ngl; global_ngl_part++) {
-                if (distribute_layers(ngl_per_device, global_ngl_part)) {
-                    break;
+            {
+                std::vector<int64_t> usable_memory(nd);
+                for (uint32_t global_ngl_part = 0; global_ngl_part < hp_ngl; global_ngl_part++) {
+                    const bool success = distribute_layers(ngl_per_device, usable_memory, global_ngl_part);
+                    std::string ngl_per_device_str = std::string(ngl_per_device[0]);
+                    std::string usable_memory_str  = std::to_string(usable_memory[0]/MiB);
+                    for (size_t id = 1; id < nd; id++) {
+                        ngl_per_device_str += ", " + std::string(ngl_per_device[id]);
+                        usable_memory_str  += ", " + std::to_string(usable_memory[id]/MiB);
+                    }
+                    LLAMA_LOG_DEBUG("%s: global_ngl_part=%" PRIu32 ", success=%d, ngl_per_device=[%s], usable_memory[MiB]=[%s]\n",
+                        __func__, global_ngl_part, success ? 1 : 0, ngl_per_device_str.c_str(), usable_memory_str.c_str());
+                    if (success) {
+                        break;
+                    }
                 }
             }
 
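The rewritten loop above threads usable_memory through distribute_layers mainly so that the per-iteration state can be logged; the search itself is unchanged: keep converting full layers into dense-only layers until the layout fits. A toy, self-contained model of that search with a stubbed cost check (the numbers and the fits() helper are invented for illustration):

```cpp
#include <cinttypes>
#include <cstdint>
#include <cstdio>

// Toy stand-in for distribute_layers: a full layer costs 4 units, a dense-only
// layer 1 unit, and the layout "fits" if the total stays within a fixed budget.
static bool fits(const uint32_t n_layers, const uint32_t n_part, const int64_t budget) {
    const int64_t need = int64_t(n_layers - n_part)*4 + int64_t(n_part)*1;
    return need <= budget;
}

int main() {
    const uint32_t n_layers = 32;
    const int64_t  budget   = 100;
    // Iteratively increase the number of dense-only layers until the budget is met.
    for (uint32_t n_part = 0; n_part <= n_layers; n_part++) {
        if (fits(n_layers, n_part, budget)) {
            printf("fits with %" PRIu32 " dense-only layers out of %" PRIu32 "\n", n_part, n_layers);
            break;
        }
    }
    return 0;
}
```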
@@ -504,8 +543,14 @@ bool llama_params_fit(
     //    all layers are the same so simply determine how many layers will fit per device
 
     const uint32_t nl_scaling = hp_ngl / nd;
-    const std::vector<memory_scaling> ms = get_memory_scaling(
-        get_memory_for_const_layer(1), get_memory_for_const_layer(nl_scaling), nl_scaling);
+    std::vector<memory_scaling> ms;
+    {
+        LLAMA_LOG_DEBUG("%s: getting device memory data for 1 full layer:\n", __func__);
+        auto tmp1 = get_memory_for_const_layer(1);
+        LLAMA_LOG_DEBUG("%s: getting device memory data for %" PRIu32 " full layers:\n", __func__, nl_scaling);
+        auto tmpn = get_memory_for_const_layer(nl_scaling);
+        ms = get_memory_scaling(tmp1, tmpn, nl_scaling);
+    }
 
     mparams->n_gpu_layers = 0;
     std::vector<uint32_t> ngl_per_device;
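For dense models the comment above reduces the problem to simple arithmetic: once the base and per-layer costs are known, the number of layers that fit on a device follows directly. A worked example with invented numbers, using the same linear model as the scaling sketch earlier:

```cpp
#include <algorithm>
#include <cinttypes>
#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical per-device figures in MiB: free memory, safety margin,
    // layer-independent base cost, and cost per full layer.
    const int64_t free_mib = 24000, margin_mib = 1024, base_mib = 1500, per_layer_mib = 450;
    const int64_t usable   = std::max<int64_t>(free_mib - margin_mib - base_mib, 0);
    const int64_t n_fit    = usable / per_layer_mib; // full layers that fit on this device
    printf("usable=%" PRId64 " MiB -> %" PRId64 " layers fit on this device\n", usable, n_fit);
    return 0;
}
```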