@@ -49,8 +49,7 @@ const DOWNLOAD_RETRY_BACKOFF_BASE: Duration = Duration::from_secs(2);
49
49
const DOWNLOAD_RETRY_CHECK_INTERVAL : Duration = Duration :: from_secs ( 1 ) ;
50
50
const OPPROTUNISTIC_WITNESS_INTERVAL : Duration = Duration :: from_millis ( 500 ) ;
51
51
const CHECK_CONNECTION_INTERVAL : Duration = Duration :: from_secs ( 10 ) ;
52
- const MAX_ERRORS_PER_PEER : u8 = 3 ;
53
- const MAX_RETRIES_PER_PEER : u8 = 5 ;
52
+ const MAX_ERRORS_PER_PEER : u8 = 5 ;
54
53
55
54
impl < T : NodeIdentity , A : AuthenticatableIdentity + ' static , B : Backend < T > + ' static >
56
55
Client < T , A , B >
@@ -103,6 +102,9 @@ impl<T: NodeIdentity, A: AuthenticatableIdentity + 'static, B: Backend<T> + 'sta
103
102
let max_concurrent_parameter_requests =
104
103
init_config. max_concurrent_parameter_requests ;
105
104
105
+ let mut current_downloaded_parameters = 0_u16 ;
106
+ let mut total_parameters = None ;
107
+
106
108
let mut run = RunManager :: < T , A > :: new ( RunInitConfigAndIO {
107
109
init_config,
108
110
@@ -122,7 +124,6 @@ impl<T: NodeIdentity, A: AuthenticatableIdentity + 'static, B: Backend<T> + 'sta
122
124
let mut sharable_model = SharableModel :: empty ( ) ;
123
125
let peer_manager = Arc :: new ( PeerManagerHandle :: new (
124
126
MAX_ERRORS_PER_PEER ,
125
- MAX_RETRIES_PER_PEER ,
126
127
param_requests_cancel_token. clone ( ) ,
127
128
) ) ;
128
129
@@ -266,22 +267,28 @@ impl<T: NodeIdentity, A: AuthenticatableIdentity + 'static, B: Backend<T> + 'sta
266
267
let _ = trace_span!( "NetworkEvent::DownloadComplete" , hash = %hash) . entered( ) ;
267
268
metrics. record_download_completed( hash, from) ;
268
269
if retried_downloads. remove( hash) . await . is_some( ) {
269
- debug !( "Successfully downloaded previously failed blob {}" , hex:: encode( hash) ) ;
270
+ info !( "Successfully downloaded previously failed blob {}" , hex:: encode( hash) ) ;
270
271
}
271
272
match download_data {
272
273
TransmittableDownload :: DistroResult ( distro_result) => {
273
274
debug!( "Download complete: step {} batch id {}" , distro_result. step, distro_result. batch_id) ;
274
275
run. apply_distro_result( hash, distro_result, None ) ;
275
276
} ,
276
277
TransmittableDownload :: ModelParameter ( parameter) => {
277
- debug!( "Download complete: parameter {}" , parameter. name( ) ?) ;
278
+ current_downloaded_parameters += 1 ;
279
+ info!( "Download complete: parameter {}" , parameter. name( ) ?) ;
280
+ if let Some ( total_parameters) = total_parameters {
281
+ info!( "Downloaded parameters total: {}/{}" , current_downloaded_parameters, total_parameters) ;
282
+ } else {
283
+ error!( "Total parameters not set" ) ;
284
+ }
278
285
sharable_model. add_parameter( parameter) . await ?;
279
286
if sharable_model. is_download_complete( ) {
280
287
sharable_model. send_init_parameters( ) ?;
281
288
}
282
289
} ,
283
290
TransmittableDownload :: ModelConfig ( config) => {
284
- debug !( "Download complete: model config" ) ;
291
+ info !( "Download complete: model config" ) ;
285
292
sharable_model. add_config( config) ?;
286
293
sharable_model. send_config( ) ?;
287
294
} ,
@@ -299,11 +306,12 @@ impl<T: NodeIdentity, A: AuthenticatableIdentity + 'static, B: Backend<T> + 'sta
299
306
// We often get an error from the iroh-blobs side after some time, so we use the base backoff to retry faster.
300
307
let backoff_duration = DOWNLOAD_RETRY_BACKOFF_BASE ;
301
308
let retry_time = Some ( std:: time:: Instant :: now( ) + backoff_duration) ;
302
- peer_manager. report_blob_ticket_download_error ( dl. blob_ticket. node_addr( ) . node_id) ;
309
+ peer_manager. report_blob_ticket_request_error ( dl. blob_ticket. node_addr( ) . node_id, Some ( dl . blob_ticket . clone ( ) ) ) ;
303
310
304
311
info!(
305
- "Model Sharing download failed {} time/s (will retry in {:?}): {}" ,
312
+ "Model Sharing download failed {} time/s with provider node {} (will retry in {:?}): {}" ,
306
313
retries + 1 ,
314
+ dl. blob_ticket. node_addr( ) . node_id,
307
315
backoff_duration,
308
316
dl. error
309
317
) ;
@@ -365,7 +373,7 @@ impl<T: NodeIdentity, A: AuthenticatableIdentity + 'static, B: Backend<T> + 'sta
365
373
}
366
374
} ,
367
375
Ok ( ticket) => {
368
- info!( parameter = parameter_name, "Sending requested model parameter blob ticket" ) ;
376
+ info!( parameter = parameter_name, hash = %ticket . hash ( ) , "Sending requested model parameter blob ticket" ) ;
369
377
if let Err ( e) = protocol_req_tx. send( Ok ( ticket) ) {
370
378
warn!( "Could not send model parameter {parameter_name} blob ticket. Error: {e:?}" ) ;
371
379
} ;
@@ -381,7 +389,7 @@ impl<T: NodeIdentity, A: AuthenticatableIdentity + 'static, B: Backend<T> + 'sta
381
389
}
382
390
} ,
383
391
Ok ( config_ticket) => {
384
- info!( "Sending requested model config blob ticket" ) ;
392
+ info!( hash = %config_ticket . hash ( ) , "Sending requested model config blob ticket" ) ;
385
393
if let Err ( e) = protocol_req_tx. send( Ok ( config_ticket) ) {
386
394
warn!( "Could not send model config blob ticket. Error: {e:?}" ) ;
387
395
}
@@ -470,20 +478,21 @@ impl<T: NodeIdentity, A: AuthenticatableIdentity + 'static, B: Backend<T> + 'sta
470
478
for ( hash, ticket, tag, download_type) in pending_retries {
471
479
let retries = retried_downloads. update_time( hash) . await ;
472
480
473
- debug!( "Retrying download for blob {} (attempt {})" , hash, retries) ;
474
-
475
481
metrics. record_download_retry( hash) ;
476
482
// We check the type of the failed download and send it to the appropriate channel to retry it
477
483
match download_type {
478
484
DownloadType :: DistroResult ( _) => {
485
+ info!( "Retrying download for distro result, (attempt {})" , retries) ;
479
486
let _ = tx_request_download. send( ( ticket, tag) ) ;
480
487
} ,
481
488
DownloadType :: ModelSharing ( inner) => {
482
489
match inner {
483
490
ModelRequestType :: Parameter ( parameter) => {
491
+ info!( "Retrying download for model parameter: {parameter}, (attempt {})" , retries) ;
484
492
let _ = tx_params_download. send( vec![ ( ticket, ModelRequestType :: Parameter ( parameter. clone( ) ) ) ] ) ;
485
493
} ,
486
494
ModelRequestType :: Config => {
495
+ info!( "Retrying download for model config, (attempt {})" , retries) ;
487
496
let _ = tx_config_download. send( ticket) ;
488
497
}
489
498
}
@@ -521,6 +530,7 @@ impl<T: NodeIdentity, A: AuthenticatableIdentity + 'static, B: Backend<T> + 'sta
521
530
sharable_model. update_config( config_string, tokenizer) ?;
522
531
}
523
532
Some ( ( param_names, tx_params_response) ) = rx_parameters_req. recv( ) => {
533
+ total_parameters = Some ( param_names. len( ) ) ;
524
534
sharable_model. initialize_parameters( & param_names, tx_params_response) ;
525
535
526
536
let tx_params_download = tx_params_download. clone( ) ;
0 commit comments