@@ -50,8 +50,9 @@ const int RunManagerPanther::N_PINGS_UNRESPONSIVE = 3;
5050const int RunManagerPanther::MIN_PING_INTERVAL_SECS = 60 ; // Ping each slave at most once every minute
5151const int RunManagerPanther::MAX_PING_INTERVAL_SECS = 120 ; // Ping each slave at least once every 2 minutes
5252const int RunManagerPanther::MAX_CONCURRENT_RUNS_LOWER_LIMIT = 1 ;
53- const int RunManagerPanther::IDLE_THREAD_SIGNAL_TIMEOUT_SECS = 10 ; // Allow up to 10s for the run_idle_async() thread to acknowledge signals (pause idling, terminate)
54-
53+ const int RunManagerPanther::IDLE_THREAD_SIGNAL_TIMEOUT_SECS = 10 ; // Allow up to 10s for the run_idle_async() thread to acknowledge signals (pause idling, terminate)
54+ const double RunManagerPanther::MIN_AVGRUNMINS_FOR_KILL = 0.08 ; // minimum avg runtime to try to kill and/or resched runs
55+ const int RunManagerPanther::SECONDS_BETWEEN_ECHOS = 1 ;
5556
5657AgentInfoRec::AgentInfoRec (int _socket_fd)
5758{
@@ -520,6 +521,7 @@ RunManagerAbstract::RUN_UNTIL_COND RunManagerPanther::run_until(RUN_UNTIL_COND c
520521 }
521522
522523 std::chrono::system_clock::time_point start_time = std::chrono::system_clock::now ();
524+ last_echo_time = std::chrono::system_clock::now ();
523525 double run_time_sec = 0.0 ;
524526 while (!all_runs_complete () && terminate_reason == RUN_UNTIL_COND::NORMAL)
525527 {
@@ -560,7 +562,7 @@ RunManagerAbstract::RUN_UNTIL_COND RunManagerPanther::run_until(RUN_UNTIL_COND c
560562 }
561563
562564 }
563- w_sleep (100 );
565+ w_sleep (10 );
564566 n_no_ops = 0 ;
565567 while (true )
566568 {
@@ -726,7 +728,7 @@ void RunManagerPanther::run_idle_async()
726728 idling.set (false );
727729
728730 // Sleep briefly to avoid spinlock
729- w_sleep (100 );
731+ w_sleep (10 );
730732 continue ;
731733 }
732734
@@ -816,7 +818,7 @@ void RunManagerPanther::end_run_idle_async()
816818 }
817819
818820 // Sleep to avoid spinlock
819- w_sleep (50 );
821+ w_sleep (10 );
820822 }
821823
822824 report (" Stopped idle ping thread, as Panther manager is shutting down." , false );
@@ -857,7 +859,7 @@ void RunManagerPanther::pause_idle()
857859 }
858860
859861 // Sleep to avoid spinlock
860- w_sleep (50 );
862+ w_sleep (10 );
861863 }
862864
863865 report (" Panther idle ping thread paused prior to scheduling runs." , false );
@@ -947,7 +949,7 @@ bool RunManagerPanther::listen(pest_utils::thread_flag* terminate/* = nullptr*/)
947949 fd_set read_fds; // temp file descriptor list for select()
948950 socklen_t addr_len;
949951 timeval tv;
950- tv.tv_sec = 1 ;
952+ tv.tv_sec = 0 ;
951953 tv.tv_usec = 0 ;
952954 read_fds = master; // copy it
953955 if (w_select (fdmax+1 , &read_fds, NULL , NULL , &tv) == -1 )
@@ -1006,7 +1008,7 @@ void RunManagerPanther::close_agents()
10061008 sock_nums.push_back (si.first );
10071009 for (auto si : sock_nums)
10081010 close_agent (si);
1009- w_sleep (100 );
1011+ w_sleep (10 );
10101012
10111013 }
10121014}
@@ -1107,7 +1109,7 @@ void RunManagerPanther::schedule_runs()
11071109 duration = it_agent->get_duration_minute ();
11081110 avg_runtime = it_agent->get_runtime_minute ();
11091111 if (avg_runtime <= 0 ) avg_runtime = global_avg_runtime;
1110- if (avg_runtime <= 0 ) avg_runtime = 1.0E+10 ;
1112+ if (avg_runtime <= 0 ) avg_runtime = 1.0E+300 ;
11111113 vector<int > overdue_kill_runs_vec = get_overdue_runs_over_kill_threshold (run_id);
11121114
11131115 if (failure_map.count (run_id) + overdue_kill_runs_vec.size () >= max_n_failure)
@@ -1131,7 +1133,9 @@ void RunManagerPanther::schedule_runs()
11311133 should_schedule = true ;
11321134 model_runs_timed_out += overdue_kill_runs_vec.size ();
11331135 }
1134- else if (((duration > overdue_giveup_minutes) || (duration > avg_runtime*overdue_giveup_fac))
1136+
1137+ else if (((duration > overdue_giveup_minutes) || ((duration > avg_runtime*overdue_giveup_fac) &&
1138+ (avg_runtime > MIN_AVGRUNMINS_FOR_KILL)))
11351139 && free_agent_list.empty ())
11361140 {
11371141 // If there are no free slaves kill the overdue ones
@@ -1147,7 +1151,8 @@ void RunManagerPanther::schedule_runs()
11471151 }
11481152 model_runs_timed_out += 1 ;
11491153 }
1150- else if (duration > avg_runtime*overdue_reched_fac)
1154+
1155+ else if ((duration > avg_runtime*overdue_reched_fac) && (avg_runtime > MIN_AVGRUNMINS_FOR_KILL))
11511156 {
11521157 // check how many concurrent runs are going
11531158 if (n_concur < max_concurrent_runs) should_schedule = true ;
@@ -1285,6 +1290,10 @@ void RunManagerPanther::echo()
12851290{
12861291 if (!should_echo)
12871292 return ;
1293+ std::chrono::system_clock::time_point now = chrono::system_clock::now ();
1294+ if (chrono::duration_cast<std::chrono::seconds> ( now- last_echo_time).count () < SECONDS_BETWEEN_ECHOS)
1295+ return ;
1296+ last_echo_time = now;
12881297 map<string, int > stats_map = get_agent_stats ();
12891298 cout << get_time_string_short () << " mn:" << setw (5 ) << setprecision (2 ) << left << get_global_runtime_minute () << " runs("
12901299 << " C" << setw (5 ) << left << model_runs_done
@@ -1939,7 +1948,9 @@ void RunManagerPanther::kill_all_active_runs()
19391948 if (avg_runtime <= 0 ) avg_runtime = get_global_runtime_minute ();;
19401949 if (avg_runtime <= 0 ) avg_runtime = 1.0E+10 ;
19411950 duration = i->second ->get_duration_minute ();
1942- if ((just_quit) || (duration > overdue_giveup_minutes) || (duration >= avg_runtime*overdue_giveup_fac))
1951+ if ((just_quit) || (duration > overdue_giveup_minutes) ||
1952+ ((duration >= avg_runtime*overdue_giveup_fac) &&
1953+ (avg_runtime > MIN_AVGRUNMINS_FOR_KILL)))
19431954 {
19441955 sock_id_vec.push_back (i->second ->get_socket_fd ());
19451956 }
@@ -2132,7 +2143,7 @@ RunManagerPanther::~RunManagerPanther(void)
21322143 err = w_close (listener);
21332144 FD_CLR (listener, &master);
21342145 // this is needed to ensure that the first slave closes properly
2135- w_sleep (500 );
2146+ w_sleep (10 );
21362147 for (int i = 0 ; i <= fdmax; i++)
21372148 {
21382149 if (FD_ISSET (i, &master))
@@ -2248,10 +2259,10 @@ void RunManagerYAMRCondor::cleanup(int cluster)
22482259 stringstream ss;
22492260 ss << " condor_rm " << cluster << " 1>cr_temp.stdout 2>cr_temp.stderr" ;
22502261 system (ss.str ().c_str ());
2251- w_sleep (500 );
2262+ w_sleep (10 );
22522263 ss.str (string ());
22532264 ss << " condor_rm " << cluster << " -forcex 1>cr_temp.stdout 2>cr_temp.stderr" ;
2254- w_sleep (500 );
2265+ w_sleep (10 );
22552266 system (ss.str ().c_str ());
22562267 RunManagerPanther::close_agents ();
22572268 cout << " all agents freed " << endl << endl;
0 commit comments