Skip to content

Commit 5e36ab8

Browse files
committed
Merge remote-tracking branch 'origin/develop'
2 parents 932b217 + b20768b commit 5e36ab8

File tree

7 files changed

+36
-23
lines changed

7 files changed

+36
-23
lines changed
2.1 MB
Binary file not shown.

documentation/pestpp_users_manual.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11

22
<img src="./media/image1.png" style="width:6.26806in;height:1.68194in" alt="A close up of a purple sign Description automatically generated" />
33

4-
# <a id='s1' />Version 5.2.15
4+
# <a id='s1' />Version 5.2.16
55

66
<img src="./media/image2.png" style="width:6.26806in;height:3.05972in" />
77

88
PEST++ Development Team
99

10-
November 2024
10+
December 2024
1111

1212
# <a id='s2' />Acknowledgements
1313

@@ -70,7 +70,7 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI
7070

7171
# Table of Contents
7272

73-
- [Version 5.2.15](#s1)
73+
- [Version 5.2.16](#s1)
7474
- [Acknowledgements](#s2)
7575
- [Preface](#s3)
7676
- [License](#s4)

src/libs/common/config_os.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
#define CONFIG_OS_H_
33

44

5-
#define PESTPP_VERSION "5.2.15";
5+
#define PESTPP_VERSION "5.2.16";
66

77
#if defined(_WIN32) || defined(_WIN64)
88
#define OS_WIN

src/libs/pestpp_common/EnsembleMethodUtils.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7750,7 +7750,7 @@ void EnsembleMethod::reset_par_ensemble_to_prior_mean(){
77507750
ss << "iteration:" << iter;
77517751
vector<int> temp;
77527752
ofstream& frec = file_manager.rec_ofstream();
7753-
oe = oe_base;
7753+
oe.reserve(oe_base.get_real_names(),oe.get_var_names());
77547754
weights = weights_base;
77557755
run_ensemble_util(performance_log,frec,new_pe,oe,run_mgr_ptr,false,temp,NetPackage::NULL_DA_CYCLE, ss.str());
77567756
pe = new_pe;

src/libs/run_managers/yamr/PantherAgent.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ void PANTHERAgent::process_ctl_file(const string &ctl_filename)
151151
mi.set_fill_tpl_zeros(pest_scenario.get_pestpp_options().get_fill_tpl_zeros());
152152
mi.set_tpl_force_decimal(pest_scenario.get_pestpp_options().get_tpl_force_decimal());
153153
mi.set_num_threads(pest_scenario.get_pestpp_options().get_num_tpl_ins_threads());
154-
mi.set_sleep_ms(100);
154+
mi.set_sleep_ms(5);
155155
restart_on_error = pest_scenario.get_pestpp_options().get_panther_agent_restart_on_error();
156156
max_time_without_master_ping_seconds = pest_scenario.get_pestpp_options().get_panther_agent_no_ping_timeout_secs();
157157
FileManager fm("panther_agent");
@@ -538,7 +538,7 @@ std::pair<NetPackage::PackType,std::string> PANTHERAgent::run_model(Parameters &
538538
void PANTHERAgent::run_async(pest_utils::thread_flag* terminate, pest_utils::thread_flag* finished, exception_ptr& run_exception,
539539
Parameters* pars, Observations* obs)
540540
{
541-
mi.set_sleep_ms(100);
541+
mi.set_sleep_ms(5);
542542
mi.run(terminate,finished,run_exception, pars, obs);
543543
}
544544

src/libs/run_managers/yamr/RunManagerPanther.cpp

Lines changed: 26 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,9 @@ const int RunManagerPanther::N_PINGS_UNRESPONSIVE = 3;
5050
const int RunManagerPanther::MIN_PING_INTERVAL_SECS = 60; // Ping each slave at most once every minute
5151
const int RunManagerPanther::MAX_PING_INTERVAL_SECS = 120; // Ping each slave at least once every 2 minutes
5252
const int RunManagerPanther::MAX_CONCURRENT_RUNS_LOWER_LIMIT = 1;
53-
const int RunManagerPanther::IDLE_THREAD_SIGNAL_TIMEOUT_SECS = 10; // Allow up to 10s for the run_idle_async() thread to acknowledge signals (pause idling, terminate)
54-
53+
const int RunManagerPanther::IDLE_THREAD_SIGNAL_TIMEOUT_SECS = 10; // Allow up to 10s for the run_idle_async() thread to acknowledge signals (pause idling, terminate)
54+
const double RunManagerPanther::MIN_AVGRUNMINS_FOR_KILL = 0.08; // average runtime (minutes) must exceed this before the overdue factors are used to kill and/or reschedule runs
55+
const int RunManagerPanther::SECONDS_BETWEEN_ECHOS = 1;
5556

5657
AgentInfoRec::AgentInfoRec(int _socket_fd)
5758
{
@@ -520,6 +521,7 @@ RunManagerAbstract::RUN_UNTIL_COND RunManagerPanther::run_until(RUN_UNTIL_COND c
520521
}
521522

522523
std::chrono::system_clock::time_point start_time = std::chrono::system_clock::now();
524+
last_echo_time = std::chrono::system_clock::now();
523525
double run_time_sec = 0.0;
524526
while (!all_runs_complete() && terminate_reason == RUN_UNTIL_COND::NORMAL)
525527
{
@@ -560,7 +562,7 @@ RunManagerAbstract::RUN_UNTIL_COND RunManagerPanther::run_until(RUN_UNTIL_COND c
560562
}
561563

562564
}
563-
w_sleep(100);
565+
w_sleep(10);
564566
n_no_ops = 0;
565567
while (true)
566568
{
@@ -726,7 +728,7 @@ void RunManagerPanther::run_idle_async()
726728
idling.set(false);
727729

728730
// Sleep briefly to avoid spinlock
729-
w_sleep(100);
731+
w_sleep(10);
730732
continue;
731733
}
732734

@@ -816,7 +818,7 @@ void RunManagerPanther::end_run_idle_async()
816818
}
817819

818820
// Sleep to avoid spinlock
819-
w_sleep(50);
821+
w_sleep(10);
820822
}
821823

822824
report("Stopped idle ping thread, as Panther manager is shutting down.", false);
@@ -857,7 +859,7 @@ void RunManagerPanther::pause_idle()
857859
}
858860

859861
// Sleep to avoid spinlock
860-
w_sleep(50);
862+
w_sleep(10);
861863
}
862864

863865
report("Panther idle ping thread paused prior to scheduling runs.", false);
@@ -947,7 +949,7 @@ bool RunManagerPanther::listen(pest_utils::thread_flag* terminate/* = nullptr*/)
947949
fd_set read_fds; // temp file descriptor list for select()
948950
socklen_t addr_len;
949951
timeval tv;
950-
tv.tv_sec = 1;
952+
tv.tv_sec = 0;
951953
tv.tv_usec = 0;
952954
read_fds = master; // copy it
953955
if (w_select(fdmax+1, &read_fds, NULL, NULL, &tv) == -1)
@@ -1006,7 +1008,7 @@ void RunManagerPanther::close_agents()
10061008
sock_nums.push_back(si.first);
10071009
for (auto si : sock_nums)
10081010
close_agent(si);
1009-
w_sleep(100);
1011+
w_sleep(10);
10101012

10111013
}
10121014
}
@@ -1107,7 +1109,7 @@ void RunManagerPanther::schedule_runs()
11071109
duration = it_agent->get_duration_minute();
11081110
avg_runtime = it_agent->get_runtime_minute();
11091111
if (avg_runtime <= 0) avg_runtime = global_avg_runtime;
1110-
if (avg_runtime <= 0) avg_runtime = 1.0E+10;
1112+
if (avg_runtime <= 0) avg_runtime = 1.0E+300;
11111113
vector<int> overdue_kill_runs_vec = get_overdue_runs_over_kill_threshold(run_id);
11121114

11131115
if (failure_map.count(run_id) + overdue_kill_runs_vec.size() >= max_n_failure)
@@ -1131,7 +1133,9 @@ void RunManagerPanther::schedule_runs()
11311133
should_schedule = true;
11321134
model_runs_timed_out += overdue_kill_runs_vec.size();
11331135
}
1134-
else if (((duration > overdue_giveup_minutes) || (duration > avg_runtime*overdue_giveup_fac))
1136+
1137+
else if (((duration > overdue_giveup_minutes) || ((duration > avg_runtime*overdue_giveup_fac) &&
1138+
(avg_runtime > MIN_AVGRUNMINS_FOR_KILL)))
11351139
&& free_agent_list.empty())
11361140
{
11371141
// If there are no free slaves kill the overdue ones
@@ -1147,7 +1151,8 @@ void RunManagerPanther::schedule_runs()
11471151
}
11481152
model_runs_timed_out += 1;
11491153
}
1150-
else if (duration > avg_runtime*overdue_reched_fac)
1154+
1155+
else if ((duration > avg_runtime*overdue_reched_fac) && (avg_runtime > MIN_AVGRUNMINS_FOR_KILL))
11511156
{
11521157
//check how many concurrent runs are going
11531158
if (n_concur < max_concurrent_runs) should_schedule = true;
@@ -1285,6 +1290,10 @@ void RunManagerPanther::echo()
12851290
{
12861291
if (!should_echo)
12871292
return;
1293+
std::chrono::system_clock::time_point now = chrono::system_clock::now();
1294+
if (chrono::duration_cast<std::chrono::seconds> ( now- last_echo_time).count() < SECONDS_BETWEEN_ECHOS)
1295+
return;
1296+
last_echo_time = now;
12881297
map<string, int> stats_map = get_agent_stats();
12891298
cout << get_time_string_short() << " mn:" << setw(5) << setprecision(2) << left << get_global_runtime_minute() << " runs("
12901299
<< "C" << setw(5) << left << model_runs_done
@@ -1939,7 +1948,9 @@ void RunManagerPanther::kill_all_active_runs()
19391948
if (avg_runtime <= 0) avg_runtime = get_global_runtime_minute();;
19401949
if (avg_runtime <= 0) avg_runtime = 1.0E+10;
19411950
duration = i->second->get_duration_minute();
1942-
if ((just_quit) || (duration > overdue_giveup_minutes) || (duration >= avg_runtime*overdue_giveup_fac))
1951+
if ((just_quit) || (duration > overdue_giveup_minutes) ||
1952+
((duration >= avg_runtime*overdue_giveup_fac) &&
1953+
(avg_runtime > MIN_AVGRUNMINS_FOR_KILL)))
19431954
{
19441955
sock_id_vec.push_back(i->second->get_socket_fd());
19451956
}
@@ -2132,7 +2143,7 @@ RunManagerPanther::~RunManagerPanther(void)
21322143
err = w_close(listener);
21332144
FD_CLR(listener, &master);
21342145
// this is needed to ensure that the first slave closes properly
2135-
w_sleep(500);
2146+
w_sleep(10);
21362147
for (int i = 0; i <= fdmax; i++)
21372148
{
21382149
if (FD_ISSET(i, &master))
@@ -2248,10 +2259,10 @@ void RunManagerYAMRCondor::cleanup(int cluster)
22482259
stringstream ss;
22492260
ss << "condor_rm " << cluster << " 1>cr_temp.stdout 2>cr_temp.stderr";
22502261
system(ss.str().c_str());
2251-
w_sleep(500);
2262+
w_sleep(10);
22522263
ss.str(string());
22532264
ss << "condor_rm " << cluster << " -forcex 1>cr_temp.stdout 2>cr_temp.stderr";
2254-
w_sleep(500);
2265+
w_sleep(10);
22552266
system(ss.str().c_str());
22562267
RunManagerPanther::close_agents();
22572268
cout << " all agents freed " << endl << endl;

src/libs/run_managers/yamr/RunManagerPanther.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,8 @@ class RunManagerPanther : public RunManagerAbstract
127127
static const int MAX_PING_INTERVAL_SECS;
128128
static const int MAX_CONCURRENT_RUNS_LOWER_LIMIT;
129129
static const int IDLE_THREAD_SIGNAL_TIMEOUT_SECS;
130-
130+
static const double MIN_AVGRUNMINS_FOR_KILL;
131+
static const int SECONDS_BETWEEN_ECHOS;
131132
double overdue_reched_fac;
132133
double overdue_giveup_fac;
133134
double overdue_giveup_minutes;
@@ -141,6 +142,7 @@ class RunManagerPanther : public RunManagerAbstract
141142
long long bytes_transferred;
142143
int files_transferred;
143144
bool should_echo;
145+
std::chrono::system_clock::time_point last_echo_time;
144146
int nftx;
145147
fd_set master; // master file descriptor list
146148
list<AgentInfoRec> agent_info_set;

0 commit comments

Comments
 (0)