Skip to content

Commit 137b52a

Browse files
committed
Reworked the way that normal events are recorded: The decision as to whether to record a normal event is now made by the Anomalies class rather than by each ADOutlier derived class. Now all outlier algorithms will always have the normal event with the lowest score recorded, rather than just for HBOS
Added dedicated unit tests for Anomalies class Anomalies class checks inserted events have the expected label (had to fix a lot of unit tests!) The number of normal events in the Chimbuko log output now reflects the total number of normal events and not just the selection that were recorded Disabled the ADMonitoring check that counters are on thread 0 because a recent TAU change has made this no longer true!
1 parent a6c188f commit 137b52a

18 files changed

+293
-166
lines changed

include/chimbuko/util/Anomalies.hpp

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,37 +12,52 @@ namespace chimbuko{
1212
public:
1313
enum class EventType { Outlier, Normal };
1414

15+
Anomalies(): m_n_events_total(0){}
16+
17+
/**
18+
* @brief Record an anomalous event
19+
*/
20+
void recordAnomaly(CallListIterator_t event);
21+
1522
/**
16-
* @brief Insert a detected outlier/normal execution
23+
* @brief Record a normal event if it has the lowest score for its function
24+
* @return true if the event was recorded, false otherwise
1725
*/
18-
void insert(CallListIterator_t event, EventType type);
26+
bool recordNormalEventConditional(CallListIterator_t event);
27+
28+
/**
29+
* @brief Get all recorded outliers/normal events
30+
*/
31+
const std::vector<CallListIterator_t> & allEventsRecorded(EventType type) const{ return type == EventType::Outlier ? m_all_outliers : m_all_normal_execs; }
1932

2033
/**
21-
* @brief Get the outlier/normal events associated with a given function
34+
* @brief Get number of outliers/normal events recorded
35+
*
36+
* Note: This is not all of the normal events, only the selection of normal events that we keep for comparison purposes
2237
*/
23-
const std::vector<CallListIterator_t> & funcEvents(const unsigned long func_id, EventType type) const;
38+
size_t nEventsRecorded(EventType type) const { return allEventsRecorded(type).size(); }
2439

2540
/**
26-
* @brief Get all outliers/normal events
41+
* @brief Get the total number of events analyzed (both recorded and unrecorded)
2742
*/
28-
const std::vector<CallListIterator_t> & allEvents(EventType type) const{ return type == EventType::Outlier ? m_all_outliers : m_all_normal_execs; }
43+
size_t nEvents() const{ return m_n_events_total; }
2944

3045
/**
31-
* @brief Get number of outliers/normal events associated with a given function
46+
* @brief Count the number of events recorded of a given type for a particular function
3247
*/
33-
size_t nFuncEvents(const unsigned long func_id, EventType type) const{ return funcEvents(func_id, type).size(); }
48+
size_t nFuncEventsRecorded(unsigned long fid, EventType type) const;
3449

3550
/**
36-
* @brief Get number of outliers/normal events
51+
* @brief Return an array of iterators to events of the given type associated with a specific function
3752
*/
38-
size_t nEvents(EventType type) const { return allEvents(type).size(); }
53+
std::vector<CallListIterator_t> funcEventsRecorded(unsigned long fid, EventType type) const;
3954

4055
private:
4156
std::vector<CallListIterator_t> m_all_outliers; /**< Array of outliers */
42-
std::unordered_map<unsigned long, std::vector<CallListIterator_t> > m_func_outliers; /**< Map of function index to associated outliers */
43-
4457
std::vector<CallListIterator_t> m_all_normal_execs; /**< Array of normal executions (the algorithm will capture a limited number of these for comparison with outliers)*/
45-
std::unordered_map<unsigned long, std::vector<CallListIterator_t> > m_func_normal_execs; /**< Map of function index to associated normal executions */
58+
std::unordered_map<unsigned long, size_t> m_func_normal_exec_idx; /**< Map of function index to the index of the array entry containing the normal execution recorded for that function*/
59+
60+
size_t m_n_events_total; /**< Total number of events analyzed (both recorded and unrecorded)*/
4661
};
4762

4863
};

sim/src/ad.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -265,14 +265,14 @@ void ADsim::step(const unsigned long step){
265265
m_outlier->linkExecDataMap(&this_step_func_execs);
266266
anom = m_outlier->run(step);
267267
}else{
268-
//Collect outliers and 1 normal event/func into Anomalies object
268+
//Collect outliers and 1 normal event/func into Anomalies object (Anomalies object decides which to keep)
269269
for(const auto &exec_it : this_step_execs){
270-
if(exec_it->get_label() == -1) anom.insert(exec_it, Anomalies::EventType::Outlier);
271-
else if(anom.nFuncEvents(exec_it->get_fid(), Anomalies::EventType::Normal) == 0) anom.insert(exec_it, Anomalies::EventType::Normal);
270+
if(exec_it->get_label() == -1) anom.recordAnomaly(exec_it);
271+
else anom.recordNormalEventConditional(exec_it);
272272
}
273273
}
274-
int nanom = anom.nEvents(Anomalies::EventType::Outlier);
275-
int nnorm = anom.nEvents(Anomalies::EventType::Normal);
274+
int nanom = anom.nEventsRecorded(Anomalies::EventType::Outlier);
275+
int nnorm = anom.nEventsRecorded(Anomalies::EventType::Normal);
276276
std::cout << "Step " << step << " rank " << m_rid << " Anomalies object contains : " << nanom << " anomalies and " << nnorm << " normal events (max 1)" << std::endl;
277277

278278
//Extract provenance data and send to the provDB

src/ad/ADAnomalyProvenance.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,7 @@ void ADAnomalyProvenance::getProvenanceEntries(std::vector<nlohmann::json> &anom
210210

211211
//Put new normal event provenance into m_normalevent_prov
212212
timer.start();
213-
for(auto norm_it : anomalies.allEvents(Anomalies::EventType::Normal)){
213+
for(auto norm_it : anomalies.allEventsRecorded(Anomalies::EventType::Normal)){
214214
timer2.start();
215215
m_normalevents.addNormalEvent(norm_it->get_pid(), norm_it->get_rid(), norm_it->get_tid(), norm_it->get_fid(), getEventProvenance(*norm_it, step, first_event_ts, last_event_ts));
216216
if(m_perf) m_perf->add("ad_extract_send_prov_normalevent_update_per_event_ms", timer2.elapsed_ms());
@@ -228,7 +228,7 @@ void ADAnomalyProvenance::getProvenanceEntries(std::vector<nlohmann::json> &anom
228228
timer.start();
229229
std::unordered_set<unsigned long> normal_event_fids;
230230

231-
for(auto anom_it : anomalies.allEvents(Anomalies::EventType::Outlier)){
231+
for(auto anom_it : anomalies.allEventsRecorded(Anomalies::EventType::Outlier)){
232232
timer2.start();
233233
if(anom_it->get_exclusive() < m_min_anom_time) continue; //skip executions with too short runtimes to avoid filling the database with irrelevant anomalies
234234

src/ad/ADLocalAnomalyMetrics.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ ADLocalAnomalyMetrics::State::State(const ADLocalAnomalyMetrics &parent){
1515

1616
ADLocalAnomalyMetrics::ADLocalAnomalyMetrics(int app, int rank, int step, unsigned long first_event_ts, unsigned long last_event_ts, const Anomalies &anom):
1717
m_app(app), m_rank(rank), m_step(step), m_first_event_ts(first_event_ts), m_last_event_ts(last_event_ts), m_perf(nullptr){
18-
const std::vector<CallListIterator_t> & outliers = anom.allEvents(Anomalies::EventType::Outlier);
18+
const std::vector<CallListIterator_t> & outliers = anom.allEventsRecorded(Anomalies::EventType::Outlier);
1919
for(auto const &cit: outliers){
2020
int fid = cit->get_fid();
2121
auto fit = m_func_anom_metrics.find(fid);

src/ad/ADLocalFuncStatistics.cpp

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -59,18 +59,15 @@ void ADLocalFuncStatistics::gatherStatistics(const ExecDataMap_t* exec_data){
5959
}
6060

6161
void ADLocalFuncStatistics::gatherAnomalies(const Anomalies &anom){
62-
//Loop over functions and get the number of anomalies
63-
for(auto &fstats: m_funcstats){
64-
unsigned long func_id = fstats.second.id;
65-
fstats.second.n_anomaly += anom.nFuncEvents(func_id, Anomalies::EventType::Outlier);
66-
}
67-
6862
//Gather information on the number of anomalies and stats on their scores
69-
const std::vector<CallListIterator_t> &anomalies = anom.allEvents(Anomalies::EventType::Outlier);
63+
const std::vector<CallListIterator_t> &anomalies = anom.allEventsRecorded(Anomalies::EventType::Outlier);
7064
m_anom_data.incr_n_anomalies(anomalies.size());
7165

72-
for(auto const &it : anomalies)
66+
for(auto const &it : anomalies){
7367
m_anom_data.add_outlier_score(it->get_outlier_score());
68+
++m_funcstats[it->get_fid()].n_anomaly; //increment func anomalies count
69+
}
70+
7471
}
7572

7673
ADLocalFuncStatistics::State ADLocalFuncStatistics::get_state() const{

src/ad/ADMonitoring.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ void ADMonitoring::extractCounters(const CountersByIndex_t &counters){
5757

5858
CounterDataListIterator_t cdi = *vit->second.rbegin();
5959
const CounterData_t &cd = *cdi;
60-
if(cd.get_tid() != 0) recoverable_error("Expected counter to be on thread 0!");
60+
//if(cd.get_tid() != 0) recoverable_error("Expected counter to be on thread 0! Info: "+ cd.get_json().dump()); //This doesn't seem to be true any more because Kevin made changes to the monitoring plugin
6161
entry.value = cd.get_value();
6262
entry.assigned = true;
6363
m_timestamp = cd.get_ts(); //overwrite timestamp of update (all monitoring data in a dump should arrive at the same time)

src/ad/ADOutlier.cpp

Lines changed: 17 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -242,16 +242,11 @@ unsigned long ADOutlierSSTD::compute_outliers(Anomalies &outliers,
242242
verboseStream << "!!!!!!!Detected outlier on func id " << func_id << " (" << itt->get_funcname() << ") on thread " << itt->get_tid()
243243
<< " runtime " << runtime << " mean " << mean << " std " << std << std::endl;
244244
n_outliers += 1;
245-
outliers.insert(itt, Anomalies::EventType::Outlier); //insert into data structure containing captured anomalies
246-
}else{
247-
//Capture maximum of one normal execution per io step
248-
if(outliers.nFuncEvents(func_id, Anomalies::EventType::Normal) == 0){
249-
verboseStream << "Detected normal event on func id " << func_id << " (" << itt->get_funcname() << ") on thread " << itt->get_tid()
250-
<< " runtime " << runtime << " mean " << mean << " std " << std << std::endl;
251-
252-
outliers.insert(itt, Anomalies::EventType::Normal);
253-
}
254-
}
245+
outliers.recordAnomaly(itt); //insert into data structure containing captured anomalies
246+
}else if(outliers.recordNormalEventConditional(itt)){
247+
verboseStream << "Recorded normal event on func id " << func_id << " (" << itt->get_funcname() << ") on thread " << itt->get_tid()
248+
<< " runtime " << runtime << " mean " << mean << " std " << std << std::endl;
249+
}
255250
}
256251
}
257252

@@ -460,11 +455,6 @@ unsigned long ADOutlierHBOS::compute_outliers(Anomalies &outliers,
460455
const double bin_width = hist.bin_edges().at(1) - hist.bin_edges().at(0);
461456
verboseStream << "Bin width: " << bin_width << std::endl;
462457

463-
//Maintain the normal execution with the lowest score (most likely)
464-
bool lowest_score_set = false;
465-
double lowest_score_val = std::numeric_limits<double>::max();
466-
CallListIterator_t lowest_score_it;
467-
468458
int top_out = 0;
469459
for (auto itt : data) {
470460
if (itt->get_label() == 0) {
@@ -508,31 +498,18 @@ unsigned long ADOutlierHBOS::compute_outliers(Anomalies &outliers,
508498
if (ad_score >= l_threshold) {
509499
itt->set_label(-1);
510500
verboseStream << "!!!!!!!Detected outlier on func id " << func_id << " (" << itt->get_funcname() << ") on thread " << itt->get_tid() << " runtime " << runtime_i << " score " << ad_score << " (threshold " << l_threshold << ")" << std::endl;
511-
outliers.insert(itt, Anomalies::EventType::Outlier); //insert into data structure containing captured anomalies
501+
outliers.recordAnomaly(itt); //insert into data structure containing captured anomalies
512502
n_outliers += 1;
513503
}else {
514-
//Capture maximum of one normal execution per io step
515504
itt->set_label(1);
516-
517-
//Record the normal event with the lowest score (most likely)
518-
if(ad_score < lowest_score_val){
519-
lowest_score_val = ad_score;
520-
lowest_score_it = itt;
521-
lowest_score_set = true;
505+
if(outliers.recordNormalEventConditional(itt)){
506+
verboseStream << "Recorded normal event on func id " << func_id << " (" << itt->get_funcname() << ") on thread " << itt->get_tid() << " runtime " << this->getStatisticValue(*itt) << " score " << itt->get_outlier_score() << " (threshold " << l_threshold << ")" << std::endl;
522507
}
523-
524508
}
525509

526510
}//if unlabeled point
527511
} //loop over data points
528512

529-
//Record only the normal event with the lowest score
530-
if(lowest_score_set){
531-
auto itt = lowest_score_it;
532-
verboseStream << "Recorded normal event on func id " << func_id << " (" << itt->get_funcname() << ") on thread " << itt->get_tid() << " runtime " << this->getStatisticValue(*itt) << " score " << itt->get_outlier_score() << " (threshold " << l_threshold << ")" << std::endl;
533-
outliers.insert(itt, Anomalies::EventType::Normal);
534-
}
535-
536513
return n_outliers;
537514
}
538515

@@ -745,23 +722,19 @@ unsigned long ADOutlierCOPOD::compute_outliers(Anomalies &outliers,
745722
itt->set_outlier_score(ad_score);
746723
verboseStream << "runtime: " << runtime_i << " ad_score: " << ad_score << ", l_threshold: " << l_threshold << std::endl;
747724

748-
//Compare the ad_score with the threshold
749725
if (ad_score >= l_threshold) {
750726
itt->set_label(-1);
751-
verboseStream << "!!!!!!!Detected outlier on func id " << func_id << " (" << itt->get_funcname() << ") on thread " << itt->get_tid() << " runtime " << runtime_i << std::endl;
752-
outliers.insert(itt, Anomalies::EventType::Outlier); //insert into data structure containing captured anomalies
753-
++n_outliers;
754-
}
755-
else {
756-
//Capture maximum of one normal execution per io step
727+
verboseStream << "!!!!!!!Detected outlier on func id " << func_id << " (" << itt->get_funcname() << ") on thread " << itt->get_tid() << " runtime " << runtime_i << " score " << ad_score << " (threshold " << l_threshold << ")" << std::endl;
728+
outliers.recordAnomaly(itt); //insert into data structure containing captured anomalies
729+
n_outliers += 1;
730+
}else {
757731
itt->set_label(1);
758-
if(outliers.nFuncEvents(func_id, Anomalies::EventType::Normal) == 0) {
759-
verboseStream << "Detected normal event on func id " << func_id << " (" << itt->get_funcname() << ") on thread " << itt->get_tid() << " runtime " << runtime_i << std::endl;
760-
outliers.insert(itt, Anomalies::EventType::Normal);
761-
}
732+
if(outliers.recordNormalEventConditional(itt)){
733+
verboseStream << "Recorded normal event on func id " << func_id << " (" << itt->get_funcname() << ") on thread " << itt->get_tid() << " runtime " << this->getStatisticValue(*itt) << " score " << itt->get_outlier_score() << " (threshold " << l_threshold << ")" << std::endl;
734+
}
762735
}
763-
}
764-
}
736+
}//unlabeled point
737+
}//data loop
765738

766739
return n_outliers;
767740
}

src/chimbuko.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -663,10 +663,11 @@ bool Chimbuko::runFrame(unsigned long long& n_func_events,
663663
timer.start();
664664
Anomalies anomalies = m_outlier->run(step);
665665
m_perf.add("ad_run_anom_detection_time_ms", timer.elapsed_ms());
666-
m_perf.add("ad_run_anomaly_count", anomalies.nEvents(Anomalies::EventType::Outlier));
666+
m_perf.add("ad_run_anomaly_count", anomalies.nEventsRecorded(Anomalies::EventType::Outlier));
667+
m_perf.add("ad_run_n_exec_analyzed", anomalies.nEvents());
667668

668-
int nout = anomalies.nEvents(Anomalies::EventType::Outlier);
669-
int nnormal = anomalies.nEvents(Anomalies::EventType::Normal);
669+
int nout = anomalies.nEventsRecorded(Anomalies::EventType::Outlier);
670+
int nnormal = anomalies.nEvents() - nout; //this is the total number of normal events, not just of those that were recorded
670671
n_outliers += nout;
671672
m_n_outliers_accum_prd += nout;
672673

@@ -698,7 +699,7 @@ bool Chimbuko::runFrame(unsigned long long& n_func_events,
698699
//Enable update of analysis time window bound on next step
699700
m_execdata_first_event_ts_set = false;
700701

701-
if(do_step_report){ headProgressStream(m_params.rank) << "driver rank " << m_params.rank << " event analysis complete: total=" << nout + nnormal << " normal=" << nnormal << " anomalous=" << nout << std::endl; }
702+
if(do_step_report){ headProgressStream(m_params.rank) << "driver rank " << m_params.rank << " function execution analysis complete: total=" << nout + nnormal << " normal=" << nnormal << " anomalous=" << nout << std::endl; }
702703
}//if(do_run_analysis)
703704

704705
m_perf.add("ad_run_total_step_time_excl_parse_ms", step_timer.elapsed_ms());

src/util/Anomalies.cpp

Lines changed: 40 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,49 @@
11
#include "chimbuko/util/Anomalies.hpp"
2+
#include "chimbuko/util/error.hpp"
23

34
using namespace chimbuko;
45

5-
void Anomalies::insert(CallListIterator_t event, EventType type){
6-
switch(type){
7-
case EventType::Outlier:
8-
m_all_outliers.push_back(event);
9-
m_func_outliers[event->get_fid()].push_back(event);
10-
break;
11-
case EventType::Normal:
6+
void Anomalies::recordAnomaly(CallListIterator_t event){
7+
if(event->get_label() != -1){ fatal_error("Event is not an anomaly!"); }
8+
9+
m_all_outliers.push_back(event);
10+
++m_n_events_total; //increment counter of all events
11+
}
12+
13+
bool Anomalies::recordNormalEventConditional(CallListIterator_t event){
14+
if(event->get_label() != 1){ fatal_error("Event is not a normal event!"); }
15+
++m_n_events_total; //increment counter of all events
16+
17+
auto eit = m_func_normal_exec_idx.find(event->get_fid());
18+
if(eit == m_func_normal_exec_idx.end()){ //Add the event to the array and mark its location
1219
m_all_normal_execs.push_back(event);
13-
m_func_normal_execs[event->get_fid()].push_back(event);
14-
break;
15-
default:
16-
throw std::runtime_error("Invalid type");
20+
m_func_normal_exec_idx[event->get_fid()] = m_all_normal_execs.size()-1;
21+
return true;
22+
}else if(event->get_outlier_score() < m_all_normal_execs[eit->second]->get_outlier_score()){ //Replace the existing normal execution if the current has a lower score
23+
m_all_normal_execs[eit->second] = event;
24+
return true;
1725
}
26+
return false;
1827
}
1928

20-
const std::vector<CallListIterator_t> & Anomalies::funcEvents(const unsigned long func_id, EventType type) const{
21-
static std::vector<CallListIterator_t> empty;
22-
auto const &mp = type == EventType::Outlier ? m_func_outliers : m_func_normal_execs;
23-
auto it = mp.find(func_id);
24-
if(it != mp.end()) return it->second;
25-
else return empty;
29+
size_t Anomalies::nFuncEventsRecorded(unsigned long fid, EventType type) const{
30+
size_t out = 0;
31+
if(type == EventType::Outlier){
32+
for(auto it : m_all_outliers)
33+
if(it->get_fid() == fid) ++out;
34+
}else if(m_func_normal_exec_idx.count(fid)){
35+
out = 1;
36+
}
37+
return out;
38+
}
39+
40+
std::vector<CallListIterator_t> Anomalies::funcEventsRecorded(unsigned long fid, EventType type) const{
41+
std::vector<CallListIterator_t> out;
42+
if(type == EventType::Outlier){
43+
for(auto it : m_all_outliers)
44+
if(it->get_fid() == fid) out.push_back(it);
45+
}else if(m_func_normal_exec_idx.count(fid)){
46+
out.push_back( m_all_normal_execs[m_func_normal_exec_idx.find(fid)->second] );
47+
}
48+
return out;
2649
}

test/test_stat_sender.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,9 +110,11 @@ TEST(PSstatSenderTest, StatSenderGlobalAnomalyStatsBounce)
110110
CallList_t call_list;
111111
auto it1 = call_list.insert(call_list.end(), createFuncExecData_t(pid,rid,tid,func_id,func_name, 100, 200) );
112112
auto it2 = call_list.insert(call_list.end(), createFuncExecData_t(pid,rid,tid,func_id,func_name, 300, 400) );
113+
it1->set_label(-1);
114+
it2->set_label(-1);
113115
Anomalies anom;
114-
anom.insert(it1, Anomalies::EventType::Outlier);
115-
anom.insert(it2, Anomalies::EventType::Outlier);
116+
anom.recordAnomaly(it1);
117+
anom.recordAnomaly(it2);
116118

117119
ExecDataMap_t dmap;
118120
dmap[func_id].push_back(it1);

0 commit comments

Comments
 (0)