Skip to content

Commit 1b3f738

Browse files
[Feature][debugcounterorch] Add support for configurable debug drop monitoring feature (#3509)
* Add support for configurable debug drop monitoring feature. Note: This change depends on sonic-net/sonic-swss-common#971. Fixes #3501. HLD: sonic-net/SONiC#1912. What I did: Added logic to read configuration from the DEBUG_DROP_MONITOR table. Added logic to register persistent alerts when the conditions are met. Added logic to toggle the feature off if desired on a per-counter level. Why I did it: To implement the persistent drop counter monitoring feature, which allows users to configure thresholds for drop counters and register alerts when persistent drops are detected. How I verified it: Existing unit tests were run using make check to ensure no functionality was affected. New unit tests have been added to verify the functionality. Manual testing was performed on a SONiC switch to verify that the orchagent correctly reads the configuration, generates alerts when thresholds are met, and can be toggled off/on.
1 parent f13dd09 commit 1b3f738

File tree

8 files changed

+341
-5
lines changed

8 files changed

+341
-5
lines changed

orchagent/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ dist_swss_DATA = \
2929
pfc_restore.lua \
3030
pfc_restore_cisco-8000.lua \
3131
port_rates.lua \
32+
drop_monitor.lua \
3233
watermark_queue.lua \
3334
watermark_pg.lua \
3435
watermark_bufferpool.lua \

orchagent/debugcounterorch.cpp

Lines changed: 145 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,26 @@ DebugCounterOrch::DebugCounterOrch(DBConnector *db, const vector<string>& table_
3737
publishDropCounterCapabilities();
3838

3939
gPortsOrch->attach(this);
40+
41+
// Add drop monitor lua script
42+
string dropMonitorPluginName = "drop_monitor.lua";
43+
string dropMonitorSha;
44+
45+
try
46+
{
47+
string dropMonitorLuaScript = swss::loadLuaScript(dropMonitorPluginName);
48+
dropMonitorSha = swss::loadRedisScript(m_countersDb.get(), dropMonitorLuaScript);
49+
}
50+
catch (const runtime_error &e)
51+
{
52+
SWSS_LOG_ERROR("Drop monitor flex counter group was not set successfully: %s", e.what());
53+
}
54+
55+
setFlexCounterGroupParameter(DEBUG_DROP_MONITOR_FLEX_COUNTER_GROUP,
56+
DEBUG_DROP_MONITOR_FLEX_COUNTER_POLLING_INTERVAL_MS,
57+
STATS_MODE_READ,
58+
PORT_PLUGIN_FIELD,
59+
dropMonitorSha);
4060
}
4161

4262
DebugCounterOrch::~DebugCounterOrch(void)
@@ -193,6 +213,70 @@ void DebugCounterOrch::doTask(Consumer& consumer)
193213
SWSS_LOG_ERROR("Unknown operation type %s\n", op.c_str());
194214
}
195215
}
216+
else if (table_name == "DEBUG_DROP_MONITOR")
217+
{
218+
if (op == SET_COMMAND)
219+
{
220+
if (key == "CONFIG")
221+
{
222+
for (const auto& value : values)
223+
{
224+
string config_name = value.first;
225+
string config_value = value.second;
226+
227+
// Check the status of the drop counter monitor feature
228+
try
229+
{
230+
if (config_name == "status")
231+
{
232+
if (config_value == "enabled")
233+
{
234+
debug_monitor_enabled = true;
235+
string monitored_debug_counter_stat = counterIdsToStr(portDebugMonitorStatIds);
236+
SWSS_LOG_DEBUG("Enabling debug drop monitor: %s", monitored_debug_counter_stat.c_str());
237+
setFlexCounterGroupOperation(DEBUG_DROP_MONITOR_FLEX_COUNTER_GROUP, "enable");
238+
for (auto const &curr : gPortsOrch->getAllPorts())
239+
{
240+
string key = string(DEBUG_DROP_MONITOR_FLEX_COUNTER_GROUP) + ":" + sai_serialize_object_id(curr.second.m_port_id);
241+
startFlexCounterPolling(gSwitchId, key, monitored_debug_counter_stat, PORT_COUNTER_ID_LIST);
242+
}
243+
}
244+
else if (config_value == "disabled")
245+
{
246+
debug_monitor_enabled = false;
247+
SWSS_LOG_DEBUG("Disabling debug drop monitor");
248+
setFlexCounterGroupOperation(DEBUG_DROP_MONITOR_FLEX_COUNTER_GROUP, "disable");
249+
for (auto const &curr : gPortsOrch->getAllPorts())
250+
{
251+
string key = string(DEBUG_DROP_MONITOR_FLEX_COUNTER_GROUP) + ":" + sai_serialize_object_id(curr.second.m_port_id);
252+
stopFlexCounterPolling(gSwitchId, key);
253+
}
254+
}
255+
else
256+
{
257+
SWSS_LOG_ERROR("The status of drop counter monitor was not recognized: %s. Accepted values are enabled/disabled.", config_value.c_str());
258+
task_status = task_process_status::task_failed;
259+
}
260+
}
261+
else
262+
{
263+
SWSS_LOG_ERROR("Config for drop counter monitor was not recognized: %s. Accepted values are status.", config_value.c_str());
264+
task_status = task_process_status::task_failed;
265+
}
266+
}
267+
catch(const std::runtime_error& e)
268+
{
269+
SWSS_LOG_ERROR("Encountered an error when updating DEBUG_DROP_MONITOR. config_name: %s, config_value: %s", config_name.c_str(), config_value.c_str());
270+
task_status = task_process_status::task_failed;
271+
}
272+
}
273+
}
274+
}
275+
else
276+
{
277+
SWSS_LOG_ERROR("Unknown operation type %s\n", op.c_str());
278+
}
279+
}
196280
else
197281
{
198282
SWSS_LOG_ERROR("Received update from unknown table '%s'", table_name.c_str());
@@ -336,7 +420,7 @@ task_process_status DebugCounterOrch::uninstallDebugCounter(const string& counte
336420
string counter_type = counter->getCounterType();
337421
string counter_stat = counter->getDebugCounterSAIStat();
338422

339-
uninstallDebugFlexCounters(counter_type, counter_stat);
423+
uninstallDebugFlexCounters(counter_type, counter_stat, SAI_NULL_OBJECT_ID, counter_name);
340424

341425
if (counter_type == PORT_INGRESS_DROPS || counter_type == PORT_EGRESS_DROPS)
342426
{
@@ -531,6 +615,11 @@ void DebugCounterOrch::installDebugFlexCounters(const string& counter_type,
531615
SWSS_LOG_ENTER();
532616
CounterType flex_counter_type = getFlexCounterType(counter_type);
533617

618+
// Track the new counter_stat in debug drop monitor
619+
portDebugMonitorStatIds.insert(counter_stat);
620+
string monitored_debug_counter_stat = counterIdsToStr(portDebugMonitorStatIds);
621+
SWSS_LOG_DEBUG("Added %s to: %s", counter_stat.c_str(), monitored_debug_counter_stat.c_str());
622+
534623
if (flex_counter_type == CounterType::SWITCH_DEBUG)
535624
{
536625
flex_counter_manager.addFlexCounterStat(gSwitchId, flex_counter_type, counter_stat);
@@ -556,17 +645,34 @@ void DebugCounterOrch::installDebugFlexCounters(const string& counter_type,
556645
curr.second.m_port_id,
557646
flex_counter_type,
558647
counter_stat);
648+
649+
if (debug_monitor_enabled)
650+
{
651+
string key = string(DEBUG_DROP_MONITOR_FLEX_COUNTER_GROUP) + ":" + sai_serialize_object_id(curr.second.m_port_id);
652+
stopFlexCounterPolling(gSwitchId, key);
653+
startFlexCounterPolling(gSwitchId, key, monitored_debug_counter_stat, PORT_COUNTER_ID_LIST);
654+
}
559655
}
560656
}
561657
}
562658

563659
void DebugCounterOrch::uninstallDebugFlexCounters(const string& counter_type,
564660
const string& counter_stat,
565-
sai_object_id_t port_id)
661+
sai_object_id_t port_id,
662+
const string& counter_name)
566663
{
567664
SWSS_LOG_ENTER();
568665
CounterType flex_counter_type = getFlexCounterType(counter_type);
569666

667+
// Remove the counter_stat from being tracked in debug drop monitor
668+
auto counter_stat_iter = portDebugMonitorStatIds.find(counter_stat);
669+
portDebugMonitorStatIds.erase(counter_stat_iter);
670+
string monitored_debug_counter_stat = counterIdsToStr(portDebugMonitorStatIds);
671+
SWSS_LOG_DEBUG("Removed %s from: %s", counter_stat.c_str(), monitored_debug_counter_stat.c_str());
672+
673+
// Make a vector of keys to delete from COUNTERS_DB, these keys are used by drop counter monitor
674+
std::vector<std::string> debug_drop_monitor_stats_fields;
675+
570676
if (flex_counter_type == CounterType::SWITCH_DEBUG)
571677
{
572678
flex_counter_manager.removeFlexCounterStat(gSwitchId, flex_counter_type, counter_stat);
@@ -575,6 +681,10 @@ void DebugCounterOrch::uninstallDebugFlexCounters(const string& counter_type,
575681
{
576682
for (auto const &curr : gPortsOrch->getAllPorts())
577683
{
684+
// Remove debug counter stat from being tracked by drop counter monitor
685+
string key = string(DEBUG_COUNTER_FLEX_COUNTER_GROUP) + ":" + sai_serialize_object_id(curr.second.m_port_id);
686+
stopFlexCounterPolling(gSwitchId, key);
687+
578688
if (port_id != SAI_NULL_OBJECT_ID)
579689
{
580690
if (curr.second.m_port_id != port_id)
@@ -592,8 +702,20 @@ void DebugCounterOrch::uninstallDebugFlexCounters(const string& counter_type,
592702
curr.second.m_port_id,
593703
flex_counter_type,
594704
counter_stat);
705+
706+
debug_drop_monitor_stats_fields.push_back("DEBUG_DROP_MONITOR_STATS|" + counter_name + "|" + curr.first);
707+
708+
if (debug_monitor_enabled)
709+
{
710+
string key = string(DEBUG_DROP_MONITOR_FLEX_COUNTER_GROUP) + ":" + sai_serialize_object_id(curr.second.m_port_id);
711+
stopFlexCounterPolling(gSwitchId, key);
712+
startFlexCounterPolling(gSwitchId, key, monitored_debug_counter_stat, PORT_COUNTER_ID_LIST);
713+
}
595714
}
596715
}
716+
717+
// Delete DEBUG_DROP_MONITOR_STATS for this debug counter
718+
m_countersDb->del(debug_drop_monitor_stats_fields);
597719
}
598720

599721
// Debug Counter Initialization Helper Functions START HERE ----------------------------------------
@@ -657,6 +779,11 @@ void DebugCounterOrch::createDropCounter(const string& counter_name, const strin
657779
}
658780
}
659781

782+
bool DebugCounterOrch::getDebugMonitorStatus()
783+
{
784+
return debug_monitor_enabled;
785+
}
786+
660787
// Debug Counter Configuration Helper Functions START HERE -----------------------------------------
661788

662789
// parseDropReasonUpdate takes a key from CONFIG_DB and returns the 1) the counter name being targeted and
@@ -684,5 +811,21 @@ bool DebugCounterOrch::isDropReasonValid(const string& drop_reason) const
684811
return true;
685812
}
686813

814+
string DebugCounterOrch::counterIdsToStr(const std::unordered_set<string>& ids) const
{
    SWSS_LOG_ENTER();

    // Join the ids into one comma-separated string, in the set's iteration
    // order. The separator is written before every element except the first,
    // so no trailing ',' is produced; an empty set yields an empty string.
    string joined;
    bool is_first = true;

    for (const auto& id : ids)
    {
        if (!is_first)
        {
            joined += ",";
        }
        joined += id;
        is_first = false;
    }

    return joined;
}

orchagent/debugcounterorch.h

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ extern "C" {
1717
}
1818

1919
#define DEBUG_COUNTER_FLEX_COUNTER_GROUP "DEBUG_COUNTER"
20+
#define DEBUG_DROP_MONITOR_FLEX_COUNTER_GROUP "DEBUG_MONITOR_COUNTER"
21+
#define DEBUG_DROP_MONITOR_FLEX_COUNTER_POLLING_INTERVAL_MS "60000"
2022

2123
using DebugCounterMap = std::unordered_map<std::string, std::unique_ptr<DebugCounter>>;
2224

@@ -31,6 +33,8 @@ class DebugCounterOrch: public Orch, public Observer
3133
void doTask(Consumer& consumer);
3234

3335
void update(SubjectType, void *cntx);
36+
37+
bool getDebugMonitorStatus();
3438
private:
3539
// Debug Capability Reporting Functions
3640
void publishDropCounterCapabilities();
@@ -57,7 +61,8 @@ class DebugCounterOrch: public Orch, public Observer
5761
void uninstallDebugFlexCounters(
5862
const std::string& counter_type,
5963
const std::string& counter_stat,
60-
sai_object_id_t port_id = SAI_NULL_OBJECT_ID);
64+
sai_object_id_t port_id = SAI_NULL_OBJECT_ID,
65+
const std::string& counter_name = "");
6166

6267
// Debug Counter Initialization Helper Functions
6368
std::string getDebugCounterType(
@@ -74,6 +79,7 @@ class DebugCounterOrch: public Orch, public Observer
7479
std::string *counter_name,
7580
std::string *drop_reason) const;
7681
bool isDropReasonValid(const std::string& drop_reason) const;
82+
std::string counterIdsToStr(const std::unordered_set<std::string>& ids) const;
7783

7884
// Data Members
7985
std::shared_ptr<swss::DBConnector> m_stateDb = nullptr;
@@ -91,6 +97,11 @@ class DebugCounterOrch: public Orch, public Observer
9197

9298
DebugCounterMap debug_counters;
9399

100+
// portDebugMonitorStatIds stores the debug counter stats that have been
101+
// configured; debug_monitor_enabled records whether drop monitoring is on
102+
bool debug_monitor_enabled = false;
103+
std::unordered_set<std::string> portDebugMonitorStatIds;
104+
94105
// free_drop_counters are drop counters that have been created by a user
95106
// that do not have any drop reasons associated with them yet. Because
96107
// we cannot create a drop counter without any drop reasons, we keep track

orchagent/drop_monitor.lua

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
-- Persistent drop monitor plugin.
--
-- For every debug counter listed in COUNTERS_DEBUG_NAME_PORT_STAT_MAP whose
-- CONFIG_DB entry has drop_monitor_status == 'enabled', this script walks all
-- ports in COUNTERS_PORT_NAME_MAP and:
--   1. computes how much the drop count grew since the previous run,
--   2. records an "incident" timestamp when the growth exceeds
--      drop_count_threshold,
--   3. discards incidents older than `window` (compared against Redis TIME
--      seconds),
--   4. writes an entry to PERSISTENT_DROP_ALERTS and clears the incident list
--      when more than incident_count_threshold incidents remain.
--
-- KEYS - port IDs (not read by this script)
-- ARGV[1] - counters db index
-- ARGV[2] - counters table name (not read by this script)
-- ARGV[3] - poll time interval (milliseconds) (not read by this script)

local counters_db = ARGV[1]
-- NOTE(review): CONFIG_DB index is hard-coded; confirm it matches the deployment
local config_db = 4
local debug_drop_monitor_stat_table = 'DEBUG_DROP_MONITOR_STATS'
local persistent_drop_alert_table = 'PERSISTENT_DROP_ALERTS'

redis.call('SELECT', counters_db)

-- Helper functions
-- A missing hash field comes back as false, which tonumber() maps to nil;
-- treat that (and any non-numeric text) as 0.
local function parse_number(str) return tonumber(str) or 0 end

-- Get the debug counters and port name map.
-- HGETALL replies are flat arrays { field1, value1, field2, value2, ... }, so
-- they hold twice as many entries as HLEN reports. The loops below therefore
-- run up to the array length (#table); bounding them by HLEN would skip the
-- second half of the entries.
local debug_counter_to_port_stat_map = redis.call('HGETALL', "COUNTERS_DEBUG_NAME_PORT_STAT_MAP")
local port_name_map = redis.call('HGETALL', "COUNTERS_PORT_NAME_MAP")

-- Iterate over the debug counters and get their specific configuration
for debug_counter_index = 1, #debug_counter_to_port_stat_map, 2 do
    local debug_counter = debug_counter_to_port_stat_map[debug_counter_index]
    local debug_counter_stat = debug_counter_to_port_stat_map[debug_counter_index + 1]

    -- Get the configuration of the debug counter
    redis.call('SELECT', config_db)
    local debug_counter_table = "DEBUG_COUNTER|" .. debug_counter
    local status = redis.call('HGET', debug_counter_table, 'drop_monitor_status')
    local drop_count_threshold = parse_number(redis.call('HGET', debug_counter_table, 'drop_count_threshold'))
    local incident_count_threshold = parse_number(redis.call('HGET', debug_counter_table, 'incident_count_threshold'))
    local window = parse_number(redis.call('HGET', debug_counter_table, 'window'))
    redis.call('SELECT', counters_db)

    -- Detect persistent drops if status is enabled
    if status == 'enabled' then
        -- Iterate over all ports
        for port_index = 1, #port_name_map, 2 do
            -- Get counter stats
            local port = port_name_map[port_index]
            local port_oid = port_name_map[port_index + 1]
            local counter_stat_map = "COUNTERS:" .. port_oid
            local current_drop_count = parse_number(redis.call('HGET', counter_stat_map, debug_counter_stat))

            -- Per (counter, port) bookkeeping keys
            local stats_key = debug_drop_monitor_stat_table .. '|' .. debug_counter .. '|' .. port
            local incidents_key = stats_key .. '|incidents'

            -- Calculate the delta since previous poll.
            -- On the first run there is no stored value, so the delta equals
            -- the full current count.
            local prev_drop_count = parse_number(redis.call('HGET', stats_key, 'prev_drop_count'))
            local delta_drop_count = current_drop_count - prev_drop_count

            -- Update the previous drop count
            redis.call('HSET', stats_key, 'prev_drop_count', current_drop_count)

            -- Fetch the current timestamp (seconds part of the TIME reply)
            local time = redis.call('TIME')
            local curr_unix_timestamp = tonumber(time[1])

            -- Check if drop count is greater than drop count threshold
            if delta_drop_count > drop_count_threshold then
                redis.call('RPUSH', incidents_key, curr_unix_timestamp)
            end

            -- Classify the recorded incidents as inside or outside the window
            local incident_count = 0
            local number_of_outdated_incidents = 0
            local incident_timestamps = redis.call('LRANGE', incidents_key, 0, -1)
            for incident_index = 1, #incident_timestamps do
                local time_delta = curr_unix_timestamp - tonumber(incident_timestamps[incident_index])
                if (time_delta > window) then
                    number_of_outdated_incidents = number_of_outdated_incidents + 1
                else
                    incident_count = incident_count + 1
                end
            end

            -- Delete incidents that are outside the window. Incidents are
            -- appended with RPUSH, so the oldest ones sit at the head of the
            -- list. LTRIM drops that prefix and, unlike LPOP with a count
            -- argument, does not require Redis >= 6.2.
            if number_of_outdated_incidents > 0 then
                redis.call('LTRIM', incidents_key, number_of_outdated_incidents, -1)
            end

            if incident_count > incident_count_threshold then
                -- Generate alert for persistent drops.
                -- NOTE(review): the alert field is keyed by counter and timestamp only,
                -- so two ports alerting for the same counter within the same second
                -- overwrite each other; confirm whether the port should be part of the key.
                redis.call('HSET', persistent_drop_alert_table, debug_counter .. '|' .. curr_unix_timestamp, 'Persistent packet drops detected on ' .. port)
                -- Delete all incidents since a persistent drop alert was issued
                redis.call('DEL', incidents_key)
            end
        end
    end
end

orchagent/flexcounterorch.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ unordered_map<string, string> flexCounterGroupMap =
6464
{
6565
{"PORT", PORT_STAT_COUNTER_FLEX_COUNTER_GROUP},
6666
{"PORT_RATES", PORT_RATE_COUNTER_FLEX_COUNTER_GROUP},
67+
{"DEBUG_MONITOR_COUNTER", DEBUG_DROP_MONITOR_FLEX_COUNTER_GROUP},
6768
{"PORT_BUFFER_DROP", PORT_BUFFER_DROP_STAT_FLEX_COUNTER_GROUP},
6869
{"QUEUE", QUEUE_STAT_COUNTER_FLEX_COUNTER_GROUP},
6970
{"PFCWD", PFC_WD_FLEX_COUNTER_GROUP},

orchagent/orchdaemon.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -419,7 +419,8 @@ bool OrchDaemon::init()
419419

420420
vector<string> debug_counter_tables = {
421421
CFG_DEBUG_COUNTER_TABLE_NAME,
422-
CFG_DEBUG_COUNTER_DROP_REASON_TABLE_NAME
422+
CFG_DEBUG_COUNTER_DROP_REASON_TABLE_NAME,
423+
CFG_DEBUG_DROP_MONITOR_TABLE_NAME
423424
};
424425

425426
gDebugCounterOrch = new DebugCounterOrch(m_configDb, debug_counter_tables, 1000);

tests/mock_tests/mock_orchagent_main.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include "switchorch.h"
55
#include "crmorch.h"
66
#include "portsorch.h"
7+
#include "debugcounterorch.h"
78
#include "routeorch.h"
89
#include "flowcounterrouteorch.h"
910
#include "intfsorch.h"
@@ -53,6 +54,7 @@ extern sai_object_id_t gUnderlayIfId;
5354
extern SwitchOrch *gSwitchOrch;
5455
extern CrmOrch *gCrmOrch;
5556
extern PortsOrch *gPortsOrch;
57+
extern DebugCounterOrch *gDebugCounterOrch;
5658
extern FgNhgOrch *gFgNhgOrch;
5759
extern RouteOrch *gRouteOrch;
5860
extern FlowCounterRouteOrch *gFlowCounterRouteOrch;

0 commit comments

Comments
 (0)