Skip to content

Commit 8629b68

Browse files
authored
Support ASIC/SDK health event (#3129)
#### What I did Support ASIC/SDK health event 1. config asic-sdk-health-event suppress 2. show asic-sdk-health-event [received|suppress] 3. sonic-clear asic-sdk-health-event Depends on sonic-net/sonic-buildimage#17879 #### How to verify it Unit test
1 parent a01a0a6 commit 8629b68

File tree

12 files changed

+680
-1
lines changed

12 files changed

+680
-1
lines changed

clear/main.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,14 @@
55
import click
66
import utilities_common.cli as clicommon
77
import utilities_common.multi_asic as multi_asic_util
8+
from sonic_py_common import multi_asic
89
from sonic_py_common.general import getstatusoutput_noshell_pipe
910
from flow_counter_util.route import exit_if_route_flow_counter_not_support
1011
from utilities_common import util_base
1112
from show.plugins.pbh import read_pbh_counters
1213
from config.plugins.pbh import serialize_pbh_counters
1314
from . import plugins
1415

15-
1616
# This is from the aliases example:
1717
# https://github.com/pallets/click/blob/57c6f09611fc47ca80db0bd010f05998b3c0aa95/examples/aliases/aliases.py
1818
class Config(object):
@@ -550,6 +550,28 @@ def route(prefix, vrf, namespace):
550550
helper = util_base.UtilHelper()
551551
helper.load_and_register_plugins(plugins, cli)
552552

553+
# ("sonic-clear asic-sdk-health-event")
554+
@cli.command()
@click.option('--namespace', '-n', 'namespace', required=False, default=None, show_default=False,
              help='Option needed for multi-asic only: provide namespace name',
              type=click.Choice(multi_asic_util.multi_asic_ns_choices()))
@clicommon.pass_db
def asic_sdk_health_event(db, namespace):
    """Clear received ASIC/SDK health events"""
    # NOTE: the docstring above is what click prints as the command help, so
    # the implementation notes live in comments instead.
    #
    # db:        object injected by clicommon.pass_db; db.db_clients maps a
    #            namespace name to the connector used here for STATE_DB.
    # namespace: value of -n/--namespace, or None to clear every namespace.
    if multi_asic.get_num_asics() > 1:
        namespace_list = multi_asic.get_namespaces_from_linux()
    else:
        namespace_list = [multi_asic.DEFAULT_NAMESPACE]

    for ns in namespace_list:
        # When a namespace was requested, leave all the others untouched.
        if namespace and namespace != ns:
            continue

        state_db = db.db_clients[ns]
        # Use the connector of this namespace for both the lookup and the
        # delete so the two calls always refer to the same database handle.
        keys = state_db.keys(state_db.STATE_DB, "ASIC_SDK_HEALTH_EVENT_TABLE*")
        # "or []" tolerates a connector that returns None when nothing matches
        # (NOTE(review): confirm whether the connector in use can do that).
        for key in keys or []:
            state_db.delete(state_db.STATE_DB, key)
574+
553575

554576
if __name__ == '__main__':
555577
cli()

config/main.py

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7548,5 +7548,124 @@ def date(date, time):
75487548
clicommon.run_command(['timedatectl', 'set-time', date_time])
75497549

75507550

7551+
#
7552+
# 'asic-sdk-health-event' group ('config asic-sdk-health-event ...')
7553+
#
7554+
@config.group()
def asic_sdk_health_event():
    """Configuring asic-sdk-health-event"""
    # Container group only: sub-commands are attached to it below, and the
    # docstring (shown by click as help text) is the whole body.
7558+
7559+
7560+
@asic_sdk_health_event.group()
def suppress():
    """Suppress ASIC/SDK health event"""
    # Container group only: one sub-command per severity is attached to it,
    # and the docstring (shown by click as help text) is the whole body.
7564+
7565+
7566+
def handle_asic_sdk_health_suppress(db, severity, category_list, max_events, namespace):
    """Update the ASIC/SDK health event suppress configuration of one severity.

    Args:
        db: Object injected by ``clicommon.pass_db``; ``db.cfgdb_clients`` and
            ``db.db_clients`` are indexed by namespace to reach CONFIG_DB and
            the STATE_DB connector respectively.
        severity: ``fatal``, ``warning`` or ``notice``; used as the key of the
            ``SUPPRESS_ASIC_SDK_HEALTH_EVENT`` table.
        category_list: ``none``, ``all`` or a comma separated list of
            categories; a falsy value means the option was not given.
        max_events: Non-negative integer, or ``None`` when the option was not
            given. ``0`` removes a previously configured limit.
        namespace: Namespace to configure, or a falsy value for all of them.

    The command is aborted through ``ctx.fail`` when no option was given, when
    an unknown category is listed, or when STATE_DB does not advertise the
    required capability for a namespace.
    """
    ctx = click.get_current_context()

    # This check does not depend on any namespace, so reject an empty
    # invocation before any database is read.
    if not category_list and max_events is None:
        ctx.fail("At least one argument should be provided!")

    # An ordered tuple (not a set) so that "all" always stores the categories
    # in the same order.
    categories = ("software", "firmware", "cpu_hw", "asic_hw")

    suppressed_categories = []
    if category_list == 'all':
        suppressed_categories = list(categories)
    elif category_list and category_list != 'none':
        # Accept blanks around the commas and drop repeated categories while
        # keeping the order in which they were first given.
        suppressed_categories = list(dict.fromkeys(
            item.strip() for item in category_list.split(',')))

        unsupported_categories = set(suppressed_categories) - set(categories)
        if unsupported_categories:
            ctx.fail("Invalid category(ies): {}".format(unsupported_categories))

    if multi_asic.get_num_asics() > 1:
        namespace_list = multi_asic.get_namespaces_from_linux()
    else:
        namespace_list = [DEFAULT_NAMESPACE]

    # STATE_DB capability field that must be "true" for each severity.
    severity_capabilities = {
        "fatal": "REG_FATAL_ASIC_SDK_HEALTH_CATEGORY",
        "warning": "REG_WARNING_ASIC_SDK_HEALTH_CATEGORY",
        "notice": "REG_NOTICE_ASIC_SDK_HEALTH_CATEGORY"
    }
    capability_entry = "SWITCH_CAPABILITY|switch"
    table_name = "SUPPRESS_ASIC_SDK_HEALTH_EVENT"

    for ns in namespace_list:
        # When a namespace was requested, leave all the others untouched.
        if namespace and namespace != ns:
            continue

        config_db = db.cfgdb_clients[ns]
        state_db = db.db_clients[ns]

        if "true" != state_db.get(state_db.STATE_DB, capability_entry, "ASIC_SDK_HEALTH_EVENT"):
            ctx.fail("ASIC/SDK health event is not supported on the platform")

        if "true" != state_db.get(state_db.STATE_DB, capability_entry, severity_capabilities[severity]):
            ctx.fail("Suppressing ASIC/SDK health {} event is not supported on the platform".format(severity))

        entry = config_db.get_entry(table_name, severity)
        # Set when a field is dropped, so an entry that ends up empty is
        # deleted from CONFIG_DB instead of being left behind.
        need_remove = False

        if category_list:
            if suppressed_categories:
                entry["categories"] = suppressed_categories
            elif entry.get("categories"):
                # "none": nothing is suppressed, so the field is removed.
                entry.pop("categories")
                need_remove = True

        if max_events is not None:
            if max_events > 0:
                entry["max_events"] = max_events
            elif entry.get("max_events"):
                # 0 means unlimited, which is expressed by removing the field.
                entry.pop("max_events")
                need_remove = True

        if entry:
            config_db.set_entry(table_name, severity, entry)
        elif need_remove:
            config_db.set_entry(table_name, severity, None)
7635+
7636+
7637+
@suppress.command()
@click.option('--category-list', metavar='<category_list>', type=str, help="Categories to be suppressed")
@click.option('--max-events', metavar='<max_events>', type=click.IntRange(0), help="Maximum number of received events")
@click.option('--namespace', '-n', 'namespace', required=False, default=None, show_default=False,
              help='Option needed for multi-asic only: provide namespace name',
              type=click.Choice(multi_asic_util.multi_asic_ns_choices()))
@clicommon.pass_db
def fatal(db, category_list, max_events, namespace):
    """Suppress ASIC/SDK health events of fatal severity"""
    # The docstring gives the command a help text; the work is shared with
    # the other severities through handle_asic_sdk_health_suppress.
    handle_asic_sdk_health_suppress(db, 'fatal', category_list, max_events, namespace)
7646+
7647+
7648+
@suppress.command()
@click.option('--category-list', metavar='<category_list>', type=str, help="Categories to be suppressed")
@click.option('--max-events', metavar='<max_events>', type=click.IntRange(0), help="Maximum number of received events")
@click.option('--namespace', '-n', 'namespace', required=False, default=None, show_default=False,
              help='Option needed for multi-asic only: provide namespace name',
              type=click.Choice(multi_asic_util.multi_asic_ns_choices()))
@clicommon.pass_db
def warning(db, category_list, max_events, namespace):
    """Suppress ASIC/SDK health events of warning severity"""
    # The docstring gives the command a help text; the work is shared with
    # the other severities through handle_asic_sdk_health_suppress.
    handle_asic_sdk_health_suppress(db, 'warning', category_list, max_events, namespace)
7657+
7658+
7659+
@suppress.command()
@click.option('--category-list', metavar='<category_list>', type=str, help="Categories to be suppressed")
@click.option('--max-events', metavar='<max_events>', type=click.IntRange(0), help="Maximum number of received events")
@click.option('--namespace', '-n', 'namespace', required=False, default=None, show_default=False,
              help='Option needed for multi-asic only: provide namespace name',
              type=click.Choice(multi_asic_util.multi_asic_ns_choices()))
@clicommon.pass_db
def notice(db, category_list, max_events, namespace):
    """Suppress ASIC/SDK health events of notice severity"""
    # The docstring gives the command a help text; the work is shared with
    # the other severities through handle_asic_sdk_health_suppress.
    handle_asic_sdk_health_suppress(db, 'notice', category_list, max_events, namespace)
7668+
7669+
75517670
if __name__ == '__main__':
75527671
config()

doc/Command-Reference.md

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,10 @@
2929
* [ARP & NDP](#arp--ndp)
3030
* [ARP show commands](#arp-show-commands)
3131
* [NDP show commands](#ndp-show-commands)
32+
* [ASIC SDK health event](#asic-sdk-health-event)
33+
* [ASIC SDK health event config commands](#asic-sdk-health-event-config-commands)
34+
* [ASIC SDK health event show commands](#asic-sdk-health-event-show-commands)
35+
* [ASIC SDK health event clear commands](#asic-sdk-health-event-clear-commands)
3236
* [BFD](#bfd)
3337
* [BFD show commands](#bfd-show-commands)
3438
* [BGP](#bgp)
@@ -1930,6 +1934,158 @@ This command is used to display: ACL rules, tables and their priority, ACL packe
19301934
19311935
If the `PACKETS COUNT` and `BYTES COUNT` fields have some numeric value it means that it is a SONiC ACL's and those counters are created in SONiC `COUNTERS_DB`.
19321936
1937+
## ASIC SDK health event
1938+
1939+
### ASIC SDK health event config commands
1940+
1941+
**config asic-sdk-health-event suppress**
1942+
1943+
This command configures which categories of ASIC/SDK health events are suppressed for a given severity.
1944+
1945+
- Usage:
1946+
```
1947+
config asic-sdk-health-event suppress <severity> [--category-list <category-list>|<none>|<all>] [--max-events <max-events>]
1948+
```
1949+
1950+
- Parameters:
1951+
- severity: Specify the severity whose ASIC/SDK health events are to be suppressed. It can be one of `fatal`, `warning`, and `notice`.
1952+
- category-list: Specify the categories whose ASIC/SDK health events are to be suppressed. It is a comma-separated list whose elements are each one of `software`, `firmware`, `cpu_hw`, `asic_hw`.
1953+
If the category-list is `none`, no category is suppressed and events of all categories will be notified for `severity`. In this case, it will not be stored in the CONFIG_DB.
1954+
If the category-list is `all`, all the categories are suppressed and no category will be notified for `severity`.
1955+
- max-events: Specify the maximum number of events of the severity to be stored in the STATE_DB.
1956+
There is no limitation if the max-events is 0. In this case, it will not be stored in the CONFIG_DB.
1957+
1958+
- Examples:
1959+
```
1960+
admin@sonic:~$ sudo config asic-sdk-health-event suppress fatal --category-list cpu_hw,software --max-events 10240
1961+
```
1962+
1963+
This command will suppress ASIC/SDK health events whose severity is fatal and category is cpu_hw or software. Maximum number of such events in the STATE_DB is 10240.
1964+
1965+
### ASIC SDK health event show commands
1966+
1967+
**show asic-sdk-health-event received**
1968+
1969+
This command displays the received ASIC/SDK health events.
1970+
1971+
- Usage:
1972+
```
1973+
show asic-sdk-health-event received [-n <asicname>]
1974+
```
1975+
1976+
- Details:
1977+
- show asic-sdk-health-event received: Display the ASIC/SDK health events received on all ASICs
1978+
- show asic-sdk-health-event received -n asic0: Display all the ASIC/SDK health events received on asic0
1979+
1980+
1981+
- Example:
1982+
```
1983+
admin@sonic:~$ show asic-sdk-health-event received
1984+
Time Severity Category Description
1985+
------------------- ----------- --------- -----------------
1986+
2023-10-20 05:07:34 fatal firmware Command timeout
1987+
2023-10-20 03:06:25 fatal software SDK daemon keep alive failed
1988+
2023-10-20 05:07:34 fatal asic_hw Uncorrectable ECC error
1989+
2023-10-20 01:58:43 notice asic_hw Correctable ECC error
1990+
```
1991+
1992+
- Example on a multi ASIC system:
1993+
```
1994+
admin@sonic:~$ show asic-sdk-health-event received
1995+
asic0:
1996+
Time Severity Category Description
1997+
------------------- ----------- --------- -----------------
1998+
2023-10-20 05:07:34 fatal firmware Command timeout
1999+
2023-10-20 03:06:25 fatal software SDK daemon keep alive failed
2000+
asic1:
2001+
Time Severity Category Description
2002+
------------------- ----------- --------- -----------------
2003+
2023-10-20 05:07:34 fatal asic_hw Uncorrectable ECC error
2004+
2023-10-20 01:58:43 notice asic_hw Correctable ECC error
2005+
```
2006+
2007+
Optionally, you can specify the asic name in order to display the ASIC/SDK health events received on that particular ASIC on a multi ASIC system
2008+
2009+
- Example:
2010+
```
2011+
admin@sonic:~$ show asic-sdk-health-event received -n asic1
2012+
asic1:
2013+
Time Severity Category Description
2014+
------------------- ----------- --------- -----------------
2015+
2023-10-20 05:07:34 fatal firmware Command timeout
2016+
```
2017+
2018+
**show asic-sdk-health-event suppress-configuration**
2019+
2020+
This command displays the suppressed category list and maximum number of events of ASIC/SDK health events.
2021+
2022+
- Usage:
2023+
```
2024+
show asic-sdk-health-event suppress-configuration [-n <asicname>]
2025+
```
2026+
2027+
- Details:
2028+
- show asic-sdk-health-event suppress-configuration: Display the ASIC/SDK health event suppress category list and maximum number of events on all ASICs
2029+
- show asic-sdk-health-event suppress-configuration -n asic0: Display all the ASIC/SDK health event suppress category list and maximum number of events on asic0
2030+
2031+
2032+
- Example:
2033+
```
2034+
admin@sonic:~$ show asic-sdk-health-event suppress-configuration
2035+
Severity Suppressed category-list Max events
2036+
---------- -------------------------- ------------
2037+
fatal software unlimited
2038+
notice none 1024
2039+
warning firmware,asic_hw 10240
2040+
```
2041+
2042+
- Example on a multi ASIC system:
2043+
```
2044+
admin@sonic:~$ show asic-sdk-health-event suppress-configuration
2045+
asic0:
2046+
Severity Suppressed category-list Max events
2047+
---------- -------------------------- ------------
2048+
notice none 1024
2049+
warning firmware,asic_hw 10240
2050+
asic1:
2051+
Severity Suppressed category-list Max events
2052+
---------- -------------------------- ------------
2053+
fatal software unlimited
2054+
```
2055+
2056+
Optionally, you can specify the asic name in order to display the ASIC/SDK health event suppress category list on that particular ASIC on a multi ASIC system
2057+
2058+
- Example:
2059+
```
2060+
admin@sonic:~$ show asic-sdk-health-event suppress-configuration -n asic1
2061+
asic1:
2062+
Severity Suppressed category-list Max events
2063+
---------- -------------------------- ------------
2064+
fatal software unlimited
2065+
```
2066+
2067+
### ASIC SDK health event clear commands
2068+
2069+
**sonic-clear asic-sdk-health-event**
2070+
2071+
This command clears all the received ASIC/SDK health events.
2072+
2073+
- Usage:
2074+
```
2075+
sonic-clear asic-sdk-health-event [-n <asicname>]
2076+
```
2077+
2078+
- Details:
2079+
- sonic-clear asic-sdk-health-event: Clear the ASIC/SDK health events received on all ASICs
2080+
- sonic-clear asic-sdk-health-event -n asic0: Clear all the ASIC/SDK health events received on asic0
2081+
2082+
2083+
- Example:
2084+
```
2085+
admin@sonic:~$ sonic-clear asic-sdk-health-event
2086+
```
2087+
2088+
Go Back To [Beginning of the document](#) or [Beginning of this section](#asic-sdk-health-event)
19332089
19342090
## ARP & NDP
19352091

scripts/generate_dump

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1952,6 +1952,8 @@ main() {
19521952
# 1st counter snapshot early. Need 2 snapshots to make sense of counters trend.
19531953
save_counter_snapshot $asic 1
19541954

1955+
save_cmd "show asic-sdk-health-event received" "asic.sdk.health.event" &
1956+
19551957
save_cmd "systemd-analyze blame" "systemd.analyze.blame" &
19561958
save_cmd "systemd-analyze dump" "systemd.analyze.dump" &
19571959
save_cmd "systemd-analyze plot" "systemd.analyze.plot.svg" &

0 commit comments

Comments
 (0)