Skip to content

Commit 44ad17b

Browse files
authored
[Smartswitch] Add module specific pcie attach/detach functions for smartswitch platforms (sonic-net#557)
* Platform changes to handle PCIE detach/attach * Use file lock instead of thread lock * Added sensord changes
1 parent d5c04f5 commit 44ad17b

File tree

2 files changed

+343
-1
lines changed

2 files changed

+343
-1
lines changed

sonic_platform_base/module_base.py

Lines changed: 163 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,18 @@
66
"""
77

88
import sys
9+
import os
10+
import fcntl
911
from . import device_base
12+
import json
13+
import threading
14+
import contextlib
15+
import shutil
1016

17+
# PCI state database constants
18+
PCIE_DETACH_INFO_TABLE = "PCIE_DETACH_INFO"
19+
PCIE_OPERATION_DETACHING = "detaching"
20+
PCIE_OPERATION_ATTACHING = "attaching"
1121

1222
class ModuleBase(device_base.DeviceBase):
1323
"""
@@ -16,6 +26,7 @@ class ModuleBase(device_base.DeviceBase):
1626
"""
1727
# Device type definition. Note, this is a constant.
1828
DEVICE_TYPE = "module"
29+
PCI_OPERATION_LOCK_FILE_PATH = "/var/lock/{}_pci.lock"
1930

2031
# Possible card types for modular chassis
2132
MODULE_TYPE_SUPERVISOR = "SUPERVISOR"
@@ -73,6 +84,8 @@ def __init__(self):
7384
self._thermal_list = []
7485
self._voltage_sensor_list = []
7586
self._current_sensor_list = []
87+
self.state_db_connector = None
88+
self.pci_bus_info = None
7689

7790
# List of SfpBase-derived objects representing all sfps
7891
# available on the module
@@ -81,6 +94,17 @@ def __init__(self):
8194
# List of ASIC-derived objects representing all ASICs
8295
# visible in PCI domain on the module
8396
self._asic_list = []
97+
98+
@contextlib.contextmanager
def _pci_operation_lock(self):
    """
    Context manager holding an exclusive flock() on this module's PCI lock file.

    The lock file path is PCI_OPERATION_LOCK_FILE_PATH formatted with the
    name returned by get_name(). The lock is taken before the ``with`` body
    runs and is released when the body exits, normally or via an exception.
    """
    pci_lock_path = self.PCI_OPERATION_LOCK_FILE_PATH.format(self.get_name())
    with open(pci_lock_path, 'w') as lock_handle:
        try:
            fcntl.flock(lock_handle.fileno(), fcntl.LOCK_EX)
            yield
        finally:
            # Unlock explicitly; the enclosing `with` then closes the file
            fcntl.flock(lock_handle.fileno(), fcntl.LOCK_UN)
84108

85109
def get_base_mac(self):
86110
"""
@@ -271,10 +295,70 @@ def get_pci_bus_info(self):
271295
Retrieves the bus information.
272296
273297
Returns:
274-
Returns the PCI bus information in BDF format like "[DDDD:]BB:SS:F"
298+
Returns the PCI bus information as a list of strings in BDF format like "[DDDD:]BB:SS:F"
275299
"""
276300
raise NotImplementedError
277301

302+
def handle_pci_removal(self):
    """
    Marks the module's PCI devices as detaching in the state database and detaches them.

    While holding the PCI operation lock, every bus address returned by
    get_pci_bus_info() is recorded through pci_entry_state_db() with the
    "detaching" operation, and then pci_detach() is called.

    Returns:
        The value returned by pci_detach(), or False if any step raised
        an exception (the error is written to stderr)
    """
    try:
        pci_addresses = self.get_pci_bus_info()
        with self._pci_operation_lock():
            # Record the state first so the entries exist before the devices go away
            for pci_address in pci_addresses:
                self.pci_entry_state_db(pci_address, PCIE_OPERATION_DETACHING)
            detach_result = self.pci_detach()
        return detach_result
    except Exception as err:
        sys.stderr.write("Failed to handle PCI removal: {}\n".format(str(err)))
        return False
318+
319+
def pci_entry_state_db(self, pcie_string, operation):
    """
    Records or clears a PCI device's detach state in the state database.

    The entry key is PCIE_DETACH_INFO_TABLE + "|" + pcie_string. For the
    "attaching" operation the key is deleted; for any other operation the
    "bus_info" and "dpu_state" fields are written under that key.

    This is a best-effort operation: any failure (including a failure to
    import swsscommon or to connect to the database) is written to stderr
    and is NOT raised to the caller.

    Args:
        pcie_string (str): The PCI bus string to be written to state database
        operation (str): The operation being performed ("detaching" or "attaching")

    Returns:
        None
    """
    try:
        PCIE_DETACH_INFO_TABLE_KEY = PCIE_DETACH_INFO_TABLE+"|"+pcie_string
        if not self.state_db_connector:
            # Import swsscommon only when a connector actually has to be
            # created, so an already-provided connector works without it
            import swsscommon
            self.state_db_connector = swsscommon.swsscommon.DBConnector("STATE_DB", 0)
        if operation == PCIE_OPERATION_ATTACHING:
            # Device is back: drop the detach record entirely
            self.state_db_connector.delete(PCIE_DETACH_INFO_TABLE_KEY)
            return
        self.state_db_connector.hset(PCIE_DETACH_INFO_TABLE_KEY, "bus_info", pcie_string)
        self.state_db_connector.hset(PCIE_DETACH_INFO_TABLE_KEY, "dpu_state", operation)
    except Exception as e:
        sys.stderr.write("Failed to write pcie bus info to state database: {}\n".format(str(e)))
343+
344+
def handle_pci_rescan(self):
    """
    Reattaches the module's PCI devices and clears their state database entries.

    While holding the PCI operation lock, pci_reattach() is called first and
    then every bus address returned by get_pci_bus_info() is passed to
    pci_entry_state_db() with the "attaching" operation.

    Returns:
        The value returned by pci_reattach(), or False if any step raised
        an exception (the error is written to stderr)
    """
    try:
        pci_addresses = self.get_pci_bus_info()
        with self._pci_operation_lock():
            reattach_result = self.pci_reattach()
            # The state entries are updated only after the reattach call has returned
            for pci_address in pci_addresses:
                self.pci_entry_state_db(pci_address, PCIE_OPERATION_ATTACHING)
        return reattach_result
    except Exception as err:
        sys.stderr.write("Failed to handle PCI rescan: {}\n".format(str(err)))
        return False
361+
278362
def pci_detach(self):
279363
"""
280364
Detaches the PCI device.
@@ -687,3 +771,81 @@ def get_all_asics(self):
687771
And '0000:05:00.0' is its PCI address.
688772
"""
689773
return self._asic_list
774+
775+
def handle_sensor_removal(self):
    """
    Installs this module's sensor ignore configuration and restarts sensord.

    If an ignore file for the module (named after get_name()) exists in the
    platform folder, it is copied into /etc/sensors.d and sensord is
    restarted. If no such file exists, nothing is done.

    Returns:
        bool: True if the file was absent or was copied, False if an
        exception was raised (the error is written to stderr)
    """
    try:
        module_name = self.get_name()
        platform_ignore_conf = f"/usr/share/sonic/platform/module_sensors_ignore_conf/ignore_sensors_{module_name}.conf"
        installed_ignore_conf = f"/etc/sensors.d/ignore_sensors_{module_name}.conf"

        # Copy and restart only when the platform ships an ignore file for this module
        if os.path.exists(platform_ignore_conf):
            shutil.copy2(platform_ignore_conf, installed_ignore_conf)
            os.system("service sensord restart")
    except Exception as err:
        sys.stderr.write("Failed to handle sensor removal: {}\n".format(str(err)))
        return False
    return True
801+
802+
def handle_sensor_addition(self):
    """
    Removes this module's sensor ignore configuration and restarts sensord.

    If the module's ignore file (named after get_name()) is present in
    /etc/sensors.d, it is deleted and sensord is restarted. If the file is
    not present, nothing is done.

    Returns:
        bool: True if the file was absent or was removed, False if an
        exception was raised (the error is written to stderr)
    """
    try:
        module_name = self.get_name()
        installed_ignore_conf = f"/etc/sensors.d/ignore_sensors_{module_name}.conf"

        # Remove and restart only when an ignore file is currently installed
        if os.path.exists(installed_ignore_conf):
            os.remove(installed_ignore_conf)
            os.system("service sensord restart")
    except Exception as err:
        sys.stderr.write("Failed to handle sensor addition: {}\n".format(str(err)))
        return False
    return True
828+
829+
def module_pre_shutdown(self):
    """
    Runs the pre-shutdown steps for a module: sensor removal, then PCI removal.

    Both steps are always attempted, in that order, regardless of whether
    the first one succeeded.

    Returns:
        bool: True if all operations were successful, False otherwise
    """
    sensors_ok = self.handle_sensor_removal()
    pci_ok = self.handle_pci_removal()
    if not pci_ok:
        return pci_ok
    return sensors_ok
840+
841+
def module_post_startup(self):
    """
    Runs the post-startup steps for a module: PCI rescan, then sensor addition.

    Both steps are always attempted, in that order, regardless of whether
    the first one succeeded.

    Returns:
        bool: True if all operations were successful, False otherwise
    """
    pci_ok = self.handle_pci_rescan()
    sensors_ok = self.handle_sensor_addition()
    if not pci_ok:
        return pci_ok
    return sensors_ok

tests/module_base_test.py

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,35 @@
11
from sonic_platform_base.module_base import ModuleBase
2+
import pytest
3+
import json
4+
import os
5+
import fcntl
6+
from unittest.mock import patch, MagicMock, call
7+
from io import StringIO
8+
import shutil
9+
10+
class MockFile:
    """Minimal file-like stand-in that records how it was used."""

    def __init__(self, data=None):
        # Flags flipped by the corresponding calls
        self.fileno_called = False
        self.closed = False
        # Last payload passed to write(), and the canned payload for read()
        self.written_data = None
        self.data = data

    def __enter__(self):
        """Return the mock itself as the `with` target."""
        return self

    def __exit__(self, *exc_info):
        """Mark the mock as closed; exceptions are not suppressed."""
        self.closed = True

    def read(self):
        """Return the data supplied at construction."""
        return self.data

    def write(self, data):
        """Remember the most recently written data."""
        self.written_data = data

    def fileno(self):
        """Record the call and return a fixed fake descriptor."""
        self.fileno_called = True
        return 123
32+
233

334
class TestModuleBase:
435

@@ -39,3 +70,152 @@ def test_sensors(self):
3970
assert(module.get_all_current_sensors() == ["s1"])
4071
assert(module.get_current_sensor(0) == "s1")
4172

73+
def test_pci_entry_state_db(self):
    """Detach writes both fields, attach deletes the key, and DB errors are not propagated."""
    bdf = "0000:00:00.0"
    db_key = "PCIE_DETACH_INFO|0000:00:00.0"
    state_db = MagicMock()
    module = ModuleBase()
    module.state_db_connector = state_db

    # Detaching records the bus address and the operation under the per-device key
    module.pci_entry_state_db(bdf, "detaching")
    expected_writes = [
        call(db_key, "bus_info", bdf),
        call(db_key, "dpu_state", "detaching"),
    ]
    state_db.hset.assert_has_calls(expected_writes)

    # Attaching removes the per-device key
    module.pci_entry_state_db(bdf, "attaching")
    state_db.delete.assert_called_with(db_key)

    # A failing connector must not raise out of pci_entry_state_db
    state_db.hset.side_effect = Exception("DB Error")
    module.pci_entry_state_db(bdf, "detaching")
89+
90+
def test_pci_operation_lock(self):
    """
    _pci_operation_lock opens the per-module lock file, takes an exclusive
    flock for the duration of the body, then unlocks and closes the file.
    """
    module = ModuleBase()
    mock_file = MockFile()

    with patch('builtins.open', return_value=mock_file) as mock_file_open, \
         patch('fcntl.flock') as mock_flock, \
         patch.object(module, 'get_name', return_value="DPU0"):

        with module._pci_operation_lock():
            # Inside the body only the exclusive lock has been taken
            mock_flock.assert_called_with(123, fcntl.LOCK_EX)
            assert not mock_file.closed

        # The lock file path is derived from the module name
        mock_file_open.assert_any_call("/var/lock/DPU0_pci.lock", 'w')
        mock_flock.assert_has_calls([
            call(123, fcntl.LOCK_EX),
            call(123, fcntl.LOCK_UN)
        ])
        assert mock_file.fileno_called
        # Leaving the context manager closes the lock file
        assert mock_file.closed
107+
108+
def test_handle_pci_removal(self):
    """handle_pci_removal marks the bus as detaching under the lock and reports failures as False."""
    module = ModuleBase()
    bdf = "0000:00:00.0"

    # Success: bus info available and pci_detach succeeds
    with patch.object(module, 'get_name', return_value="DPU0"), \
         patch.object(module, '_pci_operation_lock') as lock_mock, \
         patch.object(module, 'pci_detach', return_value=True), \
         patch.object(module, 'pci_entry_state_db') as state_db_mock, \
         patch.object(module, 'get_pci_bus_info', return_value=[bdf]):
        outcome = module.handle_pci_removal()
        assert outcome is True
        state_db_mock.assert_called_with(bdf, "detaching")
        lock_mock.assert_called_once()

    # Failure: an exception while fetching bus info is turned into False
    with patch.object(module, 'get_pci_bus_info', side_effect=Exception()):
        outcome = module.handle_pci_removal()
        assert outcome is False
122+
123+
def test_handle_pci_rescan(self):
    """handle_pci_rescan reattaches under the lock, marks the bus as attaching, and reports failures as False."""
    module = ModuleBase()
    bdf = "0000:00:00.0"

    # Success: bus info available and pci_reattach succeeds
    with patch.object(module, 'get_name', return_value="DPU0"), \
         patch.object(module, '_pci_operation_lock') as lock_mock, \
         patch.object(module, 'pci_reattach', return_value=True), \
         patch.object(module, 'pci_entry_state_db') as state_db_mock, \
         patch.object(module, 'get_pci_bus_info', return_value=[bdf]):
        outcome = module.handle_pci_rescan()
        assert outcome is True
        state_db_mock.assert_called_with(bdf, "attaching")
        lock_mock.assert_called_once()

    # Failure: an exception while fetching bus info is turned into False
    with patch.object(module, 'get_pci_bus_info', side_effect=Exception()):
        outcome = module.handle_pci_rescan()
        assert outcome is False
137+
138+
def test_handle_sensor_removal(self):
    """handle_sensor_removal copies the ignore file and restarts sensord only when the source exists."""
    module = ModuleBase()
    platform_conf = "/usr/share/sonic/platform/module_sensors_ignore_conf/ignore_sensors_DPU0.conf"
    installed_conf = "/etc/sensors.d/ignore_sensors_DPU0.conf"

    # Source file present: it is copied and sensord is restarted
    with patch.object(module, 'get_name', return_value="DPU0"), \
         patch('os.path.exists', return_value=True), \
         patch('shutil.copy2') as copy_mock, \
         patch('os.system') as system_mock:
        assert module.handle_sensor_removal() is True
        copy_mock.assert_called_once_with(platform_conf, installed_conf)
        system_mock.assert_called_once_with("service sensord restart")

    # Source file absent: nothing is copied and sensord is left alone
    with patch.object(module, 'get_name', return_value="DPU0"), \
         patch('os.path.exists', return_value=False), \
         patch('shutil.copy2') as copy_mock, \
         patch('os.system') as system_mock:
        assert module.handle_sensor_removal() is True
        copy_mock.assert_not_called()
        system_mock.assert_not_called()

    # Copy failure is reported as False
    with patch.object(module, 'get_name', return_value="DPU0"), \
         patch('os.path.exists', return_value=True), \
         patch('shutil.copy2', side_effect=Exception("Copy failed")):
        assert module.handle_sensor_removal() is False
162+
163+
def test_handle_sensor_addition(self):
    """handle_sensor_addition removes the ignore file and restarts sensord only when the file exists."""
    module = ModuleBase()
    installed_conf = "/etc/sensors.d/ignore_sensors_DPU0.conf"

    # Ignore file present: it is removed and sensord is restarted
    with patch.object(module, 'get_name', return_value="DPU0"), \
         patch('os.path.exists', return_value=True), \
         patch('os.remove') as remove_mock, \
         patch('os.system') as system_mock:
        assert module.handle_sensor_addition() is True
        remove_mock.assert_called_once_with(installed_conf)
        system_mock.assert_called_once_with("service sensord restart")

    # Ignore file absent: nothing is removed and sensord is left alone
    with patch.object(module, 'get_name', return_value="DPU0"), \
         patch('os.path.exists', return_value=False), \
         patch('os.remove') as remove_mock, \
         patch('os.system') as system_mock:
        assert module.handle_sensor_addition() is True
        remove_mock.assert_not_called()
        system_mock.assert_not_called()

    # Remove failure is reported as False
    with patch.object(module, 'get_name', return_value="DPU0"), \
         patch('os.path.exists', return_value=True), \
         patch('os.remove', side_effect=Exception("Remove failed")):
        assert module.handle_sensor_addition() is False
186+
187+
def test_module_pre_shutdown(self):
    """module_pre_shutdown succeeds only when both PCI removal and sensor removal succeed."""
    module = ModuleBase()

    # (handle_pci_removal result, handle_sensor_removal result, expected overall result)
    scenarios = [
        (True, True, True),     # successful case
        (False, True, False),   # PCI removal failure
        (True, False, False),   # sensor removal failure
    ]
    for pci_ok, sensor_ok, expected in scenarios:
        with patch.object(module, 'handle_pci_removal', return_value=pci_ok), \
             patch.object(module, 'handle_sensor_removal', return_value=sensor_ok):
            assert module.module_pre_shutdown() is expected
204+
205+
def test_module_post_startup(self):
    """module_post_startup succeeds only when both PCI rescan and sensor addition succeed."""
    module = ModuleBase()

    # (handle_pci_rescan result, handle_sensor_addition result, expected overall result)
    scenarios = [
        (True, True, True),     # successful case
        (False, True, False),   # PCI rescan failure
        (True, False, False),   # sensor addition failure
    ]
    for pci_ok, sensor_ok, expected in scenarios:
        with patch.object(module, 'handle_pci_rescan', return_value=pci_ok), \
             patch.object(module, 'handle_sensor_addition', return_value=sensor_ok):
            assert module.module_post_startup() is expected

0 commit comments

Comments
 (0)