dmonitoringmodeld: clean up data structures (#36624)

ZwX1616 · web-flow · commit b778da1d7c4e · 2025-11-14T14:29:04.000-08:00
* update onnx

* get meta

* start

* cast

* deprecate notready

* more

* line too long

* 2
diff --git a/cereal/log.capnp b/cereal/log.capnp
@@ -2166,7 +2166,8 @@ struct DriverStateV2 {
     leftBlinkProb @7 :Float32;
     rightBlinkProb @8 :Float32;
     sunglassesProb @9 :Float32;
-    notReadyProb @12 :List(Float32);
+    phoneProb @13 :Float32;
+    notReadyProbDEPRECATED @12 :List(Float32);
     occludedProbDEPRECATED @10 :Float32;
     readyProbDEPRECATED @11 :List(Float32);
   }
diff --git a/selfdrive/modeld/SConscript b/selfdrive/modeld/SConscript
@@ -32,7 +32,7 @@ lenvCython.Program('models/commonmodel_pyx.so', 'models/commonmodel_pyx.pyx', LI
 tinygrad_files = ["#"+x for x in glob.glob(env.Dir("#tinygrad_repo").relpath + "/**", recursive=True, root_dir=env.Dir("#").abspath) if 'pycache' not in x]
 
 # Get model metadata
-for model_name in ['driving_vision', 'driving_policy']:
+for model_name in ['driving_vision', 'driving_policy', 'dmonitoring_model']:
   fn = File(f"models/{model_name}").abspath
   script_files = [File(Dir("#selfdrive/modeld").File("get_model_metadata.py").abspath)]
   cmd = f'python3 {Dir("#selfdrive/modeld").abspath}/get_model_metadata.py {fn}.onnx'
diff --git a/selfdrive/modeld/dmonitoringmodeld.py b/selfdrive/modeld/dmonitoringmodeld.py
@@ -7,7 +7,6 @@
 import math
 import time
 import pickle
-import ctypes
 import numpy as np
 from pathlib import Path
 
@@ -16,59 +15,31 @@
 from msgq.visionipc import VisionIpcClient, VisionStreamType, VisionBuf
 from openpilot.common.swaglog import cloudlog
 from openpilot.common.realtime import config_realtime_process
-from openpilot.common.transformations.model import dmonitoringmodel_intrinsics, DM_INPUT_SIZE
+from openpilot.common.transformations.model import dmonitoringmodel_intrinsics
 from openpilot.common.transformations.camera import _ar_ox_fisheye, _os_fisheye
 from openpilot.selfdrive.modeld.models.commonmodel_pyx import CLContext, MonitoringModelFrame
 from openpilot.selfdrive.modeld.parse_model_outputs import sigmoid
 from openpilot.selfdrive.modeld.runners.tinygrad_helpers import qcom_tensor_from_opencl_address
 
-MODEL_WIDTH, MODEL_HEIGHT = DM_INPUT_SIZE
-CALIB_LEN = 3
-FEATURE_LEN = 512
-OUTPUT_SIZE = 83 + FEATURE_LEN
-
 PROCESS_NAME = "selfdrive.modeld.dmonitoringmodeld"
 SEND_RAW_PRED = os.getenv('SEND_RAW_PRED')
 MODEL_PKL_PATH = Path(__file__).parent / 'models/dmonitoring_model_tinygrad.pkl'
-
-# TODO: slice from meta
-class DriverStateResult(ctypes.Structure):
-  _fields_ = [
-    ("face_orientation", ctypes.c_float*3),
-    ("face_position", ctypes.c_float*3),
-    ("face_orientation_std", ctypes.c_float*3),
-    ("face_position_std", ctypes.c_float*3),
-    ("face_prob", ctypes.c_float),
-    ("_unused_a", ctypes.c_float*8),
-    ("left_eye_prob", ctypes.c_float),
-    ("_unused_b", ctypes.c_float*8),
-    ("right_eye_prob", ctypes.c_float),
-    ("left_blink_prob", ctypes.c_float),
-    ("right_blink_prob", ctypes.c_float),
-    ("sunglasses_prob", ctypes.c_float),
-    ("_unused_c", ctypes.c_float),
-    ("_unused_d", ctypes.c_float*4),
-    ("not_ready_prob", ctypes.c_float*2)]
-
-
-class DMonitoringModelResult(ctypes.Structure):
-  _fields_ = [
-    ("driver_state_lhd", DriverStateResult),
-    ("driver_state_rhd", DriverStateResult),
-    ("wheel_on_right_prob", ctypes.c_float),
-    ("features", ctypes.c_float*FEATURE_LEN)]
+METADATA_PATH = Path(__file__).parent / 'models/dmonitoring_model_metadata.pkl'
 
 
 class ModelState:
   inputs: dict[str, np.ndarray]
   output: np.ndarray
 
   def __init__(self, cl_ctx):
-    assert ctypes.sizeof(DMonitoringModelResult) == OUTPUT_SIZE * ctypes.sizeof(ctypes.c_float)
+    with open(METADATA_PATH, 'rb') as f:
+      model_metadata = pickle.load(f)
+      self.input_shapes =  model_metadata['input_shapes']
+      self.output_slices = model_metadata['output_slices']
 
     self.frame = MonitoringModelFrame(cl_ctx)
     self.numpy_inputs = {
-      'calib': np.zeros((1, CALIB_LEN), dtype=np.float32),
+      'calib': np.zeros(self.input_shapes['calib'], dtype=np.float32),
     }
 
     self.tensor_inputs = {k: Tensor(v, device='NPY').realize() for k,v in self.numpy_inputs.items()}
@@ -84,9 +55,9 @@ def run(self, buf: VisionBuf, calib: np.ndarray, transform: np.ndarray) -> tuple
     if TICI:
       # The imgs tensors are backed by opencl memory, only need init once
       if 'input_img' not in self.tensor_inputs:
-        self.tensor_inputs['input_img'] = qcom_tensor_from_opencl_address(input_img_cl.mem_address, (1, MODEL_WIDTH*MODEL_HEIGHT), dtype=dtypes.uint8)
+        self.tensor_inputs['input_img'] = qcom_tensor_from_opencl_address(input_img_cl.mem_address, self.input_shapes['input_img'], dtype=dtypes.uint8)
     else:
-      self.tensor_inputs['input_img'] = Tensor(self.frame.buffer_from_cl(input_img_cl).reshape((1, MODEL_WIDTH*MODEL_HEIGHT)), dtype=dtypes.uint8).realize()
+      self.tensor_inputs['input_img'] = Tensor(self.frame.buffer_from_cl(input_img_cl).reshape(self.input_shapes['input_img']), dtype=dtypes.uint8).realize()
 
 
     output = self.model_run(**self.tensor_inputs).contiguous().realize().uop.base.buffer.numpy()
@@ -95,31 +66,31 @@ def run(self, buf: VisionBuf, calib: np.ndarray, transform: np.ndarray) -> tuple
     return output, t2 - t1
 
 
-def fill_driver_state(msg, ds_result: DriverStateResult):
-  msg.faceOrientation = list(ds_result.face_orientation)
-  msg.faceOrientationStd = [math.exp(x) for x in ds_result.face_orientation_std]
-  msg.facePosition = list(ds_result.face_position[:2])
-  msg.facePositionStd = [math.exp(x) for x in ds_result.face_position_std[:2]]
-  msg.faceProb = float(sigmoid(ds_result.face_prob))
-  msg.leftEyeProb = float(sigmoid(ds_result.left_eye_prob))
-  msg.rightEyeProb = float(sigmoid(ds_result.right_eye_prob))
-  msg.leftBlinkProb = float(sigmoid(ds_result.left_blink_prob))
-  msg.rightBlinkProb = float(sigmoid(ds_result.right_blink_prob))
-  msg.sunglassesProb = float(sigmoid(ds_result.sunglasses_prob))
-  msg.notReadyProb = [float(sigmoid(x)) for x in ds_result.not_ready_prob]
-
-
-def get_driverstate_packet(model_output: np.ndarray, frame_id: int, location_ts: int, execution_time: float, gpu_execution_time: float):
-  model_result = ctypes.cast(model_output.ctypes.data, ctypes.POINTER(DMonitoringModelResult)).contents
+def fill_driver_state(msg, model_output, output_slices, ds_suffix):
+  face_descs = model_output[output_slices[f'face_descs_{ds_suffix}']]
+  face_descs_std = face_descs[-6:]
+  msg.faceOrientation = [float(x) for x in face_descs[:3]]
+  msg.faceOrientationStd = [math.exp(x) for x in face_descs_std[:3]]
+  msg.facePosition = [float(x) for x in face_descs[3:5]]
+  msg.facePositionStd = [math.exp(x) for x in face_descs_std[3:5]]
+  msg.faceProb = float(sigmoid(model_output[output_slices[f'face_prob_{ds_suffix}']][0]))
+  msg.leftEyeProb = float(sigmoid(model_output[output_slices[f'left_eye_prob_{ds_suffix}']][0]))
+  msg.rightEyeProb = float(sigmoid(model_output[output_slices[f'right_eye_prob_{ds_suffix}']][0]))
+  msg.leftBlinkProb = float(sigmoid(model_output[output_slices[f'left_blink_prob_{ds_suffix}']][0]))
+  msg.rightBlinkProb = float(sigmoid(model_output[output_slices[f'right_blink_prob_{ds_suffix}']][0]))
+  msg.sunglassesProb = float(sigmoid(model_output[output_slices[f'sunglasses_prob_{ds_suffix}']][0]))
+  msg.phoneProb = float(sigmoid(model_output[output_slices[f'using_phone_prob_{ds_suffix}']][0]))
+
+def get_driverstate_packet(model_output: np.ndarray, output_slices: dict[str, slice], frame_id: int, location_ts: int, exec_time: float, gpu_exec_time: float):
   msg = messaging.new_message('driverStateV2', valid=True)
   ds = msg.driverStateV2
   ds.frameId = frame_id
-  ds.modelExecutionTime = execution_time
-  ds.gpuExecutionTime = gpu_execution_time
-  ds.wheelOnRightProb = float(sigmoid(model_result.wheel_on_right_prob))
+  ds.modelExecutionTime = exec_time
+  ds.gpuExecutionTime = gpu_exec_time
+  ds.wheelOnRightProb = float(sigmoid(model_output[output_slices['wheel_on_right']][0]))
   ds.rawPredictions = model_output.tobytes() if SEND_RAW_PRED else b''
-  fill_driver_state(ds.leftDriverData, model_result.driver_state_lhd)
-  fill_driver_state(ds.rightDriverData, model_result.driver_state_rhd)
+  fill_driver_state(ds.leftDriverData, model_output, output_slices, 'lhd')
+  fill_driver_state(ds.rightDriverData, model_output, output_slices, 'rhd')
   return msg
 
 
@@ -140,7 +111,7 @@ def main():
   sm = SubMaster(["liveCalibration"])
   pm = PubMaster(["driverStateV2"])
 
-  calib = np.zeros(CALIB_LEN, dtype=np.float32)
+  calib = np.zeros(model.numpy_inputs['calib'].size, dtype=np.float32)
   model_transform = None
 
   while True:
@@ -160,7 +131,8 @@ def main():
     model_output, gpu_execution_time = model.run(buf, calib, model_transform)
     t2 = time.perf_counter()
 
-    pm.send("driverStateV2", get_driverstate_packet(model_output, vipc_client.frame_id, vipc_client.timestamp_sof, t2 - t1, gpu_execution_time))
+    msg = get_driverstate_packet(model_output, model.output_slices, vipc_client.frame_id, vipc_client.timestamp_sof, t2 - t1, gpu_execution_time)
+    pm.send("driverStateV2", msg)
 
 
 if __name__ == "__main__":
diff --git a/selfdrive/modeld/models/dmonitoring_model.onnx b/selfdrive/modeld/models/dmonitoring_model.onnx
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3a53626ab84757813fb16a1441704f2ae7192bef88c331bdc2415be6981d204f
-size 7191776
+oid sha256:3446bf8b22e50e47669a25bf32460ae8baf8547037f346753e19ecbfcf6d4e59
+size 6954368
diff --git a/selfdrive/monitoring/helpers.py b/selfdrive/monitoring/helpers.py
@@ -37,12 +37,12 @@ def __init__(self):
     self._BLINK_THRESHOLD = 0.865
 
     if HARDWARE.get_device_type() == 'mici':
-      self._EE_THRESH11 = 0.75
+      self._PHONE_THRESH = 0.75
     else:
-      self._EE_THRESH11 = 0.4
-    self._EE_THRESH12 = 15.0
-    self._EE_MAX_OFFSET1 = 0.06
-    self._EE_MIN_OFFSET1 = 0.025
+      self._PHONE_THRESH = 0.4
+    self._PHONE_THRESH2 = 15.0
+    self._PHONE_MAX_OFFSET = 0.06
+    self._PHONE_MIN_OFFSET = 0.025
 
     self._POSE_PITCH_THRESHOLD = 0.3133
     self._POSE_PITCH_THRESHOLD_SLACK = 0.3237
@@ -84,7 +84,7 @@ class DistractedType:
   NOT_DISTRACTED = 0
   DISTRACTED_POSE = 1 << 0
   DISTRACTED_BLINK = 1 << 1
-  DISTRACTED_E2E = 1 << 2
+  DISTRACTED_PHONE = 1 << 2
 
 class DriverPose:
   def __init__(self, max_trackable):
@@ -142,9 +142,9 @@ def __init__(self, rhd_saved=False, settings=None, always_on=False):
     self.wheelpos_learner = RunningStatFilter()
     self.pose = DriverPose(self.settings._POSE_OFFSET_MAX_COUNT)
     self.blink = DriverBlink()
-    self.eev1 = 0.
-    self.ee1_offseter = RunningStatFilter(max_trackable=self.settings._POSE_OFFSET_MAX_COUNT)
-    self.ee1_calibrated = False
+    self.phone_prob = 0.
+    self.phone_offseter = RunningStatFilter(max_trackable=self.settings._POSE_OFFSET_MAX_COUNT)
+    self.phone_calibrated = False
 
     self.always_on = always_on
     self.distracted_types = []
@@ -242,13 +242,13 @@ def _get_distracted_types(self):
     if (self.blink.left + self.blink.right)*0.5 > self.settings._BLINK_THRESHOLD:
       distracted_types.append(DistractedType.DISTRACTED_BLINK)
 
-    if self.ee1_calibrated:
-      ee1_dist = self.eev1 > max(min(self.ee1_offseter.filtered_stat.M, self.settings._EE_MAX_OFFSET1), self.settings._EE_MIN_OFFSET1) \
-                              * self.settings._EE_THRESH12
+    if self.phone_calibrated:
+      using_phone = self.phone_prob > max(min(self.phone_offseter.filtered_stat.M, self.settings._PHONE_MAX_OFFSET), self.settings._PHONE_MIN_OFFSET) \
+                              * self.settings._PHONE_THRESH2
     else:
-      ee1_dist = self.eev1 > self.settings._EE_THRESH11
-    if ee1_dist:
-      distracted_types.append(DistractedType.DISTRACTED_E2E)
+      using_phone = self.phone_prob > self.settings._PHONE_THRESH
+    if using_phone:
+      distracted_types.append(DistractedType.DISTRACTED_PHONE)
 
     return distracted_types
 
@@ -267,8 +267,7 @@ def _update_states(self, driver_state, cal_rpy, car_speed, op_engaged, standstil
       self.wheel_on_right = self.wheel_on_right_last
     driver_data = driver_state.rightDriverData if self.wheel_on_right else driver_state.leftDriverData
     if not all(len(x) > 0 for x in (driver_data.faceOrientation, driver_data.facePosition,
-                                    driver_data.faceOrientationStd, driver_data.facePositionStd,
-                                    driver_data.notReadyProb)):
+                                    driver_data.faceOrientationStd, driver_data.facePositionStd)):
       return
 
     self.face_detected = driver_data.faceProb > self.settings._FACE_THRESHOLD
@@ -284,10 +283,10 @@ def _update_states(self, driver_state, cal_rpy, car_speed, op_engaged, standstil
                                                                   * (driver_data.sunglassesProb < self.settings._SG_THRESHOLD)
     self.blink.right = driver_data.rightBlinkProb * (driver_data.rightEyeProb > self.settings._EYE_THRESHOLD) \
                                                                   * (driver_data.sunglassesProb < self.settings._SG_THRESHOLD)
-    self.eev1 = driver_data.notReadyProb[0]
+    self.phone_prob = driver_data.phoneProb
 
     self.distracted_types = self._get_distracted_types()
-    self.driver_distracted = (DistractedType.DISTRACTED_E2E in self.distracted_types or DistractedType.DISTRACTED_POSE in self.distracted_types
+    self.driver_distracted = (DistractedType.DISTRACTED_PHONE in self.distracted_types or DistractedType.DISTRACTED_POSE in self.distracted_types
                                 or DistractedType.DISTRACTED_BLINK in self.distracted_types) \
                               and driver_data.faceProb > self.settings._FACE_THRESHOLD and self.pose.low_std
     self.driver_distraction_filter.update(self.driver_distracted)
@@ -297,11 +296,11 @@ def _update_states(self, driver_state, cal_rpy, car_speed, op_engaged, standstil
     if self.face_detected and car_speed > self.settings._POSE_CALIB_MIN_SPEED and self.pose.low_std and (not op_engaged or not self.driver_distracted):
       self.pose.pitch_offseter.push_and_update(self.pose.pitch)
       self.pose.yaw_offseter.push_and_update(self.pose.yaw)
-      self.ee1_offseter.push_and_update(self.eev1)
+      self.phone_offseter.push_and_update(self.phone_prob)
 
     self.pose.calibrated = self.pose.pitch_offseter.filtered_stat.n > self.settings._POSE_OFFSET_MIN_COUNT and \
                                        self.pose.yaw_offseter.filtered_stat.n > self.settings._POSE_OFFSET_MIN_COUNT
-    self.ee1_calibrated = self.ee1_offseter.filtered_stat.n > self.settings._POSE_OFFSET_MIN_COUNT
+    self.phone_calibrated = self.phone_offseter.filtered_stat.n > self.settings._POSE_OFFSET_MIN_COUNT
 
     if self.face_detected and not self.driver_distracted:
       if model_std_max > self.settings._DCAM_UNCERTAIN_ALERT_THRESHOLD:
diff --git a/selfdrive/monitoring/test_monitoring.py b/selfdrive/monitoring/test_monitoring.py
@@ -25,7 +25,7 @@ def make_msg(face_detected, distracted=False, model_uncertain=False):
   ds.leftDriverData.faceOrientationStd = [1.*model_uncertain, 1.*model_uncertain, 1.*model_uncertain]
   ds.leftDriverData.facePositionStd = [1.*model_uncertain, 1.*model_uncertain]
   # TODO: test both separately when e2e is used
-  ds.leftDriverData.notReadyProb = [0., 0.]
+  ds.leftDriverData.phoneProb = 0.
   return ds
 
 
diff --git a/selfdrive/test/process_replay/model_replay.py b/selfdrive/test/process_replay/model_replay.py
@@ -77,7 +77,7 @@ def generate_report(proposed, master, tmp, commit):
                      (lambda x: get_idx_if_non_empty(x.leftDriverData.faceProb), "leftDriverData.faceProb"),
                      (lambda x: get_idx_if_non_empty(x.leftDriverData.faceOrientation, 0), "leftDriverData.faceOrientation0"),
                      (lambda x: get_idx_if_non_empty(x.leftDriverData.leftBlinkProb), "leftDriverData.leftBlinkProb"),
-                     (lambda x: get_idx_if_non_empty(x.leftDriverData.notReadyProb, 0), "leftDriverData.notReadyProb0"),
+                     (lambda x: get_idx_if_non_empty(x.leftDriverData.phoneProb), "leftDriverData.phoneProb"),
                      (lambda x: get_idx_if_non_empty(x.rightDriverData.faceProb), "rightDriverData.faceProb"),
                     ], "driverStateV2")