Skip to content

Commit add1ec5

Browse files
committed
Propagate udev messages into the container
By default, in terms of kobject uevent messages, containers will:
* Without userns: see kernel messages but not udev messages
* With userns: see no messages at all

This creates trouble for libraries such as libusb, which rely on these netlink messages to detect device updates. We fix this by filling the gap for systemd and sending out our own udev messages.
1 parent fa91da8 commit add1ec5

File tree

6 files changed

+209
-1
lines changed

6 files changed

+209
-1
lines changed

Cargo.lock

Lines changed: 28 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,15 @@ tokio = { version = "1", features = ["full"] }
1717
tokio-stream = "0.1"
1818
async-stream = "0.3"
1919
udev = "0.9"
20-
rustix = { version = "1", features = ["fs", "stdio", "process", "thread", "pipe", "mount"] }
20+
rustix = { version = "1", features = ["fs", "stdio", "process", "thread", "pipe", "mount", "net"] }
2121
bitflags = "2"
2222
humantime = "2"
2323
serde = { version = "1", features = ["derive"] }
2424
serde_json = "1"
2525
safe-fork = "0.1.1"
2626
aya = "0.13"
27+
murmur2 = "0.1.0"
28+
zerocopy = { version = "0.8.24", features = ["derive"] }
2729

2830
[build-dependencies]
2931
anyhow = { version = "1", features = ["backtrace"] }

src/hotplug/kobject_uevent.rs

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
use std::io::{IoSlice, Write};
2+
use std::os::fd::OwnedFd;
3+
4+
use anyhow::Result;
5+
use rustix::net::{AddressFamily, SendFlags, SocketType, netlink::SocketAddrNetlink};
6+
use zerocopy::{Immutable, IntoBytes};
7+
8+
use crate::util::namespace::NetNamespace;
9+
10+
// This needs to be compatible with
11+
// https://github.com/systemd/systemd/blob/main/src/libsystemd/sd-device/device-monitor.c.
12+
#[repr(C)]
13+
#[derive(Immutable, IntoBytes)]
14+
struct MonitorNetlinkHeader {
15+
/// "libudev" prefix to distinguish libudev and kernel messages.
16+
prefix: [u8; 8],
17+
/// Magic to protect against daemon <-> library message format mismatch.
18+
/// Used in the kernel from socket filter rules; needs to be stored in network order.
19+
magic: u32,
20+
/// Total length of header structure known to the sender.
21+
header_size: u32,
22+
/// Properties string buffer
23+
properties_off: u32,
24+
properties_len: u32,
25+
/// Hashes of primary device properties strings, to let libudev subscribers
26+
/// use in-kernel socket filters; values need to be stored in network order.
27+
filter_subsystem_hash: u32,
28+
filter_devtype_hash: u32,
29+
filter_tag_bloom_hi: u32,
30+
filter_tag_bloom_lo: u32,
31+
}
32+
33+
/// Udev netlink event sender.
34+
///
35+
/// When a device is added/removed, after processing rules, `systemd-udevd` will send a netlink
36+
/// message to the `kobject_uevent` netlink socket. This is picked up by libudev monitor users.
37+
///
38+
/// This netlink socket is namespaced, so udevd-sent messages are not observed by the container.
39+
/// This sender takes the place of udevd and ensures that libudev users inside the container may
40+
/// see the device add/removal event after it has been processed by container-hotplug.
41+
pub struct UdevSender {
42+
socket: OwnedFd,
43+
seq_num: u64,
44+
ns: NetNamespace,
45+
}
46+
47+
impl UdevSender {
48+
pub fn new(ns: NetNamespace) -> Result<Self> {
49+
let socket = ns.with(|| {
50+
rustix::net::socket(
51+
AddressFamily::NETLINK,
52+
SocketType::DGRAM,
53+
Some(rustix::net::netlink::KOBJECT_UEVENT),
54+
)
55+
})??;
56+
57+
Ok(Self {
58+
socket,
59+
seq_num: 0,
60+
ns,
61+
})
62+
}
63+
64+
pub fn send(&mut self, device: &udev::Device, event: &str) -> Result<()> {
65+
self.seq_num += 1;
66+
67+
let mut properties = Vec::new();
68+
write!(properties, "ACTION={event}\0SEQNUM={}\0", self.seq_num)?;
69+
for property in device.properties() {
70+
// These properties are specially handled.
71+
if property.name() == "ACTION" || property.name() == "SEQNUM" {
72+
continue;
73+
}
74+
properties.extend_from_slice(property.name().as_encoded_bytes());
75+
properties.push(b'=');
76+
properties.extend_from_slice(property.value().as_encoded_bytes());
77+
properties.push(0);
78+
}
79+
let header = MonitorNetlinkHeader {
80+
prefix: *b"libudev\0",
81+
magic: 0xFEEDCAFEu32.to_be(),
82+
header_size: std::mem::size_of::<MonitorNetlinkHeader>() as u32,
83+
properties_off: std::mem::size_of::<MonitorNetlinkHeader>() as u32,
84+
properties_len: properties.len() as u32,
85+
filter_subsystem_hash: device
86+
.subsystem()
87+
.map(|x| murmur2::murmur2ne(x.as_encoded_bytes(), 0).to_be())
88+
.unwrap_or_default(),
89+
filter_devtype_hash: device
90+
.devtype()
91+
.map(|x| murmur2::murmur2ne(x.as_encoded_bytes(), 0).to_be())
92+
.unwrap_or_default(),
93+
// Don't bother computing the value in the same way as systemd,
94+
// just be conservative and always make it match -- this is an optimisation anyway.
95+
filter_tag_bloom_hi: 0xFFFFFFFF,
96+
filter_tag_bloom_lo: 0xFFFFFFFF,
97+
};
98+
99+
// We re-enter the namespace to obtain root UID/GID so the message will be trusted by libudev.
100+
// Otherwise, when userns is used, we're the global root which is mapped to nobody in the
101+
// container. libudev will use SCM credentials to check for the sender and identify if the
102+
// message is to be trusted.
103+
//
104+
// Technically just changing UID/GID is sufficient and network namespace re-entering isn't
105+
// necessary -- but there's no harm in doing so and it makes code simpler.
106+
self.ns.with(|| {
107+
rustix::net::sendmsg_addr(
108+
&self.socket,
109+
&SocketAddrNetlink::new(0, 2),
110+
&[IoSlice::new(header.as_bytes()), IoSlice::new(&properties)],
111+
&mut Default::default(),
112+
SendFlags::empty(),
113+
)
114+
})??;
115+
Ok(())
116+
}
117+
}

src/hotplug/mod.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
mod attached_device;
2+
mod kobject_uevent;
23
pub use attached_device::AttachedDevice;
4+
pub use kobject_uevent::UdevSender;
35

46
use std::collections::HashMap;
57
use std::path::PathBuf;
@@ -20,6 +22,7 @@ pub struct HotPlug {
2022
symlinks: Vec<cli::Symlink>,
2123
monitor: DeviceMonitor,
2224
devices: HashMap<PathBuf, AttachedDevice>,
25+
udev_sender: UdevSender,
2326
}
2427

2528
impl HotPlug {
@@ -31,11 +34,16 @@ impl HotPlug {
3134
let monitor = DeviceMonitor::new(hub_path.clone())?;
3235
let devices = Default::default();
3336

37+
let udev_sender = UdevSender::new(crate::util::namespace::NetNamespace::of_pid(
38+
container.pid(),
39+
)?)?;
40+
3441
Ok(Self {
3542
container,
3643
symlinks,
3744
monitor,
3845
devices,
46+
udev_sender,
3947
})
4048
}
4149

@@ -80,6 +88,8 @@ impl HotPlug {
8088
self.container.symlink(&devnode.path, symlink).await?;
8189
}
8290

91+
self.udev_sender.send(device.udev(), "add")?;
92+
8393
let syspath = device.syspath().to_owned();
8494
let device = AttachedDevice { device, symlinks };
8595
self.devices.insert(syspath, device.clone());
@@ -100,6 +110,8 @@ impl HotPlug {
100110
self.container.rm(symlink).await?;
101111
}
102112

113+
self.udev_sender.send(device.udev(), "remove")?;
114+
103115
Ok(Some(Event::Detach(device)))
104116
}
105117
}

src/runc/container.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,10 @@ impl Container {
125125
Ok(container)
126126
}
127127

128+
pub fn pid(&self) -> Pid {
129+
self.pid
130+
}
131+
128132
/// Remount /dev inside the init namespace.
129133
///
130134
/// When user namespace is used, the /dev created by runc will be mounted inside the user namespace,

src/util/namespace.rs

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,3 +153,48 @@ impl MntNamespace {
153153
})
154154
}
155155
}
156+
157+
pub struct NetNamespace {
158+
net_fd: File,
159+
user_ns: UserNamespace,
160+
}
161+
162+
impl NetNamespace {
163+
/// Open the network namespace of a process.
164+
pub fn of_pid(pid: Pid) -> Result<NetNamespace> {
165+
let net_fd = File::open(format!("/proc/{}/ns/net", pid.as_raw_nonzero()))?;
166+
let user_ns = UserNamespace::of_pid(pid)?;
167+
Ok(NetNamespace { net_fd, user_ns })
168+
}
169+
170+
/// Enter the network namespace.
171+
///
172+
/// This operation is not reversible.
173+
pub fn enter(&self) -> Result<()> {
174+
// Switch this particular thread to the container's network namespace.
175+
rustix::thread::move_into_link_name_space(
176+
self.net_fd.as_fd(),
177+
Some(LinkNameSpaceType::Network),
178+
)?;
179+
180+
// Similar to mount namespace, we also want to behave as container root.
181+
// This is so that SCM credentials are seen properly.
182+
self.user_ns.enter()?;
183+
Ok(())
184+
}
185+
186+
/// Execute inside the network namespace.
187+
pub fn with<T: Send, F: FnOnce() -> T + Send>(&self, f: F) -> Result<T> {
188+
// To avoid messing with the rest of the process, we do everything in a new thread.
189+
// Use scoped thread to avoid 'static bound (we need to access fd).
190+
std::thread::scope(|scope| {
191+
scope
192+
.spawn(|| -> Result<T> {
193+
self.enter()?;
194+
Ok(f())
195+
})
196+
.join()
197+
.map_err(|_| anyhow::anyhow!("work thread panicked"))?
198+
})
199+
}
200+
}

0 commit comments

Comments
 (0)