Skip to content

Commit ec2d0c0

Browse files
committed
posix-timers: Provide a mechanism to allocate a given timer ID
Checkpoint/Restore in Userspace (CRIU) requires reconstructing POSIX timers with the same timer ID on restore. It uses sys_timer_create() and relies on the monotonically increasing timer ID provided by this syscall. It creates and deletes timers until the desired ID is reached. This can loop for a long time when the checkpointed process had a very sparse timer ID range. It has been debated to implement a new syscall to allow the creation of timers with a given timer ID, but that's tedious due to the 32/64bit compat issues of sigevent_t and of dubious value. The restore mechanism of CRIU creates the timers in a state where all threads of the restored process are held on a barrier and cannot issue syscalls. That means the restorer task has exclusive control. This allows addressing this issue with a prctl() so that the restorer thread can do: if (prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_ON)) goto linear_mode; create_timers_with_explicit_ids(); prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_OFF); This is backwards compatible because the prctl() fails on older kernels and CRIU can fall back to the linear timer ID mechanism. CRIU versions which do not know about the prctl() just work as before. Implement the prctl() and modify timer_create() so that it copies the requested timer ID from userspace by utilizing the existing timer_t pointer, which is used to copy out the allocated timer ID on success. If the prctl() is disabled, which it is by default, timer_create() works as before and does not try to read from the userspace pointer. There is no problem when a broken or rogue user space application enables the prctl(). If the user space pointer does not contain a valid ID, then timer_create() fails. If the data is not initialized, but contains a random valid ID, timer_create() will create that random timer ID or fail if the ID is already given out.
As CRIU must use the raw syscall to avoid manipulating the internal state of the restored process, this has no library dependencies and can be adopted by CRIU right away. Recreating two timers with IDs 1000000 and 2000000 takes 1.5 seconds with the create/delete method. With the prctl() it takes 3 microseconds. Signed-off-by: Thomas Gleixner <[email protected]> Reviewed-by: Frederic Weisbecker <[email protected]> Reviewed-by: Cyrill Gorcunov <[email protected]> Tested-by: Cyrill Gorcunov <[email protected]> Link: https://lore.kernel.org/all/87jz8vz0en.ffs@tglx
1 parent 2dc4dbf commit ec2d0c0

File tree

5 files changed

+98
-26
lines changed

5 files changed

+98
-26
lines changed

include/linux/posix-timers.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ bool posixtimer_init_sigqueue(struct sigqueue *q);
114114
void posixtimer_send_sigqueue(struct k_itimer *tmr);
115115
bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq);
116116
void posixtimer_free_timer(struct k_itimer *timer);
117+
long posixtimer_create_prctl(unsigned long ctrl);
117118

118119
/* Init task static initializer */
119120
#define INIT_CPU_TIMERBASE(b) { \
@@ -140,6 +141,7 @@ static inline void posixtimer_rearm_itimer(struct task_struct *p) { }
140141
static inline bool posixtimer_deliver_signal(struct kernel_siginfo *info,
141142
struct sigqueue *timer_sigq) { return false; }
142143
static inline void posixtimer_free_timer(struct k_itimer *timer) { }
144+
static inline long posixtimer_create_prctl(unsigned long ctrl) { return -EINVAL; }
143145
#endif
144146

145147
#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK

include/linux/sched/signal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ struct signal_struct {
136136
#ifdef CONFIG_POSIX_TIMERS
137137

138138
/* POSIX.1b Interval Timers */
139+
unsigned int timer_create_restore_ids:1;
139140
atomic_t next_posix_timer_id;
140141
struct hlist_head posix_timers;
141142
struct hlist_head ignored_posix_timers;

include/uapi/linux/prctl.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -353,4 +353,15 @@ struct prctl_mm_map {
353353
*/
354354
#define PR_LOCK_SHADOW_STACK_STATUS 76
355355

356+
/*
357+
* Controls the mode of timer_create() for CRIU restore operations.
358+
* Enabling this allows CRIU to restore timers with explicit IDs.
359+
*
360+
* Don't use for normal operations as the result might be undefined.
361+
*/
362+
#define PR_TIMER_CREATE_RESTORE_IDS 77
363+
# define PR_TIMER_CREATE_RESTORE_IDS_OFF 0
364+
# define PR_TIMER_CREATE_RESTORE_IDS_ON 1
365+
# define PR_TIMER_CREATE_RESTORE_IDS_GET 2
366+
356367
#endif /* _LINUX_PRCTL_H */

kernel/sys.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2811,6 +2811,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
28112811
return -EINVAL;
28122812
error = arch_lock_shadow_stack_status(me, arg2);
28132813
break;
2814+
case PR_TIMER_CREATE_RESTORE_IDS:
2815+
if (arg3 || arg4 || arg5)
2816+
return -EINVAL;
2817+
error = posixtimer_create_prctl(arg2);
2818+
break;
28142819
default:
28152820
trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5);
28162821
error = -EINVAL;

kernel/time/posix-timers.c

Lines changed: 79 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include <linux/nospec.h>
2020
#include <linux/posix-clock.h>
2121
#include <linux/posix-timers.h>
22+
#include <linux/prctl.h>
2223
#include <linux/sched/task.h>
2324
#include <linux/slab.h>
2425
#include <linux/syscalls.h>
@@ -57,6 +58,8 @@ static const struct k_clock * const posix_clocks[];
5758
static const struct k_clock *clockid_to_kclock(const clockid_t id);
5859
static const struct k_clock clock_realtime, clock_monotonic;
5960

61+
#define TIMER_ANY_ID INT_MIN
62+
6063
/* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */
6164
#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \
6265
~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD))
@@ -128,38 +131,60 @@ static bool posix_timer_hashed(struct timer_hash_bucket *bucket, struct signal_s
128131
return false;
129132
}
130133

131-
static int posix_timer_add(struct k_itimer *timer)
134+
static bool posix_timer_add_at(struct k_itimer *timer, struct signal_struct *sig, unsigned int id)
135+
{
136+
struct timer_hash_bucket *bucket = hash_bucket(sig, id);
137+
138+
scoped_guard (spinlock, &bucket->lock) {
139+
/*
140+
* Validate under the lock as this could have raced against
141+
* another thread ending up with the same ID, which is
142+
* highly unlikely, but possible.
143+
*/
144+
if (!posix_timer_hashed(bucket, sig, id)) {
145+
/*
146+
* Set the timer ID and the signal pointer to make
147+
* it identifiable in the hash table. The signal
148+
* pointer has bit 0 set to indicate that it is not
149+
* yet fully initialized. posix_timer_hashed()
150+
* masks this bit out, but the syscall lookup fails
151+
* to match due to it being set. This guarantees
152+
* that there can't be duplicate timer IDs handed
153+
* out.
154+
*/
155+
timer->it_id = (timer_t)id;
156+
timer->it_signal = (struct signal_struct *)((unsigned long)sig | 1UL);
157+
hlist_add_head_rcu(&timer->t_hash, &bucket->head);
158+
return true;
159+
}
160+
}
161+
return false;
162+
}
163+
164+
static int posix_timer_add(struct k_itimer *timer, int req_id)
132165
{
133166
struct signal_struct *sig = current->signal;
134167

168+
if (unlikely(req_id != TIMER_ANY_ID)) {
169+
if (!posix_timer_add_at(timer, sig, req_id))
170+
return -EBUSY;
171+
172+
/*
173+
* Move the ID counter past the requested ID, so that after
174+
* switching back to normal mode the IDs are outside of the
175+
* exact allocated region. That avoids ID collisions on the
176+
* next regular timer_create() invocations.
177+
*/
178+
atomic_set(&sig->next_posix_timer_id, req_id + 1);
179+
return req_id;
180+
}
181+
135182
for (unsigned int cnt = 0; cnt <= INT_MAX; cnt++) {
136183
/* Get the next timer ID and clamp it to positive space */
137184
unsigned int id = atomic_fetch_inc(&sig->next_posix_timer_id) & INT_MAX;
138-
struct timer_hash_bucket *bucket = hash_bucket(sig, id);
139185

140-
scoped_guard (spinlock, &bucket->lock) {
141-
/*
142-
* Validate under the lock as this could have raced
143-
* against another thread ending up with the same
144-
* ID, which is highly unlikely, but possible.
145-
*/
146-
if (!posix_timer_hashed(bucket, sig, id)) {
147-
/*
148-
* Set the timer ID and the signal pointer to make
149-
* it identifiable in the hash table. The signal
150-
* pointer has bit 0 set to indicate that it is not
151-
* yet fully initialized. posix_timer_hashed()
152-
* masks this bit out, but the syscall lookup fails
153-
* to match due to it being set. This guarantees
154-
* that there can't be duplicate timer IDs handed
155-
* out.
156-
*/
157-
timer->it_id = (timer_t)id;
158-
timer->it_signal = (struct signal_struct *)((unsigned long)sig | 1UL);
159-
hlist_add_head_rcu(&timer->t_hash, &bucket->head);
160-
return id;
161-
}
162-
}
186+
if (posix_timer_add_at(timer, sig, id))
187+
return id;
163188
cond_resched();
164189
}
165190
/* POSIX return code when no timer ID could be allocated */
@@ -364,6 +389,21 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
364389
return HRTIMER_NORESTART;
365390
}
366391

392+
long posixtimer_create_prctl(unsigned long ctrl)
393+
{
394+
switch (ctrl) {
395+
case PR_TIMER_CREATE_RESTORE_IDS_OFF:
396+
current->signal->timer_create_restore_ids = 0;
397+
return 0;
398+
case PR_TIMER_CREATE_RESTORE_IDS_ON:
399+
current->signal->timer_create_restore_ids = 1;
400+
return 0;
401+
case PR_TIMER_CREATE_RESTORE_IDS_GET:
402+
return current->signal->timer_create_restore_ids;
403+
}
404+
return -EINVAL;
405+
}
406+
367407
static struct pid *good_sigevent(sigevent_t * event)
368408
{
369409
struct pid *pid = task_tgid(current);
@@ -435,6 +475,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
435475
timer_t __user *created_timer_id)
436476
{
437477
const struct k_clock *kc = clockid_to_kclock(which_clock);
478+
timer_t req_id = TIMER_ANY_ID;
438479
struct k_itimer *new_timer;
439480
int error, new_timer_id;
440481

@@ -449,11 +490,20 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
449490

450491
spin_lock_init(&new_timer->it_lock);
451492

493+
/* Special case for CRIU to restore timers with a given timer ID. */
494+
if (unlikely(current->signal->timer_create_restore_ids)) {
495+
if (copy_from_user(&req_id, created_timer_id, sizeof(req_id)))
496+
return -EFAULT;
497+
/* Valid IDs are 0..INT_MAX */
498+
if ((unsigned int)req_id > INT_MAX)
499+
return -EINVAL;
500+
}
501+
452502
/*
453503
* Add the timer to the hash table. The timer is not yet valid
454504
* after insertion, but has a unique ID allocated.
455505
*/
456-
new_timer_id = posix_timer_add(new_timer);
506+
new_timer_id = posix_timer_add(new_timer, req_id);
457507
if (new_timer_id < 0) {
458508
posixtimer_free_timer(new_timer);
459509
return new_timer_id;
@@ -1041,6 +1091,9 @@ void exit_itimers(struct task_struct *tsk)
10411091
struct hlist_node *next;
10421092
struct k_itimer *timer;
10431093

1094+
/* Clear restore mode for exec() */
1095+
tsk->signal->timer_create_restore_ids = 0;
1096+
10441097
if (hlist_empty(&tsk->signal->posix_timers))
10451098
return;
10461099

0 commit comments

Comments
 (0)