@@ -69,6 +69,28 @@ struct ioem_data {
69
69
70
70
static bool ioem_limit_should_affect (struct ioem_data * data , struct request * rq );
71
71
72
+ /**
73
+ * struct ioem_priv - The priv data stored in request
74
+ * @time_to_send: The expected sending time of the request
75
+ *
76
+ * The expected sending time is calculated when this request comes into the
77
+ * scheduler, then it will be stored in the `struct ioem_priv`. This struct
78
+ * shouldn't be longer than three pointers, as the `rq->elv` only have three
79
+ * pointers long.
80
+ */
81
+ struct ioem_priv {
82
+ u64 time_to_send ;
83
+ bool ioem_limit_should_affect ;
84
+ }__attribute__((packed ));
85
+
86
+ struct ioem_priv * ioem_priv (struct request * rq )
87
+ {
88
+ BUILD_BUG_ON (sizeof (struct ioem_priv ) > sizeof (rq -> elv ));
89
+ // `priv` has two pointers long, is enough to store the `ioem_priv`.
90
+ return (struct ioem_priv * )(& rq -> elv .priv [0 ]);
91
+ }
92
+
93
+
72
94
/**
73
95
* struct irl - request limit
74
96
* @lock: The lock protects the config
@@ -90,6 +112,8 @@ struct irl {
90
112
atomic64_t io_counter ;
91
113
atomic64_t last_expire_time ;
92
114
struct hrtimer timer ;
115
+
116
+ atomic64_t affected_request_counter ;
93
117
};
94
118
95
119
/**
@@ -137,6 +161,7 @@ static void irl_init(struct irl* counter)
137
161
rwlock_init (& counter -> lock );
138
162
hrtimer_init (& counter -> timer , CLOCK_MONOTONIC , HRTIMER_MODE_ABS_PINNED );
139
163
counter -> timer .function = irl_timer_callback ;
164
+ atomic64_set (& counter -> last_expire_time , ktime_get_ns ());
140
165
}
141
166
142
167
struct irl_dispatch_return {
@@ -145,7 +170,7 @@ struct irl_dispatch_return {
145
170
};
146
171
147
172
/**
148
- * irl_dispatch() - change the config of irl
173
+ * irl_dispatch() - check whether this request can dispatch
149
174
* @data: The corresponding ioem_data struct
150
175
* @rq: The request to be dispatched
151
176
*
@@ -163,11 +188,12 @@ static struct irl_dispatch_return irl_dispatch(struct ioem_data* data, struct re
163
188
u64 counter ;
164
189
u64 quota ;
165
190
u64 period ;
191
+ u64 last_expire_time = atomic64_read (& irl -> last_expire_time );
166
192
167
193
read_lock (& irl -> lock );
168
194
169
195
period = atomic64_read (& irl -> io_period_us );
170
- if (period == 0 || !ioem_limit_should_affect ( data , rq )) {
196
+ if (period == 0 || !ioem_priv ( rq )-> ioem_limit_should_affect ) {
171
197
// the irl is not enabled
172
198
ret .dispatch = 1 ;
173
199
ret .time_to_send = 0 ;
@@ -182,12 +208,12 @@ static struct irl_dispatch_return irl_dispatch(struct ioem_data* data, struct re
182
208
counter = atomic64_read (& irl -> io_counter );
183
209
}
184
210
if (counter < quota ) {
185
- //
186
211
ret .dispatch = 1 ;
187
212
ret .time_to_send = 0 ;
213
+ atomic64_dec (& irl -> affected_request_counter );
188
214
} else {
189
215
ret .dispatch = 0 ;
190
- ret .time_to_send = ktime_get_ns () + period * NSEC_PER_USEC ;
216
+ ret .time_to_send = last_expire_time + period * NSEC_PER_USEC ;
191
217
}
192
218
}
193
219
@@ -197,24 +223,33 @@ static struct irl_dispatch_return irl_dispatch(struct ioem_data* data, struct re
197
223
}
198
224
199
225
/**
200
- * struct ioem_priv - The priv data stored in request
201
- * @time_to_send: The expected sending time of the request
226
+ * irl_enqueue() - optimize the time_to_send of a request which will enqueue
227
+ * @data: The corresponding ioem_data struct
228
+ * @rq: The request to be dispatch
202
229
*
203
- * The expected sending time is calculated when this request comes into the
204
- * scheduler, then it will be stored in the `struct ioem_priv`. This struct
205
- * shouldn't be longer than three pointers, as the `rq->elv` only have three
206
- * pointers long.
230
+ * This function will read the counter inside irl. If the counter is already
231
+ * greater than the quota and the `time_to_send` is earlier than the next
232
+ * period, it will set the `time_to_send` of the request to the next period.
207
233
*/
208
- struct ioem_priv {
209
- u64 time_to_send ;
210
- unsigned int pid_ns ;
211
- }__attribute__((packed ));
212
-
213
- struct ioem_priv * ioem_priv (struct request * rq )
234
+ static void irl_enqueue (struct ioem_data * data , struct request * rq )
214
235
{
215
- BUILD_BUG_ON (sizeof (struct ioem_priv ) > sizeof (rq -> elv ));
216
- // `priv` has two pointers long, is enough to store the `ioem_priv`.
217
- return (struct ioem_priv * )(& rq -> elv .priv [0 ]);
236
+ u64 next_period , period , counter ;
237
+ struct irl * irl = data -> irl ;
238
+
239
+ period = atomic64_read (& irl -> io_period_us );
240
+ if (period == 0 || !ioem_priv (rq )-> ioem_limit_should_affect ) {
241
+ return ;
242
+ }
243
+
244
+ counter = atomic64_fetch_add (1 , & irl -> affected_request_counter );
245
+ read_lock (& irl -> lock );
246
+ if (atomic64_read (& irl -> io_counter ) > irl -> io_quota ) {
247
+ next_period = atomic64_read (& irl -> last_expire_time ) + atomic64_read (& irl -> io_period_us ) * NSEC_PER_USEC * (counter / irl -> io_quota );
248
+ if (ioem_priv (rq )-> time_to_send < next_period ) {
249
+ ioem_priv (rq )-> time_to_send = next_period ;
250
+ };
251
+ }
252
+ read_unlock (& irl -> lock );
218
253
}
219
254
220
255
static void ioem_data_sync_with_injections (struct ioem_data * data );
@@ -276,19 +311,23 @@ static void ioem_data_init(struct ioem_data* data, enum hrtimer_restart (*functi
276
311
* @data: The `ioem_data` structure
277
312
* @rq: The request
278
313
*
279
- * The request will be inserted into the rb tree
314
+ * The request will be inserted into the rb tree. Before inserting the request,
315
+ * it will also check whether this request will be affected by the irl and
316
+ * whether the irl quota has already been exceeded, postponing the request if so.
280
317
*/
281
318
static void ioem_enqueue (struct ioem_data * data , struct request * rq )
282
319
{
283
320
struct rb_node * * p = & data -> root .rb_node , * parent = NULL ;
284
321
322
+ irl_enqueue (data , rq );
323
+
285
324
while (* p ) {
286
325
struct request * parent_rq ;
287
326
288
327
parent = * p ;
289
328
parent_rq = rb_entry_safe (parent , struct request , rb_node );
290
329
291
- if (ioem_priv (rq )-> time_to_send >= ioem_priv (parent_rq )-> time_to_send )
330
+ if (ioem_priv (rq )-> time_to_send > ioem_priv (parent_rq )-> time_to_send )
292
331
p = & parent -> rb_right ;
293
332
else
294
333
p = & parent -> rb_left ;
@@ -319,24 +358,41 @@ static struct request* ioem_dequeue(struct ioem_data *data)
319
358
return NULL ;
320
359
}
321
360
322
- rq = ioem_peek_request (data );
323
-
324
361
now = ktime_get_ns ();
325
- time_to_send = ioem_priv (rq )-> time_to_send ;
326
-
327
- if (time_to_send <= now ) {
362
+ while (true) {
328
363
struct irl_dispatch_return irl_ret ;
364
+
365
+ rq = ioem_peek_request (data );
366
+ time_to_send = ioem_priv (rq )-> time_to_send ;
367
+
368
+ // if this request's `time_to_send` is later than now, later requests
369
+ // will be all later than now, then we need to return without any
370
+ // request dispatched.
371
+ if (time_to_send > now ) {
372
+ rq = NULL ;
373
+ break ;
374
+ }
375
+
376
+ // check the IRL to decide whether the quota has exceeded
377
+ ioem_erase_head (data , rq );
378
+
329
379
irl_ret = irl_dispatch (data , rq );
330
380
if (irl_ret .dispatch > 0 ) {
331
- ioem_erase_head (data , rq );
381
+ // not exceeded, return the request
382
+ break ;
332
383
} else {
333
- time_to_send = irl_ret .time_to_send ;
384
+ // exceeded. Modify the time_to_send of this request, and reinsert
385
+ // to the rb_tree.
386
+ ioem_priv (rq )-> time_to_send = irl_ret .time_to_send ;
387
+ ioem_enqueue (data , rq );
388
+
334
389
rq = NULL ;
335
390
}
336
- } else {
337
- rq = NULL ;
338
391
}
339
392
393
+ // There are two possible situations to reach here:
394
+ // 1. The request is not NULL and is prepared to send
395
+ // 2. The earliest time_to_send is later than now
340
396
if (rq != NULL ) {
341
397
return rq ;
342
398
}
@@ -468,22 +524,26 @@ struct request* ioem_mq_dispatch_request(struct blk_mq_hw_ctx * hctx)
468
524
469
525
static void ioem_mq_insert_requests (struct blk_mq_hw_ctx * hctx , struct list_head * list , bool at_head )
470
526
{
527
+ struct request * rq , * next ;
471
528
struct ioem_data * id = hctx -> sched_data ;
472
529
473
530
spin_lock (& id -> lock );
474
531
ioem_data_sync_with_injections (id );
475
532
476
- while (!list_empty (list )) {
477
- struct request * rq ;
478
-
533
+ list_for_each_entry_safe (rq , next , list , queuelist ) {
479
534
rq = list_first_entry (list , struct request , queuelist );
480
- list_del_init (& rq -> queuelist );
535
+
536
+ list_del (& rq -> queuelist );
481
537
482
- ioem_priv (rq )-> time_to_send = ktime_get_ns ();
483
- ioem_priv (rq )-> pid_ns = ns_inum (task_active_pid_ns (current ));
538
+ if (at_head ) {
539
+ ioem_priv (rq )-> time_to_send = 0 ;
540
+ ioem_priv (rq )-> ioem_limit_should_affect = 0 ;
541
+ } else {
542
+ ioem_priv (rq )-> time_to_send = ktime_get_ns ();
543
+ ioem_priv (rq )-> ioem_limit_should_affect = ioem_limit_should_affect (id , rq );
544
+ }
484
545
485
546
ioem_error_injection (rq );
486
-
487
547
ioem_enqueue (id , rq );
488
548
489
549
#if (LINUX_VERSION_CODE < KERNEL_VERSION (5 , 12 , 0 )) && (LINUX_VERSION_CODE >= KERNEL_VERSION (5 , 10 , 0 ))
@@ -964,12 +1024,21 @@ static s64 ioem_random(s64 mu, s32 jitter, struct crndstate *state) {
964
1024
return ((rnd % (2 * (u32 )jitter )) + mu ) - jitter ;
965
1025
}
966
1026
1027
+ /**
1028
+ * ioem_should_inject() - whether this request should be injected
1029
+ * @rq: The io request
1030
+ * @e: The ioem injection
1031
+ *
1032
+ * This function should be called under process context, which means the
1033
+ * `current` should point to the current process, so that we can get the pid
1034
+ * namespace (or other information) of the process.
1035
+ */
967
1036
static bool ioem_should_inject (struct request * rq , struct ioem_injection * e ) {
968
1037
if (rq -> bio == NULL || e == NULL ) {
969
1038
return 0 ;
970
1039
}
971
1040
972
- if (e -> arg .pid_ns != 0 && ioem_priv ( rq ) -> pid_ns != e -> arg .pid_ns ) {
1041
+ if (e -> arg .pid_ns != 0 && ns_inum ( task_active_pid_ns ( current )) != e -> arg .pid_ns ) {
973
1042
return 0 ;
974
1043
}
975
1044
0 commit comments