@@ -262,3 +262,176 @@ int page_counter_memparse(const char *buf, const char *max,
 	return 0;
 }
+
+
+/*
+ * This function calculates an individual page counter's effective
+ * protection which is derived from its own memory.min/low, its
+ * parent's and siblings' settings, as well as the actual memory
+ * distribution in the tree.
+ *
+ * The following rules apply to the effective protection values:
+ *
+ * 1. At the first level of reclaim, effective protection is equal to
+ *    the declared protection in memory.min and memory.low.
+ *
+ * 2. To enable safe delegation of the protection configuration, at
+ *    subsequent levels the effective protection is capped to the
+ *    parent's effective protection.
+ *
+ * 3. To make complex and dynamic subtrees easier to configure, the
+ *    user is allowed to overcommit the declared protection at a given
+ *    level. If that is the case, the parent's effective protection is
+ *    distributed to the children in proportion to how much protection
+ *    they have declared and how much of it they are utilizing.
+ *
+ *    This makes distribution proportional, but also work-conserving:
+ *    if one counter claims much more protection than it uses memory,
+ *    the unused remainder is available to its siblings.
+ *
+ * 4. Conversely, when the declared protection is undercommitted at a
+ *    given level, the distribution of the larger parental protection
+ *    budget is NOT proportional. A counter's protection from a sibling
+ *    is capped to its own memory.min/low setting.
+ *
+ * 5. However, to allow protecting recursive subtrees from each other
+ *    without having to declare each individual counter's fixed share
+ *    of the ancestor's claim to protection, any unutilized -
+ *    "floating" - protection from up the tree is distributed in
+ *    proportion to each counter's *usage*. This makes the protection
+ *    neutral wrt sibling cgroups and lets them compete freely over
+ *    the shared parental protection budget, but it protects the
+ *    subtree as a whole from neighboring subtrees.
+ *
+ * Note that 4. and 5. are not in conflict: 4. is about protecting
+ * against immediate siblings whereas 5. is about protecting against
+ * neighboring subtrees.
+ */
+static unsigned long effective_protection(unsigned long usage,
+					  unsigned long parent_usage,
+					  unsigned long setting,
+					  unsigned long parent_effective,
+					  unsigned long siblings_protected,
+					  bool recursive_protection)
+{
+	unsigned long protected;
+	unsigned long ep;
+
+	protected = min(usage, setting);
+	/*
+	 * If all cgroups at this level combined claim and use more
+	 * protection than what the parent affords them, distribute
+	 * shares in proportion to utilization.
+	 *
+	 * We are using actual utilization rather than the statically
+	 * claimed protection in order to be work-conserving: claimed
+	 * but unused protection is available to siblings that would
+	 * otherwise get a smaller chunk than what they claimed.
+	 */
+	if (siblings_protected > parent_effective)
+		return protected * parent_effective / siblings_protected;
+
+	/*
+	 * Ok, utilized protection of all children is within what the
+	 * parent affords them, so we know whatever this child claims
+	 * and utilizes is effectively protected.
+	 *
+	 * If there is unprotected usage beyond this value, reclaim
+	 * will apply pressure in proportion to that amount.
+	 *
+	 * If there is unutilized protection, the cgroup will be fully
+	 * shielded from reclaim, but we do return a smaller value for
+	 * protection than what the group could enjoy in theory. This
+	 * is okay. With the overcommit distribution above, effective
+	 * protection is always dependent on how memory is actually
+	 * consumed among the siblings anyway.
+	 */
+	ep = protected;
+
+	/*
+	 * If the children aren't claiming (all of) the protection
+	 * afforded to them by the parent, distribute the remainder in
+	 * proportion to the (unprotected) memory of each cgroup. That
+	 * way, cgroups that aren't explicitly prioritized wrt each
+	 * other compete freely over the allowance, but they are
+	 * collectively protected from neighboring trees.
+	 *
+	 * We're using unprotected memory for the weight so that if
+	 * some cgroups DO claim explicit protection, we don't protect
+	 * the same bytes twice.
+	 *
+	 * Check both usage and parent_usage against the respective
+	 * protected values. One should imply the other, but they
+	 * aren't read atomically - make sure the division is sane.
+	 */
+	if (!recursive_protection)
+		return ep;
+
+	if (parent_effective > siblings_protected &&
+	    parent_usage > siblings_protected &&
+	    usage > protected) {
+		unsigned long unclaimed;
+
+		unclaimed = parent_effective - siblings_protected;
+		unclaimed *= usage - protected;
+		unclaimed /= parent_usage - siblings_protected;
+
+		ep += unclaimed;
+	}
+
+	return ep;
+}
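
To see what the two distribution branches do with concrete numbers, here is a minimal standalone sketch - a hypothetical userspace driver, not part of this commit - that copies the arithmetic above and runs it on example page counts. min_ul() stands in for the kernel's min() macro:

#include <stdio.h>
#include <stdbool.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

/* Same arithmetic as the kernel function above, userspace copy. */
static unsigned long effective_protection(unsigned long usage,
					  unsigned long parent_usage,
					  unsigned long setting,
					  unsigned long parent_effective,
					  unsigned long siblings_protected,
					  bool recursive_protection)
{
	unsigned long protected = min_ul(usage, setting);
	unsigned long ep;

	/* Rule 3: overcommit - scale utilized protection to the budget. */
	if (siblings_protected > parent_effective)
		return protected * parent_effective / siblings_protected;

	ep = protected;

	if (!recursive_protection)
		return ep;

	/* Rule 5: distribute floating protection by unprotected usage. */
	if (parent_effective > siblings_protected &&
	    parent_usage > siblings_protected &&
	    usage > protected) {
		unsigned long unclaimed;

		unclaimed = parent_effective - siblings_protected;
		unclaimed *= usage - protected;
		unclaimed /= parent_usage - siblings_protected;
		ep += unclaimed;
	}
	return ep;
}

int main(void)
{
	/*
	 * Overcommit: the parent affords 100 pages, but two children
	 * together utilize 150 pages of claimed protection (100 + 50).
	 * This child's 100-page claim is scaled by 100/150 to 66 pages;
	 * its sibling would get 33.
	 */
	printf("overcommit: %lu\n",
	       effective_protection(120, 200, 100, 100, 150, false));

	/*
	 * Floating: nothing is claimed at this level, and a child using
	 * 60 of the parent's 200 pages inherits 100 * 60/200 = 30 pages
	 * of the parent's unclaimed protection.
	 */
	printf("floating:   %lu\n",
	       effective_protection(60, 200, 0, 100, 0, true));
	return 0;
}

The truncating integer division matches the kernel's: rounding losses only ever shrink a child's share, so the children's effective protections never sum to more than the parent's budget.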
+
+
+/**
+ * page_counter_calculate_protection - check if memory consumption is in the normal range
+ * @root: the top ancestor of the sub-tree being checked
+ * @counter: the page_counter to update
+ * @recursive_protection: whether to use memory_recursiveprot behavior
+ *
+ * Calculates elow/emin thresholds for given page_counter.
+ *
+ * WARNING: This function is not stateless! It can only be used as part
+ *          of a top-down tree iteration, not for isolated queries.
+ */
+void page_counter_calculate_protection(struct page_counter *root,
+				       struct page_counter *counter,
+				       bool recursive_protection)
+{
+	unsigned long usage, parent_usage;
+	struct page_counter *parent = counter->parent;
+
+	/*
+	 * Effective values of the reclaim targets are ignored so they
+	 * can be stale. Have a look at mem_cgroup_protection for more
+	 * details.
+	 * TODO: calculation should be more robust so that we do not need
+	 * that special casing.
+	 */
+	if (root == counter)
+		return;
+
+	usage = page_counter_read(counter);
+	if (!usage)
+		return;
+
+	if (parent == root) {
+		counter->emin = READ_ONCE(counter->min);
+		counter->elow = READ_ONCE(counter->low);
+		return;
+	}
+
+	parent_usage = page_counter_read(parent);
+
+	WRITE_ONCE(counter->emin, effective_protection(usage, parent_usage,
+			READ_ONCE(counter->min),
+			READ_ONCE(parent->emin),
+			atomic_long_read(&parent->children_min_usage),
+			recursive_protection));
+
+	WRITE_ONCE(counter->elow, effective_protection(usage, parent_usage,
+			READ_ONCE(counter->low),
+			READ_ONCE(parent->elow),
+			atomic_long_read(&parent->children_low_usage),
+			recursive_protection));
+}
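
The WARNING is worth spelling out: effective_protection() reads parent->emin and parent->elow, so a counter's thresholds are only valid if its parent was updated earlier in the same pass. A sketch of the intended call pattern, assuming a hypothetical preorder_next() iterator (in the kernel, the memcg reclaim walk plays this role):

	struct page_counter *c;

	/*
	 * Parents before children: each call consumes the emin/elow
	 * values that an earlier iteration stored in the parent. The
	 * root itself returns early via the root == counter check.
	 */
	for (c = root; c; c = preorder_next(root, c))
		page_counter_calculate_protection(root, c, recursive_protection);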