Merge branch 'for-4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo: - cgroup v2 interface is now official. It's no longer hidden behind a devel flag and can be mounted using the new cgroup2 fs type. Unfortunately, cpu v2 interface hasn't made it yet due to the discussion around in-process hierarchical resource distribution and only memory and io controllers can be used on the v2 interface at the moment. - The existing documentation which has always been a bit of mess is relocated under Documentation/cgroup-v1/. Documentation/cgroup-v2.txt is added as the authoritative documentation for the v2 interface. - Some features are added through for-4.5-ancestor-test branch to enable netfilter xt_cgroup match to use cgroup v2 paths. The actual netfilter changes will be merged through the net tree which pulled in the said branch. - Various cleanups * 'for-4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: cgroup: rename cgroup documentations cgroup: fix a typo. cgroup: Remove resource_counter.txt in Documentation/cgroup-legacy/00-INDEX. cgroup: demote subsystem init messages to KERN_DEBUG cgroup: Fix uninitialized variable warning cgroup: put controller Kconfig options in meaningful order cgroup: clean up the kernel configuration menu nomenclature cgroup_pids: fix a typo. Subject: cgroup: Fix incomplete dd command in blkio documentation cgroup: kill cgrp_ss_priv[CGROUP_CANFORK_COUNT] and friends cpuset: Replace all instances of time_t with time64_t cgroup: replace unified-hierarchy.txt with a proper cgroup v2 documentation cgroup: rename Documentation/cgroups/ to Documentation/cgroup-legacy/ cgroup: replace __DEVEL__sane_behavior with cgroup2 fs type
This commit is contained in:
commit
34a9304a96
27 changed files with 1469 additions and 963 deletions
|
@ -24,7 +24,5 @@ net_prio.txt
|
|||
- Network priority cgroups details and usages.
|
||||
pids.txt
|
||||
- Process number cgroups details and usages.
|
||||
resource_counter.txt
|
||||
- Resource Counter API.
|
||||
unified-hierarchy.txt
|
||||
- Description the new/next cgroup interface.
|
|
@ -84,8 +84,7 @@ Throttling/Upper Limit policy
|
|||
|
||||
- Run dd to read a file and see if rate is throttled to 1MB/s or not.
|
||||
|
||||
# dd if=/mnt/common/zerofile of=/dev/null bs=4K count=1024
|
||||
# iflag=direct
|
||||
# dd iflag=direct if=/mnt/common/zerofile of=/dev/null bs=4K count=1024
|
||||
1024+0 records in
|
||||
1024+0 records out
|
||||
4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s
|
||||
|
@ -374,82 +373,3 @@ One can experience an overall throughput drop if you have created multiple
|
|||
groups and put applications in that group which are not driving enough
|
||||
IO to keep disk busy. In that case set group_idle=0, and CFQ will not idle
|
||||
on individual groups and throughput should improve.
|
||||
|
||||
Writeback
|
||||
=========
|
||||
|
||||
Page cache is dirtied through buffered writes and shared mmaps and
|
||||
written asynchronously to the backing filesystem by the writeback
|
||||
mechanism. Writeback sits between the memory and IO domains and
|
||||
regulates the proportion of dirty memory by balancing dirtying and
|
||||
write IOs.
|
||||
|
||||
On traditional cgroup hierarchies, relationships between different
|
||||
controllers cannot be established making it impossible for writeback
|
||||
to operate accounting for cgroup resource restrictions and all
|
||||
writeback IOs are attributed to the root cgroup.
|
||||
|
||||
If both the blkio and memory controllers are used on the v2 hierarchy
|
||||
and the filesystem supports cgroup writeback, writeback operations
|
||||
correctly follow the resource restrictions imposed by both memory and
|
||||
blkio controllers.
|
||||
|
||||
Writeback examines both system-wide and per-cgroup dirty memory status
|
||||
and enforces the more restrictive of the two. Also, writeback control
|
||||
parameters which are absolute values - vm.dirty_bytes and
|
||||
vm.dirty_background_bytes - are distributed across cgroups according
|
||||
to their current writeback bandwidth.
|
||||
|
||||
There's a peculiarity stemming from the discrepancy in ownership
|
||||
granularity between memory controller and writeback. While memory
|
||||
controller tracks ownership per page, writeback operates on inode
|
||||
basis. cgroup writeback bridges the gap by tracking ownership by
|
||||
inode but migrating ownership if too many foreign pages, pages which
|
||||
don't match the current inode ownership, have been encountered while
|
||||
writing back the inode.
|
||||
|
||||
This is a conscious design choice as writeback operations are
|
||||
inherently tied to inodes making strictly following page ownership
|
||||
complicated and inefficient. The only use case which suffers from
|
||||
this compromise is multiple cgroups concurrently dirtying disjoint
|
||||
regions of the same inode, which is an unlikely use case and decided
|
||||
to be unsupported. Note that as memory controller assigns page
|
||||
ownership on the first use and doesn't update it until the page is
|
||||
released, even if cgroup writeback strictly follows page ownership,
|
||||
multiple cgroups dirtying overlapping areas wouldn't work as expected.
|
||||
In general, write-sharing an inode across multiple cgroups is not well
|
||||
supported.
|
||||
|
||||
Filesystem support for cgroup writeback
|
||||
---------------------------------------
|
||||
|
||||
A filesystem can make writeback IOs cgroup-aware by updating
|
||||
address_space_operations->writepage[s]() to annotate bio's using the
|
||||
following two functions.
|
||||
|
||||
* wbc_init_bio(@wbc, @bio)
|
||||
|
||||
Should be called for each bio carrying writeback data and associates
|
||||
the bio with the inode's owner cgroup. Can be called anytime
|
||||
between bio allocation and submission.
|
||||
|
||||
* wbc_account_io(@wbc, @page, @bytes)
|
||||
|
||||
Should be called for each data segment being written out. While
|
||||
this function doesn't care exactly when it's called during the
|
||||
writeback session, it's the easiest and most natural to call it as
|
||||
data segments are added to a bio.
|
||||
|
||||
With writeback bio's annotated, cgroup support can be enabled per
|
||||
super_block by setting MS_CGROUPWB in ->s_flags. This allows for
|
||||
selective disabling of cgroup writeback support which is helpful when
|
||||
certain filesystem features, e.g. journaled data mode, are
|
||||
incompatible.
|
||||
|
||||
wbc_init_bio() binds the specified bio to its cgroup. Depending on
|
||||
the configuration, the bio may be executed at a lower priority and if
|
||||
the writeback session is holding shared resources, e.g. a journal
|
||||
entry, may lead to priority inversion. There is no one easy solution
|
||||
for the problem. Filesystems can try to work around specific problem
|
||||
cases by skipping wbc_init_bio() or using bio_associate_blkcg()
|
||||
directly.
|
1293
Documentation/cgroup-v2.txt
Normal file
1293
Documentation/cgroup-v2.txt
Normal file
File diff suppressed because it is too large
Load diff
|
@ -1,647 +0,0 @@
|
|||
|
||||
Cgroup unified hierarchy
|
||||
|
||||
April, 2014 Tejun Heo <tj@kernel.org>
|
||||
|
||||
This document describes the changes made by unified hierarchy and
|
||||
their rationales. It will eventually be merged into the main cgroup
|
||||
documentation.
|
||||
|
||||
CONTENTS
|
||||
|
||||
1. Background
|
||||
2. Basic Operation
|
||||
2-1. Mounting
|
||||
2-2. cgroup.subtree_control
|
||||
2-3. cgroup.controllers
|
||||
3. Structural Constraints
|
||||
3-1. Top-down
|
||||
3-2. No internal tasks
|
||||
4. Delegation
|
||||
4-1. Model of delegation
|
||||
4-2. Common ancestor rule
|
||||
5. Other Changes
|
||||
5-1. [Un]populated Notification
|
||||
5-2. Other Core Changes
|
||||
5-3. Controller File Conventions
|
||||
5-3-1. Format
|
||||
5-3-2. Control Knobs
|
||||
5-4. Per-Controller Changes
|
||||
5-4-1. io
|
||||
5-4-2. cpuset
|
||||
5-4-3. memory
|
||||
6. Planned Changes
|
||||
6-1. CAP for resource control
|
||||
|
||||
|
||||
1. Background
|
||||
|
||||
cgroup allows an arbitrary number of hierarchies and each hierarchy
|
||||
can host any number of controllers. While this seems to provide a
|
||||
high level of flexibility, it isn't quite useful in practice.
|
||||
|
||||
For example, as there is only one instance of each controller, utility
|
||||
type controllers such as freezer which can be useful in all
|
||||
hierarchies can only be used in one. The issue is exacerbated by the
|
||||
fact that controllers can't be moved around once hierarchies are
|
||||
populated. Another issue is that all controllers bound to a hierarchy
|
||||
are forced to have exactly the same view of the hierarchy. It isn't
|
||||
possible to vary the granularity depending on the specific controller.
|
||||
|
||||
In practice, these issues heavily limit which controllers can be put
|
||||
on the same hierarchy and most configurations resort to putting each
|
||||
controller on its own hierarchy. Only closely related ones, such as
|
||||
the cpu and cpuacct controllers, make sense to put on the same
|
||||
hierarchy. This often means that userland ends up managing multiple
|
||||
similar hierarchies repeating the same steps on each hierarchy
|
||||
whenever a hierarchy management operation is necessary.
|
||||
|
||||
Unfortunately, support for multiple hierarchies comes at a steep cost.
|
||||
Internal implementation in cgroup core proper is dazzlingly
|
||||
complicated but more importantly the support for multiple hierarchies
|
||||
restricts how cgroup is used in general and what controllers can do.
|
||||
|
||||
There's no limit on how many hierarchies there may be, which means
|
||||
that a task's cgroup membership can't be described in finite length.
|
||||
The key may contain any varying number of entries and is unlimited in
|
||||
length, which makes it highly awkward to handle and leads to addition
|
||||
of controllers which exist only to identify membership, which in turn
|
||||
exacerbates the original problem.
|
||||
|
||||
Also, as a controller can't have any expectation regarding what shape
|
||||
of hierarchies other controllers would be on, each controller has to
|
||||
assume that all other controllers are operating on completely
|
||||
orthogonal hierarchies. This makes it impossible, or at least very
|
||||
cumbersome, for controllers to cooperate with each other.
|
||||
|
||||
In most use cases, putting controllers on hierarchies which are
|
||||
completely orthogonal to each other isn't necessary. What usually is
|
||||
called for is the ability to have differing levels of granularity
|
||||
depending on the specific controller. In other words, hierarchy may
|
||||
be collapsed from leaf towards root when viewed from specific
|
||||
controllers. For example, a given configuration might not care about
|
||||
how memory is distributed beyond a certain level while still wanting
|
||||
to control how CPU cycles are distributed.
|
||||
|
||||
Unified hierarchy is the next version of cgroup interface. It aims to
|
||||
address the aforementioned issues by having more structure while
|
||||
retaining enough flexibility for most use cases. Various other
|
||||
general and controller-specific interface issues are also addressed in
|
||||
the process.
|
||||
|
||||
|
||||
2. Basic Operation
|
||||
|
||||
2-1. Mounting
|
||||
|
||||
Currently, unified hierarchy can be mounted with the following mount
|
||||
command. Note that this is still under development and scheduled to
|
||||
change soon.
|
||||
|
||||
mount -t cgroup -o __DEVEL__sane_behavior cgroup $MOUNT_POINT
|
||||
|
||||
All controllers which support the unified hierarchy and are not bound
|
||||
to other hierarchies are automatically bound to unified hierarchy and
|
||||
show up at the root of it. Controllers which are enabled only in the
|
||||
root of unified hierarchy can be bound to other hierarchies. This
|
||||
allows mixing unified hierarchy with the traditional multiple
|
||||
hierarchies in a fully backward compatible way.
|
||||
|
||||
A controller can be moved across hierarchies only after the controller
|
||||
is no longer referenced in its current hierarchy. Because per-cgroup
|
||||
controller states are destroyed asynchronously and controllers may
|
||||
have lingering references, a controller may not show up immediately on
|
||||
the unified hierarchy after the final umount of the previous
|
||||
hierarchy. Similarly, a controller should be fully disabled to be
|
||||
moved out of the unified hierarchy and it may take some time for the
|
||||
disabled controller to become available for other hierarchies;
|
||||
furthermore, due to dependencies among controllers, other controllers
|
||||
may need to be disabled too.
|
||||
|
||||
While useful for development and manual configurations, dynamically
|
||||
moving controllers between the unified and other hierarchies is
|
||||
strongly discouraged for production use. It is recommended to decide
|
||||
the hierarchies and controller associations before starting using the
|
||||
controllers.
|
||||
|
||||
|
||||
2-2. cgroup.subtree_control
|
||||
|
||||
All cgroups on unified hierarchy have a "cgroup.subtree_control" file
|
||||
which governs which controllers are enabled on the children of the
|
||||
cgroup. Let's assume a hierarchy like the following.
|
||||
|
||||
root - A - B - C
|
||||
\ D
|
||||
|
||||
root's "cgroup.subtree_control" file determines which controllers are
|
||||
enabled on A. A's on B. B's on C and D. This coincides with the
|
||||
fact that controllers on the immediate sub-level are used to
|
||||
distribute the resources of the parent. In fact, it's natural to
|
||||
assume that resource control knobs of a child belong to its parent.
|
||||
Enabling a controller in a "cgroup.subtree_control" file declares that
|
||||
distribution of the respective resources of the cgroup will be
|
||||
controlled. Note that this means that controller enable states are
|
||||
shared among siblings.
|
||||
|
||||
When read, the file contains a space-separated list of currently
|
||||
enabled controllers. A write to the file should contain a
|
||||
space-separated list of controllers with '+' or '-' prefixed (without
|
||||
the quotes). Controllers prefixed with '+' are enabled and '-'
|
||||
disabled. If a controller is listed multiple times, the last entry
|
||||
wins. The specific operations are executed atomically - either all
|
||||
succeed or fail.
|
||||
|
||||
|
||||
2-3. cgroup.controllers
|
||||
|
||||
Read-only "cgroup.controllers" file contains a space-separated list of
|
||||
controllers which can be enabled in the cgroup's
|
||||
"cgroup.subtree_control" file.
|
||||
|
||||
In the root cgroup, this lists controllers which are not bound to
|
||||
other hierarchies and the content changes as controllers are bound to
|
||||
and unbound from other hierarchies.
|
||||
|
||||
In non-root cgroups, the content of this file equals that of the
|
||||
parent's "cgroup.subtree_control" file as only controllers enabled
|
||||
from the parent can be used in its children.
|
||||
|
||||
|
||||
3. Structural Constraints
|
||||
|
||||
3-1. Top-down
|
||||
|
||||
As it doesn't make sense to nest control of an uncontrolled resource,
|
||||
all non-root "cgroup.subtree_control" files can only contain
|
||||
controllers which are enabled in the parent's "cgroup.subtree_control"
|
||||
file. A controller can be enabled only if the parent has the
|
||||
controller enabled and a controller can't be disabled if one or more
|
||||
children have it enabled.
|
||||
|
||||
|
||||
3-2. No internal tasks
|
||||
|
||||
One long-standing issue that cgroup faces is the competition between
|
||||
tasks belonging to the parent cgroup and its children cgroups. This
|
||||
is inherently nasty as two different types of entities compete and
|
||||
there is no agreed-upon obvious way to handle it. Different
|
||||
controllers are doing different things.
|
||||
|
||||
The cpu controller considers tasks and cgroups as equivalents and maps
|
||||
nice levels to cgroup weights. This works for some cases but falls
|
||||
flat when children should be allocated specific ratios of CPU cycles
|
||||
and the number of internal tasks fluctuates - the ratios constantly
|
||||
change as the number of competing entities fluctuates. There also are
|
||||
other issues. The mapping from nice level to weight isn't obvious or
|
||||
universal, and there are various other knobs which simply aren't
|
||||
available for tasks.
|
||||
|
||||
The io controller implicitly creates a hidden leaf node for each
|
||||
cgroup to host the tasks. The hidden leaf has its own copies of all
|
||||
the knobs with "leaf_" prefixed. While this allows equivalent control
|
||||
over internal tasks, it's with serious drawbacks. It always adds an
|
||||
extra layer of nesting which may not be necessary, makes the interface
|
||||
messy and significantly complicates the implementation.
|
||||
|
||||
The memory controller currently doesn't have a way to control what
|
||||
happens between internal tasks and child cgroups and the behavior is
|
||||
not clearly defined. There have been attempts to add ad-hoc behaviors
|
||||
and knobs to tailor the behavior to specific workloads. Continuing
|
||||
this direction will lead to problems which will be extremely difficult
|
||||
to resolve in the long term.
|
||||
|
||||
Multiple controllers struggle with internal tasks and came up with
|
||||
different ways to deal with it; unfortunately, all the approaches in
|
||||
use now are severely flawed and, furthermore, the widely different
|
||||
behaviors make cgroup as whole highly inconsistent.
|
||||
|
||||
It is clear that this is something which needs to be addressed from
|
||||
cgroup core proper in a uniform way so that controllers don't need to
|
||||
worry about it and cgroup as a whole shows a consistent and logical
|
||||
behavior. To achieve that, unified hierarchy enforces the following
|
||||
structural constraint:
|
||||
|
||||
Except for the root, only cgroups which don't contain any task may
|
||||
have controllers enabled in their "cgroup.subtree_control" files.
|
||||
|
||||
Combined with other properties, this guarantees that, when a
|
||||
controller is looking at the part of the hierarchy which has it
|
||||
enabled, tasks are always only on the leaves. This rules out
|
||||
situations where child cgroups compete against internal tasks of the
|
||||
parent.
|
||||
|
||||
There are two things to note. Firstly, the root cgroup is exempt from
|
||||
the restriction. Root contains tasks and anonymous resource
|
||||
consumption which can't be associated with any other cgroup and
|
||||
requires special treatment from most controllers. How resource
|
||||
consumption in the root cgroup is governed is up to each controller.
|
||||
|
||||
Secondly, the restriction doesn't take effect if there is no enabled
|
||||
controller in the cgroup's "cgroup.subtree_control" file. This is
|
||||
important as otherwise it wouldn't be possible to create children of a
|
||||
populated cgroup. To control resource distribution of a cgroup, the
|
||||
cgroup must create children and transfer all its tasks to the children
|
||||
before enabling controllers in its "cgroup.subtree_control" file.
|
||||
|
||||
|
||||
4. Delegation
|
||||
|
||||
4-1. Model of delegation
|
||||
|
||||
A cgroup can be delegated to a less privileged user by granting write
|
||||
access of the directory and its "cgroup.procs" file to the user. Note
|
||||
that the resource control knobs in a given directory concern the
|
||||
resources of the parent and thus must not be delegated along with the
|
||||
directory.
|
||||
|
||||
Once delegated, the user can build sub-hierarchy under the directory,
|
||||
organize processes as it sees fit and further distribute the resources
|
||||
it got from the parent. The limits and other settings of all resource
|
||||
controllers are hierarchical and regardless of what happens in the
|
||||
delegated sub-hierarchy, nothing can escape the resource restrictions
|
||||
imposed by the parent.
|
||||
|
||||
Currently, cgroup doesn't impose any restrictions on the number of
|
||||
cgroups in or nesting depth of a delegated sub-hierarchy; however,
|
||||
this may in the future be limited explicitly.
|
||||
|
||||
|
||||
4-2. Common ancestor rule
|
||||
|
||||
On the unified hierarchy, to write to a "cgroup.procs" file, in
|
||||
addition to the usual write permission to the file and uid match, the
|
||||
writer must also have write access to the "cgroup.procs" file of the
|
||||
common ancestor of the source and destination cgroups. This prevents
|
||||
delegatees from smuggling processes across disjoint sub-hierarchies.
|
||||
|
||||
Let's say cgroups C0 and C1 have been delegated to user U0 who created
|
||||
C00, C01 under C0 and C10 under C1 as follows.
|
||||
|
||||
~~~~~~~~~~~~~ - C0 - C00
|
||||
~ cgroup ~ \ C01
|
||||
~ hierarchy ~
|
||||
~~~~~~~~~~~~~ - C1 - C10
|
||||
|
||||
C0 and C1 are separate entities in terms of resource distribution
|
||||
regardless of their relative positions in the hierarchy. The
|
||||
resources the processes under C0 are entitled to are controlled by
|
||||
C0's ancestors and may be completely different from C1. It's clear
|
||||
that the intention of delegating C0 to U0 is allowing U0 to organize
|
||||
the processes under C0 and further control the distribution of C0's
|
||||
resources.
|
||||
|
||||
On traditional hierarchies, if a task has write access to "tasks" or
|
||||
"cgroup.procs" file of a cgroup and its uid agrees with the target, it
|
||||
can move the target to the cgroup. In the above example, U0 will not
|
||||
only be able to move processes in each sub-hierarchy but also across
|
||||
the two sub-hierarchies, effectively allowing it to violate the
|
||||
organizational and resource restrictions implied by the hierarchical
|
||||
structure above C0 and C1.
|
||||
|
||||
On the unified hierarchy, let's say U0 wants to write the pid of a
|
||||
process which has a matching uid and is currently in C10 into
|
||||
"C00/cgroup.procs". U0 obviously has write access to the file and
|
||||
migration permission on the process; however, the common ancestor of
|
||||
the source cgroup C10 and the destination cgroup C00 is above the
|
||||
points of delegation and U0 would not have write access to its
|
||||
"cgroup.procs" and thus be denied with -EACCES.
|
||||
|
||||
|
||||
5. Other Changes
|
||||
|
||||
5-1. [Un]populated Notification
|
||||
|
||||
cgroup users often need a way to determine when a cgroup's
|
||||
subhierarchy becomes empty so that it can be cleaned up. cgroup
|
||||
currently provides release_agent for it; unfortunately, this mechanism
|
||||
is riddled with issues.
|
||||
|
||||
- It delivers events by forking and execing a userland binary
|
||||
specified as the release_agent. This is a long deprecated method of
|
||||
notification delivery. It's extremely heavy, slow and cumbersome to
|
||||
integrate with larger infrastructure.
|
||||
|
||||
- There is single monitoring point at the root. There's no way to
|
||||
delegate management of a subtree.
|
||||
|
||||
- The event isn't recursive. It triggers when a cgroup doesn't have
|
||||
any tasks or child cgroups. Events for internal nodes trigger only
|
||||
after all children are removed. This again makes it impossible to
|
||||
delegate management of a subtree.
|
||||
|
||||
- Events are filtered from the kernel side. A "notify_on_release"
|
||||
file is used to subscribe to or suppress release events. This is
|
||||
unnecessarily complicated and probably done this way because event
|
||||
delivery itself was expensive.
|
||||
|
||||
Unified hierarchy implements "populated" field in "cgroup.events"
|
||||
interface file which can be used to monitor whether the cgroup's
|
||||
subhierarchy has tasks in it or not. Its value is 0 if there is no
|
||||
task in the cgroup and its descendants; otherwise, 1. poll and
|
||||
[id]notify events are triggered when the value changes.
|
||||
|
||||
This is significantly lighter and simpler and trivially allows
|
||||
delegating management of subhierarchy - subhierarchy monitoring can
|
||||
block further propagation simply by putting itself or another process
|
||||
in the subhierarchy and monitor events that it's interested in from
|
||||
there without interfering with monitoring higher in the tree.
|
||||
|
||||
In unified hierarchy, the release_agent mechanism is no longer
|
||||
supported and the interface files "release_agent" and
|
||||
"notify_on_release" do not exist.
|
||||
|
||||
|
||||
5-2. Other Core Changes
|
||||
|
||||
- None of the mount options is allowed.
|
||||
|
||||
- remount is disallowed.
|
||||
|
||||
- rename(2) is disallowed.
|
||||
|
||||
- The "tasks" file is removed. Everything should at process
|
||||
granularity. Use the "cgroup.procs" file instead.
|
||||
|
||||
- The "cgroup.procs" file is not sorted. pids will be unique unless
|
||||
they got recycled in-between reads.
|
||||
|
||||
- The "cgroup.clone_children" file is removed.
|
||||
|
||||
- /proc/PID/cgroup keeps reporting the cgroup that a zombie belonged
|
||||
to before exiting. If the cgroup is removed before the zombie is
|
||||
reaped, " (deleted)" is appeneded to the path.
|
||||
|
||||
|
||||
5-3. Controller File Conventions
|
||||
|
||||
5-3-1. Format
|
||||
|
||||
In general, all controller files should be in one of the following
|
||||
formats whenever possible.
|
||||
|
||||
- Values only files
|
||||
|
||||
VAL0 VAL1...\n
|
||||
|
||||
- Flat keyed files
|
||||
|
||||
KEY0 VAL0\n
|
||||
KEY1 VAL1\n
|
||||
...
|
||||
|
||||
- Nested keyed files
|
||||
|
||||
KEY0 SUB_KEY0=VAL00 SUB_KEY1=VAL01...
|
||||
KEY1 SUB_KEY0=VAL10 SUB_KEY1=VAL11...
|
||||
...
|
||||
|
||||
For a writeable file, the format for writing should generally match
|
||||
reading; however, controllers may allow omitting later fields or
|
||||
implement restricted shortcuts for most common use cases.
|
||||
|
||||
For both flat and nested keyed files, only the values for a single key
|
||||
can be written at a time. For nested keyed files, the sub key pairs
|
||||
may be specified in any order and not all pairs have to be specified.
|
||||
|
||||
|
||||
5-3-2. Control Knobs
|
||||
|
||||
- Settings for a single feature should generally be implemented in a
|
||||
single file.
|
||||
|
||||
- In general, the root cgroup should be exempt from resource control
|
||||
and thus shouldn't have resource control knobs.
|
||||
|
||||
- If a controller implements ratio based resource distribution, the
|
||||
control knob should be named "weight" and have the range [1, 10000]
|
||||
and 100 should be the default value. The values are chosen to allow
|
||||
enough and symmetric bias in both directions while keeping it
|
||||
intuitive (the default is 100%).
|
||||
|
||||
- If a controller implements an absolute resource guarantee and/or
|
||||
limit, the control knobs should be named "min" and "max"
|
||||
respectively. If a controller implements best effort resource
|
||||
gurantee and/or limit, the control knobs should be named "low" and
|
||||
"high" respectively.
|
||||
|
||||
In the above four control files, the special token "max" should be
|
||||
used to represent upward infinity for both reading and writing.
|
||||
|
||||
- If a setting has configurable default value and specific overrides,
|
||||
the default settings should be keyed with "default" and appear as
|
||||
the first entry in the file. Specific entries can use "default" as
|
||||
its value to indicate inheritance of the default value.
|
||||
|
||||
- For events which are not very high frequency, an interface file
|
||||
"events" should be created which lists event key value pairs.
|
||||
Whenever a notifiable event happens, file modified event should be
|
||||
generated on the file.
|
||||
|
||||
|
||||
5-4. Per-Controller Changes
|
||||
|
||||
5-4-1. io
|
||||
|
||||
- blkio is renamed to io. The interface is overhauled anyway. The
|
||||
new name is more in line with the other two major controllers, cpu
|
||||
and memory, and better suited given that it may be used for cgroup
|
||||
writeback without involving block layer.
|
||||
|
||||
- Everything including stat is always hierarchical making separate
|
||||
recursive stat files pointless and, as no internal node can have
|
||||
tasks, leaf weights are meaningless. The operation model is
|
||||
simplified and the interface is overhauled accordingly.
|
||||
|
||||
io.stat
|
||||
|
||||
The stat file. The reported stats are from the point where
|
||||
bio's are issued to request_queue. The stats are counted
|
||||
independent of which policies are enabled. Each line in the
|
||||
file follows the following format. More fields may later be
|
||||
added at the end.
|
||||
|
||||
$MAJ:$MIN rbytes=$RBYTES wbytes=$WBYTES rios=$RIOS wrios=$WIOS
|
||||
|
||||
io.weight
|
||||
|
||||
The weight setting, currently only available and effective if
|
||||
cfq-iosched is in use for the target device. The weight is
|
||||
between 1 and 10000 and defaults to 100. The first line
|
||||
always contains the default weight in the following format to
|
||||
use when per-device setting is missing.
|
||||
|
||||
default $WEIGHT
|
||||
|
||||
Subsequent lines list per-device weights of the following
|
||||
format.
|
||||
|
||||
$MAJ:$MIN $WEIGHT
|
||||
|
||||
Writing "$WEIGHT" or "default $WEIGHT" changes the default
|
||||
setting. Writing "$MAJ:$MIN $WEIGHT" sets per-device weight
|
||||
while "$MAJ:$MIN default" clears it.
|
||||
|
||||
This file is available only on non-root cgroups.
|
||||
|
||||
io.max
|
||||
|
||||
The maximum bandwidth and/or iops setting, only available if
|
||||
blk-throttle is enabled. The file is of the following format.
|
||||
|
||||
$MAJ:$MIN rbps=$RBPS wbps=$WBPS riops=$RIOPS wiops=$WIOPS
|
||||
|
||||
${R|W}BPS are read/write bytes per second and ${R|W}IOPS are
|
||||
read/write IOs per second. "max" indicates no limit. Writing
|
||||
to the file follows the same format but the individual
|
||||
settings may be omitted or specified in any order.
|
||||
|
||||
This file is available only on non-root cgroups.
|
||||
|
||||
|
||||
5-4-2. cpuset
|
||||
|
||||
- Tasks are kept in empty cpusets after hotplug and take on the masks
|
||||
of the nearest non-empty ancestor, instead of being moved to it.
|
||||
|
||||
- A task can be moved into an empty cpuset, and again it takes on the
|
||||
masks of the nearest non-empty ancestor.
|
||||
|
||||
|
||||
5-4-3. memory
|
||||
|
||||
- use_hierarchy is on by default and the cgroup file for the flag is
|
||||
not created.
|
||||
|
||||
- The original lower boundary, the soft limit, is defined as a limit
|
||||
that is per default unset. As a result, the set of cgroups that
|
||||
global reclaim prefers is opt-in, rather than opt-out. The costs
|
||||
for optimizing these mostly negative lookups are so high that the
|
||||
implementation, despite its enormous size, does not even provide the
|
||||
basic desirable behavior. First off, the soft limit has no
|
||||
hierarchical meaning. All configured groups are organized in a
|
||||
global rbtree and treated like equal peers, regardless where they
|
||||
are located in the hierarchy. This makes subtree delegation
|
||||
impossible. Second, the soft limit reclaim pass is so aggressive
|
||||
that it not just introduces high allocation latencies into the
|
||||
system, but also impacts system performance due to overreclaim, to
|
||||
the point where the feature becomes self-defeating.
|
||||
|
||||
The memory.low boundary on the other hand is a top-down allocated
|
||||
reserve. A cgroup enjoys reclaim protection when it and all its
|
||||
ancestors are below their low boundaries, which makes delegation of
|
||||
subtrees possible. Secondly, new cgroups have no reserve per
|
||||
default and in the common case most cgroups are eligible for the
|
||||
preferred reclaim pass. This allows the new low boundary to be
|
||||
efficiently implemented with just a minor addition to the generic
|
||||
reclaim code, without the need for out-of-band data structures and
|
||||
reclaim passes. Because the generic reclaim code considers all
|
||||
cgroups except for the ones running low in the preferred first
|
||||
reclaim pass, overreclaim of individual groups is eliminated as
|
||||
well, resulting in much better overall workload performance.
|
||||
|
||||
- The original high boundary, the hard limit, is defined as a strict
|
||||
limit that can not budge, even if the OOM killer has to be called.
|
||||
But this generally goes against the goal of making the most out of
|
||||
the available memory. The memory consumption of workloads varies
|
||||
during runtime, and that requires users to overcommit. But doing
|
||||
that with a strict upper limit requires either a fairly accurate
|
||||
prediction of the working set size or adding slack to the limit.
|
||||
Since working set size estimation is hard and error prone, and
|
||||
getting it wrong results in OOM kills, most users tend to err on the
|
||||
side of a looser limit and end up wasting precious resources.
|
||||
|
||||
The memory.high boundary on the other hand can be set much more
|
||||
conservatively. When hit, it throttles allocations by forcing them
|
||||
into direct reclaim to work off the excess, but it never invokes the
|
||||
OOM killer. As a result, a high boundary that is chosen too
|
||||
aggressively will not terminate the processes, but instead it will
|
||||
lead to gradual performance degradation. The user can monitor this
|
||||
and make corrections until the minimal memory footprint that still
|
||||
gives acceptable performance is found.
|
||||
|
||||
In extreme cases, with many concurrent allocations and a complete
|
||||
breakdown of reclaim progress within the group, the high boundary
|
||||
can be exceeded. But even then it's mostly better to satisfy the
|
||||
allocation from the slack available in other groups or the rest of
|
||||
the system than killing the group. Otherwise, memory.max is there
|
||||
to limit this type of spillover and ultimately contain buggy or even
|
||||
malicious applications.
|
||||
|
||||
- The original control file names are unwieldy and inconsistent in
|
||||
many different ways. For example, the upper boundary hit count is
|
||||
exported in the memory.failcnt file, but an OOM event count has to
|
||||
be manually counted by listening to memory.oom_control events, and
|
||||
lower boundary / soft limit events have to be counted by first
|
||||
setting a threshold for that value and then counting those events.
|
||||
Also, usage and limit files encode their units in the filename.
|
||||
That makes the filenames very long, even though this is not
|
||||
information that a user needs to be reminded of every time they type
|
||||
out those names.
|
||||
|
||||
To address these naming issues, as well as to signal clearly that
|
||||
the new interface carries a new configuration model, the naming
|
||||
conventions in it necessarily differ from the old interface.
|
||||
|
||||
- The original limit files indicate the state of an unset limit with a
|
||||
Very High Number, and a configured limit can be unset by echoing -1
|
||||
into those files. But that very high number is implementation and
|
||||
architecture dependent and not very descriptive. And while -1 can
|
||||
be understood as an underflow into the highest possible value, -2 or
|
||||
-10M etc. do not work, so it's not consistent.
|
||||
|
||||
memory.low, memory.high, and memory.max will use the string "max" to
|
||||
indicate and set the highest possible value.
|
||||
|
||||
6. Planned Changes
|
||||
|
||||
6-1. CAP for resource control
|
||||
|
||||
Unified hierarchy will require one of the capabilities(7), which is
|
||||
yet to be decided, for all resource control related knobs. Process
|
||||
organization operations - creation of sub-cgroups and migration of
|
||||
processes in sub-hierarchies may be delegated by changing the
|
||||
ownership and/or permissions on the cgroup directory and
|
||||
"cgroup.procs" interface file; however, all operations which affect
|
||||
resource control - writes to a "cgroup.subtree_control" file or any
|
||||
controller-specific knobs - will require an explicit CAP privilege.
|
||||
|
||||
This, in part, is to prevent the cgroup interface from being
|
||||
inadvertently promoted to programmable API used by non-privileged
|
||||
binaries. cgroup exposes various aspects of the system in ways which
|
||||
aren't properly abstracted for direct consumption by regular programs.
|
||||
This is an administration interface much closer to sysctl knobs than
|
||||
system calls. Even the basic access model, being filesystem path
|
||||
based, isn't suitable for direct consumption. There's no way to
|
||||
access "my cgroup" in a race-free way or make multiple operations
|
||||
atomic against migration to another cgroup.
|
||||
|
||||
Another aspect is that, for better or for worse, the cgroup interface
|
||||
goes through far less scrutiny than regular interfaces for
|
||||
unprivileged userland. The upside is that cgroup is able to expose
|
||||
useful features which may not be suitable for general consumption in a
|
||||
reasonable time frame. It provides a relatively short path between
|
||||
internal details and userland-visible interface. Of course, this
|
||||
shortcut comes with high risk. We go through what we go through for
|
||||
general kernel APIs for good reasons. It may end up leaking internal
|
||||
details in a way which can exert significant pain by locking the
|
||||
kernel into a contract that can't be maintained in a reasonable
|
||||
manner.
|
||||
|
||||
Also, due to the specific nature, cgroup and its controllers don't
|
||||
tend to attract attention from a wide scope of developers. cgroup's
|
||||
short history is already fraught with severely mis-designed
|
||||
interfaces, unnecessary commitments to and exposing of internal
|
||||
details, broken and dangerous implementations of various features.
|
||||
|
||||
Keeping cgroup as an administration interface is both advantageous for
|
||||
its role and imperative given its nature. Some of the cgroup features
|
||||
may make sense for unprivileged access. If deemed justified, those
|
||||
must be further abstracted and implemented as a different interface,
|
||||
be it a system call or process-private filesystem, and survive through
|
||||
the scrutiny that any interface for general consumption is required to
|
||||
go through.
|
||||
|
||||
Requiring CAP is not a complete solution but should serve as a
|
||||
significant deterrent against spraying cgroup usages in non-privileged
|
||||
programs.
|
|
@ -34,17 +34,12 @@ struct seq_file;
|
|||
|
||||
/* define the enumeration of all cgroup subsystems */
|
||||
#define SUBSYS(_x) _x ## _cgrp_id,
|
||||
#define SUBSYS_TAG(_t) CGROUP_ ## _t, \
|
||||
__unused_tag_ ## _t = CGROUP_ ## _t - 1,
|
||||
enum cgroup_subsys_id {
|
||||
#include <linux/cgroup_subsys.h>
|
||||
CGROUP_SUBSYS_COUNT,
|
||||
};
|
||||
#undef SUBSYS_TAG
|
||||
#undef SUBSYS
|
||||
|
||||
#define CGROUP_CANFORK_COUNT (CGROUP_CANFORK_END - CGROUP_CANFORK_START)
|
||||
|
||||
/* bits in struct cgroup_subsys_state flags field */
|
||||
enum {
|
||||
CSS_NO_REF = (1 << 0), /* no reference counting for this css */
|
||||
|
@ -66,7 +61,6 @@ enum {
|
|||
|
||||
/* cgroup_root->flags */
|
||||
enum {
|
||||
CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), /* __DEVEL__sane_behavior specified */
|
||||
CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */
|
||||
CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */
|
||||
};
|
||||
|
@ -439,9 +433,9 @@ struct cgroup_subsys {
|
|||
int (*can_attach)(struct cgroup_taskset *tset);
|
||||
void (*cancel_attach)(struct cgroup_taskset *tset);
|
||||
void (*attach)(struct cgroup_taskset *tset);
|
||||
int (*can_fork)(struct task_struct *task, void **priv_p);
|
||||
void (*cancel_fork)(struct task_struct *task, void *priv);
|
||||
void (*fork)(struct task_struct *task, void *priv);
|
||||
int (*can_fork)(struct task_struct *task);
|
||||
void (*cancel_fork)(struct task_struct *task);
|
||||
void (*fork)(struct task_struct *task);
|
||||
void (*exit)(struct task_struct *task);
|
||||
void (*free)(struct task_struct *task);
|
||||
void (*bind)(struct cgroup_subsys_state *root_css);
|
||||
|
@ -527,7 +521,6 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
|
|||
|
||||
#else /* CONFIG_CGROUPS */
|
||||
|
||||
#define CGROUP_CANFORK_COUNT 0
|
||||
#define CGROUP_SUBSYS_COUNT 0
|
||||
|
||||
static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {}
|
||||
|
|
|
@ -97,12 +97,9 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
|
|||
struct pid *pid, struct task_struct *tsk);
|
||||
|
||||
void cgroup_fork(struct task_struct *p);
|
||||
extern int cgroup_can_fork(struct task_struct *p,
|
||||
void *ss_priv[CGROUP_CANFORK_COUNT]);
|
||||
extern void cgroup_cancel_fork(struct task_struct *p,
|
||||
void *ss_priv[CGROUP_CANFORK_COUNT]);
|
||||
extern void cgroup_post_fork(struct task_struct *p,
|
||||
void *old_ss_priv[CGROUP_CANFORK_COUNT]);
|
||||
extern int cgroup_can_fork(struct task_struct *p);
|
||||
extern void cgroup_cancel_fork(struct task_struct *p);
|
||||
extern void cgroup_post_fork(struct task_struct *p);
|
||||
void cgroup_exit(struct task_struct *p);
|
||||
void cgroup_free(struct task_struct *p);
|
||||
|
||||
|
@ -562,13 +559,9 @@ static inline int cgroupstats_build(struct cgroupstats *stats,
|
|||
struct dentry *dentry) { return -EINVAL; }
|
||||
|
||||
static inline void cgroup_fork(struct task_struct *p) {}
|
||||
static inline int cgroup_can_fork(struct task_struct *p,
|
||||
void *ss_priv[CGROUP_CANFORK_COUNT])
|
||||
{ return 0; }
|
||||
static inline void cgroup_cancel_fork(struct task_struct *p,
|
||||
void *ss_priv[CGROUP_CANFORK_COUNT]) {}
|
||||
static inline void cgroup_post_fork(struct task_struct *p,
|
||||
void *ss_priv[CGROUP_CANFORK_COUNT]) {}
|
||||
static inline int cgroup_can_fork(struct task_struct *p) { return 0; }
|
||||
static inline void cgroup_cancel_fork(struct task_struct *p) {}
|
||||
static inline void cgroup_post_fork(struct task_struct *p) {}
|
||||
static inline void cgroup_exit(struct task_struct *p) {}
|
||||
static inline void cgroup_free(struct task_struct *p) {}
|
||||
|
||||
|
|
|
@ -6,14 +6,8 @@
|
|||
|
||||
/*
|
||||
* This file *must* be included with SUBSYS() defined.
|
||||
* SUBSYS_TAG() is a noop if undefined.
|
||||
*/
|
||||
|
||||
#ifndef SUBSYS_TAG
|
||||
#define __TMP_SUBSYS_TAG
|
||||
#define SUBSYS_TAG(_x)
|
||||
#endif
|
||||
|
||||
#if IS_ENABLED(CONFIG_CPUSETS)
|
||||
SUBSYS(cpuset)
|
||||
#endif
|
||||
|
@ -58,17 +52,10 @@ SUBSYS(net_prio)
|
|||
SUBSYS(hugetlb)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Subsystems that implement the can_fork() family of callbacks.
|
||||
*/
|
||||
SUBSYS_TAG(CANFORK_START)
|
||||
|
||||
#if IS_ENABLED(CONFIG_CGROUP_PIDS)
|
||||
SUBSYS(pids)
|
||||
#endif
|
||||
|
||||
SUBSYS_TAG(CANFORK_END)
|
||||
|
||||
/*
|
||||
* The following subsystems are not supported on the default hierarchy.
|
||||
*/
|
||||
|
@ -76,11 +63,6 @@ SUBSYS_TAG(CANFORK_END)
|
|||
SUBSYS(debug)
|
||||
#endif
|
||||
|
||||
#ifdef __TMP_SUBSYS_TAG
|
||||
#undef __TMP_SUBSYS_TAG
|
||||
#undef SUBSYS_TAG
|
||||
#endif
|
||||
|
||||
/*
|
||||
* DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS.
|
||||
*/
|
||||
|
|
|
@ -54,6 +54,7 @@
|
|||
|
||||
#define SMB_SUPER_MAGIC 0x517B
|
||||
#define CGROUP_SUPER_MAGIC 0x27e0eb
|
||||
#define CGROUP2_SUPER_MAGIC 0x63677270
|
||||
|
||||
|
||||
#define STACK_END_MAGIC 0x57AC6E9D
|
||||
|
|
245
init/Kconfig
245
init/Kconfig
|
@ -940,95 +940,24 @@ menuconfig CGROUPS
|
|||
|
||||
if CGROUPS
|
||||
|
||||
config CGROUP_DEBUG
|
||||
bool "Example debug cgroup subsystem"
|
||||
default n
|
||||
help
|
||||
This option enables a simple cgroup subsystem that
|
||||
exports useful debugging information about the cgroups
|
||||
framework.
|
||||
|
||||
Say N if unsure.
|
||||
|
||||
config CGROUP_FREEZER
|
||||
bool "Freezer cgroup subsystem"
|
||||
help
|
||||
Provides a way to freeze and unfreeze all tasks in a
|
||||
cgroup.
|
||||
|
||||
config CGROUP_PIDS
|
||||
bool "PIDs cgroup subsystem"
|
||||
help
|
||||
Provides enforcement of process number limits in the scope of a
|
||||
cgroup. Any attempt to fork more processes than is allowed in the
|
||||
cgroup will fail. PIDs are fundamentally a global resource because it
|
||||
is fairly trivial to reach PID exhaustion before you reach even a
|
||||
conservative kmemcg limit. As a result, it is possible to grind a
|
||||
system to halt without being limited by other cgroup policies. The
|
||||
PIDs cgroup subsystem is designed to stop this from happening.
|
||||
|
||||
It should be noted that organisational operations (such as attaching
|
||||
to a cgroup hierarchy will *not* be blocked by the PIDs subsystem),
|
||||
since the PIDs limit only affects a process's ability to fork, not to
|
||||
attach to a cgroup.
|
||||
|
||||
config CGROUP_DEVICE
|
||||
bool "Device controller for cgroups"
|
||||
help
|
||||
Provides a cgroup implementing whitelists for devices which
|
||||
a process in the cgroup can mknod or open.
|
||||
|
||||
config CPUSETS
|
||||
bool "Cpuset support"
|
||||
help
|
||||
This option will let you create and manage CPUSETs which
|
||||
allow dynamically partitioning a system into sets of CPUs and
|
||||
Memory Nodes and assigning tasks to run only within those sets.
|
||||
This is primarily useful on large SMP or NUMA systems.
|
||||
|
||||
Say N if unsure.
|
||||
|
||||
config PROC_PID_CPUSET
|
||||
bool "Include legacy /proc/<pid>/cpuset file"
|
||||
depends on CPUSETS
|
||||
default y
|
||||
|
||||
config CGROUP_CPUACCT
|
||||
bool "Simple CPU accounting cgroup subsystem"
|
||||
help
|
||||
Provides a simple Resource Controller for monitoring the
|
||||
total CPU consumed by the tasks in a cgroup.
|
||||
|
||||
config PAGE_COUNTER
|
||||
bool
|
||||
|
||||
config MEMCG
|
||||
bool "Memory Resource Controller for Control Groups"
|
||||
bool "Memory controller"
|
||||
select PAGE_COUNTER
|
||||
select EVENTFD
|
||||
help
|
||||
Provides a memory resource controller that manages both anonymous
|
||||
memory and page cache. (See Documentation/cgroups/memory.txt)
|
||||
Provides control over the memory footprint of tasks in a cgroup.
|
||||
|
||||
config MEMCG_SWAP
|
||||
bool "Memory Resource Controller Swap Extension"
|
||||
bool "Swap controller"
|
||||
depends on MEMCG && SWAP
|
||||
help
|
||||
Add swap management feature to memory resource controller. When you
|
||||
enable this, you can limit mem+swap usage per cgroup. In other words,
|
||||
when you disable this, memory resource controller has no cares to
|
||||
usage of swap...a process can exhaust all of the swap. This extension
|
||||
is useful when you want to avoid exhaustion swap but this itself
|
||||
adds more overheads and consumes memory for remembering information.
|
||||
Especially if you use 32bit system or small memory system, please
|
||||
be careful about enabling this. When memory resource controller
|
||||
is disabled by boot option, this will be automatically disabled and
|
||||
there will be no overhead from this. Even when you set this config=y,
|
||||
if boot option "swapaccount=0" is set, swap will not be accounted.
|
||||
Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page
|
||||
size is 4096bytes, 512k per 1Gbytes of swap.
|
||||
Provides control over the swap space consumed by tasks in a cgroup.
|
||||
|
||||
config MEMCG_SWAP_ENABLED
|
||||
bool "Memory Resource Controller Swap Extension enabled by default"
|
||||
bool "Swap controller enabled by default"
|
||||
depends on MEMCG_SWAP
|
||||
default y
|
||||
help
|
||||
|
@ -1052,34 +981,43 @@ config MEMCG_KMEM
|
|||
the kmem extension can use it to guarantee that no group of processes
|
||||
will ever exhaust kernel resources alone.
|
||||
|
||||
config CGROUP_HUGETLB
|
||||
bool "HugeTLB Resource Controller for Control Groups"
|
||||
depends on HUGETLB_PAGE
|
||||
select PAGE_COUNTER
|
||||
config BLK_CGROUP
|
||||
bool "IO controller"
|
||||
depends on BLOCK
|
||||
default n
|
||||
help
|
||||
Provides a cgroup Resource Controller for HugeTLB pages.
|
||||
When you enable this, you can put a per cgroup limit on HugeTLB usage.
|
||||
The limit is enforced during page fault. Since HugeTLB doesn't
|
||||
support page reclaim, enforcing the limit at page fault time implies
|
||||
that, the application will get SIGBUS signal if it tries to access
|
||||
HugeTLB pages beyond its limit. This requires the application to know
|
||||
beforehand how much HugeTLB pages it would require for its use. The
|
||||
control group is tracked in the third page lru pointer. This means
|
||||
that we cannot use the controller with huge page less than 3 pages.
|
||||
---help---
|
||||
Generic block IO controller cgroup interface. This is the common
|
||||
cgroup interface which should be used by various IO controlling
|
||||
policies.
|
||||
|
||||
config CGROUP_PERF
|
||||
bool "Enable perf_event per-cpu per-container group (cgroup) monitoring"
|
||||
depends on PERF_EVENTS && CGROUPS
|
||||
help
|
||||
This option extends the per-cpu mode to restrict monitoring to
|
||||
threads which belong to the cgroup specified and run on the
|
||||
designated cpu.
|
||||
Currently, CFQ IO scheduler uses it to recognize task groups and
|
||||
control disk bandwidth allocation (proportional time slice allocation)
|
||||
to such task groups. It is also used by bio throttling logic in
|
||||
block layer to implement upper limit in IO rates on a device.
|
||||
|
||||
Say N if unsure.
|
||||
This option only enables generic Block IO controller infrastructure.
|
||||
One needs to also enable actual IO controlling logic/policy. For
|
||||
enabling proportional weight division of disk bandwidth in CFQ, set
|
||||
CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set
|
||||
CONFIG_BLK_DEV_THROTTLING=y.
|
||||
|
||||
See Documentation/cgroups/blkio-controller.txt for more information.
|
||||
|
||||
config DEBUG_BLK_CGROUP
|
||||
bool "IO controller debugging"
|
||||
depends on BLK_CGROUP
|
||||
default n
|
||||
---help---
|
||||
Enable some debugging help. Currently it exports additional stat
|
||||
files in a cgroup which can be useful for debugging.
|
||||
|
||||
config CGROUP_WRITEBACK
|
||||
bool
|
||||
depends on MEMCG && BLK_CGROUP
|
||||
default y
|
||||
|
||||
menuconfig CGROUP_SCHED
|
||||
bool "Group CPU scheduler"
|
||||
bool "CPU controller"
|
||||
default n
|
||||
help
|
||||
This feature lets CPU scheduler recognize task groups and control CPU
|
||||
|
@ -1116,41 +1054,90 @@ config RT_GROUP_SCHED
|
|||
|
||||
endif #CGROUP_SCHED
|
||||
|
||||
config BLK_CGROUP
|
||||
bool "Block IO controller"
|
||||
depends on BLOCK
|
||||
config CGROUP_PIDS
|
||||
bool "PIDs controller"
|
||||
help
|
||||
Provides enforcement of process number limits in the scope of a
|
||||
cgroup. Any attempt to fork more processes than is allowed in the
|
||||
cgroup will fail. PIDs are fundamentally a global resource because it
|
||||
is fairly trivial to reach PID exhaustion before you reach even a
|
||||
conservative kmemcg limit. As a result, it is possible to grind a
|
||||
system to halt without being limited by other cgroup policies. The
|
||||
PIDs cgroup subsystem is designed to stop this from happening.
|
||||
|
||||
It should be noted that organisational operations (such as attaching
|
||||
to a cgroup hierarchy will *not* be blocked by the PIDs subsystem),
|
||||
since the PIDs limit only affects a process's ability to fork, not to
|
||||
attach to a cgroup.
|
||||
|
||||
config CGROUP_FREEZER
|
||||
bool "Freezer controller"
|
||||
help
|
||||
Provides a way to freeze and unfreeze all tasks in a
|
||||
cgroup.
|
||||
|
||||
config CGROUP_HUGETLB
|
||||
bool "HugeTLB controller"
|
||||
depends on HUGETLB_PAGE
|
||||
select PAGE_COUNTER
|
||||
default n
|
||||
---help---
|
||||
Generic block IO controller cgroup interface. This is the common
|
||||
cgroup interface which should be used by various IO controlling
|
||||
policies.
|
||||
help
|
||||
Provides a cgroup controller for HugeTLB pages.
|
||||
When you enable this, you can put a per cgroup limit on HugeTLB usage.
|
||||
The limit is enforced during page fault. Since HugeTLB doesn't
|
||||
support page reclaim, enforcing the limit at page fault time implies
|
||||
that, the application will get SIGBUS signal if it tries to access
|
||||
HugeTLB pages beyond its limit. This requires the application to know
|
||||
beforehand how much HugeTLB pages it would require for its use. The
|
||||
control group is tracked in the third page lru pointer. This means
|
||||
that we cannot use the controller with huge page less than 3 pages.
|
||||
|
||||
Currently, CFQ IO scheduler uses it to recognize task groups and
|
||||
control disk bandwidth allocation (proportional time slice allocation)
|
||||
to such task groups. It is also used by bio throttling logic in
|
||||
block layer to implement upper limit in IO rates on a device.
|
||||
config CPUSETS
|
||||
bool "Cpuset controller"
|
||||
help
|
||||
This option will let you create and manage CPUSETs which
|
||||
allow dynamically partitioning a system into sets of CPUs and
|
||||
Memory Nodes and assigning tasks to run only within those sets.
|
||||
This is primarily useful on large SMP or NUMA systems.
|
||||
|
||||
This option only enables generic Block IO controller infrastructure.
|
||||
One needs to also enable actual IO controlling logic/policy. For
|
||||
enabling proportional weight division of disk bandwidth in CFQ, set
|
||||
CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set
|
||||
CONFIG_BLK_DEV_THROTTLING=y.
|
||||
Say N if unsure.
|
||||
|
||||
See Documentation/cgroups/blkio-controller.txt for more information.
|
||||
|
||||
config DEBUG_BLK_CGROUP
|
||||
bool "Enable Block IO controller debugging"
|
||||
depends on BLK_CGROUP
|
||||
default n
|
||||
---help---
|
||||
Enable some debugging help. Currently it exports additional stat
|
||||
files in a cgroup which can be useful for debugging.
|
||||
|
||||
config CGROUP_WRITEBACK
|
||||
bool
|
||||
depends on MEMCG && BLK_CGROUP
|
||||
config PROC_PID_CPUSET
|
||||
bool "Include legacy /proc/<pid>/cpuset file"
|
||||
depends on CPUSETS
|
||||
default y
|
||||
|
||||
config CGROUP_DEVICE
|
||||
bool "Device controller"
|
||||
help
|
||||
Provides a cgroup controller implementing whitelists for
|
||||
devices which a process in the cgroup can mknod or open.
|
||||
|
||||
config CGROUP_CPUACCT
|
||||
bool "Simple CPU accounting controller"
|
||||
help
|
||||
Provides a simple controller for monitoring the
|
||||
total CPU consumed by the tasks in a cgroup.
|
||||
|
||||
config CGROUP_PERF
|
||||
bool "Perf controller"
|
||||
depends on PERF_EVENTS
|
||||
help
|
||||
This option extends the perf per-cpu mode to restrict monitoring
|
||||
to threads which belong to the cgroup specified and run on the
|
||||
designated cpu.
|
||||
|
||||
Say N if unsure.
|
||||
|
||||
config CGROUP_DEBUG
|
||||
bool "Example controller"
|
||||
default n
|
||||
help
|
||||
This option enables a simple controller that exports
|
||||
debugging information about the cgroups framework.
|
||||
|
||||
Say N.
|
||||
|
||||
endif # CGROUPS
|
||||
|
||||
config CHECKPOINT_RESTORE
|
||||
|
|
|
@ -211,6 +211,7 @@ static unsigned long have_free_callback __read_mostly;
|
|||
/* Ditto for the can_fork callback. */
|
||||
static unsigned long have_canfork_callback __read_mostly;
|
||||
|
||||
static struct file_system_type cgroup2_fs_type;
|
||||
static struct cftype cgroup_dfl_base_files[];
|
||||
static struct cftype cgroup_legacy_base_files[];
|
||||
|
||||
|
@ -1623,10 +1624,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
|
|||
all_ss = true;
|
||||
continue;
|
||||
}
|
||||
if (!strcmp(token, "__DEVEL__sane_behavior")) {
|
||||
opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
|
||||
continue;
|
||||
}
|
||||
if (!strcmp(token, "noprefix")) {
|
||||
opts->flags |= CGRP_ROOT_NOPREFIX;
|
||||
continue;
|
||||
|
@ -1693,15 +1690,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
|
|||
return -ENOENT;
|
||||
}
|
||||
|
||||
if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
|
||||
pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
|
||||
if (nr_opts != 1) {
|
||||
pr_err("sane_behavior: no other mount options allowed\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the 'all' option was specified select all the subsystems,
|
||||
* otherwise if 'none', 'name=' and a subsystem name options were
|
||||
|
@ -1981,6 +1969,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
|
|||
int flags, const char *unused_dev_name,
|
||||
void *data)
|
||||
{
|
||||
bool is_v2 = fs_type == &cgroup2_fs_type;
|
||||
struct super_block *pinned_sb = NULL;
|
||||
struct cgroup_subsys *ss;
|
||||
struct cgroup_root *root;
|
||||
|
@ -1997,6 +1986,17 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
|
|||
if (!use_task_css_set_links)
|
||||
cgroup_enable_task_cg_lists();
|
||||
|
||||
if (is_v2) {
|
||||
if (data) {
|
||||
pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
cgrp_dfl_root_visible = true;
|
||||
root = &cgrp_dfl_root;
|
||||
cgroup_get(&root->cgrp);
|
||||
goto out_mount;
|
||||
}
|
||||
|
||||
mutex_lock(&cgroup_mutex);
|
||||
|
||||
/* First find the desired set of subsystems */
|
||||
|
@ -2004,15 +2004,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
|
|||
if (ret)
|
||||
goto out_unlock;
|
||||
|
||||
/* look for a matching existing root */
|
||||
if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
|
||||
cgrp_dfl_root_visible = true;
|
||||
root = &cgrp_dfl_root;
|
||||
cgroup_get(&root->cgrp);
|
||||
ret = 0;
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
/*
|
||||
* Destruction of cgroup root is asynchronous, so subsystems may
|
||||
* still be dying after the previous unmount. Let's drain the
|
||||
|
@ -2123,9 +2114,10 @@ out_free:
|
|||
|
||||
if (ret)
|
||||
return ERR_PTR(ret);
|
||||
|
||||
out_mount:
|
||||
dentry = kernfs_mount(fs_type, flags, root->kf_root,
|
||||
CGROUP_SUPER_MAGIC, &new_sb);
|
||||
is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
|
||||
&new_sb);
|
||||
if (IS_ERR(dentry) || !new_sb)
|
||||
cgroup_put(&root->cgrp);
|
||||
|
||||
|
@ -2168,6 +2160,12 @@ static struct file_system_type cgroup_fs_type = {
|
|||
.kill_sb = cgroup_kill_sb,
|
||||
};
|
||||
|
||||
static struct file_system_type cgroup2_fs_type = {
|
||||
.name = "cgroup2",
|
||||
.mount = cgroup_mount,
|
||||
.kill_sb = cgroup_kill_sb,
|
||||
};
|
||||
|
||||
/**
|
||||
* task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
|
||||
* @task: target task
|
||||
|
@ -4039,7 +4037,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
|
|||
goto out_err;
|
||||
|
||||
/*
|
||||
* Migrate tasks one-by-one until @form is empty. This fails iff
|
||||
* Migrate tasks one-by-one until @from is empty. This fails iff
|
||||
* ->can_attach() fails.
|
||||
*/
|
||||
do {
|
||||
|
@ -5171,7 +5169,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
|
|||
{
|
||||
struct cgroup_subsys_state *css;
|
||||
|
||||
printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
|
||||
pr_debug("Initializing cgroup subsys %s\n", ss->name);
|
||||
|
||||
mutex_lock(&cgroup_mutex);
|
||||
|
||||
|
@ -5329,6 +5327,7 @@ int __init cgroup_init(void)
|
|||
|
||||
WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
|
||||
WARN_ON(register_filesystem(&cgroup_fs_type));
|
||||
WARN_ON(register_filesystem(&cgroup2_fs_type));
|
||||
WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
|
||||
|
||||
return 0;
|
||||
|
@ -5472,19 +5471,6 @@ static const struct file_operations proc_cgroupstats_operations = {
|
|||
.release = single_release,
|
||||
};
|
||||
|
||||
static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
|
||||
{
|
||||
if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END)
|
||||
return &ss_priv[i - CGROUP_CANFORK_START];
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
|
||||
{
|
||||
void **private = subsys_canfork_priv_p(ss_priv, i);
|
||||
return private ? *private : NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* cgroup_fork - initialize cgroup related fields during copy_process()
|
||||
* @child: pointer to task_struct of forking parent process.
|
||||
|
@ -5507,14 +5493,13 @@ void cgroup_fork(struct task_struct *child)
|
|||
* returns an error, the fork aborts with that error code. This allows for
|
||||
* a cgroup subsystem to conditionally allow or deny new forks.
|
||||
*/
|
||||
int cgroup_can_fork(struct task_struct *child,
|
||||
void *ss_priv[CGROUP_CANFORK_COUNT])
|
||||
int cgroup_can_fork(struct task_struct *child)
|
||||
{
|
||||
struct cgroup_subsys *ss;
|
||||
int i, j, ret;
|
||||
|
||||
for_each_subsys_which(ss, i, &have_canfork_callback) {
|
||||
ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i));
|
||||
ret = ss->can_fork(child);
|
||||
if (ret)
|
||||
goto out_revert;
|
||||
}
|
||||
|
@ -5526,7 +5511,7 @@ out_revert:
|
|||
if (j >= i)
|
||||
break;
|
||||
if (ss->cancel_fork)
|
||||
ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j));
|
||||
ss->cancel_fork(child);
|
||||
}
|
||||
|
||||
return ret;
|
||||
|
@ -5539,15 +5524,14 @@ out_revert:
|
|||
* This calls the cancel_fork() callbacks if a fork failed *after*
|
||||
* cgroup_can_fork() succeded.
|
||||
*/
|
||||
void cgroup_cancel_fork(struct task_struct *child,
|
||||
void *ss_priv[CGROUP_CANFORK_COUNT])
|
||||
void cgroup_cancel_fork(struct task_struct *child)
|
||||
{
|
||||
struct cgroup_subsys *ss;
|
||||
int i;
|
||||
|
||||
for_each_subsys(ss, i)
|
||||
if (ss->cancel_fork)
|
||||
ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i));
|
||||
ss->cancel_fork(child);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -5560,8 +5544,7 @@ void cgroup_cancel_fork(struct task_struct *child,
|
|||
* cgroup_task_iter_start() - to guarantee that the new task ends up on its
|
||||
* list.
|
||||
*/
|
||||
void cgroup_post_fork(struct task_struct *child,
|
||||
void *old_ss_priv[CGROUP_CANFORK_COUNT])
|
||||
void cgroup_post_fork(struct task_struct *child)
|
||||
{
|
||||
struct cgroup_subsys *ss;
|
||||
int i;
|
||||
|
@ -5605,7 +5588,7 @@ void cgroup_post_fork(struct task_struct *child,
|
|||
* and addition to css_set.
|
||||
*/
|
||||
for_each_subsys_which(ss, i, &have_fork_callback)
|
||||
ss->fork(child, subsys_canfork_priv(old_ss_priv, i));
|
||||
ss->fork(child);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -200,7 +200,7 @@ static void freezer_attach(struct cgroup_taskset *tset)
|
|||
* to do anything as freezer_attach() will put @task into the appropriate
|
||||
* state.
|
||||
*/
|
||||
static void freezer_fork(struct task_struct *task, void *private)
|
||||
static void freezer_fork(struct task_struct *task)
|
||||
{
|
||||
struct freezer *freezer;
|
||||
|
||||
|
|
|
@ -134,7 +134,7 @@ static void pids_charge(struct pids_cgroup *pids, int num)
|
|||
*
|
||||
* This function follows the set limit. It will fail if the charge would cause
|
||||
* the new value to exceed the hierarchical limit. Returns 0 if the charge
|
||||
* succeded, otherwise -EAGAIN.
|
||||
* succeeded, otherwise -EAGAIN.
|
||||
*/
|
||||
static int pids_try_charge(struct pids_cgroup *pids, int num)
|
||||
{
|
||||
|
@ -209,7 +209,7 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
|
|||
* task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
|
||||
* on threadgroup_change_begin() held by the copy_process().
|
||||
*/
|
||||
static int pids_can_fork(struct task_struct *task, void **priv_p)
|
||||
static int pids_can_fork(struct task_struct *task)
|
||||
{
|
||||
struct cgroup_subsys_state *css;
|
||||
struct pids_cgroup *pids;
|
||||
|
@ -219,7 +219,7 @@ static int pids_can_fork(struct task_struct *task, void **priv_p)
|
|||
return pids_try_charge(pids, 1);
|
||||
}
|
||||
|
||||
static void pids_cancel_fork(struct task_struct *task, void *priv)
|
||||
static void pids_cancel_fork(struct task_struct *task)
|
||||
{
|
||||
struct cgroup_subsys_state *css;
|
||||
struct pids_cgroup *pids;
|
||||
|
|
|
@ -51,6 +51,7 @@
|
|||
#include <linux/stat.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/time.h>
|
||||
#include <linux/time64.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/sort.h>
|
||||
|
||||
|
@ -68,7 +69,7 @@ struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
|
|||
struct fmeter {
|
||||
int cnt; /* unprocessed events count */
|
||||
int val; /* most recent output value */
|
||||
time_t time; /* clock (secs) when val computed */
|
||||
time64_t time; /* clock (secs) when val computed */
|
||||
spinlock_t lock; /* guards read or write of above */
|
||||
};
|
||||
|
||||
|
@ -1374,7 +1375,7 @@ out:
|
|||
*/
|
||||
|
||||
#define FM_COEF 933 /* coefficient for half-life of 10 secs */
|
||||
#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
|
||||
#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */
|
||||
#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
|
||||
#define FM_SCALE 1000 /* faux fixed point scale */
|
||||
|
||||
|
@ -1390,8 +1391,11 @@ static void fmeter_init(struct fmeter *fmp)
|
|||
/* Internal meter update - process cnt events and update value */
|
||||
static void fmeter_update(struct fmeter *fmp)
|
||||
{
|
||||
time_t now = get_seconds();
|
||||
time_t ticks = now - fmp->time;
|
||||
time64_t now;
|
||||
u32 ticks;
|
||||
|
||||
now = ktime_get_seconds();
|
||||
ticks = now - fmp->time;
|
||||
|
||||
if (ticks == 0)
|
||||
return;
|
||||
|
|
|
@ -1250,7 +1250,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
|
|||
{
|
||||
int retval;
|
||||
struct task_struct *p;
|
||||
void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
|
||||
|
||||
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
@ -1527,7 +1526,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
|
|||
* between here and cgroup_post_fork() if an organisation operation is in
|
||||
* progress.
|
||||
*/
|
||||
retval = cgroup_can_fork(p, cgrp_ss_priv);
|
||||
retval = cgroup_can_fork(p);
|
||||
if (retval)
|
||||
goto bad_fork_free_pid;
|
||||
|
||||
|
@ -1609,7 +1608,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
|
|||
write_unlock_irq(&tasklist_lock);
|
||||
|
||||
proc_fork_connector(p);
|
||||
cgroup_post_fork(p, cgrp_ss_priv);
|
||||
cgroup_post_fork(p);
|
||||
threadgroup_change_end(current);
|
||||
perf_event_fork(p);
|
||||
|
||||
|
@ -1619,7 +1618,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
|
|||
return p;
|
||||
|
||||
bad_fork_cancel_cgroup:
|
||||
cgroup_cancel_fork(p, cgrp_ss_priv);
|
||||
cgroup_cancel_fork(p);
|
||||
bad_fork_free_pid:
|
||||
if (pid != &init_struct_pid)
|
||||
free_pid(pid);
|
||||
|
|
|
@ -8342,7 +8342,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
|
|||
sched_offline_group(tg);
|
||||
}
|
||||
|
||||
static void cpu_cgroup_fork(struct task_struct *task, void *private)
|
||||
static void cpu_cgroup_fork(struct task_struct *task)
|
||||
{
|
||||
sched_move_task(task);
|
||||
}
|
||||
|
|
|
@ -4813,7 +4813,7 @@ static void mem_cgroup_clear_mc(void)
|
|||
static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
|
||||
{
|
||||
struct cgroup_subsys_state *css;
|
||||
struct mem_cgroup *memcg;
|
||||
struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
|
||||
struct mem_cgroup *from;
|
||||
struct task_struct *leader, *p;
|
||||
struct mm_struct *mm;
|
||||
|
|
Loading…
Reference in a new issue