Merge branch 'cgroup/for-4.11-rdmacg' into cgroup/for-4.11
Merge in to resolve conflicts in Documentation/cgroup-v2.txt. The conflicts are from multiple section additions and trivial to resolve. Signed-off-by: Tejun Heo <tj@kernel.org>
This commit is contained in:
commit
63f1ca5945
14 changed files with 1071 additions and 10 deletions
109
Documentation/cgroup-v1/rdma.txt
Normal file
109
Documentation/cgroup-v1/rdma.txt
Normal file
|
@ -0,0 +1,109 @@
|
|||
RDMA Controller
|
||||
----------------
|
||||
|
||||
Contents
|
||||
--------
|
||||
|
||||
1. Overview
|
||||
1-1. What is RDMA controller?
|
||||
1-2. Why RDMA controller needed?
|
||||
1-3. How is RDMA controller implemented?
|
||||
2. Usage Examples
|
||||
|
||||
1. Overview
|
||||
|
||||
1-1. What is RDMA controller?
|
||||
-----------------------------
|
||||
|
||||
RDMA controller allows user to limit RDMA/IB specific resources that a given
|
||||
set of processes can use. These processes are grouped using RDMA controller.
|
||||
|
||||
RDMA controller defines two resources which can be limited for processes of a
|
||||
cgroup.
|
||||
|
||||
1-2. Why RDMA controller needed?
|
||||
--------------------------------
|
||||
|
||||
Currently user space applications can easily take away all the rdma verb
|
||||
specific resources such as AH, CQ, QP, MR etc. Due to which other applications
|
||||
in other cgroup or kernel space ULPs may not even get chance to allocate any
|
||||
rdma resources. This can lead to service unavailability.
|
||||
|
||||
Therefore RDMA controller is needed through which resource consumption
|
||||
of processes can be limited. Through this controller different rdma
|
||||
resources can be accounted.
|
||||
|
||||
1-3. How is RDMA controller implemented?
|
||||
----------------------------------------
|
||||
|
||||
RDMA cgroup allows limit configuration of resources. Rdma cgroup maintains
|
||||
resource accounting per cgroup, per device using resource pool structure.
|
||||
Each such resource pool is limited up to 64 resources in given resource pool
|
||||
by rdma cgroup, which can be extended later if required.
|
||||
|
||||
This resource pool object is linked to the cgroup css. Typically there
|
||||
are 0 to 4 resource pool instances per cgroup, per device in most use cases.
|
||||
But nothing limits to have it more. At present hundreds of RDMA devices per
|
||||
single cgroup may not be handled optimally, however there is no
|
||||
known use case or requirement for such configuration either.
|
||||
|
||||
Since RDMA resources can be allocated from any process and can be freed by any
|
||||
of the child processes which shares the address space, rdma resources are
|
||||
always owned by the creator cgroup css. This allows process migration from one
|
||||
to other cgroup without major complexity of transferring resource ownership;
|
||||
because such ownership is not really present due to shared nature of
|
||||
rdma resources. Linking resources around css also ensures that cgroups can be
|
||||
deleted after processes migrated. This allows process migration as well with
|
||||
active resources, even though that is not a primary use case.
|
||||
|
||||
Whenever RDMA resource charging occurs, owner rdma cgroup is returned to
|
||||
the caller. Same rdma cgroup should be passed while uncharging the resource.
|
||||
This also allows process migrated with active RDMA resource to charge
|
||||
to new owner cgroup for new resource. It also allows to uncharge resource of
|
||||
a process from previously charged cgroup which is migrated to new cgroup,
|
||||
even though that is not a primary use case.
|
||||
|
||||
Resource pool object is created in following situations.
|
||||
(a) User sets the limit and no previous resource pool exist for the device
|
||||
of interest for the cgroup.
|
||||
(b) No resource limits were configured, but IB/RDMA stack tries to
|
||||
charge the resource. This is so that it can correctly uncharge them when applications are
|
||||
running without limits and later on when limits are enforced during uncharging,
|
||||
otherwise usage count will drop to negative.
|
||||
|
||||
Resource pool is destroyed if all the resource limits are set to max and
|
||||
it is the last resource getting deallocated.
|
||||
|
||||
User should set all the limits to max value if it intends to remove/unconfigure
|
||||
the resource pool for a particular device.
|
||||
|
||||
IB stack honors limits enforced by the rdma controller. When application
|
||||
queries the maximum resource limits of an IB device, it returns the minimum of
|
||||
what is configured by user for a given cgroup and what is supported by
|
||||
IB device.
|
||||
|
||||
Following resources can be accounted by rdma controller.
|
||||
hca_handle Maximum number of HCA Handles
|
||||
hca_object Maximum number of HCA Objects
|
||||
|
||||
2. Usage Examples
|
||||
-----------------
|
||||
|
||||
(a) Configure resource limit:
|
||||
echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max
|
||||
echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max
|
||||
|
||||
(b) Query resource limit:
|
||||
cat /sys/fs/cgroup/rdma/2/rdma.max
|
||||
#Output:
|
||||
mlx4_0 hca_handle=2 hca_object=2000
|
||||
ocrdma1 hca_handle=3 hca_object=max
|
||||
|
||||
(c) Query current usage:
|
||||
cat /sys/fs/cgroup/rdma/2/rdma.current
|
||||
#Output:
|
||||
mlx4_0 hca_handle=1 hca_object=20
|
||||
ocrdma1 hca_handle=1 hca_object=23
|
||||
|
||||
(d) Delete resource limit:
|
||||
echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max
|
|
@ -49,8 +49,10 @@ CONTENTS
|
|||
5-3-2. Writeback
|
||||
5-4. PID
|
||||
5-4-1. PID Interface Files
|
||||
5-5. Misc
|
||||
5-5-1. perf_event
|
||||
5-5. RDMA
|
||||
5-5-1. RDMA Interface Files
|
||||
5-6. Misc
|
||||
5-6-1. perf_event
|
||||
6. Namespace
|
||||
6-1. Basics
|
||||
6-2. The Root and Views
|
||||
|
@ -1160,9 +1162,45 @@ through fork() or clone(). These will return -EAGAIN if the creation
|
|||
of a new process would cause a cgroup policy to be violated.
|
||||
|
||||
|
||||
5-5. Misc
|
||||
5-5. RDMA
|
||||
|
||||
5-5-1. perf_event
|
||||
The "rdma" controller regulates the distribution and accounting of
|
||||
RDMA resources.
|
||||
|
||||
5-5-1. RDMA Interface Files
|
||||
|
||||
rdma.max
|
||||
A readwrite nested-keyed file that exists for all the cgroups
|
||||
except root that describes current configured resource limit
|
||||
for a RDMA/IB device.
|
||||
|
||||
Lines are keyed by device name and are not ordered.
|
||||
Each line contains space separated resource name and its configured
|
||||
limit that can be distributed.
|
||||
|
||||
The following nested keys are defined.
|
||||
|
||||
hca_handle Maximum number of HCA Handles
|
||||
hca_object Maximum number of HCA Objects
|
||||
|
||||
An example for mlx4 and ocrdma device follows.
|
||||
|
||||
mlx4_0 hca_handle=2 hca_object=2000
|
||||
ocrdma1 hca_handle=3 hca_object=max
|
||||
|
||||
rdma.current
|
||||
A read-only file that describes current resource usage.
|
||||
It exists for all the cgroups except root.
|
||||
|
||||
An example for mlx4 and ocrdma device follows.
|
||||
|
||||
mlx4_0 hca_handle=1 hca_object=20
|
||||
ocrdma1 hca_handle=1 hca_object=23
|
||||
|
||||
|
||||
5-6. Misc
|
||||
|
||||
5-6-1. perf_event
|
||||
|
||||
perf_event controller, if not mounted on a legacy hierarchy, is
|
||||
automatically enabled on the v2 hierarchy so that perf events can
|
||||
|
|
|
@ -13,6 +13,7 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \
|
|||
multicast.o mad.o smi.o agent.o mad_rmpp.o
|
||||
ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
|
||||
ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
|
||||
ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o
|
||||
|
||||
ib_cm-y := cm.o
|
||||
|
||||
|
|
62
drivers/infiniband/core/cgroup.c
Normal file
62
drivers/infiniband/core/cgroup.c
Normal file
|
@ -0,0 +1,62 @@
|
|||
/*
|
||||
* Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms and conditions of the GNU General Public License,
|
||||
* version 2, as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
||||
* more details.
|
||||
*/
|
||||
|
||||
#include "core_priv.h"
|
||||
|
||||
/**
|
||||
* ib_device_register_rdmacg - register with rdma cgroup.
|
||||
* @device: device to register to participate in resource
|
||||
* accounting by rdma cgroup.
|
||||
*
|
||||
* Register with the rdma cgroup. Should be called before
|
||||
* exposing rdma device to user space applications to avoid
|
||||
* resource accounting leak.
|
||||
* Returns 0 on success or otherwise failure code.
|
||||
*/
|
||||
int ib_device_register_rdmacg(struct ib_device *device)
|
||||
{
|
||||
device->cg_device.name = device->name;
|
||||
return rdmacg_register_device(&device->cg_device);
|
||||
}
|
||||
|
||||
/**
|
||||
* ib_device_unregister_rdmacg - unregister with rdma cgroup.
|
||||
* @device: device to unregister.
|
||||
*
|
||||
* Unregister with the rdma cgroup. Should be called after
|
||||
* all the resources are deallocated, and after a stage when any
|
||||
* other resource allocation by user application cannot be done
|
||||
* for this device to avoid any leak in accounting.
|
||||
*/
|
||||
void ib_device_unregister_rdmacg(struct ib_device *device)
|
||||
{
|
||||
rdmacg_unregister_device(&device->cg_device);
|
||||
}
|
||||
|
||||
int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
|
||||
struct ib_device *device,
|
||||
enum rdmacg_resource_type resource_index)
|
||||
{
|
||||
return rdmacg_try_charge(&cg_obj->cg, &device->cg_device,
|
||||
resource_index);
|
||||
}
|
||||
EXPORT_SYMBOL(ib_rdmacg_try_charge);
|
||||
|
||||
void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
|
||||
struct ib_device *device,
|
||||
enum rdmacg_resource_type resource_index)
|
||||
{
|
||||
rdmacg_uncharge(cg_obj->cg, &device->cg_device,
|
||||
resource_index);
|
||||
}
|
||||
EXPORT_SYMBOL(ib_rdmacg_uncharge);
|
|
@ -35,6 +35,7 @@
|
|||
|
||||
#include <linux/list.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/cgroup_rdma.h>
|
||||
|
||||
#include <rdma/ib_verbs.h>
|
||||
|
||||
|
@ -121,6 +122,35 @@ int ib_cache_setup_one(struct ib_device *device);
|
|||
void ib_cache_cleanup_one(struct ib_device *device);
|
||||
void ib_cache_release_one(struct ib_device *device);
|
||||
|
||||
/*
 * rdma cgroup glue for the IB core.
 *
 * With CONFIG_CGROUP_RDMA set, the four helpers below are real functions
 * (see drivers/infiniband/core/cgroup.c). Without it they collapse to
 * inline stubs: registration and charging always report success (0) and
 * unregistration/uncharging do nothing, so callers need no #ifdefs.
 */
#ifdef CONFIG_CGROUP_RDMA
|
||||
int ib_device_register_rdmacg(struct ib_device *device);
|
||||
void ib_device_unregister_rdmacg(struct ib_device *device);
|
||||
|
||||
int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
|
||||
struct ib_device *device,
|
||||
enum rdmacg_resource_type resource_index);
|
||||
|
||||
void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
|
||||
struct ib_device *device,
|
||||
enum rdmacg_resource_type resource_index);
|
||||
#else
|
||||
/* Stub: nothing to register, always succeeds. */
static inline int ib_device_register_rdmacg(struct ib_device *device)
|
||||
{ return 0; }
|
||||
|
||||
/* Stub: nothing to unregister. */
static inline void ib_device_unregister_rdmacg(struct ib_device *device)
|
||||
{ }
|
||||
|
||||
/* Stub: no accounting, so every charge is accepted. */
static inline int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj,
|
||||
struct ib_device *device,
|
||||
enum rdmacg_resource_type resource_index)
|
||||
{ return 0; }
|
||||
|
||||
/* Stub: no accounting, so there is nothing to give back. */
static inline void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj,
|
||||
struct ib_device *device,
|
||||
enum rdmacg_resource_type resource_index)
|
||||
{ }
|
||||
#endif
|
||||
|
||||
static inline bool rdma_is_upper_dev_rcu(struct net_device *dev,
|
||||
struct net_device *upper)
|
||||
{
|
||||
|
|
|
@ -360,10 +360,18 @@ int ib_register_device(struct ib_device *device,
|
|||
goto out;
|
||||
}
|
||||
|
||||
ret = ib_device_register_rdmacg(device);
|
||||
if (ret) {
|
||||
pr_warn("Couldn't register device with rdma cgroup\n");
|
||||
ib_cache_cleanup_one(device);
|
||||
goto out;
|
||||
}
|
||||
|
||||
memset(&device->attrs, 0, sizeof(device->attrs));
|
||||
ret = device->query_device(device, &device->attrs, &uhw);
|
||||
if (ret) {
|
||||
pr_warn("Couldn't query the device attributes\n");
|
||||
ib_device_unregister_rdmacg(device);
|
||||
ib_cache_cleanup_one(device);
|
||||
goto out;
|
||||
}
|
||||
|
@ -372,6 +380,7 @@ int ib_register_device(struct ib_device *device,
|
|||
if (ret) {
|
||||
pr_warn("Couldn't register device %s with driver model\n",
|
||||
device->name);
|
||||
ib_device_unregister_rdmacg(device);
|
||||
ib_cache_cleanup_one(device);
|
||||
goto out;
|
||||
}
|
||||
|
@ -421,6 +430,7 @@ void ib_unregister_device(struct ib_device *device)
|
|||
|
||||
mutex_unlock(&device_mutex);
|
||||
|
||||
ib_device_unregister_rdmacg(device);
|
||||
ib_device_unregister_sysfs(device);
|
||||
ib_cache_cleanup_one(device);
|
||||
|
||||
|
|
|
@ -316,6 +316,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
|
|||
struct ib_udata udata;
|
||||
struct ib_ucontext *ucontext;
|
||||
struct file *filp;
|
||||
struct ib_rdmacg_object cg_obj;
|
||||
int ret;
|
||||
|
||||
if (out_len < sizeof resp)
|
||||
|
@ -335,13 +336,18 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
|
|||
(unsigned long) cmd.response + sizeof resp,
|
||||
in_len - sizeof cmd, out_len - sizeof resp);
|
||||
|
||||
ret = ib_rdmacg_try_charge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
ucontext = ib_dev->alloc_ucontext(ib_dev, &udata);
|
||||
if (IS_ERR(ucontext)) {
|
||||
ret = PTR_ERR(ucontext);
|
||||
goto err;
|
||||
goto err_alloc;
|
||||
}
|
||||
|
||||
ucontext->device = ib_dev;
|
||||
ucontext->cg_obj = cg_obj;
|
||||
INIT_LIST_HEAD(&ucontext->pd_list);
|
||||
INIT_LIST_HEAD(&ucontext->mr_list);
|
||||
INIT_LIST_HEAD(&ucontext->mw_list);
|
||||
|
@ -407,6 +413,9 @@ err_free:
|
|||
put_pid(ucontext->tgid);
|
||||
ib_dev->dealloc_ucontext(ucontext);
|
||||
|
||||
err_alloc:
|
||||
ib_rdmacg_uncharge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE);
|
||||
|
||||
err:
|
||||
mutex_unlock(&file->mutex);
|
||||
return ret;
|
||||
|
@ -561,6 +570,13 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
|
|||
return -ENOMEM;
|
||||
|
||||
init_uobj(uobj, 0, file->ucontext, &pd_lock_class);
|
||||
ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
|
||||
RDMACG_RESOURCE_HCA_OBJECT);
|
||||
if (ret) {
|
||||
kfree(uobj);
|
||||
return ret;
|
||||
}
|
||||
|
||||
down_write(&uobj->mutex);
|
||||
|
||||
pd = ib_dev->alloc_pd(ib_dev, file->ucontext, &udata);
|
||||
|
@ -605,6 +621,7 @@ err_idr:
|
|||
ib_dealloc_pd(pd);
|
||||
|
||||
err:
|
||||
ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
|
||||
put_uobj_write(uobj);
|
||||
return ret;
|
||||
}
|
||||
|
@ -637,6 +654,8 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
|
|||
if (ret)
|
||||
goto err_put;
|
||||
|
||||
ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
|
||||
|
||||
uobj->live = 0;
|
||||
put_uobj_write(uobj);
|
||||
|
||||
|
@ -1006,6 +1025,10 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
|
|||
goto err_put;
|
||||
}
|
||||
}
|
||||
ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
|
||||
RDMACG_RESOURCE_HCA_OBJECT);
|
||||
if (ret)
|
||||
goto err_charge;
|
||||
|
||||
mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
|
||||
cmd.access_flags, &udata);
|
||||
|
@ -1054,6 +1077,9 @@ err_unreg:
|
|||
ib_dereg_mr(mr);
|
||||
|
||||
err_put:
|
||||
ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
|
||||
|
||||
err_charge:
|
||||
put_pd_read(pd);
|
||||
|
||||
err_free:
|
||||
|
@ -1178,6 +1204,8 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
|
|||
if (ret)
|
||||
return ret;
|
||||
|
||||
ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
|
||||
|
||||
idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
|
||||
|
||||
mutex_lock(&file->mutex);
|
||||
|
@ -1226,6 +1254,11 @@ ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file,
|
|||
in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr),
|
||||
out_len - sizeof(resp));
|
||||
|
||||
ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
|
||||
RDMACG_RESOURCE_HCA_OBJECT);
|
||||
if (ret)
|
||||
goto err_charge;
|
||||
|
||||
mw = pd->device->alloc_mw(pd, cmd.mw_type, &udata);
|
||||
if (IS_ERR(mw)) {
|
||||
ret = PTR_ERR(mw);
|
||||
|
@ -1271,6 +1304,9 @@ err_unalloc:
|
|||
uverbs_dealloc_mw(mw);
|
||||
|
||||
err_put:
|
||||
ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
|
||||
|
||||
err_charge:
|
||||
put_pd_read(pd);
|
||||
|
||||
err_free:
|
||||
|
@ -1306,6 +1342,8 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
|
|||
if (ret)
|
||||
return ret;
|
||||
|
||||
ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
|
||||
|
||||
idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
|
||||
|
||||
mutex_lock(&file->mutex);
|
||||
|
@ -1405,6 +1443,11 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
|
|||
if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags))
|
||||
attr.flags = cmd->flags;
|
||||
|
||||
ret = ib_rdmacg_try_charge(&obj->uobject.cg_obj, ib_dev,
|
||||
RDMACG_RESOURCE_HCA_OBJECT);
|
||||
if (ret)
|
||||
goto err_charge;
|
||||
|
||||
cq = ib_dev->create_cq(ib_dev, &attr,
|
||||
file->ucontext, uhw);
|
||||
if (IS_ERR(cq)) {
|
||||
|
@ -1452,6 +1495,10 @@ err_free:
|
|||
ib_destroy_cq(cq);
|
||||
|
||||
err_file:
|
||||
ib_rdmacg_uncharge(&obj->uobject.cg_obj, ib_dev,
|
||||
RDMACG_RESOURCE_HCA_OBJECT);
|
||||
|
||||
err_charge:
|
||||
if (ev_file)
|
||||
ib_uverbs_release_ucq(file, ev_file, obj);
|
||||
|
||||
|
@ -1732,6 +1779,8 @@ ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
|
|||
if (ret)
|
||||
return ret;
|
||||
|
||||
ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
|
||||
|
||||
idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
|
||||
|
||||
mutex_lock(&file->mutex);
|
||||
|
@ -1904,6 +1953,11 @@ static int create_qp(struct ib_uverbs_file *file,
|
|||
goto err_put;
|
||||
}
|
||||
|
||||
ret = ib_rdmacg_try_charge(&obj->uevent.uobject.cg_obj, device,
|
||||
RDMACG_RESOURCE_HCA_OBJECT);
|
||||
if (ret)
|
||||
goto err_put;
|
||||
|
||||
if (cmd->qp_type == IB_QPT_XRC_TGT)
|
||||
qp = ib_create_qp(pd, &attr);
|
||||
else
|
||||
|
@ -1911,7 +1965,7 @@ static int create_qp(struct ib_uverbs_file *file,
|
|||
|
||||
if (IS_ERR(qp)) {
|
||||
ret = PTR_ERR(qp);
|
||||
goto err_put;
|
||||
goto err_create;
|
||||
}
|
||||
|
||||
if (cmd->qp_type != IB_QPT_XRC_TGT) {
|
||||
|
@ -1992,6 +2046,10 @@ err_cb:
|
|||
err_destroy:
|
||||
ib_destroy_qp(qp);
|
||||
|
||||
err_create:
|
||||
ib_rdmacg_uncharge(&obj->uevent.uobject.cg_obj, device,
|
||||
RDMACG_RESOURCE_HCA_OBJECT);
|
||||
|
||||
err_put:
|
||||
if (xrcd)
|
||||
put_xrcd_read(xrcd_uobj);
|
||||
|
@ -2518,6 +2576,8 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
|
|||
if (ret)
|
||||
return ret;
|
||||
|
||||
ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
|
||||
|
||||
if (obj->uxrcd)
|
||||
atomic_dec(&obj->uxrcd->refcnt);
|
||||
|
||||
|
@ -2969,11 +3029,16 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
|
|||
memset(&attr.dmac, 0, sizeof(attr.dmac));
|
||||
memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16);
|
||||
|
||||
ret = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
|
||||
RDMACG_RESOURCE_HCA_OBJECT);
|
||||
if (ret)
|
||||
goto err_charge;
|
||||
|
||||
ah = pd->device->create_ah(pd, &attr, &udata);
|
||||
|
||||
if (IS_ERR(ah)) {
|
||||
ret = PTR_ERR(ah);
|
||||
goto err_put;
|
||||
goto err_create;
|
||||
}
|
||||
|
||||
ah->device = pd->device;
|
||||
|
@ -3012,7 +3077,10 @@ err_copy:
|
|||
err_destroy:
|
||||
ib_destroy_ah(ah);
|
||||
|
||||
err_put:
|
||||
err_create:
|
||||
ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
|
||||
|
||||
err_charge:
|
||||
put_pd_read(pd);
|
||||
|
||||
err:
|
||||
|
@ -3046,6 +3114,8 @@ ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
|
|||
if (ret)
|
||||
return ret;
|
||||
|
||||
ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
|
||||
|
||||
idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
|
||||
|
||||
mutex_lock(&file->mutex);
|
||||
|
@ -3822,10 +3892,16 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
|
|||
err = -EINVAL;
|
||||
goto err_free;
|
||||
}
|
||||
|
||||
err = ib_rdmacg_try_charge(&uobj->cg_obj, ib_dev,
|
||||
RDMACG_RESOURCE_HCA_OBJECT);
|
||||
if (err)
|
||||
goto err_free;
|
||||
|
||||
flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER);
|
||||
if (IS_ERR(flow_id)) {
|
||||
err = PTR_ERR(flow_id);
|
||||
goto err_free;
|
||||
goto err_create;
|
||||
}
|
||||
flow_id->uobject = uobj;
|
||||
uobj->object = flow_id;
|
||||
|
@ -3858,6 +3934,8 @@ err_copy:
|
|||
idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
|
||||
destroy_flow:
|
||||
ib_destroy_flow(flow_id);
|
||||
err_create:
|
||||
ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
|
||||
err_free:
|
||||
kfree(flow_attr);
|
||||
err_put:
|
||||
|
@ -3897,8 +3975,11 @@ int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
|
|||
flow_id = uobj->object;
|
||||
|
||||
ret = ib_destroy_flow(flow_id);
|
||||
if (!ret)
|
||||
if (!ret) {
|
||||
ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev,
|
||||
RDMACG_RESOURCE_HCA_OBJECT);
|
||||
uobj->live = 0;
|
||||
}
|
||||
|
||||
put_uobj_write(uobj);
|
||||
|
||||
|
@ -3966,6 +4047,11 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
|
|||
obj->uevent.events_reported = 0;
|
||||
INIT_LIST_HEAD(&obj->uevent.event_list);
|
||||
|
||||
ret = ib_rdmacg_try_charge(&obj->uevent.uobject.cg_obj, ib_dev,
|
||||
RDMACG_RESOURCE_HCA_OBJECT);
|
||||
if (ret)
|
||||
goto err_put_cq;
|
||||
|
||||
srq = pd->device->create_srq(pd, &attr, udata);
|
||||
if (IS_ERR(srq)) {
|
||||
ret = PTR_ERR(srq);
|
||||
|
@ -4030,6 +4116,8 @@ err_destroy:
|
|||
ib_destroy_srq(srq);
|
||||
|
||||
err_put:
|
||||
ib_rdmacg_uncharge(&obj->uevent.uobject.cg_obj, ib_dev,
|
||||
RDMACG_RESOURCE_HCA_OBJECT);
|
||||
put_pd_read(pd);
|
||||
|
||||
err_put_cq:
|
||||
|
@ -4216,6 +4304,8 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
|
|||
if (ret)
|
||||
return ret;
|
||||
|
||||
ib_rdmacg_uncharge(&uobj->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_OBJECT);
|
||||
|
||||
if (srq_type == IB_SRQT_XRC) {
|
||||
us = container_of(obj, struct ib_usrq_object, uevent);
|
||||
atomic_dec(&us->uxrcd->refcnt);
|
||||
|
|
|
@ -51,6 +51,7 @@
|
|||
#include <rdma/ib.h>
|
||||
|
||||
#include "uverbs.h"
|
||||
#include "core_priv.h"
|
||||
|
||||
MODULE_AUTHOR("Roland Dreier");
|
||||
MODULE_DESCRIPTION("InfiniBand userspace verbs access");
|
||||
|
@ -237,6 +238,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
|
|||
|
||||
idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
|
||||
ib_destroy_ah(ah);
|
||||
ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
|
||||
RDMACG_RESOURCE_HCA_OBJECT);
|
||||
kfree(uobj);
|
||||
}
|
||||
|
||||
|
@ -246,6 +249,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
|
|||
|
||||
idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
|
||||
uverbs_dealloc_mw(mw);
|
||||
ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
|
||||
RDMACG_RESOURCE_HCA_OBJECT);
|
||||
kfree(uobj);
|
||||
}
|
||||
|
||||
|
@ -254,6 +259,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
|
|||
|
||||
idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
|
||||
ib_destroy_flow(flow_id);
|
||||
ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
|
||||
RDMACG_RESOURCE_HCA_OBJECT);
|
||||
kfree(uobj);
|
||||
}
|
||||
|
||||
|
@ -266,6 +273,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
|
|||
if (qp == qp->real_qp)
|
||||
ib_uverbs_detach_umcast(qp, uqp);
|
||||
ib_destroy_qp(qp);
|
||||
ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
|
||||
RDMACG_RESOURCE_HCA_OBJECT);
|
||||
ib_uverbs_release_uevent(file, &uqp->uevent);
|
||||
kfree(uqp);
|
||||
}
|
||||
|
@ -298,6 +307,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
|
|||
|
||||
idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
|
||||
ib_destroy_srq(srq);
|
||||
ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
|
||||
RDMACG_RESOURCE_HCA_OBJECT);
|
||||
ib_uverbs_release_uevent(file, uevent);
|
||||
kfree(uevent);
|
||||
}
|
||||
|
@ -310,6 +321,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
|
|||
|
||||
idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
|
||||
ib_destroy_cq(cq);
|
||||
ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
|
||||
RDMACG_RESOURCE_HCA_OBJECT);
|
||||
ib_uverbs_release_ucq(file, ev_file, ucq);
|
||||
kfree(ucq);
|
||||
}
|
||||
|
@ -319,6 +332,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
|
|||
|
||||
idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
|
||||
ib_dereg_mr(mr);
|
||||
ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
|
||||
RDMACG_RESOURCE_HCA_OBJECT);
|
||||
kfree(uobj);
|
||||
}
|
||||
|
||||
|
@ -339,11 +354,16 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
|
|||
|
||||
idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
|
||||
ib_dealloc_pd(pd);
|
||||
ib_rdmacg_uncharge(&uobj->cg_obj, context->device,
|
||||
RDMACG_RESOURCE_HCA_OBJECT);
|
||||
kfree(uobj);
|
||||
}
|
||||
|
||||
put_pid(context->tgid);
|
||||
|
||||
ib_rdmacg_uncharge(&context->cg_obj, context->device,
|
||||
RDMACG_RESOURCE_HCA_HANDLE);
|
||||
|
||||
return context->device->dealloc_ucontext(context);
|
||||
}
|
||||
|
||||
|
|
53
include/linux/cgroup_rdma.h
Normal file
53
include/linux/cgroup_rdma.h
Normal file
|
@ -0,0 +1,53 @@
|
|||
/*
|
||||
* Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
|
||||
*
|
||||
* This file is subject to the terms and conditions of version 2 of the GNU
|
||||
* General Public License. See the file COPYING in the main directory of the
|
||||
* Linux distribution for more details.
|
||||
*/
|
||||
|
||||
#ifndef _CGROUP_RDMA_H
|
||||
#define _CGROUP_RDMA_H
|
||||
|
||||
#include <linux/cgroup.h>
|
||||
|
||||
/*
 * Resource kinds accounted by the rdma cgroup. The values are used as
 * array indices (resource name table, per-pool resource array), so
 * RDMACG_RESOURCE_MAX must stay last: it is the number of kinds, not a kind.
 */
enum rdmacg_resource_type {
|
||||
RDMACG_RESOURCE_HCA_HANDLE,	/* exposed to user space as "hca_handle" */
|
||||
RDMACG_RESOURCE_HCA_OBJECT,	/* exposed to user space as "hca_object" */
|
||||
RDMACG_RESOURCE_MAX,		/* count of resource kinds / array size */
|
||||
};
|
||||
|
||||
#ifdef CONFIG_CGROUP_RDMA
|
||||
|
||||
/* Per-cgroup state of the rdma controller. */
struct rdma_cgroup {
|
||||
/* embedded css; container_of() on it recovers the rdma_cgroup */
struct cgroup_subsys_state css;
|
||||
|
||||
/*
 * Head of the list that keeps track of all resource pools
 * belonging to this cgroup.
 */
|
||||
struct list_head rpools;
|
||||
};
|
||||
|
||||
/*
 * Handle through which an RDMA/IB device takes part in rdma cgroup
 * accounting; passed to rdmacg_register_device() and the charge/uncharge
 * APIs.
 */
struct rdmacg_device {
|
||||
/* NOTE(review): presumably links the device into the controller's device list - confirm */
struct list_head dev_node;
|
||||
/* NOTE(review): presumably the resource pools created for this device - confirm */
struct list_head rpools;
|
||||
/* device name; the IB core points this at ib_device->name before registering */
char *name;
|
||||
};
|
||||
|
||||
/*
|
||||
* APIs for RDMA/IB stack to publish when a device wants to
|
||||
* participate in resource accounting
|
||||
*/
|
||||
int rdmacg_register_device(struct rdmacg_device *device);
|
||||
void rdmacg_unregister_device(struct rdmacg_device *device);
|
||||
|
||||
/* APIs for RDMA/IB stack to charge/uncharge pool specific resources */
|
||||
int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
|
||||
struct rdmacg_device *device,
|
||||
enum rdmacg_resource_type index);
|
||||
void rdmacg_uncharge(struct rdma_cgroup *cg,
|
||||
struct rdmacg_device *device,
|
||||
enum rdmacg_resource_type index);
|
||||
#endif /* CONFIG_CGROUP_RDMA */
|
||||
#endif /* _CGROUP_RDMA_H */
|
|
@ -56,6 +56,10 @@ SUBSYS(hugetlb)
|
|||
SUBSYS(pids)
|
||||
#endif
|
||||
|
||||
#if IS_ENABLED(CONFIG_CGROUP_RDMA)
|
||||
SUBSYS(rdma)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* The following subsystems are not supported on the default hierarchy.
|
||||
*/
|
||||
|
|
|
@ -60,6 +60,7 @@
|
|||
#include <linux/atomic.h>
|
||||
#include <linux/mmu_notifier.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/cgroup_rdma.h>
|
||||
|
||||
extern struct workqueue_struct *ib_wq;
|
||||
extern struct workqueue_struct *ib_comp_wq;
|
||||
|
@ -1331,6 +1332,12 @@ struct ib_fmr_attr {
|
|||
|
||||
struct ib_umem;
|
||||
|
||||
/*
 * Per-object rdma cgroup bookkeeping embedded in ib_ucontext and
 * ib_uobject. Without CONFIG_CGROUP_RDMA the struct has no members, so
 * users can embed it unconditionally.
 */
struct ib_rdmacg_object {
|
||||
#ifdef CONFIG_CGROUP_RDMA
|
||||
struct rdma_cgroup *cg; /* owner rdma cgroup */
|
||||
#endif
|
||||
};
|
||||
|
||||
struct ib_ucontext {
|
||||
struct ib_device *device;
|
||||
struct list_head pd_list;
|
||||
|
@ -1363,6 +1370,8 @@ struct ib_ucontext {
|
|||
struct list_head no_private_counters;
|
||||
int odp_mrs_count;
|
||||
#endif
|
||||
|
||||
struct ib_rdmacg_object cg_obj;
|
||||
};
|
||||
|
||||
struct ib_uobject {
|
||||
|
@ -1370,6 +1379,7 @@ struct ib_uobject {
|
|||
struct ib_ucontext *context; /* associated user context */
|
||||
void *object; /* containing object */
|
||||
struct list_head list; /* link to context's list */
|
||||
struct ib_rdmacg_object cg_obj; /* rdmacg object */
|
||||
int id; /* index into kernel idr */
|
||||
struct kref ref;
|
||||
struct rw_semaphore mutex; /* protects .live */
|
||||
|
@ -2118,6 +2128,10 @@ struct ib_device {
|
|||
struct attribute_group *hw_stats_ag;
|
||||
struct rdma_hw_stats *hw_stats;
|
||||
|
||||
#ifdef CONFIG_CGROUP_RDMA
|
||||
struct rdmacg_device cg_device;
|
||||
#endif
|
||||
|
||||
/**
|
||||
* The following mandatory functions are used only at device
|
||||
* registration. Keep functions such as these at the end of this
|
||||
|
|
10
init/Kconfig
10
init/Kconfig
|
@ -1090,6 +1090,16 @@ config CGROUP_PIDS
|
|||
since the PIDs limit only affects a process's ability to fork, not to
|
||||
attach to a cgroup.
|
||||
|
||||
config CGROUP_RDMA
|
||||
bool "RDMA controller"
|
||||
help
|
||||
Provides enforcement of RDMA resources defined by IB stack.
|
||||
It is fairly easy for consumers to exhaust RDMA resources, which
|
||||
can result in resource unavailability to other consumers.
|
||||
RDMA controller is designed to stop this from happening.
|
||||
Attaching processes with active RDMA resources to the cgroup
|
||||
hierarchy is allowed even if it can cross the hierarchy's limit.
|
||||
|
||||
config CGROUP_FREEZER
|
||||
bool "Freezer controller"
|
||||
help
|
||||
|
|
|
@ -2,4 +2,5 @@ obj-y := cgroup.o namespace.o cgroup-v1.o
|
|||
|
||||
obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
|
||||
obj-$(CONFIG_CGROUP_PIDS) += pids.o
|
||||
obj-$(CONFIG_CGROUP_RDMA) += rdma.o
|
||||
obj-$(CONFIG_CPUSETS) += cpuset.o
|
||||
|
|
619
kernel/cgroup/rdma.c
Normal file
619
kernel/cgroup/rdma.c
Normal file
|
@ -0,0 +1,619 @@
|
|||
/*
|
||||
* RDMA resource limiting controller for cgroups.
|
||||
*
|
||||
* Used to allow a cgroup hierarchy to stop processes from consuming
|
||||
* additional RDMA resources after a certain limit is reached.
|
||||
*
|
||||
* Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
|
||||
*
|
||||
* This file is subject to the terms and conditions of version 2 of the GNU
|
||||
* General Public License. See the file COPYING in the main directory of the
|
||||
* Linux distribution for more details.
|
||||
*/
|
||||
|
||||
#include <linux/bitops.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/cgroup.h>
|
||||
#include <linux/parser.h>
|
||||
#include <linux/cgroup_rdma.h>
|
||||
|
||||
#define RDMACG_MAX_STR "max"
|
||||
|
||||
/*
|
||||
* Protects list of resource pools maintained on per cgroup basis
|
||||
* and rdma device list.
|
||||
*/
|
||||
static DEFINE_MUTEX(rdmacg_mutex);
|
||||
static LIST_HEAD(rdmacg_devices);
|
||||
|
||||
/*
 * Identifies which rdma cgroup interface file is being handled.
 * NOTE(review): presumably _MAX selects the limit file (rdma.max) and
 * _STAT the usage file (rdma.current) - confirm against the cftype table.
 */
enum rdmacg_file_type {
|
||||
RDMACG_RESOURCE_TYPE_MAX,
|
||||
RDMACG_RESOURCE_TYPE_STAT,
|
||||
};
|
||||
|
||||
/*
 * User-visible names of the resources, indexed by
 * enum rdmacg_resource_type. Add an entry here whenever a new
 * resource is added/defined at the IB verb/core layer.
 */
|
||||
static char const *rdmacg_resource_names[] = {
|
||||
[RDMACG_RESOURCE_HCA_HANDLE] = "hca_handle",
|
||||
[RDMACG_RESOURCE_HCA_OBJECT] = "hca_object",
|
||||
};
|
||||
|
||||
/*
 * Tracker for a single resource kind inside a resource pool:
 * the configured limit and the amount currently in use.
 */
|
||||
struct rdmacg_resource {
|
||||
int max;	/* configured limit; S32_MAX is treated as "max" by set_resource_limit() */
|
||||
int usage;	/* NOTE(review): presumably the currently charged count - updated outside this view */
|
||||
};
|
||||
|
||||
/*
|
||||
* resource pool object which represents per cgroup, per device
|
||||
* resources. There are multiple instances of this object per cgroup,
|
||||
* therefore it cannot be embedded within rdma_cgroup structure. It
|
||||
* is maintained as list.
|
||||
*/
|
||||
struct rdmacg_resource_pool {
|
||||
struct rdmacg_device *device;
|
||||
struct rdmacg_resource resources[RDMACG_RESOURCE_MAX];
|
||||
|
||||
struct list_head cg_node;
|
||||
struct list_head dev_node;
|
||||
|
||||
/* count active user tasks of this pool */
|
||||
u64 usage_sum;
|
||||
/* total number counts which are set to max */
|
||||
int num_max_cnt;
|
||||
};
|
||||
|
||||
static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
|
||||
{
|
||||
return container_of(css, struct rdma_cgroup, css);
|
||||
}
|
||||
|
||||
static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
|
||||
{
|
||||
return css_rdmacg(cg->css.parent);
|
||||
}
|
||||
|
||||
static inline struct rdma_cgroup *get_current_rdmacg(void)
|
||||
{
|
||||
return css_rdmacg(task_get_css(current, rdma_cgrp_id));
|
||||
}
|
||||
|
||||
static void set_resource_limit(struct rdmacg_resource_pool *rpool,
|
||||
int index, int new_max)
|
||||
{
|
||||
if (new_max == S32_MAX) {
|
||||
if (rpool->resources[index].max != S32_MAX)
|
||||
rpool->num_max_cnt++;
|
||||
} else {
|
||||
if (rpool->resources[index].max == S32_MAX)
|
||||
rpool->num_max_cnt--;
|
||||
}
|
||||
rpool->resources[index].max = new_max;
|
||||
}
|
||||
|
||||
static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
|
||||
set_resource_limit(rpool, i, S32_MAX);
|
||||
}
|
||||
|
||||
static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
|
||||
{
|
||||
lockdep_assert_held(&rdmacg_mutex);
|
||||
|
||||
list_del(&rpool->cg_node);
|
||||
list_del(&rpool->dev_node);
|
||||
kfree(rpool);
|
||||
}
|
||||
|
||||
static struct rdmacg_resource_pool *
|
||||
find_cg_rpool_locked(struct rdma_cgroup *cg,
|
||||
struct rdmacg_device *device)
|
||||
|
||||
{
|
||||
struct rdmacg_resource_pool *pool;
|
||||
|
||||
lockdep_assert_held(&rdmacg_mutex);
|
||||
|
||||
list_for_each_entry(pool, &cg->rpools, cg_node)
|
||||
if (pool->device == device)
|
||||
return pool;
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct rdmacg_resource_pool *
|
||||
get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
|
||||
{
|
||||
struct rdmacg_resource_pool *rpool;
|
||||
|
||||
rpool = find_cg_rpool_locked(cg, device);
|
||||
if (rpool)
|
||||
return rpool;
|
||||
|
||||
rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
|
||||
if (!rpool)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
rpool->device = device;
|
||||
set_all_resource_max_limit(rpool);
|
||||
|
||||
INIT_LIST_HEAD(&rpool->cg_node);
|
||||
INIT_LIST_HEAD(&rpool->dev_node);
|
||||
list_add_tail(&rpool->cg_node, &cg->rpools);
|
||||
list_add_tail(&rpool->dev_node, &device->rpools);
|
||||
return rpool;
|
||||
}
|
||||
|
||||
/**
|
||||
* uncharge_cg_locked - uncharge resource for rdma cgroup
|
||||
* @cg: pointer to cg to uncharge and all parents in hierarchy
|
||||
* @device: pointer to rdmacg device
|
||||
* @index: index of the resource to uncharge in cg (resource pool)
|
||||
*
|
||||
* It also frees the resource pool which was created as part of
|
||||
* charging operation when there are no resources attached to
|
||||
* resource pool.
|
||||
*/
|
||||
static void
|
||||
uncharge_cg_locked(struct rdma_cgroup *cg,
|
||||
struct rdmacg_device *device,
|
||||
enum rdmacg_resource_type index)
|
||||
{
|
||||
struct rdmacg_resource_pool *rpool;
|
||||
|
||||
rpool = find_cg_rpool_locked(cg, device);
|
||||
|
||||
/*
|
||||
* rpool cannot be null at this stage. Let kernel operate in case
|
||||
* if there a bug in IB stack or rdma controller, instead of crashing
|
||||
* the system.
|
||||
*/
|
||||
if (unlikely(!rpool)) {
|
||||
pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device);
|
||||
return;
|
||||
}
|
||||
|
||||
rpool->resources[index].usage--;
|
||||
|
||||
/*
|
||||
* A negative count (or overflow) is invalid,
|
||||
* it indicates a bug in the rdma controller.
|
||||
*/
|
||||
WARN_ON_ONCE(rpool->resources[index].usage < 0);
|
||||
rpool->usage_sum--;
|
||||
if (rpool->usage_sum == 0 &&
|
||||
rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
|
||||
/*
|
||||
* No user of the rpool and all entries are set to max, so
|
||||
* safe to delete this rpool.
|
||||
*/
|
||||
free_cg_rpool_locked(rpool);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
|
||||
* @device: pointer to rdmacg device
|
||||
* @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup
|
||||
* stop uncharging
|
||||
* @index: index of the resource to uncharge in cg in given resource pool
|
||||
*/
|
||||
static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
|
||||
struct rdmacg_device *device,
|
||||
struct rdma_cgroup *stop_cg,
|
||||
enum rdmacg_resource_type index)
|
||||
{
|
||||
struct rdma_cgroup *p;
|
||||
|
||||
mutex_lock(&rdmacg_mutex);
|
||||
|
||||
for (p = cg; p != stop_cg; p = parent_rdmacg(p))
|
||||
uncharge_cg_locked(p, device, index);
|
||||
|
||||
mutex_unlock(&rdmacg_mutex);
|
||||
|
||||
css_put(&cg->css);
|
||||
}
|
||||
|
||||
/**
|
||||
* rdmacg_uncharge - hierarchically uncharge rdma resource count
|
||||
* @device: pointer to rdmacg device
|
||||
* @index: index of the resource to uncharge in cgroup in given resource pool
|
||||
*/
|
||||
void rdmacg_uncharge(struct rdma_cgroup *cg,
|
||||
struct rdmacg_device *device,
|
||||
enum rdmacg_resource_type index)
|
||||
{
|
||||
if (index >= RDMACG_RESOURCE_MAX)
|
||||
return;
|
||||
|
||||
rdmacg_uncharge_hierarchy(cg, device, NULL, index);
|
||||
}
|
||||
EXPORT_SYMBOL(rdmacg_uncharge);
|
||||
|
||||
/**
|
||||
* rdmacg_try_charge - hierarchically try to charge the rdma resource
|
||||
* @rdmacg: pointer to rdma cgroup which will own this resource
|
||||
* @device: pointer to rdmacg device
|
||||
* @index: index of the resource to charge in cgroup (resource pool)
|
||||
*
|
||||
* This function follows charging resource in hierarchical way.
|
||||
* It will fail if the charge would cause the new value to exceed the
|
||||
* hierarchical limit.
|
||||
* Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
|
||||
* Returns pointer to rdmacg for this resource when charging is successful.
|
||||
*
|
||||
* Charger needs to account resources on two criteria.
|
||||
* (a) per cgroup & (b) per device resource usage.
|
||||
* Per cgroup resource usage ensures that tasks of cgroup doesn't cross
|
||||
* the configured limits. Per device provides granular configuration
|
||||
* in multi device usage. It allocates resource pool in the hierarchy
|
||||
* for each parent it come across for first resource. Later on resource
|
||||
* pool will be available. Therefore it will be much faster thereon
|
||||
* to charge/uncharge.
|
||||
*/
|
||||
int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
|
||||
struct rdmacg_device *device,
|
||||
enum rdmacg_resource_type index)
|
||||
{
|
||||
struct rdma_cgroup *cg, *p;
|
||||
struct rdmacg_resource_pool *rpool;
|
||||
s64 new;
|
||||
int ret = 0;
|
||||
|
||||
if (index >= RDMACG_RESOURCE_MAX)
|
||||
return -EINVAL;
|
||||
|
||||
/*
|
||||
* hold on to css, as cgroup can be removed but resource
|
||||
* accounting happens on css.
|
||||
*/
|
||||
cg = get_current_rdmacg();
|
||||
|
||||
mutex_lock(&rdmacg_mutex);
|
||||
for (p = cg; p; p = parent_rdmacg(p)) {
|
||||
rpool = get_cg_rpool_locked(p, device);
|
||||
if (IS_ERR(rpool)) {
|
||||
ret = PTR_ERR(rpool);
|
||||
goto err;
|
||||
} else {
|
||||
new = rpool->resources[index].usage + 1;
|
||||
if (new > rpool->resources[index].max) {
|
||||
ret = -EAGAIN;
|
||||
goto err;
|
||||
} else {
|
||||
rpool->resources[index].usage = new;
|
||||
rpool->usage_sum++;
|
||||
}
|
||||
}
|
||||
}
|
||||
mutex_unlock(&rdmacg_mutex);
|
||||
|
||||
*rdmacg = cg;
|
||||
return 0;
|
||||
|
||||
err:
|
||||
mutex_unlock(&rdmacg_mutex);
|
||||
rdmacg_uncharge_hierarchy(cg, device, p, index);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(rdmacg_try_charge);
|
||||
|
||||
/**
|
||||
* rdmacg_register_device - register rdmacg device to rdma controller.
|
||||
* @device: pointer to rdmacg device whose resources need to be accounted.
|
||||
*
|
||||
* If IB stack wish a device to participate in rdma cgroup resource
|
||||
* tracking, it must invoke this API to register with rdma cgroup before
|
||||
* any user space application can start using the RDMA resources.
|
||||
* Returns 0 on success or EINVAL when table length given is beyond
|
||||
* supported size.
|
||||
*/
|
||||
int rdmacg_register_device(struct rdmacg_device *device)
|
||||
{
|
||||
INIT_LIST_HEAD(&device->dev_node);
|
||||
INIT_LIST_HEAD(&device->rpools);
|
||||
|
||||
mutex_lock(&rdmacg_mutex);
|
||||
list_add_tail(&device->dev_node, &rdmacg_devices);
|
||||
mutex_unlock(&rdmacg_mutex);
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(rdmacg_register_device);
|
||||
|
||||
/**
|
||||
* rdmacg_unregister_device - unregister rdmacg device from rdma controller.
|
||||
* @device: pointer to rdmacg device which was previously registered with rdma
|
||||
* controller using rdmacg_register_device().
|
||||
*
|
||||
* IB stack must invoke this after all the resources of the IB device
|
||||
* are destroyed and after ensuring that no more resources will be created
|
||||
* when this API is invoked.
|
||||
*/
|
||||
void rdmacg_unregister_device(struct rdmacg_device *device)
|
||||
{
|
||||
struct rdmacg_resource_pool *rpool, *tmp;
|
||||
|
||||
/*
|
||||
* Synchronize with any active resource settings,
|
||||
* usage query happening via configfs.
|
||||
*/
|
||||
mutex_lock(&rdmacg_mutex);
|
||||
list_del_init(&device->dev_node);
|
||||
|
||||
/*
|
||||
* Now that this device is off the cgroup list, its safe to free
|
||||
* all the rpool resources.
|
||||
*/
|
||||
list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
|
||||
free_cg_rpool_locked(rpool);
|
||||
|
||||
mutex_unlock(&rdmacg_mutex);
|
||||
}
|
||||
EXPORT_SYMBOL(rdmacg_unregister_device);
|
||||
|
||||
static int parse_resource(char *c, int *intval)
|
||||
{
|
||||
substring_t argstr;
|
||||
const char **table = &rdmacg_resource_names[0];
|
||||
char *name, *value = c;
|
||||
size_t len;
|
||||
int ret, i = 0;
|
||||
|
||||
name = strsep(&value, "=");
|
||||
if (!name || !value)
|
||||
return -EINVAL;
|
||||
|
||||
len = strlen(value);
|
||||
|
||||
for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
|
||||
if (strcmp(table[i], name))
|
||||
continue;
|
||||
|
||||
argstr.from = value;
|
||||
argstr.to = value + len;
|
||||
|
||||
ret = match_int(&argstr, intval);
|
||||
if (ret >= 0) {
|
||||
if (*intval < 0)
|
||||
break;
|
||||
return i;
|
||||
}
|
||||
if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
|
||||
*intval = S32_MAX;
|
||||
return i;
|
||||
}
|
||||
break;
|
||||
}
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
static int rdmacg_parse_limits(char *options,
|
||||
int *new_limits, unsigned long *enables)
|
||||
{
|
||||
char *c;
|
||||
int err = -EINVAL;
|
||||
|
||||
/* parse resource options */
|
||||
while ((c = strsep(&options, " ")) != NULL) {
|
||||
int index, intval;
|
||||
|
||||
index = parse_resource(c, &intval);
|
||||
if (index < 0)
|
||||
goto err;
|
||||
|
||||
new_limits[index] = intval;
|
||||
*enables |= BIT(index);
|
||||
}
|
||||
return 0;
|
||||
|
||||
err:
|
||||
return err;
|
||||
}
|
||||
|
||||
static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
|
||||
{
|
||||
struct rdmacg_device *device;
|
||||
|
||||
lockdep_assert_held(&rdmacg_mutex);
|
||||
|
||||
list_for_each_entry(device, &rdmacg_devices, dev_node)
|
||||
if (!strcmp(name, device->name))
|
||||
return device;
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
 * Write handler of the "max" control file.
 *
 * Expects "<device name> <resource>=<limit> ..." in @buf and applies the
 * listed limits to the pool of the written cgroup for that device,
 * creating the pool if needed. A pool that ends up with no charge and
 * every limit at "max" is freed again.
 *
 * Returns @nbytes on success, -EINVAL for a malformed line, -ENOMEM on
 * allocation failure, or -ENODEV when the named device is not registered.
 */
static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
				       char *buf, size_t nbytes, loff_t off)
{
	struct rdma_cgroup *cg = css_rdmacg(of_css(of));
	struct rdmacg_resource_pool *rpool;
	struct rdmacg_device *device;
	unsigned long enables = 0;
	char *options = strstrip(buf);
	const char *dev_name;
	int *new_limits;
	int bit, ret;

	/* the first token is the device name, the rest are the limits */
	dev_name = strsep(&options, " ");
	if (!dev_name)
		return -EINVAL;

	new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
	if (!new_limits)
		return -ENOMEM;

	ret = rdmacg_parse_limits(options, new_limits, &enables);
	if (ret)
		goto out_free;

	/* the mutex keeps the device from being unregistered meanwhile */
	mutex_lock(&rdmacg_mutex);

	device = rdmacg_get_device_locked(dev_name);
	if (!device) {
		ret = -ENODEV;
		goto out_unlock;
	}

	rpool = get_cg_rpool_locked(cg, device);
	if (IS_ERR(rpool)) {
		ret = PTR_ERR(rpool);
		goto out_unlock;
	}

	/* apply only the limits that were named in the request */
	for_each_set_bit(bit, &enables, RDMACG_RESOURCE_MAX)
		set_resource_limit(rpool, bit, new_limits[bit]);

	/* unused pool with nothing but default limits: nothing to keep */
	if (!rpool->usage_sum && rpool->num_max_cnt == RDMACG_RESOURCE_MAX)
		free_cg_rpool_locked(rpool);

out_unlock:
	mutex_unlock(&rdmacg_mutex);
out_free:
	kfree(new_limits);
	return ret ?: nbytes;
}
|
||||
|
||||
static void print_rpool_values(struct seq_file *sf,
|
||||
struct rdmacg_resource_pool *rpool)
|
||||
{
|
||||
enum rdmacg_file_type sf_type;
|
||||
int i;
|
||||
u32 value;
|
||||
|
||||
sf_type = seq_cft(sf)->private;
|
||||
|
||||
for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
|
||||
seq_puts(sf, rdmacg_resource_names[i]);
|
||||
seq_putc(sf, '=');
|
||||
if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
|
||||
if (rpool)
|
||||
value = rpool->resources[i].max;
|
||||
else
|
||||
value = S32_MAX;
|
||||
} else {
|
||||
if (rpool)
|
||||
value = rpool->resources[i].usage;
|
||||
else
|
||||
value = 0;
|
||||
}
|
||||
|
||||
if (value == S32_MAX)
|
||||
seq_puts(sf, RDMACG_MAX_STR);
|
||||
else
|
||||
seq_printf(sf, "%d", value);
|
||||
seq_putc(sf, ' ');
|
||||
}
|
||||
}
|
||||
|
||||
static int rdmacg_resource_read(struct seq_file *sf, void *v)
|
||||
{
|
||||
struct rdmacg_device *device;
|
||||
struct rdmacg_resource_pool *rpool;
|
||||
struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
|
||||
|
||||
mutex_lock(&rdmacg_mutex);
|
||||
|
||||
list_for_each_entry(device, &rdmacg_devices, dev_node) {
|
||||
seq_printf(sf, "%s ", device->name);
|
||||
|
||||
rpool = find_cg_rpool_locked(cg, device);
|
||||
print_rpool_values(sf, rpool);
|
||||
|
||||
seq_putc(sf, '\n');
|
||||
}
|
||||
|
||||
mutex_unlock(&rdmacg_mutex);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct cftype rdmacg_files[] = {
|
||||
{
|
||||
.name = "max",
|
||||
.write = rdmacg_resource_set_max,
|
||||
.seq_show = rdmacg_resource_read,
|
||||
.private = RDMACG_RESOURCE_TYPE_MAX,
|
||||
.flags = CFTYPE_NOT_ON_ROOT,
|
||||
},
|
||||
{
|
||||
.name = "current",
|
||||
.seq_show = rdmacg_resource_read,
|
||||
.private = RDMACG_RESOURCE_TYPE_STAT,
|
||||
.flags = CFTYPE_NOT_ON_ROOT,
|
||||
},
|
||||
{ } /* terminate */
|
||||
};
|
||||
|
||||
static struct cgroup_subsys_state *
|
||||
rdmacg_css_alloc(struct cgroup_subsys_state *parent)
|
||||
{
|
||||
struct rdma_cgroup *cg;
|
||||
|
||||
cg = kzalloc(sizeof(*cg), GFP_KERNEL);
|
||||
if (!cg)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
INIT_LIST_HEAD(&cg->rpools);
|
||||
return &cg->css;
|
||||
}
|
||||
|
||||
/* css_free callback: release the rdma cgroup that embeds @css. */
static void rdmacg_css_free(struct cgroup_subsys_state *css)
{
	kfree(css_rdmacg(css));
}
|
||||
|
||||
/**
|
||||
* rdmacg_css_offline - cgroup css_offline callback
|
||||
* @css: css of interest
|
||||
*
|
||||
* This function is called when @css is about to go away and responsible
|
||||
* for shooting down all rdmacg associated with @css. As part of that it
|
||||
* marks all the resource pool entries to max value, so that when resources are
|
||||
* uncharged, associated resource pool can be freed as well.
|
||||
*/
|
||||
static void rdmacg_css_offline(struct cgroup_subsys_state *css)
|
||||
{
|
||||
struct rdma_cgroup *cg = css_rdmacg(css);
|
||||
struct rdmacg_resource_pool *rpool;
|
||||
|
||||
mutex_lock(&rdmacg_mutex);
|
||||
|
||||
list_for_each_entry(rpool, &cg->rpools, cg_node)
|
||||
set_all_resource_max_limit(rpool);
|
||||
|
||||
mutex_unlock(&rdmacg_mutex);
|
||||
}
|
||||
|
||||
struct cgroup_subsys rdma_cgrp_subsys = {
|
||||
.css_alloc = rdmacg_css_alloc,
|
||||
.css_free = rdmacg_css_free,
|
||||
.css_offline = rdmacg_css_offline,
|
||||
.legacy_cftypes = rdmacg_files,
|
||||
.dfl_cftypes = rdmacg_files,
|
||||
};
|
Loading…
Reference in a new issue