Another RDMA update from Chuck Lever, and a bunch of miscellaneous

bugfixes.
 -----BEGIN PGP SIGNATURE-----
 
 iQIcBAABAgAGBQJZE2UeAAoJECebzXlCjuG+St8P/0vG+ps9sY012E6Wh9gy4Ev4
 BtxG/c3CtcxrbNzW+cFhdEloBGtC0VvcrKNCozJTK4LdaPYErkyRBpjgXvIggT9I
 GWY4ftpH3eJ6uByN9Okgc3/1la2poDflJO/nYhdRed3YHOnXTtx/746tu1xAnVCV
 tFtDGrbJZTprt5c3zETtdtquCUSy2aMT5ZbrdU3yWBCwQMNSIufN3an8epfB++xx
 Ct+G0HTRffcWAdYuLT0N1HKqm8pkncdNMFpm7mVw0hMCRy552G3fuj8LtkhVTvKE
 1KN3zXY4jhaUYWD5Yt6AJcpLEro65b8swYk4e9FP2TNUpCmuRdXT9cb9vE8YztxC
 8s4N23RHaEx9I6pC3OU64a2HfhiQM/oOIvjlhTBsjojXsQcqZFD1vsoSYA8Byl0w
 m9EQWqPqge4m6yEYl7uAyL6xSthbrhcU1Ks5jvNXGcWzEQj7BATnynJANsfZ+y6r
 ZoVcsRNX49m1BG+p9br+9DFffPiNFUMqxbfr73L9HRep3OsPeFKazFG0bKd3hOqA
 E6L/AnBd9soSqTuTvbisWrGWbomhtd5G/fAa1uHrWTPHMXUWCmkguiau51FNfcHu
 xcJlBBVCvUmmd5u3wF6QeiyjPs4KEBzQzsOUsWKHRxDBp6s+5PX/lHuXRBlDP+fN
 TQq0KbvBtea1OyMaRtoV
 =Rtl/
 -----END PGP SIGNATURE-----

Merge tag 'nfsd-4.12' of git://linux-nfs.org/~bfields/linux

Pull nfsd updates from Bruce Fields:
 "Another RDMA update from Chuck Lever, and a bunch of miscellaneous
  bugfixes"

* tag 'nfsd-4.12' of git://linux-nfs.org/~bfields/linux: (26 commits)
  nfsd: Fix up the "supattr_exclcreat" attributes
  nfsd: encoders mustn't use unitialized values in error cases
  nfsd: fix undefined behavior in nfsd4_layout_verify
  lockd: fix lockd shutdown race
  NFSv4: Fix callback server shutdown
  SUNRPC: Refactor svc_set_num_threads()
  NFSv4.x/callback: Create the callback service through svc_create_pooled
  lockd: remove redundant check on block
  svcrdma: Clean out old XDR encoders
  svcrdma: Remove the req_map cache
  svcrdma: Remove unused RDMA Write completion handler
  svcrdma: Reduce size of sge array in struct svc_rdma_op_ctxt
  svcrdma: Clean up RPC-over-RDMA backchannel reply processing
  svcrdma: Report Write/Reply chunk overruns
  svcrdma: Clean up RDMA_ERROR path
  svcrdma: Use rdma_rw API in RPC reply path
  svcrdma: Introduce local rdma_rw API helpers
  svcrdma: Clean up svc_rdma_get_inv_rkey()
  svcrdma: Add helper to save pages under I/O
  svcrdma: Eliminate RPCRDMA_SQ_DEPTH_MULT
  ...
This commit is contained in:
Linus Torvalds 2017-05-10 13:29:23 -07:00
commit c70422f760
23 changed files with 1370 additions and 939 deletions

View file

@ -132,6 +132,8 @@ lockd(void *vrqstp)
{
int err = 0;
struct svc_rqst *rqstp = vrqstp;
struct net *net = &init_net;
struct lockd_net *ln = net_generic(net, lockd_net_id);
/* try_to_freeze() is called from svc_recv() */
set_freezable();
@ -176,6 +178,8 @@ lockd(void *vrqstp)
if (nlmsvc_ops)
nlmsvc_invalidate_all();
nlm_shutdown_hosts();
cancel_delayed_work_sync(&ln->grace_period_end);
locks_end_grace(&ln->lockd_manager);
return 0;
}
@ -270,8 +274,6 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net)
if (ln->nlmsvc_users) {
if (--ln->nlmsvc_users == 0) {
nlm_shutdown_hosts_net(net);
cancel_delayed_work_sync(&ln->grace_period_end);
locks_end_grace(&ln->lockd_manager);
svc_shutdown_net(serv, net);
dprintk("lockd_down_net: per-net data destroyed; net=%p\n", net);
}

View file

@ -870,15 +870,15 @@ nlmsvc_grant_reply(struct nlm_cookie *cookie, __be32 status)
if (!(block = nlmsvc_find_block(cookie)))
return;
if (block) {
if (status == nlm_lck_denied_grace_period) {
/* Try again in a couple of seconds */
nlmsvc_insert_block(block, 10 * HZ);
} else {
/* Lock is now held by client, or has been rejected.
* In both cases, the block should be removed. */
nlmsvc_unlink_block(block);
}
if (status == nlm_lck_denied_grace_period) {
/* Try again in a couple of seconds */
nlmsvc_insert_block(block, 10 * HZ);
} else {
/*
* Lock is now held by client, or has been rejected.
* In both cases, the block should be removed.
*/
nlmsvc_unlink_block(block);
}
nlmsvc_release_block(block);
}

View file

@ -76,7 +76,10 @@ nfs4_callback_svc(void *vrqstp)
set_freezable();
while (!kthread_should_stop()) {
while (!kthread_freezable_should_stop(NULL)) {
if (signal_pending(current))
flush_signals(current);
/*
* Listen for a request on the socket
*/
@ -85,6 +88,8 @@ nfs4_callback_svc(void *vrqstp)
continue;
svc_process(rqstp);
}
svc_exit_thread(rqstp);
module_put_and_exit(0);
return 0;
}
@ -103,9 +108,10 @@ nfs41_callback_svc(void *vrqstp)
set_freezable();
while (!kthread_should_stop()) {
if (try_to_freeze())
continue;
while (!kthread_freezable_should_stop(NULL)) {
if (signal_pending(current))
flush_signals(current);
prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
spin_lock_bh(&serv->sv_cb_lock);
@ -121,11 +127,13 @@ nfs41_callback_svc(void *vrqstp)
error);
} else {
spin_unlock_bh(&serv->sv_cb_lock);
schedule();
if (!kthread_should_stop())
schedule();
finish_wait(&serv->sv_cb_waitq, &wq);
}
flush_signals(current);
}
svc_exit_thread(rqstp);
module_put_and_exit(0);
return 0;
}
@ -221,14 +229,14 @@ err_bind:
static struct svc_serv_ops nfs40_cb_sv_ops = {
.svo_function = nfs4_callback_svc,
.svo_enqueue_xprt = svc_xprt_do_enqueue,
.svo_setup = svc_set_num_threads,
.svo_setup = svc_set_num_threads_sync,
.svo_module = THIS_MODULE,
};
#if defined(CONFIG_NFS_V4_1)
static struct svc_serv_ops nfs41_cb_sv_ops = {
.svo_function = nfs41_callback_svc,
.svo_enqueue_xprt = svc_xprt_do_enqueue,
.svo_setup = svc_set_num_threads,
.svo_setup = svc_set_num_threads_sync,
.svo_module = THIS_MODULE,
};
@ -280,7 +288,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n",
cb_info->users);
serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops);
serv = svc_create_pooled(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops);
if (!serv) {
printk(KERN_ERR "nfs_callback_create_svc: create service failed\n");
return ERR_PTR(-ENOMEM);

View file

@ -334,8 +334,11 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
if (!p)
return 0;
p = xdr_decode_hyper(p, &args->offset);
args->count = ntohl(*p++);
if (!xdr_argsize_check(rqstp, p))
return 0;
len = min(args->count, max_blocksize);
/* set up the kvec */
@ -349,7 +352,7 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
v++;
}
args->vlen = v;
return xdr_argsize_check(rqstp, p);
return 1;
}
int
@ -541,9 +544,11 @@ nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p,
p = decode_fh(p, &args->fh);
if (!p)
return 0;
if (!xdr_argsize_check(rqstp, p))
return 0;
args->buffer = page_address(*(rqstp->rq_next_page++));
return xdr_argsize_check(rqstp, p);
return 1;
}
int
@ -569,10 +574,14 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
args->verf = p; p += 2;
args->dircount = ~0;
args->count = ntohl(*p++);
if (!xdr_argsize_check(rqstp, p))
return 0;
args->count = min_t(u32, args->count, PAGE_SIZE);
args->buffer = page_address(*(rqstp->rq_next_page++));
return xdr_argsize_check(rqstp, p);
return 1;
}
int
@ -590,6 +599,9 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p,
args->dircount = ntohl(*p++);
args->count = ntohl(*p++);
if (!xdr_argsize_check(rqstp, p))
return 0;
len = args->count = min(args->count, max_blocksize);
while (len > 0) {
struct page *p = *(rqstp->rq_next_page++);
@ -597,8 +609,7 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p,
args->buffer = page_address(p);
len -= PAGE_SIZE;
}
return xdr_argsize_check(rqstp, p);
return 1;
}
int

View file

@ -1259,7 +1259,8 @@ nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type)
return NULL;
}
if (!(exp->ex_layout_types & (1 << layout_type))) {
if (layout_type >= LAYOUT_TYPE_MAX ||
!(exp->ex_layout_types & (1 << layout_type))) {
dprintk("%s: layout type %d not supported\n",
__func__, layout_type);
return NULL;

View file

@ -1912,28 +1912,15 @@ static void copy_clid(struct nfs4_client *target, struct nfs4_client *source)
target->cl_clientid.cl_id = source->cl_clientid.cl_id;
}
int strdup_if_nonnull(char **target, char *source)
{
if (source) {
*target = kstrdup(source, GFP_KERNEL);
if (!*target)
return -ENOMEM;
} else
*target = NULL;
return 0;
}
static int copy_cred(struct svc_cred *target, struct svc_cred *source)
{
int ret;
target->cr_principal = kstrdup(source->cr_principal, GFP_KERNEL);
target->cr_raw_principal = kstrdup(source->cr_raw_principal,
GFP_KERNEL);
if ((source->cr_principal && ! target->cr_principal) ||
(source->cr_raw_principal && ! target->cr_raw_principal))
return -ENOMEM;
ret = strdup_if_nonnull(&target->cr_principal, source->cr_principal);
if (ret)
return ret;
ret = strdup_if_nonnull(&target->cr_raw_principal,
source->cr_raw_principal);
if (ret)
return ret;
target->cr_flavor = source->cr_flavor;
target->cr_uid = source->cr_uid;
target->cr_gid = source->cr_gid;

View file

@ -2831,9 +2831,14 @@ out_acl:
}
#endif /* CONFIG_NFSD_PNFS */
if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
status = nfsd4_encode_bitmap(xdr, NFSD_SUPPATTR_EXCLCREAT_WORD0,
NFSD_SUPPATTR_EXCLCREAT_WORD1,
NFSD_SUPPATTR_EXCLCREAT_WORD2);
u32 supp[3];
memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp));
supp[0] &= NFSD_SUPPATTR_EXCLCREAT_WORD0;
supp[1] &= NFSD_SUPPATTR_EXCLCREAT_WORD1;
supp[2] &= NFSD_SUPPATTR_EXCLCREAT_WORD2;
status = nfsd4_encode_bitmap(xdr, supp[0], supp[1], supp[2]);
if (status)
goto out;
}
@ -4119,8 +4124,7 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_getdeviceinfo *gdev)
{
struct xdr_stream *xdr = &resp->xdr;
const struct nfsd4_layout_ops *ops =
nfsd4_layout_ops[gdev->gd_layout_type];
const struct nfsd4_layout_ops *ops;
u32 starting_len = xdr->buf->len, needed_len;
__be32 *p;
@ -4137,6 +4141,7 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
/* If maxcount is 0 then just update notifications */
if (gdev->gd_maxcount != 0) {
ops = nfsd4_layout_ops[gdev->gd_layout_type];
nfserr = ops->encode_getdeviceinfo(xdr, gdev);
if (nfserr) {
/*
@ -4189,8 +4194,7 @@ nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_layoutget *lgp)
{
struct xdr_stream *xdr = &resp->xdr;
const struct nfsd4_layout_ops *ops =
nfsd4_layout_ops[lgp->lg_layout_type];
const struct nfsd4_layout_ops *ops;
__be32 *p;
dprintk("%s: err %d\n", __func__, nfserr);
@ -4213,6 +4217,7 @@ nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
*p++ = cpu_to_be32(lgp->lg_seg.iomode);
*p++ = cpu_to_be32(lgp->lg_layout_type);
ops = nfsd4_layout_ops[lgp->lg_layout_type];
nfserr = ops->encode_layoutget(xdr, lgp);
out:
kfree(lgp->lg_content);

View file

@ -257,6 +257,9 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
len = args->count = ntohl(*p++);
p++; /* totalcount - unused */
if (!xdr_argsize_check(rqstp, p))
return 0;
len = min_t(unsigned int, len, NFSSVC_MAXBLKSIZE_V2);
/* set up somewhere to store response.
@ -272,7 +275,7 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
v++;
}
args->vlen = v;
return xdr_argsize_check(rqstp, p);
return 1;
}
int
@ -362,9 +365,11 @@ nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readli
p = decode_fh(p, &args->fh);
if (!p)
return 0;
if (!xdr_argsize_check(rqstp, p))
return 0;
args->buffer = page_address(*(rqstp->rq_next_page++));
return xdr_argsize_check(rqstp, p);
return 1;
}
int
@ -402,9 +407,11 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
args->cookie = ntohl(*p++);
args->count = ntohl(*p++);
args->count = min_t(u32, args->count, PAGE_SIZE);
if (!xdr_argsize_check(rqstp, p))
return 0;
args->buffer = page_address(*(rqstp->rq_next_page++));
return xdr_argsize_check(rqstp, p);
return 1;
}
/*

View file

@ -94,6 +94,12 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
err = follow_down(&path);
if (err < 0)
goto out;
if (path.mnt == exp->ex_path.mnt && path.dentry == dentry &&
nfsd_mountpoint(dentry, exp) == 2) {
/* This is only a mountpoint in some other namespace */
path_put(&path);
goto out;
}
exp2 = rqst_exp_get_by_name(rqstp, &path);
if (IS_ERR(exp2)) {
@ -167,16 +173,26 @@ static int nfsd_lookup_parent(struct svc_rqst *rqstp, struct dentry *dparent, st
/*
* For nfsd purposes, we treat V4ROOT exports as though there was an
* export at *every* directory.
* We return:
* '1' if this dentry *must* be an export point,
* '2' if it might be, if there is really a mount here, and
* '0' if there is no chance of an export point here.
*/
int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp)
{
if (d_mountpoint(dentry))
if (!d_inode(dentry))
return 0;
if (exp->ex_flags & NFSEXP_V4ROOT)
return 1;
if (nfsd4_is_junction(dentry))
return 1;
if (!(exp->ex_flags & NFSEXP_V4ROOT))
return 0;
return d_inode(dentry) != NULL;
if (d_mountpoint(dentry))
/*
* Might only be a mountpoint in a different namespace,
* but we need to check.
*/
return 2;
return 0;
}
__be32

View file

@ -143,6 +143,9 @@ enum rpcrdma_proc {
#define rdma_done cpu_to_be32(RDMA_DONE)
#define rdma_error cpu_to_be32(RDMA_ERROR)
#define err_vers cpu_to_be32(ERR_VERS)
#define err_chunk cpu_to_be32(ERR_CHUNK)
/*
* Private extension to RPC-over-RDMA Version One.
* Message passed during RDMA-CM connection set-up.

View file

@ -336,8 +336,7 @@ xdr_argsize_check(struct svc_rqst *rqstp, __be32 *p)
{
char *cp = (char *)p;
struct kvec *vec = &rqstp->rq_arg.head[0];
return cp >= (char*)vec->iov_base
&& cp <= (char*)vec->iov_base + vec->iov_len;
return cp == (char *)vec->iov_base + vec->iov_len;
}
static inline int
@ -474,6 +473,7 @@ void svc_pool_map_put(void);
struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int,
struct svc_serv_ops *);
int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
int svc_set_num_threads_sync(struct svc_serv *, struct svc_pool *, int);
int svc_pool_stats_open(struct svc_serv *serv, struct file *file);
void svc_destroy(struct svc_serv *);
void svc_shutdown_net(struct svc_serv *, struct net *);

View file

@ -48,6 +48,12 @@
#include <rdma/rdma_cm.h>
#define SVCRDMA_DEBUG
/* Default and maximum inline threshold sizes */
enum {
RPCRDMA_DEF_INLINE_THRESH = 4096,
RPCRDMA_MAX_INLINE_THRESH = 65536
};
/* RPC/RDMA parameters and stats */
extern unsigned int svcrdma_ord;
extern unsigned int svcrdma_max_requests;
@ -85,27 +91,11 @@ struct svc_rdma_op_ctxt {
enum dma_data_direction direction;
int count;
unsigned int mapped_sges;
struct ib_sge sge[RPCSVC_MAXPAGES];
struct ib_send_wr send_wr;
struct ib_sge sge[1 + RPCRDMA_MAX_INLINE_THRESH / PAGE_SIZE];
struct page *pages[RPCSVC_MAXPAGES];
};
/*
* NFS_ requests are mapped on the client side by the chunk lists in
* the RPCRDMA header. During the fetching of the RPC from the client
* and the writing of the reply to the client, the memory in the
* client and the memory in the server must be mapped as contiguous
* vaddr/len for access by the hardware. These data strucures keep
* these mappings.
*
* For an RDMA_WRITE, the 'sge' maps the RPC REPLY. For RDMA_READ, the
* 'sge' in the svc_rdma_req_map maps the server side RPC reply and the
* 'ch' field maps the read-list of the RPCRDMA header to the 'sge'
* mapping of the reply.
*/
struct svc_rdma_chunk_sge {
int start; /* sge no for this chunk */
int count; /* sge count for this chunk */
};
struct svc_rdma_fastreg_mr {
struct ib_mr *mr;
struct scatterlist *sg;
@ -114,15 +104,7 @@ struct svc_rdma_fastreg_mr {
enum dma_data_direction direction;
struct list_head frmr_list;
};
struct svc_rdma_req_map {
struct list_head free;
unsigned long count;
union {
struct kvec sge[RPCSVC_MAXPAGES];
struct svc_rdma_chunk_sge ch[RPCSVC_MAXPAGES];
unsigned long lkey[RPCSVC_MAXPAGES];
};
};
#define RDMACTXT_F_LAST_CTXT 2
#define SVCRDMA_DEVCAP_FAST_REG 1 /* fast mr registration */
@ -144,14 +126,15 @@ struct svcxprt_rdma {
u32 sc_max_requests; /* Max requests */
u32 sc_max_bc_requests;/* Backward credits */
int sc_max_req_size; /* Size of each RQ WR buf */
u8 sc_port_num;
struct ib_pd *sc_pd;
spinlock_t sc_ctxt_lock;
struct list_head sc_ctxts;
int sc_ctxt_used;
spinlock_t sc_map_lock;
struct list_head sc_maps;
spinlock_t sc_rw_ctxt_lock;
struct list_head sc_rw_ctxts;
struct list_head sc_rq_dto_q;
spinlock_t sc_rq_dto_lock;
@ -181,9 +164,7 @@ struct svcxprt_rdma {
/* The default ORD value is based on two outstanding full-size writes with a
* page size of 4k, or 32k * 2 ops / 4k = 16 outstanding RDMA_READ. */
#define RPCRDMA_ORD (64/4)
#define RPCRDMA_SQ_DEPTH_MULT 8
#define RPCRDMA_MAX_REQUESTS 32
#define RPCRDMA_MAX_REQ_SIZE 4096
/* Typical ULP usage of BC requests is NFSv4.1 backchannel. Our
* current NFSv4.1 implementation supports one backchannel slot.
@ -201,19 +182,11 @@ static inline void svc_rdma_count_mappings(struct svcxprt_rdma *rdma,
/* svc_rdma_backchannel.c */
extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt,
struct rpcrdma_msg *rmsgp,
__be32 *rdma_resp,
struct xdr_buf *rcvbuf);
/* svc_rdma_marshal.c */
extern int svc_rdma_xdr_decode_req(struct xdr_buf *);
extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *,
struct rpcrdma_msg *,
enum rpcrdma_errcode, __be32 *);
extern void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *, int);
extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int);
extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int,
__be32, __be64, u32);
extern unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp);
/* svc_rdma_recvfrom.c */
extern int svc_rdma_recvfrom(struct svc_rqst *);
@ -224,16 +197,25 @@ extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *,
struct svc_rdma_op_ctxt *, int *, u32 *,
u32, u32, u64, bool);
/* svc_rdma_rw.c */
extern void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma);
extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma,
__be32 *wr_ch, struct xdr_buf *xdr);
extern int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma,
__be32 *rp_ch, bool writelist,
struct xdr_buf *xdr);
/* svc_rdma_sendto.c */
extern int svc_rdma_map_xdr(struct svcxprt_rdma *, struct xdr_buf *,
struct svc_rdma_req_map *, bool);
extern int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma,
struct svc_rdma_op_ctxt *ctxt,
__be32 *rdma_resp, unsigned int len);
extern int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma,
struct svc_rdma_op_ctxt *ctxt,
int num_sge, u32 inv_rkey);
extern int svc_rdma_sendto(struct svc_rqst *);
extern void svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *,
int);
/* svc_rdma_transport.c */
extern void svc_rdma_wc_send(struct ib_cq *, struct ib_wc *);
extern void svc_rdma_wc_write(struct ib_cq *, struct ib_wc *);
extern void svc_rdma_wc_reg(struct ib_cq *, struct ib_wc *);
extern void svc_rdma_wc_read(struct ib_cq *, struct ib_wc *);
extern void svc_rdma_wc_inv(struct ib_cq *, struct ib_wc *);
@ -244,9 +226,6 @@ extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *);
extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *);
extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int);
extern void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt);
extern struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *);
extern void svc_rdma_put_req_map(struct svcxprt_rdma *,
struct svc_rdma_req_map *);
extern struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *);
extern void svc_rdma_put_frmr(struct svcxprt_rdma *,
struct svc_rdma_fastreg_mr *);

View file

@ -22,6 +22,8 @@
#ifndef _NFSD_CLD_H
#define _NFSD_CLD_H
#include <linux/types.h>
/* latest upcall version available */
#define CLD_UPCALL_VERSION 1
@ -37,18 +39,18 @@ enum cld_command {
/* representation of long-form NFSv4 client ID */
struct cld_name {
uint16_t cn_len; /* length of cm_id */
__u16 cn_len; /* length of cm_id */
unsigned char cn_id[NFS4_OPAQUE_LIMIT]; /* client-provided */
} __attribute__((packed));
/* message struct for communication with userspace */
struct cld_msg {
uint8_t cm_vers; /* upcall version */
uint8_t cm_cmd; /* upcall command */
int16_t cm_status; /* return code */
uint32_t cm_xid; /* transaction id */
__u8 cm_vers; /* upcall version */
__u8 cm_cmd; /* upcall command */
__s16 cm_status; /* return code */
__u32 cm_xid; /* transaction id */
union {
int64_t cm_gracetime; /* grace period start time */
__s64 cm_gracetime; /* grace period start time */
struct cld_name cm_name;
} __attribute__((packed)) cm_u;
} __attribute__((packed));

View file

@ -52,6 +52,7 @@ config SUNRPC_XPRT_RDMA
tristate "RPC-over-RDMA transport"
depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS
default SUNRPC && INFINIBAND
select SG_POOL
help
This option allows the NFS client and server to use RDMA
transports (InfiniBand, iWARP, or RoCE).

View file

@ -702,6 +702,65 @@ found_pool:
return task;
}
/* create new threads */
static int
svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
{
struct svc_rqst *rqstp;
struct task_struct *task;
struct svc_pool *chosen_pool;
unsigned int state = serv->sv_nrthreads-1;
int node;
do {
nrservs--;
chosen_pool = choose_pool(serv, pool, &state);
node = svc_pool_map_get_node(chosen_pool->sp_id);
rqstp = svc_prepare_thread(serv, chosen_pool, node);
if (IS_ERR(rqstp))
return PTR_ERR(rqstp);
__module_get(serv->sv_ops->svo_module);
task = kthread_create_on_node(serv->sv_ops->svo_function, rqstp,
node, "%s", serv->sv_name);
if (IS_ERR(task)) {
module_put(serv->sv_ops->svo_module);
svc_exit_thread(rqstp);
return PTR_ERR(task);
}
rqstp->rq_task = task;
if (serv->sv_nrpools > 1)
svc_pool_map_set_cpumask(task, chosen_pool->sp_id);
svc_sock_update_bufs(serv);
wake_up_process(task);
} while (nrservs > 0);
return 0;
}
/* destroy old threads */
static int
svc_signal_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
{
struct task_struct *task;
unsigned int state = serv->sv_nrthreads-1;
/* destroy old threads */
do {
task = choose_victim(serv, pool, &state);
if (task == NULL)
break;
send_sig(SIGINT, task, 1);
nrservs++;
} while (nrservs < 0);
return 0;
}
/*
* Create or destroy enough new threads to make the number
* of threads the given number. If `pool' is non-NULL, applies
@ -719,13 +778,6 @@ found_pool:
int
svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
{
struct svc_rqst *rqstp;
struct task_struct *task;
struct svc_pool *chosen_pool;
int error = 0;
unsigned int state = serv->sv_nrthreads-1;
int node;
if (pool == NULL) {
/* The -1 assumes caller has done a svc_get() */
nrservs -= (serv->sv_nrthreads-1);
@ -735,46 +787,52 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
spin_unlock_bh(&pool->sp_lock);
}
/* create new threads */
while (nrservs > 0) {
nrservs--;
chosen_pool = choose_pool(serv, pool, &state);
node = svc_pool_map_get_node(chosen_pool->sp_id);
rqstp = svc_prepare_thread(serv, chosen_pool, node);
if (IS_ERR(rqstp)) {
error = PTR_ERR(rqstp);
break;
}
__module_get(serv->sv_ops->svo_module);
task = kthread_create_on_node(serv->sv_ops->svo_function, rqstp,
node, "%s", serv->sv_name);
if (IS_ERR(task)) {
error = PTR_ERR(task);
module_put(serv->sv_ops->svo_module);
svc_exit_thread(rqstp);
break;
}
rqstp->rq_task = task;
if (serv->sv_nrpools > 1)
svc_pool_map_set_cpumask(task, chosen_pool->sp_id);
svc_sock_update_bufs(serv);
wake_up_process(task);
}
/* destroy old threads */
while (nrservs < 0 &&
(task = choose_victim(serv, pool, &state)) != NULL) {
send_sig(SIGINT, task, 1);
nrservs++;
}
return error;
if (nrservs > 0)
return svc_start_kthreads(serv, pool, nrservs);
if (nrservs < 0)
return svc_signal_kthreads(serv, pool, nrservs);
return 0;
}
EXPORT_SYMBOL_GPL(svc_set_num_threads);
/* destroy old threads */
static int
svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
{
struct task_struct *task;
unsigned int state = serv->sv_nrthreads-1;
/* destroy old threads */
do {
task = choose_victim(serv, pool, &state);
if (task == NULL)
break;
kthread_stop(task);
nrservs++;
} while (nrservs < 0);
return 0;
}
int
svc_set_num_threads_sync(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
{
if (pool == NULL) {
/* The -1 assumes caller has done a svc_get() */
nrservs -= (serv->sv_nrthreads-1);
} else {
spin_lock_bh(&pool->sp_lock);
nrservs -= pool->sp_nrthreads;
spin_unlock_bh(&pool->sp_lock);
}
if (nrservs > 0)
return svc_start_kthreads(serv, pool, nrservs);
if (nrservs < 0)
return svc_stop_kthreads(serv, pool, nrservs);
return 0;
}
EXPORT_SYMBOL_GPL(svc_set_num_threads_sync);
/*
* Called from a server thread as it's exiting. Caller must hold the "service
* mutex" for the service.

View file

@ -4,5 +4,5 @@ rpcrdma-y := transport.o rpc_rdma.o verbs.o \
fmr_ops.o frwr_ops.o \
svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \
module.o
svc_rdma_rw.o module.o
rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o

View file

@ -58,9 +58,9 @@ unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS;
unsigned int svcrdma_max_bc_requests = RPCRDMA_MAX_BC_REQUESTS;
static unsigned int min_max_requests = 4;
static unsigned int max_max_requests = 16384;
unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE;
static unsigned int min_max_inline = 4096;
static unsigned int max_max_inline = 65536;
unsigned int svcrdma_max_req_size = RPCRDMA_DEF_INLINE_THRESH;
static unsigned int min_max_inline = RPCRDMA_DEF_INLINE_THRESH;
static unsigned int max_max_inline = RPCRDMA_MAX_INLINE_THRESH;
atomic_t rdma_stat_recv;
atomic_t rdma_stat_read;
@ -247,8 +247,6 @@ int svc_rdma_init(void)
dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord);
dprintk("\tmax_requests : %u\n", svcrdma_max_requests);
dprintk("\tsq_depth : %u\n",
svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT);
dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests);
dprintk("\tmax_inline : %d\n", svcrdma_max_req_size);

View file

@ -12,7 +12,17 @@
#undef SVCRDMA_BACKCHANNEL_DEBUG
int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp,
/**
* svc_rdma_handle_bc_reply - Process incoming backchannel reply
* @xprt: controlling backchannel transport
* @rdma_resp: pointer to incoming transport header
* @rcvbuf: XDR buffer into which to decode the reply
*
* Returns:
* %0 if @rcvbuf is filled in, xprt_complete_rqst called,
* %-EAGAIN if server should call ->recvfrom again.
*/
int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp,
struct xdr_buf *rcvbuf)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
@ -27,13 +37,13 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp,
p = (__be32 *)src->iov_base;
len = src->iov_len;
xid = rmsgp->rm_xid;
xid = *rdma_resp;
#ifdef SVCRDMA_BACKCHANNEL_DEBUG
pr_info("%s: xid=%08x, length=%zu\n",
__func__, be32_to_cpu(xid), len);
pr_info("%s: RPC/RDMA: %*ph\n",
__func__, (int)RPCRDMA_HDRLEN_MIN, rmsgp);
__func__, (int)RPCRDMA_HDRLEN_MIN, rdma_resp);
pr_info("%s: RPC: %*ph\n",
__func__, (int)len, p);
#endif
@ -53,7 +63,7 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp,
goto out_unlock;
memcpy(dst->iov_base, p, len);
credits = be32_to_cpu(rmsgp->rm_credit);
credits = be32_to_cpup(rdma_resp + 2);
if (credits == 0)
credits = 1; /* don't deadlock */
else if (credits > r_xprt->rx_buf.rb_bc_max_requests)
@ -90,9 +100,9 @@ out_notfound:
* Caller holds the connection's mutex and has already marshaled
* the RPC/RDMA request.
*
* This is similar to svc_rdma_reply, but takes an rpc_rqst
* instead, does not support chunks, and avoids blocking memory
* allocation.
* This is similar to svc_rdma_send_reply_msg, but takes a struct
* rpc_rqst instead, does not support chunks, and avoids blocking
* memory allocation.
*
* XXX: There is still an opportunity to block in svc_rdma_send()
* if there are no SQ entries to post the Send. This may occur if
@ -101,59 +111,36 @@ out_notfound:
static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
struct rpc_rqst *rqst)
{
struct xdr_buf *sndbuf = &rqst->rq_snd_buf;
struct svc_rdma_op_ctxt *ctxt;
struct svc_rdma_req_map *vec;
struct ib_send_wr send_wr;
int ret;
vec = svc_rdma_get_req_map(rdma);
ret = svc_rdma_map_xdr(rdma, sndbuf, vec, false);
if (ret)
ctxt = svc_rdma_get_context(rdma);
/* rpcrdma_bc_send_request builds the transport header and
* the backchannel RPC message in the same buffer. Thus only
* one SGE is needed to send both.
*/
ret = svc_rdma_map_reply_hdr(rdma, ctxt, rqst->rq_buffer,
rqst->rq_snd_buf.len);
if (ret < 0)
goto out_err;
ret = svc_rdma_repost_recv(rdma, GFP_NOIO);
if (ret)
goto out_err;
ctxt = svc_rdma_get_context(rdma);
ctxt->pages[0] = virt_to_page(rqst->rq_buffer);
ctxt->count = 1;
ctxt->direction = DMA_TO_DEVICE;
ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey;
ctxt->sge[0].length = sndbuf->len;
ctxt->sge[0].addr =
ib_dma_map_page(rdma->sc_cm_id->device, ctxt->pages[0], 0,
sndbuf->len, DMA_TO_DEVICE);
if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) {
ret = -EIO;
ret = svc_rdma_post_send_wr(rdma, ctxt, 1, 0);
if (ret)
goto out_unmap;
}
svc_rdma_count_mappings(rdma, ctxt);
memset(&send_wr, 0, sizeof(send_wr));
ctxt->cqe.done = svc_rdma_wc_send;
send_wr.wr_cqe = &ctxt->cqe;
send_wr.sg_list = ctxt->sge;
send_wr.num_sge = 1;
send_wr.opcode = IB_WR_SEND;
send_wr.send_flags = IB_SEND_SIGNALED;
ret = svc_rdma_send(rdma, &send_wr);
if (ret) {
ret = -EIO;
goto out_unmap;
}
out_err:
svc_rdma_put_req_map(rdma, vec);
dprintk("svcrdma: %s returns %d\n", __func__, ret);
return ret;
out_unmap:
svc_rdma_unmap_dma(ctxt);
svc_rdma_put_context(ctxt, 1);
ret = -EIO;
goto out_err;
}

View file

@ -166,92 +166,3 @@ out_inval:
dprintk("svcrdma: failed to parse transport header\n");
return -EINVAL;
}
int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt,
struct rpcrdma_msg *rmsgp,
enum rpcrdma_errcode err, __be32 *va)
{
__be32 *startp = va;
*va++ = rmsgp->rm_xid;
*va++ = rmsgp->rm_vers;
*va++ = xprt->sc_fc_credits;
*va++ = rdma_error;
*va++ = cpu_to_be32(err);
if (err == ERR_VERS) {
*va++ = rpcrdma_version;
*va++ = rpcrdma_version;
}
return (int)((unsigned long)va - (unsigned long)startp);
}
/**
* svc_rdma_xdr_get_reply_hdr_length - Get length of Reply transport header
* @rdma_resp: buffer containing Reply transport header
*
* Returns length of transport header, in bytes.
*/
unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp)
{
unsigned int nsegs;
__be32 *p;
p = rdma_resp;
/* RPC-over-RDMA V1 replies never have a Read list. */
p += rpcrdma_fixed_maxsz + 1;
/* Skip Write list. */
while (*p++ != xdr_zero) {
nsegs = be32_to_cpup(p++);
p += nsegs * rpcrdma_segment_maxsz;
}
/* Skip Reply chunk. */
if (*p++ != xdr_zero) {
nsegs = be32_to_cpup(p++);
p += nsegs * rpcrdma_segment_maxsz;
}
return (unsigned long)p - (unsigned long)rdma_resp;
}
void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks)
{
struct rpcrdma_write_array *ary;
/* no read-list */
rmsgp->rm_body.rm_chunks[0] = xdr_zero;
/* write-array discrim */
ary = (struct rpcrdma_write_array *)
&rmsgp->rm_body.rm_chunks[1];
ary->wc_discrim = xdr_one;
ary->wc_nchunks = cpu_to_be32(chunks);
/* write-list terminator */
ary->wc_array[chunks].wc_target.rs_handle = xdr_zero;
/* reply-array discriminator */
ary->wc_array[chunks].wc_target.rs_length = xdr_zero;
}
void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary,
int chunks)
{
ary->wc_discrim = xdr_one;
ary->wc_nchunks = cpu_to_be32(chunks);
}
void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary,
int chunk_no,
__be32 rs_handle,
__be64 rs_offset,
u32 write_len)
{
struct rpcrdma_segment *seg = &ary->wc_array[chunk_no].wc_target;
seg->rs_handle = rs_handle;
seg->rs_offset = rs_offset;
seg->rs_length = cpu_to_be32(write_len);
}

View file

@ -558,33 +558,85 @@ static void rdma_read_complete(struct svc_rqst *rqstp,
rqstp->rq_arg.buflen = head->arg.buflen;
}
static void svc_rdma_send_error(struct svcxprt_rdma *xprt,
__be32 *rdma_argp, int status)
{
struct svc_rdma_op_ctxt *ctxt;
__be32 *p, *err_msgp;
unsigned int length;
struct page *page;
int ret;
ret = svc_rdma_repost_recv(xprt, GFP_KERNEL);
if (ret)
return;
page = alloc_page(GFP_KERNEL);
if (!page)
return;
err_msgp = page_address(page);
p = err_msgp;
*p++ = *rdma_argp;
*p++ = *(rdma_argp + 1);
*p++ = xprt->sc_fc_credits;
*p++ = rdma_error;
if (status == -EPROTONOSUPPORT) {
*p++ = err_vers;
*p++ = rpcrdma_version;
*p++ = rpcrdma_version;
} else {
*p++ = err_chunk;
}
length = (unsigned long)p - (unsigned long)err_msgp;
/* Map transport header; no RPC message payload */
ctxt = svc_rdma_get_context(xprt);
ret = svc_rdma_map_reply_hdr(xprt, ctxt, err_msgp, length);
if (ret) {
dprintk("svcrdma: Error %d mapping send for protocol error\n",
ret);
return;
}
ret = svc_rdma_post_send_wr(xprt, ctxt, 1, 0);
if (ret) {
dprintk("svcrdma: Error %d posting send for protocol error\n",
ret);
svc_rdma_unmap_dma(ctxt);
svc_rdma_put_context(ctxt, 1);
}
}
/* By convention, backchannel calls arrive via rdma_msg type
* messages, and never populate the chunk lists. This makes
* the RPC/RDMA header small and fixed in size, so it is
* straightforward to check the RPC header's direction field.
*/
static bool
svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, struct rpcrdma_msg *rmsgp)
static bool svc_rdma_is_backchannel_reply(struct svc_xprt *xprt,
__be32 *rdma_resp)
{
__be32 *p = (__be32 *)rmsgp;
__be32 *p;
if (!xprt->xpt_bc_xprt)
return false;
if (rmsgp->rm_type != rdma_msg)
return false;
if (rmsgp->rm_body.rm_chunks[0] != xdr_zero)
return false;
if (rmsgp->rm_body.rm_chunks[1] != xdr_zero)
return false;
if (rmsgp->rm_body.rm_chunks[2] != xdr_zero)
p = rdma_resp + 3;
if (*p++ != rdma_msg)
return false;
/* sanity */
if (p[7] != rmsgp->rm_xid)
if (*p++ != xdr_zero)
return false;
if (*p++ != xdr_zero)
return false;
if (*p++ != xdr_zero)
return false;
/* XID sanity */
if (*p++ != *rdma_resp)
return false;
/* call direction */
if (p[8] == cpu_to_be32(RPC_CALL))
if (*p == cpu_to_be32(RPC_CALL))
return false;
return true;
@ -650,8 +702,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
goto out_drop;
rqstp->rq_xprt_hlen = ret;
if (svc_rdma_is_backchannel_reply(xprt, rmsgp)) {
ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, rmsgp,
if (svc_rdma_is_backchannel_reply(xprt, &rmsgp->rm_xid)) {
ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt,
&rmsgp->rm_xid,
&rqstp->rq_arg);
svc_rdma_put_context(ctxt, 0);
if (ret)
@ -686,7 +739,7 @@ complete:
return ret;
out_err:
svc_rdma_send_error(rdma_xprt, rmsgp, ret);
svc_rdma_send_error(rdma_xprt, &rmsgp->rm_xid, ret);
svc_rdma_put_context(ctxt, 0);
return 0;

View file

@ -0,0 +1,512 @@
/*
* Copyright (c) 2016 Oracle. All rights reserved.
*
* Use the core R/W API to move RPC-over-RDMA Read and Write chunks.
*/
#include <linux/sunrpc/rpc_rdma.h>
#include <linux/sunrpc/svc_rdma.h>
#include <linux/sunrpc/debug.h>
#include <rdma/rw.h>
#define RPCDBG_FACILITY RPCDBG_SVCXPRT
/* Each R/W context contains state for one chain of RDMA Read or
* Write Work Requests.
*
* Each WR chain handles a single contiguous server-side buffer,
* because scatterlist entries after the first have to start on
* page alignment. xdr_buf iovecs cannot guarantee alignment.
*
* Each WR chain handles only one R_key. Each RPC-over-RDMA segment
* from a client may contain a unique R_key, so each WR chain moves
* up to one segment at a time.
*
* The scatterlist makes this data structure over 4KB in size. To
* make it less likely to fail, and to handle the allocation for
* smaller I/O requests without disabling bottom-halves, these
* contexts are created on demand, but cached and reused until the
* controlling svcxprt_rdma is destroyed.
*/
struct svc_rdma_rw_ctxt {
struct list_head rw_list;
struct rdma_rw_ctx rw_ctx;
int rw_nents;
struct sg_table rw_sg_table;
struct scatterlist rw_first_sgl[0];
};
static inline struct svc_rdma_rw_ctxt *
svc_rdma_next_ctxt(struct list_head *list)
{
return list_first_entry_or_null(list, struct svc_rdma_rw_ctxt,
rw_list);
}
static struct svc_rdma_rw_ctxt *
svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges)
{
struct svc_rdma_rw_ctxt *ctxt;
spin_lock(&rdma->sc_rw_ctxt_lock);
ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts);
if (ctxt) {
list_del(&ctxt->rw_list);
spin_unlock(&rdma->sc_rw_ctxt_lock);
} else {
spin_unlock(&rdma->sc_rw_ctxt_lock);
ctxt = kmalloc(sizeof(*ctxt) +
SG_CHUNK_SIZE * sizeof(struct scatterlist),
GFP_KERNEL);
if (!ctxt)
goto out;
INIT_LIST_HEAD(&ctxt->rw_list);
}
ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl;
if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges,
ctxt->rw_sg_table.sgl)) {
kfree(ctxt);
ctxt = NULL;
}
out:
return ctxt;
}
static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma,
struct svc_rdma_rw_ctxt *ctxt)
{
sg_free_table_chained(&ctxt->rw_sg_table, true);
spin_lock(&rdma->sc_rw_ctxt_lock);
list_add(&ctxt->rw_list, &rdma->sc_rw_ctxts);
spin_unlock(&rdma->sc_rw_ctxt_lock);
}
/**
* svc_rdma_destroy_rw_ctxts - Free accumulated R/W contexts
* @rdma: transport about to be destroyed
*
*/
void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma)
{
struct svc_rdma_rw_ctxt *ctxt;
while ((ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts)) != NULL) {
list_del(&ctxt->rw_list);
kfree(ctxt);
}
}
/* A chunk context tracks all I/O for moving one Read or Write
* chunk. This is a a set of rdma_rw's that handle data movement
* for all segments of one chunk.
*
* These are small, acquired with a single allocator call, and
* no more than one is needed per chunk. They are allocated on
* demand, and not cached.
*/
struct svc_rdma_chunk_ctxt {
struct ib_cqe cc_cqe;
struct svcxprt_rdma *cc_rdma;
struct list_head cc_rwctxts;
int cc_sqecount;
enum dma_data_direction cc_dir;
};
static void svc_rdma_cc_init(struct svcxprt_rdma *rdma,
struct svc_rdma_chunk_ctxt *cc,
enum dma_data_direction dir)
{
cc->cc_rdma = rdma;
svc_xprt_get(&rdma->sc_xprt);
INIT_LIST_HEAD(&cc->cc_rwctxts);
cc->cc_sqecount = 0;
cc->cc_dir = dir;
}
static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc)
{
struct svcxprt_rdma *rdma = cc->cc_rdma;
struct svc_rdma_rw_ctxt *ctxt;
while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) {
list_del(&ctxt->rw_list);
rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp,
rdma->sc_port_num, ctxt->rw_sg_table.sgl,
ctxt->rw_nents, cc->cc_dir);
svc_rdma_put_rw_ctxt(rdma, ctxt);
}
svc_xprt_put(&rdma->sc_xprt);
}
/* State for sending a Write or Reply chunk.
* - Tracks progress of writing one chunk over all its segments
* - Stores arguments for the SGL constructor functions
*/
struct svc_rdma_write_info {
/* write state of this chunk */
unsigned int wi_seg_off;
unsigned int wi_seg_no;
unsigned int wi_nsegs;
__be32 *wi_segs;
/* SGL constructor arguments */
struct xdr_buf *wi_xdr;
unsigned char *wi_base;
unsigned int wi_next_off;
struct svc_rdma_chunk_ctxt wi_cc;
};
static struct svc_rdma_write_info *
svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk)
{
struct svc_rdma_write_info *info;
info = kmalloc(sizeof(*info), GFP_KERNEL);
if (!info)
return info;
info->wi_seg_off = 0;
info->wi_seg_no = 0;
info->wi_nsegs = be32_to_cpup(++chunk);
info->wi_segs = ++chunk;
svc_rdma_cc_init(rdma, &info->wi_cc, DMA_TO_DEVICE);
return info;
}
static void svc_rdma_write_info_free(struct svc_rdma_write_info *info)
{
svc_rdma_cc_release(&info->wi_cc);
kfree(info);
}
/**
* svc_rdma_write_done - Write chunk completion
* @cq: controlling Completion Queue
* @wc: Work Completion
*
* Pages under I/O are freed by a subsequent Send completion.
*/
static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
{
struct ib_cqe *cqe = wc->wr_cqe;
struct svc_rdma_chunk_ctxt *cc =
container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe);
struct svcxprt_rdma *rdma = cc->cc_rdma;
struct svc_rdma_write_info *info =
container_of(cc, struct svc_rdma_write_info, wi_cc);
atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
wake_up(&rdma->sc_send_wait);
if (unlikely(wc->status != IB_WC_SUCCESS)) {
set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
if (wc->status != IB_WC_WR_FLUSH_ERR)
pr_err("svcrdma: write ctx: %s (%u/0x%x)\n",
ib_wc_status_msg(wc->status),
wc->status, wc->vendor_err);
}
svc_rdma_write_info_free(info);
}
/* This function sleeps when the transport's Send Queue is congested.
*
* Assumptions:
* - If ib_post_send() succeeds, only one completion is expected,
* even if one or more WRs are flushed. This is true when posting
* an rdma_rw_ctx or when posting a single signaled WR.
*/
static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc)
{
struct svcxprt_rdma *rdma = cc->cc_rdma;
struct svc_xprt *xprt = &rdma->sc_xprt;
struct ib_send_wr *first_wr, *bad_wr;
struct list_head *tmp;
struct ib_cqe *cqe;
int ret;
first_wr = NULL;
cqe = &cc->cc_cqe;
list_for_each(tmp, &cc->cc_rwctxts) {
struct svc_rdma_rw_ctxt *ctxt;
ctxt = list_entry(tmp, struct svc_rdma_rw_ctxt, rw_list);
first_wr = rdma_rw_ctx_wrs(&ctxt->rw_ctx, rdma->sc_qp,
rdma->sc_port_num, cqe, first_wr);
cqe = NULL;
}
do {
if (atomic_sub_return(cc->cc_sqecount,
&rdma->sc_sq_avail) > 0) {
ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
if (ret)
break;
return 0;
}
atomic_inc(&rdma_stat_sq_starve);
atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
wait_event(rdma->sc_send_wait,
atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount);
} while (1);
pr_err("svcrdma: ib_post_send failed (%d)\n", ret);
set_bit(XPT_CLOSE, &xprt->xpt_flags);
/* If even one was posted, there will be a completion. */
if (bad_wr != first_wr)
return 0;
atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
wake_up(&rdma->sc_send_wait);
return -ENOTCONN;
}
/* Build and DMA-map an SGL that covers one kvec in an xdr_buf
*/
static void svc_rdma_vec_to_sg(struct svc_rdma_write_info *info,
unsigned int len,
struct svc_rdma_rw_ctxt *ctxt)
{
struct scatterlist *sg = ctxt->rw_sg_table.sgl;
sg_set_buf(&sg[0], info->wi_base, len);
info->wi_base += len;
ctxt->rw_nents = 1;
}
/* Build and DMA-map an SGL that covers part of an xdr_buf's pagelist.
*/
static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info,
unsigned int remaining,
struct svc_rdma_rw_ctxt *ctxt)
{
unsigned int sge_no, sge_bytes, page_off, page_no;
struct xdr_buf *xdr = info->wi_xdr;
struct scatterlist *sg;
struct page **page;
page_off = (info->wi_next_off + xdr->page_base) & ~PAGE_MASK;
page_no = (info->wi_next_off + xdr->page_base) >> PAGE_SHIFT;
page = xdr->pages + page_no;
info->wi_next_off += remaining;
sg = ctxt->rw_sg_table.sgl;
sge_no = 0;
do {
sge_bytes = min_t(unsigned int, remaining,
PAGE_SIZE - page_off);
sg_set_page(sg, *page, sge_bytes, page_off);
remaining -= sge_bytes;
sg = sg_next(sg);
page_off = 0;
sge_no++;
page++;
} while (remaining);
ctxt->rw_nents = sge_no;
}
/* Construct RDMA Write WRs to send a portion of an xdr_buf containing
* an RPC Reply.
*/
static int
svc_rdma_build_writes(struct svc_rdma_write_info *info,
void (*constructor)(struct svc_rdma_write_info *info,
unsigned int len,
struct svc_rdma_rw_ctxt *ctxt),
unsigned int remaining)
{
struct svc_rdma_chunk_ctxt *cc = &info->wi_cc;
struct svcxprt_rdma *rdma = cc->cc_rdma;
struct svc_rdma_rw_ctxt *ctxt;
__be32 *seg;
int ret;
cc->cc_cqe.done = svc_rdma_write_done;
seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz;
do {
unsigned int write_len;
u32 seg_length, seg_handle;
u64 seg_offset;
if (info->wi_seg_no >= info->wi_nsegs)
goto out_overflow;
seg_handle = be32_to_cpup(seg);
seg_length = be32_to_cpup(seg + 1);
xdr_decode_hyper(seg + 2, &seg_offset);
seg_offset += info->wi_seg_off;
write_len = min(remaining, seg_length - info->wi_seg_off);
ctxt = svc_rdma_get_rw_ctxt(rdma,
(write_len >> PAGE_SHIFT) + 2);
if (!ctxt)
goto out_noctx;
constructor(info, write_len, ctxt);
ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp,
rdma->sc_port_num, ctxt->rw_sg_table.sgl,
ctxt->rw_nents, 0, seg_offset,
seg_handle, DMA_TO_DEVICE);
if (ret < 0)
goto out_initerr;
list_add(&ctxt->rw_list, &cc->cc_rwctxts);
cc->cc_sqecount += ret;
if (write_len == seg_length - info->wi_seg_off) {
seg += 4;
info->wi_seg_no++;
info->wi_seg_off = 0;
} else {
info->wi_seg_off += write_len;
}
remaining -= write_len;
} while (remaining);
return 0;
out_overflow:
dprintk("svcrdma: inadequate space in Write chunk (%u)\n",
info->wi_nsegs);
return -E2BIG;
out_noctx:
dprintk("svcrdma: no R/W ctxs available\n");
return -ENOMEM;
out_initerr:
svc_rdma_put_rw_ctxt(rdma, ctxt);
pr_err("svcrdma: failed to map pagelist (%d)\n", ret);
return -EIO;
}
/* Send one of an xdr_buf's kvecs by itself. To send a Reply
* chunk, the whole RPC Reply is written back to the client.
* This function writes either the head or tail of the xdr_buf
* containing the Reply.
*/
static int svc_rdma_send_xdr_kvec(struct svc_rdma_write_info *info,
struct kvec *vec)
{
info->wi_base = vec->iov_base;
return svc_rdma_build_writes(info, svc_rdma_vec_to_sg,
vec->iov_len);
}
/* Send an xdr_buf's page list by itself. A Write chunk is
* just the page list. a Reply chunk is the head, page list,
* and tail. This function is shared between the two types
* of chunk.
*/
static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info,
struct xdr_buf *xdr)
{
info->wi_xdr = xdr;
info->wi_next_off = 0;
return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg,
xdr->page_len);
}
/**
* svc_rdma_send_write_chunk - Write all segments in a Write chunk
* @rdma: controlling RDMA transport
* @wr_ch: Write chunk provided by client
* @xdr: xdr_buf containing the data payload
*
* Returns a non-negative number of bytes the chunk consumed, or
* %-E2BIG if the payload was larger than the Write chunk,
* %-ENOMEM if rdma_rw context pool was exhausted,
* %-ENOTCONN if posting failed (connection is lost),
* %-EIO if rdma_rw initialization failed (DMA mapping, etc).
*/
int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch,
struct xdr_buf *xdr)
{
struct svc_rdma_write_info *info;
int ret;
if (!xdr->page_len)
return 0;
info = svc_rdma_write_info_alloc(rdma, wr_ch);
if (!info)
return -ENOMEM;
ret = svc_rdma_send_xdr_pagelist(info, xdr);
if (ret < 0)
goto out_err;
ret = svc_rdma_post_chunk_ctxt(&info->wi_cc);
if (ret < 0)
goto out_err;
return xdr->page_len;
out_err:
svc_rdma_write_info_free(info);
return ret;
}
/**
* svc_rdma_send_reply_chunk - Write all segments in the Reply chunk
* @rdma: controlling RDMA transport
* @rp_ch: Reply chunk provided by client
* @writelist: true if client provided a Write list
* @xdr: xdr_buf containing an RPC Reply
*
* Returns a non-negative number of bytes the chunk consumed, or
* %-E2BIG if the payload was larger than the Reply chunk,
* %-ENOMEM if rdma_rw context pool was exhausted,
* %-ENOTCONN if posting failed (connection is lost),
* %-EIO if rdma_rw initialization failed (DMA mapping, etc).
*/
int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, __be32 *rp_ch,
bool writelist, struct xdr_buf *xdr)
{
struct svc_rdma_write_info *info;
int consumed, ret;
info = svc_rdma_write_info_alloc(rdma, rp_ch);
if (!info)
return -ENOMEM;
ret = svc_rdma_send_xdr_kvec(info, &xdr->head[0]);
if (ret < 0)
goto out_err;
consumed = xdr->head[0].iov_len;
/* Send the page list in the Reply chunk only if the
* client did not provide Write chunks.
*/
if (!writelist && xdr->page_len) {
ret = svc_rdma_send_xdr_pagelist(info, xdr);
if (ret < 0)
goto out_err;
consumed += xdr->page_len;
}
if (xdr->tail[0].iov_len) {
ret = svc_rdma_send_xdr_kvec(info, &xdr->tail[0]);
if (ret < 0)
goto out_err;
consumed += xdr->tail[0].iov_len;
}
ret = svc_rdma_post_chunk_ctxt(&info->wi_cc);
if (ret < 0)
goto out_err;
return consumed;
out_err:
svc_rdma_write_info_free(info);
return ret;
}

File diff suppressed because it is too large Load diff

View file

@ -272,85 +272,6 @@ static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
}
}
static struct svc_rdma_req_map *alloc_req_map(gfp_t flags)
{
struct svc_rdma_req_map *map;
map = kmalloc(sizeof(*map), flags);
if (map)
INIT_LIST_HEAD(&map->free);
return map;
}
static bool svc_rdma_prealloc_maps(struct svcxprt_rdma *xprt)
{
unsigned int i;
/* One for each receive buffer on this connection. */
i = xprt->sc_max_requests;
while (i--) {
struct svc_rdma_req_map *map;
map = alloc_req_map(GFP_KERNEL);
if (!map) {
dprintk("svcrdma: No memory for request map\n");
return false;
}
list_add(&map->free, &xprt->sc_maps);
}
return true;
}
struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *xprt)
{
struct svc_rdma_req_map *map = NULL;
spin_lock(&xprt->sc_map_lock);
if (list_empty(&xprt->sc_maps))
goto out_empty;
map = list_first_entry(&xprt->sc_maps,
struct svc_rdma_req_map, free);
list_del_init(&map->free);
spin_unlock(&xprt->sc_map_lock);
out:
map->count = 0;
return map;
out_empty:
spin_unlock(&xprt->sc_map_lock);
/* Pre-allocation amount was incorrect */
map = alloc_req_map(GFP_NOIO);
if (map)
goto out;
WARN_ONCE(1, "svcrdma: empty request map list?\n");
return NULL;
}
void svc_rdma_put_req_map(struct svcxprt_rdma *xprt,
struct svc_rdma_req_map *map)
{
spin_lock(&xprt->sc_map_lock);
list_add(&map->free, &xprt->sc_maps);
spin_unlock(&xprt->sc_map_lock);
}
static void svc_rdma_destroy_maps(struct svcxprt_rdma *xprt)
{
while (!list_empty(&xprt->sc_maps)) {
struct svc_rdma_req_map *map;
map = list_first_entry(&xprt->sc_maps,
struct svc_rdma_req_map, free);
list_del(&map->free);
kfree(map);
}
}
/* QP event handler */
static void qp_event_handler(struct ib_event *event, void *context)
{
@ -473,24 +394,6 @@ void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
svc_rdma_put_context(ctxt, 1);
}
/**
* svc_rdma_wc_write - Invoked by RDMA provider for each polled Write WC
* @cq: completion queue
* @wc: completed WR
*
*/
void svc_rdma_wc_write(struct ib_cq *cq, struct ib_wc *wc)
{
struct ib_cqe *cqe = wc->wr_cqe;
struct svc_rdma_op_ctxt *ctxt;
svc_rdma_send_wc_common_put(cq, wc, "write");
ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe);
svc_rdma_unmap_dma(ctxt);
svc_rdma_put_context(ctxt, 0);
}
/**
* svc_rdma_wc_reg - Invoked by RDMA provider for each polled FASTREG WC
* @cq: completion queue
@ -561,14 +464,14 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
INIT_LIST_HEAD(&cma_xprt->sc_frmr_q);
INIT_LIST_HEAD(&cma_xprt->sc_ctxts);
INIT_LIST_HEAD(&cma_xprt->sc_maps);
INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts);
init_waitqueue_head(&cma_xprt->sc_send_wait);
spin_lock_init(&cma_xprt->sc_lock);
spin_lock_init(&cma_xprt->sc_rq_dto_lock);
spin_lock_init(&cma_xprt->sc_frmr_q_lock);
spin_lock_init(&cma_xprt->sc_ctxt_lock);
spin_lock_init(&cma_xprt->sc_map_lock);
spin_lock_init(&cma_xprt->sc_rw_ctxt_lock);
/*
* Note that this implies that the underlying transport support
@ -999,6 +902,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
newxprt, newxprt->sc_cm_id);
dev = newxprt->sc_cm_id->device;
newxprt->sc_port_num = newxprt->sc_cm_id->port_num;
/* Qualify the transport resource defaults with the
* capabilities of this particular device */
@ -1014,13 +918,11 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
svcrdma_max_bc_requests);
newxprt->sc_rq_depth = newxprt->sc_max_requests +
newxprt->sc_max_bc_requests;
newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth;
newxprt->sc_sq_depth = newxprt->sc_rq_depth;
atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth);
if (!svc_rdma_prealloc_ctxts(newxprt))
goto errout;
if (!svc_rdma_prealloc_maps(newxprt))
goto errout;
/*
* Limit ORD based on client limit, local device limit, and
@ -1050,6 +952,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
memset(&qp_attr, 0, sizeof qp_attr);
qp_attr.event_handler = qp_event_handler;
qp_attr.qp_context = &newxprt->sc_xprt;
qp_attr.port_num = newxprt->sc_cm_id->port_num;
qp_attr.cap.max_rdma_ctxs = newxprt->sc_max_requests;
qp_attr.cap.max_send_wr = newxprt->sc_sq_depth;
qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth;
qp_attr.cap.max_send_sge = newxprt->sc_max_sge;
@ -1248,8 +1152,8 @@ static void __svc_rdma_free(struct work_struct *work)
}
rdma_dealloc_frmr_q(rdma);
svc_rdma_destroy_rw_ctxts(rdma);
svc_rdma_destroy_ctxts(rdma);
svc_rdma_destroy_maps(rdma);
/* Destroy the QP if present (not a listener) */
if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))