ceph: clean up unsafe d_parent access in __choose_mds
__choose_mds exists to pick an MDS to use when issuing a call. Doing that typically involves picking an inode and using the authoritative MDS for it. In most cases, that's pretty straightforward, as we are using an inode to which we hold a reference (usually represented by r_dentry or r_inode in the request). In the case of a snapshotted directory however, we need to fetch the non-snapped parent, which involves walking back up the parents in the tree. The dentries in the snapshot dir are effectively frozen but the overall parent is _not_, and could vanish if a concurrent rename were to occur. Clean this code up and take special care to ensure the validity of the entries we're working with. First, try to use the inode in r_locked_dir if one exists. If not and all we have is r_dentry, then we have to walk back up the tree. Use the rcu_read_lock for this so we can ensure that any d_parent we find won't go away, and take extra care to deal with the possibility that the dentries could go negative. Change get_nonsnap_parent to return an inode, and take a reference to that inode before returning (if any). Change all of the other places where we set "inode" in __choose_mds to also take a reference, and then call iput on that inode before exiting the function. Link: http://tracker.ceph.com/issues/18148 Signed-off-by: Jeff Layton <jlayton@redhat.com> Reviewed-by: Yan, Zheng <zyan@redhat.com> Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
This commit is contained in:
parent
c470abd4fd
commit
30c71233a1
1 changed files with 42 additions and 22 deletions
|
@ -667,6 +667,27 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
|
|||
ceph_mdsc_put_request(req);
|
||||
}
|
||||
|
||||
/*
|
||||
* Walk back up the dentry tree until we hit a dentry representing a
|
||||
* non-snapshot inode. We do this using the rcu_read_lock (which must be held
|
||||
* when calling this) to ensure that the objects won't disappear while we're
|
||||
* working with them. Once we hit a candidate dentry, we attempt to take a
|
||||
* reference to it, and return that as the result.
|
||||
*/
|
||||
static struct inode *get_nonsnap_parent(struct dentry *dentry) { struct inode
|
||||
*inode = NULL;
|
||||
|
||||
while (dentry && !IS_ROOT(dentry)) {
|
||||
inode = d_inode_rcu(dentry);
|
||||
if (!inode || ceph_snap(inode) == CEPH_NOSNAP)
|
||||
break;
|
||||
dentry = dentry->d_parent;
|
||||
}
|
||||
if (inode)
|
||||
inode = igrab(inode);
|
||||
return inode;
|
||||
}
|
||||
|
||||
/*
|
||||
* Choose mds to send request to next. If there is a hint set in the
|
||||
* request (e.g., due to a prior forward hint from the mds), use that.
|
||||
|
@ -675,19 +696,6 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
|
|||
*
|
||||
* Called under mdsc->mutex.
|
||||
*/
|
||||
static struct dentry *get_nonsnap_parent(struct dentry *dentry)
|
||||
{
|
||||
/*
|
||||
* we don't need to worry about protecting the d_parent access
|
||||
* here because we never renaming inside the snapped namespace
|
||||
* except to resplice to another snapdir, and either the old or new
|
||||
* result is a valid result.
|
||||
*/
|
||||
while (!IS_ROOT(dentry) && ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
|
||||
dentry = dentry->d_parent;
|
||||
return dentry;
|
||||
}
|
||||
|
||||
static int __choose_mds(struct ceph_mds_client *mdsc,
|
||||
struct ceph_mds_request *req)
|
||||
{
|
||||
|
@ -717,30 +725,39 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
|
|||
inode = NULL;
|
||||
if (req->r_inode) {
|
||||
inode = req->r_inode;
|
||||
ihold(inode);
|
||||
} else if (req->r_dentry) {
|
||||
/* ignore race with rename; old or new d_parent is okay */
|
||||
struct dentry *parent = req->r_dentry->d_parent;
|
||||
struct inode *dir = d_inode(parent);
|
||||
struct dentry *parent;
|
||||
struct inode *dir;
|
||||
|
||||
if (dir->i_sb != mdsc->fsc->sb) {
|
||||
/* not this fs! */
|
||||
rcu_read_lock();
|
||||
parent = req->r_dentry->d_parent;
|
||||
dir = req->r_locked_dir ? : d_inode_rcu(parent);
|
||||
|
||||
if (!dir || dir->i_sb != mdsc->fsc->sb) {
|
||||
/* not this fs or parent went negative */
|
||||
inode = d_inode(req->r_dentry);
|
||||
if (inode)
|
||||
ihold(inode);
|
||||
} else if (ceph_snap(dir) != CEPH_NOSNAP) {
|
||||
/* direct snapped/virtual snapdir requests
|
||||
* based on parent dir inode */
|
||||
struct dentry *dn = get_nonsnap_parent(parent);
|
||||
inode = d_inode(dn);
|
||||
inode = get_nonsnap_parent(parent);
|
||||
dout("__choose_mds using nonsnap parent %p\n", inode);
|
||||
} else {
|
||||
/* dentry target */
|
||||
inode = d_inode(req->r_dentry);
|
||||
if (!inode || mode == USE_AUTH_MDS) {
|
||||
/* dir + name */
|
||||
inode = dir;
|
||||
inode = igrab(dir);
|
||||
hash = ceph_dentry_hash(dir, req->r_dentry);
|
||||
is_hash = true;
|
||||
} else {
|
||||
ihold(inode);
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
|
||||
|
@ -769,7 +786,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
|
|||
(int)r, frag.ndist);
|
||||
if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
|
||||
CEPH_MDS_STATE_ACTIVE)
|
||||
return mds;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* since this file/dir wasn't known to be
|
||||
|
@ -784,7 +801,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
|
|||
inode, ceph_vinop(inode), frag.frag, mds);
|
||||
if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
|
||||
CEPH_MDS_STATE_ACTIVE)
|
||||
return mds;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -797,6 +814,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
|
|||
cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
|
||||
if (!cap) {
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
iput(inode);
|
||||
goto random;
|
||||
}
|
||||
mds = cap->session->s_mds;
|
||||
|
@ -804,6 +822,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
|
|||
inode, ceph_vinop(inode), mds,
|
||||
cap == ci->i_auth_cap ? "auth " : "", cap);
|
||||
spin_unlock(&ci->i_ceph_lock);
|
||||
out:
|
||||
iput(inode);
|
||||
return mds;
|
||||
|
||||
random:
|
||||
|
|
Loading…
Reference in a new issue