[PATCH] r/o bind mounts: track numbers of writers to mounts
This is the real meat of the entire series. It actually implements the tracking of the number of writers to a mount. However, it causes scalability problems because there can be hundreds of cpus doing open()/close() on files on the same mnt at the same time. Even an atomic_t in the mnt has massive scalaing problems because the cacheline gets so terribly contended. This uses a statically-allocated percpu variable. All want/drop operations are local to a cpu as long that cpu operates on the same mount, and there are no writer count imbalances. Writer count imbalances happen when a write is taken on one cpu, and released on another, like when an open/close pair is performed on two Upon a remount,ro request, all of the data from the percpu variables is collected (expensive, but very rare) and we determine if there are any outstanding writers to the mount. I've written a little benchmark to sit in a loop for a couple of seconds in several cpus in parallel doing open/write/close loops. http://sr71.net/~dave/linux/openbench.c The code in here is a a worst-possible case for this patch. It does opens on a _pair_ of files in two different mounts in parallel. This should cause my code to lose its "operate on the same mount" optimization completely. This worst-case scenario causes a 3% degredation in the benchmark. I could probably get rid of even this 3%, but it would be more complex than what I have here, and I think this is getting into acceptable territory. In practice, I expect writing more than 3 bytes to a file, as well as disk I/O to mask any effects that this has. (To get rid of that 3%, we could have an #defined number of mounts in the percpu variable. So, instead of a CPU getting operate only on percpu data when it accesses only one mount, it could stay on percpu data when it only accesses N or fewer mounts.) [AV] merged fix for __clear_mnt_mount() stepping on freed vfsmount Acked-by: Al Viro <viro@ZenIV.linux.org.uk> Signed-off-by: Christoph Hellwig <hch@infradead.org> Signed-off-by: Dave Hansen <haveblue@us.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
This commit is contained in:
parent
2c463e9548
commit
3d733633a6
2 changed files with 244 additions and 15 deletions
252
fs/namespace.c
252
fs/namespace.c
|
@ -17,6 +17,7 @@
|
|||
#include <linux/quotaops.h>
|
||||
#include <linux/acct.h>
|
||||
#include <linux/capability.h>
|
||||
#include <linux/cpumask.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/sysfs.h>
|
||||
#include <linux/seq_file.h>
|
||||
|
@ -55,6 +56,8 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
|
|||
return tmp & (HASH_SIZE - 1);
|
||||
}
|
||||
|
||||
#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
|
||||
|
||||
struct vfsmount *alloc_vfsmnt(const char *name)
|
||||
{
|
||||
struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
|
||||
|
@ -68,6 +71,7 @@ struct vfsmount *alloc_vfsmnt(const char *name)
|
|||
INIT_LIST_HEAD(&mnt->mnt_share);
|
||||
INIT_LIST_HEAD(&mnt->mnt_slave_list);
|
||||
INIT_LIST_HEAD(&mnt->mnt_slave);
|
||||
atomic_set(&mnt->__mnt_writers, 0);
|
||||
if (name) {
|
||||
int size = strlen(name) + 1;
|
||||
char *newname = kmalloc(size, GFP_KERNEL);
|
||||
|
@ -80,6 +84,92 @@ struct vfsmount *alloc_vfsmnt(const char *name)
|
|||
return mnt;
|
||||
}
|
||||
|
||||
/*
|
||||
* Most r/o checks on a fs are for operations that take
|
||||
* discrete amounts of time, like a write() or unlink().
|
||||
* We must keep track of when those operations start
|
||||
* (for permission checks) and when they end, so that
|
||||
* we can determine when writes are able to occur to
|
||||
* a filesystem.
|
||||
*/
|
||||
/*
|
||||
* __mnt_is_readonly: check whether a mount is read-only
|
||||
* @mnt: the mount to check for its write status
|
||||
*
|
||||
* This shouldn't be used directly ouside of the VFS.
|
||||
* It does not guarantee that the filesystem will stay
|
||||
* r/w, just that it is right *now*. This can not and
|
||||
* should not be used in place of IS_RDONLY(inode).
|
||||
* mnt_want/drop_write() will _keep_ the filesystem
|
||||
* r/w.
|
||||
*/
|
||||
int __mnt_is_readonly(struct vfsmount *mnt)
|
||||
{
|
||||
return (mnt->mnt_sb->s_flags & MS_RDONLY);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__mnt_is_readonly);
|
||||
|
||||
struct mnt_writer {
|
||||
/*
|
||||
* If holding multiple instances of this lock, they
|
||||
* must be ordered by cpu number.
|
||||
*/
|
||||
spinlock_t lock;
|
||||
struct lock_class_key lock_class; /* compiles out with !lockdep */
|
||||
unsigned long count;
|
||||
struct vfsmount *mnt;
|
||||
} ____cacheline_aligned_in_smp;
|
||||
static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
|
||||
|
||||
static int __init init_mnt_writers(void)
|
||||
{
|
||||
int cpu;
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
|
||||
spin_lock_init(&writer->lock);
|
||||
lockdep_set_class(&writer->lock, &writer->lock_class);
|
||||
writer->count = 0;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
fs_initcall(init_mnt_writers);
|
||||
|
||||
static void unlock_mnt_writers(void)
|
||||
{
|
||||
int cpu;
|
||||
struct mnt_writer *cpu_writer;
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
cpu_writer = &per_cpu(mnt_writers, cpu);
|
||||
spin_unlock(&cpu_writer->lock);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
|
||||
{
|
||||
if (!cpu_writer->mnt)
|
||||
return;
|
||||
/*
|
||||
* This is in case anyone ever leaves an invalid,
|
||||
* old ->mnt and a count of 0.
|
||||
*/
|
||||
if (!cpu_writer->count)
|
||||
return;
|
||||
atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
|
||||
cpu_writer->count = 0;
|
||||
}
|
||||
/*
|
||||
* must hold cpu_writer->lock
|
||||
*/
|
||||
static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
|
||||
struct vfsmount *mnt)
|
||||
{
|
||||
if (cpu_writer->mnt == mnt)
|
||||
return;
|
||||
__clear_mnt_count(cpu_writer);
|
||||
cpu_writer->mnt = mnt;
|
||||
}
|
||||
|
||||
/*
|
||||
* Most r/o checks on a fs are for operations that take
|
||||
* discrete amounts of time, like a write() or unlink().
|
||||
|
@ -100,12 +190,77 @@ struct vfsmount *alloc_vfsmnt(const char *name)
|
|||
*/
|
||||
int mnt_want_write(struct vfsmount *mnt)
|
||||
{
|
||||
if (__mnt_is_readonly(mnt))
|
||||
return -EROFS;
|
||||
return 0;
|
||||
int ret = 0;
|
||||
struct mnt_writer *cpu_writer;
|
||||
|
||||
cpu_writer = &get_cpu_var(mnt_writers);
|
||||
spin_lock(&cpu_writer->lock);
|
||||
if (__mnt_is_readonly(mnt)) {
|
||||
ret = -EROFS;
|
||||
goto out;
|
||||
}
|
||||
use_cpu_writer_for_mount(cpu_writer, mnt);
|
||||
cpu_writer->count++;
|
||||
out:
|
||||
spin_unlock(&cpu_writer->lock);
|
||||
put_cpu_var(mnt_writers);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(mnt_want_write);
|
||||
|
||||
static void lock_mnt_writers(void)
|
||||
{
|
||||
int cpu;
|
||||
struct mnt_writer *cpu_writer;
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
cpu_writer = &per_cpu(mnt_writers, cpu);
|
||||
spin_lock(&cpu_writer->lock);
|
||||
__clear_mnt_count(cpu_writer);
|
||||
cpu_writer->mnt = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* These per-cpu write counts are not guaranteed to have
|
||||
* matched increments and decrements on any given cpu.
|
||||
* A file open()ed for write on one cpu and close()d on
|
||||
* another cpu will imbalance this count. Make sure it
|
||||
* does not get too far out of whack.
|
||||
*/
|
||||
static void handle_write_count_underflow(struct vfsmount *mnt)
|
||||
{
|
||||
if (atomic_read(&mnt->__mnt_writers) >=
|
||||
MNT_WRITER_UNDERFLOW_LIMIT)
|
||||
return;
|
||||
/*
|
||||
* It isn't necessary to hold all of the locks
|
||||
* at the same time, but doing it this way makes
|
||||
* us share a lot more code.
|
||||
*/
|
||||
lock_mnt_writers();
|
||||
/*
|
||||
* vfsmount_lock is for mnt_flags.
|
||||
*/
|
||||
spin_lock(&vfsmount_lock);
|
||||
/*
|
||||
* If coalescing the per-cpu writer counts did not
|
||||
* get us back to a positive writer count, we have
|
||||
* a bug.
|
||||
*/
|
||||
if ((atomic_read(&mnt->__mnt_writers) < 0) &&
|
||||
!(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
|
||||
printk(KERN_DEBUG "leak detected on mount(%p) writers "
|
||||
"count: %d\n",
|
||||
mnt, atomic_read(&mnt->__mnt_writers));
|
||||
WARN_ON(1);
|
||||
/* use the flag to keep the dmesg spam down */
|
||||
mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
|
||||
}
|
||||
spin_unlock(&vfsmount_lock);
|
||||
unlock_mnt_writers();
|
||||
}
|
||||
|
||||
/**
|
||||
* mnt_drop_write - give up write access to a mount
|
||||
* @mnt: the mount on which to give up write access
|
||||
|
@ -116,23 +271,61 @@ EXPORT_SYMBOL_GPL(mnt_want_write);
|
|||
*/
|
||||
void mnt_drop_write(struct vfsmount *mnt)
|
||||
{
|
||||
int must_check_underflow = 0;
|
||||
struct mnt_writer *cpu_writer;
|
||||
|
||||
cpu_writer = &get_cpu_var(mnt_writers);
|
||||
spin_lock(&cpu_writer->lock);
|
||||
|
||||
use_cpu_writer_for_mount(cpu_writer, mnt);
|
||||
if (cpu_writer->count > 0) {
|
||||
cpu_writer->count--;
|
||||
} else {
|
||||
must_check_underflow = 1;
|
||||
atomic_dec(&mnt->__mnt_writers);
|
||||
}
|
||||
|
||||
spin_unlock(&cpu_writer->lock);
|
||||
/*
|
||||
* Logically, we could call this each time,
|
||||
* but the __mnt_writers cacheline tends to
|
||||
* be cold, and makes this expensive.
|
||||
*/
|
||||
if (must_check_underflow)
|
||||
handle_write_count_underflow(mnt);
|
||||
/*
|
||||
* This could be done right after the spinlock
|
||||
* is taken because the spinlock keeps us on
|
||||
* the cpu, and disables preemption. However,
|
||||
* putting it here bounds the amount that
|
||||
* __mnt_writers can underflow. Without it,
|
||||
* we could theoretically wrap __mnt_writers.
|
||||
*/
|
||||
put_cpu_var(mnt_writers);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(mnt_drop_write);
|
||||
|
||||
/*
|
||||
* __mnt_is_readonly: check whether a mount is read-only
|
||||
* @mnt: the mount to check for its write status
|
||||
*
|
||||
* This shouldn't be used directly ouside of the VFS.
|
||||
* It does not guarantee that the filesystem will stay
|
||||
* r/w, just that it is right *now*. This can not and
|
||||
* should not be used in place of IS_RDONLY(inode).
|
||||
*/
|
||||
int __mnt_is_readonly(struct vfsmount *mnt)
|
||||
int mnt_make_readonly(struct vfsmount *mnt)
|
||||
{
|
||||
return (mnt->mnt_sb->s_flags & MS_RDONLY);
|
||||
int ret = 0;
|
||||
|
||||
lock_mnt_writers();
|
||||
/*
|
||||
* With all the locks held, this value is stable
|
||||
*/
|
||||
if (atomic_read(&mnt->__mnt_writers) > 0) {
|
||||
ret = -EBUSY;
|
||||
goto out;
|
||||
}
|
||||
/*
|
||||
* actually set mount's r/o flag here to make
|
||||
* __mnt_is_readonly() true, which keeps anyone
|
||||
* from doing a successful mnt_want_write().
|
||||
*/
|
||||
out:
|
||||
unlock_mnt_writers();
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__mnt_is_readonly);
|
||||
|
||||
int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
|
||||
{
|
||||
|
@ -325,7 +518,36 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
|
|||
|
||||
static inline void __mntput(struct vfsmount *mnt)
|
||||
{
|
||||
int cpu;
|
||||
struct super_block *sb = mnt->mnt_sb;
|
||||
/*
|
||||
* We don't have to hold all of the locks at the
|
||||
* same time here because we know that we're the
|
||||
* last reference to mnt and that no new writers
|
||||
* can come in.
|
||||
*/
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
|
||||
if (cpu_writer->mnt != mnt)
|
||||
continue;
|
||||
spin_lock(&cpu_writer->lock);
|
||||
atomic_add(cpu_writer->count, &mnt->__mnt_writers);
|
||||
cpu_writer->count = 0;
|
||||
/*
|
||||
* Might as well do this so that no one
|
||||
* ever sees the pointer and expects
|
||||
* it to be valid.
|
||||
*/
|
||||
cpu_writer->mnt = NULL;
|
||||
spin_unlock(&cpu_writer->lock);
|
||||
}
|
||||
/*
|
||||
* This probably indicates that somebody messed
|
||||
* up a mnt_want/drop_write() pair. If this
|
||||
* happens, the filesystem was probably unable
|
||||
* to make r/w->r/o transitions.
|
||||
*/
|
||||
WARN_ON(atomic_read(&mnt->__mnt_writers));
|
||||
dput(mnt->mnt_root);
|
||||
free_vfsmnt(mnt);
|
||||
deactivate_super(sb);
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
|
||||
#include <linux/types.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/nodemask.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <asm/atomic.h>
|
||||
|
||||
|
@ -30,6 +31,7 @@ struct mnt_namespace;
|
|||
#define MNT_RELATIME 0x20
|
||||
|
||||
#define MNT_SHRINKABLE 0x100
|
||||
#define MNT_IMBALANCED_WRITE_COUNT 0x200 /* just for debugging */
|
||||
|
||||
#define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */
|
||||
#define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */
|
||||
|
@ -62,6 +64,11 @@ struct vfsmount {
|
|||
int mnt_expiry_mark; /* true if marked for expiry */
|
||||
int mnt_pinned;
|
||||
int mnt_ghosts;
|
||||
/*
|
||||
* This value is not stable unless all of the mnt_writers[] spinlocks
|
||||
* are held, and all mnt_writer[]s on this mount have 0 as their ->count
|
||||
*/
|
||||
atomic_t __mnt_writers;
|
||||
};
|
||||
|
||||
static inline struct vfsmount *mntget(struct vfsmount *mnt)
|
||||
|
|
Loading…
Reference in a new issue