implement in-kernel gendisk events handling

Currently, media presence polling for removeable block devices is done
from userland.  There are several issues with this.

* Polling is done by periodically opening the device.  For SCSI
  devices, the command sequence generated by such action involves a
  few different commands including TEST_UNIT_READY.  This behavior,
  while perfectly legal, is different from Windows which only issues
  single command, GET_EVENT_STATUS_NOTIFICATION.  Unfortunately, some
  ATAPI devices lock up after being periodically queried such command
  sequences.

* There is no reliable and unintrusive way for a userland program to
  tell whether the target device is safe for media presence polling.
  For example, polling for media presence during an on-going burning
  session can make it fail.  The polling program can avoid this by
  opening the device with O_EXCL but then it risks making a valid
  exclusive user of the device fail w/ -EBUSY.

* Userland polling is unnecessarily heavy and in-kernel implementation
  is lighter and better coordinated (workqueue, timer slack).

This patch implements framework for in-kernel disk event handling,
which includes media presence polling.

* bdops->check_events() is added, which supercedes ->media_changed().
  It should check whether there's any pending event and return if so.
  Currently, two events are defined - DISK_EVENT_MEDIA_CHANGE and
  DISK_EVENT_EJECT_REQUEST.  ->check_events() is guaranteed not to be
  called parallelly.

* gendisk->events and ->async_events are added.  These should be
  initialized by block driver before passing the device to add_disk().
  The former contains the mask of all supported events and the latter
  the mask of all events which the device can report without polling.
  /sys/block/*/events[_async] export these to userland.

* Kernel parameter block.events_dfl_poll_msecs controls the system
  polling interval (default is 0 which means disable) and
  /sys/block/*/events_poll_msecs control polling intervals for
  individual devices (default is -1 meaning use system setting).  Note
  that if a device can report all supported events asynchronously and
  its polling interval isn't explicitly set, the device won't be
  polled regardless of the system polling interval.

* If a device is opened exclusively with write access, event checking
  is automatically disabled until all write exclusive accesses are
  released.

* There are event 'clearing' events.  For example, both of currently
  defined events are cleared after the device has been successfully
  opened.  This information is passed to ->check_events() callback
  using @clearing argument as a hint.

* Event checking is always performed from system_nrt_wq and timer
  slack is set to 25% for polling.

* Nothing changes for drivers which implement ->media_changed() but
  not ->check_events().  Going forward, all drivers will be converted
  to ->check_events() and ->media_change() will be dropped.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
This commit is contained in:
Tejun Heo 2010-12-08 20:57:37 +01:00 committed by Jens Axboe
parent d2bf1b6723
commit 77ea887e43
5 changed files with 484 additions and 8 deletions

View file

@ -18,6 +18,7 @@
#include <linux/buffer_head.h>
#include <linux/mutex.h>
#include <linux/idr.h>
#include <linux/log2.h>
#include "blk.h"
@ -35,6 +36,10 @@ static DEFINE_IDR(ext_devt_idr);
static struct device_type disk_type;
static void disk_add_events(struct gendisk *disk);
static void disk_del_events(struct gendisk *disk);
static void disk_release_events(struct gendisk *disk);
/**
* disk_get_part - get partition
* @disk: disk to look partition from
@ -609,6 +614,8 @@ void add_disk(struct gendisk *disk)
retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
"bdi");
WARN_ON(retval);
disk_add_events(disk);
}
EXPORT_SYMBOL(add_disk);
@ -617,6 +624,8 @@ void del_gendisk(struct gendisk *disk)
struct disk_part_iter piter;
struct hd_struct *part;
disk_del_events(disk);
/* invalidate stuff */
disk_part_iter_init(&piter, disk,
DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
@ -1089,6 +1098,7 @@ static void disk_release(struct device *dev)
{
struct gendisk *disk = dev_to_disk(dev);
disk_release_events(disk);
kfree(disk->random);
disk_replace_part_tbl(disk, NULL);
free_part_stats(&disk->part0);
@ -1350,3 +1360,422 @@ int invalidate_partition(struct gendisk *disk, int partno)
}
EXPORT_SYMBOL(invalidate_partition);
/*
* Disk events - monitor disk events like media change and eject request.
*/
struct disk_events {
struct list_head node; /* all disk_event's */
struct gendisk *disk; /* the associated disk */
spinlock_t lock;
int block; /* event blocking depth */
unsigned int pending; /* events already sent out */
unsigned int clearing; /* events being cleared */
long poll_msecs; /* interval, -1 for default */
struct delayed_work dwork;
};
static const char *disk_events_strs[] = {
[ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change",
[ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request",
};
static char *disk_uevents[] = {
[ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1",
[ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1",
};
/* list of all disk_events */
static DEFINE_MUTEX(disk_events_mutex);
static LIST_HEAD(disk_events);
/* disable in-kernel polling by default */
static unsigned long disk_events_dfl_poll_msecs = 0;
static unsigned long disk_events_poll_jiffies(struct gendisk *disk)
{
struct disk_events *ev = disk->ev;
long intv_msecs = 0;
/*
* If device-specific poll interval is set, always use it. If
* the default is being used, poll iff there are events which
* can't be monitored asynchronously.
*/
if (ev->poll_msecs >= 0)
intv_msecs = ev->poll_msecs;
else if (disk->events & ~disk->async_events)
intv_msecs = disk_events_dfl_poll_msecs;
return msecs_to_jiffies(intv_msecs);
}
static void __disk_block_events(struct gendisk *disk, bool sync)
{
struct disk_events *ev = disk->ev;
unsigned long flags;
bool cancel;
spin_lock_irqsave(&ev->lock, flags);
cancel = !ev->block++;
spin_unlock_irqrestore(&ev->lock, flags);
if (cancel) {
if (sync)
cancel_delayed_work_sync(&disk->ev->dwork);
else
cancel_delayed_work(&disk->ev->dwork);
}
}
static void __disk_unblock_events(struct gendisk *disk, bool check_now)
{
struct disk_events *ev = disk->ev;
unsigned long intv;
unsigned long flags;
spin_lock_irqsave(&ev->lock, flags);
if (WARN_ON_ONCE(ev->block <= 0))
goto out_unlock;
if (--ev->block)
goto out_unlock;
/*
* Not exactly a latency critical operation, set poll timer
* slack to 25% and kick event check.
*/
intv = disk_events_poll_jiffies(disk);
set_timer_slack(&ev->dwork.timer, intv / 4);
if (check_now)
queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
else if (intv)
queue_delayed_work(system_nrt_wq, &ev->dwork, intv);
out_unlock:
spin_unlock_irqrestore(&ev->lock, flags);
}
/**
* disk_block_events - block and flush disk event checking
* @disk: disk to block events for
*
* On return from this function, it is guaranteed that event checking
* isn't in progress and won't happen until unblocked by
* disk_unblock_events(). Events blocking is counted and the actual
* unblocking happens after the matching number of unblocks are done.
*
* Note that this intentionally does not block event checking from
* disk_clear_events().
*
* CONTEXT:
* Might sleep.
*/
void disk_block_events(struct gendisk *disk)
{
if (disk->ev)
__disk_block_events(disk, true);
}
/**
* disk_unblock_events - unblock disk event checking
* @disk: disk to unblock events for
*
* Undo disk_block_events(). When the block count reaches zero, it
* starts events polling if configured.
*
* CONTEXT:
* Don't care. Safe to call from irq context.
*/
void disk_unblock_events(struct gendisk *disk)
{
if (disk->ev)
__disk_unblock_events(disk, true);
}
/**
* disk_check_events - schedule immediate event checking
* @disk: disk to check events for
*
* Schedule immediate event checking on @disk if not blocked.
*
* CONTEXT:
* Don't care. Safe to call from irq context.
*/
void disk_check_events(struct gendisk *disk)
{
if (disk->ev) {
__disk_block_events(disk, false);
__disk_unblock_events(disk, true);
}
}
EXPORT_SYMBOL_GPL(disk_check_events);
/**
* disk_clear_events - synchronously check, clear and return pending events
* @disk: disk to fetch and clear events from
* @mask: mask of events to be fetched and clearted
*
* Disk events are synchronously checked and pending events in @mask
* are cleared and returned. This ignores the block count.
*
* CONTEXT:
* Might sleep.
*/
unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
{
const struct block_device_operations *bdops = disk->fops;
struct disk_events *ev = disk->ev;
unsigned int pending;
if (!ev) {
/* for drivers still using the old ->media_changed method */
if ((mask & DISK_EVENT_MEDIA_CHANGE) &&
bdops->media_changed && bdops->media_changed(disk))
return DISK_EVENT_MEDIA_CHANGE;
return 0;
}
/* tell the workfn about the events being cleared */
spin_lock_irq(&ev->lock);
ev->clearing |= mask;
spin_unlock_irq(&ev->lock);
/* uncondtionally schedule event check and wait for it to finish */
__disk_block_events(disk, true);
queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
flush_delayed_work(&ev->dwork);
__disk_unblock_events(disk, false);
/* then, fetch and clear pending events */
spin_lock_irq(&ev->lock);
WARN_ON_ONCE(ev->clearing & mask); /* cleared by workfn */
pending = ev->pending & mask;
ev->pending &= ~mask;
spin_unlock_irq(&ev->lock);
return pending;
}
static void disk_events_workfn(struct work_struct *work)
{
struct delayed_work *dwork = to_delayed_work(work);
struct disk_events *ev = container_of(dwork, struct disk_events, dwork);
struct gendisk *disk = ev->disk;
char *envp[ARRAY_SIZE(disk_uevents) + 1] = { };
unsigned int clearing = ev->clearing;
unsigned int events;
unsigned long intv;
int nr_events = 0, i;
/* check events */
events = disk->fops->check_events(disk, clearing);
/* accumulate pending events and schedule next poll if necessary */
spin_lock_irq(&ev->lock);
events &= ~ev->pending;
ev->pending |= events;
ev->clearing &= ~clearing;
intv = disk_events_poll_jiffies(disk);
if (!ev->block && intv)
queue_delayed_work(system_nrt_wq, &ev->dwork, intv);
spin_unlock_irq(&ev->lock);
/* tell userland about new events */
for (i = 0; i < ARRAY_SIZE(disk_uevents); i++)
if (events & (1 << i))
envp[nr_events++] = disk_uevents[i];
if (nr_events)
kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
}
/*
* A disk events enabled device has the following sysfs nodes under
* its /sys/block/X/ directory.
*
* events : list of all supported events
* events_async : list of events which can be detected w/o polling
* events_poll_msecs : polling interval, 0: disable, -1: system default
*/
static ssize_t __disk_events_show(unsigned int events, char *buf)
{
const char *delim = "";
ssize_t pos = 0;
int i;
for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++)
if (events & (1 << i)) {
pos += sprintf(buf + pos, "%s%s",
delim, disk_events_strs[i]);
delim = " ";
}
if (pos)
pos += sprintf(buf + pos, "\n");
return pos;
}
static ssize_t disk_events_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct gendisk *disk = dev_to_disk(dev);
return __disk_events_show(disk->events, buf);
}
static ssize_t disk_events_async_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct gendisk *disk = dev_to_disk(dev);
return __disk_events_show(disk->async_events, buf);
}
static ssize_t disk_events_poll_msecs_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
struct gendisk *disk = dev_to_disk(dev);
return sprintf(buf, "%ld\n", disk->ev->poll_msecs);
}
static ssize_t disk_events_poll_msecs_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
struct gendisk *disk = dev_to_disk(dev);
long intv;
if (!count || !sscanf(buf, "%ld", &intv))
return -EINVAL;
if (intv < 0 && intv != -1)
return -EINVAL;
__disk_block_events(disk, true);
disk->ev->poll_msecs = intv;
__disk_unblock_events(disk, true);
return count;
}
static const DEVICE_ATTR(events, S_IRUGO, disk_events_show, NULL);
static const DEVICE_ATTR(events_async, S_IRUGO, disk_events_async_show, NULL);
static const DEVICE_ATTR(events_poll_msecs, S_IRUGO|S_IWUSR,
disk_events_poll_msecs_show,
disk_events_poll_msecs_store);
static const struct attribute *disk_events_attrs[] = {
&dev_attr_events.attr,
&dev_attr_events_async.attr,
&dev_attr_events_poll_msecs.attr,
NULL,
};
/*
* The default polling interval can be specified by the kernel
* parameter block.events_dfl_poll_msecs which defaults to 0
* (disable). This can also be modified runtime by writing to
* /sys/module/block/events_dfl_poll_msecs.
*/
static int disk_events_set_dfl_poll_msecs(const char *val,
const struct kernel_param *kp)
{
struct disk_events *ev;
int ret;
ret = param_set_ulong(val, kp);
if (ret < 0)
return ret;
mutex_lock(&disk_events_mutex);
list_for_each_entry(ev, &disk_events, node)
disk_check_events(ev->disk);
mutex_unlock(&disk_events_mutex);
return 0;
}
static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = {
.set = disk_events_set_dfl_poll_msecs,
.get = param_get_ulong,
};
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX "block."
module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops,
&disk_events_dfl_poll_msecs, 0644);
/*
* disk_{add|del|release}_events - initialize and destroy disk_events.
*/
static void disk_add_events(struct gendisk *disk)
{
struct disk_events *ev;
if (!disk->fops->check_events || !(disk->events | disk->async_events))
return;
ev = kzalloc(sizeof(*ev), GFP_KERNEL);
if (!ev) {
pr_warn("%s: failed to initialize events\n", disk->disk_name);
return;
}
if (sysfs_create_files(&disk_to_dev(disk)->kobj,
disk_events_attrs) < 0) {
pr_warn("%s: failed to create sysfs files for events\n",
disk->disk_name);
kfree(ev);
return;
}
disk->ev = ev;
INIT_LIST_HEAD(&ev->node);
ev->disk = disk;
spin_lock_init(&ev->lock);
ev->block = 1;
ev->poll_msecs = -1;
INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);
mutex_lock(&disk_events_mutex);
list_add_tail(&ev->node, &disk_events);
mutex_unlock(&disk_events_mutex);
/*
* Block count is initialized to 1 and the following initial
* unblock kicks it into action.
*/
__disk_unblock_events(disk, true);
}
static void disk_del_events(struct gendisk *disk)
{
if (!disk->ev)
return;
__disk_block_events(disk, true);
mutex_lock(&disk_events_mutex);
list_del_init(&disk->ev->node);
mutex_unlock(&disk_events_mutex);
sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs);
}
static void disk_release_events(struct gendisk *disk)
{
/* the block count should be 1 from disk_del_events() */
WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
kfree(disk->ev);
}

View file

@ -948,10 +948,11 @@ int check_disk_change(struct block_device *bdev)
{
struct gendisk *disk = bdev->bd_disk;
const struct block_device_operations *bdops = disk->fops;
unsigned int events;
if (!bdops->media_changed)
return 0;
if (!bdops->media_changed(bdev->bd_disk))
events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
DISK_EVENT_EJECT_REQUEST);
if (!(events & DISK_EVENT_MEDIA_CHANGE))
return 0;
flush_disk(bdev);
@ -1158,9 +1159,10 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
if (whole) {
/* finish claiming */
mutex_lock(&bdev->bd_mutex);
spin_lock(&bdev_lock);
if (res == 0) {
if (!res) {
BUG_ON(!bd_may_claim(bdev, whole, holder));
/*
* Note that for a whole device bd_holders
@ -1180,6 +1182,20 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
wake_up_bit(&whole->bd_claiming, 0);
spin_unlock(&bdev_lock);
/*
* Block event polling for write claims. Any write
* holder makes the write_holder state stick until all
* are released. This is good enough and tracking
* individual writeable reference is too fragile given
* the way @mode is used in blkdev_get/put().
*/
if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
bdev->bd_write_holder = true;
disk_block_events(bdev->bd_disk);
}
mutex_unlock(&bdev->bd_mutex);
bdput(whole);
}
@ -1353,12 +1369,23 @@ int blkdev_put(struct block_device *bdev, fmode_t mode)
spin_unlock(&bdev_lock);
/* if this was the last claim, holder link should go too */
if (bdev_free)
/*
* If this was the last claim, remove holder link and
* unblock evpoll if it was a write holder.
*/
if (bdev_free) {
bd_unlink_disk_holder(bdev);
if (bdev->bd_write_holder) {
disk_unblock_events(bdev->bd_disk);
bdev->bd_write_holder = false;
} else
disk_check_events(bdev->bd_disk);
}
mutex_unlock(&bdev->bd_mutex);
}
} else
disk_check_events(bdev->bd_disk);
return __blkdev_put(bdev, mode, 0);
}
EXPORT_SYMBOL(blkdev_put);

View file

@ -1251,6 +1251,9 @@ struct block_device_operations {
int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
int (*direct_access) (struct block_device *, sector_t,
void **, unsigned long *);
unsigned int (*check_events) (struct gendisk *disk,
unsigned int clearing);
/* ->media_changed() is DEPRECATED, use ->check_events() instead */
int (*media_changed) (struct gendisk *);
void (*unlock_native_capacity) (struct gendisk *);
int (*revalidate_disk) (struct gendisk *);

View file

@ -662,6 +662,7 @@ struct block_device {
void * bd_claiming;
void * bd_holder;
int bd_holders;
bool bd_write_holder;
#ifdef CONFIG_SYSFS
struct gendisk * bd_holder_disk; /* for sysfs slave linkng */
#endif

View file

@ -127,6 +127,11 @@ struct hd_struct {
#define GENHD_FL_EXT_DEVT 64 /* allow extended devt */
#define GENHD_FL_NATIVE_CAPACITY 128
enum {
DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */
DISK_EVENT_EJECT_REQUEST = 1 << 1, /* eject requested */
};
#define BLK_SCSI_MAX_CMDS (256)
#define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8))
@ -143,6 +148,8 @@ struct disk_part_tbl {
struct hd_struct __rcu *part[];
};
struct disk_events;
struct gendisk {
/* major, first_minor and minors are input parameters only,
* don't use directly. Use disk_devt() and disk_max_parts().
@ -154,6 +161,10 @@ struct gendisk {
char disk_name[DISK_NAME_LEN]; /* name of major driver */
char *(*devnode)(struct gendisk *gd, mode_t *mode);
unsigned int events; /* supported events */
unsigned int async_events; /* async events, subset of all */
/* Array of pointers to partitions indexed by partno.
* Protected with matching bdev lock but stat and other
* non-critical accesses use RCU. Always access through
@ -171,8 +182,8 @@ struct gendisk {
struct kobject *slave_dir;
struct timer_rand_state *random;
atomic_t sync_io; /* RAID */
struct disk_events *ev;
#ifdef CONFIG_BLK_DEV_INTEGRITY
struct blk_integrity *integrity;
#endif
@ -405,6 +416,11 @@ static inline int get_disk_ro(struct gendisk *disk)
return disk->part0.policy;
}
extern void disk_block_events(struct gendisk *disk);
extern void disk_unblock_events(struct gendisk *disk);
extern void disk_check_events(struct gendisk *disk);
extern unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask);
/* drivers/char/random.c */
extern void add_disk_randomness(struct gendisk *disk);
extern void rand_initialize_disk(struct gendisk *disk);