[PATCH] Update cfq io scheduler to time sliced design

This updates the CFQ io scheduler to the new time sliced design (cfq v3). It provides full process fairness, while giving excellent aggregate system throughput even for many competing processes. It supports io priorities, either inherited from the cpu nice value or set directly with the ioprio_get/set syscalls. The latter closely mimic set/getpriority. This import is based on my latest from -mm. Signed-off-by: Jens Axboe <axboe@suse.de> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-27 10:55:12 +02:00 · 2005-06-27 10:55:12 +02:00 · 22e2c507c3
commit 22e2c507c3
parent 020f46a39e
26 changed files with 1732 additions and 739 deletions
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@ -289,3 +289,5 @@ ENTRY(sys_call_table)
 	.long sys_add_key
 	.long sys_request_key
 	.long sys_keyctl
+	.long sys_ioprio_set
+	.long sys_ioprio_get		/* 290 */
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@ -1577,8 +1577,8 @@ sys_call_table:
 	data8 sys_add_key
 	data8 sys_request_key
 	data8 sys_keyctl
-	data8 sys_ni_syscall
-	data8 sys_ni_syscall			// 1275
+	data8 sys_ioprio_set
+	data8 sys_ioprio_get			// 1275
 	data8 sys_set_zone_reclaim
 	data8 sys_ni_syscall
 	data8 sys_ni_syscall
--- a/arch/ppc/kernel/misc.S
+++ b/arch/ppc/kernel/misc.S
@ -1449,3 +1449,5 @@ _GLOBAL(sys_call_table)
 	.long sys_request_key		/* 270 */
 	.long sys_keyctl
 	.long sys_waitid
+	.long sys_ioprio_set
+	.long sys_ioprio_get
--- a/drivers/block/as-iosched.c
+++ b/drivers/block/as-iosched.c
@ -1806,7 +1806,8 @@ static void as_put_request(request_queue_t *q, struct request *rq)
 	rq->elevator_private = NULL;
 }

-static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
+static int as_set_request(request_queue_t *q, struct request *rq,
+			  struct bio *bio, int gfp_mask)
 {
 	struct as_data *ad = q->elevator->elevator_data;
 	struct as_rq *arq = mempool_alloc(ad->arq_pool, gfp_mask);
@ -1827,7 +1828,7 @@ static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
 	return 1;
 }

-static int as_may_queue(request_queue_t *q, int rw)
+static int as_may_queue(request_queue_t *q, int rw, struct bio *bio)
 {
 	int ret = ELV_MQUEUE_MAY;
 	struct as_data *ad = q->elevator->elevator_data;
--- a/drivers/block/cfq-iosched.c
+++ b/drivers/block/cfq-iosched.c
--- a/drivers/block/deadline-iosched.c
+++ b/drivers/block/deadline-iosched.c
@ -760,7 +760,8 @@ static void deadline_put_request(request_queue_t *q, struct request *rq)
 }

 static int
-deadline_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
+deadline_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
+		     int gfp_mask)
 {
 	struct deadline_data *dd = q->elevator->elevator_data;
 	struct deadline_rq *drq;
--- a/drivers/block/elevator.c
+++ b/drivers/block/elevator.c
@ -486,12 +486,13 @@ struct request *elv_former_request(request_queue_t *q, struct request *rq)
 	return NULL;
 }

-int elv_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
+int elv_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
+		    int gfp_mask)
 {
 	elevator_t *e = q->elevator;

 	if (e->ops->elevator_set_req_fn)
-		return e->ops->elevator_set_req_fn(q, rq, gfp_mask);
+		return e->ops->elevator_set_req_fn(q, rq, bio, gfp_mask);

 	rq->elevator_private = NULL;
 	return 0;
@ -505,12 +506,12 @@ void elv_put_request(request_queue_t *q, struct request *rq)
 		e->ops->elevator_put_req_fn(q, rq);
 }

-int elv_may_queue(request_queue_t *q, int rw)
+int elv_may_queue(request_queue_t *q, int rw, struct bio *bio)
 {
 	elevator_t *e = q->elevator;

 	if (e->ops->elevator_may_queue_fn)
-		return e->ops->elevator_may_queue_fn(q, rw);
+		return e->ops->elevator_may_queue_fn(q, rw, bio);

 	return ELV_MQUEUE_MAY;
 }
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@ -276,6 +276,7 @@ static inline void rq_init(request_queue_t *q, struct request *rq)
 	rq->errors = 0;
 	rq->rq_status = RQ_ACTIVE;
 	rq->bio = rq->biotail = NULL;
+	rq->ioprio = 0;
 	rq->buffer = NULL;
 	rq->ref_count = 1;
 	rq->q = q;
@ -1442,11 +1443,7 @@ void __generic_unplug_device(request_queue_t *q)
 	if (!blk_remove_plug(q))
 		return;

-	/*
-	 * was plugged, fire request_fn if queue has stuff to do
-	 */
-	if (elv_next_request(q))
-		q->request_fn(q);
+	q->request_fn(q);
 }
 EXPORT_SYMBOL(__generic_unplug_device);

@ -1776,8 +1773,8 @@ static inline void blk_free_request(request_queue_t *q, struct request *rq)
 	mempool_free(rq, q->rq.rq_pool);
 }

-static inline struct request *blk_alloc_request(request_queue_t *q, int rw,
-						int gfp_mask)
+static inline struct request *
+blk_alloc_request(request_queue_t *q, int rw, struct bio *bio, int gfp_mask)
 {
 	struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);

@ -1790,7 +1787,7 @@ static inline struct request *blk_alloc_request(request_queue_t *q, int rw,
 	 */
 	rq->flags = rw;

-	if (!elv_set_request(q, rq, gfp_mask))
+	if (!elv_set_request(q, rq, bio, gfp_mask))
 		return rq;

 	mempool_free(rq, q->rq.rq_pool);
@ -1872,7 +1869,8 @@ static void freed_request(request_queue_t *q, int rw)
 /*
 * Get a free request, queue_lock must not be held
 */
-static struct request *get_request(request_queue_t *q, int rw, int gfp_mask)
+static struct request *get_request(request_queue_t *q, int rw, struct bio *bio,
+				   int gfp_mask)
 {
 	struct request *rq = NULL;
 	struct request_list *rl = &q->rq;
@ -1895,7 +1893,7 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask)
 		}
 	}

-	switch (elv_may_queue(q, rw)) {
+	switch (elv_may_queue(q, rw, bio)) {
 		case ELV_MQUEUE_NO:
 			goto rq_starved;
 		case ELV_MQUEUE_MAY:
@ -1920,7 +1918,7 @@ get_rq:
 		set_queue_congested(q, rw);
 	spin_unlock_irq(q->queue_lock);

-	rq = blk_alloc_request(q, rw, gfp_mask);
+	rq = blk_alloc_request(q, rw, bio, gfp_mask);
 	if (!rq) {
 		/*
 		 * Allocation failed presumably due to memory. Undo anything
@ -1961,7 +1959,8 @@ out:
 * No available requests for this queue, unplug the device and wait for some
 * requests to become available.
 */
-static struct request *get_request_wait(request_queue_t *q, int rw)
+static struct request *get_request_wait(request_queue_t *q, int rw,
+					struct bio *bio)
 {
 	DEFINE_WAIT(wait);
 	struct request *rq;
@ -1972,7 +1971,7 @@ static struct request *get_request_wait(request_queue_t *q, int rw)
 		prepare_to_wait_exclusive(&rl->wait[rw], &wait,
 				TASK_UNINTERRUPTIBLE);

-		rq = get_request(q, rw, GFP_NOIO);
+		rq = get_request(q, rw, bio, GFP_NOIO);

 		if (!rq) {
 			struct io_context *ioc;
@ -2003,9 +2002,9 @@ struct request *blk_get_request(request_queue_t *q, int rw, int gfp_mask)
 	BUG_ON(rw != READ && rw != WRITE);

 	if (gfp_mask & __GFP_WAIT)
-		rq = get_request_wait(q, rw);
+		rq = get_request_wait(q, rw, NULL);
 	else
-		rq = get_request(q, rw, gfp_mask);
+		rq = get_request(q, rw, NULL, gfp_mask);

 	return rq;
 }
@ -2333,7 +2332,6 @@ static void __blk_put_request(request_queue_t *q, struct request *req)
 		return;

 	req->rq_status = RQ_INACTIVE;
-	req->q = NULL;
 	req->rl = NULL;

 	/*
@ -2462,6 +2460,8 @@ static int attempt_merge(request_queue_t *q, struct request *req,
 		req->rq_disk->in_flight--;
 	}

+	req->ioprio = ioprio_best(req->ioprio, next->ioprio);
+
 	__blk_put_request(q, next);
 	return 1;
 }
@ -2514,11 +2514,13 @@ static int __make_request(request_queue_t *q, struct bio *bio)
 {
 	struct request *req, *freereq = NULL;
 	int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err, sync;
+	unsigned short prio;
 	sector_t sector;

 	sector = bio->bi_sector;
 	nr_sectors = bio_sectors(bio);
 	cur_nr_sectors = bio_cur_sectors(bio);
+	prio = bio_prio(bio);

 	rw = bio_data_dir(bio);
 	sync = bio_sync(bio);
@ -2559,6 +2561,7 @@ again:
 			req->biotail->bi_next = bio;
 			req->biotail = bio;
 			req->nr_sectors = req->hard_nr_sectors += nr_sectors;
+			req->ioprio = ioprio_best(req->ioprio, prio);
 			drive_stat_acct(req, nr_sectors, 0);
 			if (!attempt_back_merge(q, req))
 				elv_merged_request(q, req);
@ -2583,6 +2586,7 @@ again:
 			req->hard_cur_sectors = cur_nr_sectors;
 			req->sector = req->hard_sector = sector;
 			req->nr_sectors = req->hard_nr_sectors += nr_sectors;
+			req->ioprio = ioprio_best(req->ioprio, prio);
 			drive_stat_acct(req, nr_sectors, 0);
 			if (!attempt_front_merge(q, req))
 				elv_merged_request(q, req);
@ -2610,7 +2614,7 @@ get_rq:
 		freereq = NULL;
 	} else {
 		spin_unlock_irq(q->queue_lock);
-		if ((freereq = get_request(q, rw, GFP_ATOMIC)) == NULL) {
+		if ((freereq = get_request(q, rw, bio, GFP_ATOMIC)) == NULL) {
 			/*
 			 * READA bit set
 			 */
@ -2618,7 +2622,7 @@ get_rq:
 			if (bio_rw_ahead(bio))
 				goto end_io;
 	
-			freereq = get_request_wait(q, rw);
+			freereq = get_request_wait(q, rw, bio);
 		}
 		goto again;
 	}
@ -2646,6 +2650,7 @@ get_rq:
 	req->buffer = bio_data(bio);	/* see ->buffer comment above */
 	req->waiting = NULL;
 	req->bio = req->biotail = bio;
+	req->ioprio = prio;
 	req->rq_disk = bio->bi_bdev->bd_disk;
 	req->start_time = jiffies;

@ -2674,7 +2679,7 @@ static inline void blk_partition_remap(struct bio *bio)
 	if (bdev != bdev->bd_contains) {
 		struct hd_struct *p = bdev->bd_part;

-		switch (bio->bi_rw) {
+		switch (bio_data_dir(bio)) {
 		case READ:
 			p->read_sectors += bio_sectors(bio);
 			p->reads++;
@ -2693,6 +2698,7 @@ void blk_finish_queue_drain(request_queue_t *q)
 {
 	struct request_list *rl = &q->rq;
 	struct request *rq;
+	int requeued = 0;

 	spin_lock_irq(q->queue_lock);
 	clear_bit(QUEUE_FLAG_DRAIN, &q->queue_flags);
@ -2701,9 +2707,13 @@ void blk_finish_queue_drain(request_queue_t *q)
 		rq = list_entry_rq(q->drain_list.next);

 		list_del_init(&rq->queuelist);
-		__elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 1);
+		elv_requeue_request(q, rq);
+		requeued++;
 	}

+	if (requeued)
+		q->request_fn(q);
+
 	spin_unlock_irq(q->queue_lock);

 	wake_up(&rl->wait[0]);
@ -2900,7 +2910,7 @@ void submit_bio(int rw, struct bio *bio)

 	BIO_BUG_ON(!bio->bi_size);
 	BIO_BUG_ON(!bio->bi_io_vec);
-	bio->bi_rw = rw;
+	bio->bi_rw |= rw;
 	if (rw & WRITE)
 		mod_page_state(pgpgout, count);
 	else
@ -3257,8 +3267,11 @@ void exit_io_context(void)
 	struct io_context *ioc;

 	local_irq_save(flags);
+	task_lock(current);
 	ioc = current->io_context;
 	current->io_context = NULL;
+	ioc->task = NULL;
+	task_unlock(current);
 	local_irq_restore(flags);

 	if (ioc->aic && ioc->aic->exit)
@ -3293,12 +3306,12 @@ struct io_context *get_io_context(int gfp_flags)
 	ret = kmem_cache_alloc(iocontext_cachep, gfp_flags);
 	if (ret) {
 		atomic_set(&ret->refcount, 1);
-		ret->pid = tsk->pid;
+		ret->task = current;
+		ret->set_ioprio = NULL;
 		ret->last_waited = jiffies; /* doesn't matter... */
 		ret->nr_batch_requests = 0; /* because this is 0 */
 		ret->aic = NULL;
 		ret->cic = NULL;
-		spin_lock_init(&ret->lock);

 		local_irq_save(flags);

--- a/fs/Makefile
+++ b/fs/Makefile
@ -10,6 +10,7 @@ obj-y :=	open.o read_write.o file_table.o buffer.o  bio.o super.o \
 		ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
+		ioprio.o

 obj-$(CONFIG_EPOLL)		+= eventpoll.o
 obj-$(CONFIG_COMPAT)		+= compat.o
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@ -0,0 +1,172 @@
+/*
+ * fs/ioprio.c
+ *
+ * Copyright (C) 2004 Jens Axboe <axboe@suse.de>
+ *
+ * Helper functions for setting/querying io priorities of processes. The
+ * system calls closely mimmick getpriority/setpriority, see the man page for
+ * those. The prio argument is a composite of prio class and prio data, where
+ * the data argument has meaning within that class. The standard scheduling
+ * classes have 8 distinct prio levels, with 0 being the highest prio and 7
+ * being the lowest.
+ *
+ * IOW, setting BE scheduling class with prio 2 is done ala:
+ *
+ * unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2;
+ *
+ * ioprio_set(PRIO_PROCESS, pid, prio);
+ *
+ * See also Documentation/block/ioprio.txt
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/ioprio.h>
+#include <linux/blkdev.h>
+
+static int set_task_ioprio(struct task_struct *task, int ioprio)
+{
+	struct io_context *ioc;
+
+	if (task->uid != current->euid &&
+	    task->uid != current->uid && !capable(CAP_SYS_NICE))
+		return -EPERM;
+
+	task_lock(task);
+
+	task->ioprio = ioprio;
+
+	ioc = task->io_context;
+	if (ioc && ioc->set_ioprio)
+		ioc->set_ioprio(ioc, ioprio);
+
+	task_unlock(task);
+	return 0;
+}
+
+asmlinkage int sys_ioprio_set(int which, int who, int ioprio)
+{
+	int class = IOPRIO_PRIO_CLASS(ioprio);
+	int data = IOPRIO_PRIO_DATA(ioprio);
+	struct task_struct *p, *g;
+	struct user_struct *user;
+	int ret;
+
+	switch (class) {
+		case IOPRIO_CLASS_RT:
+			if (!capable(CAP_SYS_ADMIN))
+				return -EPERM;
+			/* fall through, rt has prio field too */
+		case IOPRIO_CLASS_BE:
+			if (data >= IOPRIO_BE_NR || data < 0)
+				return -EINVAL;
+
+			break;
+		case IOPRIO_CLASS_IDLE:
+			break;
+		default:
+			return -EINVAL;
+	}
+
+	ret = -ESRCH;
+	read_lock_irq(&tasklist_lock);
+	switch (which) {
+		case IOPRIO_WHO_PROCESS:
+			if (!who)
+				p = current;
+			else
+				p = find_task_by_pid(who);
+			if (p)
+				ret = set_task_ioprio(p, ioprio);
+			break;
+		case IOPRIO_WHO_PGRP:
+			if (!who)
+				who = process_group(current);
+			do_each_task_pid(who, PIDTYPE_PGID, p) {
+				ret = set_task_ioprio(p, ioprio);
+				if (ret)
+					break;
+			} while_each_task_pid(who, PIDTYPE_PGID, p);
+			break;
+		case IOPRIO_WHO_USER:
+			if (!who)
+				user = current->user;
+			else
+				user = find_user(who);
+
+			if (!user)
+				break;
+
+			do_each_thread(g, p) {
+				if (p->uid != who)
+					continue;
+				ret = set_task_ioprio(p, ioprio);
+				if (ret)
+					break;
+			} while_each_thread(g, p);
+
+			if (who)
+				free_uid(user);
+			break;
+		default:
+			ret = -EINVAL;
+	}
+
+	read_unlock_irq(&tasklist_lock);
+	return ret;
+}
+
+asmlinkage int sys_ioprio_get(int which, int who)
+{
+	struct task_struct *g, *p;
+	struct user_struct *user;
+	int ret = -ESRCH;
+
+	read_lock_irq(&tasklist_lock);
+	switch (which) {
+		case IOPRIO_WHO_PROCESS:
+			if (!who)
+				p = current;
+			else
+				p = find_task_by_pid(who);
+			if (p)
+				ret = p->ioprio;
+			break;
+		case IOPRIO_WHO_PGRP:
+			if (!who)
+				who = process_group(current);
+			do_each_task_pid(who, PIDTYPE_PGID, p) {
+				if (ret == -ESRCH)
+					ret = p->ioprio;
+				else
+					ret = ioprio_best(ret, p->ioprio);
+			} while_each_task_pid(who, PIDTYPE_PGID, p);
+			break;
+		case IOPRIO_WHO_USER:
+			if (!who)
+				user = current->user;
+			else
+				user = find_user(who);
+
+			if (!user)
+				break;
+
+			do_each_thread(g, p) {
+				if (p->uid != user->uid)
+					continue;
+				if (ret == -ESRCH)
+					ret = p->ioprio;
+				else
+					ret = ioprio_best(ret, p->ioprio);
+			} while_each_thread(g, p);
+
+			if (who)
+				free_uid(user);
+			break;
+		default:
+			ret = -EINVAL;
+	}
+
+	read_unlock_irq(&tasklist_lock);
+	return ret;
+}
+
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@ -645,18 +645,22 @@ struct buffer_chunk {

 static void write_chunk(struct buffer_chunk *chunk) {
    int i;
+    get_fs_excl();
    for (i = 0; i < chunk->nr ; i++) {
 	submit_logged_buffer(chunk->bh[i]) ;
    }
    chunk->nr = 0;
+    put_fs_excl();
 }

 static void write_ordered_chunk(struct buffer_chunk *chunk) {
    int i;
+    get_fs_excl();
    for (i = 0; i < chunk->nr ; i++) {
 	submit_ordered_buffer(chunk->bh[i]) ;
    }
    chunk->nr = 0;
+    put_fs_excl();
 }

 static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh,
@ -918,6 +922,8 @@ static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list
    return 0 ;
  }

+  get_fs_excl();
+
  /* before we can put our commit blocks on disk, we have to make sure everyone older than
  ** us is on disk too
  */
@ -1055,6 +1061,7 @@ put_jl:

  if (retval)
    reiserfs_abort (s, retval, "Journal write error in %s", __FUNCTION__);
+  put_fs_excl();
  return retval;
 }

@ -1251,6 +1258,8 @@ static int flush_journal_list(struct super_block *s,
    return 0 ;
  }

+  get_fs_excl();
+
  /* if all the work is already done, get out of here */
  if (atomic_read(&(jl->j_nonzerolen)) <= 0 && 
      atomic_read(&(jl->j_commit_left)) <= 0) {
@ -1450,6 +1459,7 @@ flush_older_and_return:
  put_journal_list(s, jl);
  if (flushall)
    up(&journal->j_flush_sem);
+  put_fs_excl();
  return err ;
 } 

@ -2719,6 +2729,7 @@ relock:
  th->t_trans_id = journal->j_trans_id ;
  unlock_journal(p_s_sb) ;
  INIT_LIST_HEAD (&th->t_list);
+  get_fs_excl();
  return 0 ;

 out_fail:
@ -3526,6 +3537,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
  BUG_ON (th->t_refcount > 1);
  BUG_ON (!th->t_trans_id);

+  put_fs_excl();
  current->journal_info = th->t_handle_save;
  reiserfs_check_lock_depth(p_s_sb, "journal end");
  if (journal->j_len == 0) {
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@ -294,8 +294,10 @@
 #define __NR_add_key		286
 #define __NR_request_key	287
 #define __NR_keyctl		288
+#define __NR_ioprio_set		289
+#define __NR_ioprio_get		290

-#define NR_syscalls 289
+#define NR_syscalls 291

 /*
 * user-visible error numbers are in the range -1 - -128: see
--- a/include/asm-ia64/unistd.h
+++ b/include/asm-ia64/unistd.h
@ -263,6 +263,8 @@
 #define __NR_add_key			1271
 #define __NR_request_key		1272
 #define __NR_keyctl			1273
+#define __NR_ioprio_set			1274
+#define __NR_ioprio_get			1275
 #define __NR_set_zone_reclaim		1276

 #ifdef __KERNEL__
--- a/include/asm-ppc/unistd.h
+++ b/include/asm-ppc/unistd.h
@ -277,8 +277,10 @@
 #define __NR_request_key	270
 #define __NR_keyctl		271
 #define __NR_waitid		272
+#define __NR_ioprio_set		273
+#define __NR_ioprio_get		274

-#define __NR_syscalls		273
+#define __NR_syscalls		275

 #define __NR(n)	#n

--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@ -561,8 +561,12 @@ __SYSCALL(__NR_add_key, sys_add_key)
 __SYSCALL(__NR_request_key, sys_request_key)
 #define __NR_keyctl		250
 __SYSCALL(__NR_keyctl, sys_keyctl)
+#define __NR_ioprio_set		251
+__SYSCALL(__NR_ioprio_set, sys_ioprio_set)
+#define __NR_ioprio_get		252
+__SYSCALL(__NR_ioprio_get, sys_ioprio_get)

-#define __NR_syscall_max __NR_keyctl
+#define __NR_syscall_max __NR_ioprio_get
 #ifndef __NO_STUBS

 /* user-visible error numbers are in the range -1 - -4095 */
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@ -22,6 +22,7 @@

 #include <linux/highmem.h>
 #include <linux/mempool.h>
+#include <linux/ioprio.h>

 /* Platforms may set this to teach the BIO layer about IOMMU hardware. */
 #include <asm/io.h>
@ -149,6 +150,19 @@ struct bio {
 #define BIO_RW_FAILFAST	3
 #define BIO_RW_SYNC	4

+/*
+ * upper 16 bits of bi_rw define the io priority of this bio
+ */
+#define BIO_PRIO_SHIFT	(8 * sizeof(unsigned long) - IOPRIO_BITS)
+#define bio_prio(bio)	((bio)->bi_rw >> BIO_PRIO_SHIFT)
+#define bio_prio_valid(bio)	ioprio_valid(bio_prio(bio))
+
+#define bio_set_prio(bio, prio)		do {			\
+	WARN_ON(prio >= (1 << IOPRIO_BITS));			\
+	(bio)->bi_rw &= ((1UL << BIO_PRIO_SHIFT) - 1);		\
+	(bio)->bi_rw |= ((unsigned long) (prio) << BIO_PRIO_SHIFT);	\
+} while (0)
+
 /*
 * various member access, note that bio_data should of course not be used
 * on highmem page vectors
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@ -54,16 +54,23 @@ struct as_io_context {

 struct cfq_queue;
 struct cfq_io_context {
-	void (*dtor)(struct cfq_io_context *);
-	void (*exit)(struct cfq_io_context *);
-
-	struct io_context *ioc;
-
 	/*
 	 * circular list of cfq_io_contexts belonging to a process io context
 	 */
 	struct list_head list;
 	struct cfq_queue *cfqq;
+	void *key;
+
+	struct io_context *ioc;
+
+	unsigned long last_end_request;
+	unsigned long last_queue;
+	unsigned long ttime_total;
+	unsigned long ttime_samples;
+	unsigned long ttime_mean;
+
+	void (*dtor)(struct cfq_io_context *);
+	void (*exit)(struct cfq_io_context *);
 };

 /*
@ -73,7 +80,9 @@ struct cfq_io_context {
 */
 struct io_context {
 	atomic_t refcount;
-	pid_t pid;
+	struct task_struct *task;
+
+	int (*set_ioprio)(struct io_context *, unsigned int);

 	/*
 	 * For request batching
@ -81,8 +90,6 @@ struct io_context {
 	unsigned long last_waited; /* Time last woken after wait for request */
 	int nr_batch_requests;     /* Number of requests left in the batch */

-	spinlock_t lock;
-
 	struct as_io_context *aic;
 	struct cfq_io_context *cic;
 };
@ -134,6 +141,8 @@ struct request {

 	void *elevator_private;

+	unsigned short ioprio;
+
 	int rq_status;	/* should split this into a few status bits */
 	struct gendisk *rq_disk;
 	int errors;
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@ -16,9 +16,9 @@ typedef void (elevator_remove_req_fn) (request_queue_t *, struct request *);
 typedef void (elevator_requeue_req_fn) (request_queue_t *, struct request *);
 typedef struct request *(elevator_request_list_fn) (request_queue_t *, struct request *);
 typedef void (elevator_completed_req_fn) (request_queue_t *, struct request *);
-typedef int (elevator_may_queue_fn) (request_queue_t *, int);
+typedef int (elevator_may_queue_fn) (request_queue_t *, int, struct bio *);

-typedef int (elevator_set_req_fn) (request_queue_t *, struct request *, int);
+typedef int (elevator_set_req_fn) (request_queue_t *, struct request *, struct bio *, int);
 typedef void (elevator_put_req_fn) (request_queue_t *, struct request *);
 typedef void (elevator_deactivate_req_fn) (request_queue_t *, struct request *);

@ -96,9 +96,9 @@ extern struct request *elv_former_request(request_queue_t *, struct request *);
 extern struct request *elv_latter_request(request_queue_t *, struct request *);
 extern int elv_register_queue(request_queue_t *q);
 extern void elv_unregister_queue(request_queue_t *q);
-extern int elv_may_queue(request_queue_t *, int);
+extern int elv_may_queue(request_queue_t *, int, struct bio *);
 extern void elv_completed_request(request_queue_t *, struct request *);
-extern int elv_set_request(request_queue_t *, struct request *, int);
+extern int elv_set_request(request_queue_t *, struct request *, struct bio *, int);
 extern void elv_put_request(request_queue_t *, struct request *);

 /*
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@ -213,6 +213,7 @@ extern int dir_notify_enable;
 #include <linux/radix-tree.h>
 #include <linux/prio_tree.h>
 #include <linux/init.h>
+#include <linux/sched.h>

 #include <asm/atomic.h>
 #include <asm/semaphore.h>
@ -822,16 +823,34 @@ enum {
 #define vfs_check_frozen(sb, level) \
 	wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level)))

+static inline void get_fs_excl(void)
+{
+	atomic_inc(&current->fs_excl);
+}
+
+static inline void put_fs_excl(void)
+{
+	atomic_dec(&current->fs_excl);
+}
+
+static inline int has_fs_excl(void)
+{
+	return atomic_read(&current->fs_excl);
+}
+
+
 /*
 * Superblock locking.
 */
 static inline void lock_super(struct super_block * sb)
 {
+	get_fs_excl();
 	down(&sb->s_lock);
 }

 static inline void unlock_super(struct super_block * sb)
 {
+	put_fs_excl();
 	up(&sb->s_lock);
 }

--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@ -81,6 +81,7 @@ extern struct group_info init_groups;
 	.mm		= NULL,						\
 	.active_mm	= &init_mm,					\
 	.run_list	= LIST_HEAD_INIT(tsk.run_list),			\
+	.ioprio		= 0,						\
 	.time_slice	= HZ,						\
 	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
 	.ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children),		\
@ -110,6 +111,7 @@ extern struct group_info init_groups;
 	.proc_lock	= SPIN_LOCK_UNLOCKED,				\
 	.journal_info	= NULL,						\
 	.cpu_timers	= INIT_CPU_TIMERS(tsk.cpu_timers),		\
+	.fs_excl	= ATOMIC_INIT(0),				\
 }


--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@ -0,0 +1,87 @@
+#ifndef IOPRIO_H
+#define IOPRIO_H
+
+#include <linux/sched.h>
+
+/*
+ * Gives us 8 prio classes with 13-bits of data for each class
+ */
+#define IOPRIO_BITS		(16)
+#define IOPRIO_CLASS_SHIFT	(13)
+#define IOPRIO_PRIO_MASK	((1UL << IOPRIO_CLASS_SHIFT) - 1)
+
+#define IOPRIO_PRIO_CLASS(mask)	((mask) >> IOPRIO_CLASS_SHIFT)
+#define IOPRIO_PRIO_DATA(mask)	((mask) & IOPRIO_PRIO_MASK)
+
+#define ioprio_valid(mask)	(IOPRIO_PRIO_CLASS((mask)) != IOPRIO_CLASS_NONE)
+
+/*
+ * These are the io priority groups as implemented by CFQ. RT is the realtime
+ * class, it always gets premium service. BE is the best-effort scheduling
+ * class, the default for any process. IDLE is the idle scheduling class, it
+ * is only served when no one else is using the disk.
+ */
+enum {
+	IOPRIO_CLASS_NONE,
+	IOPRIO_CLASS_RT,
+	IOPRIO_CLASS_BE,
+	IOPRIO_CLASS_IDLE,
+};
+
+/*
+ * 8 best effort priority levels are supported
+ */
+#define IOPRIO_BE_NR	(8)
+
+asmlinkage int sys_ioprio_set(int, int, int);
+asmlinkage int sys_ioprio_get(int, int);
+
+enum {
+	IOPRIO_WHO_PROCESS = 1,
+	IOPRIO_WHO_PGRP,
+	IOPRIO_WHO_USER,
+};
+
+/*
+ * if process has set io priority explicitly, use that. if not, convert
+ * the cpu scheduler nice value to an io priority
+ */
+#define IOPRIO_NORM	(4)
+static inline int task_ioprio(struct task_struct *task)
+{
+	WARN_ON(!ioprio_valid(task->ioprio));
+	return IOPRIO_PRIO_DATA(task->ioprio);
+}
+
+static inline int task_nice_ioprio(struct task_struct *task)
+{
+	return (task_nice(task) + 20) / 5;
+}
+
+/*
+ * For inheritance, return the highest of the two given priorities
+ */
+static inline int ioprio_best(unsigned short aprio, unsigned short bprio)
+{
+	unsigned short aclass = IOPRIO_PRIO_CLASS(aprio);
+	unsigned short bclass = IOPRIO_PRIO_CLASS(bprio);
+
+	if (!ioprio_valid(aprio))
+		return bprio;
+	if (!ioprio_valid(bprio))
+		return aprio;
+
+	if (aclass == IOPRIO_CLASS_NONE)
+		aclass = IOPRIO_CLASS_BE;
+	if (bclass == IOPRIO_CLASS_NONE)
+		bclass = IOPRIO_CLASS_BE;
+
+	if (aclass == bclass)
+		return min(aprio, bprio);
+	if (aclass > bclass)
+		return bprio;
+	else
+		return aprio;
+}
+
+#endif
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@ -608,6 +608,8 @@ struct task_struct {
 	struct list_head run_list;
 	prio_array_t *array;

+	unsigned short ioprio;
+
 	unsigned long sleep_avg;
 	unsigned long long timestamp, last_ran;
 	unsigned long long sched_time; /* sched_clock time spent running */
@ -763,6 +765,7 @@ struct task_struct {
 	nodemask_t mems_allowed;
 	int cpuset_mems_generation;
 #endif
+	atomic_t fs_excl;	/* holding fs exclusive resources */
 };

 static inline pid_t process_group(struct task_struct *tsk)
@ -1112,7 +1115,8 @@ extern void unhash_process(struct task_struct *p);

 /*
 * Protects ->fs, ->files, ->mm, ->ptrace, ->group_info, ->comm, keyring
- * subscriptions and synchronises with wait4().  Also used in procfs.
+ * subscriptions and synchronises with wait4().  Also used in procfs.  Also
+ * pins the final release of task.io_context.
 *
 * Nests both inside and outside of read_lock(&tasklist_lock).
 * It must not be nested with write_lock_irq(&tasklist_lock),
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@ -14,11 +14,13 @@ extern struct list_head inode_unused;
 * Yes, writeback.h requires sched.h
 * No, sched.h is not included from here.
 */
-static inline int current_is_pdflush(void)
+static inline int task_is_pdflush(struct task_struct *task)
 {
-	return current->flags & PF_FLUSHER;
+	return task->flags & PF_FLUSHER;
 }

+#define current_is_pdflush()	task_is_pdflush(current)
+
 /*
 * fs/fs-writeback.c
 */
--- a/kernel/exit.c
+++ b/kernel/exit.c
@ -784,6 +784,8 @@ fastcall NORET_TYPE void do_exit(long code)

 	profile_task_exit(tsk);

+	WARN_ON(atomic_read(&tsk->fs_excl));
+
 	if (unlikely(in_interrupt()))
 		panic("Aiee, killing interrupt handler!");
 	if (unlikely(!tsk->pid))
--- a/kernel/fork.c
+++ b/kernel/fork.c
@ -1090,6 +1090,11 @@ static task_t *copy_process(unsigned long clone_flags,
 		spin_unlock(&current->sighand->siglock);
 	}

+	/*
+	 * inherit ioprio
+	 */
+	p->ioprio = current->ioprio;
+
 	SET_LINKS(p);
 	if (unlikely(p->ptrace & PT_PTRACED))
 		__ptrace_link(p, current->parent);
--- a/kernel/sched.c
+++ b/kernel/sched.c
@ -3448,15 +3448,7 @@ int task_nice(const task_t *p)
 {
 	return TASK_NICE(p);
 }
-
-/*
- * The only users of task_nice are binfmt_elf and binfmt_elf32.
- * binfmt_elf is no longer modular, but binfmt_elf32 still is.
- * Therefore, task_nice is needed if there is a compat_mode.
- */
-#ifdef CONFIG_COMPAT
 EXPORT_SYMBOL_GPL(task_nice);
-#endif

 /**
 * idle_cpu - is a given cpu idle currently?