author | Felix Domke <tmbinc@elitedvb.net> | 2008-07-14 06:43:09 +0000
---|---|---
committer | Felix Domke <tmbinc@elitedvb.net> | 2008-07-14 06:43:09 +0000
commit | cd21ab24b9b40a4f5d45c9ee42f15b795e1fb862 (patch)
tree | e371dc413494245a947dacfdab3d886e3ba584ba /packages/linux
parent | a45b36e9ec6383550c2406e3215faf0a47abac65 (diff)
download | openembedded-cd21ab24b9b40a4f5d45c9ee42f15b795e1fb862.tar.gz
linux-dm800: keep big patches outside metadata
Diffstat (limited to 'packages/linux')
-rw-r--r-- | packages/linux/linux-dm800.bb | 2
-rw-r--r-- | packages/linux/linux-dm800/linux-2.6.12-add-ioprio.patch | 4148
-rw-r--r-- | packages/linux/linux-dm800/linux-2.6.12-dm8000-nand.patch | 235
3 files changed, 1 insertion(+), 4384 deletions(-)
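The recipe change that follows is a two-line swap: the 4000-plus-line ioprio patch leaves the metadata tree and is instead fetched at build time as a bzip2-compressed download from sources.dreamboxupdate.com — which is what the commit message means by keeping big patches outside the metadata. The before/after SRC_URI entries, taken straight from the diff below (`patch=1;pnum=1` are the old-style OpenEmbedded fetcher options that apply the file with `patch -p1`):

```
# before: patch shipped inside the metadata repository
SRC_URI += "file://linux-2.6.12-add-ioprio.patch;patch=1;pnum=1"

# after: same patch fetched as a compressed download at build time
SRC_URI += "http://sources.dreamboxupdate.com/download/kernel-patches/linux-2.6.12-add-ioprio.patch.bz2;patch=1;pnum=1"
```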
diff --git a/packages/linux/linux-dm800.bb b/packages/linux/linux-dm800.bb index 2fcb7a6e89..1b07a88615 100644 --- a/packages/linux/linux-dm800.bb +++ b/packages/linux/linux-dm800.bb @@ -14,8 +14,8 @@ SRC_URI += "ftp://ftp.kernel.org/pub/linux/kernel/v2.6/linux-${KV}.tar.bz2 \ http://sources.dreamboxupdate.com/download/kernel-patches/linux-2.6.12-dvb-core-fix-several-locking-problems.patch.bz2;patch=1;pnum=1 \ http://sources.dreamboxupdate.com/download/kernel-patches/linux-2.6.12-dvbapi-pilot-rolloff-extension-r0.patch.bz2;patch=1;pnum=1\ http://sources.dreamboxupdate.com/download/kernel-patches/linux-2.6.12-update-wireless.patch.bz2;patch=1;pnum=1\ + http://sources.dreamboxupdate.com/download/kernel-patches/linux-2.6.12-add-ioprio.patch.bz2;patch=1;pnum=1 \ file://linux-2.6.12-dream-misc.patch;patch=1;pnum=1 \ - file://linux-2.6.12-add-ioprio.patch;patch=1;pnum=1 \ file://linux-2.6.12-fix-serial.patch;patch=1;pnum=1 \ file://linux-2.6.12-dm800-flash-layout.patch;patch=1;pnum=1 \ file://linux-2.6.12-dream-temp.patch;patch=1;pnum=1 \ diff --git a/packages/linux/linux-dm800/linux-2.6.12-add-ioprio.patch b/packages/linux/linux-dm800/linux-2.6.12-add-ioprio.patch deleted file mode 100644 index ef35dd3fe9..0000000000 --- a/packages/linux/linux-dm800/linux-2.6.12-add-ioprio.patch +++ /dev/null @@ -1,4148 +0,0 @@ -diff -Naur 2.6.12-5.0-org/Documentation/block/ioprio.txt 2.6.12-5.0-patched/Documentation/block/ioprio.txt ---- 2.6.12-5.0-org/Documentation/block/ioprio.txt 1970-01-01 01:00:00.000000000 +0100 -+++ 2.6.12-5.0-patched/Documentation/block/ioprio.txt 2007-12-11 12:34:52.000000000 +0100 -@@ -0,0 +1,179 @@ -+Block io priorities -+=================== -+ -+ -+Intro -+----- -+ -+With the introduction of cfq v3 (aka cfq-ts or time sliced cfq), basic io -+priorities is supported for reads on files. This enables users to io nice -+processes or process groups, similar to what has been possible to cpu -+scheduling for ages. This document mainly details the current possibilites -+with cfq, other io schedulers do not support io priorities so far. -+ -+Scheduling classes -+------------------ -+ -+CFQ implements three generic scheduling classes that determine how io is -+served for a process. -+ -+IOPRIO_CLASS_RT: This is the realtime io class. This scheduling class is given -+higher priority than any other in the system, processes from this class are -+given first access to the disk every time. Thus it needs to be used with some -+care, one io RT process can starve the entire system. Within the RT class, -+there are 8 levels of class data that determine exactly how much time this -+process needs the disk for on each service. In the future this might change -+to be more directly mappable to performance, by passing in a wanted data -+rate instead. -+ -+IOPRIO_CLASS_BE: This is the best-effort scheduling class, which is the default -+for any process that hasn't set a specific io priority. The class data -+determines how much io bandwidth the process will get, it's directly mappable -+to the cpu nice levels just more coarsely implemented. 0 is the highest -+BE prio level, 7 is the lowest. The mapping between cpu nice level and io -+nice level is determined as: io_nice = (cpu_nice + 20) / 5. -+ -+IOPRIO_CLASS_IDLE: This is the idle scheduling class, processes running at this -+level only get io time when no one else needs the disk. The idle class has no -+class data, since it doesn't really apply here. -+ -+Tools -+----- -+ -+See below for a sample ionice tool. 
Usage: -+ -+# ionice -c<class> -n<level> -p<pid> -+ -+If pid isn't given, the current process is assumed. IO priority settings -+are inherited on fork, so you can use ionice to start the process at a given -+level: -+ -+# ionice -c2 -n0 /bin/ls -+ -+will run ls at the best-effort scheduling class at the highest priority. -+For a running process, you can give the pid instead: -+ -+# ionice -c1 -n2 -p100 -+ -+will change pid 100 to run at the realtime scheduling class, at priority 2. -+ -+---> snip ionice.c tool <--- -+ -+#include <stdio.h> -+#include <stdlib.h> -+#include <errno.h> -+#include <getopt.h> -+#include <unistd.h> -+#include <sys/ptrace.h> -+#include <asm/unistd.h> -+ -+extern int sys_ioprio_set(int, int, int); -+extern int sys_ioprio_get(int, int); -+ -+#if defined(__i386__) -+#define __NR_ioprio_set 289 -+#define __NR_ioprio_get 290 -+#elif defined(__ppc__) -+#define __NR_ioprio_set 273 -+#define __NR_ioprio_get 274 -+#elif defined(__x86_64__) -+#define __NR_ioprio_set 251 -+#define __NR_ioprio_get 252 -+#elif defined(__ia64__) -+#define __NR_ioprio_set 1274 -+#define __NR_ioprio_get 1275 -+#elif defined(__mips__) -+#define __NR_ioprio_set 4284 -+#define __NR_ioprio_get 4285 -+#else -+#error "Unsupported arch" -+#endif -+ -+_syscall3(int, ioprio_set, int, which, int, who, int, ioprio); -+_syscall2(int, ioprio_get, int, which, int, who); -+ -+enum { -+ IOPRIO_CLASS_NONE, -+ IOPRIO_CLASS_RT, -+ IOPRIO_CLASS_BE, -+ IOPRIO_CLASS_IDLE, -+}; -+ -+enum { -+ IOPRIO_WHO_PROCESS = 1, -+ IOPRIO_WHO_PGRP, -+ IOPRIO_WHO_USER, -+}; -+ -+#define IOPRIO_CLASS_SHIFT 13 -+ -+const char *to_prio[] = { "none", "realtime", "best-effort", "idle", }; -+ -+int main(int argc, char *argv[]) -+{ -+ int ioprio = 4, set = 0, ioprio_class = IOPRIO_CLASS_BE; -+ int c, pid = 0; -+ -+ while ((c = getopt(argc, argv, "+n:c:p:")) != EOF) { -+ switch (c) { -+ case 'n': -+ ioprio = strtol(optarg, NULL, 10); -+ set = 1; -+ break; -+ case 'c': -+ ioprio_class = strtol(optarg, NULL, 10); -+ set = 1; -+ break; -+ case 'p': -+ pid = strtol(optarg, NULL, 10); -+ break; -+ } -+ } -+ -+ switch (ioprio_class) { -+ case IOPRIO_CLASS_NONE: -+ ioprio_class = IOPRIO_CLASS_BE; -+ break; -+ case IOPRIO_CLASS_RT: -+ case IOPRIO_CLASS_BE: -+ break; -+ case IOPRIO_CLASS_IDLE: -+ ioprio = 7; -+ break; -+ default: -+ printf("bad prio class %d\n", ioprio_class); -+ return 1; -+ } -+ -+ if (!set) { -+ if (!pid && argv[optind]) -+ pid = strtol(argv[optind], NULL, 10); -+ -+ ioprio = ioprio_get(IOPRIO_WHO_PROCESS, pid); -+ -+ printf("pid=%d, %d\n", pid, ioprio); -+ -+ if (ioprio == -1) -+ perror("ioprio_get"); -+ else { -+ ioprio_class = ioprio >> IOPRIO_CLASS_SHIFT; -+ ioprio = ioprio & 0xff; -+ printf("%s: prio %d\n", to_prio[ioprio_class], ioprio); -+ } -+ } else { -+ if (ioprio_set(IOPRIO_WHO_PROCESS, pid, ioprio | ioprio_class << IOPRIO_CLASS_SHIFT) == -1) { -+ perror("ioprio_set"); -+ return 1; -+ } -+ -+ if (argv[optind]) -+ execvp(argv[optind], &argv[optind]); -+ } -+ -+ return 0; -+} -+ -+---> snip ionice.c tool <--- -+ -+ -+March 11 2005, Jens Axboe <axboe@suse.de> -diff -Naur 2.6.12-5.0-org/drivers/block/as-iosched.c 2.6.12-5.0-patched/drivers/block/as-iosched.c ---- 2.6.12-5.0-org/drivers/block/as-iosched.c 2007-07-26 00:53:20.000000000 +0200 -+++ 2.6.12-5.0-patched/drivers/block/as-iosched.c 2007-12-11 12:34:52.000000000 +0100 -@@ -1806,7 +1806,8 @@ - rq->elevator_private = NULL; - } - --static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask) -+static int as_set_request(request_queue_t *q, struct 
request *rq, -+ struct bio *bio, int gfp_mask) - { - struct as_data *ad = q->elevator->elevator_data; - struct as_rq *arq = mempool_alloc(ad->arq_pool, gfp_mask); -@@ -1827,7 +1828,7 @@ - return 1; - } - --static int as_may_queue(request_queue_t *q, int rw) -+static int as_may_queue(request_queue_t *q, int rw, struct bio *bio) - { - int ret = ELV_MQUEUE_MAY; - struct as_data *ad = q->elevator->elevator_data; -diff -Naur 2.6.12-5.0-org/drivers/block/cfq-iosched.c 2.6.12-5.0-patched/drivers/block/cfq-iosched.c ---- 2.6.12-5.0-org/drivers/block/cfq-iosched.c 2007-07-26 00:53:20.000000000 +0200 -+++ 2.6.12-5.0-patched/drivers/block/cfq-iosched.c 2007-12-11 12:34:52.000000000 +0100 -@@ -21,22 +21,34 @@ - #include <linux/hash.h> - #include <linux/rbtree.h> - #include <linux/mempool.h> -- --static unsigned long max_elapsed_crq; --static unsigned long max_elapsed_dispatch; -+#include <linux/ioprio.h> -+#include <linux/writeback.h> - - /* - * tunables - */ - static int cfq_quantum = 4; /* max queue in one round of service */ - static int cfq_queued = 8; /* minimum rq allocate limit per-queue*/ --static int cfq_service = HZ; /* period over which service is avg */ --static int cfq_fifo_expire_r = HZ / 2; /* fifo timeout for sync requests */ --static int cfq_fifo_expire_w = 5 * HZ; /* fifo timeout for async requests */ --static int cfq_fifo_rate = HZ / 8; /* fifo expiry rate */ -+static int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; - static int cfq_back_max = 16 * 1024; /* maximum backwards seek, in KiB */ - static int cfq_back_penalty = 2; /* penalty of a backwards seek */ - -+static int cfq_slice_sync = HZ / 10; -+static int cfq_slice_async = HZ / 25; -+static int cfq_slice_async_rq = 2; -+static int cfq_slice_idle = HZ / 100; -+ -+#define CFQ_IDLE_GRACE (HZ / 10) -+#define CFQ_SLICE_SCALE (5) -+ -+#define CFQ_KEY_ASYNC (0) -+#define CFQ_KEY_ANY (0xffff) -+ -+/* -+ * disable queueing at the driver/hardware level -+ */ -+static int cfq_max_depth = 2; -+ - /* - * for the hash of cfqq inside the cfqd - */ -@@ -55,6 +67,7 @@ - #define list_entry_hash(ptr) hlist_entry((ptr), struct cfq_rq, hash) - - #define list_entry_cfqq(ptr) list_entry((ptr), struct cfq_queue, cfq_list) -+#define list_entry_fifo(ptr) list_entry((ptr), struct request, queuelist) - - #define RQ_DATA(rq) (rq)->elevator_private - -@@ -75,78 +88,110 @@ - #define rb_entry_crq(node) rb_entry((node), struct cfq_rq, rb_node) - #define rq_rb_key(rq) (rq)->sector - --/* -- * threshold for switching off non-tag accounting -- */ --#define CFQ_MAX_TAG (4) -- --/* -- * sort key types and names -- */ --enum { -- CFQ_KEY_PGID, -- CFQ_KEY_TGID, -- CFQ_KEY_UID, -- CFQ_KEY_GID, -- CFQ_KEY_LAST, --}; -- --static char *cfq_key_types[] = { "pgid", "tgid", "uid", "gid", NULL }; -- - static kmem_cache_t *crq_pool; - static kmem_cache_t *cfq_pool; - static kmem_cache_t *cfq_ioc_pool; - -+#define CFQ_PRIO_LISTS IOPRIO_BE_NR -+#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) -+#define cfq_class_be(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_BE) -+#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) -+ -+#define ASYNC (0) -+#define SYNC (1) -+ -+#define cfq_cfqq_dispatched(cfqq) \ -+ ((cfqq)->on_dispatch[ASYNC] + (cfqq)->on_dispatch[SYNC]) -+ -+#define cfq_cfqq_class_sync(cfqq) ((cfqq)->key != CFQ_KEY_ASYNC) -+ -+#define cfq_cfqq_sync(cfqq) \ -+ (cfq_cfqq_class_sync(cfqq) || (cfqq)->on_dispatch[SYNC]) -+ -+/* -+ * Per block device queue structure -+ */ - struct cfq_data { -- struct list_head rr_list; -+ atomic_t ref; -+ 
request_queue_t *queue; -+ -+ /* -+ * rr list of queues with requests and the count of them -+ */ -+ struct list_head rr_list[CFQ_PRIO_LISTS]; -+ struct list_head busy_rr; -+ struct list_head cur_rr; -+ struct list_head idle_rr; -+ unsigned int busy_queues; -+ -+ /* -+ * non-ordered list of empty cfqq's -+ */ - struct list_head empty_list; - -+ /* -+ * cfqq lookup hash -+ */ - struct hlist_head *cfq_hash; -- struct hlist_head *crq_hash; - -- /* queues on rr_list (ie they have pending requests */ -- unsigned int busy_queues; -+ /* -+ * global crq hash for all queues -+ */ -+ struct hlist_head *crq_hash; - - unsigned int max_queued; - -- atomic_t ref; -+ mempool_t *crq_pool; - -- int key_type; -+ int rq_in_driver; - -- mempool_t *crq_pool; -+ /* -+ * schedule slice state info -+ */ -+ /* -+ * idle window management -+ */ -+ struct timer_list idle_slice_timer; -+ struct work_struct unplug_work; - -- request_queue_t *queue; -+ struct cfq_queue *active_queue; -+ struct cfq_io_context *active_cic; -+ int cur_prio, cur_end_prio; -+ unsigned int dispatch_slice; -+ -+ struct timer_list idle_class_timer; - - sector_t last_sector; -+ unsigned long last_end_request; - -- int rq_in_driver; -+ unsigned int rq_starved; - - /* - * tunables, see top of file - */ - unsigned int cfq_quantum; - unsigned int cfq_queued; -- unsigned int cfq_fifo_expire_r; -- unsigned int cfq_fifo_expire_w; -- unsigned int cfq_fifo_batch_expire; -+ unsigned int cfq_fifo_expire[2]; - unsigned int cfq_back_penalty; - unsigned int cfq_back_max; -- unsigned int find_best_crq; -- -- unsigned int cfq_tagged; -+ unsigned int cfq_slice[2]; -+ unsigned int cfq_slice_async_rq; -+ unsigned int cfq_slice_idle; -+ unsigned int cfq_max_depth; - }; - -+/* -+ * Per process-grouping structure -+ */ - struct cfq_queue { - /* reference count */ - atomic_t ref; - /* parent cfq_data */ - struct cfq_data *cfqd; -- /* hash of mergeable requests */ -+ /* cfqq lookup hash */ - struct hlist_node cfq_hash; - /* hash key */ -- unsigned long key; -- /* whether queue is on rr (or empty) list */ -- int on_rr; -+ unsigned int key; - /* on either rr or empty list of cfqd */ - struct list_head cfq_list; - /* sorted list of pending requests */ -@@ -158,21 +203,22 @@ - /* currently allocated requests */ - int allocated[2]; - /* fifo list of requests in sort_list */ -- struct list_head fifo[2]; -- /* last time fifo expired */ -- unsigned long last_fifo_expire; -+ struct list_head fifo; - -- int key_type; -+ unsigned long slice_start; -+ unsigned long slice_end; -+ unsigned long slice_left; -+ unsigned long service_last; -+ -+ /* number of requests that are on the dispatch list */ -+ int on_dispatch[2]; -+ -+ /* io prio of this group */ -+ unsigned short ioprio, org_ioprio; -+ unsigned short ioprio_class, org_ioprio_class; - -- unsigned long service_start; -- unsigned long service_used; -- -- unsigned int max_rate; -- -- /* number of requests that have been handed to the driver */ -- int in_flight; -- /* number of currently allocated requests */ -- int alloc_limit[2]; -+ /* various state flags, see below */ -+ unsigned int flags; - }; - - struct cfq_rq { -@@ -184,42 +230,78 @@ - struct cfq_queue *cfq_queue; - struct cfq_io_context *io_context; - -- unsigned long service_start; -- unsigned long queue_start; -+ unsigned int crq_flags; -+}; -+ -+enum cfqq_state_flags { -+ CFQ_CFQQ_FLAG_on_rr = 0, -+ CFQ_CFQQ_FLAG_wait_request, -+ CFQ_CFQQ_FLAG_must_alloc, -+ CFQ_CFQQ_FLAG_must_alloc_slice, -+ CFQ_CFQQ_FLAG_must_dispatch, -+ CFQ_CFQQ_FLAG_fifo_expire, -+ 
CFQ_CFQQ_FLAG_idle_window, -+ CFQ_CFQQ_FLAG_prio_changed, -+ CFQ_CFQQ_FLAG_expired, -+}; -+ -+#define CFQ_CFQQ_FNS(name) \ -+static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \ -+{ \ -+ cfqq->flags |= (1 << CFQ_CFQQ_FLAG_##name); \ -+} \ -+static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \ -+{ \ -+ cfqq->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \ -+} \ -+static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \ -+{ \ -+ return (cfqq->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \ -+} - -- unsigned int in_flight : 1; -- unsigned int accounted : 1; -- unsigned int is_sync : 1; -- unsigned int is_write : 1; -+CFQ_CFQQ_FNS(on_rr); -+CFQ_CFQQ_FNS(wait_request); -+CFQ_CFQQ_FNS(must_alloc); -+CFQ_CFQQ_FNS(must_alloc_slice); -+CFQ_CFQQ_FNS(must_dispatch); -+CFQ_CFQQ_FNS(fifo_expire); -+CFQ_CFQQ_FNS(idle_window); -+CFQ_CFQQ_FNS(prio_changed); -+CFQ_CFQQ_FNS(expired); -+#undef CFQ_CFQQ_FNS -+ -+enum cfq_rq_state_flags { -+ CFQ_CRQ_FLAG_in_flight = 0, -+ CFQ_CRQ_FLAG_in_driver, -+ CFQ_CRQ_FLAG_is_sync, -+ CFQ_CRQ_FLAG_requeued, - }; - --static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned long); -+#define CFQ_CRQ_FNS(name) \ -+static inline void cfq_mark_crq_##name(struct cfq_rq *crq) \ -+{ \ -+ crq->crq_flags |= (1 << CFQ_CRQ_FLAG_##name); \ -+} \ -+static inline void cfq_clear_crq_##name(struct cfq_rq *crq) \ -+{ \ -+ crq->crq_flags &= ~(1 << CFQ_CRQ_FLAG_##name); \ -+} \ -+static inline int cfq_crq_##name(const struct cfq_rq *crq) \ -+{ \ -+ return (crq->crq_flags & (1 << CFQ_CRQ_FLAG_##name)) != 0; \ -+} -+ -+CFQ_CRQ_FNS(in_flight); -+CFQ_CRQ_FNS(in_driver); -+CFQ_CRQ_FNS(is_sync); -+CFQ_CRQ_FNS(requeued); -+#undef CFQ_CRQ_FNS -+ -+static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short); - static void cfq_dispatch_sort(request_queue_t *, struct cfq_rq *); --static void cfq_update_next_crq(struct cfq_rq *); - static void cfq_put_cfqd(struct cfq_data *cfqd); - --/* -- * what the fairness is based on (ie how processes are grouped and -- * differentiated) -- */ --static inline unsigned long --cfq_hash_key(struct cfq_data *cfqd, struct task_struct *tsk) --{ -- /* -- * optimize this so that ->key_type is the offset into the struct -- */ -- switch (cfqd->key_type) { -- case CFQ_KEY_PGID: -- return process_group(tsk); -- default: -- case CFQ_KEY_TGID: -- return tsk->tgid; -- case CFQ_KEY_UID: -- return tsk->uid; -- case CFQ_KEY_GID: -- return tsk->gid; -- } --} -+#define process_sync(tsk) ((tsk)->flags & PF_SYNCWRITE) - - /* - * lots of deadline iosched dupes, can be abstracted later... 
-@@ -235,16 +317,12 @@ - - if (q->last_merge == crq->request) - q->last_merge = NULL; -- -- cfq_update_next_crq(crq); - } - - static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq) - { - const int hash_idx = CFQ_MHASH_FN(rq_hash_key(crq->request)); - -- BUG_ON(!hlist_unhashed(&crq->hash)); -- - hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]); - } - -@@ -257,8 +335,6 @@ - struct cfq_rq *crq = list_entry_hash(entry); - struct request *__rq = crq->request; - -- BUG_ON(hlist_unhashed(&crq->hash)); -- - if (!rq_mergeable(__rq)) { - cfq_del_crq_hash(crq); - continue; -@@ -271,6 +347,28 @@ - return NULL; - } - -+static inline int cfq_pending_requests(struct cfq_data *cfqd) -+{ -+ return !list_empty(&cfqd->queue->queue_head) || cfqd->busy_queues; -+} -+ -+/* -+ * scheduler run of queue, if there are requests pending and no one in the -+ * driver that will restart queueing -+ */ -+static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) -+{ -+ if (!cfqd->rq_in_driver && cfq_pending_requests(cfqd)) -+ kblockd_schedule_work(&cfqd->unplug_work); -+} -+ -+static int cfq_queue_empty(request_queue_t *q) -+{ -+ struct cfq_data *cfqd = q->elevator->elevator_data; -+ -+ return !cfq_pending_requests(cfqd); -+} -+ - /* - * Lifted from AS - choose which of crq1 and crq2 that is best served now. - * We choose the request that is closest to the head right now. Distance -@@ -288,35 +386,21 @@ - if (crq2 == NULL) - return crq1; - -+ if (cfq_crq_requeued(crq1) && !cfq_crq_requeued(crq2)) -+ return crq1; -+ else if (cfq_crq_requeued(crq2) && !cfq_crq_requeued(crq1)) -+ return crq2; -+ -+ if (cfq_crq_is_sync(crq1) && !cfq_crq_is_sync(crq2)) -+ return crq1; -+ else if (cfq_crq_is_sync(crq2) && !cfq_crq_is_sync(crq1)) -+ return crq2; -+ - s1 = crq1->request->sector; - s2 = crq2->request->sector; - - last = cfqd->last_sector; - --#if 0 -- if (!list_empty(&cfqd->queue->queue_head)) { -- struct list_head *entry = &cfqd->queue->queue_head; -- unsigned long distance = ~0UL; -- struct request *rq; -- -- while ((entry = entry->prev) != &cfqd->queue->queue_head) { -- rq = list_entry_rq(entry); -- -- if (blk_barrier_rq(rq)) -- break; -- -- if (distance < abs(s1 - rq->sector + rq->nr_sectors)) { -- distance = abs(s1 - rq->sector +rq->nr_sectors); -- last = rq->sector + rq->nr_sectors; -- } -- if (distance < abs(s2 - rq->sector + rq->nr_sectors)) { -- distance = abs(s2 - rq->sector +rq->nr_sectors); -- last = rq->sector + rq->nr_sectors; -- } -- } -- } --#endif -- - /* - * by definition, 1KiB is 2 sectors - */ -@@ -377,11 +461,14 @@ - struct cfq_rq *crq_next = NULL, *crq_prev = NULL; - struct rb_node *rbnext, *rbprev; - -- if (!ON_RB(&last->rb_node)) -- return NULL; -- -- if ((rbnext = rb_next(&last->rb_node)) == NULL) -+ rbnext = NULL; -+ if (ON_RB(&last->rb_node)) -+ rbnext = rb_next(&last->rb_node); -+ if (!rbnext) { - rbnext = rb_first(&cfqq->sort_list); -+ if (rbnext == &last->rb_node) -+ rbnext = NULL; -+ } - - rbprev = rb_prev(&last->rb_node); - -@@ -401,67 +488,53 @@ - cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq); - } - --static int cfq_check_sort_rr_list(struct cfq_queue *cfqq) -+static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted) - { -- struct list_head *head = &cfqq->cfqd->rr_list; -- struct list_head *next, *prev; -- -- /* -- * list might still be ordered -- */ -- next = cfqq->cfq_list.next; -- if (next != head) { -- struct cfq_queue *cnext = list_entry_cfqq(next); -+ struct cfq_data *cfqd = cfqq->cfqd; -+ struct list_head *list, *entry; - -- if 
(cfqq->service_used > cnext->service_used) -- return 1; -- } -+ BUG_ON(!cfq_cfqq_on_rr(cfqq)); - -- prev = cfqq->cfq_list.prev; -- if (prev != head) { -- struct cfq_queue *cprev = list_entry_cfqq(prev); -+ list_del(&cfqq->cfq_list); - -- if (cfqq->service_used < cprev->service_used) -- return 1; -+ if (cfq_class_rt(cfqq)) -+ list = &cfqd->cur_rr; -+ else if (cfq_class_idle(cfqq)) -+ list = &cfqd->idle_rr; -+ else { -+ /* -+ * if cfqq has requests in flight, don't allow it to be -+ * found in cfq_set_active_queue before it has finished them. -+ * this is done to increase fairness between a process that -+ * has lots of io pending vs one that only generates one -+ * sporadically or synchronously -+ */ -+ if (cfq_cfqq_dispatched(cfqq)) -+ list = &cfqd->busy_rr; -+ else -+ list = &cfqd->rr_list[cfqq->ioprio]; - } - -- return 0; --} -- --static void cfq_sort_rr_list(struct cfq_queue *cfqq, int new_queue) --{ -- struct list_head *entry = &cfqq->cfqd->rr_list; -- -- if (!cfqq->on_rr) -- return; -- if (!new_queue && !cfq_check_sort_rr_list(cfqq)) -+ /* -+ * if queue was preempted, just add to front to be fair. busy_rr -+ * isn't sorted. -+ */ -+ if (preempted || list == &cfqd->busy_rr) { -+ list_add(&cfqq->cfq_list, list); - return; -- -- list_del(&cfqq->cfq_list); -+ } - - /* -- * sort by our mean service_used, sub-sort by in-flight requests -+ * sort by when queue was last serviced - */ -- while ((entry = entry->prev) != &cfqq->cfqd->rr_list) { -+ entry = list; -+ while ((entry = entry->prev) != list) { - struct cfq_queue *__cfqq = list_entry_cfqq(entry); - -- if (cfqq->service_used > __cfqq->service_used) -+ if (!__cfqq->service_last) -+ break; -+ if (time_before(__cfqq->service_last, cfqq->service_last)) - break; -- else if (cfqq->service_used == __cfqq->service_used) { -- struct list_head *prv; -- -- while ((prv = entry->prev) != &cfqq->cfqd->rr_list) { -- __cfqq = list_entry_cfqq(prv); -- -- WARN_ON(__cfqq->service_used > cfqq->service_used); -- if (cfqq->service_used != __cfqq->service_used) -- break; -- if (cfqq->in_flight > __cfqq->in_flight) -- break; -- -- entry = prv; -- } -- } - } - - list_add(&cfqq->cfq_list, entry); -@@ -469,28 +542,24 @@ - - /* - * add to busy list of queues for service, trying to be fair in ordering -- * the pending list according to requests serviced -+ * the pending list according to last request service - */ - static inline void --cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) -+cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq, int requeue) - { -- /* -- * it's currently on the empty list -- */ -- cfqq->on_rr = 1; -+ BUG_ON(cfq_cfqq_on_rr(cfqq)); -+ cfq_mark_cfqq_on_rr(cfqq); - cfqd->busy_queues++; - -- if (time_after(jiffies, cfqq->service_start + cfq_service)) -- cfqq->service_used >>= 3; -- -- cfq_sort_rr_list(cfqq, 1); -+ cfq_resort_rr_list(cfqq, requeue); - } - - static inline void - cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) - { -+ BUG_ON(!cfq_cfqq_on_rr(cfqq)); -+ cfq_clear_cfqq_on_rr(cfqq); - list_move(&cfqq->cfq_list, &cfqd->empty_list); -- cfqq->on_rr = 0; - - BUG_ON(!cfqd->busy_queues); - cfqd->busy_queues--; -@@ -505,16 +574,17 @@ - - if (ON_RB(&crq->rb_node)) { - struct cfq_data *cfqd = cfqq->cfqd; -+ const int sync = cfq_crq_is_sync(crq); - -- BUG_ON(!cfqq->queued[crq->is_sync]); -+ BUG_ON(!cfqq->queued[sync]); -+ cfqq->queued[sync]--; - - cfq_update_next_crq(crq); - -- cfqq->queued[crq->is_sync]--; - rb_erase(&crq->rb_node, &cfqq->sort_list); - RB_CLEAR_COLOR(&crq->rb_node); - -- if 
(RB_EMPTY(&cfqq->sort_list) && cfqq->on_rr) -+ if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY(&cfqq->sort_list)) - cfq_del_cfqq_rr(cfqd, cfqq); - } - } -@@ -550,7 +620,7 @@ - struct cfq_rq *__alias; - - crq->rb_key = rq_rb_key(rq); -- cfqq->queued[crq->is_sync]++; -+ cfqq->queued[cfq_crq_is_sync(crq)]++; - - /* - * looks a little odd, but the first insert might return an alias. -@@ -561,8 +631,8 @@ - - rb_insert_color(&crq->rb_node, &cfqq->sort_list); - -- if (!cfqq->on_rr) -- cfq_add_cfqq_rr(cfqd, cfqq); -+ if (!cfq_cfqq_on_rr(cfqq)) -+ cfq_add_cfqq_rr(cfqd, cfqq, cfq_crq_requeued(crq)); - - /* - * check if this request is a better next-serve candidate -@@ -575,17 +645,16 @@ - { - if (ON_RB(&crq->rb_node)) { - rb_erase(&crq->rb_node, &cfqq->sort_list); -- cfqq->queued[crq->is_sync]--; -+ cfqq->queued[cfq_crq_is_sync(crq)]--; - } - - cfq_add_crq_rb(crq); - } - --static struct request * --cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector) -+static struct request *cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector) -+ - { -- const unsigned long key = cfq_hash_key(cfqd, current); -- struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, key); -+ struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->pid, CFQ_KEY_ANY); - struct rb_node *n; - - if (!cfqq) -@@ -609,20 +678,25 @@ - - static void cfq_deactivate_request(request_queue_t *q, struct request *rq) - { -+ struct cfq_data *cfqd = q->elevator->elevator_data; - struct cfq_rq *crq = RQ_DATA(rq); - - if (crq) { - struct cfq_queue *cfqq = crq->cfq_queue; - -- if (cfqq->cfqd->cfq_tagged) { -- cfqq->service_used--; -- cfq_sort_rr_list(cfqq, 0); -+ if (cfq_crq_in_driver(crq)) { -+ cfq_clear_crq_in_driver(crq); -+ WARN_ON(!cfqd->rq_in_driver); -+ cfqd->rq_in_driver--; - } -+ if (cfq_crq_in_flight(crq)) { -+ const int sync = cfq_crq_is_sync(crq); - -- if (crq->accounted) { -- crq->accounted = 0; -- cfqq->cfqd->rq_in_driver--; -+ cfq_clear_crq_in_flight(crq); -+ WARN_ON(!cfqq->on_dispatch[sync]); -+ cfqq->on_dispatch[sync]--; - } -+ cfq_mark_crq_requeued(crq); - } - } - -@@ -640,11 +714,10 @@ - struct cfq_rq *crq = RQ_DATA(rq); - - if (crq) { -- cfq_remove_merge_hints(q, crq); - list_del_init(&rq->queuelist); -+ cfq_del_crq_rb(crq); -+ cfq_remove_merge_hints(q, crq); - -- if (crq->cfq_queue) -- cfq_del_crq_rb(crq); - } - } - -@@ -662,21 +735,15 @@ - } - - __rq = cfq_find_rq_hash(cfqd, bio->bi_sector); -- if (__rq) { -- BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector); -- -- if (elv_rq_merge_ok(__rq, bio)) { -- ret = ELEVATOR_BACK_MERGE; -- goto out; -- } -+ if (__rq && elv_rq_merge_ok(__rq, bio)) { -+ ret = ELEVATOR_BACK_MERGE; -+ goto out; - } - - __rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio)); -- if (__rq) { -- if (elv_rq_merge_ok(__rq, bio)) { -- ret = ELEVATOR_FRONT_MERGE; -- goto out; -- } -+ if (__rq && elv_rq_merge_ok(__rq, bio)) { -+ ret = ELEVATOR_FRONT_MERGE; -+ goto out; - } - - return ELEVATOR_NO_MERGE; -@@ -709,235 +776,496 @@ - cfq_merged_requests(request_queue_t *q, struct request *rq, - struct request *next) - { -- struct cfq_rq *crq = RQ_DATA(rq); -- struct cfq_rq *cnext = RQ_DATA(next); -- - cfq_merged_request(q, rq); - -- if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist)) { -- if (time_before(cnext->queue_start, crq->queue_start)) { -- list_move(&rq->queuelist, &next->queuelist); -- crq->queue_start = cnext->queue_start; -- } -- } -+ /* -+ * reposition in fifo if next is older than rq -+ */ -+ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && -+ time_before(next->start_time, 
rq->start_time)) -+ list_move(&rq->queuelist, &next->queuelist); - -- cfq_update_next_crq(cnext); - cfq_remove_request(q, next); - } - -+static inline void -+__cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) -+{ -+ if (cfqq) { -+ /* -+ * stop potential idle class queues waiting service -+ */ -+ del_timer(&cfqd->idle_class_timer); -+ -+ cfqq->slice_start = jiffies; -+ cfqq->slice_end = 0; -+ cfqq->slice_left = 0; -+ cfq_clear_cfqq_must_alloc_slice(cfqq); -+ cfq_clear_cfqq_fifo_expire(cfqq); -+ cfq_clear_cfqq_expired(cfqq); -+ } -+ -+ cfqd->active_queue = cfqq; -+} -+ - /* -- * we dispatch cfqd->cfq_quantum requests in total from the rr_list queues, -- * this function sector sorts the selected request to minimize seeks. we start -- * at cfqd->last_sector, not 0. -+ * 0 -+ * 0,1 -+ * 0,1,2 -+ * 0,1,2,3 -+ * 0,1,2,3,4 -+ * 0,1,2,3,4,5 -+ * 0,1,2,3,4,5,6 -+ * 0,1,2,3,4,5,6,7 - */ --static void cfq_dispatch_sort(request_queue_t *q, struct cfq_rq *crq) -+static int cfq_get_next_prio_level(struct cfq_data *cfqd) - { -- struct cfq_data *cfqd = q->elevator->elevator_data; -- struct cfq_queue *cfqq = crq->cfq_queue; -- struct list_head *head = &q->queue_head, *entry = head; -- struct request *__rq; -- sector_t last; -- -- cfq_del_crq_rb(crq); -- cfq_remove_merge_hints(q, crq); -- list_del(&crq->request->queuelist); -+ int prio, wrap; - -- last = cfqd->last_sector; -- while ((entry = entry->prev) != head) { -- __rq = list_entry_rq(entry); -+ prio = -1; -+ wrap = 0; -+ do { -+ int p; - -- if (blk_barrier_rq(crq->request)) -- break; -- if (!blk_fs_request(crq->request)) -- break; -+ for (p = cfqd->cur_prio; p <= cfqd->cur_end_prio; p++) { -+ if (!list_empty(&cfqd->rr_list[p])) { -+ prio = p; -+ break; -+ } -+ } - -- if (crq->request->sector > __rq->sector) -- break; -- if (__rq->sector > last && crq->request->sector < last) { -- last = crq->request->sector; -+ if (prio != -1) - break; -+ cfqd->cur_prio = 0; -+ if (++cfqd->cur_end_prio == CFQ_PRIO_LISTS) { -+ cfqd->cur_end_prio = 0; -+ if (wrap) -+ break; -+ wrap = 1; - } -- } -+ } while (1); - -- cfqd->last_sector = last; -- crq->in_flight = 1; -- cfqq->in_flight++; -- list_add(&crq->request->queuelist, entry); --} -+ if (unlikely(prio == -1)) -+ return -1; - --/* -- * return expired entry, or NULL to just start from scratch in rbtree -- */ --static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq) --{ -- struct cfq_data *cfqd = cfqq->cfqd; -- const int reads = !list_empty(&cfqq->fifo[0]); -- const int writes = !list_empty(&cfqq->fifo[1]); -- unsigned long now = jiffies; -- struct cfq_rq *crq; -+ BUG_ON(prio >= CFQ_PRIO_LISTS); - -- if (time_before(now, cfqq->last_fifo_expire + cfqd->cfq_fifo_batch_expire)) -- return NULL; -+ list_splice_init(&cfqd->rr_list[prio], &cfqd->cur_rr); - -- crq = RQ_DATA(list_entry(cfqq->fifo[0].next, struct request, queuelist)); -- if (reads && time_after(now, crq->queue_start + cfqd->cfq_fifo_expire_r)) { -- cfqq->last_fifo_expire = now; -- return crq; -+ cfqd->cur_prio = prio + 1; -+ if (cfqd->cur_prio > cfqd->cur_end_prio) { -+ cfqd->cur_end_prio = cfqd->cur_prio; -+ cfqd->cur_prio = 0; - } -- -- crq = RQ_DATA(list_entry(cfqq->fifo[1].next, struct request, queuelist)); -- if (writes && time_after(now, crq->queue_start + cfqd->cfq_fifo_expire_w)) { -- cfqq->last_fifo_expire = now; -- return crq; -+ if (cfqd->cur_end_prio == CFQ_PRIO_LISTS) { -+ cfqd->cur_prio = 0; -+ cfqd->cur_end_prio = 0; - } - -- return NULL; -+ return prio; - } - --/* -- * dispatch a single request from given queue -- */ 
--static inline void --cfq_dispatch_request(request_queue_t *q, struct cfq_data *cfqd, -- struct cfq_queue *cfqq) -+static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd) - { -- struct cfq_rq *crq; -+ struct cfq_queue *cfqq; - - /* -- * follow expired path, else get first next available -+ * if current queue is expired but not done with its requests yet, -+ * wait for that to happen - */ -- if ((crq = cfq_check_fifo(cfqq)) == NULL) { -- if (cfqd->find_best_crq) -- crq = cfqq->next_crq; -- else -- crq = rb_entry_crq(rb_first(&cfqq->sort_list)); -+ if ((cfqq = cfqd->active_queue) != NULL) { -+ if (cfq_cfqq_expired(cfqq) && cfq_cfqq_dispatched(cfqq)) -+ return NULL; - } - -- cfqd->last_sector = crq->request->sector + crq->request->nr_sectors; -+ /* -+ * if current list is non-empty, grab first entry. if it is empty, -+ * get next prio level and grab first entry then if any are spliced -+ */ -+ if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1) -+ cfqq = list_entry_cfqq(cfqd->cur_rr.next); - - /* -- * finally, insert request into driver list -+ * if we have idle queues and no rt or be queues had pending -+ * requests, either allow immediate service if the grace period -+ * has passed or arm the idle grace timer - */ -- cfq_dispatch_sort(q, crq); -+ if (!cfqq && !list_empty(&cfqd->idle_rr)) { -+ unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE; -+ -+ if (time_after_eq(jiffies, end)) -+ cfqq = list_entry_cfqq(cfqd->idle_rr.next); -+ else -+ mod_timer(&cfqd->idle_class_timer, end); -+ } -+ -+ __cfq_set_active_queue(cfqd, cfqq); -+ return cfqq; - } - --static int cfq_dispatch_requests(request_queue_t *q, int max_dispatch) -+/* -+ * current cfqq expired its slice (or was too idle), select new one -+ */ -+static void -+__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, -+ int preempted) - { -- struct cfq_data *cfqd = q->elevator->elevator_data; -- struct cfq_queue *cfqq; -- struct list_head *entry, *tmp; -- int queued, busy_queues, first_round; -- -- if (list_empty(&cfqd->rr_list)) -- return 0; -+ unsigned long now = jiffies; - -- queued = 0; -- first_round = 1; --restart: -- busy_queues = 0; -- list_for_each_safe(entry, tmp, &cfqd->rr_list) { -- cfqq = list_entry_cfqq(entry); -+ if (cfq_cfqq_wait_request(cfqq)) -+ del_timer(&cfqd->idle_slice_timer); - -- BUG_ON(RB_EMPTY(&cfqq->sort_list)); -+ if (!preempted && !cfq_cfqq_dispatched(cfqq)) -+ cfqq->service_last = now; - -- /* -- * first round of queueing, only select from queues that -- * don't already have io in-flight -- */ -- if (first_round && cfqq->in_flight) -- continue; -+ cfq_clear_cfqq_must_dispatch(cfqq); -+ cfq_clear_cfqq_wait_request(cfqq); - -- cfq_dispatch_request(q, cfqd, cfqq); -+ /* -+ * store what was left of this slice, if the queue idled out -+ * or was preempted -+ */ -+ if (time_after(cfqq->slice_end, now)) -+ cfqq->slice_left = cfqq->slice_end - now; -+ else -+ cfqq->slice_left = 0; - -- if (!RB_EMPTY(&cfqq->sort_list)) -- busy_queues++; -+ if (cfq_cfqq_on_rr(cfqq)) -+ cfq_resort_rr_list(cfqq, preempted); - -- queued++; -- } -+ if (cfqq == cfqd->active_queue) -+ cfqd->active_queue = NULL; - -- if ((queued < max_dispatch) && (busy_queues || first_round)) { -- first_round = 0; -- goto restart; -+ if (cfqd->active_cic) { -+ put_io_context(cfqd->active_cic->ioc); -+ cfqd->active_cic = NULL; - } - -- return queued; -+ cfqd->dispatch_slice = 0; - } - --static inline void cfq_account_dispatch(struct cfq_rq *crq) -+static inline void cfq_slice_expired(struct cfq_data *cfqd, 
int preempted) - { -- struct cfq_queue *cfqq = crq->cfq_queue; -- struct cfq_data *cfqd = cfqq->cfqd; -- unsigned long now, elapsed; -+ struct cfq_queue *cfqq = cfqd->active_queue; - -- if (!blk_fs_request(crq->request)) -- return; -+ if (cfqq) { -+ /* -+ * use deferred expiry, if there are requests in progress as -+ * not to disturb the slice of the next queue -+ */ -+ if (cfq_cfqq_dispatched(cfqq)) -+ cfq_mark_cfqq_expired(cfqq); -+ else -+ __cfq_slice_expired(cfqd, cfqq, preempted); -+ } -+} - -- /* -- * accounted bit is necessary since some drivers will call -- * elv_next_request() many times for the same request (eg ide) -- */ -- if (crq->accounted) -- return; -+static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) - -- now = jiffies; -- if (cfqq->service_start == ~0UL) -- cfqq->service_start = now; -+{ -+ WARN_ON(!RB_EMPTY(&cfqq->sort_list)); -+ WARN_ON(cfqq != cfqd->active_queue); - - /* -- * on drives with tagged command queueing, command turn-around time -- * doesn't necessarily reflect the time spent processing this very -- * command inside the drive. so do the accounting differently there, -- * by just sorting on the number of requests -- */ -- if (cfqd->cfq_tagged) { -- if (time_after(now, cfqq->service_start + cfq_service)) { -- cfqq->service_start = now; -- cfqq->service_used /= 10; -- } -- -- cfqq->service_used++; -- cfq_sort_rr_list(cfqq, 0); -- } -+ * idle is disabled, either manually or by past process history -+ */ -+ if (!cfqd->cfq_slice_idle) -+ return 0; -+ if (!cfq_cfqq_idle_window(cfqq)) -+ return 0; -+ /* -+ * task has exited, don't wait -+ */ -+ if (cfqd->active_cic && !cfqd->active_cic->ioc->task) -+ return 0; -+ -+ cfq_mark_cfqq_must_dispatch(cfqq); -+ cfq_mark_cfqq_wait_request(cfqq); -+ -+ if (!timer_pending(&cfqd->idle_slice_timer)) { -+ unsigned long slice_left = min(cfqq->slice_end - 1, (unsigned long) cfqd->cfq_slice_idle); -+ -+ cfqd->idle_slice_timer.expires = jiffies + slice_left; -+ add_timer(&cfqd->idle_slice_timer); -+ } -+ -+ return 1; -+} -+ -+/* -+ * we dispatch cfqd->cfq_quantum requests in total from the rr_list queues, -+ * this function sector sorts the selected request to minimize seeks. we start -+ * at cfqd->last_sector, not 0. 
-+ */ -+static void cfq_dispatch_sort(request_queue_t *q, struct cfq_rq *crq) -+{ -+ struct cfq_data *cfqd = q->elevator->elevator_data; -+ struct cfq_queue *cfqq = crq->cfq_queue; -+ struct list_head *head = &q->queue_head, *entry = head; -+ struct request *__rq; -+ sector_t last; -+ -+ list_del(&crq->request->queuelist); -+ -+ last = cfqd->last_sector; -+ list_for_each_entry_reverse(__rq, head, queuelist) { -+ struct cfq_rq *__crq = RQ_DATA(__rq); -+ -+ if (blk_barrier_rq(__rq)) -+ break; -+ if (!blk_fs_request(__rq)) -+ break; -+ if (cfq_crq_requeued(__crq)) -+ break; -+ -+ if (__rq->sector <= crq->request->sector) -+ break; -+ if (__rq->sector > last && crq->request->sector < last) { -+ last = crq->request->sector + crq->request->nr_sectors; -+ break; -+ } -+ entry = &__rq->queuelist; -+ } -+ -+ cfqd->last_sector = last; -+ -+ cfqq->next_crq = cfq_find_next_crq(cfqd, cfqq, crq); - -- elapsed = now - crq->queue_start; -- if (elapsed > max_elapsed_dispatch) -- max_elapsed_dispatch = elapsed; -+ cfq_del_crq_rb(crq); -+ cfq_remove_merge_hints(q, crq); -+ -+ cfq_mark_crq_in_flight(crq); -+ cfq_clear_crq_requeued(crq); -+ -+ cfqq->on_dispatch[cfq_crq_is_sync(crq)]++; -+ list_add_tail(&crq->request->queuelist, entry); -+} -+ -+/* -+ * return expired entry, or NULL to just start from scratch in rbtree -+ */ -+static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq) -+{ -+ struct cfq_data *cfqd = cfqq->cfqd; -+ struct request *rq; -+ struct cfq_rq *crq; - -- crq->accounted = 1; -- crq->service_start = now; -+ if (cfq_cfqq_fifo_expire(cfqq)) -+ return NULL; -+ -+ if (!list_empty(&cfqq->fifo)) { -+ int fifo = cfq_cfqq_class_sync(cfqq); - -- if (++cfqd->rq_in_driver >= CFQ_MAX_TAG && !cfqd->cfq_tagged) { -- cfqq->cfqd->cfq_tagged = 1; -- printk("cfq: depth %d reached, tagging now on\n", CFQ_MAX_TAG); -+ crq = RQ_DATA(list_entry_fifo(cfqq->fifo.next)); -+ rq = crq->request; -+ if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) { -+ cfq_mark_cfqq_fifo_expire(cfqq); -+ return crq; -+ } - } -+ -+ return NULL; -+} -+ -+/* -+ * Scale schedule slice based on io priority. Use the sync time slice only -+ * if a queue is marked sync and has sync io queued. A sync queue with async -+ * io only, should not get full sync slice length. -+ */ -+static inline int -+cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) -+{ -+ const int base_slice = cfqd->cfq_slice[cfq_cfqq_sync(cfqq)]; -+ -+ WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); -+ -+ return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - cfqq->ioprio)); -+} -+ -+static inline void -+cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) -+{ -+ cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies; -+} -+ -+static inline int -+cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) -+{ -+ const int base_rq = cfqd->cfq_slice_async_rq; -+ -+ WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); -+ -+ return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); -+} -+ -+/* -+ * get next queue for service -+ */ -+static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd, int force) -+{ -+ unsigned long now = jiffies; -+ struct cfq_queue *cfqq; -+ -+ cfqq = cfqd->active_queue; -+ if (!cfqq) -+ goto new_queue; -+ -+ if (cfq_cfqq_expired(cfqq)) -+ goto new_queue; -+ -+ /* -+ * slice has expired -+ */ -+ if (!cfq_cfqq_must_dispatch(cfqq) && time_after(now, cfqq->slice_end)) -+ goto expire; -+ -+ /* -+ * if queue has requests, dispatch one. 
if not, check if -+ * enough slice is left to wait for one -+ */ -+ if (!RB_EMPTY(&cfqq->sort_list)) -+ goto keep_queue; -+ else if (!force && cfq_cfqq_class_sync(cfqq) && -+ time_before(now, cfqq->slice_end)) { -+ if (cfq_arm_slice_timer(cfqd, cfqq)) -+ return NULL; -+ } -+ -+expire: -+ cfq_slice_expired(cfqd, 0); -+new_queue: -+ cfqq = cfq_set_active_queue(cfqd); -+keep_queue: -+ return cfqq; -+} -+ -+static int -+__cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq, -+ int max_dispatch) -+{ -+ int dispatched = 0; -+ -+ BUG_ON(RB_EMPTY(&cfqq->sort_list)); -+ -+ do { -+ struct cfq_rq *crq; -+ -+ /* -+ * follow expired path, else get first next available -+ */ -+ if ((crq = cfq_check_fifo(cfqq)) == NULL) -+ crq = cfqq->next_crq; -+ -+ /* -+ * finally, insert request into driver dispatch list -+ */ -+ cfq_dispatch_sort(cfqd->queue, crq); -+ -+ cfqd->dispatch_slice++; -+ dispatched++; -+ -+ if (!cfqd->active_cic) { -+ atomic_inc(&crq->io_context->ioc->refcount); -+ cfqd->active_cic = crq->io_context; -+ } -+ -+ if (RB_EMPTY(&cfqq->sort_list)) -+ break; -+ -+ } while (dispatched < max_dispatch); -+ -+ /* -+ * if slice end isn't set yet, set it. if at least one request was -+ * sync, use the sync time slice value -+ */ -+ if (!cfqq->slice_end) -+ cfq_set_prio_slice(cfqd, cfqq); -+ -+ /* -+ * expire an async queue immediately if it has used up its slice. idle -+ * queue always expire after 1 dispatch round. -+ */ -+ if ((!cfq_cfqq_sync(cfqq) && -+ cfqd->dispatch_slice >= cfq_prio_to_maxrq(cfqd, cfqq)) || -+ cfq_class_idle(cfqq)) -+ cfq_slice_expired(cfqd, 0); -+ -+ return dispatched; -+} -+ -+static int -+cfq_dispatch_requests(request_queue_t *q, int max_dispatch, int force) -+{ -+ struct cfq_data *cfqd = q->elevator->elevator_data; -+ struct cfq_queue *cfqq; -+ -+ if (!cfqd->busy_queues) -+ return 0; -+ -+ cfqq = cfq_select_queue(cfqd, force); -+ if (cfqq) { -+ cfq_clear_cfqq_must_dispatch(cfqq); -+ cfq_clear_cfqq_wait_request(cfqq); -+ del_timer(&cfqd->idle_slice_timer); -+ -+ if (cfq_class_idle(cfqq)) -+ max_dispatch = 1; -+ -+ return __cfq_dispatch_requests(cfqd, cfqq, max_dispatch); -+ } -+ -+ return 0; -+} -+ -+static inline void cfq_account_dispatch(struct cfq_rq *crq) -+{ -+ struct cfq_queue *cfqq = crq->cfq_queue; -+ struct cfq_data *cfqd = cfqq->cfqd; -+ -+ if (unlikely(!blk_fs_request(crq->request))) -+ return; -+ -+ /* -+ * accounted bit is necessary since some drivers will call -+ * elv_next_request() many times for the same request (eg ide) -+ */ -+ if (cfq_crq_in_driver(crq)) -+ return; -+ -+ cfq_mark_crq_in_driver(crq); -+ cfqd->rq_in_driver++; - } - - static inline void - cfq_account_completion(struct cfq_queue *cfqq, struct cfq_rq *crq) - { - struct cfq_data *cfqd = cfqq->cfqd; -+ unsigned long now; - -- if (!crq->accounted) -+ if (!cfq_crq_in_driver(crq)) - return; - -+ now = jiffies; -+ - WARN_ON(!cfqd->rq_in_driver); - cfqd->rq_in_driver--; - -- if (!cfqd->cfq_tagged) { -- unsigned long now = jiffies; -- unsigned long duration = now - crq->service_start; -+ if (!cfq_class_idle(cfqq)) -+ cfqd->last_end_request = now; - -- if (time_after(now, cfqq->service_start + cfq_service)) { -- cfqq->service_start = now; -- cfqq->service_used >>= 3; -+ if (!cfq_cfqq_dispatched(cfqq)) { -+ if (cfq_cfqq_on_rr(cfqq)) { -+ cfqq->service_last = now; -+ cfq_resort_rr_list(cfqq, 0); -+ } -+ if (cfq_cfqq_expired(cfqq)) { -+ __cfq_slice_expired(cfqd, cfqq, 0); -+ cfq_schedule_dispatch(cfqd); - } -- -- cfqq->service_used += duration; -- cfq_sort_rr_list(cfqq, 0); -- -- if 
(duration > max_elapsed_crq) -- max_elapsed_crq = duration; - } -+ -+ if (cfq_crq_is_sync(crq)) -+ crq->io_context->last_end_request = now; - } - - static struct request *cfq_next_request(request_queue_t *q) -@@ -950,7 +1278,19 @@ - dispatch: - rq = list_entry_rq(q->queue_head.next); - -- if ((crq = RQ_DATA(rq)) != NULL) { -+ crq = RQ_DATA(rq); -+ if (crq) { -+ struct cfq_queue *cfqq = crq->cfq_queue; -+ -+ /* -+ * if idle window is disabled, allow queue buildup -+ */ -+ if (!cfq_crq_in_driver(crq) && -+ !cfq_cfqq_idle_window(cfqq) && -+ !blk_barrier_rq(rq) && -+ cfqd->rq_in_driver >= cfqd->cfq_max_depth) -+ return NULL; -+ - cfq_remove_merge_hints(q, crq); - cfq_account_dispatch(crq); - } -@@ -958,7 +1298,7 @@ - return rq; - } - -- if (cfq_dispatch_requests(q, cfqd->cfq_quantum)) -+ if (cfq_dispatch_requests(q, cfqd->cfq_quantum, 0)) - goto dispatch; - - return NULL; -@@ -972,13 +1312,21 @@ - */ - static void cfq_put_queue(struct cfq_queue *cfqq) - { -- BUG_ON(!atomic_read(&cfqq->ref)); -+ struct cfq_data *cfqd = cfqq->cfqd; -+ -+ BUG_ON(atomic_read(&cfqq->ref) <= 0); - - if (!atomic_dec_and_test(&cfqq->ref)) - return; - - BUG_ON(rb_first(&cfqq->sort_list)); -- BUG_ON(cfqq->on_rr); -+ BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); -+ BUG_ON(cfq_cfqq_on_rr(cfqq)); -+ -+ if (unlikely(cfqd->active_queue == cfqq)) { -+ __cfq_slice_expired(cfqd, cfqq, 0); -+ cfq_schedule_dispatch(cfqd); -+ } - - cfq_put_cfqd(cfqq->cfqd); - -@@ -991,15 +1339,17 @@ - } - - static inline struct cfq_queue * --__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key, const int hashval) -+__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio, -+ const int hashval) - { - struct hlist_head *hash_list = &cfqd->cfq_hash[hashval]; - struct hlist_node *entry, *next; - - hlist_for_each_safe(entry, next, hash_list) { - struct cfq_queue *__cfqq = list_entry_qhash(entry); -+ const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->ioprio_class, __cfqq->ioprio); - -- if (__cfqq->key == key) -+ if (__cfqq->key == key && (__p == prio || prio == CFQ_KEY_ANY)) - return __cfqq; - } - -@@ -1007,94 +1357,220 @@ - } - - static struct cfq_queue * --cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key) -+cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned short prio) - { -- return __cfq_find_cfq_hash(cfqd, key, hash_long(key, CFQ_QHASH_SHIFT)); -+ return __cfq_find_cfq_hash(cfqd, key, prio, hash_long(key, CFQ_QHASH_SHIFT)); - } - --static inline void --cfq_rehash_cfqq(struct cfq_data *cfqd, struct cfq_queue **cfqq, -- struct cfq_io_context *cic) -+static void cfq_free_io_context(struct cfq_io_context *cic) - { -- unsigned long hashkey = cfq_hash_key(cfqd, current); -- unsigned long hashval = hash_long(hashkey, CFQ_QHASH_SHIFT); -- struct cfq_queue *__cfqq; -- unsigned long flags; -- -- spin_lock_irqsave(cfqd->queue->queue_lock, flags); -+ struct cfq_io_context *__cic; -+ struct list_head *entry, *next; - -- hlist_del(&(*cfqq)->cfq_hash); -- -- __cfqq = __cfq_find_cfq_hash(cfqd, hashkey, hashval); -- if (!__cfqq || __cfqq == *cfqq) { -- __cfqq = *cfqq; -- hlist_add_head(&__cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); -- __cfqq->key_type = cfqd->key_type; -- } else { -- atomic_inc(&__cfqq->ref); -- cic->cfqq = __cfqq; -- cfq_put_queue(*cfqq); -- *cfqq = __cfqq; -+ list_for_each_safe(entry, next, &cic->list) { -+ __cic = list_entry(entry, struct cfq_io_context, list); -+ kmem_cache_free(cfq_ioc_pool, __cic); - } - -- cic->cfqq = __cfqq; -- spin_unlock_irqrestore(cfqd->queue->queue_lock, 
flags); -+ kmem_cache_free(cfq_ioc_pool, cic); - } - --static void cfq_free_io_context(struct cfq_io_context *cic) -+/* -+ * Called with interrupts disabled -+ */ -+static void cfq_exit_single_io_context(struct cfq_io_context *cic) - { -- kmem_cache_free(cfq_ioc_pool, cic); -+ struct cfq_data *cfqd = cic->cfqq->cfqd; -+ request_queue_t *q = cfqd->queue; -+ -+ WARN_ON(!irqs_disabled()); -+ -+ spin_lock(q->queue_lock); -+ -+ if (unlikely(cic->cfqq == cfqd->active_queue)) { -+ __cfq_slice_expired(cfqd, cic->cfqq, 0); -+ cfq_schedule_dispatch(cfqd); -+ } -+ -+ cfq_put_queue(cic->cfqq); -+ cic->cfqq = NULL; -+ spin_unlock(q->queue_lock); - } - - /* -- * locking hierarchy is: io_context lock -> queue locks -+ * Another task may update the task cic list, if it is doing a queue lookup -+ * on its behalf. cfq_cic_lock excludes such concurrent updates - */ - static void cfq_exit_io_context(struct cfq_io_context *cic) - { -- struct cfq_queue *cfqq = cic->cfqq; -- struct list_head *entry = &cic->list; -- request_queue_t *q; -+ struct cfq_io_context *__cic; -+ struct list_head *entry; - unsigned long flags; - -+ local_irq_save(flags); -+ - /* - * put the reference this task is holding to the various queues - */ -- spin_lock_irqsave(&cic->ioc->lock, flags); -- while ((entry = cic->list.next) != &cic->list) { -- struct cfq_io_context *__cic; -- -+ list_for_each(entry, &cic->list) { - __cic = list_entry(entry, struct cfq_io_context, list); -- list_del(entry); -- -- q = __cic->cfqq->cfqd->queue; -- spin_lock(q->queue_lock); -- cfq_put_queue(__cic->cfqq); -- spin_unlock(q->queue_lock); -+ cfq_exit_single_io_context(__cic); - } - -- q = cfqq->cfqd->queue; -- spin_lock(q->queue_lock); -- cfq_put_queue(cfqq); -- spin_unlock(q->queue_lock); -- -- cic->cfqq = NULL; -- spin_unlock_irqrestore(&cic->ioc->lock, flags); -+ cfq_exit_single_io_context(cic); -+ local_irq_restore(flags); - } - --static struct cfq_io_context *cfq_alloc_io_context(int gfp_flags) -+static struct cfq_io_context * -+cfq_alloc_io_context(struct cfq_data *cfqd, int gfp_mask) - { -- struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_flags); -+ struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_mask); - - if (cic) { -- cic->dtor = cfq_free_io_context; -- cic->exit = cfq_exit_io_context; - INIT_LIST_HEAD(&cic->list); - cic->cfqq = NULL; -+ cic->key = NULL; -+ cic->last_end_request = jiffies; -+ cic->ttime_total = 0; -+ cic->ttime_samples = 0; -+ cic->ttime_mean = 0; -+ cic->dtor = cfq_free_io_context; -+ cic->exit = cfq_exit_io_context; - } - - return cic; - } - -+static void cfq_init_prio_data(struct cfq_queue *cfqq) -+{ -+ struct task_struct *tsk = current; -+ int ioprio_class; -+ -+ if (!cfq_cfqq_prio_changed(cfqq)) -+ return; -+ -+ ioprio_class = IOPRIO_PRIO_CLASS(tsk->ioprio); -+ switch (ioprio_class) { -+ default: -+ printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); -+ case IOPRIO_CLASS_NONE: -+ /* -+ * no prio set, place us in the middle of the BE classes -+ */ -+ cfqq->ioprio = task_nice_ioprio(tsk); -+ cfqq->ioprio_class = IOPRIO_CLASS_BE; -+ break; -+ case IOPRIO_CLASS_RT: -+ cfqq->ioprio = task_ioprio(tsk); -+ cfqq->ioprio_class = IOPRIO_CLASS_RT; -+ break; -+ case IOPRIO_CLASS_BE: -+ cfqq->ioprio = task_ioprio(tsk); -+ cfqq->ioprio_class = IOPRIO_CLASS_BE; -+ break; -+ case IOPRIO_CLASS_IDLE: -+ cfqq->ioprio_class = IOPRIO_CLASS_IDLE; -+ cfqq->ioprio = 7; -+ cfq_clear_cfqq_idle_window(cfqq); -+ break; -+ } -+ -+ /* -+ * keep track of original prio settings in case we have to temporarily -+ * elevate the 
priority of this queue -+ */ -+ cfqq->org_ioprio = cfqq->ioprio; -+ cfqq->org_ioprio_class = cfqq->ioprio_class; -+ -+ if (cfq_cfqq_on_rr(cfqq)) -+ cfq_resort_rr_list(cfqq, 0); -+ -+ cfq_clear_cfqq_prio_changed(cfqq); -+} -+ -+static inline void changed_ioprio(struct cfq_queue *cfqq) -+{ -+ if (cfqq) { -+ struct cfq_data *cfqd = cfqq->cfqd; -+ -+ spin_lock(cfqd->queue->queue_lock); -+ cfq_mark_cfqq_prio_changed(cfqq); -+ cfq_init_prio_data(cfqq); -+ spin_unlock(cfqd->queue->queue_lock); -+ } -+} -+ -+/* -+ * callback from sys_ioprio_set, irqs are disabled -+ */ -+static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio) -+{ -+ struct cfq_io_context *cic = ioc->cic; -+ -+ changed_ioprio(cic->cfqq); -+ -+ list_for_each_entry(cic, &cic->list, list) -+ changed_ioprio(cic->cfqq); -+ -+ return 0; -+} -+ -+static struct cfq_queue * -+cfq_get_queue(struct cfq_data *cfqd, unsigned int key, unsigned short ioprio, -+ int gfp_mask) -+{ -+ const int hashval = hash_long(key, CFQ_QHASH_SHIFT); -+ struct cfq_queue *cfqq, *new_cfqq = NULL; -+ -+retry: -+ cfqq = __cfq_find_cfq_hash(cfqd, key, ioprio, hashval); -+ -+ if (!cfqq) { -+ if (new_cfqq) { -+ cfqq = new_cfqq; -+ new_cfqq = NULL; -+ } else { -+ spin_unlock_irq(cfqd->queue->queue_lock); -+ new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask); -+ spin_lock_irq(cfqd->queue->queue_lock); -+ -+ if (!new_cfqq && !(gfp_mask & __GFP_WAIT)) -+ goto out; -+ -+ goto retry; -+ } -+ -+ memset(cfqq, 0, sizeof(*cfqq)); -+ -+ INIT_HLIST_NODE(&cfqq->cfq_hash); -+ INIT_LIST_HEAD(&cfqq->cfq_list); -+ RB_CLEAR_ROOT(&cfqq->sort_list); -+ INIT_LIST_HEAD(&cfqq->fifo); -+ -+ cfqq->key = key; -+ hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); -+ atomic_set(&cfqq->ref, 0); -+ cfqq->cfqd = cfqd; -+ atomic_inc(&cfqd->ref); -+ cfqq->service_last = 0; -+ /* -+ * set ->slice_left to allow preemption for a new process -+ */ -+ cfqq->slice_left = 2 * cfqd->cfq_slice_idle; -+ cfq_mark_cfqq_idle_window(cfqq); -+ cfq_mark_cfqq_prio_changed(cfqq); -+ cfq_init_prio_data(cfqq); -+ } -+ -+ if (new_cfqq) -+ kmem_cache_free(cfq_pool, new_cfqq); -+ -+ atomic_inc(&cfqq->ref); -+out: -+ WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq); -+ return cfqq; -+} -+ - /* - * Setup general io context and cfq io context. 
There can be several cfq - * io contexts per general io context, if this process is doing io to more -@@ -1102,39 +1578,39 @@ - * cfqq, so we don't need to worry about it disappearing - */ - static struct cfq_io_context * --cfq_get_io_context(struct cfq_queue **cfqq, int gfp_flags) -+cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, int gfp_mask) - { -- struct cfq_data *cfqd = (*cfqq)->cfqd; -- struct cfq_queue *__cfqq = *cfqq; -+ struct io_context *ioc = NULL; - struct cfq_io_context *cic; -- struct io_context *ioc; - -- might_sleep_if(gfp_flags & __GFP_WAIT); -+ might_sleep_if(gfp_mask & __GFP_WAIT); - -- ioc = get_io_context(gfp_flags); -+ ioc = get_io_context(gfp_mask); - if (!ioc) - return NULL; - - if ((cic = ioc->cic) == NULL) { -- cic = cfq_alloc_io_context(gfp_flags); -+ cic = cfq_alloc_io_context(cfqd, gfp_mask); - - if (cic == NULL) - goto err; - -+ /* -+ * manually increment generic io_context usage count, it -+ * cannot go away since we are already holding one ref to it -+ */ - ioc->cic = cic; -+ ioc->set_ioprio = cfq_ioc_set_ioprio; - cic->ioc = ioc; -- cic->cfqq = __cfqq; -- atomic_inc(&__cfqq->ref); -+ cic->key = cfqd; -+ atomic_inc(&cfqd->ref); - } else { - struct cfq_io_context *__cic; -- unsigned long flags; - - /* -- * since the first cic on the list is actually the head -- * itself, need to check this here or we'll duplicate an -- * cic per ioc for no reason -+ * the first cic on the list is actually the head itself - */ -- if (cic->cfqq == __cfqq) -+ if (cic->key == cfqd) - goto out; - - /* -@@ -1142,152 +1618,255 @@ - * should be ok here, the list will usually not be more than - * 1 or a few entries long - */ -- spin_lock_irqsave(&ioc->lock, flags); - list_for_each_entry(__cic, &cic->list, list) { - /* - * this process is already holding a reference to - * this queue, so no need to get one more - */ -- if (__cic->cfqq == __cfqq) { -+ if (__cic->key == cfqd) { - cic = __cic; -- spin_unlock_irqrestore(&ioc->lock, flags); - goto out; - } - } -- spin_unlock_irqrestore(&ioc->lock, flags); - - /* - * nope, process doesn't have a cic assoicated with this - * cfqq yet. 
get a new one and add to list - */ -- __cic = cfq_alloc_io_context(gfp_flags); -+ __cic = cfq_alloc_io_context(cfqd, gfp_mask); - if (__cic == NULL) - goto err; - - __cic->ioc = ioc; -- __cic->cfqq = __cfqq; -- atomic_inc(&__cfqq->ref); -- spin_lock_irqsave(&ioc->lock, flags); -+ __cic->key = cfqd; -+ atomic_inc(&cfqd->ref); - list_add(&__cic->list, &cic->list); -- spin_unlock_irqrestore(&ioc->lock, flags); -- - cic = __cic; -- *cfqq = __cfqq; - } - - out: -+ return cic; -+err: -+ put_io_context(ioc); -+ return NULL; -+} -+ -+static void -+cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic) -+{ -+ unsigned long elapsed, ttime; -+ -+ /* -+ * if this context already has stuff queued, thinktime is from -+ * last queue not last end -+ */ -+#if 0 -+ if (time_after(cic->last_end_request, cic->last_queue)) -+ elapsed = jiffies - cic->last_end_request; -+ else -+ elapsed = jiffies - cic->last_queue; -+#else -+ elapsed = jiffies - cic->last_end_request; -+#endif -+ -+ ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle); -+ -+ cic->ttime_samples = (7*cic->ttime_samples + 256) / 8; -+ cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8; -+ cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples; -+} -+ -+#define sample_valid(samples) ((samples) > 80) -+ -+/* -+ * Disable idle window if the process thinks too long or seeks so much that -+ * it doesn't matter -+ */ -+static void -+cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, -+ struct cfq_io_context *cic) -+{ -+ int enable_idle = cfq_cfqq_idle_window(cfqq); -+ -+ if (!cic->ioc->task || !cfqd->cfq_slice_idle) -+ enable_idle = 0; -+ else if (sample_valid(cic->ttime_samples)) { -+ if (cic->ttime_mean > cfqd->cfq_slice_idle) -+ enable_idle = 0; -+ else -+ enable_idle = 1; -+ } -+ -+ if (enable_idle) -+ cfq_mark_cfqq_idle_window(cfqq); -+ else -+ cfq_clear_cfqq_idle_window(cfqq); -+} -+ -+ -+/* -+ * Check if new_cfqq should preempt the currently active queue. Return 0 for -+ * no or if we aren't sure, a 1 will cause a preempt. -+ */ -+static int -+cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, -+ struct cfq_rq *crq) -+{ -+ struct cfq_queue *cfqq = cfqd->active_queue; -+ -+ if (cfq_class_idle(new_cfqq)) -+ return 0; -+ -+ if (!cfqq) -+ return 1; -+ -+ if (cfq_class_idle(cfqq)) -+ return 1; -+ if (!cfq_cfqq_wait_request(new_cfqq)) -+ return 0; - /* -- * if key_type has been changed on the fly, we lazily rehash -- * each queue at lookup time -+ * if it doesn't have slice left, forget it - */ -- if ((*cfqq)->key_type != cfqd->key_type) -- cfq_rehash_cfqq(cfqd, cfqq, cic); -+ if (new_cfqq->slice_left < cfqd->cfq_slice_idle) -+ return 0; -+ if (cfq_crq_is_sync(crq) && !cfq_cfqq_sync(cfqq)) -+ return 1; -+ -+ return 0; -+} -+ -+/* -+ * cfqq preempts the active queue. if we allowed preempt with no slice left, -+ * let it have half of its nominal slice. 
-+ */ -+static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) -+{ -+ struct cfq_queue *__cfqq, *next; -+ -+ list_for_each_entry_safe(__cfqq, next, &cfqd->cur_rr, cfq_list) -+ cfq_resort_rr_list(__cfqq, 1); - -- return cic; --err: -- put_io_context(ioc); -- return NULL; -+ if (!cfqq->slice_left) -+ cfqq->slice_left = cfq_prio_to_slice(cfqd, cfqq) / 2; -+ -+ cfqq->slice_end = cfqq->slice_left + jiffies; -+ __cfq_slice_expired(cfqd, cfqq, 1); -+ __cfq_set_active_queue(cfqd, cfqq); - } - --static struct cfq_queue * --__cfq_get_queue(struct cfq_data *cfqd, unsigned long key, int gfp_mask) -+/* -+ * should really be a ll_rw_blk.c helper -+ */ -+static void cfq_start_queueing(struct cfq_data *cfqd, struct cfq_queue *cfqq) - { -- const int hashval = hash_long(key, CFQ_QHASH_SHIFT); -- struct cfq_queue *cfqq, *new_cfqq = NULL; -- --retry: -- cfqq = __cfq_find_cfq_hash(cfqd, key, hashval); -+ request_queue_t *q = cfqd->queue; - -- if (!cfqq) { -- if (new_cfqq) { -- cfqq = new_cfqq; -- new_cfqq = NULL; -- } else { -- spin_unlock_irq(cfqd->queue->queue_lock); -- new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask); -- spin_lock_irq(cfqd->queue->queue_lock); -+ if (!blk_queue_plugged(q)) -+ q->request_fn(q); -+ else -+ __generic_unplug_device(q); -+} - -- if (!new_cfqq && !(gfp_mask & __GFP_WAIT)) -- goto out; -+/* -+ * Called when a new fs request (crq) is added (to cfqq). Check if there's -+ * something we should do about it -+ */ -+static void -+cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, -+ struct cfq_rq *crq) -+{ -+ struct cfq_io_context *cic; - -- goto retry; -- } -+ cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq); - -- memset(cfqq, 0, sizeof(*cfqq)); -+ /* -+ * we never wait for an async request and we don't allow preemption -+ * of an async request. 
so just return early -+ */ -+ if (!cfq_crq_is_sync(crq)) -+ return; - -- INIT_HLIST_NODE(&cfqq->cfq_hash); -- INIT_LIST_HEAD(&cfqq->cfq_list); -- RB_CLEAR_ROOT(&cfqq->sort_list); -- INIT_LIST_HEAD(&cfqq->fifo[0]); -- INIT_LIST_HEAD(&cfqq->fifo[1]); -+ cic = crq->io_context; - -- cfqq->key = key; -- hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]); -- atomic_set(&cfqq->ref, 0); -- cfqq->cfqd = cfqd; -- atomic_inc(&cfqd->ref); -- cfqq->key_type = cfqd->key_type; -- cfqq->service_start = ~0UL; -- } -+ cfq_update_io_thinktime(cfqd, cic); -+ cfq_update_idle_window(cfqd, cfqq, cic); - -- if (new_cfqq) -- kmem_cache_free(cfq_pool, new_cfqq); -+ cic->last_queue = jiffies; - -- atomic_inc(&cfqq->ref); --out: -- WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq); -- return cfqq; -+ if (cfqq == cfqd->active_queue) { -+ /* -+ * if we are waiting for a request for this queue, let it rip -+ * immediately and flag that we must not expire this queue -+ * just now -+ */ -+ if (cfq_cfqq_wait_request(cfqq)) { -+ cfq_mark_cfqq_must_dispatch(cfqq); -+ del_timer(&cfqd->idle_slice_timer); -+ cfq_start_queueing(cfqd, cfqq); -+ } -+ } else if (cfq_should_preempt(cfqd, cfqq, crq)) { -+ /* -+ * not the active queue - expire current slice if it is -+ * idle and has expired it's mean thinktime or this new queue -+ * has some old slice time left and is of higher priority -+ */ -+ cfq_preempt_queue(cfqd, cfqq); -+ cfq_mark_cfqq_must_dispatch(cfqq); -+ cfq_start_queueing(cfqd, cfqq); -+ } - } - --static void cfq_enqueue(struct cfq_data *cfqd, struct cfq_rq *crq) -+static void cfq_enqueue(struct cfq_data *cfqd, struct request *rq) - { -- crq->is_sync = 0; -- if (rq_data_dir(crq->request) == READ || current->flags & PF_SYNCWRITE) -- crq->is_sync = 1; -+ struct cfq_rq *crq = RQ_DATA(rq); -+ struct cfq_queue *cfqq = crq->cfq_queue; -+ -+ cfq_init_prio_data(cfqq); - - cfq_add_crq_rb(crq); -- crq->queue_start = jiffies; - -- list_add_tail(&crq->request->queuelist, &crq->cfq_queue->fifo[crq->is_sync]); -+ list_add_tail(&rq->queuelist, &cfqq->fifo); -+ -+ if (rq_mergeable(rq)) { -+ cfq_add_crq_hash(cfqd, crq); -+ -+ if (!cfqd->queue->last_merge) -+ cfqd->queue->last_merge = rq; -+ } -+ -+ cfq_crq_enqueued(cfqd, cfqq, crq); - } - - static void - cfq_insert_request(request_queue_t *q, struct request *rq, int where) - { - struct cfq_data *cfqd = q->elevator->elevator_data; -- struct cfq_rq *crq = RQ_DATA(rq); - - switch (where) { - case ELEVATOR_INSERT_BACK: -- while (cfq_dispatch_requests(q, cfqd->cfq_quantum)) -+ while (cfq_dispatch_requests(q, INT_MAX, 1)) - ; - list_add_tail(&rq->queuelist, &q->queue_head); -+ /* -+ * If we were idling with pending requests on -+ * inactive cfqqs, force dispatching will -+ * remove the idle timer and the queue won't -+ * be kicked by __make_request() afterward. -+ * Kick it here. 
-+ */ -+ cfq_schedule_dispatch(cfqd); - break; - case ELEVATOR_INSERT_FRONT: - list_add(&rq->queuelist, &q->queue_head); - break; - case ELEVATOR_INSERT_SORT: - BUG_ON(!blk_fs_request(rq)); -- cfq_enqueue(cfqd, crq); -+ cfq_enqueue(cfqd, rq); - break; - default: - printk("%s: bad insert point %d\n", __FUNCTION__,where); - return; - } -- -- if (rq_mergeable(rq)) { -- cfq_add_crq_hash(cfqd, crq); -- -- if (!q->last_merge) -- q->last_merge = rq; -- } --} -- --static int cfq_queue_empty(request_queue_t *q) --{ -- struct cfq_data *cfqd = q->elevator->elevator_data; -- -- return list_empty(&q->queue_head) && list_empty(&cfqd->rr_list); - } - - static void cfq_completed_request(request_queue_t *q, struct request *rq) -@@ -1300,9 +1879,11 @@ - - cfqq = crq->cfq_queue; - -- if (crq->in_flight) { -- WARN_ON(!cfqq->in_flight); -- cfqq->in_flight--; -+ if (cfq_crq_in_flight(crq)) { -+ const int sync = cfq_crq_is_sync(crq); -+ -+ WARN_ON(!cfqq->on_dispatch[sync]); -+ cfqq->on_dispatch[sync]--; - } - - cfq_account_completion(cfqq, crq); -@@ -1332,51 +1913,136 @@ - return NULL; - } - --static int cfq_may_queue(request_queue_t *q, int rw) -+/* -+ * we temporarily boost lower priority queues if they are holding fs exclusive -+ * resources. they are boosted to normal prio (CLASS_BE/4) -+ */ -+static void cfq_prio_boost(struct cfq_queue *cfqq) - { -- struct cfq_data *cfqd = q->elevator->elevator_data; -- struct cfq_queue *cfqq; -- int ret = ELV_MQUEUE_MAY; -+ const int ioprio_class = cfqq->ioprio_class; -+ const int ioprio = cfqq->ioprio; - -- if (current->flags & PF_MEMALLOC) -- return ELV_MQUEUE_MAY; -+ if (has_fs_excl()) { -+ /* -+ * boost idle prio on transactions that would lock out other -+ * users of the filesystem -+ */ -+ if (cfq_class_idle(cfqq)) -+ cfqq->ioprio_class = IOPRIO_CLASS_BE; -+ if (cfqq->ioprio > IOPRIO_NORM) -+ cfqq->ioprio = IOPRIO_NORM; -+ } else { -+ /* -+ * check if we need to unboost the queue -+ */ -+ if (cfqq->ioprio_class != cfqq->org_ioprio_class) -+ cfqq->ioprio_class = cfqq->org_ioprio_class; -+ if (cfqq->ioprio != cfqq->org_ioprio) -+ cfqq->ioprio = cfqq->org_ioprio; -+ } - -- cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(cfqd, current)); -- if (cfqq) { -- int limit = cfqd->max_queued; -+ /* -+ * refile between round-robin lists if we moved the priority class -+ */ -+ if ((ioprio_class != cfqq->ioprio_class || ioprio != cfqq->ioprio) && -+ cfq_cfqq_on_rr(cfqq)) -+ cfq_resort_rr_list(cfqq, 0); -+} - -- if (cfqq->allocated[rw] < cfqd->cfq_queued) -- return ELV_MQUEUE_MUST; -+static inline pid_t cfq_queue_pid(struct task_struct *task, int rw) -+{ -+ if (rw == READ || process_sync(task)) -+ return task->pid; -+ -+ return CFQ_KEY_ASYNC; -+} - -- if (cfqd->busy_queues) -- limit = q->nr_requests / cfqd->busy_queues; -+static inline int -+__cfq_may_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq, -+ struct task_struct *task, int rw) -+{ -+#if 1 -+ if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) && -+ !cfq_cfqq_must_alloc_slice(cfqq)) { -+ cfq_mark_cfqq_must_alloc_slice(cfqq); -+ return ELV_MQUEUE_MUST; -+ } - -- if (limit < cfqd->cfq_queued) -- limit = cfqd->cfq_queued; -- else if (limit > cfqd->max_queued) -- limit = cfqd->max_queued; -- -- if (cfqq->allocated[rw] >= limit) { -- if (limit > cfqq->alloc_limit[rw]) -- cfqq->alloc_limit[rw] = limit; -+ return ELV_MQUEUE_MAY; -+#else -+ if (!cfqq || task->flags & PF_MEMALLOC) -+ return ELV_MQUEUE_MAY; -+ if (!cfqq->allocated[rw] || cfq_cfqq_must_alloc(cfqq)) { -+ if (cfq_cfqq_wait_request(cfqq)) -+ return 
ELV_MQUEUE_MUST; - -- ret = ELV_MQUEUE_NO; -+ /* -+ * only allow 1 ELV_MQUEUE_MUST per slice, otherwise we -+ * can quickly flood the queue with writes from a single task -+ */ -+ if (rw == READ || !cfq_cfqq_must_alloc_slice(cfqq)) { -+ cfq_mark_cfqq_must_alloc_slice(cfqq); -+ return ELV_MQUEUE_MUST; - } -+ -+ return ELV_MQUEUE_MAY; - } -+ if (cfq_class_idle(cfqq)) -+ return ELV_MQUEUE_NO; -+ if (cfqq->allocated[rw] >= cfqd->max_queued) { -+ struct io_context *ioc = get_io_context(GFP_ATOMIC); -+ int ret = ELV_MQUEUE_NO; - -- return ret; -+ if (ioc && ioc->nr_batch_requests) -+ ret = ELV_MQUEUE_MAY; -+ -+ put_io_context(ioc); -+ return ret; -+ } -+ -+ return ELV_MQUEUE_MAY; -+#endif -+} -+ -+static int cfq_may_queue(request_queue_t *q, int rw, struct bio *bio) -+{ -+ struct cfq_data *cfqd = q->elevator->elevator_data; -+ struct task_struct *tsk = current; -+ struct cfq_queue *cfqq; -+ -+ /* -+ * don't force setup of a queue from here, as a call to may_queue -+ * does not necessarily imply that a request actually will be queued. -+ * so just lookup a possibly existing queue, or return 'may queue' -+ * if that fails -+ */ -+ cfqq = cfq_find_cfq_hash(cfqd, cfq_queue_pid(tsk, rw), tsk->ioprio); -+ if (cfqq) { -+ cfq_init_prio_data(cfqq); -+ cfq_prio_boost(cfqq); -+ -+ return __cfq_may_queue(cfqd, cfqq, tsk, rw); -+ } -+ -+ return ELV_MQUEUE_MAY; - } - - static void cfq_check_waiters(request_queue_t *q, struct cfq_queue *cfqq) - { -+ struct cfq_data *cfqd = q->elevator->elevator_data; - struct request_list *rl = &q->rq; -- const int write = waitqueue_active(&rl->wait[WRITE]); -- const int read = waitqueue_active(&rl->wait[READ]); - -- if (read && cfqq->allocated[READ] < cfqq->alloc_limit[READ]) -- wake_up(&rl->wait[READ]); -- if (write && cfqq->allocated[WRITE] < cfqq->alloc_limit[WRITE]) -- wake_up(&rl->wait[WRITE]); -+ if (cfqq->allocated[READ] <= cfqd->max_queued || cfqd->rq_starved) { -+ smp_mb(); -+ if (waitqueue_active(&rl->wait[READ])) -+ wake_up(&rl->wait[READ]); -+ } -+ -+ if (cfqq->allocated[WRITE] <= cfqd->max_queued || cfqd->rq_starved) { -+ smp_mb(); -+ if (waitqueue_active(&rl->wait[WRITE])) -+ wake_up(&rl->wait[WRITE]); -+ } - } - - /* -@@ -1389,69 +2055,61 @@ - - if (crq) { - struct cfq_queue *cfqq = crq->cfq_queue; -+ const int rw = rq_data_dir(rq); - -- BUG_ON(q->last_merge == rq); -- BUG_ON(!hlist_unhashed(&crq->hash)); -- -- if (crq->io_context) -- put_io_context(crq->io_context->ioc); -+ BUG_ON(!cfqq->allocated[rw]); -+ cfqq->allocated[rw]--; - -- BUG_ON(!cfqq->allocated[crq->is_write]); -- cfqq->allocated[crq->is_write]--; -+ put_io_context(crq->io_context->ioc); - - mempool_free(crq, cfqd->crq_pool); - rq->elevator_private = NULL; - -- smp_mb(); - cfq_check_waiters(q, cfqq); - cfq_put_queue(cfqq); - } - } - - /* -- * Allocate cfq data structures associated with this request. A queue and -+ * Allocate cfq data structures associated with this request. 
- */ --static int cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask) -+static int -+cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio, -+ int gfp_mask) - { - struct cfq_data *cfqd = q->elevator->elevator_data; -+ struct task_struct *tsk = current; - struct cfq_io_context *cic; - const int rw = rq_data_dir(rq); -- struct cfq_queue *cfqq, *saved_cfqq; -+ pid_t key = cfq_queue_pid(tsk, rw); -+ struct cfq_queue *cfqq; - struct cfq_rq *crq; - unsigned long flags; - - might_sleep_if(gfp_mask & __GFP_WAIT); - -+ cic = cfq_get_io_context(cfqd, key, gfp_mask); -+ - spin_lock_irqsave(q->queue_lock, flags); - -- cfqq = __cfq_get_queue(cfqd, cfq_hash_key(cfqd, current), gfp_mask); -- if (!cfqq) -- goto out_lock; -+ if (!cic) -+ goto queue_fail; - --repeat: -- if (cfqq->allocated[rw] >= cfqd->max_queued) -- goto out_lock; -+ if (!cic->cfqq) { -+ cfqq = cfq_get_queue(cfqd, key, tsk->ioprio, gfp_mask); -+ if (!cfqq) -+ goto queue_fail; -+ -+ cic->cfqq = cfqq; -+ } else -+ cfqq = cic->cfqq; - - cfqq->allocated[rw]++; -+ cfq_clear_cfqq_must_alloc(cfqq); -+ cfqd->rq_starved = 0; -+ atomic_inc(&cfqq->ref); - spin_unlock_irqrestore(q->queue_lock, flags); - -- /* -- * if hashing type has changed, the cfq_queue might change here. -- */ -- saved_cfqq = cfqq; -- cic = cfq_get_io_context(&cfqq, gfp_mask); -- if (!cic) -- goto err; -- -- /* -- * repeat allocation checks on queue change -- */ -- if (unlikely(saved_cfqq != cfqq)) { -- spin_lock_irqsave(q->queue_lock, flags); -- saved_cfqq->allocated[rw]--; -- goto repeat; -- } -- - crq = mempool_alloc(cfqd->crq_pool, gfp_mask); - if (crq) { - RB_CLEAR(&crq->rb_node); -@@ -1460,24 +2118,141 @@ - INIT_HLIST_NODE(&crq->hash); - crq->cfq_queue = cfqq; - crq->io_context = cic; -- crq->service_start = crq->queue_start = 0; -- crq->in_flight = crq->accounted = crq->is_sync = 0; -- crq->is_write = rw; -+ cfq_clear_crq_in_flight(crq); -+ cfq_clear_crq_in_driver(crq); -+ cfq_clear_crq_requeued(crq); -+ -+ if (rw == READ || process_sync(tsk)) -+ cfq_mark_crq_is_sync(crq); -+ else -+ cfq_clear_crq_is_sync(crq); -+ - rq->elevator_private = crq; -- cfqq->alloc_limit[rw] = 0; - return 0; - } - -- put_io_context(cic->ioc); --err: - spin_lock_irqsave(q->queue_lock, flags); - cfqq->allocated[rw]--; -+ if (!(cfqq->allocated[0] + cfqq->allocated[1])) -+ cfq_mark_cfqq_must_alloc(cfqq); - cfq_put_queue(cfqq); --out_lock: -+queue_fail: -+ if (cic) -+ put_io_context(cic->ioc); -+ /* -+ * mark us rq allocation starved. we need to kickstart the process -+ * ourselves if there are no pending requests that can do it for us. 
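
In the reworked cfq_set_request() here, the request is attached to a cfq_queue keyed by cfq_queue_pid() and the task's ioprio: reads and sync writes are billed to the submitting task, while all plain async writes to the device fall into one shared CFQ_KEY_ASYNC queue — which is why io priorities chiefly bite on reads and synchronous writers. A compact restatement of the keying rule, with simplified names ("is_sync_writer" stands in for the kernel's process_sync()/PF_SYNCWRITE test), illustrative only:

enum { CFQ_KEY_ASYNC = -1 };

/* paraphrase of cfq_queue_pid() from the hunk above */
int cfq_queue_key(int pid, int is_read, int is_sync_writer)
{
	if (is_read || is_sync_writer)
		return pid;		/* per-process sync queue */
	return CFQ_KEY_ASYNC;		/* one shared async queue */
}
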
-+ * that would be an extremely rare OOM situation -+ */ -+ cfqd->rq_starved = 1; -+ cfq_schedule_dispatch(cfqd); - spin_unlock_irqrestore(q->queue_lock, flags); - return 1; - } - -+static void cfq_kick_queue(void *data) -+{ -+ request_queue_t *q = data; -+ struct cfq_data *cfqd = q->elevator->elevator_data; -+ unsigned long flags; -+ -+ spin_lock_irqsave(q->queue_lock, flags); -+ -+ if (cfqd->rq_starved) { -+ struct request_list *rl = &q->rq; -+ -+ /* -+ * we aren't guaranteed to get a request after this, but we -+ * have to be opportunistic -+ */ -+ smp_mb(); -+ if (waitqueue_active(&rl->wait[READ])) -+ wake_up(&rl->wait[READ]); -+ if (waitqueue_active(&rl->wait[WRITE])) -+ wake_up(&rl->wait[WRITE]); -+ } -+ -+ blk_remove_plug(q); -+ q->request_fn(q); -+ spin_unlock_irqrestore(q->queue_lock, flags); -+} -+ -+/* -+ * Timer running if the active_queue is currently idling inside its time slice -+ */ -+static void cfq_idle_slice_timer(unsigned long data) -+{ -+ struct cfq_data *cfqd = (struct cfq_data *) data; -+ struct cfq_queue *cfqq; -+ unsigned long flags; -+ -+ spin_lock_irqsave(cfqd->queue->queue_lock, flags); -+ -+ if ((cfqq = cfqd->active_queue) != NULL) { -+ unsigned long now = jiffies; -+ -+ /* -+ * expired -+ */ -+ if (time_after(now, cfqq->slice_end)) -+ goto expire; -+ -+ /* -+ * only expire and reinvoke request handler, if there are -+ * other queues with pending requests -+ */ -+ if (!cfq_pending_requests(cfqd)) { -+ cfqd->idle_slice_timer.expires = min(now + cfqd->cfq_slice_idle, cfqq->slice_end); -+ add_timer(&cfqd->idle_slice_timer); -+ goto out_cont; -+ } -+ -+ /* -+ * not expired and it has a request pending, let it dispatch -+ */ -+ if (!RB_EMPTY(&cfqq->sort_list)) { -+ cfq_mark_cfqq_must_dispatch(cfqq); -+ goto out_kick; -+ } -+ } -+expire: -+ cfq_slice_expired(cfqd, 0); -+out_kick: -+ cfq_schedule_dispatch(cfqd); -+out_cont: -+ spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); -+} -+ -+/* -+ * Timer running if an idle class queue is waiting for service -+ */ -+static void cfq_idle_class_timer(unsigned long data) -+{ -+ struct cfq_data *cfqd = (struct cfq_data *) data; -+ unsigned long flags, end; -+ -+ spin_lock_irqsave(cfqd->queue->queue_lock, flags); -+ -+ /* -+ * race with a non-idle queue, reset timer -+ */ -+ end = cfqd->last_end_request + CFQ_IDLE_GRACE; -+ if (!time_after_eq(jiffies, end)) { -+ cfqd->idle_class_timer.expires = end; -+ add_timer(&cfqd->idle_class_timer); -+ } else -+ cfq_schedule_dispatch(cfqd); -+ -+ spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); -+} -+ -+static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) -+{ -+ del_timer_sync(&cfqd->idle_slice_timer); -+ del_timer_sync(&cfqd->idle_class_timer); -+ blk_sync_queue(cfqd->queue); -+} -+ - static void cfq_put_cfqd(struct cfq_data *cfqd) - { - request_queue_t *q = cfqd->queue; -@@ -1485,6 +2260,7 @@ - if (!atomic_dec_and_test(&cfqd->ref)) - return; - -+ cfq_shutdown_timer_wq(cfqd); - blk_put_queue(q); - - mempool_destroy(cfqd->crq_pool); -@@ -1495,7 +2271,10 @@ - - static void cfq_exit_queue(elevator_t *e) - { -- cfq_put_cfqd(e->elevator_data); -+ struct cfq_data *cfqd = e->elevator_data; -+ -+ cfq_shutdown_timer_wq(cfqd); -+ cfq_put_cfqd(cfqd); - } - - static int cfq_init_queue(request_queue_t *q, elevator_t *e) -@@ -1508,7 +2287,13 @@ - return -ENOMEM; - - memset(cfqd, 0, sizeof(*cfqd)); -- INIT_LIST_HEAD(&cfqd->rr_list); -+ -+ for (i = 0; i < CFQ_PRIO_LISTS; i++) -+ INIT_LIST_HEAD(&cfqd->rr_list[i]); -+ -+ INIT_LIST_HEAD(&cfqd->busy_rr); -+ INIT_LIST_HEAD(&cfqd->cur_rr); -+ 
INIT_LIST_HEAD(&cfqd->idle_rr); - INIT_LIST_HEAD(&cfqd->empty_list); - - cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL); -@@ -1533,24 +2318,32 @@ - cfqd->queue = q; - atomic_inc(&q->refcnt); - -- /* -- * just set it to some high value, we want anyone to be able to queue -- * some requests. fairness is handled differently -- */ -- q->nr_requests = 1024; -- cfqd->max_queued = q->nr_requests / 16; -+ cfqd->max_queued = q->nr_requests / 4; - q->nr_batching = cfq_queued; -- cfqd->key_type = CFQ_KEY_TGID; -- cfqd->find_best_crq = 1; -+ -+ init_timer(&cfqd->idle_slice_timer); -+ cfqd->idle_slice_timer.function = cfq_idle_slice_timer; -+ cfqd->idle_slice_timer.data = (unsigned long) cfqd; -+ -+ init_timer(&cfqd->idle_class_timer); -+ cfqd->idle_class_timer.function = cfq_idle_class_timer; -+ cfqd->idle_class_timer.data = (unsigned long) cfqd; -+ -+ INIT_WORK(&cfqd->unplug_work, cfq_kick_queue, q); -+ - atomic_set(&cfqd->ref, 1); - - cfqd->cfq_queued = cfq_queued; - cfqd->cfq_quantum = cfq_quantum; -- cfqd->cfq_fifo_expire_r = cfq_fifo_expire_r; -- cfqd->cfq_fifo_expire_w = cfq_fifo_expire_w; -- cfqd->cfq_fifo_batch_expire = cfq_fifo_rate; -+ cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0]; -+ cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1]; - cfqd->cfq_back_max = cfq_back_max; - cfqd->cfq_back_penalty = cfq_back_penalty; -+ cfqd->cfq_slice[0] = cfq_slice_async; -+ cfqd->cfq_slice[1] = cfq_slice_sync; -+ cfqd->cfq_slice_async_rq = cfq_slice_async_rq; -+ cfqd->cfq_slice_idle = cfq_slice_idle; -+ cfqd->cfq_max_depth = cfq_max_depth; - - return 0; - out_crqpool: -@@ -1595,7 +2388,6 @@ - return -ENOMEM; - } - -- - /* - * sysfs parts below --> - */ -@@ -1620,45 +2412,6 @@ - return count; - } - --static ssize_t --cfq_clear_elapsed(struct cfq_data *cfqd, const char *page, size_t count) --{ -- max_elapsed_dispatch = max_elapsed_crq = 0; -- return count; --} -- --static ssize_t --cfq_set_key_type(struct cfq_data *cfqd, const char *page, size_t count) --{ -- spin_lock_irq(cfqd->queue->queue_lock); -- if (!strncmp(page, "pgid", 4)) -- cfqd->key_type = CFQ_KEY_PGID; -- else if (!strncmp(page, "tgid", 4)) -- cfqd->key_type = CFQ_KEY_TGID; -- else if (!strncmp(page, "uid", 3)) -- cfqd->key_type = CFQ_KEY_UID; -- else if (!strncmp(page, "gid", 3)) -- cfqd->key_type = CFQ_KEY_GID; -- spin_unlock_irq(cfqd->queue->queue_lock); -- return count; --} -- --static ssize_t --cfq_read_key_type(struct cfq_data *cfqd, char *page) --{ -- ssize_t len = 0; -- int i; -- -- for (i = CFQ_KEY_PGID; i < CFQ_KEY_LAST; i++) { -- if (cfqd->key_type == i) -- len += sprintf(page+len, "[%s] ", cfq_key_types[i]); -- else -- len += sprintf(page+len, "%s ", cfq_key_types[i]); -- } -- len += sprintf(page+len, "\n"); -- return len; --} -- - #define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ - static ssize_t __FUNC(struct cfq_data *cfqd, char *page) \ - { \ -@@ -1669,12 +2422,15 @@ - } - SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0); - SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0); --SHOW_FUNCTION(cfq_fifo_expire_r_show, cfqd->cfq_fifo_expire_r, 1); --SHOW_FUNCTION(cfq_fifo_expire_w_show, cfqd->cfq_fifo_expire_w, 1); --SHOW_FUNCTION(cfq_fifo_batch_expire_show, cfqd->cfq_fifo_batch_expire, 1); --SHOW_FUNCTION(cfq_find_best_show, cfqd->find_best_crq, 0); -+SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1); -+SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1); - SHOW_FUNCTION(cfq_back_max_show, cfqd->cfq_back_max, 0); - SHOW_FUNCTION(cfq_back_penalty_show, 
cfqd->cfq_back_penalty, 0); -+SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); -+SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); -+SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); -+SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); -+SHOW_FUNCTION(cfq_max_depth_show, cfqd->cfq_max_depth, 0); - #undef SHOW_FUNCTION - - #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ -@@ -1694,12 +2450,15 @@ - } - STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0); - STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0); --STORE_FUNCTION(cfq_fifo_expire_r_store, &cfqd->cfq_fifo_expire_r, 1, UINT_MAX, 1); --STORE_FUNCTION(cfq_fifo_expire_w_store, &cfqd->cfq_fifo_expire_w, 1, UINT_MAX, 1); --STORE_FUNCTION(cfq_fifo_batch_expire_store, &cfqd->cfq_fifo_batch_expire, 0, UINT_MAX, 1); --STORE_FUNCTION(cfq_find_best_store, &cfqd->find_best_crq, 0, 1, 0); -+STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1); -+STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1); - STORE_FUNCTION(cfq_back_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); - STORE_FUNCTION(cfq_back_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0); -+STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1); -+STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); -+STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); -+STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0); -+STORE_FUNCTION(cfq_max_depth_store, &cfqd->cfq_max_depth, 1, UINT_MAX, 0); - #undef STORE_FUNCTION - - static struct cfq_fs_entry cfq_quantum_entry = { -@@ -1712,25 +2471,15 @@ - .show = cfq_queued_show, - .store = cfq_queued_store, - }; --static struct cfq_fs_entry cfq_fifo_expire_r_entry = { -+static struct cfq_fs_entry cfq_fifo_expire_sync_entry = { - .attr = {.name = "fifo_expire_sync", .mode = S_IRUGO | S_IWUSR }, -- .show = cfq_fifo_expire_r_show, -- .store = cfq_fifo_expire_r_store, -+ .show = cfq_fifo_expire_sync_show, -+ .store = cfq_fifo_expire_sync_store, - }; --static struct cfq_fs_entry cfq_fifo_expire_w_entry = { -+static struct cfq_fs_entry cfq_fifo_expire_async_entry = { - .attr = {.name = "fifo_expire_async", .mode = S_IRUGO | S_IWUSR }, -- .show = cfq_fifo_expire_w_show, -- .store = cfq_fifo_expire_w_store, --}; --static struct cfq_fs_entry cfq_fifo_batch_expire_entry = { -- .attr = {.name = "fifo_batch_expire", .mode = S_IRUGO | S_IWUSR }, -- .show = cfq_fifo_batch_expire_show, -- .store = cfq_fifo_batch_expire_store, --}; --static struct cfq_fs_entry cfq_find_best_entry = { -- .attr = {.name = "find_best_crq", .mode = S_IRUGO | S_IWUSR }, -- .show = cfq_find_best_show, -- .store = cfq_find_best_store, -+ .show = cfq_fifo_expire_async_show, -+ .store = cfq_fifo_expire_async_store, - }; - static struct cfq_fs_entry cfq_back_max_entry = { - .attr = {.name = "back_seek_max", .mode = S_IRUGO | S_IWUSR }, -@@ -1742,27 +2491,44 @@ - .show = cfq_back_penalty_show, - .store = cfq_back_penalty_store, - }; --static struct cfq_fs_entry cfq_clear_elapsed_entry = { -- .attr = {.name = "clear_elapsed", .mode = S_IWUSR }, -- .store = cfq_clear_elapsed, --}; --static struct cfq_fs_entry cfq_key_type_entry = { -- .attr = {.name = "key_type", .mode = S_IRUGO | S_IWUSR }, -- .show = cfq_read_key_type, -- .store = cfq_set_key_type, -+static struct cfq_fs_entry cfq_slice_sync_entry = { -+ .attr = {.name = "slice_sync", .mode 
= S_IRUGO | S_IWUSR }, -+ .show = cfq_slice_sync_show, -+ .store = cfq_slice_sync_store, -+}; -+static struct cfq_fs_entry cfq_slice_async_entry = { -+ .attr = {.name = "slice_async", .mode = S_IRUGO | S_IWUSR }, -+ .show = cfq_slice_async_show, -+ .store = cfq_slice_async_store, -+}; -+static struct cfq_fs_entry cfq_slice_async_rq_entry = { -+ .attr = {.name = "slice_async_rq", .mode = S_IRUGO | S_IWUSR }, -+ .show = cfq_slice_async_rq_show, -+ .store = cfq_slice_async_rq_store, -+}; -+static struct cfq_fs_entry cfq_slice_idle_entry = { -+ .attr = {.name = "slice_idle", .mode = S_IRUGO | S_IWUSR }, -+ .show = cfq_slice_idle_show, -+ .store = cfq_slice_idle_store, -+}; -+static struct cfq_fs_entry cfq_max_depth_entry = { -+ .attr = {.name = "max_depth", .mode = S_IRUGO | S_IWUSR }, -+ .show = cfq_max_depth_show, -+ .store = cfq_max_depth_store, - }; - - static struct attribute *default_attrs[] = { - &cfq_quantum_entry.attr, - &cfq_queued_entry.attr, -- &cfq_fifo_expire_r_entry.attr, -- &cfq_fifo_expire_w_entry.attr, -- &cfq_fifo_batch_expire_entry.attr, -- &cfq_key_type_entry.attr, -- &cfq_find_best_entry.attr, -+ &cfq_fifo_expire_sync_entry.attr, -+ &cfq_fifo_expire_async_entry.attr, - &cfq_back_max_entry.attr, - &cfq_back_penalty_entry.attr, -- &cfq_clear_elapsed_entry.attr, -+ &cfq_slice_sync_entry.attr, -+ &cfq_slice_async_entry.attr, -+ &cfq_slice_async_rq_entry.attr, -+ &cfq_slice_idle_entry.attr, -+ &cfq_max_depth_entry.attr, - NULL, - }; - -@@ -1832,21 +2598,46 @@ - { - int ret; - -+ /* -+ * could be 0 on HZ < 1000 setups -+ */ -+ if (!cfq_slice_async) -+ cfq_slice_async = 1; -+ if (!cfq_slice_idle) -+ cfq_slice_idle = 1; -+ - if (cfq_slab_setup()) - return -ENOMEM; - - ret = elv_register(&iosched_cfq); -- if (!ret) { -- __module_get(THIS_MODULE); -- return 0; -- } -+ if (ret) -+ cfq_slab_kill(); - -- cfq_slab_kill(); - return ret; - } - - static void __exit cfq_exit(void) - { -+ struct task_struct *g, *p; -+ unsigned long flags; -+ -+ read_lock_irqsave(&tasklist_lock, flags); -+ -+ /* -+ * iterate each process in the system, removing our io_context -+ */ -+ do_each_thread(g, p) { -+ struct io_context *ioc = p->io_context; -+ -+ if (ioc && ioc->cic) { -+ ioc->cic->exit(ioc->cic); -+ cfq_free_io_context(ioc->cic); -+ ioc->cic = NULL; -+ } -+ } while_each_thread(g, p); -+ -+ read_unlock_irqrestore(&tasklist_lock, flags); -+ - cfq_slab_kill(); - elv_unregister(&iosched_cfq); - } -diff -Naur 2.6.12-5.0-org/drivers/block/deadline-iosched.c 2.6.12-5.0-patched/drivers/block/deadline-iosched.c ---- 2.6.12-5.0-org/drivers/block/deadline-iosched.c 2007-07-26 00:53:20.000000000 +0200 -+++ 2.6.12-5.0-patched/drivers/block/deadline-iosched.c 2007-12-11 12:34:52.000000000 +0100 -@@ -758,7 +758,8 @@ - } - - static int --deadline_set_request(request_queue_t *q, struct request *rq, int gfp_mask) -+deadline_set_request(request_queue_t *q, struct request *rq, struct bio *bio, -+ int gfp_mask) - { - struct deadline_data *dd = q->elevator->elevator_data; - struct deadline_rq *drq; -diff -Naur 2.6.12-5.0-org/drivers/block/elevator.c 2.6.12-5.0-patched/drivers/block/elevator.c ---- 2.6.12-5.0-org/drivers/block/elevator.c 2007-07-26 00:53:20.000000000 +0200 -+++ 2.6.12-5.0-patched/drivers/block/elevator.c 2007-12-11 12:34:52.000000000 +0100 -@@ -158,27 +158,6 @@ - - static char chosen_elevator[16]; - --static void elevator_setup_default(void) --{ -- /* -- * check if default is set and exists -- */ -- if (chosen_elevator[0] && elevator_find(chosen_elevator)) -- return; -- --#if defined(CONFIG_IOSCHED_AS) 
-- strcpy(chosen_elevator, "anticipatory"); --#elif defined(CONFIG_IOSCHED_DEADLINE) -- strcpy(chosen_elevator, "deadline"); --#elif defined(CONFIG_IOSCHED_CFQ) -- strcpy(chosen_elevator, "cfq"); --#elif defined(CONFIG_IOSCHED_NOOP) -- strcpy(chosen_elevator, "noop"); --#else --#error "You must build at least 1 IO scheduler into the kernel" --#endif --} -- - static int __init elevator_setup(char *str) - { - strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1); -@@ -193,15 +172,17 @@ - struct elevator_queue *eq; - int ret = 0; - -- elevator_setup_default(); -- -- if (!name) -- name = chosen_elevator; -- -- e = elevator_get(name); -- if (!e) -+ if (name && !(e = elevator_get(name))) - return -EINVAL; -- -+ -+ if (!e && *chosen_elevator && !(e = elevator_get(chosen_elevator))) -+ printk("I/O scheduler %s not found\n", chosen_elevator); -+ -+ if (!e && !(e = elevator_get(CONFIG_DEFAULT_IOSCHED))) { -+ printk("Default I/O scheduler not found, using noop\n"); -+ e = elevator_get("noop"); -+ } -+ - eq = kmalloc(sizeof(struct elevator_queue), GFP_KERNEL); - if (!eq) { - elevator_put(e->elevator_type); -@@ -480,12 +461,13 @@ - return NULL; - } - --int elv_set_request(request_queue_t *q, struct request *rq, int gfp_mask) -+int elv_set_request(request_queue_t *q, struct request *rq, struct bio *bio, -+ int gfp_mask) - { - elevator_t *e = q->elevator; - - if (e->ops->elevator_set_req_fn) -- return e->ops->elevator_set_req_fn(q, rq, gfp_mask); -+ return e->ops->elevator_set_req_fn(q, rq, bio, gfp_mask); - - rq->elevator_private = NULL; - return 0; -@@ -499,12 +481,12 @@ - e->ops->elevator_put_req_fn(q, rq); - } - --int elv_may_queue(request_queue_t *q, int rw) -+int elv_may_queue(request_queue_t *q, int rw, struct bio *bio) - { - elevator_t *e = q->elevator; - - if (e->ops->elevator_may_queue_fn) -- return e->ops->elevator_may_queue_fn(q, rw, bio); -+ return e->ops->elevator_may_queue_fn(q, rw, bio); - - return ELV_MQUEUE_MAY; - } -diff -Naur 2.6.12-5.0-org/drivers/block/Kconfig.iosched 2.6.12-5.0-patched/drivers/block/Kconfig.iosched ---- 2.6.12-5.0-org/drivers/block/Kconfig.iosched 2007-07-26 00:53:20.000000000 +0200 -+++ 2.6.12-5.0-patched/drivers/block/Kconfig.iosched 2007-12-11 12:34:52.000000000 +0100 -@@ -38,4 +38,32 @@ - among all processes in the system. It should provide a fair - working environment, suitable for desktop systems. - -+choice -+ prompt "Default I/O scheduler" -+ default DEFAULT_AS -+ help -+ Select the I/O scheduler which will be used by default for all -+ block devices. 
-+ -+ config DEFAULT_AS -+ bool "Anticipatory" if IOSCHED_AS=y -+ -+ config DEFAULT_DEADLINE -+ bool "Deadline" if IOSCHED_DEADLINE=y -+ -+ config DEFAULT_CFQ -+ bool "CFQ" if IOSCHED_CFQ=y -+ -+ config DEFAULT_NOOP -+ bool "No-op" -+ -+endchoice -+ -+config DEFAULT_IOSCHED -+ string -+ default "anticipatory" if DEFAULT_AS -+ default "deadline" if DEFAULT_DEADLINE -+ default "cfq" if DEFAULT_CFQ -+ default "noop" if DEFAULT_NOOP -+ - endmenu -diff -Naur 2.6.12-5.0-org/drivers/block/ll_rw_blk.c 2.6.12-5.0-patched/drivers/block/ll_rw_blk.c ---- 2.6.12-5.0-org/drivers/block/ll_rw_blk.c 2007-07-26 00:53:21.000000000 +0200 -+++ 2.6.12-5.0-patched/drivers/block/ll_rw_blk.c 2007-12-11 12:37:54.000000000 +0100 -@@ -287,6 +287,7 @@ - rq->errors = 0; - rq->rq_status = RQ_ACTIVE; - rq->bio = rq->biotail = NULL; -+ rq->ioprio = 0; - rq->buffer = NULL; - rq->ref_count = 1; - rq->q = q; -@@ -1522,11 +1523,7 @@ - if (!blk_remove_plug(q)) - return; - -- /* -- * was plugged, fire request_fn if queue has stuff to do -- */ -- if (elv_next_request(q)) -- q->request_fn(q); -+ q->request_fn(q); - } - EXPORT_SYMBOL(__generic_unplug_device); - -@@ -1841,8 +1838,8 @@ - mempool_free(rq, q->rq.rq_pool); - } - --static inline struct request *blk_alloc_request(request_queue_t *q, int rw, -- int gfp_mask) -+static inline struct request * -+blk_alloc_request(request_queue_t *q, int rw, struct bio *bio, int gfp_mask) - { - struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); - -@@ -1855,7 +1852,7 @@ - */ - rq->flags = rw; - -- if (!elv_set_request(q, rq, gfp_mask)) -+ if (!elv_set_request(q, rq, bio, gfp_mask)) - return rq; - - mempool_free(rq, q->rq.rq_pool); -@@ -1938,7 +1935,8 @@ - /* - * Get a free request, queue_lock must not be held - */ --static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) -+static struct request *get_request(request_queue_t *q, int rw, struct bio *bio, -+ int gfp_mask) - { - struct request *rq = NULL; - struct request_list *rl = &q->rq; -@@ -1961,7 +1959,7 @@ - } - } - -- switch (elv_may_queue(q, rw)) { -+ switch (elv_may_queue(q, rw, bio)) { - case ELV_MQUEUE_NO: - goto rq_starved; - case ELV_MQUEUE_MAY: -@@ -1986,7 +1984,7 @@ - set_queue_congested(q, rw); - spin_unlock_irq(q->queue_lock); - -- rq = blk_alloc_request(q, rw, gfp_mask); -+ rq = blk_alloc_request(q, rw, bio, gfp_mask); - if (!rq) { - /* - * Allocation failed presumably due to memory. Undo anything -@@ -2027,7 +2025,8 @@ - * No available requests for this queue, unplug the device and wait for some - * requests to become available. 
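
The Kconfig choice above compiles a default elevator name into the CONFIG_DEFAULT_IOSCHED string, and elevator_init() in the elevator.c hunk now falls back in order: explicit name, then the elevator= boot parameter (parsed by elevator_setup()), then the compiled-in default, then noop. For a CFQ default, the relevant fragment of a board .config would look like this (illustrative):

# Illustrative .config fragment: make CFQ the compiled-in default
CONFIG_IOSCHED_CFQ=y
CONFIG_DEFAULT_CFQ=y
CONFIG_DEFAULT_IOSCHED="cfq"

The built-in choice can still be overridden per boot, e.g. by appending elevator=deadline to the kernel command line.
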
- */ --static struct request *get_request_wait(request_queue_t *q, int rw) -+static struct request *get_request_wait(request_queue_t *q, int rw, -+ struct bio *bio) - { - DEFINE_WAIT(wait); - struct request *rq; -@@ -2039,7 +2038,7 @@ - prepare_to_wait_exclusive(&rl->wait[rw], &wait, - TASK_UNINTERRUPTIBLE); - -- rq = get_request(q, rw, GFP_NOIO); -+ rq = get_request(q, rw, bio, GFP_NOIO); - - if (!rq) { - struct io_context *ioc; -@@ -2069,9 +2068,9 @@ - BUG_ON(rw != READ && rw != WRITE); - - if (gfp_mask & __GFP_WAIT) -- rq = get_request_wait(q, rw); -+ rq = get_request_wait(q, rw, NULL); - else -- rq = get_request(q, rw, gfp_mask); -+ rq = get_request(q, rw, NULL, gfp_mask); - - return rq; - } -@@ -2445,7 +2444,6 @@ - return; - - req->rq_status = RQ_INACTIVE; -- req->q = NULL; - req->rl = NULL; - - /* -@@ -2583,6 +2581,8 @@ - req->rq_disk->in_flight--; - } - -+ req->ioprio = ioprio_best(req->ioprio, next->ioprio); -+ - __blk_put_request(q, next); - return 1; - } -@@ -2645,11 +2645,13 @@ - { - struct request *req, *freereq = NULL; - int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err, sync; -+ unsigned short prio; - sector_t sector; - - sector = bio->bi_sector; - nr_sectors = bio_sectors(bio); - cur_nr_sectors = bio_cur_sectors(bio); -+ prio = bio_prio(bio); - - rw = bio_data_dir(bio); - sync = bio_sync(bio); -@@ -2696,6 +2698,7 @@ - set_bit(__REQ_DIRECTIO, &req->flags); - } - #endif -+ req->ioprio = ioprio_best(req->ioprio, prio); - drive_stat_acct(req, nr_sectors, 0); - if (!attempt_back_merge(q, req)) - elv_merged_request(q, req); -@@ -2726,6 +2729,7 @@ - set_bit(__REQ_DIRECTIO, &req->flags); - } - #endif -+ req->ioprio = ioprio_best(req->ioprio, prio); - drive_stat_acct(req, nr_sectors, 0); - if (!attempt_front_merge(q, req)) - elv_merged_request(q, req); -@@ -2753,7 +2757,7 @@ - freereq = NULL; - } else { - spin_unlock_irq(q->queue_lock); -- if ((freereq = get_request(q, rw, GFP_ATOMIC)) == NULL) { -+ if ((freereq = get_request(q, rw, bio, GFP_ATOMIC)) == NULL) { - /* - * READA bit set - */ -@@ -2761,7 +2765,7 @@ - if (bio_rw_ahead(bio)) - goto end_io; - -- freereq = get_request_wait(q, rw); -+ freereq = get_request_wait(q, rw, bio); - } - goto again; - } -@@ -2789,6 +2793,7 @@ - req->buffer = bio_data(bio); /* see ->buffer comment above */ - req->waiting = NULL; - req->bio = req->biotail = bio; -+ req->ioprio = prio; - req->rq_disk = bio->bi_bdev->bd_disk; - req->start_time = jiffies; - #if defined (CONFIG_MIPS_BCM7440) -@@ -2821,7 +2826,7 @@ - if (bdev != bdev->bd_contains) { - struct hd_struct *p = bdev->bd_part; - -- switch (bio->bi_rw) { -+ switch (bio_data_dir(bio)) { - case READ: - p->read_sectors += bio_sectors(bio); - p->reads++; -@@ -2840,6 +2845,7 @@ - { - struct request_list *rl = &q->rq; - struct request *rq; -+ int requeued = 0; - - spin_lock_irq(q->queue_lock); - clear_bit(QUEUE_FLAG_DRAIN, &q->queue_flags); -@@ -2848,9 +2854,13 @@ - rq = list_entry_rq(q->drain_list.next); - - list_del_init(&rq->queuelist); -- __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 1); -+ elv_requeue_request(q, rq); -+ requeued++; - } - -+ if (requeued) -+ q->request_fn(q); -+ - spin_unlock_irq(q->queue_lock); - - wake_up(&rl->wait[0]); -@@ -3056,7 +3066,7 @@ - - BIO_BUG_ON(!bio->bi_size); - BIO_BUG_ON(!bio->bi_io_vec); -- bio->bi_rw = rw; -+ bio->bi_rw |= rw; - if (rw & WRITE) - mod_page_state(pgpgout, count); - else -@@ -3418,8 +3428,11 @@ - struct io_context *ioc; - - local_irq_save(flags); -+ task_lock(current); - ioc = current->io_context; - current->io_context = NULL; -+ ioc->task = 
NULL; -+ task_unlock(current); - local_irq_restore(flags); - - if (ioc->aic && ioc->aic->exit) -@@ -3454,12 +3467,12 @@ - ret = kmem_cache_alloc(iocontext_cachep, gfp_flags); - if (ret) { - atomic_set(&ret->refcount, 1); -- ret->pid = tsk->pid; -+ ret->task = current; -+ ret->set_ioprio = NULL; - ret->last_waited = jiffies; /* doesn't matter... */ - ret->nr_batch_requests = 0; /* because this is 0 */ - ret->aic = NULL; - ret->cic = NULL; -- spin_lock_init(&ret->lock); - - local_irq_save(flags); - -diff -Naur 2.6.12-5.0-org/fs/ioprio.c 2.6.12-5.0-patched/fs/ioprio.c ---- 2.6.12-5.0-org/fs/ioprio.c 1970-01-01 01:00:00.000000000 +0100 -+++ 2.6.12-5.0-patched/fs/ioprio.c 2007-12-11 12:34:52.000000000 +0100 -@@ -0,0 +1,172 @@ -+/* -+ * fs/ioprio.c -+ * -+ * Copyright (C) 2004 Jens Axboe <axboe@suse.de> -+ * -+ * Helper functions for setting/querying io priorities of processes. The -+ * system calls closely mimmick getpriority/setpriority, see the man page for -+ * those. The prio argument is a composite of prio class and prio data, where -+ * the data argument has meaning within that class. The standard scheduling -+ * classes have 8 distinct prio levels, with 0 being the highest prio and 7 -+ * being the lowest. -+ * -+ * IOW, setting BE scheduling class with prio 2 is done ala: -+ * -+ * unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2; -+ * -+ * ioprio_set(PRIO_PROCESS, pid, prio); -+ * -+ * See also Documentation/block/ioprio.txt -+ * -+ */ -+#include <linux/kernel.h> -+#include <linux/ioprio.h> -+#include <linux/blkdev.h> -+ -+static int set_task_ioprio(struct task_struct *task, int ioprio) -+{ -+ struct io_context *ioc; -+ -+ if (task->uid != current->euid && -+ task->uid != current->uid && !capable(CAP_SYS_NICE)) -+ return -EPERM; -+ -+ task_lock(task); -+ -+ task->ioprio = ioprio; -+ -+ ioc = task->io_context; -+ if (ioc && ioc->set_ioprio) -+ ioc->set_ioprio(ioc, ioprio); -+ -+ task_unlock(task); -+ return 0; -+} -+ -+asmlinkage int sys_ioprio_set(int which, int who, int ioprio) -+{ -+ int class = IOPRIO_PRIO_CLASS(ioprio); -+ int data = IOPRIO_PRIO_DATA(ioprio); -+ struct task_struct *p, *g; -+ struct user_struct *user; -+ int ret; -+ -+ switch (class) { -+ case IOPRIO_CLASS_RT: -+ if (!capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ /* fall through, rt has prio field too */ -+ case IOPRIO_CLASS_BE: -+ if (data >= IOPRIO_BE_NR || data < 0) -+ return -EINVAL; -+ -+ break; -+ case IOPRIO_CLASS_IDLE: -+ break; -+ default: -+ return -EINVAL; -+ } -+ -+ ret = -ESRCH; -+ read_lock_irq(&tasklist_lock); -+ switch (which) { -+ case IOPRIO_WHO_PROCESS: -+ if (!who) -+ p = current; -+ else -+ p = find_task_by_pid(who); -+ if (p) -+ ret = set_task_ioprio(p, ioprio); -+ break; -+ case IOPRIO_WHO_PGRP: -+ if (!who) -+ who = process_group(current); -+ do_each_task_pid(who, PIDTYPE_PGID, p) { -+ ret = set_task_ioprio(p, ioprio); -+ if (ret) -+ break; -+ } while_each_task_pid(who, PIDTYPE_PGID, p); -+ break; -+ case IOPRIO_WHO_USER: -+ if (!who) -+ user = current->user; -+ else -+ user = find_user(who); -+ -+ if (!user) -+ break; -+ -+ do_each_thread(g, p) { -+ if (p->uid != who) -+ continue; -+ ret = set_task_ioprio(p, ioprio); -+ if (ret) -+ break; -+ } while_each_thread(g, p); -+ -+ if (who) -+ free_uid(user); -+ break; -+ default: -+ ret = -EINVAL; -+ } -+ -+ read_unlock_irq(&tasklist_lock); -+ return ret; -+} -+ -+asmlinkage int sys_ioprio_get(int which, int who) -+{ -+ struct task_struct *g, *p; -+ struct user_struct *user; -+ int ret = -ESRCH; -+ -+ read_lock_irq(&tasklist_lock); -+ 
switch (which) { -+ case IOPRIO_WHO_PROCESS: -+ if (!who) -+ p = current; -+ else -+ p = find_task_by_pid(who); -+ if (p) -+ ret = p->ioprio; -+ break; -+ case IOPRIO_WHO_PGRP: -+ if (!who) -+ who = process_group(current); -+ do_each_task_pid(who, PIDTYPE_PGID, p) { -+ if (ret == -ESRCH) -+ ret = p->ioprio; -+ else -+ ret = ioprio_best(ret, p->ioprio); -+ } while_each_task_pid(who, PIDTYPE_PGID, p); -+ break; -+ case IOPRIO_WHO_USER: -+ if (!who) -+ user = current->user; -+ else -+ user = find_user(who); -+ -+ if (!user) -+ break; -+ -+ do_each_thread(g, p) { -+ if (p->uid != user->uid) -+ continue; -+ if (ret == -ESRCH) -+ ret = p->ioprio; -+ else -+ ret = ioprio_best(ret, p->ioprio); -+ } while_each_thread(g, p); -+ -+ if (who) -+ free_uid(user); -+ break; -+ default: -+ ret = -EINVAL; -+ } -+ -+ read_unlock_irq(&tasklist_lock); -+ return ret; -+} -+ -diff -Naur 2.6.12-5.0-org/fs/Makefile 2.6.12-5.0-patched/fs/Makefile ---- 2.6.12-5.0-org/fs/Makefile 2007-07-26 00:55:01.000000000 +0200 -+++ 2.6.12-5.0-patched/fs/Makefile 2007-12-11 12:34:52.000000000 +0100 -@@ -10,6 +10,7 @@ - ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \ - attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \ - seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \ -+ ioprio.o - - obj-$(CONFIG_EPOLL) += eventpoll.o - obj-$(CONFIG_COMPAT) += compat.o -diff -Naur 2.6.12-5.0-org/fs/reiserfs/journal.c 2.6.12-5.0-patched/fs/reiserfs/journal.c ---- 2.6.12-5.0-org/fs/reiserfs/journal.c 2007-07-26 00:55:15.000000000 +0200 -+++ 2.6.12-5.0-patched/fs/reiserfs/journal.c 2007-12-11 12:34:52.000000000 +0100 -@@ -645,18 +645,22 @@ - - static void write_chunk(struct buffer_chunk *chunk) { - int i; -+ get_fs_excl(); - for (i = 0; i < chunk->nr ; i++) { - submit_logged_buffer(chunk->bh[i]) ; - } - chunk->nr = 0; -+ put_fs_excl(); - } - - static void write_ordered_chunk(struct buffer_chunk *chunk) { - int i; -+ get_fs_excl(); - for (i = 0; i < chunk->nr ; i++) { - submit_ordered_buffer(chunk->bh[i]) ; - } - chunk->nr = 0; -+ put_fs_excl(); - } - - static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh, -@@ -918,6 +922,8 @@ - return 0 ; - } - -+ get_fs_excl(); -+ - /* before we can put our commit blocks on disk, we have to make sure everyone older than - ** us is on disk too - */ -@@ -1055,6 +1061,7 @@ - - if (retval) - reiserfs_abort (s, retval, "Journal write error in %s", __FUNCTION__); -+ put_fs_excl(); - return retval; - } - -@@ -1251,6 +1258,8 @@ - return 0 ; - } - -+ get_fs_excl(); -+ - /* if all the work is already done, get out of here */ - if (atomic_read(&(jl->j_nonzerolen)) <= 0 && - atomic_read(&(jl->j_commit_left)) <= 0) { -@@ -1450,6 +1459,7 @@ - put_journal_list(s, jl); - if (flushall) - up(&journal->j_flush_sem); -+ put_fs_excl(); - return err ; - } - -@@ -2717,6 +2727,7 @@ - th->t_trans_id = journal->j_trans_id ; - unlock_journal(p_s_sb) ; - INIT_LIST_HEAD (&th->t_list); -+ get_fs_excl(); - return 0 ; - - out_fail: -@@ -3524,6 +3535,7 @@ - BUG_ON (th->t_refcount > 1); - BUG_ON (!th->t_trans_id); - -+ put_fs_excl(); - current->journal_info = th->t_handle_save; - reiserfs_check_lock_depth(p_s_sb, "journal end"); - if (journal->j_len == 0) { -diff -Naur 2.6.12-5.0-org/include/asm-mips/unistd.h 2.6.12-5.0-patched/include/asm-mips/unistd.h ---- 2.6.12-5.0-org/include/asm-mips/unistd.h 2007-07-26 00:56:08.000000000 +0200 -+++ 2.6.12-5.0-patched/include/asm-mips/unistd.h 2007-12-11 12:34:52.000000000 +0100 -@@ -304,16 +304,18 @@ - #define __NR_request_key (__NR_Linux + 281) - 
#define __NR_keyctl (__NR_Linux + 282) - #define __NR_set_thread_area (__NR_Linux + 283) -+#define __NR_sys_ioprio_set (__NR_Linux + 284) -+#define __NR_sys_ioprio_get (__NR_Linux + 285) - - /* - * Offset of the last Linux o32 flavoured syscall - */ --#define __NR_Linux_syscalls 283 -+#define __NR_Linux_syscalls 285 - - #endif /* _MIPS_SIM == _MIPS_SIM_ABI32 */ - - #define __NR_O32_Linux 4000 --#define __NR_O32_Linux_syscalls 283 -+#define __NR_O32_Linux_syscalls 285 - - #if _MIPS_SIM == _MIPS_SIM_ABI64 - -diff -Naur 2.6.12-5.0-org/include/linux/bio.h 2.6.12-5.0-patched/include/linux/bio.h ---- 2.6.12-5.0-org/include/linux/bio.h 2007-07-26 00:57:02.000000000 +0200 -+++ 2.6.12-5.0-patched/include/linux/bio.h 2007-12-11 12:34:52.000000000 +0100 -@@ -22,6 +22,7 @@ - - #include <linux/highmem.h> - #include <linux/mempool.h> -+#include <linux/ioprio.h> - - /* Platforms may set this to teach the BIO layer about IOMMU hardware. */ - #include <asm/io.h> -@@ -153,6 +154,19 @@ - #define BIO_RW_SYNC 4 - - /* -+ * upper 16 bits of bi_rw define the io priority of this bio -+ */ -+#define BIO_PRIO_SHIFT (8 * sizeof(unsigned long) - IOPRIO_BITS) -+#define bio_prio(bio) ((bio)->bi_rw >> BIO_PRIO_SHIFT) -+#define bio_prio_valid(bio) ioprio_valid(bio_prio(bio)) -+ -+#define bio_set_prio(bio, prio) do { \ -+ WARN_ON(prio >= (1 << IOPRIO_BITS)); \ -+ (bio)->bi_rw &= ((1UL << BIO_PRIO_SHIFT) - 1); \ -+ (bio)->bi_rw |= ((unsigned long) (prio) << BIO_PRIO_SHIFT); \ -+} while (0) -+ -+/* - * various member access, note that bio_data should of course not be used - * on highmem page vectors - */ -diff -Naur 2.6.12-5.0-org/include/linux/blkdev.h 2.6.12-5.0-patched/include/linux/blkdev.h ---- 2.6.12-5.0-org/include/linux/blkdev.h 2007-07-26 00:57:02.000000000 +0200 -+++ 2.6.12-5.0-patched/include/linux/blkdev.h 2007-12-11 12:34:52.000000000 +0100 -@@ -54,16 +54,23 @@ - - struct cfq_queue; - struct cfq_io_context { -- void (*dtor)(struct cfq_io_context *); -- void (*exit)(struct cfq_io_context *); -- -- struct io_context *ioc; -- - /* - * circular list of cfq_io_contexts belonging to a process io context - */ - struct list_head list; - struct cfq_queue *cfqq; -+ void *key; -+ -+ struct io_context *ioc; -+ -+ unsigned long last_end_request; -+ unsigned long last_queue; -+ unsigned long ttime_total; -+ unsigned long ttime_samples; -+ unsigned long ttime_mean; -+ -+ void (*dtor)(struct cfq_io_context *); -+ void (*exit)(struct cfq_io_context *); - }; - - /* -@@ -73,7 +80,9 @@ - */ - struct io_context { - atomic_t refcount; -- pid_t pid; -+ struct task_struct *task; -+ -+ int (*set_ioprio)(struct io_context *, unsigned int); - - /* - * For request batching -@@ -81,8 +90,6 @@ - unsigned long last_waited; /* Time last woken after wait for request */ - int nr_batch_requests; /* Number of requests left in the batch */ - -- spinlock_t lock; -- - struct as_io_context *aic; - struct cfq_io_context *cic; - }; -@@ -134,6 +141,8 @@ - - void *elevator_private; - -+ unsigned short ioprio; -+ - int rq_status; /* should split this into a few status bits */ - struct gendisk *rq_disk; - int errors; -diff -Naur 2.6.12-5.0-org/include/linux/elevator.h 2.6.12-5.0-patched/include/linux/elevator.h ---- 2.6.12-5.0-org/include/linux/elevator.h 2007-07-26 00:56:58.000000000 +0200 -+++ 2.6.12-5.0-patched/include/linux/elevator.h 2007-12-11 12:34:52.000000000 +0100 -@@ -16,9 +16,9 @@ - typedef void (elevator_requeue_req_fn) (request_queue_t *, struct request *); - typedef struct request *(elevator_request_list_fn) (request_queue_t *, struct 
request *); - typedef void (elevator_completed_req_fn) (request_queue_t *, struct request *); --typedef int (elevator_may_queue_fn) (request_queue_t *, int); -+typedef int (elevator_may_queue_fn) (request_queue_t *, int, struct bio *); - --typedef int (elevator_set_req_fn) (request_queue_t *, struct request *, int); -+typedef int (elevator_set_req_fn) (request_queue_t *, struct request *, struct bio *, int); - typedef void (elevator_put_req_fn) (request_queue_t *, struct request *); - typedef void (elevator_deactivate_req_fn) (request_queue_t *, struct request *); - -@@ -96,9 +96,9 @@ - extern struct request *elv_latter_request(request_queue_t *, struct request *); - extern int elv_register_queue(request_queue_t *q); - extern void elv_unregister_queue(request_queue_t *q); --extern int elv_may_queue(request_queue_t *, int); -+extern int elv_may_queue(request_queue_t *, int, struct bio *); - extern void elv_completed_request(request_queue_t *, struct request *); --extern int elv_set_request(request_queue_t *, struct request *, int); -+extern int elv_set_request(request_queue_t *, struct request *, struct bio *, int); - extern void elv_put_request(request_queue_t *, struct request *); - - /* -diff -Naur 2.6.12-5.0-org/include/linux/fs.h 2.6.12-5.0-patched/include/linux/fs.h ---- 2.6.12-5.0-org/include/linux/fs.h 2007-07-26 00:57:01.000000000 +0200 -+++ 2.6.12-5.0-patched/include/linux/fs.h 2007-12-11 12:34:52.000000000 +0100 -@@ -213,6 +213,7 @@ - #include <linux/radix-tree.h> - #include <linux/prio_tree.h> - #include <linux/init.h> -+#include <linux/sched.h> - - #include <asm/atomic.h> - #include <asm/semaphore.h> -@@ -820,16 +821,34 @@ - #define vfs_check_frozen(sb, level) \ - wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level))) - -+static inline void get_fs_excl(void) -+{ -+ atomic_inc(¤t->fs_excl); -+} -+ -+static inline void put_fs_excl(void) -+{ -+ atomic_dec(¤t->fs_excl); -+} -+ -+static inline int has_fs_excl(void) -+{ -+ return atomic_read(¤t->fs_excl); -+} -+ -+ - /* - * Superblock locking. 
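
The bio.h hunk earlier in this patch steals the top IOPRIO_BITS of bi_rw for the bio's io priority, leaving the low bits for the rw flags; bio_set_prio() masks and shifts exactly as below. A userspace-compilable sketch of that packing — the IOPRIO constants match the patch, everything else is illustrative:

#include <stdio.h>

/*
 * Sketch of the bi_rw priority packing from the bio.h hunk above.
 * IOPRIO_BITS and IOPRIO_CLASS_SHIFT are the patch's values; the
 * flag bit and priority chosen here are invented.
 */
#define IOPRIO_BITS		16
#define IOPRIO_CLASS_SHIFT	13
#define IOPRIO_CLASS_BE		2
#define BIO_PRIO_SHIFT		(8 * sizeof(unsigned long) - IOPRIO_BITS)

int main(void)
{
	unsigned long bi_rw = 0x1;	/* pretend rw flag bits */
	unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 4;

	bi_rw &= (1UL << BIO_PRIO_SHIFT) - 1;		/* clear old prio */
	bi_rw |= (unsigned long)prio << BIO_PRIO_SHIFT;	/* set new one */

	printf("bi_rw=0x%lx prio=0x%lx flags=0x%lx\n", bi_rw,
	       bi_rw >> BIO_PRIO_SHIFT,
	       bi_rw & ((1UL << BIO_PRIO_SHIFT) - 1));
	return 0;
}
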
- */ - static inline void lock_super(struct super_block * sb) - { -+ get_fs_excl(); - down(&sb->s_lock); - } - - static inline void unlock_super(struct super_block * sb) - { -+ put_fs_excl(); - up(&sb->s_lock); - } - -diff -Naur 2.6.12-5.0-org/include/linux/init_task.h 2.6.12-5.0-patched/include/linux/init_task.h ---- 2.6.12-5.0-org/include/linux/init_task.h 2007-07-26 00:56:58.000000000 +0200 -+++ 2.6.12-5.0-patched/include/linux/init_task.h 2007-12-11 12:34:52.000000000 +0100 -@@ -81,6 +81,7 @@ - .mm = NULL, \ - .active_mm = &init_mm, \ - .run_list = LIST_HEAD_INIT(tsk.run_list), \ -+ .ioprio = 0, \ - .time_slice = HZ, \ - .tasks = LIST_HEAD_INIT(tsk.tasks), \ - .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ -@@ -111,6 +112,7 @@ - .switch_lock = SPIN_LOCK_UNLOCKED, \ - .journal_info = NULL, \ - .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ -+ .fs_excl = ATOMIC_INIT(0), \ - } - - -diff -Naur 2.6.12-5.0-org/include/linux/ioprio.h 2.6.12-5.0-patched/include/linux/ioprio.h ---- 2.6.12-5.0-org/include/linux/ioprio.h 1970-01-01 01:00:00.000000000 +0100 -+++ 2.6.12-5.0-patched/include/linux/ioprio.h 2007-12-11 12:34:52.000000000 +0100 -@@ -0,0 +1,88 @@ -+#ifndef IOPRIO_H -+#define IOPRIO_H -+ -+#include <linux/sched.h> -+ -+/* -+ * Gives us 8 prio classes with 13-bits of data for each class -+ */ -+#define IOPRIO_BITS (16) -+#define IOPRIO_CLASS_SHIFT (13) -+#define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1) -+ -+#define IOPRIO_PRIO_CLASS(mask) ((mask) >> IOPRIO_CLASS_SHIFT) -+#define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK) -+#define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data) -+ -+#define ioprio_valid(mask) (IOPRIO_PRIO_CLASS((mask)) != IOPRIO_CLASS_NONE) -+ -+/* -+ * These are the io priority groups as implemented by CFQ. RT is the realtime -+ * class, it always gets premium service. BE is the best-effort scheduling -+ * class, the default for any process. IDLE is the idle scheduling class, it -+ * is only served when no one else is using the disk. -+ */ -+enum { -+ IOPRIO_CLASS_NONE, -+ IOPRIO_CLASS_RT, -+ IOPRIO_CLASS_BE, -+ IOPRIO_CLASS_IDLE, -+}; -+ -+/* -+ * 8 best effort priority levels are supported -+ */ -+#define IOPRIO_BE_NR (8) -+ -+asmlinkage int sys_ioprio_set(int, int, int); -+asmlinkage int sys_ioprio_get(int, int); -+ -+enum { -+ IOPRIO_WHO_PROCESS = 1, -+ IOPRIO_WHO_PGRP, -+ IOPRIO_WHO_USER, -+}; -+ -+/* -+ * if process has set io priority explicitly, use that. 
if not, convert -+ * the cpu scheduler nice value to an io priority -+ */ -+#define IOPRIO_NORM (4) -+static inline int task_ioprio(struct task_struct *task) -+{ -+ WARN_ON(!ioprio_valid(task->ioprio)); -+ return IOPRIO_PRIO_DATA(task->ioprio); -+} -+ -+static inline int task_nice_ioprio(struct task_struct *task) -+{ -+ return (task_nice(task) + 20) / 5; -+} -+ -+/* -+ * For inheritance, return the highest of the two given priorities -+ */ -+static inline int ioprio_best(unsigned short aprio, unsigned short bprio) -+{ -+ unsigned short aclass = IOPRIO_PRIO_CLASS(aprio); -+ unsigned short bclass = IOPRIO_PRIO_CLASS(bprio); -+ -+ if (!ioprio_valid(aprio)) -+ return bprio; -+ if (!ioprio_valid(bprio)) -+ return aprio; -+ -+ if (aclass == IOPRIO_CLASS_NONE) -+ aclass = IOPRIO_CLASS_BE; -+ if (bclass == IOPRIO_CLASS_NONE) -+ bclass = IOPRIO_CLASS_BE; -+ -+ if (aclass == bclass) -+ return min(aprio, bprio); -+ if (aclass > bclass) -+ return bprio; -+ else -+ return aprio; -+} -+ -+#endif -diff -Naur 2.6.12-5.0-org/include/linux/sched.h 2.6.12-5.0-patched/include/linux/sched.h ---- 2.6.12-5.0-org/include/linux/sched.h 2007-07-26 00:57:07.000000000 +0200 -+++ 2.6.12-5.0-patched/include/linux/sched.h 2007-12-11 12:34:52.000000000 +0100 -@@ -584,6 +584,8 @@ - struct list_head run_list; - prio_array_t *array; - -+ unsigned short ioprio; -+ - unsigned long sleep_avg; - unsigned long long timestamp, last_ran; - unsigned long long sched_time; /* sched_clock time spent running */ -@@ -740,6 +742,7 @@ - nodemask_t mems_allowed; - int cpuset_mems_generation; - #endif -+ atomic_t fs_excl; /* holding fs exclusive resources */ - }; - - static inline pid_t process_group(struct task_struct *tsk) -@@ -1089,7 +1092,8 @@ - - /* - * Protects ->fs, ->files, ->mm, ->ptrace, ->group_info, ->comm, keyring -- * subscriptions and synchronises with wait4(). Also used in procfs. -+ * subscriptions and synchronises with wait4(). Also used in procfs. Also -+ * pins the final release of task.io_context. - * - * Nests both inside and outside of read_lock(&tasklist_lock). - * It must not be nested with write_lock_irq(&tasklist_lock), -diff -Naur 2.6.12-5.0-org/include/linux/time.h 2.6.12-5.0-patched/include/linux/time.h ---- 2.6.12-5.0-org/include/linux/time.h 2007-07-26 00:57:01.000000000 +0200 -+++ 2.6.12-5.0-patched/include/linux/time.h 2007-12-11 12:34:52.000000000 +0100 -@@ -84,6 +84,12 @@ - )*60 + sec; /* finally seconds */ - } - -+/* -+ * Returns true if the timespec is norm, false if denorm: -+ */ -+#define timespec_valid(ts) \ -+ (((ts)->tv_sec >= 0) && (((unsigned long) (ts)->tv_nsec) < NSEC_PER_SEC)) -+ - extern struct timespec xtime; - extern struct timespec wall_to_monotonic; - extern seqlock_t xtime_lock; -diff -Naur 2.6.12-5.0-org/include/linux/writeback.h 2.6.12-5.0-patched/include/linux/writeback.h ---- 2.6.12-5.0-org/include/linux/writeback.h 2007-07-26 00:57:08.000000000 +0200 -+++ 2.6.12-5.0-patched/include/linux/writeback.h 2007-12-11 12:34:52.000000000 +0100 -@@ -14,11 +14,13 @@ - * Yes, writeback.h requires sched.h - * No, sched.h is not included from here. 
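
With the syscalls wired up on MIPS o32 (__NR_Linux = 4000, plus 284/285 per the unistd.h and scall32-o32.S hunks) and the class/level encoding from the ioprio.h header above, a minimal ionice-style test program could look like the sketch below. The syscall numbers are o32-specific and the whole program is illustrative:

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

/* MIPS o32 numbers from the unistd.h hunk above; other ABIs differ. */
#define __NR_ioprio_set		(4000 + 284)
#define __NR_ioprio_get		(4000 + 285)

#define IOPRIO_CLASS_SHIFT	13
#define IOPRIO_CLASS_BE		2
#define IOPRIO_WHO_PROCESS	1

int main(void)
{
	/* best-effort class, priority level 2, for the current process */
	int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2;

	if (syscall(__NR_ioprio_set, IOPRIO_WHO_PROCESS, 0, prio) < 0)
		perror("ioprio_set");

	printf("ioprio now 0x%lx\n",
	       (unsigned long)syscall(__NR_ioprio_get, IOPRIO_WHO_PROCESS, 0));
	return 0;
}

ioprio_best() above then resolves inheritance and request merging: a valid priority beats IOPRIO_CLASS_NONE, a lower class value wins (RT beats BE beats IDLE), and within a class the numerically smaller level wins.
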
- */ --static inline int current_is_pdflush(void) -+static inline int task_is_pdflush(struct task_struct *task) - { -- return current->flags & PF_FLUSHER; -+ return task->flags & PF_FLUSHER; - } - -+#define current_is_pdflush() task_is_pdflush(current) -+ - /* - * fs/fs-writeback.c - */ -diff -Naur 2.6.12-5.0-org/kernel/exit.c 2.6.12-5.0-patched/kernel/exit.c ---- 2.6.12-5.0-org/kernel/exit.c 2007-07-26 00:57:20.000000000 +0200 -+++ 2.6.12-5.0-patched/kernel/exit.c 2007-12-11 12:34:52.000000000 +0100 -@@ -779,6 +779,8 @@ - - profile_task_exit(tsk); - -+ WARN_ON(atomic_read(&tsk->fs_excl)); -+ - if (unlikely(in_interrupt())) - panic("Aiee, killing interrupt handler!"); - if (unlikely(!tsk->pid)) -diff -Naur 2.6.12-5.0-org/kernel/fork.c 2.6.12-5.0-patched/kernel/fork.c ---- 2.6.12-5.0-org/kernel/fork.c 2007-07-26 00:57:20.000000000 +0200 -+++ 2.6.12-5.0-patched/kernel/fork.c 2007-12-11 12:34:52.000000000 +0100 -@@ -1084,6 +1084,11 @@ - spin_unlock(¤t->sighand->siglock); - } - -+ /* -+ * inherit ioprio -+ */ -+ p->ioprio = current->ioprio; -+ - SET_LINKS(p); - if (unlikely(p->ptrace & PT_PTRACED)) - __ptrace_link(p, current->parent); -diff -Naur 2.6.12-5.0-org/kernel/sched.c 2.6.12-5.0-patched/kernel/sched.c ---- 2.6.12-5.0-org/kernel/sched.c 2007-07-26 00:57:20.000000000 +0200 -+++ 2.6.12-5.0-patched/kernel/sched.c 2007-12-11 12:34:52.000000000 +0100 -@@ -3302,15 +3302,7 @@ - { - return TASK_NICE(p); - } -- --/* -- * The only users of task_nice are binfmt_elf and binfmt_elf32. -- * binfmt_elf is no longer modular, but binfmt_elf32 still is. -- * Therefore, task_nice is needed if there is a compat_mode. -- */ --#ifdef CONFIG_COMPAT - EXPORT_SYMBOL_GPL(task_nice); --#endif - - /** - * idle_cpu - is a given cpu idle currently? -diff -Naur 2.6.12-5.0-org/arch/mips/kernel/scall32-o32.S 2.6.12-5.0-patched/arch/mips/kernel/scall32-o32.S ---- 2.6.12-5.0-org/arch/mips/kernel/scall32-o32.S 2007-07-26 00:51:08.000000000 +0200 -+++ 2.6.12-5.0-patched/arch/mips/kernel/scall32-o32.S 2007-12-11 12:34:52.000000000 +0100 -@@ -624,6 +624,8 @@ - sys sys_request_key 4 - sys sys_keyctl 5 - sys sys_set_thread_area 1 -+ sys sys_ioprio_set 3 -+ sys sys_ioprio_get 2 - - .endm - diff --git a/packages/linux/linux-dm800/linux-2.6.12-dm8000-nand.patch b/packages/linux/linux-dm800/linux-2.6.12-dm8000-nand.patch deleted file mode 100644 index 94dcf669b5..0000000000 --- a/packages/linux/linux-dm800/linux-2.6.12-dm8000-nand.patch +++ /dev/null @@ -1,235 +0,0 @@ -Index: stblinux-2.6.12/drivers/mtd/nand/Kconfig -=================================================================== ---- stblinux-2.6.12.orig/drivers/mtd/nand/Kconfig 2006-10-10 20:27:00.000000000 +0200 -+++ stblinux-2.6.12/drivers/mtd/nand/Kconfig 2007-04-25 02:36:25.000000000 +0200 -@@ -199,6 +199,10 @@ - help - Enables access to the Smart Media card interface on the AT91RM9200. 
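
This second patch hooks the DM8000 NAND controller into the MTD NAND framework; the MTD_NAND_DM8000 option added just below is a plain bool depending on MTD_NAND, so a Dreambox kernel config would enable it along these lines (illustrative fragment):

# Illustrative defconfig fragment for the DM8000 NAND driver
CONFIG_MTD=y
CONFIG_MTD_PARTITIONS=y
CONFIG_MTD_NAND=y
CONFIG_MTD_NAND_DM8000=y
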
- -+config MTD_NAND_DM8000 -+ bool "DM8000 NAND support" -+ depends on MTD_NAND -+ - config MTD_NAND_NANDSIM - tristate "Support for NAND Flash Simulator" - depends on m && MTD_NAND && MTD_PARTITIONS -Index: stblinux-2.6.12/drivers/mtd/nand/Makefile -=================================================================== ---- stblinux-2.6.12.orig/drivers/mtd/nand/Makefile 2006-10-10 20:27:00.000000000 +0200 -+++ stblinux-2.6.12/drivers/mtd/nand/Makefile 2007-04-25 02:36:25.000000000 +0200 -@@ -16,6 +16,7 @@ - obj-$(CONFIG_MTD_NAND_AU1550) += au1550nd.o - obj-$(CONFIG_MTD_NAND_PPCHAMELEONEVB) += ppchameleonevb.o - obj-$(CONFIG_MTD_NAND_S3C2410) += s3c2410.o -+obj-$(CONFIG_MTD_NAND_DM8000) += dm8000.o - obj-$(CONFIG_MTD_NAND_DISKONCHIP) += diskonchip.o - obj-$(CONFIG_MTD_NAND_H1900) += h1910.o - obj-$(CONFIG_MTD_NAND_RTC_FROM4) += rtc_from4.o -Index: stblinux-2.6.12/drivers/mtd/nand/dm8000.c -=================================================================== ---- /dev/null 1970-01-01 00:00:00.000000000 +0000 -+++ stblinux-2.6.12/drivers/mtd/nand/dm8000.c 2007-04-25 16:09:12.000000000 +0200 -@@ -0,0 +1,203 @@ -+/* -+ * drivers/mtd/nand/dm8000.c -+ * -+ * Copyright (C) 2000 Steven J. Hill (sjhill@realitydiluted.com) -+ * -+ * Modified for Dreambox DM8000 by Felix Domke <tmbinc@elitedvb.net> -+ * -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 as -+ * published by the Free Software Foundation. -+ * -+ * Overview: -+ * This is a device driver for the NAND flash device found on the -+ * DM8000 board. -+ * -+ */ -+ -+#include <linux/slab.h> -+#include <linux/module.h> -+#include <linux/mtd/mtd.h> -+#include <linux/mtd/nand.h> -+#include <linux/mtd/partitions.h> -+#include <asm/io.h> -+ -+static struct mtd_info *dm8000_mtd = NULL; -+/* -+ * Define partitions for flash device -+ */ -+const static struct mtd_partition partition_info[] = { -+#warning THIS STILL USES THE OLD PART LAYOUT -+ { -+ .name = "complete", -+ .offset = 0, -+ .size = 32*1024*1024 -+ }, -+ { -+ .name = "loader", -+ .offset = 0, -+ .size = 256*1024 -+ }, -+ { -+ .name = "boot partition", -+ .offset = 256*1024, -+ .size = (4*1024-256)*1024 -+ }, -+ { -+ .name = "root partition", -+ .offset = 4*1024*1024, -+ .size = 28*1024*1024 -+ }, -+}; -+#define NUM_PARTITIONS 4 -+ -+/* -+ * hardware specific access to control-lines -+ * our hardware logic handles the line according to the addresses. 
-+ */
-+static void dm8000_hwcontrol(struct mtd_info *mtd, int cmd)
-+{
-+ struct nand_chip *this = mtd->priv;
-+
-+ switch(cmd){
-+
-+ case NAND_CTL_SETCLE: this->IO_ADDR_W = (void*)0xBF030000; break; // COMMAND
-+ case NAND_CTL_CLRCLE: this->IO_ADDR_W = (void*)0xBF030004; break; // DATA
-+
-+ case NAND_CTL_SETALE: this->IO_ADDR_W = (void*)0xBF030002; break; // ADDRESS
-+ case NAND_CTL_CLRALE: this->IO_ADDR_W = (void*)0xBF030004; break; // DATA
-+
-+ case NAND_CTL_SETNCE: break; // CE is automatically asserted on command
-+ case NAND_CTL_CLRNCE: *(volatile unsigned char*)0xBF030003 = 0; break; // TERM
-+ default:
-+ BUG();
-+ }
-+}
-+
-+static void dm8000_nand_read_buf(struct mtd_info *mtd, u_char *buf, int len)
-+{
-+// struct nand_chip *this = mtd->priv;
-+
-+ *(volatile unsigned char*)0xBF030002;
-+
-+ while (len > 16)
-+ {
-+ *(long*)buf = *(volatile long*)(0xBF030004);
-+ *(long*)(buf+4) = *(volatile long*)(0xBF030004);
-+ *(long*)(buf+8) = *(volatile long*)(0xBF030004);
-+ *(long*)(buf+12) = *(volatile long*)(0xBF030004);
-+ buf += 16;
-+ len -= 16;
-+ }
-+
-+ while (len--)
-+ *buf++ = *(volatile unsigned char*)(0xBF030004);
-+}
-+
-+static int dm8000_dev_ready(struct mtd_info *mtd)
-+{
-+ *(volatile unsigned char*)0xBF030002;
-+ return 1;
-+}
-+
-+/*
-+ * Main initialization routine
-+ */
-+int __init dm8000_init (void)
-+{
-+ struct nand_chip *this;
-+ unsigned char probe[4];
-+ int i, j;
-+
-+ /* Allocate memory for MTD device structure and private data */
-+ dm8000_mtd = kmalloc (sizeof(struct mtd_info) + sizeof (struct nand_chip),
-+ GFP_KERNEL);
-+ if (!dm8000_mtd) {
-+ printk ("Unable to allocate DM8000 NAND MTD device structure.\n");
-+ return -ENOMEM;
-+ }
-+
-+ /* Get pointer to private data */
-+ this = (struct nand_chip *) (&dm8000_mtd[1]);
-+
-+ *(volatile unsigned char*)(0xBF030000) = 0x90;
-+ *(volatile unsigned char*)(0xBF030002) = 0;
-+ probe[0] = *(volatile unsigned char*)(0xBF030004);
-+ probe[1] = *(volatile unsigned char*)(0xBF030004);
-+ probe[2] = *(volatile unsigned char*)(0xBF030004);
-+ probe[3] = *(volatile unsigned char*)(0xBF030004);
-+ *(volatile unsigned char*)(0xBF030003) = 0; // term
-+
-+ j = jiffies;
-+ for (i=0; i<1000*1000/4; ++i)
-+ *(volatile unsigned long*)(0xBF030004);
-+ printk("%ld kb/s\n", 1000 * HZ / (jiffies-j));
-+
-+ printk(" - NAND PROBE: %02x %02x %02x %02x\n",
-+ probe[0], probe[1], probe[2], probe[3]);
-+
-+ /* Initialize structures */
-+ memset((char *) dm8000_mtd, 0, sizeof(struct mtd_info));
-+ memset((char *) this, 0, sizeof(struct nand_chip));
-+
-+ /* Link the private data with the MTD structure */
-+ dm8000_mtd->priv = this;
-+
-+ /* Set address of NAND IO lines */
-+ this->IO_ADDR_R = (void*)0xBF030004;
-+ this->IO_ADDR_W = (void*)0xBF030004;
-+
-+ /* Set address of hardware control function */
-+ this->hwcontrol = dm8000_hwcontrol;
-+ this->read_buf = dm8000_nand_read_buf;
-+ this->dev_ready = 0; // don't use dm8000_dev_ready
-+ /* 15 us command delay time */
-+ this->chip_delay = 15;
-+ this->eccmode = NAND_ECC_SOFT;
-+
-+ /* Scan to find existence of the device */
-+ if (nand_scan (dm8000_mtd, 1)) {
-+ kfree (dm8000_mtd);
-+ return -ENXIO;
-+ }
-+
-+ /* Allocate memory for internal data buffer */
-+ this->data_buf = kmalloc (sizeof(u_char) * (dm8000_mtd->oobblock + dm8000_mtd->oobsize), GFP_KERNEL);
-+ if (!this->data_buf) {
-+ printk ("Unable to allocate NAND data buffer for Dreambox.\n");
-+ kfree (dm8000_mtd);
-+ return -ENOMEM;
-+ }
-+
-+ /* Register the partitions */
-+ add_mtd_partitions(dm8000_mtd, partition_info,
NUM_PARTITIONS);
-+
-+ /* Return happy */
-+ return 0;
-+}
-+module_init(dm8000_init);
-+
-+/*
-+ * Clean up routine
-+ */
-+#ifdef MODULE
-+static void __exit dm8000_cleanup (void)
-+{
-+ struct nand_chip *this = (struct nand_chip *) &dm8000_mtd[1];
-+
-+ /* Unregister the device */
-+ del_mtd_device (dm8000_mtd);
-+
-+ /* Free internal data buffer */
-+ kfree (this->data_buf);
-+
-+ /* Free the MTD device structure */
-+ kfree (dm8000_mtd);
-+}
-+module_exit(dm8000_cleanup);
-+#endif
-+
-+MODULE_LICENSE("GPL");
-+MODULE_AUTHOR("Felix Domke <tmbinc@elitedvb.net>");
-+MODULE_DESCRIPTION("Dream-Multimedia DM8000 NAND flash board glue");
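
A note on exercising the two syscalls this patch series wires into the MIPS syscall table (sys_ioprio_set/sys_ioprio_get): the user-space sketch below is illustrative only and is not part of this commit. It assumes the packed priority format arbitrated by the ioprio_best() helper above (scheduling class in the upper bits, class data in the lower bits), borrows its constant values from later mainline kernels (IOPRIO_CLASS_SHIFT = 13, IOPRIO_CLASS_BE = 2, IOPRIO_WHO_PROCESS = 1), and assumes the patched headers provide __NR_ioprio_set and __NR_ioprio_get for this architecture.

/* Minimal sketch: drop the calling process to the lowest best-effort
 * io priority (BE class, level 7), then read the value back.
 * All IOPRIO_* constants below are assumptions mirroring later
 * mainline kernels; they are not taken from this diff. */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#define IOPRIO_CLASS_SHIFT 13 /* assumed: class lives in the upper bits */
#define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | (data))
#define IOPRIO_CLASS_BE 2 /* best-effort scheduling class */
#define IOPRIO_WHO_PROCESS 1 /* 'who' argument selects a single process */

int main(void)
{
	/* pid 0 means the calling process */
	if (syscall(__NR_ioprio_set, IOPRIO_WHO_PROCESS, 0,
	            IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 7)) < 0) {
		perror("ioprio_set");
		return 1;
	}
	printf("ioprio is now 0x%lx\n",
	       (long)syscall(__NR_ioprio_get, IOPRIO_WHO_PROCESS, 0));
	return 0;
}

The value ioprio_get() returns is the packed form; the IOPRIO_PRIO_CLASS()/IOPRIO_PRIO_DATA() helpers from the patch split it back into scheduling class and level.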