1 diff -urNp --exclude CVS --exclude BitKeeper x-ref/drivers/block/ll_rw_blk.c x/drivers/block/ll_rw_blk.c
2 --- x-ref/drivers/block/ll_rw_blk.c 2003-06-12 04:47:41.000000000 +0200
3 +++ x/drivers/block/ll_rw_blk.c 2003-06-12 04:47:55.000000000 +0200
4 @@ -596,12 +596,20 @@ static struct request *__get_request_wai
5 register struct request *rq;
6 DECLARE_WAITQUEUE(wait, current);
8 - add_wait_queue(&q->wait_for_requests[rw], &wait);
9 + add_wait_queue_exclusive(&q->wait_for_requests[rw], &wait);
11 set_current_state(TASK_UNINTERRUPTIBLE);
12 - generic_unplug_device(q);
13 - if (q->rq[rw].count == 0)
14 + if (q->rq[rw].count == 0) {
16 + * All we care about is not to stall if any request
17 + * has been released after we set TASK_UNINTERRUPTIBLE.
18 + * This is the most efficient place to unplug the queue
19 + * in case we hit the race and we can get the request
22 + generic_unplug_device(q);
25 spin_lock_irq(&io_request_lock);
26 rq = get_request(q, rw);
27 spin_unlock_irq(&io_request_lock);
28 @@ -611,6 +619,17 @@ static struct request *__get_request_wai
32 +static void get_request_wait_wakeup(request_queue_t *q, int rw)
35 + * avoid losing an unplug if a second __get_request_wait did the
36 + * generic_unplug_device while our __get_request_wait was running
37 + * w/o the queue_lock held and w/ our request out of the queue.
39 + if (q->rq[rw].count == 0 && waitqueue_active(&q->wait_for_requests[rw]))
40 + __generic_unplug_device(q);
43 /* RO fail safe mechanism */
45 static long ro_bits[MAX_BLKDEV][8];
46 @@ -835,8 +854,11 @@ void blkdev_release_request(struct reque
49 list_add(&req->queue, &q->rq[rw].free);
50 - if (++q->rq[rw].count >= q->batch_requests)
51 - wake_up(&q->wait_for_requests[rw]);
52 + if (++q->rq[rw].count >= q->batch_requests) {
54 + if (waitqueue_active(&q->wait_for_requests[rw]))
55 + wake_up(&q->wait_for_requests[rw]);
60 @@ -907,6 +929,7 @@ static inline void attempt_front_merge(r
61 static int __make_request(request_queue_t * q, int rw,
62 struct buffer_head * bh)
64 + int need_unplug = 0;
65 unsigned int sector, count;
66 int max_segments = MAX_SEGMENTS;
67 struct request * req, *freereq = NULL;
68 @@ -954,7 +977,6 @@ static int __make_request(request_queue_
70 max_sectors = get_max_sectors(bh->b_rdev);
74 head = &q->queue_head;
76 @@ -963,6 +985,7 @@ again:
78 spin_lock_irq(&io_request_lock);
81 insert_here = head->prev;
82 if (list_empty(head)) {
83 q->plug_device_fn(q, bh->b_rdev); /* is atomic */
84 @@ -1048,6 +1071,9 @@ get_rq:
86 spin_unlock_irq(&io_request_lock);
87 freereq = __get_request_wait(q, rw);
88 + head = &q->queue_head;
90 + spin_lock_irq(&q->queue_lock);
94 @@ -1074,6 +1100,8 @@ get_rq:
97 blkdev_release_request(freereq);
99 + get_request_wait_wakeup(q, rw);
100 spin_unlock_irq(&io_request_lock);
103 @@ -1202,8 +1230,21 @@ void __submit_bh(int rw, struct buffer_h
104 bh->b_rdev = bh->b_dev;
105 bh->b_rsector = bh->b_blocknr * count;
108 + * Really we could read random memory in the waitqueue
109 + * check and at worst we would trigger a false positive
110 + * queue unplug, however getting the reference
111 + * on the bh and reading allocated memory is cleaner.
114 generic_make_request(rw, bh);
116 + /* fix race condition with wait_on_buffer() */
117 + smp_mb(); /* spin_unlock may have inclusive semantics */
118 + if (waitqueue_active(&bh->b_wait))
119 + run_task_queue(&tq_disk);
124 kstat.pgpgout += count;
125 diff -urNp --exclude CVS --exclude BitKeeper x-ref/fs/buffer.c x/fs/buffer.c
126 --- x-ref/fs/buffer.c 2003-06-12 04:47:41.000000000 +0200
127 +++ x/fs/buffer.c 2003-06-12 04:47:44.000000000 +0200
128 @@ -158,10 +158,23 @@ void __wait_on_buffer(struct buffer_head
130 add_wait_queue(&bh->b_wait, &wait);
132 - run_task_queue(&tq_disk);
133 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
134 if (!buffer_locked(bh))
137 + * We must read tq_disk in TQ_ACTIVE after the
138 + * add_wait_queue effect is visible to other cpus.
139 + * We could unplug some line above it wouldn't matter
140 + * but we can't do that right after add_wait_queue
141 + * without an smp_mb() in between because spin_unlock
142 + * has inclusive semantics.
143 + * Doing it here is the most efficient place so we
144 + * don't do a spurious unplug if we get a racy
145 + * wakeup that makes buffer_locked return 0, and
146 + * doing it here avoids an explicit smp_mb() we
147 + * rely on the implicit one in set_task_state.
149 + run_task_queue(&tq_disk);
151 } while (buffer_locked(bh));
152 tsk->state = TASK_RUNNING;
153 @@ -1471,6 +1484,7 @@ static int __block_write_full_page(struc
156 create_empty_buffers(page, inode->i_dev, 1 << inode->i_blkbits);
157 + BUG_ON(page_count(page) < 3);
158 head = page->buffers;
160 block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
161 @@ -1517,6 +1531,9 @@ static int __block_write_full_page(struc
163 /* Done - end_buffer_io_async will unlock */
164 SetPageUptodate(page);
166 + wakeup_page_waiters(page);
171 @@ -1548,6 +1565,7 @@ out:
172 } while (bh != head);
175 + wakeup_page_waiters(page);
179 @@ -1721,6 +1739,7 @@ int block_read_full_page(struct page *pa
180 blocksize = 1 << inode->i_blkbits;
182 create_empty_buffers(page, inode->i_dev, blocksize);
183 + BUG_ON(page_count(page) < 3);
184 head = page->buffers;
186 blocks = PAGE_CACHE_SIZE >> inode->i_blkbits;
187 @@ -1781,6 +1800,8 @@ int block_read_full_page(struct page *pa
192 + wakeup_page_waiters(page);
196 @@ -2400,6 +2421,7 @@ int brw_page(int rw, struct page *page,
199 create_empty_buffers(page, dev, size);
200 + BUG_ON(page_count(page) < 3);
201 head = bh = page->buffers;
203 /* Stage 1: lock all the buffers */
204 @@ -2417,6 +2439,7 @@ int brw_page(int rw, struct page *page,
207 } while (bh != head);
208 + wakeup_page_waiters(page);
212 diff -urNp --exclude CVS --exclude BitKeeper x-ref/fs/reiserfs/inode.c x/fs/reiserfs/inode.c
213 --- x-ref/fs/reiserfs/inode.c 2003-06-12 04:47:35.000000000 +0200
214 +++ x/fs/reiserfs/inode.c 2003-06-12 04:47:44.000000000 +0200
215 @@ -2048,6 +2048,7 @@ static int reiserfs_write_full_page(stru
218 submit_bh_for_writepage(page, arr, nr) ;
219 + wakeup_page_waiters(page);
223 diff -urNp --exclude CVS --exclude BitKeeper x-ref/include/linux/pagemap.h x/include/linux/pagemap.h
224 --- x-ref/include/linux/pagemap.h 2003-06-12 04:47:41.000000000 +0200
225 +++ x/include/linux/pagemap.h 2003-06-12 04:47:44.000000000 +0200
226 @@ -98,6 +98,8 @@ static inline void wait_on_page(struct p
227 ___wait_on_page(page);
230 +extern void FASTCALL(wakeup_page_waiters(struct page * page));
233 * Returns locked page at given index in given cache, creating it if needed.
235 diff -urNp --exclude CVS --exclude BitKeeper x-ref/kernel/ksyms.c x/kernel/ksyms.c
236 --- x-ref/kernel/ksyms.c 2003-06-12 04:47:41.000000000 +0200
237 +++ x/kernel/ksyms.c 2003-06-12 04:47:44.000000000 +0200
238 @@ -319,6 +319,7 @@ EXPORT_SYMBOL(filemap_fdatasync);
239 EXPORT_SYMBOL(filemap_fdatawait);
240 EXPORT_SYMBOL(lock_page);
241 EXPORT_SYMBOL(unlock_page);
242 +EXPORT_SYMBOL(wakeup_page_waiters);
244 /* device registration */
245 EXPORT_SYMBOL(register_chrdev);
246 diff -urNp --exclude CVS --exclude BitKeeper x-ref/mm/filemap.c x/mm/filemap.c
247 --- x-ref/mm/filemap.c 2003-06-12 04:47:41.000000000 +0200
248 +++ x/mm/filemap.c 2003-06-12 04:47:44.000000000 +0200
249 @@ -779,6 +779,20 @@ inline wait_queue_head_t * page_waitqueu
250 return wait_table_hashfn(page, &pgdat->wait_table);
254 + * This must be called after every submit_bh with end_io
255 + * callbacks that would result into the blkdev layer waking
256 + * up the page after a queue unplug.
258 +void wakeup_page_waiters(struct page * page)
260 + wait_queue_head_t * head;
262 + head = page_waitqueue(page);
263 + if (waitqueue_active(head))
268 * Wait for a page to get unlocked.
270 diff -urNp --exclude CVS --exclude BitKeeper x-ref/mm/swapfile.c x/mm/swapfile.c
271 --- x-ref/mm/swapfile.c 2003-06-12 04:47:41.000000000 +0200
272 +++ x/mm/swapfile.c 2003-06-12 04:47:44.000000000 +0200
273 @@ -984,8 +984,10 @@ asmlinkage long sys_swapon(const char *
277 + get_page(virt_to_page(swap_header));
278 lock_page(virt_to_page(swap_header));
279 rw_swap_page_nolock(READ, SWP_ENTRY(type,0), (char *) swap_header);
280 + put_page(virt_to_page(swap_header));
282 if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
283 swap_header_version = 1;