1
2
3
4
5
6
7
8
9
10
11#include <linux/kernel.h>
12#include <linux/init.h>
13#include <linux/errno.h>
14#include <linux/time.h>
15#include <linux/aio_abi.h>
16#include <linux/module.h>
17
18#define DEBUG 0
19
20#include <linux/sched.h>
21#include <linux/fs.h>
22#include <linux/file.h>
23#include <linux/mm.h>
24#include <linux/mman.h>
25#include <linux/slab.h>
26#include <linux/timer.h>
27#include <linux/aio.h>
28#include <linux/highmem.h>
29#include <linux/workqueue.h>
30#include <linux/security.h>
31
32#include <asm/kmap_types.h>
33#include <asm/uaccess.h>
34#include <asm/mmu_context.h>
35
/*
 * dprintk() - verbose tracing hook.
 *
 * Maps straight onto printk() when DEBUG is greater than 1.  At any lower
 * debug level it expands to an empty statement, so its arguments are
 * never evaluated.
 */
#if !(DEBUG > 1)
#define dprintk(x...)	do { ; } while (0)
#else
#define dprintk		printk
#endif
41
42long aio_run = 0;
43long aio_wakeups = 0;
44
45
46atomic_t aio_nr = ATOMIC_INIT(0);
47unsigned aio_max_nr = 0x10000;
48
49
50static kmem_cache_t *kiocb_cachep;
51static kmem_cache_t *kioctx_cachep;
52
53static struct workqueue_struct *aio_wq;
54
55
56static void aio_fput_routine(void *);
57static DECLARE_WORK(fput_work, aio_fput_routine, NULL);
58
59static spinlock_t fput_lock = SPIN_LOCK_UNLOCKED;
60LIST_HEAD(fput_head);
61
62static void aio_kick_handler(void *);
63static void aio_queue_work(struct kioctx *);
64
65
66
67
68
69static int __init aio_setup(void)
70{
71 kiocb_cachep = kmem_cache_create("kiocb", sizeof(struct kiocb),
72 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
73 kioctx_cachep = kmem_cache_create("kioctx", sizeof(struct kioctx),
74 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
75
76 aio_wq = create_workqueue("aio");
77
78 pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
79
80 return 0;
81}
82
83static void aio_free_ring(struct kioctx *ctx)
84{
85 struct aio_ring_info *info = &ctx->ring_info;
86 long i;
87
88 for (i=0; i<info->nr_pages; i++)
89 put_page(info->ring_pages[i]);
90
91 if (info->mmap_size) {
92 down_write(&ctx->mm->mmap_sem);
93 do_munmap(ctx->mm, info->mmap_base, info->mmap_size);
94 up_write(&ctx->mm->mmap_sem);
95 }
96
97 if (info->ring_pages && info->ring_pages != info->internal_pages)
98 kfree(info->ring_pages);
99 info->ring_pages = NULL;
100 info->nr = 0;
101}
102
103static int aio_setup_ring(struct kioctx *ctx)
104{
105 struct aio_ring *ring;
106 struct aio_ring_info *info = &ctx->ring_info;
107 unsigned nr_events = ctx->max_reqs;
108 unsigned long size;
109 int nr_pages;
110
111
112 nr_events += 2;
113
114 size = sizeof(struct aio_ring);
115 size += sizeof(struct io_event) * nr_events;
116 nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT;
117
118 if (nr_pages < 0)
119 return -EINVAL;
120
121 nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event);
122
123 info->nr = 0;
124 info->ring_pages = info->internal_pages;
125 if (nr_pages > AIO_RING_PAGES) {
126 info->ring_pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_KERNEL);
127 if (!info->ring_pages)
128 return -ENOMEM;
129 memset(info->ring_pages, 0, sizeof(struct page *) * nr_pages);
130 }
131
132 info->mmap_size = nr_pages * PAGE_SIZE;
133 dprintk("attempting mmap of %lu bytes\n", info->mmap_size);
134 down_write(&ctx->mm->mmap_sem);
135 info->mmap_base = do_mmap(NULL, 0, info->mmap_size,
136 PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE,
137 0);
138 if (IS_ERR((void *)info->mmap_base)) {
139 up_write(&ctx->mm->mmap_sem);
140 printk("mmap err: %ld\n", -info->mmap_base);
141 info->mmap_size = 0;
142 aio_free_ring(ctx);
143 return -EAGAIN;
144 }
145
146 dprintk("mmap address: 0x%08lx\n", info->mmap_base);
147 info->nr_pages = get_user_pages(current, ctx->mm,
148 info->mmap_base, nr_pages,
149 1, 0, info->ring_pages, NULL);
150 up_write(&ctx->mm->mmap_sem);
151
152 if (unlikely(info->nr_pages != nr_pages)) {
153 aio_free_ring(ctx);
154 return -EAGAIN;
155 }
156
157 ctx->user_id = info->mmap_base;
158
159 info->nr = nr_events;
160
161 ring = kmap_atomic(info->ring_pages[0], KM_USER0);
162 ring->nr = nr_events;
163 ring->id = ctx->user_id;
164 ring->head = ring->tail = 0;
165 ring->magic = AIO_RING_MAGIC;
166 ring->compat_features = AIO_RING_COMPAT_FEATURES;
167 ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
168 ring->header_length = sizeof(struct aio_ring);
169 kunmap_atomic(ring, KM_USER0);
170
171 return 0;
172}
173
174
175
176
177
/* io_event slots that fit in one whole page. */
#define AIO_EVENTS_PER_PAGE	(PAGE_SIZE / sizeof(struct io_event))
/* io_event slots that fit in the first page after the aio_ring header. */
#define AIO_EVENTS_FIRST_PAGE	((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
/* Bias added to an event index so that index 0 falls after the header. */
#define AIO_EVENTS_OFFSET	(AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)

/*
 * aio_ring_event() - map the ring page holding event @nr using atomic
 * kmap slot @km and evaluate to a pointer to that event.  Must be paired
 * with put_aio_ring_event() using the same @km.
 */
#define aio_ring_event(info, nr, km) ({					\
	unsigned pos = (nr) + AIO_EVENTS_OFFSET;			\
	struct io_event *__event;					\
	__event = kmap_atomic(						\
			(info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \
	__event + (pos % AIO_EVENTS_PER_PAGE);				\
})

/*
 * put_aio_ring_event() - release a mapping taken by aio_ring_event().
 * The event pointer is rounded down to its page boundary before being
 * handed to kunmap_atomic().
 */
#define put_aio_ring_event(event, km) do {				\
	struct io_event *__event = (event);				\
	(void)__event;							\
	kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \
} while(0)
196
197
198
199
200static struct kioctx *ioctx_alloc(unsigned nr_events)
201{
202 struct mm_struct *mm;
203 struct kioctx *ctx;
204
205
206 if ((nr_events > (0x10000000U / sizeof(struct io_event))) ||
207 (nr_events > (0x10000000U / sizeof(struct kiocb)))) {
208 pr_debug("ENOMEM: nr_events too high\n");
209 return ERR_PTR(-EINVAL);
210 }
211
212 if (nr_events > aio_max_nr)
213 return ERR_PTR(-EAGAIN);
214
215 ctx = kmem_cache_alloc(kioctx_cachep, GFP_KERNEL);
216 if (!ctx)
217 return ERR_PTR(-ENOMEM);
218
219 memset(ctx, 0, sizeof(*ctx));
220 ctx->max_reqs = nr_events;
221 mm = ctx->mm = current->mm;
222 atomic_inc(&mm->mm_count);
223
224 atomic_set(&ctx->users, 1);
225 spin_lock_init(&ctx->ctx_lock);
226 spin_lock_init(&ctx->ring_info.ring_lock);
227 init_waitqueue_head(&ctx->wait);
228
229 INIT_LIST_HEAD(&ctx->active_reqs);
230 INIT_LIST_HEAD(&ctx->run_list);
231 INIT_WORK(&ctx->wq, aio_kick_handler, ctx);
232
233 if (aio_setup_ring(ctx) < 0)
234 goto out_freectx;
235
236
237 atomic_add(ctx->max_reqs, &aio_nr);
238 if (unlikely(atomic_read(&aio_nr) > aio_max_nr))
239 goto out_cleanup;
240
241
242 write_lock(&mm->ioctx_list_lock);
243 ctx->next = mm->ioctx_list;
244 mm->ioctx_list = ctx;
245 write_unlock(&mm->ioctx_list_lock);
246
247 dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
248 ctx, ctx->user_id, current->mm, ctx->ring_info.nr);
249 return ctx;
250
251out_cleanup:
252 atomic_sub(ctx->max_reqs, &aio_nr);
253 ctx->max_reqs = 0;
254 __put_ioctx(ctx);
255 return ERR_PTR(-EAGAIN);
256
257out_freectx:
258 mmdrop(mm);
259 kmem_cache_free(kioctx_cachep, ctx);
260 ctx = ERR_PTR(-ENOMEM);
261
262 dprintk("aio: error allocating ioctx %p\n", ctx);
263 return ctx;
264}
265
266
267
268
269
270
271static void aio_cancel_all(struct kioctx *ctx)
272{
273 int (*cancel)(struct kiocb *, struct io_event *);
274 struct io_event res;
275 spin_lock_irq(&ctx->ctx_lock);
276 ctx->dead = 1;
277 while (!list_empty(&ctx->active_reqs)) {
278 struct list_head *pos = ctx->active_reqs.next;
279 struct kiocb *iocb = list_kiocb(pos);
280 list_del_init(&iocb->ki_list);
281 cancel = iocb->ki_cancel;
282 kiocbSetCancelled(iocb);
283 if (cancel) {
284 iocb->ki_users++;
285 spin_unlock_irq(&ctx->ctx_lock);
286 cancel(iocb, &res);
287 spin_lock_irq(&ctx->ctx_lock);
288 }
289 }
290 spin_unlock_irq(&ctx->ctx_lock);
291}
292
293void wait_for_all_aios(struct kioctx *ctx)
294{
295 struct task_struct *tsk = current;
296 DECLARE_WAITQUEUE(wait, tsk);
297
298 spin_lock_irq(&ctx->ctx_lock);
299 if (!ctx->reqs_active)
300 goto out;
301
302 add_wait_queue(&ctx->wait, &wait);
303 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
304 while (ctx->reqs_active) {
305 spin_unlock_irq(&ctx->ctx_lock);
306 io_schedule();
307 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
308 spin_lock_irq(&ctx->ctx_lock);
309 }
310 __set_task_state(tsk, TASK_RUNNING);
311 remove_wait_queue(&ctx->wait, &wait);
312
313out:
314 spin_unlock_irq(&ctx->ctx_lock);
315}
316
317
318
319
320ssize_t fastcall wait_on_sync_kiocb(struct kiocb *iocb)
321{
322 while (iocb->ki_users) {
323 set_current_state(TASK_UNINTERRUPTIBLE);
324 if (!iocb->ki_users)
325 break;
326 io_schedule();
327 }
328 __set_current_state(TASK_RUNNING);
329 return iocb->ki_user_data;
330}
331
332
333
334
335
336
337
338
339void fastcall exit_aio(struct mm_struct *mm)
340{
341 struct kioctx *ctx = mm->ioctx_list;
342 mm->ioctx_list = NULL;
343 while (ctx) {
344 struct kioctx *next = ctx->next;
345 ctx->next = NULL;
346 aio_cancel_all(ctx);
347
348 wait_for_all_aios(ctx);
349
350
351
352
353 flush_workqueue(aio_wq);
354
355 if (1 != atomic_read(&ctx->users))
356 printk(KERN_DEBUG
357 "exit_aio:ioctx still alive: %d %d %d\n",
358 atomic_read(&ctx->users), ctx->dead,
359 ctx->reqs_active);
360 put_ioctx(ctx);
361 ctx = next;
362 }
363}
364
365
366
367
368
369void fastcall __put_ioctx(struct kioctx *ctx)
370{
371 unsigned nr_events = ctx->max_reqs;
372
373 if (unlikely(ctx->reqs_active))
374 BUG();
375
376 cancel_delayed_work(&ctx->wq);
377 flush_workqueue(aio_wq);
378 aio_free_ring(ctx);
379 mmdrop(ctx->mm);
380 ctx->mm = NULL;
381 pr_debug("__put_ioctx: freeing %p\n", ctx);
382 kmem_cache_free(kioctx_cachep, ctx);
383
384 atomic_sub(nr_events, &aio_nr);
385}
386
387
388
389
390
391
392
393
394
395
396
397static struct kiocb *FASTCALL(__aio_get_req(struct kioctx *ctx));
398static struct kiocb fastcall *__aio_get_req(struct kioctx *ctx)
399{
400 struct kiocb *req = NULL;
401 struct aio_ring *ring;
402 int okay = 0;
403
404 req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
405 if (unlikely(!req))
406 return NULL;
407
408 req->ki_flags = 1 << KIF_LOCKED;
409 req->ki_users = 2;
410 req->ki_key = 0;
411 req->ki_ctx = ctx;
412 req->ki_cancel = NULL;
413 req->ki_retry = NULL;
414 req->ki_obj.user = NULL;
415 req->ki_dtor = NULL;
416 req->private = NULL;
417 INIT_LIST_HEAD(&req->ki_run_list);
418
419
420
421
422 spin_lock_irq(&ctx->ctx_lock);
423 ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0);
424 if (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)) {
425 list_add(&req->ki_list, &ctx->active_reqs);
426 ctx->reqs_active++;
427 okay = 1;
428 }
429 kunmap_atomic(ring, KM_USER0);
430 spin_unlock_irq(&ctx->ctx_lock);
431
432 if (!okay) {
433 kmem_cache_free(kiocb_cachep, req);
434 req = NULL;
435 }
436
437 return req;
438}
439
440static inline struct kiocb *aio_get_req(struct kioctx *ctx)
441{
442 struct kiocb *req;
443
444
445
446
447 req = __aio_get_req(ctx);
448 if (unlikely(NULL == req)) {
449 aio_fput_routine(NULL);
450 req = __aio_get_req(ctx);
451 }
452 return req;
453}
454
455static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
456{
457 if (req->ki_dtor)
458 req->ki_dtor(req);
459 req->ki_ctx = NULL;
460 req->ki_filp = NULL;
461 req->ki_obj.user = NULL;
462 req->ki_dtor = NULL;
463 req->private = NULL;
464 kmem_cache_free(kiocb_cachep, req);
465 ctx->reqs_active--;
466
467 if (unlikely(!ctx->reqs_active && ctx->dead))
468 wake_up(&ctx->wait);
469}
470
471static void aio_fput_routine(void *data)
472{
473 spin_lock_irq(&fput_lock);
474 while (likely(!list_empty(&fput_head))) {
475 struct kiocb *req = list_kiocb(fput_head.next);
476 struct kioctx *ctx = req->ki_ctx;
477
478 list_del(&req->ki_list);
479 spin_unlock_irq(&fput_lock);
480
481
482 __fput(req->ki_filp);
483
484
485 spin_lock_irq(&ctx->ctx_lock);
486 really_put_req(ctx, req);
487 spin_unlock_irq(&ctx->ctx_lock);
488
489 put_ioctx(ctx);
490 spin_lock_irq(&fput_lock);
491 }
492 spin_unlock_irq(&fput_lock);
493}
494
495
496
497
498static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
499{
500 dprintk(KERN_DEBUG "aio_put(%p): f_count=%d\n",
501 req, atomic_read(&req->ki_filp->f_count));
502
503 req->ki_users --;
504 if (unlikely(req->ki_users < 0))
505 BUG();
506 if (likely(req->ki_users))
507 return 0;
508 list_del(&req->ki_list);
509 req->ki_cancel = NULL;
510 req->ki_retry = NULL;
511
512
513
514
515 if (unlikely(atomic_dec_and_test(&req->ki_filp->f_count))) {
516 get_ioctx(ctx);
517 spin_lock(&fput_lock);
518 list_add(&req->ki_list, &fput_head);
519 spin_unlock(&fput_lock);
520 queue_work(aio_wq, &fput_work);
521 } else
522 really_put_req(ctx, req);
523 return 1;
524}
525
526
527
528
529
530int fastcall aio_put_req(struct kiocb *req)
531{
532 struct kioctx *ctx = req->ki_ctx;
533 int ret;
534 spin_lock_irq(&ctx->ctx_lock);
535 ret = __aio_put_req(ctx, req);
536 spin_unlock_irq(&ctx->ctx_lock);
537 return ret;
538}
539
540
541
542
543struct kioctx *lookup_ioctx(unsigned long ctx_id)
544{
545 struct kioctx *ioctx;
546 struct mm_struct *mm;
547
548 mm = current->mm;
549 read_lock(&mm->ioctx_list_lock);
550 for (ioctx = mm->ioctx_list; ioctx; ioctx = ioctx->next)
551 if (likely(ioctx->user_id == ctx_id && !ioctx->dead)) {
552 get_ioctx(ioctx);
553 break;
554 }
555 read_unlock(&mm->ioctx_list_lock);
556
557 return ioctx;
558}
559
560
561
562
563
564
565
566
567
568
569
570static void use_mm(struct mm_struct *mm)
571{
572 struct mm_struct *active_mm;
573 struct task_struct *tsk = current;
574
575 task_lock(tsk);
576 active_mm = tsk->active_mm;
577 atomic_inc(&mm->mm_count);
578 tsk->mm = mm;
579 tsk->active_mm = mm;
580 activate_mm(active_mm, mm);
581 task_unlock(tsk);
582
583 mmdrop(active_mm);
584}
585
586
587
588
589
590
591
592
593
594
595
596
597void unuse_mm(struct mm_struct *mm)
598{
599 struct task_struct *tsk = current;
600
601 task_lock(tsk);
602 tsk->mm = NULL;
603
604 enter_lazy_tlb(mm, tsk);
605 task_unlock(tsk);
606}
607
608
609
610
611
612
613
614
615
616
617
618
619
620static inline int __queue_kicked_iocb(struct kiocb *iocb)
621{
622 struct kioctx *ctx = iocb->ki_ctx;
623
624 if (list_empty(&iocb->ki_run_list)) {
625 list_add_tail(&iocb->ki_run_list,
626 &ctx->run_list);
627 iocb->ki_queued++;
628 return 1;
629 }
630 return 0;
631}
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655static ssize_t aio_run_iocb(struct kiocb *iocb)
656{
657 struct kioctx *ctx = iocb->ki_ctx;
658 ssize_t (*retry)(struct kiocb *);
659 ssize_t ret;
660
661 if (iocb->ki_retried++ > 1024*1024) {
662 printk("Maximal retry count. Bytes done %Zd\n",
663 iocb->ki_nbytes - iocb->ki_left);
664 return -EAGAIN;
665 }
666
667 if (!(iocb->ki_retried & 0xff)) {
668 pr_debug("%ld retry: %d of %d (kick %ld, Q %ld run %ld, wake %ld)\n",
669 iocb->ki_retried,
670 iocb->ki_nbytes - iocb->ki_left, iocb->ki_nbytes,
671 iocb->ki_kicked, iocb->ki_queued, aio_run, aio_wakeups);
672 }
673
674 if (!(retry = iocb->ki_retry)) {
675 printk("aio_run_iocb: iocb->ki_retry = NULL\n");
676 return 0;
677 }
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698 kiocbClearKicked(iocb);
699
700
701
702
703
704
705
706 iocb->ki_run_list.next = iocb->ki_run_list.prev = NULL;
707 spin_unlock_irq(&ctx->ctx_lock);
708
709
710 if (kiocbIsCancelled(iocb)) {
711 ret = -EINTR;
712 aio_complete(iocb, ret, 0);
713
714 goto out;
715 }
716
717
718
719
720
721
722
723
724
725
726
727 BUG_ON(current->io_wait != NULL);
728 current->io_wait = &iocb->ki_wait;
729 ret = retry(iocb);
730 current->io_wait = NULL;
731
732 if (-EIOCBRETRY != ret) {
733 if (-EIOCBQUEUED != ret) {
734 BUG_ON(!list_empty(&iocb->ki_wait.task_list));
735 aio_complete(iocb, ret, 0);
736
737 }
738 } else {
739
740
741
742
743 if (list_empty(&iocb->ki_wait.task_list))
744 kiocbSetKicked(iocb);
745 }
746out:
747 spin_lock_irq(&ctx->ctx_lock);
748
749 if (-EIOCBRETRY == ret) {
750
751
752
753
754
755
756
757
758 INIT_LIST_HEAD(&iocb->ki_run_list);
759
760
761 if (kiocbIsKicked(iocb)) {
762 __queue_kicked_iocb(iocb);
763
764
765
766
767
768
769
770 aio_queue_work(ctx);
771 }
772 }
773 return ret;
774}
775
776
777
778
779
780
781
782
783static int __aio_run_iocbs(struct kioctx *ctx)
784{
785 struct kiocb *iocb;
786 int count = 0;
787 LIST_HEAD(run_list);
788
789 list_splice_init(&ctx->run_list, &run_list);
790 while (!list_empty(&run_list)) {
791 iocb = list_entry(run_list.next, struct kiocb,
792 ki_run_list);
793 list_del(&iocb->ki_run_list);
794
795
796
797 iocb->ki_users++;
798 aio_run_iocb(iocb);
799 __aio_put_req(ctx, iocb);
800 count++;
801 }
802 aio_run++;
803 if (!list_empty(&ctx->run_list))
804 return 1;
805 return 0;
806}
807
808static void aio_queue_work(struct kioctx * ctx)
809{
810 unsigned long timeout;
811
812
813
814
815 smp_mb();
816 if (waitqueue_active(&ctx->wait))
817 timeout = 1;
818 else
819 timeout = HZ/10;
820 queue_delayed_work(aio_wq, &ctx->wq, timeout);
821}
822
823
824
825
826
827
828
829
830
831static inline void aio_run_iocbs(struct kioctx *ctx)
832{
833 int requeue;
834
835 spin_lock_irq(&ctx->ctx_lock);
836
837 requeue = __aio_run_iocbs(ctx);
838 spin_unlock_irq(&ctx->ctx_lock);
839 if (requeue)
840 aio_queue_work(ctx);
841}
842
843
844
845
846
847static inline void aio_run_all_iocbs(struct kioctx *ctx)
848{
849 spin_lock_irq(&ctx->ctx_lock);
850 while (__aio_run_iocbs(ctx))
851 ;
852 spin_unlock_irq(&ctx->ctx_lock);
853}
854
855
856
857
858
859
860
861
862
863
864static void aio_kick_handler(void *data)
865{
866 struct kioctx *ctx = data;
867 mm_segment_t oldfs = get_fs();
868 int requeue;
869
870 set_fs(USER_DS);
871 use_mm(ctx->mm);
872 spin_lock_irq(&ctx->ctx_lock);
873 requeue =__aio_run_iocbs(ctx);
874 unuse_mm(ctx->mm);
875 spin_unlock_irq(&ctx->ctx_lock);
876 set_fs(oldfs);
877
878
879
880 if (requeue)
881 queue_work(aio_wq, &ctx->wq);
882}
883
884
885
886
887
888
889
890void queue_kicked_iocb(struct kiocb *iocb)
891{
892 struct kioctx *ctx = iocb->ki_ctx;
893 unsigned long flags;
894 int run = 0;
895
896 WARN_ON((!list_empty(&iocb->ki_wait.task_list)));
897
898 spin_lock_irqsave(&ctx->ctx_lock, flags);
899 run = __queue_kicked_iocb(iocb);
900 spin_unlock_irqrestore(&ctx->ctx_lock, flags);
901 if (run) {
902 aio_queue_work(ctx);
903 aio_wakeups++;
904 }
905}
906
907
908
909
910
911
912
913
914void fastcall kick_iocb(struct kiocb *iocb)
915{
916
917
918 if (is_sync_kiocb(iocb)) {
919 kiocbSetKicked(iocb);
920 wake_up_process(iocb->ki_obj.tsk);
921 return;
922 }
923
924 iocb->ki_kicked++;
925
926 if (!kiocbTryKick(iocb)) {
927 queue_kicked_iocb(iocb);
928 }
929}
930EXPORT_SYMBOL(kick_iocb);
931
932
933
934
935
936
937int fastcall aio_complete(struct kiocb *iocb, long res, long res2)
938{
939 struct kioctx *ctx = iocb->ki_ctx;
940 struct aio_ring_info *info;
941 struct aio_ring *ring;
942 struct io_event *event;
943 unsigned long flags;
944 unsigned long tail;
945 int ret;
946
947
948
949
950
951
952
953 if (is_sync_kiocb(iocb)) {
954 int ret;
955
956 iocb->ki_user_data = res;
957 if (iocb->ki_users == 1) {
958 iocb->ki_users = 0;
959 ret = 1;
960 } else {
961 spin_lock_irq(&ctx->ctx_lock);
962 iocb->ki_users--;
963 ret = (0 == iocb->ki_users);
964 spin_unlock_irq(&ctx->ctx_lock);
965 }
966
967 wake_up_process(iocb->ki_obj.tsk);
968 return ret;
969 }
970
971 info = &ctx->ring_info;
972
973
974
975
976
977
978
979 spin_lock_irqsave(&ctx->ctx_lock, flags);
980
981 if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list))
982 list_del_init(&iocb->ki_run_list);
983
984
985
986
987
988 if (kiocbIsCancelled(iocb))
989 goto put_rq;
990
991 ring = kmap_atomic(info->ring_pages[0], KM_IRQ1);
992
993 tail = info->tail;
994 event = aio_ring_event(info, tail, KM_IRQ0);
995 tail = (tail + 1) % info->nr;
996
997 event->obj = (u64)(unsigned long)iocb->ki_obj.user;
998 event->data = iocb->ki_user_data;
999 event->res = res;
1000 event->res2 = res2;
1001
1002 dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n",
1003 ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
1004 res, res2);
1005
1006
1007
1008
1009 smp_wmb();
1010
1011 info->tail = tail;
1012 ring->tail = tail;
1013
1014 put_aio_ring_event(event, KM_IRQ0);
1015 kunmap_atomic(ring, KM_IRQ1);
1016
1017 pr_debug("added to ring %p at [%lu]\n", iocb, tail);
1018
1019 pr_debug("%ld retries: %d of %d (kicked %ld, Q %ld run %ld wake %ld)\n",
1020 iocb->ki_retried,
1021 iocb->ki_nbytes - iocb->ki_left, iocb->ki_nbytes,
1022 iocb->ki_kicked, iocb->ki_queued, aio_run, aio_wakeups);
1023put_rq:
1024
1025 ret = __aio_put_req(ctx, iocb);
1026
1027
1028
1029
1030
1031
1032
1033 smp_mb();
1034
1035 if (waitqueue_active(&ctx->wait))
1036 wake_up(&ctx->wait);
1037
1038 spin_unlock_irqrestore(&ctx->ctx_lock, flags);
1039 return ret;
1040}
1041
1042
1043
1044
1045
1046
1047
1048static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)
1049{
1050 struct aio_ring_info *info = &ioctx->ring_info;
1051 struct aio_ring *ring;
1052 unsigned long head;
1053 int ret = 0;
1054
1055 ring = kmap_atomic(info->ring_pages[0], KM_USER0);
1056 dprintk("in aio_read_evt h%lu t%lu m%lu\n",
1057 (unsigned long)ring->head, (unsigned long)ring->tail,
1058 (unsigned long)ring->nr);
1059
1060 if (ring->head == ring->tail)
1061 goto out;
1062
1063 spin_lock(&info->ring_lock);
1064
1065 head = ring->head % info->nr;
1066 if (head != ring->tail) {
1067 struct io_event *evp = aio_ring_event(info, head, KM_USER1);
1068 *ent = *evp;
1069 head = (head + 1) % info->nr;
1070 smp_mb();
1071 ring->head = head;
1072 ret = 1;
1073 put_aio_ring_event(evp, KM_USER1);
1074 }
1075 spin_unlock(&info->ring_lock);
1076
1077out:
1078 kunmap_atomic(ring, KM_USER0);
1079 dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret,
1080 (unsigned long)ring->head, (unsigned long)ring->tail);
1081 return ret;
1082}
1083
1084struct aio_timeout {
1085 struct timer_list timer;
1086 int timed_out;
1087 struct task_struct *p;
1088};
1089
1090static void timeout_func(unsigned long data)
1091{
1092 struct aio_timeout *to = (struct aio_timeout *)data;
1093
1094 to->timed_out = 1;
1095 wake_up_process(to->p);
1096}
1097
1098static inline void init_timeout(struct aio_timeout *to)
1099{
1100 init_timer(&to->timer);
1101 to->timer.data = (unsigned long)to;
1102 to->timer.function = timeout_func;
1103 to->timed_out = 0;
1104 to->p = current;
1105}
1106
1107static inline void set_timeout(long start_jiffies, struct aio_timeout *to,
1108 const struct timespec *ts)
1109{
1110 to->timer.expires = start_jiffies + timespec_to_jiffies(ts);
1111 if (time_after(to->timer.expires, jiffies))
1112 add_timer(&to->timer);
1113 else
1114 to->timed_out = 1;
1115}
1116
1117static inline void clear_timeout(struct aio_timeout *to)
1118{
1119 del_singleshot_timer_sync(&to->timer);
1120}
1121
1122static int read_events(struct kioctx *ctx,
1123 long min_nr, long nr,
1124 struct io_event __user *event,
1125 struct timespec __user *timeout)
1126{
1127 long start_jiffies = jiffies;
1128 struct task_struct *tsk = current;
1129 DECLARE_WAITQUEUE(wait, tsk);
1130 int ret;
1131 int i = 0;
1132 struct io_event ent;
1133 struct aio_timeout to;
1134 int event_loop = 0;
1135 int retry = 0;
1136
1137
1138
1139
1140 memset(&ent, 0, sizeof(ent));
1141retry:
1142 ret = 0;
1143 while (likely(i < nr)) {
1144 ret = aio_read_evt(ctx, &ent);
1145 if (unlikely(ret <= 0))
1146 break;
1147
1148 dprintk("read event: %Lx %Lx %Lx %Lx\n",
1149 ent.data, ent.obj, ent.res, ent.res2);
1150
1151
1152 ret = -EFAULT;
1153 if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
1154 dprintk("aio: lost an event due to EFAULT.\n");
1155 break;
1156 }
1157 ret = 0;
1158
1159
1160 event ++;
1161 i ++;
1162 }
1163
1164 if (min_nr <= i)
1165 return i;
1166 if (ret)
1167 return ret;
1168
1169
1170
1171
1172 if (!retry && unlikely(!list_empty(&ctx->run_list))) {
1173 retry = 1;
1174 aio_run_all_iocbs(ctx);
1175 goto retry;
1176 }
1177
1178 init_timeout(&to);
1179 if (timeout) {
1180 struct timespec ts;
1181 ret = -EFAULT;
1182 if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))
1183 goto out;
1184
1185 set_timeout(start_jiffies, &to, &ts);
1186 }
1187
1188 while (likely(i < nr)) {
1189 add_wait_queue_exclusive(&ctx->wait, &wait);
1190 do {
1191 set_task_state(tsk, TASK_INTERRUPTIBLE);
1192 ret = aio_read_evt(ctx, &ent);
1193 if (ret)
1194 break;
1195 if (min_nr <= i)
1196 break;
1197 ret = 0;
1198 if (to.timed_out)
1199 break;
1200
1201
1202 if (ctx->reqs_active)
1203 io_schedule();
1204 else
1205 schedule();
1206 event_loop++;
1207 if (signal_pending(tsk)) {
1208 ret = -EINTR;
1209 break;
1210 }
1211
1212 } while (1) ;
1213
1214 set_task_state(tsk, TASK_RUNNING);
1215 remove_wait_queue(&ctx->wait, &wait);
1216
1217 if (unlikely(ret <= 0))
1218 break;
1219
1220 ret = -EFAULT;
1221 if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
1222 dprintk("aio: lost an event due to EFAULT.\n");
1223 break;
1224 }
1225
1226
1227 event ++;
1228 i ++;
1229 }
1230
1231 if (timeout)
1232 clear_timeout(&to);
1233out:
1234 pr_debug("event loop executed %d times\n", event_loop);
1235 pr_debug("aio_run %ld\n", aio_run);
1236 pr_debug("aio_wakeups %ld\n", aio_wakeups);
1237 return i ? i : ret;
1238}
1239
1240
1241
1242
1243static void io_destroy(struct kioctx *ioctx)
1244{
1245 struct mm_struct *mm = current->mm;
1246 struct kioctx **tmp;
1247 int was_dead;
1248
1249
1250 write_lock(&mm->ioctx_list_lock);
1251 was_dead = ioctx->dead;
1252 ioctx->dead = 1;
1253 for (tmp = &mm->ioctx_list; *tmp && *tmp != ioctx;
1254 tmp = &(*tmp)->next)
1255 ;
1256 if (*tmp)
1257 *tmp = ioctx->next;
1258 write_unlock(&mm->ioctx_list_lock);
1259
1260 dprintk("aio_release(%p)\n", ioctx);
1261 if (likely(!was_dead))
1262 put_ioctx(ioctx);
1263
1264 aio_cancel_all(ioctx);
1265 wait_for_all_aios(ioctx);
1266 put_ioctx(ioctx);
1267}
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282asmlinkage long sys_io_setup(unsigned nr_events, aio_context_t __user *ctxp)
1283{
1284 struct kioctx *ioctx = NULL;
1285 unsigned long ctx;
1286 long ret;
1287
1288 ret = get_user(ctx, ctxp);
1289 if (unlikely(ret))
1290 goto out;
1291
1292 ret = -EINVAL;
1293 if (unlikely(ctx || (int)nr_events <= 0)) {
1294 pr_debug("EINVAL: io_setup: ctx or nr_events > max\n");
1295 goto out;
1296 }
1297
1298 ioctx = ioctx_alloc(nr_events);
1299 ret = PTR_ERR(ioctx);
1300 if (!IS_ERR(ioctx)) {
1301 ret = put_user(ioctx->user_id, ctxp);
1302 if (!ret)
1303 return 0;
1304
1305 get_ioctx(ioctx);
1306 io_destroy(ioctx);
1307 }
1308
1309out:
1310 return ret;
1311}
1312
1313
1314
1315
1316
1317
1318
1319asmlinkage long sys_io_destroy(aio_context_t ctx)
1320{
1321 struct kioctx *ioctx = lookup_ioctx(ctx);
1322 if (likely(NULL != ioctx)) {
1323 io_destroy(ioctx);
1324 return 0;
1325 }
1326 pr_debug("EINVAL: io_destroy: invalid context id\n");
1327 return -EINVAL;
1328}
1329
1330
1331
1332
1333
1334static ssize_t aio_pread(struct kiocb *iocb)
1335{
1336 struct file *file = iocb->ki_filp;
1337 struct address_space *mapping = file->f_mapping;
1338 struct inode *inode = mapping->host;
1339 ssize_t ret = 0;
1340
1341 ret = file->f_op->aio_read(iocb, iocb->ki_buf,
1342 iocb->ki_left, iocb->ki_pos);
1343
1344
1345
1346
1347
1348 if (ret > 0) {
1349 iocb->ki_buf += ret;
1350 iocb->ki_left -= ret;
1351
1352
1353
1354
1355
1356
1357 if (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode))
1358 ret = -EIOCBRETRY;
1359 }
1360
1361
1362
1363 if ((ret == 0) || (iocb->ki_left == 0))
1364 ret = iocb->ki_nbytes - iocb->ki_left;
1365
1366 return ret;
1367}
1368
1369
1370
1371
1372
1373static ssize_t aio_pwrite(struct kiocb *iocb)
1374{
1375 struct file *file = iocb->ki_filp;
1376 ssize_t ret = 0;
1377
1378 ret = file->f_op->aio_write(iocb, iocb->ki_buf,
1379 iocb->ki_left, iocb->ki_pos);
1380
1381 if (ret > 0) {
1382 iocb->ki_buf += ret;
1383 iocb->ki_left -= ret;
1384
1385 ret = -EIOCBRETRY;
1386 }
1387
1388
1389
1390 if ((ret == 0) || (iocb->ki_left == 0))
1391 ret = iocb->ki_nbytes - iocb->ki_left;
1392
1393 return ret;
1394}
1395
1396static ssize_t aio_fdsync(struct kiocb *iocb)
1397{
1398 struct file *file = iocb->ki_filp;
1399 ssize_t ret = -EINVAL;
1400
1401 if (file->f_op->aio_fsync)
1402 ret = file->f_op->aio_fsync(iocb, 1);
1403 return ret;
1404}
1405
1406static ssize_t aio_fsync(struct kiocb *iocb)
1407{
1408 struct file *file = iocb->ki_filp;
1409 ssize_t ret = -EINVAL;
1410
1411 if (file->f_op->aio_fsync)
1412 ret = file->f_op->aio_fsync(iocb, 0);
1413 return ret;
1414}
1415
1416
1417
1418
1419
1420
1421ssize_t aio_setup_iocb(struct kiocb *kiocb)
1422{
1423 struct file *file = kiocb->ki_filp;
1424 ssize_t ret = 0;
1425
1426 switch (kiocb->ki_opcode) {
1427 case IOCB_CMD_PREAD:
1428 ret = -EBADF;
1429 if (unlikely(!(file->f_mode & FMODE_READ)))
1430 break;
1431 ret = -EFAULT;
1432 if (unlikely(!access_ok(VERIFY_WRITE, kiocb->ki_buf,
1433 kiocb->ki_left)))
1434 break;
1435 ret = -EINVAL;
1436 if (file->f_op->aio_read)
1437 kiocb->ki_retry = aio_pread;
1438 break;
1439 case IOCB_CMD_PWRITE:
1440 ret = -EBADF;
1441 if (unlikely(!(file->f_mode & FMODE_WRITE)))
1442 break;
1443 ret = -EFAULT;
1444 if (unlikely(!access_ok(VERIFY_READ, kiocb->ki_buf,
1445 kiocb->ki_left)))
1446 break;
1447 ret = -EINVAL;
1448 if (file->f_op->aio_write)
1449 kiocb->ki_retry = aio_pwrite;
1450 break;
1451 case IOCB_CMD_FDSYNC:
1452 ret = -EINVAL;
1453 if (file->f_op->aio_fsync)
1454 kiocb->ki_retry = aio_fdsync;
1455 break;
1456 case IOCB_CMD_FSYNC:
1457 ret = -EINVAL;
1458 if (file->f_op->aio_fsync)
1459 kiocb->ki_retry = aio_fsync;
1460 break;
1461 default:
1462 dprintk("EINVAL: io_submit: no operation provided\n");
1463 ret = -EINVAL;
1464 }
1465
1466 if (!kiocb->ki_retry)
1467 return ret;
1468
1469 return 0;
1470}
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490int aio_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
1491{
1492 struct kiocb *iocb = container_of(wait, struct kiocb, ki_wait);
1493
1494 list_del_init(&wait->task_list);
1495 kick_iocb(iocb);
1496 return 1;
1497}
1498
1499int fastcall io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1500 struct iocb *iocb)
1501{
1502 struct kiocb *req;
1503 struct file *file;
1504 ssize_t ret;
1505
1506
1507 if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2 ||
1508 iocb->aio_reserved3)) {
1509 pr_debug("EINVAL: io_submit: reserve field set\n");
1510 return -EINVAL;
1511 }
1512
1513
1514 if (unlikely(
1515 (iocb->aio_buf != (unsigned long)iocb->aio_buf) ||
1516 (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) ||
1517 ((ssize_t)iocb->aio_nbytes < 0)
1518 )) {
1519 pr_debug("EINVAL: io_submit: overflow check\n");
1520 return -EINVAL;
1521 }
1522
1523 file = fget(iocb->aio_fildes);
1524 if (unlikely(!file))
1525 return -EBADF;
1526
1527 req = aio_get_req(ctx);
1528 if (unlikely(!req)) {
1529 fput(file);
1530 return -EAGAIN;
1531 }
1532
1533 req->ki_filp = file;
1534 iocb->aio_key = req->ki_key;
1535 ret = put_user(iocb->aio_key, &user_iocb->aio_key);
1536 if (unlikely(ret)) {
1537 dprintk("EFAULT: aio_key\n");
1538 goto out_put_req;
1539 }
1540
1541 req->ki_obj.user = user_iocb;
1542 req->ki_user_data = iocb->aio_data;
1543 req->ki_pos = iocb->aio_offset;
1544
1545 req->ki_buf = (char __user *)(unsigned long)iocb->aio_buf;
1546 req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
1547 req->ki_opcode = iocb->aio_lio_opcode;
1548 init_waitqueue_func_entry(&req->ki_wait, aio_wake_function);
1549 INIT_LIST_HEAD(&req->ki_wait.task_list);
1550 req->ki_run_list.next = req->ki_run_list.prev = NULL;
1551 req->ki_retry = NULL;
1552 req->ki_retried = 0;
1553 req->ki_kicked = 0;
1554 req->ki_queued = 0;
1555 aio_run = 0;
1556 aio_wakeups = 0;
1557
1558 ret = aio_setup_iocb(req);
1559
1560 if (ret)
1561 goto out_put_req;
1562
1563 spin_lock_irq(&ctx->ctx_lock);
1564 if (likely(list_empty(&ctx->run_list))) {
1565 aio_run_iocb(req);
1566 } else {
1567 list_add_tail(&req->ki_run_list, &ctx->run_list);
1568
1569 while (__aio_run_iocbs(ctx))
1570 ;
1571 }
1572 spin_unlock_irq(&ctx->ctx_lock);
1573 aio_put_req(req);
1574 return 0;
1575
1576out_put_req:
1577 aio_put_req(req);
1578 aio_put_req(req);
1579 return ret;
1580}
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594asmlinkage long sys_io_submit(aio_context_t ctx_id, long nr,
1595 struct iocb __user * __user *iocbpp)
1596{
1597 struct kioctx *ctx;
1598 long ret = 0;
1599 int i;
1600
1601 if (unlikely(nr < 0))
1602 return -EINVAL;
1603
1604 if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp)))))
1605 return -EFAULT;
1606
1607 ctx = lookup_ioctx(ctx_id);
1608 if (unlikely(!ctx)) {
1609 pr_debug("EINVAL: io_submit: invalid context id\n");
1610 return -EINVAL;
1611 }
1612
1613
1614
1615
1616
1617 for (i=0; i<nr; i++) {
1618 struct iocb __user *user_iocb;
1619 struct iocb tmp;
1620
1621 if (unlikely(__get_user(user_iocb, iocbpp + i))) {
1622 ret = -EFAULT;
1623 break;
1624 }
1625
1626 if (unlikely(copy_from_user(&tmp, user_iocb, sizeof(tmp)))) {
1627 ret = -EFAULT;
1628 break;
1629 }
1630
1631 ret = io_submit_one(ctx, user_iocb, &tmp);
1632 if (ret)
1633 break;
1634 }
1635
1636 put_ioctx(ctx);
1637 return i ? i : ret;
1638}
1639
1640
1641
1642
1643
1644struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, u32 key)
1645{
1646 struct list_head *pos;
1647
1648 list_for_each(pos, &ctx->active_reqs) {
1649 struct kiocb *kiocb = list_kiocb(pos);
1650 if (kiocb->ki_obj.user == iocb && kiocb->ki_key == key)
1651 return kiocb;
1652 }
1653 return NULL;
1654}
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb,
1667 struct io_event __user *result)
1668{
1669 int (*cancel)(struct kiocb *iocb, struct io_event *res);
1670 struct kioctx *ctx;
1671 struct kiocb *kiocb;
1672 u32 key;
1673 int ret;
1674
1675 ret = get_user(key, &iocb->aio_key);
1676 if (unlikely(ret))
1677 return -EFAULT;
1678
1679 ctx = lookup_ioctx(ctx_id);
1680 if (unlikely(!ctx))
1681 return -EINVAL;
1682
1683 spin_lock_irq(&ctx->ctx_lock);
1684 ret = -EAGAIN;
1685 kiocb = lookup_kiocb(ctx, iocb, key);
1686 if (kiocb && kiocb->ki_cancel) {
1687 cancel = kiocb->ki_cancel;
1688 kiocb->ki_users ++;
1689 kiocbSetCancelled(kiocb);
1690 } else
1691 cancel = NULL;
1692 spin_unlock_irq(&ctx->ctx_lock);
1693
1694 if (NULL != cancel) {
1695 struct io_event tmp;
1696 pr_debug("calling cancel\n");
1697 memset(&tmp, 0, sizeof(tmp));
1698 tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user;
1699 tmp.data = kiocb->ki_user_data;
1700 ret = cancel(kiocb, &tmp);
1701 if (!ret) {
1702
1703
1704
1705 if (copy_to_user(result, &tmp, sizeof(tmp)))
1706 ret = -EFAULT;
1707 }
1708 } else
1709 ret = -EINVAL;
1710
1711 put_ioctx(ctx);
1712
1713 return ret;
1714}
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728asmlinkage long sys_io_getevents(aio_context_t ctx_id,
1729 long min_nr,
1730 long nr,
1731 struct io_event __user *events,
1732 struct timespec __user *timeout)
1733{
1734 struct kioctx *ioctx = lookup_ioctx(ctx_id);
1735 long ret = -EINVAL;
1736
1737 if (likely(ioctx)) {
1738 if (likely(min_nr <= nr && min_nr >= 0 && nr >= 0))
1739 ret = read_events(ioctx, min_nr, nr, events, timeout);
1740 put_ioctx(ioctx);
1741 }
1742
1743 return ret;
1744}
1745
/* Run aio_setup() during kernel initialisation. */
__initcall(aio_setup);

/* Entry points made available to other kernel code. */
EXPORT_SYMBOL(aio_complete);
EXPORT_SYMBOL(aio_put_req);
EXPORT_SYMBOL(wait_on_sync_kiocb);
1751