2.6 kernel raw device performance issue patch
글쓴이: nanosec / 작성시간: 금, 2006/01/06 - 10:41오전
현재 2.6 kernel ( RHEL4 UP1) 을 64bit system에서 구동하고 있습니다.
RAW DEVICE를 사용하는데 성능이 너무 안 좋아(어느 정도는 KNOWN ISSUE로 보입니다) 관련 내용을 검색하던 중 아래의 패치를 찾았습니다.
적용 전에 어떤 내용이 수정된 것인지, 이 패치로 인해 다른 문제가 발생하지는 않을지 확인하고자 질문드립니다.
아래는 찾은 파일입니다.
This patch adds block device direct I/O for AIO path. 30% performance gain!! AIO (io_submit) 2.6.9 206,917 2.6.9+patches 268,484 - Ken Signed-off-by: Ken Chen <kenneth.w.c...@intel.com> --- linux-2.6.9/drivers/char/raw.c 2005-03-08 17:22:07.000000000 -0800 +++ linux-2.6.9.ken/drivers/char/raw.c 2005-03-08 17:25:38.000000000 -0800 @@ -385,21 +385,148 @@ static ssize_t raw_file_write(struct fil return raw_file_rw(file, (char __user *) buf, count, ppos, WRITE); } -static ssize_t raw_file_aio_write(struct kiocb *iocb, const char __user *buf, - size_t count, loff_t pos) +int raw_end_aio(struct bio *bio, unsigned int bytes_done, int error) { - struct iovec local_iov = { - .iov_base = (char __user *)buf, - .iov_len = count - }; + struct kiocb* iocb = bio->bi_private; + atomic_t* bio_count = (atomic_t*) &iocb->private; + + if ((bio->bi_rw & 0x1) == READ) + bio_check_pages_dirty(bio); + else { + int i; + struct bio_vec *bvec = bio->bi_io_vec; + struct page *page; + for (i = 0; i < bio->bi_vcnt; i++) { + page = bvec[i].bv_page; + if (page) + put_page(page); + } + bio_put(bio); + } + if (atomic_dec_and_test(bio_count)) + aio_complete(iocb, iocb->ki_nbytes, 0); - return generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos); + return 0; } +static ssize_t raw_file_aio_rw(struct kiocb *iocb, char __user *buf, + size_t count, loff_t pos, int rw) +{ + struct inode * inode = iocb->ki_filp->f_mapping->host; + unsigned long blkbits = inode->i_blkbits; + unsigned long blocksize_mask = (1<< blkbits) - 1; + struct page * quick_list[PAGE_QUICK_LIST]; + int nr_pages, cur_offset, cur_len; + struct bio * bio; + unsigned long ret; + unsigned long addr = (unsigned long) buf; + loff_t size; + int pg_idx; + atomic_t *bio_count = (atomic_t *) &iocb->private; + + if (count == 0) + return 0; + + /* first check the alignment */ + if (addr & blocksize_mask || count & blocksize_mask || + count < 0 || pos & blocksize_mask) + return -EINVAL; + + size = i_size_read(inode); + if (pos >= size) 
+ return -ENXIO; + if (pos + count > size) + count = size - pos; + + nr_pages = (addr + count + PAGE_SIZE - 1) / PAGE_SIZE - + addr / PAGE_SIZE; + + pg_idx = PAGE_QUICK_LIST; + atomic_set(bio_count, 1); + +start: + bio = bio_alloc(GFP_KERNEL, nr_pages); + if (unlikely(bio == NULL)) { + if (atomic_read(bio_count) == 1) + return -ENOMEM; + else { + iocb->ki_nbytes = addr - (unsigned long) buf; + goto out; + } + } + + /* initialize bio */ + bio->bi_bdev = I_BDEV(inode); + bio->bi_end_io = raw_end_aio; + bio->bi_private = iocb; + bio->bi_sector = pos >> blkbits; + + while (count > 0) { + cur_offset = addr & ~PAGE_MASK; + cur_len = PAGE_SIZE - cur_offset; + if (cur_len > count) + cur_len = count; + + if (pg_idx >= PAGE_QUICK_LIST) { + down_read(&current->mm->mmap_sem); + ret = get_user_pages(current, current->mm, addr, + min(nr_pages, PAGE_QUICK_LIST), + rw==READ, 0, quick_list, NULL); + up_read(&current->mm->mmap_sem); + if (unlikely(ret < 0)) { + bio_put(bio); + if (atomic_read(bio_count) == 1) + return ret; + else { + iocb->ki_nbytes = addr - (unsigned long) buf; + goto out; + } + } + pg_idx = 0; + } + + if (unlikely(!bio_add_page(bio, quick_list[pg_idx], cur_len, cur_offset))) { + atomic_inc(bio_count); + if (rw == READ) + bio_set_pages_dirty(bio); + submit_bio(rw, bio); + pos += addr - (unsigned long) buf; + goto start; + } + + addr += cur_len; + count -= cur_len; + pg_idx++; + nr_pages--; + } + + atomic_inc(bio_count); + if (rw == READ) + bio_set_pages_dirty(bio); + submit_bio(rw, bio); +out: + blk_run_address_space(inode->i_mapping); + if (atomic_dec_and_test(bio_count)) { + aio_complete(iocb, iocb->ki_nbytes, 0); + } + + return -EIOCBQUEUED; +} + +ssize_t raw_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos) +{ + return raw_file_aio_rw(iocb, buf, count, pos, READ); +} + +static ssize_t raw_file_aio_write(struct kiocb *iocb, const char __user *buf, + size_t count, loff_t pos) +{ + return raw_file_aio_rw(iocb, (char __user *) buf, count, pos, 
WRITE); +} static struct file_operations raw_fops = { .read = raw_file_read, - .aio_read = generic_file_aio_read, + .aio_read = raw_file_aio_read, .write = raw_file_write, .aio_write = raw_file_aio_write, .open = raw_open,
이것과
OK, last one in the series: user level test programs that stress the kernel I/O stack. Pretty dull stuff. - Ken diff -Nur zero/aio_null.c blknull_test/aio_null.c --- zero/aio_null.c 1969-12-31 16:00:00.000000000 -0800 +++ blknull_test/aio_null.c 2005-03-08 00:46:17.000000000 -0800 @@ -0,0 +1,76 @@ +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sched.h> +#include <signal.h> +#include <sys/types.h> +#include <linux/ioctl.h> +#include <libaio.h> + +#define MAXAIO 1024 + +char buf[4096] __attribute__((aligned(4096))); + +io_context_t io_ctx; +struct iocb iocbpool[MAXAIO]; +struct io_event ioevent[MAXAIO]; + +void aio_setup(int n) +{ + int res = io_queue_init(n, &io_ctx); + if (res != 0) { + printf("io_queue_setup(%d) returned %d (%s)\n", + n, res, strerror(-res)); + exit(0); + } +} + +main(int argc, char* argv[]) +{ + int fd, i, status, batch; + struct iocb* iocbbatch[MAXAIO]; + int devidx; + off_t offset; + unsigned long start, end; + + batch = argc < 2 ? 
100: atoi(argv[1]); + if (batch >= MAXAIO) + batch = MAXAIO; + + aio_setup(MAXAIO); + fd = open("/dev/raw/raw1", O_RDONLY); + + if (fd == -1) { + perror("error opening\n"); + exit (0); + } + for (i=0; i<batch; i++) { + iocbbatch[i] = iocbpool+i; + io_prep_pread(iocbbatch[i], fd, buf, 4096, 0); + } + + while (1) { + struct timespec ts={30,0}; + int bufidx; + int reap; + + status = io_submit(io_ctx, i, iocbbatch); + if (status != i) { + printf("bad io_submit: %d [%s]\n", status, + strerror(-status)); + } + + // reap at least batch count back + reap = io_getevents(io_ctx, batch, MAXAIO, ioevent, &ts); + if (reap < batch) { + printf("io_getevents returned=%d [%s]\n", reap, + strerror(-reap)); + } + + // check the return result of each event + for (i=0; i<reap; i++) + if (ioevent[i].res != 4096) + printf("error in read: %lx\n", ioevent[i].res); + } /* while (1) */ +} diff -Nur zero/pread_null.c blknull_test/pread_null.c --- zero/pread_null.c 1969-12-31 16:00:00.000000000 -0800 +++ blknull_test/pread_null.c 2005-03-08 00:44:20.000000000 -0800 @@ -0,0 +1,27 @@ +#include <stdio.h> +#include <fcntl.h> +#include <unistd.h> +#include <malloc.h> + +main(int argc, char* argv[]) +{ + int fd; + char *addr; + + fd = open("/dev/raw/raw1", O_RDONLY); + if (fd == -1) { + perror("error opening\n"); + exit(0); + } + + addr = memalign(4096, 4096); + if (addr == 0) { + printf("no memory\n"); + exit(0); + } + + while (1) { + pread(fd, addr, 4096, 0); + } + +} diff -Nur zero/makefile blknull_test/makefile --- zero/makefile 1969-12-31 16:00:00.000000000 -0800 +++ blknull_test/makefile 2005-03-08 17:10:39.000000000 -0800 @@ -0,0 +1,10 @@ +all: pread_null aio_null + +pread_null: pread_null.c + gcc -O3 -o $@ pread_null.c + +aio_null: aio_null.c + gcc -O3 -o $@ aio_null.c -laio + +clean: + rm -f pread_null aio_null
이렇게 세 가지입니다.
도움 주심에 미리 감사말씀 드립니다.
새해 복 많이 받으세요.
The pseudo disk driver that I used to stress the kernel I/O stack (anything above block layer, AIO/DIO/BIO). - Ken diff -Nur zero/blknull.c blknull/blknull.c --- zero/blknull.c 1969-12-31 16:00:00.000000000 -0800 +++ blknull/blknull.c 2005-03-03 19:04:07.000000000 -0800 @@ -0,0 +1,97 @@ +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/major.h> +#include <linux/fs.h> +#include <linux/bio.h> +#include <linux/blkpg.h> +#include <linux/spinlock.h> + +#include <linux/blkdev.h> +#include <linux/genhd.h> + +#define BLK_NULL_MAJOR 60 +#define BLK_NULL_NAME "blknull" + + +MODULE_AUTHOR("Ken Chen"); +MODULE_DESCRIPTION("null block driver"); +MODULE_LICENSE("GPL"); + + +spinlock_t driver_lock; +struct request_queue *q; +struct gendisk *disk; + + +static int null_open(struct inode *inode, struct file *filp) +{ + return 0; +} + +static int null_release(struct inode *inode, struct file *filp) +{ + return 0; +} + +static struct block_device_operations null_fops = { + .owner = THIS_MODULE, + .open = null_open, + .release = null_release, +}; + +static void do_null_request(request_queue_t *q) +{ + struct request *req; + + while (!blk_queue_plugged(q)) { + req = elv_next_request(q); + if (!req) + break; + + blkdev_dequeue_request(req); + + end_that_request_first(req, 1, req->nr_sectors); + end_that_request_last(req); + } +} + +static int __init init_blk_null_module(void) +{ + + if (register_blkdev(BLK_NULL_MAJOR, BLK_NULL_NAME)) { + printk(KERN_ERR "Unable to register null blk device\n"); + return 0; + } + + spin_lock_init(&driver_lock); + q = blk_init_queue(do_null_request, &driver_lock); + if (q) { + disk = alloc_disk(1); + + if (disk) { + disk->major = BLK_NULL_MAJOR; + disk->first_minor = 0; + disk->fops = &null_fops; + disk->capacity = 1<<30; + disk->queue = q; + memcpy(disk->disk_name, BLK_NULL_NAME, sizeof(BLK_NULL_NAME)); + add_disk(disk); + return 1; + } + + blk_cleanup_queue(q); + } + unregister_blkdev(BLK_NULL_MAJOR, 
BLK_NULL_NAME); + return 0; +} + +static void __exit exit_blk_null_module(void) +{ + del_gendisk(disk); + blk_cleanup_queue(q); + unregister_blkdev(BLK_NULL_MAJOR, BLK_NULL_NAME); +} + +module_init(init_blk_null_module); +module_exit(exit_blk_null_module); diff -Nur zero/Makefile blknull/Makefile --- zero/Makefile 1969-12-31 16:00:00.000000000 -0800 +++ blknull/Makefile 2005-03-03 18:42:55.000000000 -0800 @@ -0,0 +1 @@ +obj-m := blknull.o
Forums:
다른것은 모르겠고,앞에 Ken이라는 이름을 보니..그냥 믿어도 될
다른것은 모르겠고,
앞에 Ken이라는 이름을 보니..
그냥 믿어도 될 것 같은데요?
ps) 이 사람, cat으로 바이너리를 짠다던 그 사람 아닌가요?
아니면 말구...
댓글 달기