2.6 kernel raw device performance issue patch
글쓴이: nanosec / 작성시간: 금, 2006/01/06 - 10:41오전
현재 2.6 kernel ( RHEL4 UP1) 을 64bit system에서 구동하고 있습니다.
RAW DEVICE를 사용하는데, 성능이 너무 안좋아 (어느정도의 KNOWN ISSUE로 보입니다..) 그 내용으로 검색중 아래의 패치를 찾았습니다.
적용전 어떠한 내용이 수정된 것인지, 이것으로 발생될 다른 문제는 없을지 확인하고자 질문 드립니다.
아래는 찾은 파일입니다.
This patch adds block device direct I/O for AIO path.
30% performance gain!!
AIO (io_submit)
2.6.9 206,917
2.6.9+patches 268,484
- Ken
Signed-off-by: Ken Chen <kenneth.w.c...@intel.com>
--- linux-2.6.9/drivers/char/raw.c 2005-03-08 17:22:07.000000000 -0800
+++ linux-2.6.9.ken/drivers/char/raw.c 2005-03-08 17:25:38.000000000 -0800
@@ -385,21 +385,148 @@ static ssize_t raw_file_write(struct fil
return raw_file_rw(file, (char __user *) buf, count, ppos, WRITE);
}
-static ssize_t raw_file_aio_write(struct kiocb *iocb, const char __user *buf,
- size_t count, loff_t pos)
+int raw_end_aio(struct bio *bio, unsigned int bytes_done, int error)
{
- struct iovec local_iov = {
- .iov_base = (char __user *)buf,
- .iov_len = count
- };
+ struct kiocb* iocb = bio->bi_private;
+ atomic_t* bio_count = (atomic_t*) &iocb->private;
+
+ if ((bio->bi_rw & 0x1) == READ)
+ bio_check_pages_dirty(bio);
+ else {
+ int i;
+ struct bio_vec *bvec = bio->bi_io_vec;
+ struct page *page;
+ for (i = 0; i < bio->bi_vcnt; i++) {
+ page = bvec[i].bv_page;
+ if (page)
+ put_page(page);
+ }
+ bio_put(bio);
+ }
+ if (atomic_dec_and_test(bio_count))
+ aio_complete(iocb, iocb->ki_nbytes, 0);
- return generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);
+ return 0;
}
+static ssize_t raw_file_aio_rw(struct kiocb *iocb, char __user *buf,
+ size_t count, loff_t pos, int rw)
+{
+ struct inode * inode = iocb->ki_filp->f_mapping->host;
+ unsigned long blkbits = inode->i_blkbits;
+ unsigned long blocksize_mask = (1<< blkbits) - 1;
+ struct page * quick_list[PAGE_QUICK_LIST];
+ int nr_pages, cur_offset, cur_len;
+ struct bio * bio;
+ unsigned long ret;
+ unsigned long addr = (unsigned long) buf;
+ loff_t size;
+ int pg_idx;
+ atomic_t *bio_count = (atomic_t *) &iocb->private;
+
+ if (count == 0)
+ return 0;
+
+ /* first check the alignment */
+ if (addr & blocksize_mask || count & blocksize_mask ||
+ count < 0 || pos & blocksize_mask)
+ return -EINVAL;
+
+ size = i_size_read(inode);
+ if (pos >= size)
+ return -ENXIO;
+ if (pos + count > size)
+ count = size - pos;
+
+ nr_pages = (addr + count + PAGE_SIZE - 1) / PAGE_SIZE -
+ addr / PAGE_SIZE;
+
+ pg_idx = PAGE_QUICK_LIST;
+ atomic_set(bio_count, 1);
+
+start:
+ bio = bio_alloc(GFP_KERNEL, nr_pages);
+ if (unlikely(bio == NULL)) {
+ if (atomic_read(bio_count) == 1)
+ return -ENOMEM;
+ else {
+ iocb->ki_nbytes = addr - (unsigned long) buf;
+ goto out;
+ }
+ }
+
+ /* initialize bio */
+ bio->bi_bdev = I_BDEV(inode);
+ bio->bi_end_io = raw_end_aio;
+ bio->bi_private = iocb;
+ bio->bi_sector = pos >> blkbits;
+
+ while (count > 0) {
+ cur_offset = addr & ~PAGE_MASK;
+ cur_len = PAGE_SIZE - cur_offset;
+ if (cur_len > count)
+ cur_len = count;
+
+ if (pg_idx >= PAGE_QUICK_LIST) {
+ down_read(¤t->mm->mmap_sem);
+ ret = get_user_pages(current, current->mm, addr,
+ min(nr_pages, PAGE_QUICK_LIST),
+ rw==READ, 0, quick_list, NULL);
+ up_read(¤t->mm->mmap_sem);
+ if (unlikely(ret < 0)) {
+ bio_put(bio);
+ if (atomic_read(bio_count) == 1)
+ return ret;
+ else {
+ iocb->ki_nbytes = addr - (unsigned long) buf;
+ goto out;
+ }
+ }
+ pg_idx = 0;
+ }
+
+ if (unlikely(!bio_add_page(bio, quick_list[pg_idx], cur_len, cur_offset))) {
+ atomic_inc(bio_count);
+ if (rw == READ)
+ bio_set_pages_dirty(bio);
+ submit_bio(rw, bio);
+ pos += addr - (unsigned long) buf;
+ goto start;
+ }
+
+ addr += cur_len;
+ count -= cur_len;
+ pg_idx++;
+ nr_pages--;
+ }
+
+ atomic_inc(bio_count);
+ if (rw == READ)
+ bio_set_pages_dirty(bio);
+ submit_bio(rw, bio);
+out:
+ blk_run_address_space(inode->i_mapping);
+ if (atomic_dec_and_test(bio_count)) {
+ aio_complete(iocb, iocb->ki_nbytes, 0);
+ }
+
+ return -EIOCBQUEUED;
+}
+
+ssize_t raw_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
+{
+ return raw_file_aio_rw(iocb, buf, count, pos, READ);
+}
+
+static ssize_t raw_file_aio_write(struct kiocb *iocb, const char __user *buf,
+ size_t count, loff_t pos)
+{
+ return raw_file_aio_rw(iocb, (char __user *) buf, count, pos, WRITE);
+}
static struct file_operations raw_fops = {
.read = raw_file_read,
- .aio_read = generic_file_aio_read,
+ .aio_read = raw_file_aio_read,
.write = raw_file_write,
.aio_write = raw_file_aio_write,
.open = raw_open,
이것과
OK, last one in the series: user level test programs that stress
the kernel I/O stack. Pretty dull stuff.
- Ken
diff -Nur zero/aio_null.c blknull_test/aio_null.c
--- zero/aio_null.c 1969-12-31 16:00:00.000000000 -0800
+++ blknull_test/aio_null.c 2005-03-08 00:46:17.000000000 -0800
@@ -0,0 +1,76 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <linux/ioctl.h>
+#include <libaio.h>
+
+#define MAXAIO 1024
+
+char buf[4096] __attribute__((aligned(4096)));
+
+io_context_t io_ctx;
+struct iocb iocbpool[MAXAIO];
+struct io_event ioevent[MAXAIO];
+
+void aio_setup(int n)
+{
+ int res = io_queue_init(n, &io_ctx);
+ if (res != 0) {
+ printf("io_queue_setup(%d) returned %d (%s)\n",
+ n, res, strerror(-res));
+ exit(0);
+ }
+}
+
+main(int argc, char* argv[])
+{
+ int fd, i, status, batch;
+ struct iocb* iocbbatch[MAXAIO];
+ int devidx;
+ off_t offset;
+ unsigned long start, end;
+
+ batch = argc < 2 ? 100: atoi(argv[1]);
+ if (batch >= MAXAIO)
+ batch = MAXAIO;
+
+ aio_setup(MAXAIO);
+ fd = open("/dev/raw/raw1", O_RDONLY);
+
+ if (fd == -1) {
+ perror("error opening\n");
+ exit (0);
+ }
+ for (i=0; i<batch; i++) {
+ iocbbatch[i] = iocbpool+i;
+ io_prep_pread(iocbbatch[i], fd, buf, 4096, 0);
+ }
+
+ while (1) {
+ struct timespec ts={30,0};
+ int bufidx;
+ int reap;
+
+ status = io_submit(io_ctx, i, iocbbatch);
+ if (status != i) {
+ printf("bad io_submit: %d [%s]\n", status,
+ strerror(-status));
+ }
+
+ // reap at least batch count back
+ reap = io_getevents(io_ctx, batch, MAXAIO, ioevent, &ts);
+ if (reap < batch) {
+ printf("io_getevents returned=%d [%s]\n", reap,
+ strerror(-reap));
+ }
+
+ // check the return result of each event
+ for (i=0; i<reap; i++)
+ if (ioevent[i].res != 4096)
+ printf("error in read: %lx\n", ioevent[i].res);
+ } /* while (1) */
+}
diff -Nur zero/pread_null.c blknull_test/pread_null.c
--- zero/pread_null.c 1969-12-31 16:00:00.000000000 -0800
+++ blknull_test/pread_null.c 2005-03-08 00:44:20.000000000 -0800
@@ -0,0 +1,27 @@
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <malloc.h>
+
+main(int argc, char* argv[])
+{
+ int fd;
+ char *addr;
+
+ fd = open("/dev/raw/raw1", O_RDONLY);
+ if (fd == -1) {
+ perror("error opening\n");
+ exit(0);
+ }
+
+ addr = memalign(4096, 4096);
+ if (addr == 0) {
+ printf("no memory\n");
+ exit(0);
+ }
+
+ while (1) {
+ pread(fd, addr, 4096, 0);
+ }
+
+}
diff -Nur zero/makefile blknull_test/makefile
--- zero/makefile 1969-12-31 16:00:00.000000000 -0800
+++ blknull_test/makefile 2005-03-08 17:10:39.000000000 -0800
@@ -0,0 +1,10 @@
+all: pread_null aio_null
+
+pread_null: pread_null.c
+ gcc -O3 -o $@ pread_null.c
+
+aio_null: aio_null.c
+ gcc -O3 -o $@ aio_null.c -laio
+
+clean:
+ rm -f pread_null aio_null
이렇게 세가지 입니다.
도움 주심에 미리 감사말씀 드립니다.
새해 복 많이 받으세요.
The pseudo disk driver that I used to stress the kernel I/O stack
(anything above block layer, AIO/DIO/BIO).
- Ken
diff -Nur zero/blknull.c blknull/blknull.c
--- zero/blknull.c 1969-12-31 16:00:00.000000000 -0800
+++ blknull/blknull.c 2005-03-03 19:04:07.000000000 -0800
@@ -0,0 +1,97 @@
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/fs.h>
+#include <linux/bio.h>
+#include <linux/blkpg.h>
+#include <linux/spinlock.h>
+
+#include <linux/blkdev.h>
+#include <linux/genhd.h>
+
+#define BLK_NULL_MAJOR 60
+#define BLK_NULL_NAME "blknull"
+
+
+MODULE_AUTHOR("Ken Chen");
+MODULE_DESCRIPTION("null block driver");
+MODULE_LICENSE("GPL");
+
+
+spinlock_t driver_lock;
+struct request_queue *q;
+struct gendisk *disk;
+
+
+static int null_open(struct inode *inode, struct file *filp)
+{
+ return 0;
+}
+
+static int null_release(struct inode *inode, struct file *filp)
+{
+ return 0;
+}
+
+static struct block_device_operations null_fops = {
+ .owner = THIS_MODULE,
+ .open = null_open,
+ .release = null_release,
+};
+
+static void do_null_request(request_queue_t *q)
+{
+ struct request *req;
+
+ while (!blk_queue_plugged(q)) {
+ req = elv_next_request(q);
+ if (!req)
+ break;
+
+ blkdev_dequeue_request(req);
+
+ end_that_request_first(req, 1, req->nr_sectors);
+ end_that_request_last(req);
+ }
+}
+
+static int __init init_blk_null_module(void)
+{
+
+ if (register_blkdev(BLK_NULL_MAJOR, BLK_NULL_NAME)) {
+ printk(KERN_ERR "Unable to register null blk device\n");
+ return 0;
+ }
+
+ spin_lock_init(&driver_lock);
+ q = blk_init_queue(do_null_request, &driver_lock);
+ if (q) {
+ disk = alloc_disk(1);
+
+ if (disk) {
+ disk->major = BLK_NULL_MAJOR;
+ disk->first_minor = 0;
+ disk->fops = &null_fops;
+ disk->capacity = 1<<30;
+ disk->queue = q;
+ memcpy(disk->disk_name, BLK_NULL_NAME, sizeof(BLK_NULL_NAME));
+ add_disk(disk);
+ return 1;
+ }
+
+ blk_cleanup_queue(q);
+ }
+ unregister_blkdev(BLK_NULL_MAJOR, BLK_NULL_NAME);
+ return 0;
+}
+
+static void __exit exit_blk_null_module(void)
+{
+ del_gendisk(disk);
+ blk_cleanup_queue(q);
+ unregister_blkdev(BLK_NULL_MAJOR, BLK_NULL_NAME);
+}
+
+module_init(init_blk_null_module);
+module_exit(exit_blk_null_module);
diff -Nur zero/Makefile blknull/Makefile
--- zero/Makefile 1969-12-31 16:00:00.000000000 -0800
+++ blknull/Makefile 2005-03-03 18:42:55.000000000 -0800
@@ -0,0 +1 @@
+obj-m := blknull.o
Forums:


다른것은 모르겠고,앞에 Ken이라는 이름을 보니..그냥 믿어도 될
다른것은 모르겠고,
앞에 Ken이라는 이름을 보니..
그냥 믿어도 될 것 같은데요?
ps) 이사람 cat으로 바이너리를 짠다던 그 사람 아니가요?
아니면 말구...
댓글 달기