Btrfs: do aio_write instead of write
In order for AIO to work, we need to implement aio_write. This patch converts our btrfs_file_write to btrfs_file_aio_write. I've tested this with xfstests and nothing broke, and the AIO stuff magically started working. Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
This commit is contained in:
@@ -2017,6 +2017,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
|
|||||||
sector_t sector;
|
sector_t sector;
|
||||||
struct extent_map *em;
|
struct extent_map *em;
|
||||||
struct block_device *bdev;
|
struct block_device *bdev;
|
||||||
|
struct btrfs_ordered_extent *ordered;
|
||||||
int ret;
|
int ret;
|
||||||
int nr = 0;
|
int nr = 0;
|
||||||
size_t page_offset = 0;
|
size_t page_offset = 0;
|
||||||
@@ -2028,7 +2029,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
|
|||||||
set_page_extent_mapped(page);
|
set_page_extent_mapped(page);
|
||||||
|
|
||||||
end = page_end;
|
end = page_end;
|
||||||
lock_extent(tree, start, end, GFP_NOFS);
|
while (1) {
|
||||||
|
lock_extent(tree, start, end, GFP_NOFS);
|
||||||
|
ordered = btrfs_lookup_ordered_extent(inode, start);
|
||||||
|
if (!ordered)
|
||||||
|
break;
|
||||||
|
unlock_extent(tree, start, end, GFP_NOFS);
|
||||||
|
btrfs_start_ordered_extent(inode, ordered, 1);
|
||||||
|
btrfs_put_ordered_extent(ordered);
|
||||||
|
}
|
||||||
|
|
||||||
if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
|
if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
|
||||||
char *userpage;
|
char *userpage;
|
||||||
|
178
fs/btrfs/file.c
178
fs/btrfs/file.c
@@ -46,32 +46,42 @@
|
|||||||
static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
|
static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
|
||||||
int write_bytes,
|
int write_bytes,
|
||||||
struct page **prepared_pages,
|
struct page **prepared_pages,
|
||||||
const char __user *buf)
|
struct iov_iter *i)
|
||||||
{
|
{
|
||||||
long page_fault = 0;
|
size_t copied;
|
||||||
int i;
|
int pg = 0;
|
||||||
int offset = pos & (PAGE_CACHE_SIZE - 1);
|
int offset = pos & (PAGE_CACHE_SIZE - 1);
|
||||||
|
|
||||||
for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
|
while (write_bytes > 0) {
|
||||||
size_t count = min_t(size_t,
|
size_t count = min_t(size_t,
|
||||||
PAGE_CACHE_SIZE - offset, write_bytes);
|
PAGE_CACHE_SIZE - offset, write_bytes);
|
||||||
struct page *page = prepared_pages[i];
|
struct page *page = prepared_pages[pg];
|
||||||
fault_in_pages_readable(buf, count);
|
again:
|
||||||
|
if (unlikely(iov_iter_fault_in_readable(i, count)))
|
||||||
|
return -EFAULT;
|
||||||
|
|
||||||
/* Copy data from userspace to the current page */
|
/* Copy data from userspace to the current page */
|
||||||
kmap(page);
|
copied = iov_iter_copy_from_user(page, i, offset, count);
|
||||||
page_fault = __copy_from_user(page_address(page) + offset,
|
|
||||||
buf, count);
|
|
||||||
/* Flush processor's dcache for this page */
|
/* Flush processor's dcache for this page */
|
||||||
flush_dcache_page(page);
|
flush_dcache_page(page);
|
||||||
kunmap(page);
|
iov_iter_advance(i, copied);
|
||||||
buf += count;
|
write_bytes -= copied;
|
||||||
write_bytes -= count;
|
|
||||||
|
|
||||||
if (page_fault)
|
if (unlikely(copied == 0)) {
|
||||||
break;
|
count = min_t(size_t, PAGE_CACHE_SIZE - offset,
|
||||||
|
iov_iter_single_seg_count(i));
|
||||||
|
goto again;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
|
||||||
|
offset += copied;
|
||||||
|
} else {
|
||||||
|
pg++;
|
||||||
|
offset = 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return page_fault ? -EFAULT : 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@@ -822,60 +832,24 @@ again:
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Copied from read-write.c */
|
static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
|
||||||
static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
|
const struct iovec *iov,
|
||||||
|
unsigned long nr_segs, loff_t pos)
|
||||||
{
|
{
|
||||||
set_current_state(TASK_UNINTERRUPTIBLE);
|
struct file *file = iocb->ki_filp;
|
||||||
if (!kiocbIsKicked(iocb))
|
struct inode *inode = fdentry(file)->d_inode;
|
||||||
schedule();
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
||||||
else
|
struct page *pinned[2];
|
||||||
kiocbClearKicked(iocb);
|
struct page **pages = NULL;
|
||||||
__set_current_state(TASK_RUNNING);
|
struct iov_iter i;
|
||||||
}
|
loff_t *ppos = &iocb->ki_pos;
|
||||||
|
|
||||||
/*
|
|
||||||
* Just a copy of what do_sync_write does.
|
|
||||||
*/
|
|
||||||
static ssize_t __btrfs_direct_write(struct file *file, const char __user *buf,
|
|
||||||
size_t count, loff_t pos, loff_t *ppos)
|
|
||||||
{
|
|
||||||
struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count };
|
|
||||||
unsigned long nr_segs = 1;
|
|
||||||
struct kiocb kiocb;
|
|
||||||
ssize_t ret;
|
|
||||||
|
|
||||||
init_sync_kiocb(&kiocb, file);
|
|
||||||
kiocb.ki_pos = pos;
|
|
||||||
kiocb.ki_left = count;
|
|
||||||
kiocb.ki_nbytes = count;
|
|
||||||
|
|
||||||
while (1) {
|
|
||||||
ret = generic_file_direct_write(&kiocb, &iov, &nr_segs, pos,
|
|
||||||
ppos, count, count);
|
|
||||||
if (ret != -EIOCBRETRY)
|
|
||||||
break;
|
|
||||||
wait_on_retry_sync_kiocb(&kiocb);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ret == -EIOCBQUEUED)
|
|
||||||
ret = wait_on_sync_kiocb(&kiocb);
|
|
||||||
*ppos = kiocb.ki_pos;
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
|
|
||||||
size_t count, loff_t *ppos)
|
|
||||||
{
|
|
||||||
loff_t pos;
|
|
||||||
loff_t start_pos;
|
loff_t start_pos;
|
||||||
ssize_t num_written = 0;
|
ssize_t num_written = 0;
|
||||||
ssize_t err = 0;
|
ssize_t err = 0;
|
||||||
|
size_t count;
|
||||||
|
size_t ocount;
|
||||||
int ret = 0;
|
int ret = 0;
|
||||||
struct inode *inode = fdentry(file)->d_inode;
|
|
||||||
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
||||||
struct page **pages = NULL;
|
|
||||||
int nrptrs;
|
int nrptrs;
|
||||||
struct page *pinned[2];
|
|
||||||
unsigned long first_index;
|
unsigned long first_index;
|
||||||
unsigned long last_index;
|
unsigned long last_index;
|
||||||
int will_write;
|
int will_write;
|
||||||
@@ -887,13 +861,17 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
|
|||||||
pinned[0] = NULL;
|
pinned[0] = NULL;
|
||||||
pinned[1] = NULL;
|
pinned[1] = NULL;
|
||||||
|
|
||||||
pos = *ppos;
|
|
||||||
start_pos = pos;
|
start_pos = pos;
|
||||||
|
|
||||||
vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
|
vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
|
||||||
|
|
||||||
mutex_lock(&inode->i_mutex);
|
mutex_lock(&inode->i_mutex);
|
||||||
|
|
||||||
|
err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
|
||||||
|
if (err)
|
||||||
|
goto out;
|
||||||
|
count = ocount;
|
||||||
|
|
||||||
current->backing_dev_info = inode->i_mapping->backing_dev_info;
|
current->backing_dev_info = inode->i_mapping->backing_dev_info;
|
||||||
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
|
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
|
||||||
if (err)
|
if (err)
|
||||||
@@ -910,33 +888,69 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
|
|||||||
BTRFS_I(inode)->sequence++;
|
BTRFS_I(inode)->sequence++;
|
||||||
|
|
||||||
if (unlikely(file->f_flags & O_DIRECT)) {
|
if (unlikely(file->f_flags & O_DIRECT)) {
|
||||||
num_written = __btrfs_direct_write(file, buf, count, pos,
|
ret = btrfs_delalloc_reserve_space(inode, count);
|
||||||
ppos);
|
if (ret)
|
||||||
pos += num_written;
|
|
||||||
count -= num_written;
|
|
||||||
|
|
||||||
/* We've written everything we wanted to, exit */
|
|
||||||
if (num_written < 0 || !count)
|
|
||||||
goto out;
|
goto out;
|
||||||
|
|
||||||
|
num_written = generic_file_direct_write(iocb, iov, &nr_segs,
|
||||||
|
pos, ppos, count,
|
||||||
|
ocount);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* the generic O_DIRECT will update in-memory i_size after the
|
||||||
|
* DIOs are done. But our endio handlers that update the on
|
||||||
|
* disk i_size never update past the in memory i_size. So we
|
||||||
|
* need one more update here to catch any additions to the
|
||||||
|
* file
|
||||||
|
*/
|
||||||
|
if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
|
||||||
|
btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
|
||||||
|
mark_inode_dirty(inode);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (num_written < 0) {
|
||||||
|
if (num_written != -EIOCBQUEUED) {
|
||||||
|
/*
|
||||||
|
* aio land will take care of releasing the
|
||||||
|
* delalloc
|
||||||
|
*/
|
||||||
|
btrfs_delalloc_release_space(inode, count);
|
||||||
|
}
|
||||||
|
ret = num_written;
|
||||||
|
num_written = 0;
|
||||||
|
goto out;
|
||||||
|
} else if (num_written == count) {
|
||||||
|
/* pick up pos changes done by the generic code */
|
||||||
|
pos = *ppos;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* the buffered IO will reserve bytes for the rest of the
|
||||||
|
* range, don't double count them here
|
||||||
|
*/
|
||||||
|
btrfs_delalloc_release_space(inode, count - num_written);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We are going to do buffered for the rest of the range, so we
|
* We are going to do buffered for the rest of the range, so we
|
||||||
* need to make sure to invalidate the buffered pages when we're
|
* need to make sure to invalidate the buffered pages when we're
|
||||||
* done.
|
* done.
|
||||||
*/
|
*/
|
||||||
buffered = 1;
|
buffered = 1;
|
||||||
buf += num_written;
|
pos += num_written;
|
||||||
}
|
}
|
||||||
|
|
||||||
nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
|
iov_iter_init(&i, iov, nr_segs, count, num_written);
|
||||||
PAGE_CACHE_SIZE / (sizeof(struct page *)));
|
nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
|
||||||
|
PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
|
||||||
|
(sizeof(struct page *)));
|
||||||
pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
|
pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
|
||||||
|
|
||||||
/* generic_write_checks can change our pos */
|
/* generic_write_checks can change our pos */
|
||||||
start_pos = pos;
|
start_pos = pos;
|
||||||
|
|
||||||
first_index = pos >> PAGE_CACHE_SHIFT;
|
first_index = pos >> PAGE_CACHE_SHIFT;
|
||||||
last_index = (pos + count) >> PAGE_CACHE_SHIFT;
|
last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* there are lots of better ways to do this, but this code
|
* there are lots of better ways to do this, but this code
|
||||||
@@ -953,7 +967,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
|
|||||||
unlock_page(pinned[0]);
|
unlock_page(pinned[0]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
|
if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
|
||||||
pinned[1] = grab_cache_page(inode->i_mapping, last_index);
|
pinned[1] = grab_cache_page(inode->i_mapping, last_index);
|
||||||
if (!PageUptodate(pinned[1])) {
|
if (!PageUptodate(pinned[1])) {
|
||||||
ret = btrfs_readpage(NULL, pinned[1]);
|
ret = btrfs_readpage(NULL, pinned[1]);
|
||||||
@@ -964,10 +978,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
while (count > 0) {
|
while (iov_iter_count(&i) > 0) {
|
||||||
size_t offset = pos & (PAGE_CACHE_SIZE - 1);
|
size_t offset = pos & (PAGE_CACHE_SIZE - 1);
|
||||||
size_t write_bytes = min(count, nrptrs *
|
size_t write_bytes = min(iov_iter_count(&i),
|
||||||
(size_t)PAGE_CACHE_SIZE -
|
nrptrs * (size_t)PAGE_CACHE_SIZE -
|
||||||
offset);
|
offset);
|
||||||
size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
|
size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
|
||||||
PAGE_CACHE_SHIFT;
|
PAGE_CACHE_SHIFT;
|
||||||
@@ -988,7 +1002,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
|
|||||||
}
|
}
|
||||||
|
|
||||||
ret = btrfs_copy_from_user(pos, num_pages,
|
ret = btrfs_copy_from_user(pos, num_pages,
|
||||||
write_bytes, pages, buf);
|
write_bytes, pages, &i);
|
||||||
if (ret == 0) {
|
if (ret == 0) {
|
||||||
dirty_and_release_pages(NULL, root, file, pages,
|
dirty_and_release_pages(NULL, root, file, pages,
|
||||||
num_pages, pos, write_bytes);
|
num_pages, pos, write_bytes);
|
||||||
@@ -1012,8 +1026,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
|
|||||||
btrfs_throttle(root);
|
btrfs_throttle(root);
|
||||||
}
|
}
|
||||||
|
|
||||||
buf += write_bytes;
|
|
||||||
count -= write_bytes;
|
|
||||||
pos += write_bytes;
|
pos += write_bytes;
|
||||||
num_written += write_bytes;
|
num_written += write_bytes;
|
||||||
|
|
||||||
@@ -1206,7 +1218,7 @@ const struct file_operations btrfs_file_operations = {
|
|||||||
.read = do_sync_read,
|
.read = do_sync_read,
|
||||||
.aio_read = generic_file_aio_read,
|
.aio_read = generic_file_aio_read,
|
||||||
.splice_read = generic_file_splice_read,
|
.splice_read = generic_file_splice_read,
|
||||||
.write = btrfs_file_write,
|
.aio_write = btrfs_file_aio_write,
|
||||||
.mmap = btrfs_file_mmap,
|
.mmap = btrfs_file_mmap,
|
||||||
.open = generic_file_open,
|
.open = generic_file_open,
|
||||||
.release = btrfs_release_file,
|
.release = btrfs_release_file,
|
||||||
|
Reference in New Issue
Block a user