Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs

Pull btrfs fixes from Chris Mason:
 "I've split out the big send/receive update from my last pull request
  and now have just the fixes in my for-linus branch.  The send/recv
  branch will wander over to linux-next shortly though.

  The largest patches in this pull are Josef's patches to fix DIO
  locking problems and his patch to fix a crash during balance.  They
  are both well tested.

  The rest are smaller fixes that we've had queued.  The last rc came
  out while I was hacking new and exciting ways to recover from a
  misplaced rm -rf on my dev box, so these missed rc3."

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (25 commits)
  Btrfs: fix that repair code is spuriously executed for transid failures
  Btrfs: fix ordered extent leak when failing to start a transaction
  Btrfs: fix a dio write regression
  Btrfs: fix deadlock with freeze and sync V2
  Btrfs: revert checksum error statistic which can cause a BUG()
  Btrfs: remove superblock writing after fatal error
  Btrfs: allow delayed refs to be merged
  Btrfs: fix enospc problems when deleting a subvol
  Btrfs: fix wrong mtime and ctime when creating snapshots
  Btrfs: fix race in run_clustered_refs
  Btrfs: don't run __tree_mod_log_free_eb on leaves
  Btrfs: increase the size of the free space cache
  Btrfs: barrier before waitqueue_active
  Btrfs: fix deadlock in wait_for_more_refs
  btrfs: fix second lock in btrfs_delete_delayed_items()
  Btrfs: don't allocate a seperate csums array for direct reads
  Btrfs: do not strdup non existent strings
  Btrfs: do not use missing devices when showing devname
  Btrfs: fix that error value is changed by mistake
  Btrfs: lock extents as we map them in DIO
  ...
This commit is contained in:
Linus Torvalds
2012-08-29 11:36:22 -07:00
21 changed files with 418 additions and 376 deletions

View File

@@ -1008,9 +1008,7 @@ static noinline void async_cow_submit(struct btrfs_work *work)
nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
PAGE_CACHE_SHIFT;
atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
if (atomic_read(&root->fs_info->async_delalloc_pages) <
if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
5 * 1024 * 1024 &&
waitqueue_active(&root->fs_info->async_submit_wait))
wake_up(&root->fs_info->async_submit_wait);
@@ -1885,8 +1883,11 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
trans = btrfs_join_transaction_nolock(root);
else
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
goto out;
}
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
ret = btrfs_update_inode_fallback(trans, root, inode);
if (ret) /* -ENOMEM or corruption */
@@ -3174,7 +3175,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
btrfs_i_size_write(dir, dir->i_size - name_len * 2);
inode_inc_iversion(dir);
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
ret = btrfs_update_inode(trans, root, dir);
ret = btrfs_update_inode_fallback(trans, root, dir);
if (ret)
btrfs_abort_transaction(trans, root, ret);
out:
@@ -5774,18 +5775,112 @@ out:
return ret;
}
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
struct extent_state **cached_state, int writing)
{
struct btrfs_ordered_extent *ordered;
int ret = 0;
while (1) {
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
0, cached_state);
/*
* We're concerned with the entire range that we're going to be
* doing DIO to, so we need to make sure theres no ordered
* extents in this range.
*/
ordered = btrfs_lookup_ordered_range(inode, lockstart,
lockend - lockstart + 1);
/*
* We need to make sure there are no buffered pages in this
* range either, we could have raced between the invalidate in
* generic_file_direct_write and locking the extent. The
* invalidate needs to happen so that reads after a write do not
* get stale data.
*/
if (!ordered && (!writing ||
!test_range_bit(&BTRFS_I(inode)->io_tree,
lockstart, lockend, EXTENT_UPTODATE, 0,
*cached_state)))
break;
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
cached_state, GFP_NOFS);
if (ordered) {
btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
} else {
/* Screw you mmap */
ret = filemap_write_and_wait_range(inode->i_mapping,
lockstart,
lockend);
if (ret)
break;
/*
* If we found a page that couldn't be invalidated just
* fall back to buffered.
*/
ret = invalidate_inode_pages2_range(inode->i_mapping,
lockstart >> PAGE_CACHE_SHIFT,
lockend >> PAGE_CACHE_SHIFT);
if (ret)
break;
}
cond_resched();
}
return ret;
}
static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
struct extent_map *em;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_state *cached_state = NULL;
u64 start = iblock << inode->i_blkbits;
u64 lockstart, lockend;
u64 len = bh_result->b_size;
struct btrfs_trans_handle *trans;
int unlock_bits = EXTENT_LOCKED;
int ret;
if (create) {
ret = btrfs_delalloc_reserve_space(inode, len);
if (ret)
return ret;
unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
} else {
len = min_t(u64, len, root->sectorsize);
}
lockstart = start;
lockend = start + len - 1;
/*
* If this errors out it's because we couldn't invalidate pagecache for
* this range and we need to fallback to buffered.
*/
if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
return -ENOTBLK;
if (create) {
ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
lockend, EXTENT_DELALLOC, NULL,
&cached_state, GFP_NOFS);
if (ret)
goto unlock_err;
}
em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
if (IS_ERR(em))
return PTR_ERR(em);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto unlock_err;
}
/*
* Ok for INLINE and COMPRESSED extents we need to fallback on buffered
@@ -5804,17 +5899,16 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
em->block_start == EXTENT_MAP_INLINE) {
free_extent_map(em);
return -ENOTBLK;
ret = -ENOTBLK;
goto unlock_err;
}
/* Just a good old fashioned hole, return */
if (!create && (em->block_start == EXTENT_MAP_HOLE ||
test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
free_extent_map(em);
/* DIO will do one hole at a time, so just unlock a sector */
unlock_extent(&BTRFS_I(inode)->io_tree, start,
start + root->sectorsize - 1);
return 0;
ret = 0;
goto unlock_err;
}
/*
@@ -5827,8 +5921,9 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
*
*/
if (!create) {
len = em->len - (start - em->start);
goto map;
len = min(len, em->len - (start - em->start));
lockstart = start + len;
goto unlock;
}
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
@@ -5860,7 +5955,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
btrfs_end_transaction(trans, root);
if (ret) {
free_extent_map(em);
return ret;
goto unlock_err;
}
goto unlock;
}
@@ -5873,14 +5968,12 @@ must_cow:
*/
len = bh_result->b_size;
em = btrfs_new_extent_direct(inode, em, start, len);
if (IS_ERR(em))
return PTR_ERR(em);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
goto unlock_err;
}
len = min(len, em->len - (start - em->start));
unlock:
clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
0, NULL, GFP_NOFS);
map:
bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
inode->i_blkbits;
bh_result->b_size = len;
@@ -5898,9 +5991,44 @@ map:
i_size_write(inode, start + len);
}
/*
* In the case of write we need to clear and unlock the entire range,
* in the case of read we need to unlock only the end area that we
* aren't using if there is any left over space.
*/
if (lockstart < lockend) {
if (create && len < lockend - lockstart) {
clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
lockstart + len - 1, unlock_bits, 1, 0,
&cached_state, GFP_NOFS);
/*
* Beside unlock, we also need to cleanup reserved space
* for the left range by attaching EXTENT_DO_ACCOUNTING.
*/
clear_extent_bit(&BTRFS_I(inode)->io_tree,
lockstart + len, lockend,
unlock_bits | EXTENT_DO_ACCOUNTING,
1, 0, NULL, GFP_NOFS);
} else {
clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
lockend, unlock_bits, 1, 0,
&cached_state, GFP_NOFS);
}
} else {
free_extent_state(cached_state);
}
free_extent_map(em);
return 0;
unlock_err:
if (create)
unlock_bits |= EXTENT_DO_ACCOUNTING;
clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
unlock_bits, 1, 0, &cached_state, GFP_NOFS);
return ret;
}
struct btrfs_dio_private {
@@ -5908,7 +6036,6 @@ struct btrfs_dio_private {
u64 logical_offset;
u64 disk_bytenr;
u64 bytes;
u32 *csums;
void *private;
/* number of bios pending for this dio */
@@ -5928,7 +6055,6 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
struct inode *inode = dip->inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 start;
u32 *private = dip->csums;
start = dip->logical_offset;
do {
@@ -5936,8 +6062,12 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
struct page *page = bvec->bv_page;
char *kaddr;
u32 csum = ~(u32)0;
u64 private = ~(u32)0;
unsigned long flags;
if (get_state_private(&BTRFS_I(inode)->io_tree,
start, &private))
goto failed;
local_irq_save(flags);
kaddr = kmap_atomic(page);
csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
@@ -5947,18 +6077,18 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
local_irq_restore(flags);
flush_dcache_page(bvec->bv_page);
if (csum != *private) {
if (csum != private) {
failed:
printk(KERN_ERR "btrfs csum failed ino %llu off"
" %llu csum %u private %u\n",
(unsigned long long)btrfs_ino(inode),
(unsigned long long)start,
csum, *private);
csum, (unsigned)private);
err = -EIO;
}
}
start += bvec->bv_len;
private++;
bvec++;
} while (bvec <= bvec_end);
@@ -5966,7 +6096,6 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
dip->logical_offset + dip->bytes - 1);
bio->bi_private = dip->private;
kfree(dip->csums);
kfree(dip);
/* If we had a csum failure make sure to clear the uptodate flag */
@@ -6072,7 +6201,7 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
int rw, u64 file_offset, int skip_sum,
u32 *csums, int async_submit)
int async_submit)
{
int write = rw & REQ_WRITE;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -6105,8 +6234,7 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
if (ret)
goto err;
} else if (!skip_sum) {
ret = btrfs_lookup_bio_sums_dio(root, inode, bio,
file_offset, csums);
ret = btrfs_lookup_bio_sums_dio(root, inode, bio, file_offset);
if (ret)
goto err;
}
@@ -6132,10 +6260,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
u64 submit_len = 0;
u64 map_length;
int nr_pages = 0;
u32 *csums = dip->csums;
int ret = 0;
int async_submit = 0;
int write = rw & REQ_WRITE;
map_length = orig_bio->bi_size;
ret = btrfs_map_block(map_tree, READ, start_sector << 9,
@@ -6171,16 +6297,13 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
atomic_inc(&dip->pending_bios);
ret = __btrfs_submit_dio_bio(bio, inode, rw,
file_offset, skip_sum,
csums, async_submit);
async_submit);
if (ret) {
bio_put(bio);
atomic_dec(&dip->pending_bios);
goto out_err;
}
/* Write's use the ordered csums */
if (!write && !skip_sum)
csums = csums + nr_pages;
start_sector += submit_len >> 9;
file_offset += submit_len;
@@ -6210,7 +6333,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
submit:
ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
csums, async_submit);
async_submit);
if (!ret)
return 0;
@@ -6246,17 +6369,6 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
ret = -ENOMEM;
goto free_ordered;
}
dip->csums = NULL;
/* Write's use the ordered csum stuff, so we don't need dip->csums */
if (!write && !skip_sum) {
dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
if (!dip->csums) {
kfree(dip);
ret = -ENOMEM;
goto free_ordered;
}
}
dip->private = bio->bi_private;
dip->inode = inode;
@@ -6341,132 +6453,22 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io
out:
return retval;
}
static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset,
unsigned long nr_segs)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
u64 lockstart, lockend;
ssize_t ret;
int writing = rw & WRITE;
int write_bits = 0;
size_t count = iov_length(iov, nr_segs);
if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
offset, nr_segs)) {
offset, nr_segs))
return 0;
}
lockstart = offset;
lockend = offset + count - 1;
if (writing) {
ret = btrfs_delalloc_reserve_space(inode, count);
if (ret)
goto out;
}
while (1) {
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
0, &cached_state);
/*
* We're concerned with the entire range that we're going to be
* doing DIO to, so we need to make sure theres no ordered
* extents in this range.
*/
ordered = btrfs_lookup_ordered_range(inode, lockstart,
lockend - lockstart + 1);
/*
* We need to make sure there are no buffered pages in this
* range either, we could have raced between the invalidate in
* generic_file_direct_write and locking the extent. The
* invalidate needs to happen so that reads after a write do not
* get stale data.
*/
if (!ordered && (!writing ||
!test_range_bit(&BTRFS_I(inode)->io_tree,
lockstart, lockend, EXTENT_UPTODATE, 0,
cached_state)))
break;
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state, GFP_NOFS);
if (ordered) {
btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
} else {
/* Screw you mmap */
ret = filemap_write_and_wait_range(file->f_mapping,
lockstart,
lockend);
if (ret)
goto out;
/*
* If we found a page that couldn't be invalidated just
* fall back to buffered.
*/
ret = invalidate_inode_pages2_range(file->f_mapping,
lockstart >> PAGE_CACHE_SHIFT,
lockend >> PAGE_CACHE_SHIFT);
if (ret) {
if (ret == -EBUSY)
ret = 0;
goto out;
}
}
cond_resched();
}
/*
* we don't use btrfs_set_extent_delalloc because we don't want
* the dirty or uptodate bits
*/
if (writing) {
write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
EXTENT_DELALLOC, NULL, &cached_state,
GFP_NOFS);
if (ret) {
clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
lockend, EXTENT_LOCKED | write_bits,
1, 0, &cached_state, GFP_NOFS);
goto out;
}
}
free_extent_state(cached_state);
cached_state = NULL;
ret = __blockdev_direct_IO(rw, iocb, inode,
return __blockdev_direct_IO(rw, iocb, inode,
BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
btrfs_submit_direct, 0);
if (ret < 0 && ret != -EIOCBQUEUED) {
clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
offset + iov_length(iov, nr_segs) - 1,
EXTENT_LOCKED | write_bits, 1, 0,
&cached_state, GFP_NOFS);
} else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
/*
* We're falling back to buffered, unlock the section we didn't
* do IO on.
*/
clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
offset + iov_length(iov, nr_segs) - 1,
EXTENT_LOCKED | write_bits, 1, 0,
&cached_state, GFP_NOFS);
}
out:
free_extent_state(cached_state);
return ret;
}
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,