diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index e25564bfcb46..54a201dac7f9 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -276,9 +276,8 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, } if (ret > 0) goto next; - ret = ulist_add_merge(parents, eb->start, - (uintptr_t)eie, - (u64 *)&old, GFP_NOFS); + ret = ulist_add_merge_ptr(parents, eb->start, + eie, (void **)&old, GFP_NOFS); if (ret < 0) break; if (!ret && extent_item_pos) { @@ -1001,16 +1000,19 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, ret = -EIO; goto out; } + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); ret = find_extent_in_eb(eb, bytenr, *extent_item_pos, &eie); + btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); if (ret < 0) goto out; ref->inode_list = eie; } - ret = ulist_add_merge(refs, ref->parent, - (uintptr_t)ref->inode_list, - (u64 *)&eie, GFP_NOFS); + ret = ulist_add_merge_ptr(refs, ref->parent, + ref->inode_list, + (void **)&eie, GFP_NOFS); if (ret < 0) goto out; if (!ret && extent_item_pos) { diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 4794923c410c..43527fd78825 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -84,12 +84,6 @@ struct btrfs_inode { */ struct list_head delalloc_inodes; - /* - * list for tracking inodes that must be sent to disk before a - * rename or truncate commit - */ - struct list_head ordered_operations; - /* node for the red-black tree that links inodes in subvolume root */ struct rb_node rb_node; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index aeab453b8e24..44ee5d2e52a4 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -280,9 +280,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, WARN_ON(btrfs_header_generation(buf) > trans->transid); if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1, 1); + ret = btrfs_inc_ref(trans, root, cow, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0, 1); + ret = btrfs_inc_ref(trans, root, cow, 0); if (ret) return ret; @@ -1035,14 +1035,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if ((owner == root->root_key.objectid || root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { - ret = btrfs_inc_ref(trans, root, buf, 1, 1); + ret = btrfs_inc_ref(trans, root, buf, 1); BUG_ON(ret); /* -ENOMEM */ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_dec_ref(trans, root, buf, 0, 1); + ret = btrfs_dec_ref(trans, root, buf, 0); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_inc_ref(trans, root, cow, 1, 1); + ret = btrfs_inc_ref(trans, root, cow, 1); BUG_ON(ret); /* -ENOMEM */ } new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; @@ -1050,9 +1050,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1, 1); + ret = btrfs_inc_ref(trans, root, cow, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0, 1); + ret = btrfs_inc_ref(trans, root, cow, 0); BUG_ON(ret); /* -ENOMEM */ } if (new_flags != 0) { @@ -1069,11 +1069,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1, 1); + ret = btrfs_inc_ref(trans, root, cow, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0, 1); + ret = btrfs_inc_ref(trans, root, cow, 0); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_dec_ref(trans, root, buf, 1, 1); + ret = btrfs_dec_ref(trans, root, buf, 1); BUG_ON(ret); /* -ENOMEM */ } clean_tree_block(trans, root, buf); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index be91397f4e92..8e29b614fe93 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3326,9 +3326,9 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data, int delalloc); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int no_quota); + struct extent_buffer *buf, int full_backref); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int no_quota); + struct extent_buffer *buf, int full_backref); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 flags, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 08e65e9cf2aa..d0ed9e664f7d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -60,8 +60,6 @@ static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only); -static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, - struct btrfs_root *root); static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_root *root); @@ -3829,34 +3827,6 @@ static void btrfs_error_commit_super(struct btrfs_root *root) btrfs_cleanup_transaction(root); } -static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, - struct btrfs_root *root) -{ - struct btrfs_inode *btrfs_inode; - struct list_head splice; - - INIT_LIST_HEAD(&splice); - - mutex_lock(&root->fs_info->ordered_operations_mutex); - spin_lock(&root->fs_info->ordered_root_lock); - - list_splice_init(&t->ordered_operations, &splice); - while (!list_empty(&splice)) { - btrfs_inode = list_entry(splice.next, struct btrfs_inode, - ordered_operations); - - list_del_init(&btrfs_inode->ordered_operations); - spin_unlock(&root->fs_info->ordered_root_lock); - - btrfs_invalidate_inodes(btrfs_inode->root); - - spin_lock(&root->fs_info->ordered_root_lock); - } - - spin_unlock(&root->fs_info->ordered_root_lock); - mutex_unlock(&root->fs_info->ordered_operations_mutex); -} - static void btrfs_destroy_ordered_extents(struct btrfs_root *root) { struct btrfs_ordered_extent *ordered; @@ -4093,8 +4063,6 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root, void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, struct btrfs_root *root) { - btrfs_destroy_ordered_operations(cur_trans, root); - btrfs_destroy_delayed_refs(cur_trans, root); cur_trans->state = TRANS_STATE_COMMIT_START; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 813537f362f9..102ed3143976 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3057,7 +3057,7 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - int full_backref, int inc, int no_quota) + int full_backref, int inc) { u64 bytenr; u64 num_bytes; @@ -3111,7 +3111,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(buf, fi); ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, key.objectid, - key.offset, no_quota); + key.offset, 1); if (ret) goto fail; } else { @@ -3119,7 +3119,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, num_bytes = btrfs_level_size(root, level - 1); ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, level - 1, 0, - no_quota); + 1); if (ret) goto fail; } @@ -3130,15 +3130,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, } int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int no_quota) + struct extent_buffer *buf, int full_backref) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota); + return __btrfs_mod_ref(trans, root, buf, full_backref, 1); } int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int no_quota) + struct extent_buffer *buf, int full_backref) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota); + return __btrfs_mod_ref(trans, root, buf, full_backref, 0); } static int write_one_cache_group(struct btrfs_trans_handle *trans, @@ -7478,6 +7478,220 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, wc->reada_slot = slot; } +static int account_leaf_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *eb) +{ + int nr = btrfs_header_nritems(eb); + int i, extent_type, ret; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + u64 bytenr, num_bytes; + + for (i = 0; i < nr; i++) { + btrfs_item_key_to_cpu(eb, &key, i); + + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + + fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); + /* filter out non qgroup-accountable extents */ + extent_type = btrfs_file_extent_type(eb, fi); + + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + continue; + + bytenr = btrfs_file_extent_disk_bytenr(eb, fi); + if (!bytenr) + continue; + + num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); + + ret = btrfs_qgroup_record_ref(trans, root->fs_info, + root->objectid, + bytenr, num_bytes, + BTRFS_QGROUP_OPER_SUB_SUBTREE, 0); + if (ret) + return ret; + } + return 0; +} + +/* + * Walk up the tree from the bottom, freeing leaves and any interior + * nodes which have had all slots visited. If a node (leaf or + * interior) is freed, the node above it will have it's slot + * incremented. The root node will never be freed. + * + * At the end of this function, we should have a path which has all + * slots incremented to the next position for a search. If we need to + * read a new node it will be NULL and the node above it will have the + * correct slot selected for a later read. + * + * If we increment the root nodes slot counter past the number of + * elements, 1 is returned to signal completion of the search. + */ +static int adjust_slots_upwards(struct btrfs_root *root, + struct btrfs_path *path, int root_level) +{ + int level = 0; + int nr, slot; + struct extent_buffer *eb; + + if (root_level == 0) + return 1; + + while (level <= root_level) { + eb = path->nodes[level]; + nr = btrfs_header_nritems(eb); + path->slots[level]++; + slot = path->slots[level]; + if (slot >= nr || level == 0) { + /* + * Don't free the root - we will detect this + * condition after our loop and return a + * positive value for caller to stop walking the tree. + */ + if (level != root_level) { + btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; + + free_extent_buffer(eb); + path->nodes[level] = NULL; + path->slots[level] = 0; + } + } else { + /* + * We have a valid slot to walk back down + * from. Stop here so caller can process these + * new nodes. + */ + break; + } + + level++; + } + + eb = path->nodes[root_level]; + if (path->slots[root_level] >= btrfs_header_nritems(eb)) + return 1; + + return 0; +} + +/* + * root_eb is the subtree root and is locked before this function is called. + */ +static int account_shared_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *root_eb, + u64 root_gen, + int root_level) +{ + int ret = 0; + int level; + struct extent_buffer *eb = root_eb; + struct btrfs_path *path = NULL; + + BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL); + BUG_ON(root_eb == NULL); + + if (!root->fs_info->quota_enabled) + return 0; + + if (!extent_buffer_uptodate(root_eb)) { + ret = btrfs_read_buffer(root_eb, root_gen); + if (ret) + goto out; + } + + if (root_level == 0) { + ret = account_leaf_items(trans, root, root_eb); + goto out; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * Walk down the tree. Missing extent blocks are filled in as + * we go. Metadata is accounted every time we read a new + * extent block. + * + * When we reach a leaf, we account for file extent items in it, + * walk back up the tree (adjusting slot pointers as we go) + * and restart the search process. + */ + extent_buffer_get(root_eb); /* For path */ + path->nodes[root_level] = root_eb; + path->slots[root_level] = 0; + path->locks[root_level] = 0; /* so release_path doesn't try to unlock */ +walk_down: + level = root_level; + while (level >= 0) { + if (path->nodes[level] == NULL) { + int child_bsize = root->nodesize; + int parent_slot; + u64 child_gen; + u64 child_bytenr; + + /* We need to get child blockptr/gen from + * parent before we can read it. */ + eb = path->nodes[level + 1]; + parent_slot = path->slots[level + 1]; + child_bytenr = btrfs_node_blockptr(eb, parent_slot); + child_gen = btrfs_node_ptr_generation(eb, parent_slot); + + eb = read_tree_block(root, child_bytenr, child_bsize, + child_gen); + if (!eb || !extent_buffer_uptodate(eb)) { + ret = -EIO; + goto out; + } + + path->nodes[level] = eb; + path->slots[level] = 0; + + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + path->locks[level] = BTRFS_READ_LOCK_BLOCKING; + + ret = btrfs_qgroup_record_ref(trans, root->fs_info, + root->objectid, + child_bytenr, + child_bsize, + BTRFS_QGROUP_OPER_SUB_SUBTREE, + 0); + if (ret) + goto out; + + } + + if (level == 0) { + ret = account_leaf_items(trans, root, path->nodes[level]); + if (ret) + goto out; + + /* Nonzero return here means we completed our search */ + ret = adjust_slots_upwards(root, path, root_level); + if (ret) + break; + + /* Restart search with new slots */ + goto walk_down; + } + + level--; + } + + ret = 0; +out: + btrfs_free_path(path); + + return ret; +} + /* * helper to process tree block while walking down the tree. * @@ -7532,9 +7746,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, /* wc->stage == UPDATE_BACKREF */ if (!(wc->flags[level] & flag)) { BUG_ON(!path->locks[level]); - ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc); + ret = btrfs_inc_ref(trans, root, eb, 1); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); + ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ ret = btrfs_set_disk_extent_flags(trans, root, eb->start, eb->len, flag, @@ -7581,6 +7795,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, int level = wc->level; int reada = 0; int ret = 0; + bool need_account = false; generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]); @@ -7626,6 +7841,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (wc->stage == DROP_REFERENCE) { if (wc->refs[level - 1] > 1) { + need_account = true; if (level == 1 && (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) goto skip; @@ -7689,6 +7905,16 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, parent = 0; } + if (need_account) { + ret = account_shared_subtree(trans, root, next, + generation, level - 1); + if (ret) { + printk_ratelimited(KERN_ERR "BTRFS: %s Error " + "%d accounting shared subtree. Quota " + "is out of sync, rescan required.\n", + root->fs_info->sb->s_id, ret); + } + } ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, root->root_key.objectid, level - 1, 0, 0); BUG_ON(ret); /* -ENOMEM */ @@ -7769,12 +7995,17 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (wc->refs[level] == 1) { if (level == 0) { if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) - ret = btrfs_dec_ref(trans, root, eb, 1, - wc->for_reloc); + ret = btrfs_dec_ref(trans, root, eb, 1); else - ret = btrfs_dec_ref(trans, root, eb, 0, - wc->for_reloc); + ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ + ret = account_leaf_items(trans, root, eb); + if (ret) { + printk_ratelimited(KERN_ERR "BTRFS: %s Error " + "%d accounting leaf items. Quota " + "is out of sync, rescan required.\n", + root->fs_info->sb->s_id, ret); + } } /* make block locked assertion in clean_tree_block happy */ if (!path->locks[level] && @@ -7900,6 +8131,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int level; bool root_dropped = false; + btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid); + path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; @@ -8025,6 +8258,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root, goto out_end_trans; } + /* + * Qgroup update accounting is run from + * delayed ref handling. This usually works + * out because delayed refs are normally the + * only way qgroup updates are added. However, + * we may have added updates during our tree + * walk so run qgroups here to make sure we + * don't lose any updates. + */ + ret = btrfs_delayed_qgroup_accounting(trans, + root->fs_info); + if (ret) + printk_ratelimited(KERN_ERR "BTRFS: Failure %d " + "running qgroup updates " + "during snapshot delete. " + "Quota is out of sync, " + "rescan required.\n", ret); + btrfs_end_transaction_throttle(trans, tree_root); if (!for_reloc && btrfs_need_cleaner_sleep(root)) { pr_debug("BTRFS: drop snapshot early exit\n"); @@ -8078,6 +8329,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } root_dropped = true; out_end_trans: + ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info); + if (ret) + printk_ratelimited(KERN_ERR "BTRFS: Failure %d " + "running qgroup updates " + "during snapshot delete. " + "Quota is out of sync, " + "rescan required.\n", ret); + btrfs_end_transaction_throttle(trans, tree_root); out_free: kfree(wc); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index f46cfe45d686..54c84daec9b5 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -756,7 +756,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, found_next = 1; if (ret != 0) goto insert; - slot = 0; + slot = path->slots[0]; } btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1f2b99cb55ea..d3afac292d67 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1838,33 +1838,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, int btrfs_release_file(struct inode *inode, struct file *filp) { - /* - * ordered_data_close is set by settattr when we are about to truncate - * a file from a non-zero size to a zero size. This tries to - * flush down new bytes that may have been written if the - * application were using truncate to replace a file in place. - */ - if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, - &BTRFS_I(inode)->runtime_flags)) { - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(inode)->root; - - /* - * We need to block on a committing transaction to keep us from - * throwing a ordered operation on to the list and causing - * something like sync to deadlock trying to flush out this - * inode. - */ - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); - btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode); - btrfs_end_transaction(trans, root); - if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) - filemap_flush(inode->i_mapping); - } if (filp->private_data) btrfs_ioctl_trans_end(filp); + filemap_flush(inode->i_mapping); return 0; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3183742d6f0d..03708ef3deef 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -709,6 +709,18 @@ static noinline int submit_compressed_extents(struct inode *inode, unlock_extent(io_tree, async_extent->start, async_extent->start + async_extent->ram_size - 1); + + /* + * we need to redirty the pages if we decide to + * fallback to uncompressed IO, otherwise we + * will not submit these pages down to lower + * layers. + */ + extent_range_redirty_for_io(inode, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1); + goto retry; } goto out_free; @@ -7938,27 +7950,6 @@ static int btrfs_truncate(struct inode *inode) min_size); BUG_ON(ret); - /* - * setattr is responsible for setting the ordered_data_close flag, - * but that is only tested during the last file release. That - * could happen well after the next commit, leaving a great big - * window where new writes may get lost if someone chooses to write - * to this file after truncating to zero - * - * The inode doesn't have any dirty data here, and so if we commit - * this is a noop. If someone immediately starts writing to the inode - * it is very likely we'll catch some of their writes in this - * transaction, and the commit will find this file on the ordered - * data list with good things to send down. - * - * This is a best effort solution, there is still a window where - * using truncate to replace the contents of the file will - * end up with a zero length file after a crash. - */ - if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, - &BTRFS_I(inode)->runtime_flags)) - btrfs_add_ordered_operation(trans, root, inode); - /* * So if we truncate and then write and fsync we normally would just * write the extents that changed, which is a problem if we need to @@ -8106,7 +8097,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) mutex_init(&ei->delalloc_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->delalloc_inodes); - INIT_LIST_HEAD(&ei->ordered_operations); RB_CLEAR_NODE(&ei->rb_node); return inode; @@ -8146,17 +8136,6 @@ void btrfs_destroy_inode(struct inode *inode) if (!root) goto free; - /* - * Make sure we're properly removed from the ordered operation - * lists. - */ - smp_mb(); - if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { - spin_lock(&root->fs_info->ordered_root_lock); - list_del_init(&BTRFS_I(inode)->ordered_operations); - spin_unlock(&root->fs_info->ordered_root_lock); - } - if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &BTRFS_I(inode)->runtime_flags)) { btrfs_info(root->fs_info, "inode %llu still on the orphan list", @@ -8338,12 +8317,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, ret = 0; /* - * we're using rename to replace one file with another. - * and the replacement file is large. Start IO on it now so - * we don't add too much work to the end of the transaction + * we're using rename to replace one file with another. Start IO on it + * now so we don't add too much work to the end of the transaction */ - if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size && - old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) + if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size) filemap_flush(old_inode->i_mapping); /* close the racy window with snapshot create/destroy ioctl */ @@ -8391,12 +8368,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, */ btrfs_pin_log_trans(root); } - /* - * make sure the inode gets flushed if it is replacing - * something. - */ - if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode)) - btrfs_add_ordered_operation(trans, root, old_inode); inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 7187b14faa6c..963895c1f801 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -571,18 +571,6 @@ void btrfs_remove_ordered_extent(struct inode *inode, trace_btrfs_ordered_extent_remove(inode, entry); - /* - * we have no more ordered extents for this inode and - * no dirty pages. We can safely remove it from the - * list of ordered extents - */ - if (RB_EMPTY_ROOT(&tree->tree) && - !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { - spin_lock(&root->fs_info->ordered_root_lock); - list_del_init(&BTRFS_I(inode)->ordered_operations); - spin_unlock(&root->fs_info->ordered_root_lock); - } - if (!root->nr_ordered_extents) { spin_lock(&root->fs_info->ordered_root_lock); BUG_ON(list_empty(&root->ordered_root)); @@ -686,81 +674,6 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) mutex_unlock(&fs_info->ordered_operations_mutex); } -/* - * this is used during transaction commit to write all the inodes - * added to the ordered operation list. These files must be fully on - * disk before the transaction commits. - * - * we have two modes here, one is to just start the IO via filemap_flush - * and the other is to wait for all the io. When we wait, we have an - * extra check to make sure the ordered operation list really is empty - * before we return - */ -int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int wait) -{ - struct btrfs_inode *btrfs_inode; - struct inode *inode; - struct btrfs_transaction *cur_trans = trans->transaction; - struct list_head splice; - struct list_head works; - struct btrfs_delalloc_work *work, *next; - int ret = 0; - - INIT_LIST_HEAD(&splice); - INIT_LIST_HEAD(&works); - - mutex_lock(&root->fs_info->ordered_extent_flush_mutex); - spin_lock(&root->fs_info->ordered_root_lock); - list_splice_init(&cur_trans->ordered_operations, &splice); - while (!list_empty(&splice)) { - btrfs_inode = list_entry(splice.next, struct btrfs_inode, - ordered_operations); - inode = &btrfs_inode->vfs_inode; - - list_del_init(&btrfs_inode->ordered_operations); - - /* - * the inode may be getting freed (in sys_unlink path). - */ - inode = igrab(inode); - if (!inode) - continue; - - if (!wait) - list_add_tail(&BTRFS_I(inode)->ordered_operations, - &cur_trans->ordered_operations); - spin_unlock(&root->fs_info->ordered_root_lock); - - work = btrfs_alloc_delalloc_work(inode, wait, 1); - if (!work) { - spin_lock(&root->fs_info->ordered_root_lock); - if (list_empty(&BTRFS_I(inode)->ordered_operations)) - list_add_tail(&btrfs_inode->ordered_operations, - &splice); - list_splice_tail(&splice, - &cur_trans->ordered_operations); - spin_unlock(&root->fs_info->ordered_root_lock); - ret = -ENOMEM; - goto out; - } - list_add_tail(&work->list, &works); - btrfs_queue_work(root->fs_info->flush_workers, - &work->work); - - cond_resched(); - spin_lock(&root->fs_info->ordered_root_lock); - } - spin_unlock(&root->fs_info->ordered_root_lock); -out: - list_for_each_entry_safe(work, next, &works, list) { - list_del_init(&work->list); - btrfs_wait_and_free_delalloc_work(work); - } - mutex_unlock(&root->fs_info->ordered_extent_flush_mutex); - return ret; -} - /* * Used to start IO or wait for a given ordered extent to finish. * @@ -1120,42 +1033,6 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, return index; } - -/* - * add a given inode to the list of inodes that must be fully on - * disk before a transaction commit finishes. - * - * This basically gives us the ext3 style data=ordered mode, and it is mostly - * used to make sure renamed files are fully on disk. - * - * It is a noop if the inode is already fully on disk. - * - * If trans is not null, we'll do a friendly check for a transaction that - * is already flushing things and force the IO down ourselves. - */ -void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode) -{ - struct btrfs_transaction *cur_trans = trans->transaction; - u64 last_mod; - - last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); - - /* - * if this file hasn't been changed since the last transaction - * commit, we can safely return without doing anything - */ - if (last_mod <= root->fs_info->last_trans_committed) - return; - - spin_lock(&root->fs_info->ordered_root_lock); - if (list_empty(&BTRFS_I(inode)->ordered_operations)) { - list_add_tail(&BTRFS_I(inode)->ordered_operations, - &cur_trans->ordered_operations); - } - spin_unlock(&root->fs_info->ordered_root_lock); -} - int __init ordered_data_init(void) { btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 246897058efb..d81a274d621e 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -190,11 +190,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum, int len); -int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int wait); -void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode); int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); void btrfs_get_logged_extents(struct inode *inode, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 98cb6b2630f9..b497498484be 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1201,6 +1201,50 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } + +static int comp_oper_exist(struct btrfs_qgroup_operation *oper1, + struct btrfs_qgroup_operation *oper2) +{ + /* + * Ignore seq and type here, we're looking for any operation + * at all related to this extent on that root. + */ + if (oper1->bytenr < oper2->bytenr) + return -1; + if (oper1->bytenr > oper2->bytenr) + return 1; + if (oper1->ref_root < oper2->ref_root) + return -1; + if (oper1->ref_root > oper2->ref_root) + return 1; + return 0; +} + +static int qgroup_oper_exists(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct rb_node *n; + struct btrfs_qgroup_operation *cur; + int cmp; + + spin_lock(&fs_info->qgroup_op_lock); + n = fs_info->qgroup_op_tree.rb_node; + while (n) { + cur = rb_entry(n, struct btrfs_qgroup_operation, n); + cmp = comp_oper_exist(cur, oper); + if (cmp < 0) { + n = n->rb_right; + } else if (cmp) { + n = n->rb_left; + } else { + spin_unlock(&fs_info->qgroup_op_lock); + return -EEXIST; + } + } + spin_unlock(&fs_info->qgroup_op_lock); + return 0; +} + static int comp_oper(struct btrfs_qgroup_operation *oper1, struct btrfs_qgroup_operation *oper2) { @@ -1290,6 +1334,23 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq); INIT_LIST_HEAD(&oper->elem.list); oper->elem.seq = 0; + + if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) { + /* + * If any operation for this bytenr/ref_root combo + * exists, then we know it's not exclusively owned and + * shouldn't be queued up. + * + * This also catches the case where we have a cloned + * extent that gets queued up multiple times during + * drop snapshot. + */ + if (qgroup_oper_exists(fs_info, oper)) { + kfree(oper); + return 0; + } + } + ret = insert_qgroup_oper(fs_info, oper); if (ret) { /* Shouldn't happen so have an assert for developers */ @@ -1883,6 +1944,111 @@ static int qgroup_shared_accounting(struct btrfs_trans_handle *trans, return ret; } +/* + * Process a reference to a shared subtree. This type of operation is + * queued during snapshot removal when we encounter extents which are + * shared between more than one root. + */ +static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct ulist *roots = NULL; + struct ulist_node *unode; + struct ulist_iterator uiter; + struct btrfs_qgroup_list *glist; + struct ulist *parents; + int ret = 0; + int err; + struct btrfs_qgroup *qg; + u64 root_obj = 0; + struct seq_list elem = {}; + + parents = ulist_alloc(GFP_NOFS); + if (!parents) + return -ENOMEM; + + btrfs_get_tree_mod_seq(fs_info, &elem); + ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, + elem.seq, &roots); + btrfs_put_tree_mod_seq(fs_info, &elem); + if (ret < 0) + return ret; + + if (roots->nnodes != 1) + goto out; + + ULIST_ITER_INIT(&uiter); + unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */ + /* + * If we find our ref root then that means all refs + * this extent has to the root have not yet been + * deleted. In that case, we do nothing and let the + * last ref for this bytenr drive our update. + * + * This can happen for example if an extent is + * referenced multiple times in a snapshot (clone, + * etc). If we are in the middle of snapshot removal, + * queued updates for such an extent will find the + * root if we have not yet finished removing the + * snapshot. + */ + if (unode->val == oper->ref_root) + goto out; + + root_obj = unode->val; + BUG_ON(!root_obj); + + spin_lock(&fs_info->qgroup_lock); + qg = find_qgroup_rb(fs_info, root_obj); + if (!qg) + goto out_unlock; + + qg->excl += oper->num_bytes; + qg->excl_cmpr += oper->num_bytes; + qgroup_dirty(fs_info, qg); + + /* + * Adjust counts for parent groups. First we find all + * parents, then in the 2nd loop we do the adjustment + * while adding parents of the parents to our ulist. + */ + list_for_each_entry(glist, &qg->groups, next_group) { + err = ulist_add(parents, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (err < 0) { + ret = err; + goto out_unlock; + } + } + + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(parents, &uiter))) { + qg = u64_to_ptr(unode->aux); + qg->excl += oper->num_bytes; + qg->excl_cmpr += oper->num_bytes; + qgroup_dirty(fs_info, qg); + + /* Add any parents of the parents */ + list_for_each_entry(glist, &qg->groups, next_group) { + err = ulist_add(parents, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (err < 0) { + ret = err; + goto out_unlock; + } + } + } + +out_unlock: + spin_unlock(&fs_info->qgroup_lock); + +out: + ulist_free(roots); + ulist_free(parents); + return ret; +} + /* * btrfs_qgroup_account_ref is called for every ref that is added to or deleted * from the fs. First, all roots referencing the extent are searched, and @@ -1920,6 +2086,9 @@ static int btrfs_qgroup_account(struct btrfs_trans_handle *trans, case BTRFS_QGROUP_OPER_SUB_SHARED: ret = qgroup_shared_accounting(trans, fs_info, oper); break; + case BTRFS_QGROUP_OPER_SUB_SUBTREE: + ret = qgroup_subtree_accounting(trans, fs_info, oper); + break; default: ASSERT(0); } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 5952ff1fbd7a..18cc68ca3090 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -44,6 +44,7 @@ enum btrfs_qgroup_operation_type { BTRFS_QGROUP_OPER_ADD_SHARED, BTRFS_QGROUP_OPER_SUB_EXCL, BTRFS_QGROUP_OPER_SUB_SHARED, + BTRFS_QGROUP_OPER_SUB_SUBTREE, }; struct btrfs_qgroup_operation { diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 67b48b9a03e0..c4124de4435b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1665,6 +1665,21 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) return 0; } +/* + * Calculate numbers for 'df', pessimistic in case of mixed raid profiles. + * + * If there's a redundant raid level at DATA block groups, use the respective + * multiplier to scale the sizes. + * + * Unused device space usage is based on simulating the chunk allocator + * algorithm that respects the device sizes, order of allocations and the + * 'alloc_start' value, this is a close approximation of the actual use but + * there are other factors that may change the result (like a new metadata + * chunk). + * + * FIXME: not accurate for mixed block groups, total and free/used are ok, + * available appears slightly larger. + */ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); @@ -1675,6 +1690,8 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 total_free_data = 0; int bits = dentry->d_sb->s_blocksize_bits; __be32 *fsid = (__be32 *)fs_info->fsid; + unsigned factor = 1; + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; int ret; /* holding chunk_muext to avoid allocating new chunks */ @@ -1682,30 +1699,52 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) { + int i; + total_free_data += found->disk_total - found->disk_used; total_free_data -= btrfs_account_ro_block_groups_free_space(found); + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + if (!list_empty(&found->block_groups[i])) { + switch (i) { + case BTRFS_RAID_DUP: + case BTRFS_RAID_RAID1: + case BTRFS_RAID_RAID10: + factor = 2; + } + } + } } total_used += found->disk_used; } + rcu_read_unlock(); - buf->f_namelen = BTRFS_NAME_LEN; - buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; - buf->f_bfree = buf->f_blocks - (total_used >> bits); - buf->f_bsize = dentry->d_sb->s_blocksize; - buf->f_type = BTRFS_SUPER_MAGIC; + buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor); + buf->f_blocks >>= bits; + buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits); + + /* Account global block reserve as used, it's in logical size already */ + spin_lock(&block_rsv->lock); + buf->f_bfree -= block_rsv->size >> bits; + spin_unlock(&block_rsv->lock); + buf->f_bavail = total_free_data; ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); if (ret) { mutex_unlock(&fs_info->chunk_mutex); return ret; } - buf->f_bavail += total_free_data; + buf->f_bavail += div_u64(total_free_data, factor); buf->f_bavail = buf->f_bavail >> bits; mutex_unlock(&fs_info->chunk_mutex); + buf->f_type = BTRFS_SUPER_MAGIC; + buf->f_bsize = dentry->d_sb->s_blocksize; + buf->f_namelen = BTRFS_NAME_LEN; + /* We treat it as constant endianness (it doesn't matter _which_) because we want the fsid to come out the same whether mounted on a big-endian or little-endian host */ diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 5f379affdf23..d89c6d3542ca 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -218,7 +218,6 @@ static noinline int join_transaction(struct btrfs_root *root, unsigned int type) spin_lock_init(&cur_trans->delayed_refs.lock); INIT_LIST_HEAD(&cur_trans->pending_snapshots); - INIT_LIST_HEAD(&cur_trans->ordered_operations); INIT_LIST_HEAD(&cur_trans->pending_chunks); INIT_LIST_HEAD(&cur_trans->switch_commits); list_add_tail(&cur_trans->list, &fs_info->trans_list); @@ -1612,27 +1611,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_trans_handle_cachep, trans); } -static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - int ret; - - ret = btrfs_run_delayed_items(trans, root); - if (ret) - return ret; - - /* - * rename don't use btrfs_join_transaction, so, once we - * set the transaction to blocked above, we aren't going - * to get any new ordered operations. We can safely run - * it here and no for sure that nothing new will be added - * to the list - */ - ret = btrfs_run_ordered_operations(trans, root, 1); - - return ret; -} - static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) { if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) @@ -1653,13 +1631,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_transaction *prev_trans = NULL; int ret; - ret = btrfs_run_ordered_operations(trans, root, 0); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - btrfs_end_transaction(trans, root); - return ret; - } - /* Stop the commit early if ->aborted is set */ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { ret = cur_trans->aborted; @@ -1740,7 +1711,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, if (ret) goto cleanup_transaction; - ret = btrfs_flush_all_pending_stuffs(trans, root); + ret = btrfs_run_delayed_items(trans, root); if (ret) goto cleanup_transaction; @@ -1748,7 +1719,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, extwriter_counter_read(cur_trans) == 0); /* some pending stuffs might be added after the previous flush. */ - ret = btrfs_flush_all_pending_stuffs(trans, root); + ret = btrfs_run_delayed_items(trans, root); if (ret) goto cleanup_transaction; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 7dd558ed0716..579be51b27e5 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -55,7 +55,6 @@ struct btrfs_transaction { wait_queue_head_t writer_wait; wait_queue_head_t commit_wait; struct list_head pending_snapshots; - struct list_head ordered_operations; struct list_head pending_chunks; struct list_head switch_commits; struct btrfs_delayed_ref_root delayed_refs; diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index 7f78cbf5cf41..4c29db604bbe 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -57,6 +57,21 @@ void ulist_free(struct ulist *ulist); int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask); int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, u64 *old_aux, gfp_t gfp_mask); + +/* just like ulist_add_merge() but take a pointer for the aux data */ +static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux, + void **old_aux, gfp_t gfp_mask) +{ +#if BITS_PER_LONG == 32 + u64 old64 = (uintptr_t)*old_aux; + int ret = ulist_add_merge(ulist, val, (uintptr_t)aux, &old64, gfp_mask); + *old_aux = (void *)((uintptr_t)old64); + return ret; +#else + return ulist_add_merge(ulist, val, (u64)aux, (u64 *)old_aux, gfp_mask); +#endif +} + struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter);