Btrfs: don't read leaf blocks containing only checksums during truncate
Checksum items take up a significant portion of the metadata for large files. It is possible to avoid reading them during truncates by checking the keys in the higher-level nodes. If a given leaf is followed by another leaf where the lowest key is a checksum item from the same file, we know we can safely delete the leaf without reading it. For a 32GB file on a 6-drive raid0 array, Btrfs needs 8s to delete the file with a cold cache. It is read bound during the run. With this change, Btrfs is able to delete the file in 0.5s.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
This commit is contained in:
@@ -1388,7 +1388,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
|
|||||||
struct btrfs_key prealloc_block;
|
struct btrfs_key prealloc_block;
|
||||||
|
|
||||||
lowest_level = p->lowest_level;
|
lowest_level = p->lowest_level;
|
||||||
WARN_ON(lowest_level && ins_len);
|
WARN_ON(lowest_level && ins_len > 0);
|
||||||
WARN_ON(p->nodes[0] != NULL);
|
WARN_ON(p->nodes[0] != NULL);
|
||||||
WARN_ON(cow && root == root->fs_info->extent_root &&
|
WARN_ON(cow && root == root->fs_info->extent_root &&
|
||||||
!mutex_is_locked(&root->fs_info->alloc_mutex));
|
!mutex_is_locked(&root->fs_info->alloc_mutex));
|
||||||
@@ -3186,6 +3186,36 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* a helper function to delete the leaf pointed to by path->slots[1] and
|
||||||
|
* path->nodes[1]. bytenr is the node block pointer, but since the callers
|
||||||
|
* already know it, it is faster to have them pass it down than to
|
||||||
|
* read it out of the node again.
|
||||||
|
*
|
||||||
|
* This deletes the pointer in path->nodes[1] and frees the leaf
|
||||||
|
* block extent. zero is returned if it all worked out, < 0 otherwise.
|
||||||
|
*
|
||||||
|
* The path must have already been setup for deleting the leaf, including
|
||||||
|
* all the proper balancing. path->nodes[1] must be locked.
|
||||||
|
*/
|
||||||
|
noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
|
||||||
|
struct btrfs_root *root,
|
||||||
|
struct btrfs_path *path, u64 bytenr)
|
||||||
|
{
|
||||||
|
int ret;
|
||||||
|
u64 root_gen = btrfs_header_generation(path->nodes[1]);
|
||||||
|
|
||||||
|
ret = del_ptr(trans, root, path, 1, path->slots[1]);
|
||||||
|
if (ret)
|
||||||
|
return ret;
|
||||||
|
|
||||||
|
ret = btrfs_free_extent(trans, root, bytenr,
|
||||||
|
btrfs_level_size(root, 0),
|
||||||
|
path->nodes[1]->start,
|
||||||
|
btrfs_header_owner(path->nodes[1]),
|
||||||
|
root_gen, 0, 0, 1);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
/*
|
/*
|
||||||
* delete the item at the leaf level in path. If that empties
|
* delete the item at the leaf level in path. If that empties
|
||||||
* the leaf, remove it from the tree
|
* the leaf, remove it from the tree
|
||||||
@@ -3251,17 +3281,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
|
|||||||
if (leaf == root->node) {
|
if (leaf == root->node) {
|
||||||
btrfs_set_header_level(leaf, 0);
|
btrfs_set_header_level(leaf, 0);
|
||||||
} else {
|
} else {
|
||||||
u64 root_gen = btrfs_header_generation(path->nodes[1]);
|
ret = btrfs_del_leaf(trans, root, path, leaf->start);
|
||||||
wret = del_ptr(trans, root, path, 1, path->slots[1]);
|
BUG_ON(ret);
|
||||||
if (wret)
|
|
||||||
ret = wret;
|
|
||||||
wret = btrfs_free_extent(trans, root,
|
|
||||||
leaf->start, leaf->len,
|
|
||||||
path->nodes[1]->start,
|
|
||||||
btrfs_header_owner(path->nodes[1]),
|
|
||||||
root_gen, 0, 0, 1);
|
|
||||||
if (wret)
|
|
||||||
ret = wret;
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
int used = leaf_space_used(leaf, 0, nritems);
|
int used = leaf_space_used(leaf, 0, nritems);
|
||||||
@@ -3296,24 +3317,10 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (btrfs_header_nritems(leaf) == 0) {
|
if (btrfs_header_nritems(leaf) == 0) {
|
||||||
u64 root_gen;
|
path->slots[1] = slot;
|
||||||
u64 bytenr = leaf->start;
|
ret = btrfs_del_leaf(trans, root, path, leaf->start);
|
||||||
u32 blocksize = leaf->len;
|
BUG_ON(ret);
|
||||||
|
|
||||||
root_gen = btrfs_header_generation(
|
|
||||||
path->nodes[1]);
|
|
||||||
|
|
||||||
wret = del_ptr(trans, root, path, 1, slot);
|
|
||||||
if (wret)
|
|
||||||
ret = wret;
|
|
||||||
|
|
||||||
free_extent_buffer(leaf);
|
free_extent_buffer(leaf);
|
||||||
wret = btrfs_free_extent(trans, root, bytenr,
|
|
||||||
blocksize, path->nodes[1]->start,
|
|
||||||
btrfs_header_owner(path->nodes[1]),
|
|
||||||
root_gen, 0, 0, 1);
|
|
||||||
if (wret)
|
|
||||||
ret = wret;
|
|
||||||
} else {
|
} else {
|
||||||
/* if we're still in the path, make sure
|
/* if we're still in the path, make sure
|
||||||
* we're dirty. Otherwise, one of the
|
* we're dirty. Otherwise, one of the
|
||||||
@@ -3418,8 +3425,8 @@ again:
|
|||||||
level = btrfs_header_level(cur);
|
level = btrfs_header_level(cur);
|
||||||
sret = bin_search(cur, min_key, level, &slot);
|
sret = bin_search(cur, min_key, level, &slot);
|
||||||
|
|
||||||
/* at level = 0, we're done, setup the path and exit */
|
/* at the lowest level, we're done, setup the path and exit */
|
||||||
if (level == 0) {
|
if (level == path->lowest_level) {
|
||||||
if (slot >= nritems)
|
if (slot >= nritems)
|
||||||
goto find_next_key;
|
goto find_next_key;
|
||||||
ret = 0;
|
ret = 0;
|
||||||
|
@@ -1649,7 +1649,9 @@ void btrfs_free_path(struct btrfs_path *p);
|
|||||||
void btrfs_init_path(struct btrfs_path *p);
|
void btrfs_init_path(struct btrfs_path *p);
|
||||||
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
|
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
|
||||||
struct btrfs_path *path, int slot, int nr);
|
struct btrfs_path *path, int slot, int nr);
|
||||||
|
int btrfs_del_leaf(struct btrfs_trans_handle *trans,
|
||||||
|
struct btrfs_root *root,
|
||||||
|
struct btrfs_path *path, u64 bytenr);
|
||||||
static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
|
static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
|
||||||
struct btrfs_root *root,
|
struct btrfs_root *root,
|
||||||
struct btrfs_path *path)
|
struct btrfs_path *path)
|
||||||
|
152
fs/btrfs/inode.c
152
fs/btrfs/inode.c
@@ -1389,6 +1389,154 @@ fail:
|
|||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* when truncating bytes in a file, it is possible to avoid reading
|
||||||
|
* the leaves that contain only checksum items. This can be the
|
||||||
|
* majority of the IO required to delete a large file, but it must
|
||||||
|
* be done carefully.
|
||||||
|
*
|
||||||
|
* The keys in the level just above the leaves are checked to make sure
|
||||||
|
* the lowest key in a given leaf is a csum key, and starts at an offset
|
||||||
|
* after the new size.
|
||||||
|
*
|
||||||
|
* Then the key for the next leaf is checked to make sure it also has
|
||||||
|
* a checksum item for the same file. If it does, we know our target leaf
|
||||||
|
* contains only checksum items, and it can be safely freed without reading
|
||||||
|
* it.
|
||||||
|
*
|
||||||
|
* This is just an optimization targeted at large files. It may do
|
||||||
|
* nothing. It will return 0 unless things went badly.
|
||||||
|
*/
|
||||||
|
static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans,
|
||||||
|
struct btrfs_root *root,
|
||||||
|
struct btrfs_path *path,
|
||||||
|
struct inode *inode, u64 new_size)
|
||||||
|
{
|
||||||
|
struct btrfs_key key;
|
||||||
|
int ret;
|
||||||
|
int nritems;
|
||||||
|
struct btrfs_key found_key;
|
||||||
|
struct btrfs_key other_key;
|
||||||
|
|
||||||
|
path->lowest_level = 1;
|
||||||
|
key.objectid = inode->i_ino;
|
||||||
|
key.type = BTRFS_CSUM_ITEM_KEY;
|
||||||
|
key.offset = new_size;
|
||||||
|
again:
|
||||||
|
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
|
||||||
|
if (ret < 0)
|
||||||
|
goto out;
|
||||||
|
|
||||||
|
if (path->nodes[1] == NULL) {
|
||||||
|
ret = 0;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
ret = 0;
|
||||||
|
btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]);
|
||||||
|
nritems = btrfs_header_nritems(path->nodes[1]);
|
||||||
|
|
||||||
|
if (!nritems)
|
||||||
|
goto out;
|
||||||
|
|
||||||
|
if (path->slots[1] >= nritems)
|
||||||
|
goto next_node;
|
||||||
|
|
||||||
|
/* did we find a key greater than anything we want to delete? */
|
||||||
|
if (found_key.objectid > inode->i_ino ||
|
||||||
|
(found_key.objectid == inode->i_ino && found_key.type > key.type))
|
||||||
|
goto out;
|
||||||
|
|
||||||
|
/* we check the next key in the node to make sure the leaf contains
|
||||||
|
* only checksum items. This comparison doesn't work if our
|
||||||
|
* leaf is the last one in the node
|
||||||
|
*/
|
||||||
|
if (path->slots[1] + 1 >= nritems) {
|
||||||
|
next_node:
|
||||||
|
/* search forward from the last key in the node, this
|
||||||
|
* will bring us into the next node in the tree
|
||||||
|
*/
|
||||||
|
btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1);
|
||||||
|
|
||||||
|
/* unlikely, but we inc below, so check to be safe */
|
||||||
|
if (found_key.offset == (u64)-1)
|
||||||
|
goto out;
|
||||||
|
|
||||||
|
/* search_forward needs a path with locks held, do the
|
||||||
|
* search again for the original key. It is possible
|
||||||
|
* this will race with a balance and return a path that
|
||||||
|
* we could modify, but this drop is just an optimization
|
||||||
|
* and is allowed to miss some leaves.
|
||||||
|
*/
|
||||||
|
btrfs_release_path(root, path);
|
||||||
|
found_key.offset++;
|
||||||
|
|
||||||
|
/* setup a max key for search_forward */
|
||||||
|
other_key.offset = (u64)-1;
|
||||||
|
other_key.type = key.type;
|
||||||
|
other_key.objectid = key.objectid;
|
||||||
|
|
||||||
|
path->keep_locks = 1;
|
||||||
|
ret = btrfs_search_forward(root, &found_key, &other_key,
|
||||||
|
path, 0, 0);
|
||||||
|
path->keep_locks = 0;
|
||||||
|
if (ret || found_key.objectid != key.objectid ||
|
||||||
|
found_key.type != key.type) {
|
||||||
|
ret = 0;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
key.offset = found_key.offset;
|
||||||
|
btrfs_release_path(root, path);
|
||||||
|
cond_resched();
|
||||||
|
goto again;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* we know there's one more slot after us in the tree,
|
||||||
|
* read that key so we can verify it is also a checksum item
|
||||||
|
*/
|
||||||
|
btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1);
|
||||||
|
|
||||||
|
if (found_key.objectid < inode->i_ino)
|
||||||
|
goto next_key;
|
||||||
|
|
||||||
|
if (found_key.type != key.type || found_key.offset < new_size)
|
||||||
|
goto next_key;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* if the key for the next leaf isn't a csum key from this objectid,
|
||||||
|
* we can't be sure there aren't good items inside this leaf.
|
||||||
|
* Bail out
|
||||||
|
*/
|
||||||
|
if (other_key.objectid != inode->i_ino || other_key.type != key.type)
|
||||||
|
goto out;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* it is safe to delete this leaf, it contains only
|
||||||
|
* csum items from this inode at an offset >= new_size
|
||||||
|
*/
|
||||||
|
ret = btrfs_del_leaf(trans, root, path,
|
||||||
|
btrfs_node_blockptr(path->nodes[1],
|
||||||
|
path->slots[1]));
|
||||||
|
BUG_ON(ret);
|
||||||
|
|
||||||
|
next_key:
|
||||||
|
btrfs_release_path(root, path);
|
||||||
|
|
||||||
|
if (other_key.objectid == inode->i_ino &&
|
||||||
|
other_key.type == key.type && other_key.offset > key.offset) {
|
||||||
|
key.offset = other_key.offset;
|
||||||
|
cond_resched();
|
||||||
|
goto again;
|
||||||
|
}
|
||||||
|
ret = 0;
|
||||||
|
out:
|
||||||
|
/* fixup any changes we've made to the path */
|
||||||
|
path->lowest_level = 0;
|
||||||
|
path->keep_locks = 0;
|
||||||
|
btrfs_release_path(root, path);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* this can truncate away extent items, csum items and directory items.
|
* this can truncate away extent items, csum items and directory items.
|
||||||
* It starts at a high offset and removes keys until it can't find
|
* It starts at a high offset and removes keys until it can't find
|
||||||
@@ -1436,6 +1584,10 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
|
|||||||
key.type = (u8)-1;
|
key.type = (u8)-1;
|
||||||
|
|
||||||
btrfs_init_path(path);
|
btrfs_init_path(path);
|
||||||
|
|
||||||
|
ret = drop_csum_leaves(trans, root, path, inode, new_size);
|
||||||
|
BUG_ON(ret);
|
||||||
|
|
||||||
search_again:
|
search_again:
|
||||||
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
|
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
|
||||||
if (ret < 0) {
|
if (ret < 0) {
|
||||||
|
Reference in New Issue
Block a user