Btrfs: avoid races between super writeout and device list updates
On multi-device filesystems, btrfs writes supers to all of the devices before considering a sync complete. There wasn't any additional locking between super writeout and the device list management code because device management was done inside a transaction and super writeout only happened with no transaction writers running. With the btrfs fsync log and other async transaction updates, this has been racy for some time. This adds a mutex to protect the device list. The existing volume mutex could not be reused due to transaction lock ordering requirements. Signed-off-by: Chris Mason <chris.mason@oracle.com>
This commit is contained in:
@@ -2111,7 +2111,7 @@ static int write_dev_supers(struct btrfs_device *device,
|
|||||||
|
|
||||||
int write_all_supers(struct btrfs_root *root, int max_mirrors)
|
int write_all_supers(struct btrfs_root *root, int max_mirrors)
|
||||||
{
|
{
|
||||||
struct list_head *head = &root->fs_info->fs_devices->devices;
|
struct list_head *head;
|
||||||
struct btrfs_device *dev;
|
struct btrfs_device *dev;
|
||||||
struct btrfs_super_block *sb;
|
struct btrfs_super_block *sb;
|
||||||
struct btrfs_dev_item *dev_item;
|
struct btrfs_dev_item *dev_item;
|
||||||
@@ -2126,6 +2126,9 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
|
|||||||
|
|
||||||
sb = &root->fs_info->super_for_commit;
|
sb = &root->fs_info->super_for_commit;
|
||||||
dev_item = &sb->dev_item;
|
dev_item = &sb->dev_item;
|
||||||
|
|
||||||
|
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
|
||||||
|
head = &root->fs_info->fs_devices->devices;
|
||||||
list_for_each_entry(dev, head, dev_list) {
|
list_for_each_entry(dev, head, dev_list) {
|
||||||
if (!dev->bdev) {
|
if (!dev->bdev) {
|
||||||
total_errors++;
|
total_errors++;
|
||||||
@@ -2169,6 +2172,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
|
|||||||
if (ret)
|
if (ret)
|
||||||
total_errors++;
|
total_errors++;
|
||||||
}
|
}
|
||||||
|
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
|
||||||
if (total_errors > max_errors) {
|
if (total_errors > max_errors) {
|
||||||
printk(KERN_ERR "btrfs: %d errors while writing supers\n",
|
printk(KERN_ERR "btrfs: %d errors while writing supers\n",
|
||||||
total_errors);
|
total_errors);
|
||||||
|
@@ -377,6 +377,7 @@ static noinline int device_list_add(const char *path,
|
|||||||
memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
|
memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
|
||||||
fs_devices->latest_devid = devid;
|
fs_devices->latest_devid = devid;
|
||||||
fs_devices->latest_trans = found_transid;
|
fs_devices->latest_trans = found_transid;
|
||||||
|
mutex_init(&fs_devices->device_list_mutex);
|
||||||
device = NULL;
|
device = NULL;
|
||||||
} else {
|
} else {
|
||||||
device = __find_device(&fs_devices->devices, devid,
|
device = __find_device(&fs_devices->devices, devid,
|
||||||
@@ -403,7 +404,11 @@ static noinline int device_list_add(const char *path,
|
|||||||
return -ENOMEM;
|
return -ENOMEM;
|
||||||
}
|
}
|
||||||
INIT_LIST_HEAD(&device->dev_alloc_list);
|
INIT_LIST_HEAD(&device->dev_alloc_list);
|
||||||
|
|
||||||
|
mutex_lock(&fs_devices->device_list_mutex);
|
||||||
list_add(&device->dev_list, &fs_devices->devices);
|
list_add(&device->dev_list, &fs_devices->devices);
|
||||||
|
mutex_unlock(&fs_devices->device_list_mutex);
|
||||||
|
|
||||||
device->fs_devices = fs_devices;
|
device->fs_devices = fs_devices;
|
||||||
fs_devices->num_devices++;
|
fs_devices->num_devices++;
|
||||||
}
|
}
|
||||||
@@ -429,10 +434,12 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
|
|||||||
INIT_LIST_HEAD(&fs_devices->devices);
|
INIT_LIST_HEAD(&fs_devices->devices);
|
||||||
INIT_LIST_HEAD(&fs_devices->alloc_list);
|
INIT_LIST_HEAD(&fs_devices->alloc_list);
|
||||||
INIT_LIST_HEAD(&fs_devices->list);
|
INIT_LIST_HEAD(&fs_devices->list);
|
||||||
|
mutex_init(&fs_devices->device_list_mutex);
|
||||||
fs_devices->latest_devid = orig->latest_devid;
|
fs_devices->latest_devid = orig->latest_devid;
|
||||||
fs_devices->latest_trans = orig->latest_trans;
|
fs_devices->latest_trans = orig->latest_trans;
|
||||||
memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
|
memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
|
||||||
|
|
||||||
|
mutex_lock(&orig->device_list_mutex);
|
||||||
list_for_each_entry(orig_dev, &orig->devices, dev_list) {
|
list_for_each_entry(orig_dev, &orig->devices, dev_list) {
|
||||||
device = kzalloc(sizeof(*device), GFP_NOFS);
|
device = kzalloc(sizeof(*device), GFP_NOFS);
|
||||||
if (!device)
|
if (!device)
|
||||||
@@ -454,8 +461,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
|
|||||||
device->fs_devices = fs_devices;
|
device->fs_devices = fs_devices;
|
||||||
fs_devices->num_devices++;
|
fs_devices->num_devices++;
|
||||||
}
|
}
|
||||||
|
mutex_unlock(&orig->device_list_mutex);
|
||||||
return fs_devices;
|
return fs_devices;
|
||||||
error:
|
error:
|
||||||
|
mutex_unlock(&orig->device_list_mutex);
|
||||||
free_fs_devices(fs_devices);
|
free_fs_devices(fs_devices);
|
||||||
return ERR_PTR(-ENOMEM);
|
return ERR_PTR(-ENOMEM);
|
||||||
}
|
}
|
||||||
@@ -466,6 +475,7 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
|
|||||||
|
|
||||||
mutex_lock(&uuid_mutex);
|
mutex_lock(&uuid_mutex);
|
||||||
again:
|
again:
|
||||||
|
mutex_lock(&fs_devices->device_list_mutex);
|
||||||
list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
|
list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
|
||||||
if (device->in_fs_metadata)
|
if (device->in_fs_metadata)
|
||||||
continue;
|
continue;
|
||||||
@@ -485,6 +495,7 @@ again:
|
|||||||
kfree(device->name);
|
kfree(device->name);
|
||||||
kfree(device);
|
kfree(device);
|
||||||
}
|
}
|
||||||
|
mutex_unlock(&fs_devices->device_list_mutex);
|
||||||
|
|
||||||
if (fs_devices->seed) {
|
if (fs_devices->seed) {
|
||||||
fs_devices = fs_devices->seed;
|
fs_devices = fs_devices->seed;
|
||||||
@@ -1135,12 +1146,14 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
|
|||||||
|
|
||||||
device = NULL;
|
device = NULL;
|
||||||
devices = &root->fs_info->fs_devices->devices;
|
devices = &root->fs_info->fs_devices->devices;
|
||||||
|
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
|
||||||
list_for_each_entry(tmp, devices, dev_list) {
|
list_for_each_entry(tmp, devices, dev_list) {
|
||||||
if (tmp->in_fs_metadata && !tmp->bdev) {
|
if (tmp->in_fs_metadata && !tmp->bdev) {
|
||||||
device = tmp;
|
device = tmp;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
|
||||||
bdev = NULL;
|
bdev = NULL;
|
||||||
bh = NULL;
|
bh = NULL;
|
||||||
disk_super = NULL;
|
disk_super = NULL;
|
||||||
@@ -1195,7 +1208,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
|
|||||||
goto error_brelse;
|
goto error_brelse;
|
||||||
|
|
||||||
device->in_fs_metadata = 0;
|
device->in_fs_metadata = 0;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* the device list mutex makes sure that we don't change
|
||||||
|
* the device list while someone else is writing out all
|
||||||
|
* the device supers.
|
||||||
|
*/
|
||||||
|
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
|
||||||
list_del_init(&device->dev_list);
|
list_del_init(&device->dev_list);
|
||||||
|
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
|
||||||
|
|
||||||
device->fs_devices->num_devices--;
|
device->fs_devices->num_devices--;
|
||||||
|
|
||||||
next_device = list_entry(root->fs_info->fs_devices->devices.next,
|
next_device = list_entry(root->fs_info->fs_devices->devices.next,
|
||||||
@@ -1289,6 +1311,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
|
|||||||
seed_devices->opened = 1;
|
seed_devices->opened = 1;
|
||||||
INIT_LIST_HEAD(&seed_devices->devices);
|
INIT_LIST_HEAD(&seed_devices->devices);
|
||||||
INIT_LIST_HEAD(&seed_devices->alloc_list);
|
INIT_LIST_HEAD(&seed_devices->alloc_list);
|
||||||
|
mutex_init(&seed_devices->device_list_mutex);
|
||||||
list_splice_init(&fs_devices->devices, &seed_devices->devices);
|
list_splice_init(&fs_devices->devices, &seed_devices->devices);
|
||||||
list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
|
list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
|
||||||
list_for_each_entry(device, &seed_devices->devices, dev_list) {
|
list_for_each_entry(device, &seed_devices->devices, dev_list) {
|
||||||
@@ -1414,6 +1437,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
|
|||||||
mutex_lock(&root->fs_info->volume_mutex);
|
mutex_lock(&root->fs_info->volume_mutex);
|
||||||
|
|
||||||
devices = &root->fs_info->fs_devices->devices;
|
devices = &root->fs_info->fs_devices->devices;
|
||||||
|
/*
|
||||||
|
* we have the volume lock, so we don't need the extra
|
||||||
|
* device list mutex while reading the list here.
|
||||||
|
*/
|
||||||
list_for_each_entry(device, devices, dev_list) {
|
list_for_each_entry(device, devices, dev_list) {
|
||||||
if (device->bdev == bdev) {
|
if (device->bdev == bdev) {
|
||||||
ret = -EEXIST;
|
ret = -EEXIST;
|
||||||
@@ -1468,6 +1495,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
|
|||||||
}
|
}
|
||||||
|
|
||||||
device->fs_devices = root->fs_info->fs_devices;
|
device->fs_devices = root->fs_info->fs_devices;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* we don't want write_supers to jump in here with our device
|
||||||
|
* half setup
|
||||||
|
*/
|
||||||
|
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
|
||||||
list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
|
list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
|
||||||
list_add(&device->dev_alloc_list,
|
list_add(&device->dev_alloc_list,
|
||||||
&root->fs_info->fs_devices->alloc_list);
|
&root->fs_info->fs_devices->alloc_list);
|
||||||
@@ -1486,6 +1519,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
|
|||||||
total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
|
total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
|
||||||
btrfs_set_super_num_devices(&root->fs_info->super_copy,
|
btrfs_set_super_num_devices(&root->fs_info->super_copy,
|
||||||
total_bytes + 1);
|
total_bytes + 1);
|
||||||
|
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
|
||||||
|
|
||||||
if (seeding_dev) {
|
if (seeding_dev) {
|
||||||
ret = init_first_rw_device(trans, root, device);
|
ret = init_first_rw_device(trans, root, device);
|
||||||
|
@@ -96,7 +96,12 @@ struct btrfs_fs_devices {
|
|||||||
u64 rw_devices;
|
u64 rw_devices;
|
||||||
u64 total_rw_bytes;
|
u64 total_rw_bytes;
|
||||||
struct block_device *latest_bdev;
|
struct block_device *latest_bdev;
|
||||||
/* all of the devices in the FS */
|
|
||||||
|
/* all of the devices in the FS, protected by a mutex
|
||||||
|
* so we can safely walk it to write out the supers without
|
||||||
|
* worrying about add/remove by the multi-device code
|
||||||
|
*/
|
||||||
|
struct mutex device_list_mutex;
|
||||||
struct list_head devices;
|
struct list_head devices;
|
||||||
|
|
||||||
/* devices not currently being allocated */
|
/* devices not currently being allocated */
|
||||||
|
Reference in New Issue
Block a user