drbd: introduce WRITE_SAME support

We will support WRITE_SAME, if
 * all peers support WRITE_SAME (both in kernel and DRBD version),
 * all peer devices support WRITE_SAME
 * logical_block_size is identical on all peers.

We may at some point introduce a fallback on the receiving side
for devices/kernels that do not support WRITE_SAME,
by open-coding a submit loop. But not yet.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
This commit is contained in:
Lars Ellenberg 2016-06-14 00:26:31 +02:00 committed by Jens Axboe
parent 60bac04012
commit 9104d31a75
10 changed files with 360 additions and 80 deletions

View File

@ -840,6 +840,13 @@ static int update_sync_bits(struct drbd_device *device,
return count;
}
static bool plausible_request_size(int size)
{
return size > 0
&& size <= DRBD_MAX_BATCH_BIO_SIZE
&& IS_ALIGNED(size, 512);
}
/* clear the bit corresponding to the piece of storage in question:
* size byte of data starting from sector. Only clear a bits of the affected
* one ore more _aligned_ BM_BLOCK_SIZE blocks.
@ -859,7 +866,7 @@ int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
if ((mode == SET_OUT_OF_SYNC) && size == 0)
return 0;
if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
if (!plausible_request_size(size)) {
drbd_err(device, "%s: sector=%llus size=%d nonsense!\n",
drbd_change_sync_fname[mode],
(unsigned long long)sector, size);

View File

@ -237,14 +237,9 @@ static void seq_print_peer_request_flags(struct seq_file *m, struct drbd_peer_re
seq_print_rq_state_bit(m, f & EE_SEND_WRITE_ACK, &sep, "C");
seq_print_rq_state_bit(m, f & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync");
if (f & EE_IS_TRIM) {
seq_putc(m, sep);
sep = '|';
if (f & EE_IS_TRIM_USE_ZEROOUT)
seq_puts(m, "zero-out");
else
seq_puts(m, "trim");
}
if (f & EE_IS_TRIM)
__seq_print_rq_state_bit(m, f & EE_IS_TRIM_USE_ZEROOUT, &sep, "zero-out", "trim");
seq_print_rq_state_bit(m, f & EE_WRITE_SAME, &sep, "write-same");
seq_putc(m, '\n');
}

View File

@ -468,6 +468,9 @@ enum {
/* this is/was a write request */
__EE_WRITE,
/* this is/was a write same request */
__EE_WRITE_SAME,
/* this originates from application on peer
* (not some resync or verify or other DRBD internal request) */
__EE_APPLICATION,
@ -487,6 +490,7 @@ enum {
#define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE)
#define EE_SUBMITTED (1<<__EE_SUBMITTED)
#define EE_WRITE (1<<__EE_WRITE)
#define EE_WRITE_SAME (1<<__EE_WRITE_SAME)
#define EE_APPLICATION (1<<__EE_APPLICATION)
#define EE_RS_THIN_REQ (1<<__EE_RS_THIN_REQ)
@ -1350,8 +1354,8 @@ struct bm_extent {
/* For now, don't allow more than half of what we can "activate" in one
* activity log transaction to be discarded in one go. We may need to rework
* drbd_al_begin_io() to allow for even larger discard ranges */
#define DRBD_MAX_DISCARD_SIZE (AL_UPDATES_PER_TRANSACTION/2*AL_EXTENT_SIZE)
#define DRBD_MAX_DISCARD_SECTORS (DRBD_MAX_DISCARD_SIZE >> 9)
#define DRBD_MAX_BATCH_BIO_SIZE (AL_UPDATES_PER_TRANSACTION/2*AL_EXTENT_SIZE)
#define DRBD_MAX_BBIO_SECTORS (DRBD_MAX_BATCH_BIO_SIZE >> 9)
extern int drbd_bm_init(struct drbd_device *device);
extern int drbd_bm_resize(struct drbd_device *device, sector_t sectors, int set_new_bits);
@ -1488,7 +1492,8 @@ enum determine_dev_size {
extern enum determine_dev_size
drbd_determine_dev_size(struct drbd_device *, enum dds_flags, struct resize_parms *) __must_hold(local);
extern void resync_after_online_grow(struct drbd_device *);
extern void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_backing_dev *bdev);
extern void drbd_reconsider_queue_parameters(struct drbd_device *device,
struct drbd_backing_dev *bdev, struct o_qlim *o);
extern enum drbd_state_rv drbd_set_role(struct drbd_device *device,
enum drbd_role new_role,
int force);
@ -1569,7 +1574,7 @@ extern int drbd_submit_peer_request(struct drbd_device *,
extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *);
extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64,
sector_t, unsigned int,
bool,
unsigned int,
gfp_t) __must_hold(local);
extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *,
int);

View File

@ -920,6 +920,31 @@ void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *peer_device)
}
}
/* communicated if (agreed_features & DRBD_FF_WSAME) */
void assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p, struct request_queue *q)
{
if (q) {
p->qlim->physical_block_size = cpu_to_be32(queue_physical_block_size(q));
p->qlim->logical_block_size = cpu_to_be32(queue_logical_block_size(q));
p->qlim->alignment_offset = cpu_to_be32(queue_alignment_offset(q));
p->qlim->io_min = cpu_to_be32(queue_io_min(q));
p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
p->qlim->discard_enabled = blk_queue_discard(q);
p->qlim->discard_zeroes_data = queue_discard_zeroes_data(q);
p->qlim->write_same_capable = !!q->limits.max_write_same_sectors;
} else {
q = device->rq_queue;
p->qlim->physical_block_size = cpu_to_be32(queue_physical_block_size(q));
p->qlim->logical_block_size = cpu_to_be32(queue_logical_block_size(q));
p->qlim->alignment_offset = 0;
p->qlim->io_min = cpu_to_be32(queue_io_min(q));
p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
p->qlim->discard_enabled = 0;
p->qlim->discard_zeroes_data = 0;
p->qlim->write_same_capable = 0;
}
}
int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enum dds_flags flags)
{
struct drbd_device *device = peer_device->device;
@ -928,29 +953,37 @@ int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enu
sector_t d_size, u_size;
int q_order_type;
unsigned int max_bio_size;
unsigned int packet_size;
sock = &peer_device->connection->data;
p = drbd_prepare_command(peer_device, sock);
if (!p)
return -EIO;
packet_size = sizeof(*p);
if (peer_device->connection->agreed_features & DRBD_FF_WSAME)
packet_size += sizeof(p->qlim[0]);
memset(p, 0, packet_size);
if (get_ldev_if_state(device, D_NEGOTIATING)) {
D_ASSERT(device, device->ldev->backing_bdev);
struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
d_size = drbd_get_max_capacity(device->ldev);
rcu_read_lock();
u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
rcu_read_unlock();
q_order_type = drbd_queue_order_type(device);
max_bio_size = queue_max_hw_sectors(device->ldev->backing_bdev->bd_disk->queue) << 9;
max_bio_size = queue_max_hw_sectors(q) << 9;
max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE);
assign_p_sizes_qlim(device, p, q);
put_ldev(device);
} else {
d_size = 0;
u_size = 0;
q_order_type = QUEUE_ORDERED_NONE;
max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
assign_p_sizes_qlim(device, p, NULL);
}
sock = &peer_device->connection->data;
p = drbd_prepare_command(peer_device, sock);
if (!p)
return -EIO;
if (peer_device->connection->agreed_pro_version <= 94)
max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
else if (peer_device->connection->agreed_pro_version < 100)
@ -962,7 +995,8 @@ int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enu
p->max_bio_size = cpu_to_be32(max_bio_size);
p->queue_order_type = cpu_to_be16(q_order_type);
p->dds_flags = cpu_to_be16(flags);
return drbd_send_command(peer_device, sock, P_SIZES, sizeof(*p), NULL, 0);
return drbd_send_command(peer_device, sock, P_SIZES, packet_size, NULL, 0);
}
/**
@ -1577,6 +1611,9 @@ static int _drbd_send_bio(struct drbd_peer_device *peer_device, struct bio *bio)
? 0 : MSG_MORE);
if (err)
return err;
/* REQ_OP_WRITE_SAME has only one segment */
if (bio_op(bio) == REQ_OP_WRITE_SAME)
break;
}
return 0;
}
@ -1595,6 +1632,9 @@ static int _drbd_send_zc_bio(struct drbd_peer_device *peer_device, struct bio *b
bio_iter_last(bvec, iter) ? 0 : MSG_MORE);
if (err)
return err;
/* REQ_OP_WRITE_SAME has only one segment */
if (bio_op(bio) == REQ_OP_WRITE_SAME)
break;
}
return 0;
}
@ -1626,6 +1666,7 @@ static u32 bio_flags_to_wire(struct drbd_connection *connection,
return (bio->bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
(bio->bi_rw & REQ_FUA ? DP_FUA : 0) |
(bio->bi_rw & REQ_PREFLUSH ? DP_FLUSH : 0) |
(bio_op(bio) == REQ_OP_WRITE_SAME ? DP_WSAME : 0) |
(bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0);
else
return bio->bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
@ -1639,6 +1680,8 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *
struct drbd_device *device = peer_device->device;
struct drbd_socket *sock;
struct p_data *p;
struct p_wsame *wsame = NULL;
void *digest_out;
unsigned int dp_flags = 0;
int digest_size;
int err;
@ -1674,12 +1717,29 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *
err = __send_command(peer_device->connection, device->vnr, sock, P_TRIM, sizeof(*t), NULL, 0);
goto out;
}
if (dp_flags & DP_WSAME) {
/* this will only work if DRBD_FF_WSAME is set AND the
* handshake agreed that all nodes and backend devices are
* WRITE_SAME capable and agree on logical_block_size */
wsame = (struct p_wsame*)p;
digest_out = wsame + 1;
wsame->size = cpu_to_be32(req->i.size);
} else
digest_out = p + 1;
/* our digest is still only over the payload.
* TRIM does not carry any payload. */
if (digest_size)
drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, p + 1);
err = __send_command(peer_device->connection, device->vnr, sock, P_DATA, sizeof(*p) + digest_size, NULL, req->i.size);
drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, digest_out);
if (wsame) {
err =
__send_command(peer_device->connection, device->vnr, sock, P_WSAME,
sizeof(*wsame) + digest_size, NULL,
bio_iovec(req->master_bio).bv_len);
} else
err =
__send_command(peer_device->connection, device->vnr, sock, P_DATA,
sizeof(*p) + digest_size, NULL, req->i.size);
if (!err) {
/* For protocol A, we have to memcpy the payload into
* socket buffers, as we may complete right away
@ -3660,6 +3720,8 @@ const char *cmdname(enum drbd_packet cmd)
* one PRO_VERSION */
static const char *cmdnames[] = {
[P_DATA] = "Data",
[P_WSAME] = "WriteSame",
[P_TRIM] = "Trim",
[P_DATA_REPLY] = "DataReply",
[P_RS_DATA_REPLY] = "RSDataReply",
[P_BARRIER] = "Barrier",

View File

@ -1174,6 +1174,17 @@ static void blk_queue_discard_granularity(struct request_queue *q, unsigned int
{
q->limits.discard_granularity = granularity;
}
static unsigned int drbd_max_discard_sectors(struct drbd_connection *connection)
{
/* when we introduced REQ_WRITE_SAME support, we also bumped
* our maximum supported batch bio size used for discards. */
if (connection->agreed_features & DRBD_FF_WSAME)
return DRBD_MAX_BBIO_SECTORS;
/* before, with DRBD <= 8.4.6, we only allowed up to one AL_EXTENT_SIZE. */
return AL_EXTENT_SIZE >> 9;
}
static void decide_on_discard_support(struct drbd_device *device,
struct request_queue *q,
struct request_queue *b,
@ -1190,7 +1201,7 @@ static void decide_on_discard_support(struct drbd_device *device,
can_do = false;
drbd_info(device, "discard_zeroes_data=0 and discard_zeroes_if_aligned=no: disabling discards\n");
}
if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & FF_TRIM)) {
if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_TRIM)) {
can_do = false;
drbd_info(connection, "peer DRBD too old, does not support TRIM: disabling discards\n");
}
@ -1202,7 +1213,7 @@ static void decide_on_discard_support(struct drbd_device *device,
* you care, you need to use devices with similar
* topology on all peers. */
blk_queue_discard_granularity(q, 512);
q->limits.max_discard_sectors = DRBD_MAX_DISCARD_SECTORS;
q->limits.max_discard_sectors = drbd_max_discard_sectors(connection);
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
} else {
queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
@ -1223,8 +1234,67 @@ static void fixup_discard_if_not_supported(struct request_queue *q)
}
}
static void decide_on_write_same_support(struct drbd_device *device,
struct request_queue *q,
struct request_queue *b, struct o_qlim *o)
{
struct drbd_peer_device *peer_device = first_peer_device(device);
struct drbd_connection *connection = peer_device->connection;
bool can_do = b ? b->limits.max_write_same_sectors : true;
if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_WSAME)) {
can_do = false;
drbd_info(peer_device, "peer does not support WRITE_SAME\n");
}
if (o) {
/* logical block size; queue_logical_block_size(NULL) is 512 */
unsigned int peer_lbs = be32_to_cpu(o->logical_block_size);
unsigned int me_lbs_b = queue_logical_block_size(b);
unsigned int me_lbs = queue_logical_block_size(q);
if (me_lbs_b != me_lbs) {
drbd_warn(device,
"logical block size of local backend does not match (drbd:%u, backend:%u); was this a late attach?\n",
me_lbs, me_lbs_b);
/* rather disable write same than trigger some BUG_ON later in the scsi layer. */
can_do = false;
}
if (me_lbs_b != peer_lbs) {
drbd_warn(peer_device, "logical block sizes do not match (me:%u, peer:%u); this may cause problems.\n",
me_lbs, peer_lbs);
if (can_do) {
drbd_dbg(peer_device, "logical block size mismatch: WRITE_SAME disabled.\n");
can_do = false;
}
me_lbs = max(me_lbs, me_lbs_b);
/* We cannot change the logical block size of an in-use queue.
* We can only hope that access happens to be properly aligned.
* If not, the peer will likely produce an IO error, and detach. */
if (peer_lbs > me_lbs) {
if (device->state.role != R_PRIMARY) {
blk_queue_logical_block_size(q, peer_lbs);
drbd_warn(peer_device, "logical block size set to %u\n", peer_lbs);
} else {
drbd_warn(peer_device,
"current Primary must NOT adjust logical block size (%u -> %u); hope for the best.\n",
me_lbs, peer_lbs);
}
}
}
if (can_do && !o->write_same_capable) {
/* If we introduce an open-coded write-same loop on the receiving side,
* the peer would present itself as "capable". */
drbd_dbg(peer_device, "WRITE_SAME disabled (peer device not capable)\n");
can_do = false;
}
}
blk_queue_max_write_same_sectors(q, can_do ? DRBD_MAX_BBIO_SECTORS : 0);
}
static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev,
unsigned int max_bio_size)
unsigned int max_bio_size, struct o_qlim *o)
{
struct request_queue * const q = device->rq_queue;
unsigned int max_hw_sectors = max_bio_size >> 9;
@ -1244,15 +1314,15 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi
rcu_read_unlock();
blk_set_stacking_limits(&q->limits);
blk_queue_max_write_same_sectors(q, 0);
}
blk_queue_logical_block_size(q, 512);
blk_queue_max_hw_sectors(q, max_hw_sectors);
/* This is the workaround for "bio would need to, but cannot, be split" */
blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
blk_queue_segment_boundary(q, PAGE_SIZE-1);
decide_on_discard_support(device, q, b, discard_zeroes_if_aligned);
decide_on_write_same_support(device, q, b, o);
if (b) {
blk_queue_stack_limits(q, b);
@ -1266,7 +1336,7 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi
fixup_discard_if_not_supported(q);
}
void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_backing_dev *bdev)
void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_backing_dev *bdev, struct o_qlim *o)
{
unsigned int now, new, local, peer;
@ -1309,7 +1379,7 @@ void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_ba
if (new != now)
drbd_info(device, "max BIO size = %u\n", new);
drbd_setup_queue_param(device, bdev, new);
drbd_setup_queue_param(device, bdev, new, o);
}
/* Starts the worker thread */
@ -1542,7 +1612,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
drbd_bump_write_ordering(device->resource, NULL, WO_BDEV_FLUSH);
if (old_disk_conf->discard_zeroes_if_aligned != new_disk_conf->discard_zeroes_if_aligned)
drbd_reconsider_queue_parameters(device, device->ldev);
drbd_reconsider_queue_parameters(device, device->ldev, NULL);
drbd_md_sync(device);
@ -1922,7 +1992,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
device->read_cnt = 0;
device->writ_cnt = 0;
drbd_reconsider_queue_parameters(device, device->ldev);
drbd_reconsider_queue_parameters(device, device->ldev, NULL);
/* If I am currently not R_PRIMARY,
* but meta data primary indicator is set,

View File

@ -64,6 +64,11 @@ enum drbd_packet {
P_RS_THIN_REQ = 0x32, /* Request a block for resync or reply P_RS_DEALLOCATED */
P_RS_DEALLOCATED = 0x33, /* Contains only zeros on sync source node */
/* REQ_WRITE_SAME.
* On a receiving side without REQ_WRITE_SAME,
* we may fall back to an opencoded loop instead. */
P_WSAME = 0x34,
P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
P_MAX_OPT_CMD = 0x101,
@ -110,8 +115,11 @@ struct p_header100 {
u32 pad;
} __packed;
/* these defines must not be changed without changing the protocol version */
#define DP_HARDBARRIER 1 /* depricated */
/* These defines must not be changed without changing the protocol version.
* New defines may only be introduced together with protocol version bump or
* new protocol feature flags.
*/
#define DP_HARDBARRIER 1 /* no longer used */
#define DP_RW_SYNC 2 /* equals REQ_SYNC */
#define DP_MAY_SET_IN_SYNC 4
#define DP_UNPLUG 8 /* not used anymore */
@ -120,6 +128,7 @@ struct p_header100 {
#define DP_DISCARD 64 /* equals REQ_DISCARD */
#define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */
#define DP_SEND_WRITE_ACK 256 /* This is a proto C write request */
#define DP_WSAME 512 /* equiv. REQ_WRITE_SAME */
struct p_data {
u64 sector; /* 64 bits sector number */
@ -133,6 +142,11 @@ struct p_trim {
u32 size; /* == bio->bi_size */
} __packed;
struct p_wsame {
struct p_data p_data;
u32 size; /* == bio->bi_size */
} __packed;
/*
* commands which share a struct:
* p_block_ack:
@ -164,8 +178,23 @@ struct p_block_req {
* ReportParams
*/
#define FF_TRIM 1
#define FF_THIN_RESYNC 2
/* supports TRIM/DISCARD on the "wire" protocol */
#define DRBD_FF_TRIM 1
/* Detect all-zeros during resync, and rather TRIM/UNMAP/DISCARD those blocks
* instead of fully allocate a supposedly thin volume on initial resync */
#define DRBD_FF_THIN_RESYNC 2
/* supports REQ_WRITE_SAME on the "wire" protocol.
* Note: this flag is overloaded,
* its presence also
* - indicates support for 128 MiB "batch bios",
* max discard size of 128 MiB
* instead of 4M before that.
* - indicates that we exchange additional settings in p_sizes
* drbd_send_sizes()/receive_sizes()
*/
#define DRBD_FF_WSAME 4
struct p_connection_features {
u32 protocol_min;
@ -240,6 +269,40 @@ struct p_rs_uuid {
u64 uuid;
} __packed;
/* optional queue_limits if (agreed_features & DRBD_FF_WSAME)
* see also struct queue_limits, as of late 2015 */
struct o_qlim {
/* we don't need it yet, but we may as well communicate it now */
u32 physical_block_size;
/* so the original in struct queue_limits is unsigned short,
* but I'd have to put in padding anyways. */
u32 logical_block_size;
/* One incoming bio becomes one DRBD request,
* which may be translated to several bio on the receiving side.
* We don't need to communicate chunk/boundary/segment ... limits.
*/
/* various IO hints may be useful with "diskless client" setups */
u32 alignment_offset;
u32 io_min;
u32 io_opt;
/* We may need to communicate integrity stuff at some point,
* but let's not get ahead of ourselves. */
/* Backend discard capabilities.
* Receiving side uses "blkdev_issue_discard()", no need to communicate
* more specifics. If the backend cannot do discards, the DRBD peer
* may fall back to blkdev_issue_zeroout().
*/
u8 discard_enabled;
u8 discard_zeroes_data;
u8 write_same_capable;
u8 _pad;
} __packed;
struct p_sizes {
u64 d_size; /* size of disk */
u64 u_size; /* user requested size */
@ -247,6 +310,9 @@ struct p_sizes {
u32 max_bio_size; /* Maximal size of a BIO */
u16 queue_order_type; /* not yet implemented in DRBD*/
u16 dds_flags; /* use enum dds_flags here. */
/* optional queue_limits if (agreed_features & DRBD_FF_WSAME) */
struct o_qlim qlim[0];
} __packed;
struct p_state {

View File

@ -48,7 +48,7 @@
#include "drbd_req.h"
#include "drbd_vli.h"
#define PRO_FEATURES (FF_TRIM | FF_THIN_RESYNC)
#define PRO_FEATURES (DRBD_FF_TRIM|DRBD_FF_THIN_RESYNC|DRBD_FF_WSAME)
struct packet_info {
enum drbd_packet cmd;
@ -361,14 +361,17 @@ You must not have the req_lock:
drbd_wait_ee_list_empty()
*/
/* normal: payload_size == request size (bi_size)
* w_same: payload_size == logical_block_size
* trim: payload_size == 0 */
struct drbd_peer_request *
drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
unsigned int data_size, bool has_payload, gfp_t gfp_mask) __must_hold(local)
unsigned int request_size, unsigned int payload_size, gfp_t gfp_mask) __must_hold(local)
{
struct drbd_device *device = peer_device->device;
struct drbd_peer_request *peer_req;
struct page *page = NULL;
unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
unsigned nr_pages = (payload_size + PAGE_SIZE -1) >> PAGE_SHIFT;
if (drbd_insert_fault(device, DRBD_FAULT_AL_EE))
return NULL;
@ -380,7 +383,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
return NULL;
}
if (has_payload && data_size) {
if (nr_pages) {
page = drbd_alloc_pages(peer_device, nr_pages,
gfpflags_allow_blocking(gfp_mask));
if (!page)
@ -390,7 +393,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
memset(peer_req, 0, sizeof(*peer_req));
INIT_LIST_HEAD(&peer_req->w.list);
drbd_clear_interval(&peer_req->i);
peer_req->i.size = data_size;
peer_req->i.size = request_size;
peer_req->i.sector = sector;
peer_req->submit_jif = jiffies;
peer_req->peer_device = peer_device;
@ -1530,7 +1533,7 @@ static bool can_do_reliable_discards(struct drbd_device *device)
return can_do;
}
void drbd_issue_peer_discard(struct drbd_device *device, struct drbd_peer_request *peer_req)
static void drbd_issue_peer_discard(struct drbd_device *device, struct drbd_peer_request *peer_req)
{
/* If the backend cannot discard, or does not guarantee
* read-back zeroes in discarded ranges, we fall back to
@ -1545,6 +1548,18 @@ void drbd_issue_peer_discard(struct drbd_device *device, struct drbd_peer_reques
drbd_endio_write_sec_final(peer_req);
}
static void drbd_issue_peer_wsame(struct drbd_device *device,
struct drbd_peer_request *peer_req)
{
struct block_device *bdev = device->ldev->backing_bdev;
sector_t s = peer_req->i.sector;
sector_t nr = peer_req->i.size >> 9;
if (blkdev_issue_write_same(bdev, s, nr, GFP_NOIO, peer_req->pages))
peer_req->flags |= EE_WAS_ERROR;
drbd_endio_write_sec_final(peer_req);
}
/**
* drbd_submit_peer_request()
* @device: DRBD device.
@ -1582,7 +1597,7 @@ int drbd_submit_peer_request(struct drbd_device *device,
* Correctness first, performance later. Next step is to code an
* asynchronous variant of the same.
*/
if (peer_req->flags & EE_IS_TRIM) {
if (peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) {
/* wait for all pending IO completions, before we start
* zeroing things out. */
conn_wait_active_ee_empty(peer_req->peer_device->connection);
@ -1599,7 +1614,10 @@ int drbd_submit_peer_request(struct drbd_device *device,
spin_unlock_irq(&device->resource->req_lock);
}
drbd_issue_peer_discard(device, peer_req);
if (peer_req->flags & EE_IS_TRIM)
drbd_issue_peer_discard(device, peer_req);
else /* EE_WRITE_SAME */
drbd_issue_peer_wsame(device, peer_req);
return 0;
}
@ -1772,8 +1790,26 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
return 0;
}
/* quick wrapper in case payload size != request_size (write same) */
static void drbd_csum_ee_size(struct crypto_ahash *h,
struct drbd_peer_request *r, void *d,
unsigned int payload_size)
{
unsigned int tmp = r->i.size;
r->i.size = payload_size;
drbd_csum_ee(h, r, d);
r->i.size = tmp;
}
/* used from receive_RSDataReply (recv_resync_read)
* and from receive_Data */
* and from receive_Data.
* data_size: actual payload ("data in")
* for normal writes that is bi_size.
* for discards, that is zero.
* for write same, it is logical_block_size.
* both trim and write same have the bi_size ("data len to be affected")
* as extra argument in the packet header.
*/
static struct drbd_peer_request *
read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
struct packet_info *pi) __must_hold(local)
@ -1788,6 +1824,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
void *dig_vv = peer_device->connection->int_dig_vv;
unsigned long *data;
struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL;
struct p_trim *wsame = (pi->cmd == P_WSAME) ? pi->data : NULL;
digest_size = 0;
if (!trim && peer_device->connection->peer_integrity_tfm) {
@ -1802,38 +1839,60 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
data_size -= digest_size;
}
/* assume request_size == data_size, but special case trim and wsame. */
ds = data_size;
if (trim) {
D_ASSERT(peer_device, data_size == 0);
data_size = be32_to_cpu(trim->size);
if (!expect(data_size == 0))
return NULL;
ds = be32_to_cpu(trim->size);
} else if (wsame) {
if (data_size != queue_logical_block_size(device->rq_queue)) {
drbd_err(peer_device, "data size (%u) != drbd logical block size (%u)\n",
data_size, queue_logical_block_size(device->rq_queue));
return NULL;
}
if (data_size != bdev_logical_block_size(device->ldev->backing_bdev)) {
drbd_err(peer_device, "data size (%u) != backend logical block size (%u)\n",
data_size, bdev_logical_block_size(device->ldev->backing_bdev));
return NULL;
}
ds = be32_to_cpu(wsame->size);
}
if (!expect(IS_ALIGNED(data_size, 512)))
if (!expect(IS_ALIGNED(ds, 512)))
return NULL;
/* prepare for larger trim requests. */
if (!trim && !expect(data_size <= DRBD_MAX_BIO_SIZE))
if (trim || wsame) {
if (!expect(ds <= (DRBD_MAX_BBIO_SECTORS << 9)))
return NULL;
} else if (!expect(ds <= DRBD_MAX_BIO_SIZE))
return NULL;
/* even though we trust out peer,
* we sometimes have to double check. */
if (sector + (data_size>>9) > capacity) {
if (sector + (ds>>9) > capacity) {
drbd_err(device, "request from peer beyond end of local disk: "
"capacity: %llus < sector: %llus + size: %u\n",
(unsigned long long)capacity,
(unsigned long long)sector, data_size);
(unsigned long long)sector, ds);
return NULL;
}
/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
* "criss-cross" setup, that might cause write-out on some other DRBD,
* which in turn might block on the other node at this very place. */
peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, trim == NULL, GFP_NOIO);
peer_req = drbd_alloc_peer_req(peer_device, id, sector, ds, data_size, GFP_NOIO);
if (!peer_req)
return NULL;
peer_req->flags |= EE_WRITE;
if (trim)
if (trim) {
peer_req->flags |= EE_IS_TRIM;
return peer_req;
}
if (wsame)
peer_req->flags |= EE_WRITE_SAME;
/* receive payload size bytes into page chain */
ds = data_size;
page = peer_req->pages;
page_chain_for_each(page) {
@ -1853,7 +1912,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
}
if (digest_size) {
drbd_csum_ee(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv);
drbd_csum_ee_size(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv, data_size);
if (memcmp(dig_in, dig_vv, digest_size)) {
drbd_err(device, "Digest integrity check FAILED: %llus +%u\n",
(unsigned long long)sector, data_size);
@ -2517,7 +2576,6 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
op = wire_flags_to_bio_op(dp_flags);
op_flags = wire_flags_to_bio_flags(dp_flags);
if (pi->cmd == P_TRIM) {
peer_req->flags |= EE_IS_TRIM;
D_ASSERT(peer_device, peer_req->i.size > 0);
D_ASSERT(peer_device, op == REQ_OP_DISCARD);
D_ASSERT(peer_device, peer_req->pages == NULL);
@ -2584,11 +2642,11 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
update_peer_seq(peer_device, peer_seq);
spin_lock_irq(&device->resource->req_lock);
}
/* if we use the zeroout fallback code, we process synchronously
* and we wait for all pending requests, respectively wait for
/* TRIM and WRITE_SAME are processed synchronously,
* we wait for all pending requests, respectively wait for
* active_ee to become empty in drbd_submit_peer_request();
* better not add ourselves here. */
if ((peer_req->flags & EE_IS_TRIM) == 0)
if ((peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) == 0)
list_add_tail(&peer_req->w.list, &device->active_ee);
spin_unlock_irq(&device->resource->req_lock);
@ -2771,7 +2829,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
* "criss-cross" setup, that might cause write-out on some other DRBD,
* which in turn might block on the other node at this very place. */
peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size,
true /* has real payload */, GFP_NOIO);
size, GFP_NOIO);
if (!peer_req) {
put_ldev(device);
return -ENOMEM;
@ -3933,6 +3991,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
struct drbd_peer_device *peer_device;
struct drbd_device *device;
struct p_sizes *p = pi->data;
struct o_qlim *o = (connection->agreed_features & DRBD_FF_WSAME) ? p->qlim : NULL;
enum determine_dev_size dd = DS_UNCHANGED;
sector_t p_size, p_usize, p_csize, my_usize;
int ldsc = 0; /* local disk size changed */
@ -4016,7 +4075,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
ddsf = be16_to_cpu(p->dds_flags);
if (get_ldev(device)) {
drbd_reconsider_queue_parameters(device, device->ldev);
drbd_reconsider_queue_parameters(device, device->ldev, o);
dd = drbd_determine_dev_size(device, ddsf, NULL);
put_ldev(device);
if (dd == DS_ERROR)
@ -4036,7 +4095,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
* However, if he sends a zero current size,
* take his (user-capped or) backing disk size anyways.
*/
drbd_reconsider_queue_parameters(device, NULL);
drbd_reconsider_queue_parameters(device, NULL, o);
drbd_set_my_capacity(device, p_csize ?: p_usize ?: p_size);
}
@ -4792,7 +4851,7 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac
const int op = REQ_OP_DISCARD;
peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector,
size, false, GFP_NOIO);
size, 0, GFP_NOIO);
if (!peer_req) {
put_ldev(device);
return -ENOMEM;
@ -4837,7 +4896,7 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac
struct data_cmd {
int expect_payload;
size_t pkt_size;
unsigned int pkt_size;
int (*fn)(struct drbd_connection *, struct packet_info *);
};
@ -4869,7 +4928,7 @@ static struct data_cmd drbd_cmd_handler[] = {
[P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
[P_TRIM] = { 0, sizeof(struct p_trim), receive_Data },
[P_RS_DEALLOCATED] = { 0, sizeof(struct p_block_desc), receive_rs_deallocated },
[P_WSAME] = { 1, sizeof(struct p_wsame), receive_Data },
};
static void drbdd(struct drbd_connection *connection)
@ -4879,7 +4938,7 @@ static void drbdd(struct drbd_connection *connection)
int err;
while (get_t_state(&connection->receiver) == RUNNING) {
struct data_cmd *cmd;
struct data_cmd const *cmd;
drbd_thread_current_set_cpu(&connection->receiver);
update_receiver_timing_details(connection, drbd_recv_header);
@ -4894,11 +4953,18 @@ static void drbdd(struct drbd_connection *connection)
}
shs = cmd->pkt_size;
if (pi.cmd == P_SIZES && connection->agreed_features & DRBD_FF_WSAME)
shs += sizeof(struct o_qlim);
if (pi.size > shs && !cmd->expect_payload) {
drbd_err(connection, "No payload expected %s l:%d\n",
cmdname(pi.cmd), pi.size);
goto err_out;
}
if (pi.size < shs) {
drbd_err(connection, "%s: unexpected packet size, expected:%d received:%d\n",
cmdname(pi.cmd), (int)shs, pi.size);
goto err_out;
}
if (shs) {
update_receiver_timing_details(connection, drbd_recv_all_warn);
@ -5145,11 +5211,12 @@ static int drbd_do_features(struct drbd_connection *connection)
drbd_info(connection, "Handshake successful: "
"Agreed network protocol version %d\n", connection->agreed_pro_version);
drbd_info(connection, "Agreed to%ssupport TRIM on protocol level\n",
connection->agreed_features & FF_TRIM ? " " : " not ");
drbd_info(connection, "Agreed to%ssupport THIN_RESYNC on protocol level\n",
connection->agreed_features & FF_THIN_RESYNC ? " " : " not ");
drbd_info(connection, "Feature flags enabled on protocol level: 0x%x%s%s%s.\n",
connection->agreed_features,
connection->agreed_features & DRBD_FF_TRIM ? " TRIM" : "",
connection->agreed_features & DRBD_FF_THIN_RESYNC ? " THIN_RESYNC" : "",
connection->agreed_features & DRBD_FF_WSAME ? " WRITE_SAME" :
connection->agreed_features ? "" : " none");
return 1;

View File

@ -47,8 +47,7 @@ static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *r
&device->vdisk->part0, req->start_jif);
}
static struct drbd_request *drbd_req_new(struct drbd_device *device,
struct bio *bio_src)
static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio *bio_src)
{
struct drbd_request *req;
@ -58,10 +57,12 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device,
memset(req, 0, sizeof(*req));
drbd_req_make_private_bio(req, bio_src);
req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0;
req->device = device;
req->master_bio = bio_src;
req->epoch = 0;
req->rq_state = (bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0)
| (bio_op(bio_src) == REQ_OP_WRITE_SAME ? RQ_WSAME : 0)
| (bio_op(bio_src) == REQ_OP_DISCARD ? RQ_UNMAP : 0);
req->device = device;
req->master_bio = bio_src;
req->epoch = 0;
drbd_clear_interval(&req->i);
req->i.sector = bio_src->bi_iter.bi_sector;

View File

@ -206,6 +206,8 @@ enum drbd_req_state_bits {
/* Set when this is a write, clear for a read */
__RQ_WRITE,
__RQ_WSAME,
__RQ_UNMAP,
/* Should call drbd_al_complete_io() for this request... */
__RQ_IN_ACT_LOG,
@ -241,10 +243,11 @@ enum drbd_req_state_bits {
#define RQ_NET_OK (1UL << __RQ_NET_OK)
#define RQ_NET_SIS (1UL << __RQ_NET_SIS)
/* 0x1f8 */
#define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK)
#define RQ_WRITE (1UL << __RQ_WRITE)
#define RQ_WSAME (1UL << __RQ_WSAME)
#define RQ_UNMAP (1UL << __RQ_UNMAP)
#define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG)
#define RQ_POSTPONED (1UL << __RQ_POSTPONED)
#define RQ_COMPLETION_SUSP (1UL << __RQ_COMPLETION_SUSP)

View File

@ -320,6 +320,10 @@ void drbd_csum_bio(struct crypto_ahash *tfm, struct bio *bio, void *digest)
sg_set_page(&sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
ahash_request_set_crypt(req, &sg, NULL, sg.length);
crypto_ahash_update(req);
/* REQ_OP_WRITE_SAME has only one segment,
* checksum the payload only once. */
if (bio_op(bio) == REQ_OP_WRITE_SAME)
break;
}
ahash_request_set_crypt(req, NULL, digest, 0);
crypto_ahash_final(req);
@ -387,7 +391,7 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector,
/* GFP_TRY, because if there is no memory available right now, this may
* be rescheduled for later. It is "only" background resync, after all. */
peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector,
size, true /* has real payload */, GFP_TRY);
size, size, GFP_TRY);
if (!peer_req)
goto defer;
@ -603,7 +607,7 @@ static int make_resync_request(struct drbd_device *const device, int cancel)
return 0;
}
if (connection->agreed_features & FF_THIN_RESYNC) {
if (connection->agreed_features & DRBD_FF_THIN_RESYNC) {
rcu_read_lock();
discard_granularity = rcu_dereference(device->ldev->disk_conf)->rs_discard_granularity;
rcu_read_unlock();