NFS client updates for Linux 4.2

Highlights include:
 
 Stable patches:
 - Fix a crash in the NFSv4 file locking code.
 - Fix an fsync() regression, where we were failing to retry I/O in some
   circumstances.
 - Fix an infinite loop in NFSv4.0 OPEN stateid recovery
 - Fix a memory leak when an attempted pnfs fails.
 - Fix a memory leak in the backchannel code
 - Large hostnames were not supported correctly in NFSv4.1
 - Fix a pNFS/flexfiles bug that was impeding error reporting on I/O.
 - Fix a couple of credential issues in pNFS/flexfiles
 
 Bugfixes + cleanups:
 - Open flag sanity checks in the NFSv4 atomic open codepath
 - More NFSv4 delegation related bugfixes
 - Various NFSv4.1 backchannel bugfixes and cleanups
 - Fix the NFS swap socket code
 - Various cleanups of the NFSv4 SETCLIENTID and EXCHANGE_ID code
 - Fix a UDP transport deadlock issue
 
 Features:
 - More RDMA client transport improvements
 - NFSv4.2 LAYOUTSTATS functionality for pnfs flexfiles.
 -----BEGIN PGP SIGNATURE-----
 Version: GnuPG v1
 
 iQIcBAABAgAGBQJVlWQgAAoJEGcL54qWCgDyXtcP/2Y3HJ9xu5qU3Bo/jzCAw4E1
 jPPMSFAz4kqy/LGoslyc1cNDEiKGzJYWU8TtCGI3KAyNxb6n3pT1mEE1tvIsSdis
 D8bpV13M452PPpZYrBawIf4+OuohXmuYHpFiVNSpLbH3Uo7dthvFFnbqCGaGlnqY
 rXYZHAnx637OGBcJsT4AXCUz12ILvxMYRnqwW6Xn+j9JmwR1coQX3v8W8e7SMf6i
 J+zOny7Uetjrg1U9C9uQB6ZvIoxUMo9QOVmtGCwsBl8lM3fLmzaQfcUf9fm76pMT
 yTrKJs4jBLvVf00bRHFDv9EHWCy97oqCkeQEw1EY2lnxp/lmM5SiI4zQqjbf0QTW
 5VQScT1MK6xwHoUbuI/sYdXXR8KGDVT1xCFFHUNcg69CvgqdgWslPQY7xLJMvUJZ
 vBWfWDd8ppdCw2ZVX4ae/bnhfc+/mVh4wRPF7tgVAjT0pobBV9xMOeMkF4mo76Wa
 pvo/nTRMt68hpESVSvq9dYEMVhy5haqFhPrSbyAGOpT4SE2V3RCCZQfhu15TMKdW
 BdvItG+mdAVPbIHqhx7vRdAudcOEZKyxbFA+l3E5FyCAXLV7XS3M8CEl3P1w7gmm
 Ccr8DW9abKFJf1RAKdX3stexIoJLGTwciSMR5smsbup/xNcx/fRgx2f1w31JMPxb
 kG3Izfk25w9uGSsbR39D
 =AREr
 -----END PGP SIGNATURE-----

Merge tag 'nfs-for-4.2-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs

Pull NFS client updates from Trond Myklebust:
 "Highlights include:

  Stable patches:
   - Fix a crash in the NFSv4 file locking code.
   - Fix an fsync() regression, where we were failing to retry I/O in
     some circumstances.
   - Fix an infinite loop in NFSv4.0 OPEN stateid recovery
   - Fix a memory leak when an attempted pNFS fails.
   - Fix a memory leak in the backchannel code
   - Large hostnames were not supported correctly in NFSv4.1
   - Fix a pNFS/flexfiles bug that was impeding error reporting on I/O.
   - Fix a couple of credential issues in pNFS/flexfiles

  Bugfixes + cleanups:
   - Open flag sanity checks in the NFSv4 atomic open codepath
   - More NFSv4 delegation related bugfixes
   - Various NFSv4.1 backchannel bugfixes and cleanups
   - Fix the NFS swap socket code
   - Various cleanups of the NFSv4 SETCLIENTID and EXCHANGE_ID code
   - Fix a UDP transport deadlock issue

  Features:
   - More RDMA client transport improvements
   - NFSv4.2 LAYOUTSTATS functionality for pNFS/flexfiles"

* tag 'nfs-for-4.2-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (87 commits)
  nfs: Remove invalid tk_pid from debug message
  nfs: Remove invalid NFS_ATTR_FATTR_V4_REFERRAL checking in nfs4_get_rootfh
  nfs: Drop bad comment in nfs41_walk_client_list()
  nfs: Remove unneeded micro checking of CONFIG_PROC_FS
  nfs: Don't setting FILE_CREATED flags always
  nfs: Use remove_proc_subtree() instead remove_proc_entry()
  nfs: Remove unused argument in nfs_server_set_fsinfo()
  nfs: Fix a memory leak when meeting an unsupported state protect
  nfs: take extra reference to fl->fl_file when running a LOCKU operation
  NFSv4: When returning a delegation, don't reclaim an incompatible open mode.
  NFSv4.2: LAYOUTSTATS is optional to implement
  NFSv4.2: Fix up a decoding error in layoutstats
  pNFS/flexfiles: Fix the reset of struct pgio_header when resending
  pNFS/flexfiles: Turn off layoutcommit for servers that don't need it
  pnfs/flexfiles: protect ktime manipulation with mirror lock
  nfs: provide pnfs_report_layoutstat when NFS42 is disabled
  nfs: verify open flags before allowing open
  nfs: always update creds in mirror, even when we have an already connected ds
  nfs: fix potential credential leak in ff_layout_update_mirror_cred
  pnfs/flexfiles: report layoutstat regularly
  ...
This commit is contained in:
Linus Torvalds 2015-07-02 11:32:23 -07:00
commit 8688d9540c
51 changed files with 1859 additions and 740 deletions

View File

@ -458,7 +458,7 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
* pg_authenticate method for nfsv4 callback threads.
*
* The authflavor has been negotiated, so an incorrect flavor is a server
* bug. Drop packets with incorrect authflavor.
* bug. Deny packets with incorrect authflavor.
*
* All other checking done after NFS decoding where the nfs_client can be
* found in nfs4_callback_compound
@ -468,12 +468,12 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp)
switch (rqstp->rq_authop->flavour) {
case RPC_AUTH_NULL:
if (rqstp->rq_proc != CB_NULL)
return SVC_DROP;
return SVC_DENIED;
break;
case RPC_AUTH_GSS:
/* No RPC_AUTH_GSS support yet in NFSv4.1 */
if (svc_is_backchannel(rqstp))
return SVC_DROP;
return SVC_DENIED;
}
return SVC_OK;
}

View File

@ -327,10 +327,8 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
dprintk("%s slot table seqid: %u\n", __func__, slot->seq_nr);
/* Normal */
if (likely(args->csa_sequenceid == slot->seq_nr + 1)) {
slot->seq_nr++;
if (likely(args->csa_sequenceid == slot->seq_nr + 1))
goto out_ok;
}
/* Replay */
if (args->csa_sequenceid == slot->seq_nr) {
@ -418,6 +416,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
struct cb_process_state *cps)
{
struct nfs4_slot_table *tbl;
struct nfs4_slot *slot;
struct nfs_client *clp;
int i;
__be32 status = htonl(NFS4ERR_BADSESSION);
@ -429,25 +428,32 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
if (!(clp->cl_session->flags & SESSION4_BACK_CHAN))
goto out;
tbl = &clp->cl_session->bc_slot_table;
slot = tbl->slots + args->csa_slotid;
spin_lock(&tbl->slot_tbl_lock);
/* state manager is resetting the session */
if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) {
spin_unlock(&tbl->slot_tbl_lock);
status = htonl(NFS4ERR_DELAY);
/* Return NFS4ERR_BADSESSION if we're draining the session
* in order to reset it.
*/
if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
status = htonl(NFS4ERR_BADSESSION);
goto out;
goto out_unlock;
}
status = validate_seqid(&clp->cl_session->bc_slot_table, args);
spin_unlock(&tbl->slot_tbl_lock);
memcpy(&res->csr_sessionid, &args->csa_sessionid,
sizeof(res->csr_sessionid));
res->csr_sequenceid = args->csa_sequenceid;
res->csr_slotid = args->csa_slotid;
res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
status = validate_seqid(tbl, args);
if (status)
goto out;
goto out_unlock;
cps->slotid = args->csa_slotid;
@ -458,15 +464,17 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
*/
if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) {
status = htonl(NFS4ERR_DELAY);
goto out;
goto out_unlock;
}
memcpy(&res->csr_sessionid, &args->csa_sessionid,
sizeof(res->csr_sessionid));
res->csr_sequenceid = args->csa_sequenceid;
res->csr_slotid = args->csa_slotid;
res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
/*
* RFC5661 20.9.3
* If CB_SEQUENCE returns an error, then the state of the slot
* (sequence ID, cached reply) MUST NOT change.
*/
slot->seq_nr++;
out_unlock:
spin_unlock(&tbl->slot_tbl_lock);
out:
cps->clp = clp; /* put in nfs4_callback_compound */

View File

@ -909,7 +909,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
xdr_init_encode(&xdr_out, &rqstp->rq_res, p);
status = decode_compound_hdr_arg(&xdr_in, &hdr_arg);
if (status == __constant_htonl(NFS4ERR_RESOURCE))
if (status == htonl(NFS4ERR_RESOURCE))
return rpc_garbage_args;
if (hdr_arg.minorversion == 0) {

View File

@ -825,7 +825,6 @@ static int nfs_init_server(struct nfs_server *server,
* Load up the server record from information gained in an fsinfo record
*/
static void nfs_server_set_fsinfo(struct nfs_server *server,
struct nfs_fh *mntfh,
struct nfs_fsinfo *fsinfo)
{
unsigned long max_rpc_payload;
@ -901,7 +900,7 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs
if (error < 0)
goto out_error;
nfs_server_set_fsinfo(server, mntfh, &fsinfo);
nfs_server_set_fsinfo(server, &fsinfo);
/* Get some general file system info */
if (server->namelen == 0) {
@ -1193,8 +1192,6 @@ void nfs_clients_init(struct net *net)
}
#ifdef CONFIG_PROC_FS
static struct proc_dir_entry *proc_fs_nfs;
static int nfs_server_list_open(struct inode *inode, struct file *file);
static void *nfs_server_list_start(struct seq_file *p, loff_t *pos);
static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos);
@ -1364,27 +1361,29 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
{
struct nfs_server *server;
struct nfs_client *clp;
char dev[8], fsid[17];
char dev[13]; // 8 for 2^24, 1 for ':', 3 for 2^8, 1 for '\0'
char fsid[34]; // 2 * 16 for %llx, 1 for ':', 1 for '\0'
struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
/* display header on line 1 */
if (v == &nn->nfs_volume_list) {
seq_puts(m, "NV SERVER PORT DEV FSID FSC\n");
seq_puts(m, "NV SERVER PORT DEV FSID"
" FSC\n");
return 0;
}
/* display one transport per line on subsequent lines */
server = list_entry(v, struct nfs_server, master_link);
clp = server->nfs_client;
snprintf(dev, 8, "%u:%u",
snprintf(dev, sizeof(dev), "%u:%u",
MAJOR(server->s_dev), MINOR(server->s_dev));
snprintf(fsid, 17, "%llx:%llx",
snprintf(fsid, sizeof(fsid), "%llx:%llx",
(unsigned long long) server->fsid.major,
(unsigned long long) server->fsid.minor);
rcu_read_lock();
seq_printf(m, "v%u %s %s %-7s %-17s %s\n",
seq_printf(m, "v%u %s %s %-12s %-33s %s\n",
clp->rpc_ops->version,
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR),
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT),
@ -1434,27 +1433,20 @@ void nfs_fs_proc_net_exit(struct net *net)
*/
int __init nfs_fs_proc_init(void)
{
struct proc_dir_entry *p;
proc_fs_nfs = proc_mkdir("fs/nfsfs", NULL);
if (!proc_fs_nfs)
if (!proc_mkdir("fs/nfsfs", NULL))
goto error_0;
/* a file of servers with which we're dealing */
p = proc_symlink("servers", proc_fs_nfs, "../../net/nfsfs/servers");
if (!p)
if (!proc_symlink("fs/nfsfs/servers", NULL, "../../net/nfsfs/servers"))
goto error_1;
/* a file of volumes that we have mounted */
p = proc_symlink("volumes", proc_fs_nfs, "../../net/nfsfs/volumes");
if (!p)
goto error_2;
return 0;
if (!proc_symlink("fs/nfsfs/volumes", NULL, "../../net/nfsfs/volumes"))
goto error_1;
error_2:
remove_proc_entry("servers", proc_fs_nfs);
return 0;
error_1:
remove_proc_entry("fs/nfsfs", NULL);
remove_proc_subtree("fs/nfsfs", NULL);
error_0:
return -ENOMEM;
}
@ -1464,9 +1456,7 @@ int __init nfs_fs_proc_init(void)
*/
void nfs_fs_proc_exit(void)
{
remove_proc_entry("volumes", proc_fs_nfs);
remove_proc_entry("servers", proc_fs_nfs);
remove_proc_entry("fs/nfsfs", NULL);
remove_proc_subtree("fs/nfsfs", NULL);
}
#endif /* CONFIG_PROC_FS */

View File

@ -1470,9 +1470,6 @@ static int nfs_finish_open(struct nfs_open_context *ctx,
{
int err;
if ((open_flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
*opened |= FILE_CREATED;
err = finish_open(file, dentry, do_open, opened);
if (err)
goto out;

View File

@ -555,31 +555,22 @@ static int nfs_launder_page(struct page *page)
return nfs_wb_page(inode, page);
}
#ifdef CONFIG_NFS_SWAP
static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
sector_t *span)
{
int ret;
struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
*span = sis->pages;
rcu_read_lock();
ret = xs_swapper(rcu_dereference(clnt->cl_xprt), 1);
rcu_read_unlock();
return ret;
return rpc_clnt_swap_activate(clnt);
}
static void nfs_swap_deactivate(struct file *file)
{
struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
rcu_read_lock();
xs_swapper(rcu_dereference(clnt->cl_xprt), 0);
rcu_read_unlock();
rpc_clnt_swap_deactivate(clnt);
}
#endif
const struct address_space_operations nfs_file_aops = {
.readpage = nfs_readpage,
@ -596,10 +587,8 @@ const struct address_space_operations nfs_file_aops = {
.launder_page = nfs_launder_page,
.is_dirty_writeback = nfs_check_dirty_writeback,
.error_remove_page = generic_error_remove_page,
#ifdef CONFIG_NFS_SWAP
.swap_activate = nfs_swap_activate,
.swap_deactivate = nfs_swap_deactivate,
#endif
};
/*

View File

@ -20,6 +20,7 @@
#include "../nfs4trace.h"
#include "../iostat.h"
#include "../nfs.h"
#include "../nfs42.h"
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
@ -182,17 +183,14 @@ static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
{
struct nfs4_ff_layout_mirror *tmp;
int i, j;
for (i = 0; i < fls->mirror_array_cnt - 1; i++) {
for (j = i + 1; j < fls->mirror_array_cnt; j++)
if (fls->mirror_array[i]->efficiency <
fls->mirror_array[j]->efficiency) {
tmp = fls->mirror_array[i];
fls->mirror_array[i] = fls->mirror_array[j];
fls->mirror_array[j] = tmp;
}
fls->mirror_array[j]->efficiency)
swap(fls->mirror_array[i],
fls->mirror_array[j]);
}
}
@ -274,6 +272,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
spin_lock_init(&fls->mirror_array[i]->lock);
fls->mirror_array[i]->ds_count = ds_count;
fls->mirror_array[i]->lseg = &fls->generic_hdr;
/* deviceid */
rc = decode_deviceid(&stream, &devid);
@ -344,6 +343,10 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
fls->mirror_array[i]->gid);
}
p = xdr_inline_decode(&stream, 4);
if (p)
fls->flags = be32_to_cpup(p);
ff_layout_sort_mirrors(fls);
rc = ff_layout_check_layout(lgr);
if (rc)
@ -415,6 +418,146 @@ ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls)
return 1;
}
/*
 * Account for one more in-flight I/O on @timer.  When the counter reaches
 * one (i.e. nothing was outstanding before), a new busy period begins and
 * its start time is stamped.
 */
static void
nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer)
{
	/* Only the first outstanding request opens a new busy period. */
	if (atomic_inc_return(&timer->n_ops) != 1)
		return;

	timer->start_time = ktime_get();
}
/*
 * Account for the completion of one I/O on @timer.
 *
 * Returns the time elapsed since timer->start_time and restarts the
 * measurement from "now", so that back-to-back completions do not count
 * the same interval twice.  Warns once if the in-flight counter drops
 * below zero.
 */
static ktime_t
nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer)
{
	ktime_t now, elapsed;

	if (atomic_dec_return(&timer->n_ops) < 0)
		WARN_ON_ONCE(1);

	now = ktime_get();
	elapsed = ktime_sub(now, timer->start_time);
	timer->start_time = now;

	return elapsed;
}
/* Time elapsed between @task's tk_start stamp and the current time. */
static ktime_t
nfs4_ff_layout_calc_completion_time(struct rpc_task *task)
{
	ktime_t now = ktime_get();

	return ktime_sub(now, task->tk_start);
}
static bool
nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
struct nfs4_ff_layoutstat *layoutstat)
{
static const ktime_t notime = {0};
ktime_t now = ktime_get();
nfs4_ff_start_busy_timer(&layoutstat->busy_timer);
if (ktime_equal(mirror->start_time, notime))
mirror->start_time = now;
if (ktime_equal(mirror->last_report_time, notime))
mirror->last_report_time = now;
if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >=
FF_LAYOUTSTATS_REPORT_INTERVAL) {
mirror->last_report_time = now;
return true;
}
return false;
}
/* Count one more requested operation of @requested bytes in @layoutstat. */
static void
nfs4_ff_layout_stat_io_update_requested(struct nfs4_ff_layoutstat *layoutstat,
					__u64 requested)
{
	struct nfs4_ff_io_stat *io = &layoutstat->io_stat;

	io->bytes_requested += requested;
	io->ops_requested++;
}
/*
 * Fold one finished operation into @layoutstat.
 *
 * @requested:      bytes that were asked for
 * @completed:      bytes that were actually transferred
 * @time_completed: how long this operation took
 *
 * The difference between @requested and @completed is accumulated in
 * bytes_not_delivered, and the busy timer is ended so that the elapsed
 * busy interval can be added to total_busy_time.
 */
static void
nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat,
					__u64 requested,
					__u64 completed,
					ktime_t time_completed)
{
	struct nfs4_ff_io_stat *io = &layoutstat->io_stat;
	ktime_t busy;

	io->ops_completed++;
	io->bytes_completed += completed;
	io->bytes_not_delivered += requested - completed;

	busy = nfs4_ff_end_busy_timer(&layoutstat->busy_timer);
	io->total_busy_time = ktime_add(io->total_busy_time, busy);
	io->aggregate_completion_time =
		ktime_add(io->aggregate_completion_time, time_completed);
}
/*
 * Note the start of a READ of @requested bytes against @mirror and, when
 * the reporting interval has elapsed, trigger a layoutstats report for the
 * inode that owns the layout.
 */
static void
nfs4_ff_layout_stat_io_start_read(struct nfs4_ff_layout_mirror *mirror,
				  __u64 requested)
{
	struct nfs4_ff_layoutstat *stat = &mirror->read_stat;
	bool due;

	spin_lock(&mirror->lock);
	due = nfs4_ff_layoutstat_start_io(mirror, stat);
	nfs4_ff_layout_stat_io_update_requested(stat, requested);
	spin_unlock(&mirror->lock);

	/* The report is issued only after the mirror lock has been dropped. */
	if (!due)
		return;
	pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode);
}
/*
 * Note the completion of a READ against @mirror: @requested bytes were
 * asked for, @completed bytes were returned, and the elapsed time is
 * derived from @task.
 */
static void
nfs4_ff_layout_stat_io_end_read(struct rpc_task *task,
				struct nfs4_ff_layout_mirror *mirror,
				__u64 requested,
				__u64 completed)
{
	ktime_t elapsed;

	spin_lock(&mirror->lock);
	elapsed = nfs4_ff_layout_calc_completion_time(task);
	nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat,
						requested, completed,
						elapsed);
	spin_unlock(&mirror->lock);
}
/*
 * Note the start of a WRITE of @requested bytes against @mirror and, when
 * the reporting interval has elapsed, trigger a layoutstats report for the
 * inode that owns the layout.
 */
static void
nfs4_ff_layout_stat_io_start_write(struct nfs4_ff_layout_mirror *mirror,
				   __u64 requested)
{
	struct nfs4_ff_layoutstat *stat = &mirror->write_stat;
	bool due;

	spin_lock(&mirror->lock);
	due = nfs4_ff_layoutstat_start_io(mirror, stat);
	nfs4_ff_layout_stat_io_update_requested(stat, requested);
	spin_unlock(&mirror->lock);

	/* The report is issued only after the mirror lock has been dropped. */
	if (!due)
		return;
	pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode);
}
/*
 * Note the completion of a WRITE (or COMMIT) against @mirror.
 *
 * When @committed is NFS_UNSTABLE the byte counts are zeroed before being
 * recorded, so only the operation count and the timings are updated for
 * that call.
 */
static void
nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
				 struct nfs4_ff_layout_mirror *mirror,
				 __u64 requested,
				 __u64 completed,
				 enum nfs3_stable_how committed)
{
	ktime_t elapsed;

	if (committed == NFS_UNSTABLE) {
		requested = 0;
		completed = 0;
	}

	spin_lock(&mirror->lock);
	elapsed = nfs4_ff_layout_calc_completion_time(task);
	nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat,
						requested, completed,
						elapsed);
	spin_unlock(&mirror->lock);
}
static int
ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
struct nfs_commit_info *cinfo,
@ -631,7 +774,7 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
nfs_direct_set_resched_writes(hdr->dreq);
/* fake unstable write to let common nfs resend pages */
hdr->verf.committed = NFS_UNSTABLE;
hdr->good_bytes = 0;
hdr->good_bytes = hdr->args.count;
}
return;
}
@ -879,6 +1022,12 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
return 0;
}
static bool
ff_layout_need_layoutcommit(struct pnfs_layout_segment *lseg)
{
return !(FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_LAYOUTCOMMIT);
}
/*
* We reference the rpc_cred of the first WRITE that triggers the need for
* a LAYOUTCOMMIT, and use it to send the layoutcommit compound.
@ -891,6 +1040,9 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
static void
ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr)
{
if (!ff_layout_need_layoutcommit(hdr->lseg))
return;
pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
hdr->mds_offset + hdr->res.count);
dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
@ -909,6 +1061,10 @@ ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
static int ff_layout_read_prepare_common(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
nfs4_ff_layout_stat_io_start_read(
FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
hdr->args.count);
if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
rpc_exit(task, -EIO);
return -EIO;
@ -962,15 +1118,15 @@ static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data)
{
struct nfs_pgio_header *hdr = data;
if (ff_layout_read_prepare_common(task, hdr))
return;
if (ff_layout_setup_sequence(hdr->ds_clp,
&hdr->args.seq_args,
&hdr->res.seq_res,
task))
return;
if (ff_layout_read_prepare_common(task, hdr))
return;
if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
hdr->args.lock_context, FMODE_READ) == -EIO)
rpc_exit(task, -EIO); /* lost lock, terminate I/O */
@ -982,6 +1138,10 @@ static void ff_layout_read_call_done(struct rpc_task *task, void *data)
dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
nfs4_ff_layout_stat_io_end_read(task,
FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
hdr->args.count, hdr->res.count);
if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
task->tk_status == 0) {
nfs4_sequence_done(task, &hdr->res.seq_res);
@ -1074,7 +1234,8 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
return -EAGAIN;
}
if (data->verf.committed == NFS_UNSTABLE)
if (data->verf.committed == NFS_UNSTABLE
&& ff_layout_need_layoutcommit(data->lseg))
pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
return 0;
@ -1083,6 +1244,10 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
static int ff_layout_write_prepare_common(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
nfs4_ff_layout_stat_io_start_write(
FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
hdr->args.count);
if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
rpc_exit(task, -EIO);
return -EIO;
@ -1116,15 +1281,15 @@ static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data)
{
struct nfs_pgio_header *hdr = data;
if (ff_layout_write_prepare_common(task, hdr))
return;
if (ff_layout_setup_sequence(hdr->ds_clp,
&hdr->args.seq_args,
&hdr->res.seq_res,
task))
return;
if (ff_layout_write_prepare_common(task, hdr))
return;
if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
hdr->args.lock_context, FMODE_WRITE) == -EIO)
rpc_exit(task, -EIO); /* lost lock, terminate I/O */
@ -1134,6 +1299,11 @@ static void ff_layout_write_call_done(struct rpc_task *task, void *data)
{
struct nfs_pgio_header *hdr = data;
nfs4_ff_layout_stat_io_end_write(task,
FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
hdr->args.count, hdr->res.count,
hdr->res.verf->committed);
if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
task->tk_status == 0) {
nfs4_sequence_done(task, &hdr->res.seq_res);
@ -1152,8 +1322,17 @@ static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
&NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]);
}
/*
 * Shared prepare step for COMMIT: record the start of the operation in the
 * write statistics of the mirror selected by cdata->ds_commit_index, with
 * a requested byte count of zero.
 */
static void ff_layout_commit_prepare_common(struct rpc_task *task,
		struct nfs_commit_data *cdata)
{
	struct nfs4_ff_layout_mirror *mirror =
		FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index);

	nfs4_ff_layout_stat_io_start_write(mirror, 0);
}
/*
 * rpc_call_prepare callback of the v3 commit call ops: records the start of
 * the COMMIT in the layout statistics, then starts the RPC call.
 */
static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
{
/* @data is the struct nfs_commit_data of this COMMIT */
ff_layout_commit_prepare_common(task, data);
rpc_call_start(task);
}
@ -1161,10 +1340,30 @@ static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data)
{
struct nfs_commit_data *wdata = data;
ff_layout_setup_sequence(wdata->ds_clp,
if (ff_layout_setup_sequence(wdata->ds_clp,
&wdata->args.seq_args,
&wdata->res.seq_res,
task);
task))
return;
ff_layout_commit_prepare_common(task, data);
}
static void ff_layout_commit_done(struct rpc_task *task, void *data)
{
struct nfs_commit_data *cdata = data;
struct nfs_page *req;
__u64 count = 0;
if (task->tk_status == 0) {
list_for_each_entry(req, &cdata->pages, wb_list)
count += req->wb_bytes;
}
nfs4_ff_layout_stat_io_end_write(task,
FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
count, count, NFS_FILE_SYNC);
pnfs_generic_write_commit_done(task, data);
}
static void ff_layout_commit_count_stats(struct rpc_task *task, void *data)
@ -1205,14 +1404,14 @@ static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = {
.rpc_call_prepare = ff_layout_commit_prepare_v3,
.rpc_call_done = pnfs_generic_write_commit_done,
.rpc_call_done = ff_layout_commit_done,
.rpc_count_stats = ff_layout_commit_count_stats,
.rpc_release = pnfs_generic_commit_release,
};
static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = {
.rpc_call_prepare = ff_layout_commit_prepare_v4,
.rpc_call_done = pnfs_generic_write_commit_done,
.rpc_call_done = ff_layout_commit_done,
.rpc_count_stats = ff_layout_commit_count_stats,
.rpc_release = pnfs_generic_commit_release,
};
@ -1256,7 +1455,6 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
if (fh)
hdr->args.fh = fh;
/*
* Note that if we ever decide to split across DSes,
* then we may need to handle dense-like offsets.
@ -1385,6 +1583,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
if (fh)
data->args.fh = fh;
return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
vers == 3 ? &ff_layout_commit_call_ops_v3 :
&ff_layout_commit_call_ops_v4,
@ -1488,6 +1687,247 @@ ff_layout_encode_layoutreturn(struct pnfs_layout_hdr *lo,
dprintk("%s: Return\n", __func__);
}
/*
 * Format the IPv4 address held in @sap into @buf (at most @buflen bytes).
 * Returns the snprintf() result.
 */
static int
ff_layout_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen)
{
	const struct sockaddr_in *in4 = (const struct sockaddr_in *)sap;
	int n;

	n = snprintf(buf, buflen, "%pI4", &in4->sin_addr);
	return n;
}
/*
 * Format the IPv6 address held in @sap into @buf (at most @buflen bytes),
 * without a scope id.  Returns the snprintf() result.
 *
 * The presentation forms follow RFC 4291:
 *   - Section 2.2.2: shorthand "::" for the ANY address and "::1" for the
 *     loopback address;
 *   - Section 2.2.3: "::ffff:a.b.c.d" for v4-mapped addresses;
 *   - Section 2.2.1: the regular form for everything else.
 */
static size_t
ff_layout_ntop6_noscopeid(const struct sockaddr *sap, char *buf,
			  const int buflen)
{
	const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
	const struct in6_addr *addr = &sin6->sin6_addr;
	int n;

	if (ipv6_addr_any(addr))
		n = snprintf(buf, buflen, "::");
	else if (ipv6_addr_loopback(addr))
		n = snprintf(buf, buflen, "::1");
	else if (ipv6_addr_v4mapped(addr))
		n = snprintf(buf, buflen, "::ffff:%pI4",
			     &addr->s6_addr32[3]);
	else
		n = snprintf(buf, buflen, "%pI6c", addr);

	return n;
}
/* Derived from rpc_sockaddr2uaddr */
static void
ff_layout_encode_netaddr(struct xdr_stream *xdr, struct nfs4_pnfs_ds_addr *da)
{
struct sockaddr *sap = (struct sockaddr *)&da->da_addr;
char portbuf[RPCBIND_MAXUADDRPLEN];
char addrbuf[RPCBIND_MAXUADDRLEN];
char *netid;
unsigned short port;
int len, netid_len;
__be32 *p;
switch (sap->sa_family) {
case AF_INET:
if (ff_layout_ntop4(sap, addrbuf, sizeof(addrbuf)) == 0)
return;
port = ntohs(((struct sockaddr_in *)sap)->sin_port);
netid = "tcp";
netid_len = 3;
break;
case AF_INET6:
if (ff_layout_ntop6_noscopeid(sap, addrbuf, sizeof(addrbuf)) == 0)
return;
port = ntohs(((struct sockaddr_in6 *)sap)->sin6_port);
netid = "tcp6";
netid_len = 4;
break;
default:
/* we only support tcp and tcp6 */
WARN_ON_ONCE(1);
return;
}
snprintf(portbuf, sizeof(portbuf), ".%u.%u", port >> 8, port & 0xff);
len = strlcat(addrbuf, portbuf, sizeof(addrbuf));
p = xdr_reserve_space(xdr, 4 + netid_len);
xdr_encode_opaque(p, netid, netid_len);
p = xdr_reserve_space(xdr, 4 + len);
xdr_encode_opaque(p, addrbuf, len);
}
/*
 * Encode @t as an nfstime4: a 64-bit seconds value followed by a 32-bit
 * nanoseconds value (12 bytes in total).
 */
static void
ff_layout_encode_nfstime(struct xdr_stream *xdr,
			 ktime_t t)
{
	struct timespec64 ts = ktime_to_timespec64(t);
	__be32 *p = xdr_reserve_space(xdr, 12);

	p = xdr_encode_hyper(p, ts.tv_sec);
	*p = cpu_to_be32(ts.tv_nsec);
}
/*
 * Encode the counters of @stat: five 64-bit values (ops requested, bytes
 * requested, ops completed, bytes completed, bytes not delivered) followed
 * by two nfstime4 values (total busy time, aggregate completion time).
 */
static void
ff_layout_encode_io_latency(struct xdr_stream *xdr,
			    struct nfs4_ff_io_stat *stat)
{
	__be32 *p = xdr_reserve_space(xdr, 5 * 8);

	p = xdr_encode_hyper(p, stat->ops_requested);
	p = xdr_encode_hyper(p, stat->bytes_requested);
	p = xdr_encode_hyper(p, stat->ops_completed);
	p = xdr_encode_hyper(p, stat->bytes_completed);
	xdr_encode_hyper(p, stat->bytes_not_delivered);

	ff_layout_encode_nfstime(xdr, stat->total_busy_time);
	ff_layout_encode_nfstime(xdr, stat->aggregate_completion_time);
}
/*
 * layoutstats_encode callback for flexfiles: encode the layoutupdate body
 * for the mirror stored in devinfo->layout_private.
 *
 * Layout of the encoded data, in order:
 *   length word (patched at the end), netaddr4 of the first DS address,
 *   file handle (fh_versions[0]), read statistics, write statistics,
 *   time since the mirror's start_time, and a boolean set to false.
 *
 * The read and write statistics are encoded under mirror->lock.
 */
static void
ff_layout_encode_layoutstats(struct xdr_stream *xdr,
			     struct nfs42_layoutstat_args *args,
			     struct nfs42_layoutstat_devinfo *devinfo)
{
	struct nfs4_ff_layout_mirror *mirror = devinfo->layout_private;
	struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds;
	struct nfs_fh *fh = &mirror->fh_versions[0];
	struct nfs4_pnfs_ds_addr *da;
	__be32 *len_slot, *p;

	da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node);
	dprintk("%s: DS %s: encoding address %s\n",
		__func__, ds->ds_remotestr, da->da_remotestr);

	/* layoutupdate length: reserved now, filled in once the body is done */
	len_slot = xdr_reserve_space(xdr, 4);

	/* netaddr4 */
	ff_layout_encode_netaddr(xdr, da);

	/* nfs_fh4 */
	p = xdr_reserve_space(xdr, 4 + fh->size);
	xdr_encode_opaque(p, fh->data, fh->size);

	spin_lock(&mirror->lock);
	/* ff_io_latency4 read */
	ff_layout_encode_io_latency(xdr, &mirror->read_stat.io_stat);
	/* ff_io_latency4 write */
	ff_layout_encode_io_latency(xdr, &mirror->write_stat.io_stat);
	spin_unlock(&mirror->lock);

	/* nfstime4 */
	ff_layout_encode_nfstime(xdr, ktime_sub(ktime_get(), mirror->start_time));

	/* bool */
	p = xdr_reserve_space(xdr, 4);
	*p = cpu_to_be32(false);

	/* body length in bytes: words written after the length slot, times 4 */
	*len_slot = cpu_to_be32((xdr->p - len_slot - 1) * 4);
}
/*
 * Fill args->devinfo[] with one LAYOUTSTATS entry per usable mirror of @pls.
 *
 * @dev_count is the index of the next free devinfo slot; it is advanced for
 * every entry written, and no entry is written once it reaches @dev_limit.
 * Mirrors without a mirror_ds are skipped.  A reference on @pls is taken
 * for every entry written.
 *
 * Returns true while there is still room for further entries.
 */
static bool
ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args,
			       struct pnfs_layout_segment *pls,
			       int *dev_count, int dev_limit)
{
	struct nfs4_ff_layout_mirror *mirror;
	struct nfs4_deviceid_node *dev;
	struct nfs42_layoutstat_devinfo *devinfo;
	int i;

	/* valid mirror indices are 0 .. FF_LAYOUT_MIRROR_COUNT(pls) - 1 */
	for (i = 0; i < FF_LAYOUT_MIRROR_COUNT(pls); i++) {
		if (*dev_count >= dev_limit)
			break;
		mirror = FF_LAYOUT_COMP(pls, i);
		if (!mirror || !mirror->mirror_ds)
			continue;
		dev = FF_LAYOUT_DEVID_NODE(pls, i);
		devinfo = &args->devinfo[*dev_count];
		memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE);
		devinfo->offset = pls->pls_range.offset;
		devinfo->length = pls->pls_range.length;
		/* well, we don't really know if IO is continuous or not! */
		/* *_count is the number of operations, *_bytes the payload size */
		devinfo->read_count = mirror->read_stat.io_stat.ops_completed;
		devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed;
		devinfo->write_count = mirror->write_stat.io_stat.ops_completed;
		devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed;
		devinfo->layout_type = LAYOUT_FLEX_FILES;
		devinfo->layoutstats_encode = ff_layout_encode_layoutstats;
		devinfo->layout_private = mirror;
		/* lseg refcount put in cleanup_layoutstats */
		pnfs_get_lseg(pls);

		++(*dev_count);
	}

	return *dev_count < dev_limit;
}
/*
 * prepare_layoutstats callback for flexfiles: allocate and fill
 * args->devinfo with the statistics of the mirrors of all layout segments
 * of args->inode, and set args->num_dev to the number of entries written.
 *
 * The mirrors are counted under i_lock, the array is allocated with the
 * lock dropped, and the entries are then filled in under i_lock again.
 *
 * Returns 0 on success or -ENOMEM if the array cannot be allocated.
 */
static int
ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
{
	struct pnfs_layout_segment *pls;
	int dev_count = 0;
	int dev_limit;

	spin_lock(&args->inode->i_lock);
	list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) {
		dev_count += FF_LAYOUT_MIRROR_COUNT(pls);
	}
	spin_unlock(&args->inode->i_lock);
	/* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */
	if (dev_count > PNFS_LAYOUTSTATS_MAXDEV) {
		dprintk("%s: truncating devinfo to limit (%d:%d)\n",
			__func__, dev_count, PNFS_LAYOUTSTATS_MAXDEV);
		dev_count = PNFS_LAYOUTSTATS_MAXDEV;
	}
	args->devinfo = kmalloc(dev_count * sizeof(*args->devinfo), GFP_KERNEL);
	if (!args->devinfo)
		return -ENOMEM;

	/*
	 * i_lock was dropped for the allocation, so the segment list may have
	 * grown since it was counted.  Never fill more entries than were
	 * actually allocated.
	 */
	dev_limit = dev_count;
	dev_count = 0;
	spin_lock(&args->inode->i_lock);
	list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) {
		if (!ff_layout_mirror_prepare_stats(args, pls, &dev_count,
						    dev_limit)) {
			break;
		}
	}
	spin_unlock(&args->inode->i_lock);
	args->num_dev = dev_count;

	return 0;
}
static void
ff_layout_cleanup_layoutstats(struct nfs42_layoutstat_data *data)
{
struct nfs4_ff_layout_mirror *mirror;
int i;
for (i = 0; i < data->args.num_dev; i++) {
mirror = data->args.devinfo[i].layout_private;
data->args.devinfo[i].layout_private = NULL;
pnfs_put_lseg(mirror->lseg);
}
}
static struct pnfs_layoutdriver_type flexfilelayout_type = {
.id = LAYOUT_FLEX_FILES,
.name = "LAYOUT_FLEX_FILES",
@ -1510,6 +1950,8 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
.alloc_deviceid_node = ff_layout_alloc_deviceid_node,
.encode_layoutreturn = ff_layout_encode_layoutreturn,
.sync = pnfs_nfs_generic_sync,
.prepare_layoutstats = ff_layout_prepare_layoutstats,
.cleanup_layoutstats = ff_layout_cleanup_layoutstats,
};
static int __init nfs4flexfilelayout_init(void)

View File

@ -9,12 +9,17 @@
#ifndef FS_NFS_NFS4FLEXFILELAYOUT_H
#define FS_NFS_NFS4FLEXFILELAYOUT_H
#define FF_FLAGS_NO_LAYOUTCOMMIT 1
#include "../pnfs.h"
/* XXX: Let's filter out insanely large mirror count for now to avoid oom
* due to network error etc. */
#define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096
/* LAYOUTSTATS report interval in ms */
#define FF_LAYOUTSTATS_REPORT_INTERVAL (60000L)
struct nfs4_ff_ds_version {
u32 version;
u32 minor_version;
@ -41,24 +46,48 @@ struct nfs4_ff_layout_ds_err {
struct nfs4_deviceid deviceid;
};
/*
 * I/O counters kept per mirror and per direction (read or write).  All
 * seven fields are sent to the server in a LAYOUTSTATS report.
 */
struct nfs4_ff_io_stat {
__u64 ops_requested; /* operations started */
__u64 bytes_requested; /* bytes asked for by started operations */
__u64 ops_completed; /* operations finished */
__u64 bytes_completed; /* bytes actually transferred */
__u64 bytes_not_delivered; /* sum of (requested - completed) */
ktime_t total_busy_time; /* sum of busy-timer intervals */
ktime_t aggregate_completion_time; /* sum of per-operation completion times */
};
/*
 * Tracks how long at least one operation has been outstanding: n_ops counts
 * operations in flight, start_time is the beginning of the interval that is
 * currently being measured.
 */
struct nfs4_ff_busy_timer {
ktime_t start_time;
atomic_t n_ops;
};
/* Statistics for one direction of I/O: the counters plus their busy timer. */
struct nfs4_ff_layoutstat {
struct nfs4_ff_io_stat io_stat;
struct nfs4_ff_busy_timer busy_timer;
};
/* Per-mirror state of a flexfiles layout segment. */
struct nfs4_ff_layout_mirror {
struct pnfs_layout_segment *lseg; /* back pointer */
u32 ds_count;
u32 efficiency;
struct nfs4_ff_layout_ds *mirror_ds;
u32 fh_versions_cnt;
struct nfs_fh *fh_versions;
nfs4_stateid stateid;
struct nfs4_string user_name;
struct nfs4_string group_name;
u32 uid;
u32 gid;
struct rpc_cred *cred;
spinlock_t lock; /* held while read_stat/write_stat are updated or encoded */
struct nfs4_ff_layoutstat read_stat; /* READ statistics */
struct nfs4_ff_layoutstat write_stat; /* WRITE and COMMIT statistics */
ktime_t start_time; /* set at the first I/O; zero means not yet set */
ktime_t last_report_time; /* time of the last layoutstats report trigger */
};
/*
 * Flexfiles layout segment: embeds the generic pNFS segment header and
 * carries an array of mirror_array_cnt mirror pointers.
 */
struct nfs4_ff_layout_segment {
struct pnfs_layout_segment generic_hdr;
u64 stripe_unit;
u32 flags;
u32 mirror_array_cnt; /* number of entries in mirror_array */
struct nfs4_ff_layout_mirror **mirror_array;
};

View File

@ -324,7 +324,8 @@ static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror *mirror,
__func__, PTR_ERR(cred));
return PTR_ERR(cred);
} else {
mirror->cred = cred;
if (cmpxchg(&mirror->cred, NULL, cred))
put_rpccred(cred);
}
}
return 0;
@ -386,7 +387,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
/* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
smp_rmb();
if (ds->ds_clp)
goto out;
goto out_update_creds;
flavor = nfs4_ff_layout_choose_authflavor(mirror);
@ -430,7 +431,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
}
}
}
out_update_creds:
if (ff_layout_update_mirror_cred(mirror, ds))
ds = NULL;
out:

View File

@ -678,6 +678,8 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
if (!err) {
generic_fillattr(inode, stat);
stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
if (S_ISDIR(inode->i_mode))
stat->blksize = NFS_SERVER(inode)->dtsize;
}
out:
trace_nfs_getattr_exit(inode, err);
@ -2008,17 +2010,15 @@ static int __init init_nfs_fs(void)
if (err)
goto out1;
#ifdef CONFIG_PROC_FS
rpc_proc_register(&init_net, &nfs_rpcstat);
#endif
if ((err = register_nfs_fs()) != 0)
err = register_nfs_fs();
if (err)
goto out0;
return 0;
out0:
#ifdef CONFIG_PROC_FS
rpc_proc_unregister(&init_net, "nfs");
#endif
nfs_destroy_directcache();
out1:
nfs_destroy_writepagecache();
@ -2049,9 +2049,7 @@ static void __exit exit_nfs_fs(void)
nfs_destroy_nfspagecache();
nfs_fscache_unregister();
unregister_pernet_subsys(&nfs_net_ops);
#ifdef CONFIG_PROC_FS
rpc_proc_unregister(&init_net, "nfs");
#endif
unregister_nfs_fs();
nfs_fs_proc_exit();
nfsiod_stop();

View File

@ -1342,7 +1342,7 @@ static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req,
if (args->npages != 0)
xdr_write_pages(xdr, args->pages, 0, args->len);
else
xdr_reserve_space(xdr, NFS_ACL_INLINE_BUFSIZE);
xdr_reserve_space(xdr, args->len);
error = nfsacl_encode(xdr->buf, base, args->inode,
(args->mask & NFS_ACL) ?

View File

@ -5,11 +5,18 @@
#ifndef __LINUX_FS_NFS_NFS4_2_H
#define __LINUX_FS_NFS_NFS4_2_H
/*
* FIXME: four LAYOUTSTATS calls per compound at most! Do we need to support
* more? Need to consider not to pre-alloc too much for a compound.
*/
#define PNFS_LAYOUTSTATS_MAXDEV (4)
/* nfs4.2proc.c */
int nfs42_proc_allocate(struct file *, loff_t, loff_t);
int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
loff_t nfs42_proc_llseek(struct file *, loff_t, int);
int nfs42_proc_layoutstats_generic(struct nfs_server *,
struct nfs42_layoutstat_data *);
/* nfs4.2xdr.h */
extern struct rpc_procinfo nfs4_2_procedures[];

View File

@ -10,6 +10,11 @@
#include <linux/nfs_fs.h>
#include "nfs4_fs.h"
#include "nfs42.h"
#include "iostat.h"
#include "pnfs.h"
#include "internal.h"
#define NFSDBG_FACILITY NFSDBG_PNFS
static int nfs42_set_rw_stateid(nfs4_stateid *dst, struct file *file,
fmode_t fmode)
@ -165,3 +170,85 @@ loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes);
}
/*
 * rpc_call_prepare callback for LAYOUTSTATS: pass the call's sequence
 * arguments and results to nfs41_setup_sequence(), using the session of
 * the server that owns the inode being reported on.
 */
static void
nfs42_layoutstat_prepare(struct rpc_task *task, void *calldata)
{
	struct nfs42_layoutstat_data *data = calldata;
	struct inode *inode = data->args.inode;

	nfs41_setup_sequence(nfs4_get_session(NFS_SERVER(inode)),
			     &data->args.seq_args, &data->res.seq_res, task);
}
/*
 * rpc_call_done callback for LAYOUTSTATS.
 *
 * Nothing further happens when nfs4_sequence_done() returns zero or when
 * the task succeeded. If the server reports the operation as unsupported,
 * the NFS_CAP_LAYOUTSTATS capability is cleared on the server; every
 * non-zero status is then logged via dprintk().
 */
static void
nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
{
	struct nfs42_layoutstat_data *data = calldata;
	int status;

	if (!nfs4_sequence_done(task, &data->res.seq_res))
		return;

	status = task->tk_status;
	if (status == 0)
		return;

	/* Unsupported: stop advertising the capability, then log as usual. */
	if (status == -ENOTSUPP || status == -EOPNOTSUPP)
		NFS_SERVER(data->inode)->caps &= ~NFS_CAP_LAYOUTSTATS;

	dprintk("%s server returns %d\n", __func__, task->tk_status);
}
static void
nfs42_layoutstat_release(void *calldata)
{
struct nfs42_layoutstat_data *data = calldata;
struct nfs_server *nfss = NFS_SERVER(data->args.inode);
if (nfss->pnfs_curr_ld->cleanup_layoutstats)
nfss->pnfs_curr_ld->cleanup_layoutstats(data);
pnfs_put_layout_hdr(NFS_I(data->args.inode)->layout);
smp_mb__before_atomic();
clear_bit(NFS_INO_LAYOUTSTATS, &NFS_I(data->args.inode)->flags);
smp_mb__after_atomic();
nfs_iput_and_deactive(data->inode);
kfree(data->args.devinfo);
kfree(data);
}
/* RPC callbacks used by the asynchronous LAYOUTSTATS call. */
static const struct rpc_call_ops nfs42_layoutstat_ops = {
.rpc_call_prepare = nfs42_layoutstat_prepare,
.rpc_call_done = nfs42_layoutstat_done,
.rpc_release = nfs42_layoutstat_release,
};
/*
 * Start an asynchronous LAYOUTSTATS call carrying the statistics in @data.
 *
 * @server: server whose RPC client is used for the call
 * @data:   call data; the caller must not touch it after this function
 *          returns (on the -EAGAIN path below it has already been released
 *          through nfs42_layoutstat_release())
 *
 * Returns 0 once the task has been started, -EAGAIN if the inode could not
 * be pinned with nfs_igrab_and_active(), or the error from rpc_run_task().
 */
int nfs42_proc_layoutstats_generic(struct nfs_server *server,
				   struct nfs42_layoutstat_data *data)
{
	struct rpc_message msg = {
		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTSTATS],
		.rpc_argp = &data->args,
		.rpc_resp = &data->res,
	};
	struct rpc_task_setup task_setup = {
		.rpc_client = server->client,
		.rpc_message = &msg,
		.callback_ops = &nfs42_layoutstat_ops,
		.callback_data = data,
		.flags = RPC_TASK_ASYNC,
	};
	struct rpc_task *task;

	data->inode = nfs_igrab_and_active(data->args.inode);
	if (!data->inode) {
		nfs42_layoutstat_release(data);
		return -EAGAIN;
	}

	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
	task = rpc_run_task(&task_setup);
	if (IS_ERR(task))
		return PTR_ERR(task);

	/*
	 * rpc_run_task() hands back a task reference owned by the caller.
	 * The call is fire-and-forget, so drop it right away; otherwise
	 * every successful LAYOUTSTATS leaks an rpc_task.
	 */
	rpc_put_task(task);
	return 0;
}

View File

@ -4,6 +4,8 @@
#ifndef __LINUX_FS_NFS_NFS4_2XDR_H
#define __LINUX_FS_NFS_NFS4_2XDR_H
#include "nfs42.h"
#define encode_fallocate_maxsz (encode_stateid_maxsz + \
2 /* offset */ + \
2 /* length */)
@ -22,6 +24,16 @@
1 /* whence */ + \
2 /* offset */ + \
2 /* length */)
#define encode_io_info_maxsz 4
#define encode_layoutstats_maxsz (op_decode_hdr_maxsz + \
2 /* offset */ + \
2 /* length */ + \
encode_stateid_maxsz + \
encode_io_info_maxsz + \
encode_io_info_maxsz + \
1 /* opaque devaddr4 length */ + \
XDR_QUADLEN(PNFS_LAYOUTSTATS_MAXSIZE))
#define decode_layoutstats_maxsz (op_decode_hdr_maxsz)
#define NFS4_enc_allocate_sz (compound_encode_hdr_maxsz + \
encode_putfh_maxsz + \
@ -45,6 +57,14 @@
#define NFS4_dec_seek_sz (compound_decode_hdr_maxsz + \
decode_putfh_maxsz + \
decode_seek_maxsz)
#define NFS4_enc_layoutstats_sz (compound_encode_hdr_maxsz + \
encode_sequence_maxsz + \
encode_putfh_maxsz + \
PNFS_LAYOUTSTATS_MAXDEV * encode_layoutstats_maxsz)
#define NFS4_dec_layoutstats_sz (compound_decode_hdr_maxsz + \
decode_sequence_maxsz + \
decode_putfh_maxsz + \
PNFS_LAYOUTSTATS_MAXDEV * decode_layoutstats_maxsz)
static void encode_fallocate(struct xdr_stream *xdr,
@ -81,6 +101,33 @@ static void encode_seek(struct xdr_stream *xdr,
encode_uint32(xdr, args->sa_what);
}
/*
 * Encode one LAYOUTSTATS operation for a single device.
 *
 * Wire order: operation header, offset and length, the layout stateid
 * from @args, read count/bytes, write count/bytes, the device id, the
 * layout type, and finally the layout-specific body.
 */
static void encode_layoutstats(struct xdr_stream *xdr,
			       struct nfs42_layoutstat_args *args,
			       struct nfs42_layoutstat_devinfo *devinfo,
			       struct compound_hdr *hdr)
{
	__be32 *p;

	encode_op_hdr(xdr, OP_LAYOUTSTATS, decode_layoutstats_maxsz, hdr);

	/* Two 64-bit values: offset and length. */
	p = reserve_space(xdr, 8 + 8);
	p = xdr_encode_hyper(p, devinfo->offset);
	p = xdr_encode_hyper(p, devinfo->length);

	encode_nfs4_stateid(xdr, &args->stateid);

	/* Four 64-bit counters, the fixed-size device id, the layout type. */
	p = reserve_space(xdr, 4*8 + NFS4_DEVICEID4_SIZE + 4);
	p = xdr_encode_hyper(p, devinfo->read_count);
	p = xdr_encode_hyper(p, devinfo->read_bytes);
	p = xdr_encode_hyper(p, devinfo->write_count);
	p = xdr_encode_hyper(p, devinfo->write_bytes);
	p = xdr_encode_opaque_fixed(p, devinfo->dev_id.data,
				    NFS4_DEVICEID4_SIZE);

	/* Encode layoutupdate4 */
	*p++ = cpu_to_be32(devinfo->layout_type);

	if (devinfo->layoutstats_encode == NULL) {
		/* No layout-specific encoder: emit a single zero word. */
		encode_uint32(xdr, 0);
		return;
	}
	devinfo->layoutstats_encode(xdr, args, devinfo);
}
/*
* Encode ALLOCATE request
*/
@ -137,6 +184,28 @@ static void nfs4_xdr_enc_seek(struct rpc_rqst *req,
encode_nops(&hdr);
}
/*
 * Encode a LAYOUTSTATS compound: SEQUENCE, PUTFH, then one LAYOUTSTATS
 * operation for each of the args->num_dev entries in args->devinfo.
 * Warns if more than PNFS_LAYOUTSTATS_MAXDEV devices are supplied.
 */
static void nfs4_xdr_enc_layoutstats(struct rpc_rqst *req,
				     struct xdr_stream *xdr,
				     struct nfs42_layoutstat_args *args)
{
	struct compound_hdr hdr = {
		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
	};
	int idx = 0;

	encode_compound_hdr(xdr, req, &hdr);
	encode_sequence(xdr, &args->seq_args, &hdr);
	encode_putfh(xdr, args->fh, &hdr);

	WARN_ON(args->num_dev > PNFS_LAYOUTSTATS_MAXDEV);
	while (idx < args->num_dev) {
		encode_layoutstats(xdr, args, &args->devinfo[idx], &hdr);
		idx++;
	}

	encode_nops(&hdr);
}
static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
{
return decode_op_hdr(xdr, OP_ALLOCATE);
@ -169,6 +238,12 @@ static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res)
return -EIO;
}
/*
 * Decode the reply to a single LAYOUTSTATS operation. Only the operation
 * header is decoded; @res is not used here. Returns the result of
 * decode_op_hdr().
 */
static int decode_layoutstats(struct xdr_stream *xdr,
struct nfs42_layoutstat_res *res)
{
return decode_op_hdr(xdr, OP_LAYOUTSTATS);
}
/*
* Decode ALLOCATE request
*/
@ -246,4 +321,35 @@ static int nfs4_xdr_dec_seek(struct rpc_rqst *rqstp,
out:
return status;
}
/*
 * Decode a LAYOUTSTATS reply: compound header, SEQUENCE, PUTFH, then
 * res->num_dev LAYOUTSTATS operation results. Decoding stops at the first
 * failure. The final status is stored in res->rpc_status and returned.
 */
static int nfs4_xdr_dec_layoutstats(struct rpc_rqst *rqstp,
				    struct xdr_stream *xdr,
				    struct nfs42_layoutstat_res *res)
{
	struct compound_hdr hdr;
	int status, i;

	status = decode_compound_hdr(xdr, &hdr);
	if (!status)
		status = decode_sequence(xdr, &res->seq_res, rqstp);
	if (!status)
		status = decode_putfh(xdr);
	if (!status) {
		WARN_ON(res->num_dev > PNFS_LAYOUTSTATS_MAXDEV);
		for (i = 0; i < res->num_dev && !status; i++)
			status = decode_layoutstats(xdr, res);
	}

	res->rpc_status = status;
	return status;
}
#endif /* __LINUX_FS_NFS_NFS4_2XDR_H */

View File

@ -233,6 +233,7 @@ extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception
extern int nfs4_call_sync(struct rpc_clnt *, struct nfs_server *,
struct rpc_message *, struct nfs4_sequence_args *,
struct nfs4_sequence_res *, int);
extern void nfs4_init_sequence(struct nfs4_sequence_args *, struct nfs4_sequence_res *, int);
extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool);

View File

@ -676,7 +676,6 @@ int nfs41_walk_client_list(struct nfs_client *new,
break;
}
/* No matching nfs_client found. */
spin_unlock(&nn->nfs_client_lock);
dprintk("NFS: <-- %s status = %d\n", __func__, status);
nfs_put_client(prev);

View File

@ -41,6 +41,10 @@ nfs4_file_open(struct inode *inode, struct file *filp)
dprintk("NFS: open file(%pd2)\n", dentry);
err = nfs_check_flags(openflags);
if (err)
return err;
if ((openflags & O_ACCMODE) == 3)
openflags--;

View File

@ -35,13 +35,6 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_p
goto out;
}
if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
printk(KERN_ERR "nfs4_get_rootfh:"
" getroot obtained referral\n");
ret = -EREMOTE;
goto out;
}
memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid));
out:
nfs_free_fattr(fsinfo.fattr);

View File

@ -494,12 +494,7 @@ nfs_idmap_delete(struct nfs_client *clp)
int nfs_idmap_init(void)
{
int ret;
ret = nfs_idmap_init_keyring();
if (ret != 0)
goto out;
out:
return ret;
return nfs_idmap_init_keyring();
}
void nfs_idmap_quit(void)

View File

@ -356,6 +356,9 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
case 0:
return 0;
case -NFS4ERR_OPENMODE:
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_BAD_STATEID:
if (inode && nfs4_have_delegation(inode, FMODE_READ)) {
nfs4_inode_return_delegation(inode);
exception->retry = 1;
@ -367,15 +370,6 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
if (ret < 0)
break;
goto wait_on_recovery;
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_BAD_STATEID:
if (state == NULL)
break;
ret = nfs4_schedule_stateid_recovery(server, state);
if (ret < 0)
break;
goto wait_on_recovery;
case -NFS4ERR_EXPIRED:
if (state != NULL) {
ret = nfs4_schedule_stateid_recovery(server, state);
@ -482,8 +476,8 @@ struct nfs4_call_sync_data {
struct nfs4_sequence_res *seq_res;
};
static void nfs4_init_sequence(struct nfs4_sequence_args *args,
struct nfs4_sequence_res *res, int cache_reply)
void nfs4_init_sequence(struct nfs4_sequence_args *args,
struct nfs4_sequence_res *res, int cache_reply)
{
args->sa_slot = NULL;
args->sa_cache_this = cache_reply;
@ -1553,6 +1547,13 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmod
struct nfs4_state *newstate;
int ret;
if ((opendata->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR ||
opendata->o_arg.claim == NFS4_OPEN_CLAIM_DELEG_CUR_FH) &&
(opendata->o_arg.u.delegation_type & fmode) != fmode)
/* This mode can't have been delegated, so we must have
* a valid open_stateid to cover it - not need to reclaim.
*/
return 0;
opendata->o_arg.open_flags = 0;
opendata->o_arg.fmode = fmode;
opendata->o_arg.share_access = nfs4_map_atomic_open_share(
@ -1684,6 +1685,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
"%d.\n", __func__, err);
case 0:
case -ENOENT:
case -EAGAIN:
case -ESTALE:
break;
case -NFS4ERR_BADSESSION:
@ -3355,6 +3357,8 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
goto out;
case -NFS4ERR_MOVED:
err = nfs4_get_referral(client, dir, name, fattr, fhandle);
if (err == -NFS4ERR_MOVED)
err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception);
goto out;
case -NFS4ERR_WRONGSEC:
err = -EPERM;
@ -4955,49 +4959,128 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp,
memcpy(bootverf->data, verf, sizeof(bootverf->data));
}
static unsigned int
nfs4_init_nonuniform_client_string(struct nfs_client *clp,
char *buf, size_t len)
static int
nfs4_init_nonuniform_client_string(struct nfs_client *clp)
{
unsigned int result;
int result;
size_t len;
char *str;
bool retried = false;
if (clp->cl_owner_id != NULL)
return strlcpy(buf, clp->cl_owner_id, len);
return 0;
retry:
rcu_read_lock();
len = 10 + strlen(clp->cl_ipaddr) + 1 +
strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) +
1 +
strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)) +
1;
rcu_read_unlock();
if (len > NFS4_OPAQUE_LIMIT + 1)
return -EINVAL;
/*
* Since this string is allocated at mount time, and held until the
* nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying
* about a memory-reclaim deadlock.
*/
str = kmalloc(len, GFP_KERNEL);
if (!str)
return -ENOMEM;
rcu_read_lock();
result = scnprintf(buf, len, "Linux NFSv4.0 %s/%s %s",
clp->cl_ipaddr,
rpc_peeraddr2str(clp->cl_rpcclient,
RPC_DISPLAY_ADDR),
rpc_peeraddr2str(clp->cl_rpcclient,
RPC_DISPLAY_PROTO));
result = scnprintf(str, len, "Linux NFSv4.0 %s/%s %s",
clp->cl_ipaddr,
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR),
rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO));
rcu_read_unlock();
clp->cl_owner_id = kstrdup(buf, GFP_KERNEL);
return result;
/* Did something change? */
if (result >= len) {
kfree(str);
if (retried)
return -EINVAL;
retried = true;
goto retry;
}
clp->cl_owner_id = str;
return 0;
}
static unsigned int
nfs4_init_uniform_client_string(struct nfs_client *clp,
char *buf, size_t len)
static int
nfs4_init_uniquifier_client_string(struct nfs_client *clp)
{
const char *nodename = clp->cl_rpcclient->cl_nodename;
unsigned int result;
int result;
size_t len;
char *str;
len = 10 + 10 + 1 + 10 + 1 +
strlen(nfs4_client_id_uniquifier) + 1 +
strlen(clp->cl_rpcclient->cl_nodename) + 1;
if (len > NFS4_OPAQUE_LIMIT + 1)
return -EINVAL;
/*
* Since this string is allocated at mount time, and held until the
* nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying
* about a memory-reclaim deadlock.
*/
str = kmalloc(len, GFP_KERNEL);
if (!str)
return -ENOMEM;
result = scnprintf(str, len, "Linux NFSv%u.%u %s/%s",
clp->rpc_ops->version, clp->cl_minorversion,
nfs4_client_id_uniquifier,
clp->cl_rpcclient->cl_nodename);
if (result >= len) {
kfree(str);
return -EINVAL;
}
clp->cl_owner_id = str;
return 0;
}
static int
nfs4_init_uniform_client_string(struct nfs_client *clp)
{
int result;
size_t len;
char *str;
if (clp->cl_owner_id != NULL)
return strlcpy(buf, clp->cl_owner_id, len);
return 0;
if (nfs4_client_id_uniquifier[0] != '\0')
result = scnprintf(buf, len, "Linux NFSv%u.%u %s/%s",
clp->rpc_ops->version,
clp->cl_minorversion,
nfs4_client_id_uniquifier,
nodename);
else
result = scnprintf(buf, len, "Linux NFSv%u.%u %s",
clp->rpc_ops->version, clp->cl_minorversion,
nodename);
clp->cl_owner_id = kstrdup(buf, GFP_KERNEL);
return result;
return nfs4_init_uniquifier_client_string(clp);
len = 10 + 10 + 1 + 10 + 1 +
strlen(clp->cl_rpcclient->cl_nodename) + 1;
if (len > NFS4_OPAQUE_LIMIT + 1)
return -EINVAL;
/*
* Since this string is allocated at mount time, and held until the
* nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying
* about a memory-reclaim deadlock.
*/
str = kmalloc(len, GFP_KERNEL);
if (!str)
return -ENOMEM;
result = scnprintf(str, len, "Linux NFSv%u.%u %s",
clp->rpc_ops->version, clp->cl_minorversion,
clp->cl_rpcclient->cl_nodename);
if (result >= len) {
kfree(str);
return -EINVAL;
}
clp->cl_owner_id = str;
return 0;
}
/*
@ -5044,7 +5127,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
struct nfs4_setclientid setclientid = {
.sc_verifier = &sc_verifier,
.sc_prog = program,
.sc_cb_ident = clp->cl_cb_ident,
.sc_clnt = clp,
};
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID],
@ -5064,16 +5147,15 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
/* nfs_client_id4 */
nfs4_init_boot_verifier(clp, &sc_verifier);
if (test_bit(NFS_CS_MIGRATION, &clp->cl_flags))
setclientid.sc_name_len =
nfs4_init_uniform_client_string(clp,
setclientid.sc_name,
sizeof(setclientid.sc_name));
status = nfs4_init_uniform_client_string(clp);
else
setclientid.sc_name_len =
nfs4_init_nonuniform_client_string(clp,
setclientid.sc_name,
sizeof(setclientid.sc_name));
status = nfs4_init_nonuniform_client_string(clp);
if (status)
goto out;
/* cb_client4 */
setclientid.sc_netid_len =
nfs4_init_callback_netid(clp,
@ -5083,9 +5165,9 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
sizeof(setclientid.sc_uaddr), "%s.%u.%u",
clp->cl_ipaddr, port >> 8, port & 255);
dprintk("NFS call setclientid auth=%s, '%.*s'\n",
dprintk("NFS call setclientid auth=%s, '%s'\n",
clp->cl_rpcclient->cl_auth->au_ops->au_name,
setclientid.sc_name_len, setclientid.sc_name);
clp->cl_owner_id);
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task)) {
status = PTR_ERR(task);
@ -5402,6 +5484,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
atomic_inc(&lsp->ls_count);
/* Ensure we don't close file until we're done freeing locks! */
p->ctx = get_nfs_open_context(ctx);
get_file(fl->fl_file);
memcpy(&p->fl, fl, sizeof(p->fl));
p->server = NFS_SERVER(inode);
return p;
@ -5413,6 +5496,7 @@ static void nfs4_locku_release_calldata(void *data)
nfs_free_seqid(calldata->arg.seqid);
nfs4_put_lock_state(calldata->lsp);
put_nfs_open_context(calldata->ctx);
fput(calldata->fl.fl_file);
kfree(calldata);
}
@ -6846,11 +6930,14 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
};
nfs4_init_boot_verifier(clp, &verifier);
args.id_len = nfs4_init_uniform_client_string(clp, args.id,
sizeof(args.id));
dprintk("NFS call exchange_id auth=%s, '%.*s'\n",
status = nfs4_init_uniform_client_string(clp);
if (status)
goto out;
dprintk("NFS call exchange_id auth=%s, '%s'\n",
clp->cl_rpcclient->cl_auth->au_ops->au_name,
args.id_len, args.id);
clp->cl_owner_id);
res.server_owner = kzalloc(sizeof(struct nfs41_server_owner),
GFP_NOFS);
@ -6885,7 +6972,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
/* unsupported! */
WARN_ON_ONCE(1);
status = -EINVAL;
goto out_server_scope;
goto out_impl_id;
}
status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
@ -6913,6 +7000,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
/* use the most recent implementation id */
kfree(clp->cl_implid);
clp->cl_implid = res.impl_id;
res.impl_id = NULL;
if (clp->cl_serverscope != NULL &&
!nfs41_same_server_scope(clp->cl_serverscope,
@ -6926,15 +7014,16 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
if (clp->cl_serverscope == NULL) {
clp->cl_serverscope = res.server_scope;
goto out;
res.server_scope = NULL;
}
} else
kfree(res.impl_id);
}
out_server_owner:
kfree(res.server_owner);
out_impl_id:
kfree(res.impl_id);
out_server_scope:
kfree(res.server_scope);
out_server_owner:
kfree(res.server_owner);
out:
if (clp->cl_implid != NULL)
dprintk("NFS reply exchange_id: Server Implementation ID: "
@ -8061,9 +8150,8 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
struct rpc_task *task;
int status = 0;
dprintk("NFS: %4d initiating layoutcommit call. sync %d "
"lbw: %llu inode %lu\n",
data->task.tk_pid, sync,
dprintk("NFS: initiating layoutcommit call. sync %d "
"lbw: %llu inode %lu\n", sync,
data->args.lastbytewritten,
data->args.inode->i_ino);
@ -8557,7 +8645,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
| NFS_CAP_ATOMIC_OPEN_V1
| NFS_CAP_ALLOCATE
| NFS_CAP_DEALLOCATE
| NFS_CAP_SEEK,
| NFS_CAP_SEEK
| NFS_CAP_LAYOUTSTATS,
.init_client = nfs41_init_client,
.shutdown_client = nfs41_shutdown_client,
.match_stateid = nfs41_match_stateid,

View File

@ -309,7 +309,6 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state))
goto do_confirm;
nfs4_begin_drain_session(clp);
status = nfs4_proc_exchange_id(clp, cred);
if (status != 0)
goto out;
@ -1482,6 +1481,8 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs
spin_unlock(&state->state_lock);
}
nfs4_put_open_state(state);
clear_bit(NFS4CLNT_RECLAIM_NOGRACE,
&state->flags);
spin_lock(&sp->so_lock);
goto restart;
}
@ -1830,6 +1831,7 @@ static int nfs4_establish_lease(struct nfs_client *clp)
clp->cl_mvops->reboot_recovery_ops;
int status;
nfs4_begin_drain_session(clp);
cred = nfs4_get_clid_cred(clp);
if (cred == NULL)
return -ENOENT;

View File

@ -139,7 +139,8 @@ static int nfs4_stat_to_errno(int);
#define encode_setclientid_maxsz \
(op_encode_hdr_maxsz + \
XDR_QUADLEN(NFS4_VERIFIER_SIZE) + \
XDR_QUADLEN(NFS4_SETCLIENTID_NAMELEN) + \
/* client name */ \
1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
1 /* sc_prog */ + \
1 + XDR_QUADLEN(RPCBIND_MAXNETIDLEN) + \
1 + XDR_QUADLEN(RPCBIND_MAXUADDRLEN) + \
@ -288,7 +289,8 @@ static int nfs4_stat_to_errno(int);
#define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \
encode_verifier_maxsz + \
1 /* co_ownerid.len */ + \
XDR_QUADLEN(NFS4_EXCHANGE_ID_LEN) + \
/* eia_clientowner */ \
1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
1 /* flags */ + \
1 /* spa_how */ + \
/* max is SP4_MACH_CRED (for now) */ + \
@ -1667,13 +1669,14 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
encode_op_hdr(xdr, OP_SETCLIENTID, decode_setclientid_maxsz, hdr);
encode_nfs4_verifier(xdr, setclientid->sc_verifier);
encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
encode_string(xdr, strlen(setclientid->sc_clnt->cl_owner_id),
setclientid->sc_clnt->cl_owner_id);
p = reserve_space(xdr, 4);
*p = cpu_to_be32(setclientid->sc_prog);
encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid);
encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
p = reserve_space(xdr, 4);
*p = cpu_to_be32(setclientid->sc_cb_ident);
*p = cpu_to_be32(setclientid->sc_clnt->cl_cb_ident);
}
static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr)
@ -1747,7 +1750,8 @@ static void encode_exchange_id(struct xdr_stream *xdr,
encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr);
encode_nfs4_verifier(xdr, args->verifier);
encode_string(xdr, args->id_len, args->id);
encode_string(xdr, strlen(args->client->cl_owner_id),
args->client->cl_owner_id);
encode_uint32(xdr, args->flags);
encode_uint32(xdr, args->state_protect.how);
@ -7427,6 +7431,7 @@ struct rpc_procinfo nfs4_procedures[] = {
PROC(SEEK, enc_seek, dec_seek),
PROC(ALLOCATE, enc_allocate, dec_allocate),
PROC(DEALLOCATE, enc_deallocate, dec_deallocate),
PROC(LAYOUTSTATS, enc_layoutstats, dec_layoutstats),
#endif /* CONFIG_NFS_V4_2 */
};

View File

@ -636,9 +636,8 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how);
dprintk("NFS: %5u initiated pgio call "
dprintk("NFS: initiated pgio call "
"(req %s/%llu, %u bytes @ offset %llu)\n",
hdr->task.tk_pid,
hdr->inode->i_sb->s_id,
(unsigned long long)NFS_FILEID(hdr->inode),
hdr->args.count,
@ -690,8 +689,6 @@ static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
static void nfs_pgio_release(void *calldata)
{
struct nfs_pgio_header *hdr = calldata;
if (hdr->rw_ops->rw_release)
hdr->rw_ops->rw_release(hdr);
nfs_pgio_data_destroy(hdr);
hdr->completion_ops->completion(hdr);
}
@ -711,7 +708,9 @@ static void nfs_pageio_mirror_init(struct nfs_pgio_mirror *mirror,
* nfs_pageio_init - initialise a page io descriptor
* @desc: pointer to descriptor
* @inode: pointer to inode
* @doio: pointer to io function
* @pg_ops: pointer to pageio operations
* @compl_ops: pointer to pageio completion operations
* @rw_ops: pointer to nfs read/write operations
* @bsize: io block size
* @io_flags: extra parameters for the io function
*/
@ -1186,6 +1185,7 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
* nfs_pageio_complete_mirror - Complete I/O on the current mirror of an
* nfs_pageio_descriptor
* @desc: pointer to io descriptor
* @mirror_idx: pointer to mirror index
*/
static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc,
u32 mirror_idx)

View File

@ -35,6 +35,7 @@
#include "iostat.h"
#include "nfs4trace.h"
#include "delegation.h"
#include "nfs42.h"
#define NFSDBG_FACILITY NFSDBG_PNFS
#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)
@ -1821,6 +1822,7 @@ int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr)
/* Resend all requests through the MDS */
nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true,
hdr->completion_ops);
set_bit(NFS_CONTEXT_RESEND_WRITES, &hdr->args.context->flags);
return nfs_pageio_resend(&pgio, hdr);
}
EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
@ -1865,6 +1867,7 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
mirror->pg_recoalesce = 1;
}
nfs_pgio_data_destroy(hdr);
hdr->release(hdr);
}
static enum pnfs_try_status
@ -1979,6 +1982,7 @@ pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
mirror->pg_recoalesce = 1;
}
nfs_pgio_data_destroy(hdr);
hdr->release(hdr);
}
/*
@ -2247,3 +2251,63 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
}
return thp;
}
#if IS_ENABLED(CONFIG_NFS_V4_2)
/*
 * Report the layout driver's I/O statistics for @inode to the server
 * with an asynchronous LAYOUTSTATS call.
 *
 * Does nothing (and returns 0) when pNFS is not enabled on the server,
 * the layout driver has no prepare_layoutstats hook, the server lacks
 * NFS_CAP_LAYOUTSTATS, a LAYOUTSTATS call is already in flight for this
 * inode, or the inode has no layout.
 *
 * Returns 0, -ENOMEM, the error from the driver's prepare_layoutstats
 * hook, or the result of nfs42_proc_layoutstats_generic().
 */
int
pnfs_report_layoutstat(struct inode *inode)
{
	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
	struct nfs_server *server = NFS_SERVER(inode);
	struct nfs_inode *nfsi = NFS_I(inode);
	struct nfs42_layoutstat_data *data;
	struct pnfs_layout_hdr *hdr;
	int status = 0;

	if (!pnfs_enabled_sb(server) || !ld->prepare_layoutstats)
		goto out;

	if (!nfs_server_capable(inode, NFS_CAP_LAYOUTSTATS))
		goto out;

	/* NFS_INO_LAYOUTSTATS marks a call in flight; allow only one. */
	if (test_and_set_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags))
		goto out;

	spin_lock(&inode->i_lock);
	if (!NFS_I(inode)->layout) {
		spin_unlock(&inode->i_lock);
		/*
		 * We own the in-flight bit at this point, so it must be
		 * cleared again; leaving it set would block every later
		 * LAYOUTSTATS report for this inode.
		 */
		goto out_clear_layoutstats;
	}
	hdr = NFS_I(inode)->layout;
	pnfs_get_layout_hdr(hdr);
	spin_unlock(&inode->i_lock);

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data) {
		status = -ENOMEM;
		goto out_put;
	}

	data->args.fh = NFS_FH(inode);
	data->args.inode = inode;
	nfs4_stateid_copy(&data->args.stateid, &hdr->plh_stateid);
	status = ld->prepare_layoutstats(&data->args);
	if (status)
		goto out_free;

	/*
	 * From here on the layout header reference, the in-flight bit and
	 * @data are handed to the RPC code, so they are not released here.
	 */
	status = nfs42_proc_layoutstats_generic(NFS_SERVER(inode), data);

out:
	dprintk("%s returns %d\n", __func__, status);
	return status;

out_free:
	kfree(data);
out_put:
	pnfs_put_layout_hdr(hdr);
out_clear_layoutstats:
	smp_mb__before_atomic();
	clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags);
	smp_mb__after_atomic();
	goto out;
}
EXPORT_SYMBOL_GPL(pnfs_report_layoutstat);
#endif

View File

@ -178,6 +178,8 @@ struct pnfs_layoutdriver_type {
void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo,
struct xdr_stream *xdr,
const struct nfs4_layoutcommit_args *args);
int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args);
void (*cleanup_layoutstats) (struct nfs42_layoutstat_data *data);
};
struct pnfs_layout_hdr {
@ -290,7 +292,6 @@ int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
void pnfs_error_mark_layout_for_return(struct inode *inode,
struct pnfs_layout_segment *lseg);
/* nfs4_deviceid_flags */
enum {
NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */
@ -689,4 +690,14 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void)
#endif /* CONFIG_NFS_V4_1 */
#if IS_ENABLED(CONFIG_NFS_V4_2)
int pnfs_report_layoutstat(struct inode *inode);
#else
/*
 * Stub used when CONFIG_NFS_V4_2 is not enabled: there is nothing to
 * report, so this always returns 0.
 */
static inline int
pnfs_report_layoutstat(struct inode *inode)
{
return 0;
}
#endif
#endif /* FS_NFS_PNFS_H */

View File

@ -1290,6 +1290,7 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr,
static void nfs_redirty_request(struct nfs_page *req)
{
nfs_mark_request_dirty(req);
set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags);
nfs_unlock_request(req);
nfs_end_page_writeback(req);
nfs_release_request(req);
@ -1348,11 +1349,6 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata)
NFS_PROTO(data->inode)->commit_rpc_prepare(task, data);
}
static void nfs_writeback_release_common(struct nfs_pgio_header *hdr)
{
/* do nothing! */
}
/*
* Special version of should_remove_suid() that ignores capabilities.
*/
@ -1556,7 +1552,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
/* Set up the initial task struct. */
nfs_ops->commit_setup(data, &msg);
dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
dprintk("NFS: initiated commit call\n");
nfs4_state_protect(NFS_SERVER(data->inode)->nfs_client,
NFS_SP4_MACH_CRED_COMMIT, &task_setup_data.rpc_client, &msg);
@ -2013,7 +2009,6 @@ static const struct nfs_rw_ops nfs_rw_write_ops = {
.rw_mode = FMODE_WRITE,
.rw_alloc_header = nfs_writehdr_alloc,
.rw_free_header = nfs_writehdr_free,
.rw_release = nfs_writeback_release_common,
.rw_done = nfs_writeback_done,
.rw_result = nfs_writeback_result,
.rw_initiate = nfs_initiate_write,

View File

@ -500,6 +500,7 @@ enum {
NFSPROC4_CLNT_SEEK,
NFSPROC4_CLNT_ALLOCATE,
NFSPROC4_CLNT_DEALLOCATE,
NFSPROC4_CLNT_LAYOUTSTATS,
};
/* nfs41 types */

View File

@ -219,6 +219,7 @@ struct nfs_inode {
#define NFS_INO_COMMIT (7) /* inode is committing unstable writes */
#define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */
#define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */
#define NFS_INO_LAYOUTSTATS (11) /* layoutstats inflight */
static inline struct nfs_inode *NFS_I(const struct inode *inode)
{

View File

@ -237,5 +237,6 @@ struct nfs_server {
#define NFS_CAP_SEEK (1U << 19)
#define NFS_CAP_ALLOCATE (1U << 20)
#define NFS_CAP_DEALLOCATE (1U << 21)
#define NFS_CAP_LAYOUTSTATS (1U << 22)
#endif

View File

@ -67,7 +67,6 @@ struct nfs_rw_ops {
const fmode_t rw_mode;
struct nfs_pgio_header *(*rw_alloc_header)(void);
void (*rw_free_header)(struct nfs_pgio_header *);
void (*rw_release)(struct nfs_pgio_header *);
int (*rw_done)(struct rpc_task *, struct nfs_pgio_header *,
struct inode *);
void (*rw_result)(struct rpc_task *, struct nfs_pgio_header *);

View File

@ -316,6 +316,49 @@ struct nfs4_layoutreturn {
int rpc_status;
};
#define PNFS_LAYOUTSTATS_MAXSIZE 256
struct nfs42_layoutstat_args;
struct nfs42_layoutstat_devinfo;
typedef void (*layoutstats_encode_t)(struct xdr_stream *,
struct nfs42_layoutstat_args *,
struct nfs42_layoutstat_devinfo *);
/* Per file per deviceid layoutstats */
/*
 * One set of I/O counters for a single device ID. struct
 * nfs42_layoutstat_args refers to these records through its devinfo
 * pointer.
 */
struct nfs42_layoutstat_devinfo {
/* Device ID the counters below are reported against. */
struct nfs4_deviceid dev_id;
/* NOTE(review): presumably the byte range of the file that the
 * counters cover - confirm against the LAYOUTSTATS XDR encoder. */
__u64 offset;
__u64 length;
/* Read and write counters: going by the names, an operation count and
 * a byte count for each direction. */
__u64 read_count;
__u64 read_bytes;
__u64 write_count;
__u64 write_bytes;
/* NOTE(review): presumably a pNFS layout type value - confirm. */
__u32 layout_type;
/* Callback with the layoutstats_encode_t signature: it is handed the
 * XDR stream, the enclosing nfs42_layoutstat_args and a devinfo. */
layoutstats_encode_t layoutstats_encode;
/* Opaque pointer; presumably private data for whoever supplies
 * layoutstats_encode - confirm with the layout driver. */
void *layout_private;
};
/*
 * Argument block for a layoutstats request (going by the name; the RPC
 * procedure that consumes it is not visible here).
 */
struct nfs42_layoutstat_args {
/* Sequence arguments carried with the request. */
struct nfs4_sequence_args seq_args;
/* File handle and inode the statistics refer to. */
struct nfs_fh *fh;
struct inode *inode;
/* NOTE(review): presumably the layout stateid - confirm with the caller. */
nfs4_stateid stateid;
/* NOTE(review): presumably the number of entries that devinfo points
 * to - confirm with the code that fills this structure. */
int num_dev;
struct nfs42_layoutstat_devinfo *devinfo;
};
/*
 * Result block paired with struct nfs42_layoutstat_args (going by the
 * name; the decoder that fills it is not visible here).
 */
struct nfs42_layoutstat_res {
/* Sequence results returned with the reply. */
struct nfs4_sequence_res seq_res;
/* NOTE(review): presumably mirrors nfs42_layoutstat_args.num_dev - confirm. */
int num_dev;
/* NOTE(review): presumably the status of the RPC - confirm with the
 * completion callback. */
int rpc_status;
};
/*
 * Bundles the argument and result blocks of one layoutstats request
 * together with an inode pointer, so that all three travel as a single
 * object.
 */
struct nfs42_layoutstat_data {
/* NOTE(review): whether a reference is held on this inode cannot be
 * seen from here - confirm with the allocation/release code. */
struct inode *inode;
struct nfs42_layoutstat_args args;
struct nfs42_layoutstat_res res;
};
struct stateowner_id {
__u64 create_time;
__u32 uniquifier;
@ -984,17 +1027,14 @@ struct nfs4_readlink_res {
struct nfs4_sequence_res seq_res;
};
#define NFS4_SETCLIENTID_NAMELEN (127)
struct nfs4_setclientid {
const nfs4_verifier * sc_verifier;
unsigned int sc_name_len;
char sc_name[NFS4_SETCLIENTID_NAMELEN + 1];
u32 sc_prog;
unsigned int sc_netid_len;
char sc_netid[RPCBIND_MAXNETIDLEN + 1];
unsigned int sc_uaddr_len;
char sc_uaddr[RPCBIND_MAXUADDRLEN + 1];
u32 sc_cb_ident;
struct nfs_client *sc_clnt;
struct rpc_cred *sc_cred;
};
@ -1142,12 +1182,9 @@ struct nfs41_state_protection {
struct nfs4_op_map allow;
};
#define NFS4_EXCHANGE_ID_LEN (48)
struct nfs41_exchange_id_args {
struct nfs_client *client;
nfs4_verifier *verifier;
unsigned int id_len;
char id[NFS4_EXCHANGE_ID_LEN];
u32 flags;
struct nfs41_state_protection state_protect;
};

View File

@ -37,7 +37,6 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied);
void xprt_free_bc_request(struct rpc_rqst *req);
int xprt_setup_backchannel(struct rpc_xprt *, unsigned int min_reqs);
void xprt_destroy_backchannel(struct rpc_xprt *, unsigned int max_reqs);
int bc_send(struct rpc_rqst *req);
/*
* Determine if a shared backchannel is in use

View File

@ -56,6 +56,7 @@ struct rpc_clnt {
struct rpc_rtt * cl_rtt; /* RTO estimator data */
const struct rpc_timeout *cl_timeout; /* Timeout strategy */
atomic_t cl_swapper; /* swapfile count */
int cl_nodelen; /* nodename length */
char cl_nodename[UNX_MAXNODENAME+1];
struct rpc_pipe_dir_head cl_pipedir_objects;

View File

@ -205,8 +205,7 @@ struct rpc_wait_queue {
*/
struct rpc_task *rpc_new_task(const struct rpc_task_setup *);
struct rpc_task *rpc_run_task(const struct rpc_task_setup *);
struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req,
const struct rpc_call_ops *ops);
struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req);
void rpc_put_task(struct rpc_task *);
void rpc_put_task_async(struct rpc_task *);
void rpc_exit_task(struct rpc_task *);
@ -269,4 +268,20 @@ static inline void rpc_assign_waitqueue_name(struct rpc_wait_queue *q,
}
#endif
/*
 * Swap-over-RPC entry points. With CONFIG_SUNRPC_SWAP enabled these are
 * real functions declared here; otherwise the inline stubs below are used.
 */
#if IS_ENABLED(CONFIG_SUNRPC_SWAP)
int rpc_clnt_swap_activate(struct rpc_clnt *clnt);
void rpc_clnt_swap_deactivate(struct rpc_clnt *clnt);
#else
/* Stub: swap support is compiled out, so activation always fails. */
static inline int
rpc_clnt_swap_activate(struct rpc_clnt *clnt)
{
return -EINVAL;
}
/* Stub: nothing to undo when swap support is compiled out. */
static inline void
rpc_clnt_swap_deactivate(struct rpc_clnt *clnt)
{
}
#endif /* CONFIG_SUNRPC_SWAP */
#endif /* _LINUX_SUNRPC_SCHED_H_ */

View File

@ -133,6 +133,9 @@ struct rpc_xprt_ops {
void (*close)(struct rpc_xprt *xprt);
void (*destroy)(struct rpc_xprt *xprt);
void (*print_stats)(struct rpc_xprt *xprt, struct seq_file *seq);
int (*enable_swap)(struct rpc_xprt *xprt);
void (*disable_swap)(struct rpc_xprt *xprt);
void (*inject_disconnect)(struct rpc_xprt *xprt);
};
/*
@ -180,7 +183,7 @@ struct rpc_xprt {
atomic_t num_reqs; /* total slots */
unsigned long state; /* transport state */
unsigned char resvport : 1; /* use a reserved port */
unsigned int swapper; /* we're swapping over this
atomic_t swapper; /* we're swapping over this
transport */
unsigned int bind_index; /* bind function index */
@ -212,7 +215,8 @@ struct rpc_xprt {
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
struct svc_serv *bc_serv; /* The RPC service which will */
/* process the callback */
unsigned int bc_alloc_count; /* Total number of preallocs */
int bc_alloc_count; /* Total number of preallocs */
atomic_t bc_free_slots;
spinlock_t bc_pa_lock; /* Protects the preallocated
* items */
struct list_head bc_pa_list; /* List of preallocated
@ -241,6 +245,7 @@ struct rpc_xprt {
const char *address_strings[RPC_DISPLAY_MAX];
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
struct dentry *debugfs; /* debugfs directory */
atomic_t inject_disconnect;
#endif
};
@ -327,6 +332,18 @@ static inline __be32 *xprt_skip_transport_header(struct rpc_xprt *xprt, __be32 *
return p + xprt->tsh_size;
}
/*
 * Forward a "start swapping over this transport" request to the
 * transport's ->enable_swap method and hand back its return value.
 */
static inline int
xprt_enable_swap(struct rpc_xprt *xprt)
{
	int status;

	status = xprt->ops->enable_swap(xprt);
	return status;
}
/*
 * Forward a "stop swapping over this transport" request to the
 * transport's ->disable_swap method.
 */
static inline void
xprt_disable_swap(struct rpc_xprt *xprt)
{
	(xprt->ops->disable_swap)(xprt);
}
/*
* Transport switch helper functions
*/
@ -345,7 +362,6 @@ void xprt_release_rqst_cong(struct rpc_task *task);
void xprt_disconnect_done(struct rpc_xprt *xprt);
void xprt_force_disconnect(struct rpc_xprt *xprt);
void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie);
int xs_swapper(struct rpc_xprt *xprt, int enable);
bool xprt_lock_connect(struct rpc_xprt *, struct rpc_task *, void *);
void xprt_unlock_connect(struct rpc_xprt *, void *);
@ -431,6 +447,23 @@ static inline int xprt_test_and_set_binding(struct rpc_xprt *xprt)
return test_and_set_bit(XPRT_BINDING, &xprt->state);
}
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
extern unsigned int rpc_inject_disconnect;

/*
 * Fault-injection hook. Each call counts the per-transport
 * inject_disconnect counter down by one; the call that brings it to zero
 * reloads the counter from rpc_inject_disconnect and invokes the
 * transport's ->inject_disconnect method. Nothing happens while
 * rpc_inject_disconnect is zero.
 */
static inline void xprt_inject_disconnect(struct rpc_xprt *xprt)
{
	if (rpc_inject_disconnect == 0)
		return;

	if (atomic_dec_return(&xprt->inject_disconnect) == 0) {
		atomic_set(&xprt->inject_disconnect, rpc_inject_disconnect);
		xprt->ops->inject_disconnect(xprt);
	}
}
#else
/* Fault injection is compiled out: the hook is a no-op. */
static inline void xprt_inject_disconnect(struct rpc_xprt *xprt)
{
}
#endif
#endif /* __KERNEL__*/
#endif /* _LINUX_SUNRPC_XPRT_H */

View File

@ -56,7 +56,8 @@
#define RPCRDMA_INLINE_PAD_THRESH (512)/* payload threshold to pad (bytes) */
/* memory registration strategies */
/* Memory registration strategies, by number.
* This is part of a kernel / user space API. Do not remove. */
enum rpcrdma_memreg {
RPCRDMA_BOUNCEBUFFERS = 0,
RPCRDMA_REGISTER,

View File

@ -14,6 +14,6 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
sunrpc_syms.o cache.o rpc_pipe.o \
svc_xprt.o
sunrpc-$(CONFIG_SUNRPC_DEBUG) += debugfs.o
sunrpc-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel_rqst.o bc_svc.o
sunrpc-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel_rqst.o
sunrpc-$(CONFIG_PROC_FS) += stats.o
sunrpc-$(CONFIG_SYSCTL) += sysctl.o

View File

@ -37,16 +37,18 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
static inline int xprt_need_to_requeue(struct rpc_xprt *xprt)
{
return xprt->bc_alloc_count > 0;
return xprt->bc_alloc_count < atomic_read(&xprt->bc_free_slots);
}
/*
 * Account for @n additional backchannel preallocations: both the atomic
 * free-slot counter and the plain allocation counter grow by @n.
 */
static inline void xprt_inc_alloc_count(struct rpc_xprt *xprt, unsigned int n)
{
	atomic_add(n, &xprt->bc_free_slots);
	xprt->bc_alloc_count = xprt->bc_alloc_count + n;
}
/*
 * Remove @n backchannel preallocations from both counters and return the
 * allocation count that remains afterwards.
 */
static inline int xprt_dec_alloc_count(struct rpc_xprt *xprt, unsigned int n)
{
	atomic_sub(n, &xprt->bc_free_slots);
	xprt->bc_alloc_count -= n;
	return xprt->bc_alloc_count;
}
@ -60,13 +62,62 @@ static void xprt_free_allocation(struct rpc_rqst *req)
dprintk("RPC: free allocations for req= %p\n", req);
WARN_ON_ONCE(test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state));
xbufp = &req->rq_private_buf;
xbufp = &req->rq_rcv_buf;
free_page((unsigned long)xbufp->head[0].iov_base);
xbufp = &req->rq_snd_buf;
free_page((unsigned long)xbufp->head[0].iov_base);
kfree(req);
}
/*
 * Back @buf with one freshly allocated page.
 *
 * The page becomes the head kvec (iov_len and buflen are PAGE_SIZE); the
 * tail kvec is cleared and page_len and len are reset to zero.
 *
 * Returns 0 on success or -ENOMEM if no page could be allocated.
 */
static int xprt_alloc_xdr_buf(struct xdr_buf *buf, gfp_t gfp_flags)
{
	struct page *pg = alloc_page(gfp_flags);

	if (!pg)
		return -ENOMEM;

	/* Head: the whole page. */
	buf->head[0].iov_base = page_address(pg);
	buf->head[0].iov_len = PAGE_SIZE;

	/* Tail: unused. */
	buf->tail[0].iov_base = NULL;
	buf->tail[0].iov_len = 0;

	buf->page_len = 0;
	buf->len = 0;
	buf->buflen = PAGE_SIZE;
	return 0;
}
/*
 * Allocate one backchannel rpc_rqst for @xprt together with a one-page
 * receive buffer and a one-page send buffer.
 *
 * Returns the new request, or NULL if any allocation failed. On a partial
 * failure everything obtained so far is handed to xprt_free_allocation().
 */
static
struct rpc_rqst *xprt_alloc_bc_req(struct rpc_xprt *xprt, gfp_t gfp_flags)
{
	struct rpc_rqst *rqst = kzalloc(sizeof(*rqst), gfp_flags);

	if (!rqst)
		return NULL;

	rqst->rq_xprt = xprt;
	INIT_LIST_HEAD(&rqst->rq_list);
	INIT_LIST_HEAD(&rqst->rq_bc_list);

	/* Receive side first, then the send side. */
	if (xprt_alloc_xdr_buf(&rqst->rq_rcv_buf, gfp_flags) >= 0) {
		/* The receive buffer is marked as holding a full page. */
		rqst->rq_rcv_buf.len = PAGE_SIZE;

		if (xprt_alloc_xdr_buf(&rqst->rq_snd_buf, gfp_flags) >= 0)
			return rqst;
		printk(KERN_ERR "Failed to create bc snd xbuf\n");
	} else {
		printk(KERN_ERR "Failed to create bc receive xbuf\n");
	}

	xprt_free_allocation(rqst);
	return NULL;
}
/*
* Preallocate up to min_reqs structures and related buffers for use
* by the backchannel. This function can be called multiple times
@ -87,9 +138,7 @@ static void xprt_free_allocation(struct rpc_rqst *req)
*/
int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs)
{
struct page *page_rcv = NULL, *page_snd = NULL;
struct xdr_buf *xbufp = NULL;
struct rpc_rqst *req, *tmp;
struct rpc_rqst *req;
struct list_head tmp_list;
int i;
@ -106,7 +155,7 @@ int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs)
INIT_LIST_HEAD(&tmp_list);
for (i = 0; i < min_reqs; i++) {
/* Pre-allocate one backchannel rpc_rqst */
req = kzalloc(sizeof(struct rpc_rqst), GFP_KERNEL);
req = xprt_alloc_bc_req(xprt, GFP_KERNEL);
if (req == NULL) {
printk(KERN_ERR "Failed to create bc rpc_rqst\n");
goto out_free;
@ -115,41 +164,6 @@ int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs)
/* Add the allocated buffer to the tmp list */
dprintk("RPC: adding req= %p\n", req);
list_add(&req->rq_bc_pa_list, &tmp_list);
req->rq_xprt = xprt;
INIT_LIST_HEAD(&req->rq_list);
INIT_LIST_HEAD(&req->rq_bc_list);
/* Preallocate one XDR receive buffer */
page_rcv = alloc_page(GFP_KERNEL);
if (page_rcv == NULL) {
printk(KERN_ERR "Failed to create bc receive xbuf\n");
goto out_free;
}
xbufp = &req->rq_rcv_buf;
xbufp->head[0].iov_base = page_address(page_rcv);
xbufp->head[0].iov_len = PAGE_SIZE;
xbufp->tail[0].iov_base = NULL;
xbufp->tail[0].iov_len = 0;
xbufp->page_len = 0;
xbufp->len = PAGE_SIZE;
xbufp->buflen = PAGE_SIZE;
/* Preallocate one XDR send buffer */
page_snd = alloc_page(GFP_KERNEL);
if (page_snd == NULL) {
printk(KERN_ERR "Failed to create bc snd xbuf\n");
goto out_free;
}
xbufp = &req->rq_snd_buf;
xbufp->head[0].iov_base = page_address(page_snd);
xbufp->head[0].iov_len = 0;
xbufp->tail[0].iov_base = NULL;
xbufp->tail[0].iov_len = 0;
xbufp->page_len = 0;
xbufp->len = 0;
xbufp->buflen = PAGE_SIZE;
}
/*
@ -167,7 +181,10 @@ int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs)
/*
* Memory allocation failed, free the temporary list
*/
list_for_each_entry_safe(req, tmp, &tmp_list, rq_bc_pa_list) {
while (!list_empty(&tmp_list)) {
req = list_first_entry(&tmp_list,
struct rpc_rqst,
rq_bc_pa_list);
list_del(&req->rq_bc_pa_list);
xprt_free_allocation(req);
}
@ -217,9 +234,15 @@ static struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt, __be32 xid)
struct rpc_rqst *req = NULL;
dprintk("RPC: allocate a backchannel request\n");
if (list_empty(&xprt->bc_pa_list))
if (atomic_read(&xprt->bc_free_slots) <= 0)
goto not_found;
if (list_empty(&xprt->bc_pa_list)) {
req = xprt_alloc_bc_req(xprt, GFP_ATOMIC);
if (!req)
goto not_found;
/* Note: this 'free' request adds it to xprt->bc_pa_list */
xprt_free_bc_request(req);
}
req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst,
rq_bc_pa_list);
req->rq_reply_bytes_recvd = 0;
@ -245,11 +268,21 @@ void xprt_free_bc_request(struct rpc_rqst *req)
req->rq_connect_cookie = xprt->connect_cookie - 1;
smp_mb__before_atomic();
WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state));
clear_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
smp_mb__after_atomic();
if (!xprt_need_to_requeue(xprt)) {
/*
* Return it to the list of preallocations so that it
* may be reused by a new callback request.
*/
spin_lock_bh(&xprt->bc_pa_lock);
if (xprt_need_to_requeue(xprt)) {
list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list);
xprt->bc_alloc_count++;
req = NULL;
}
spin_unlock_bh(&xprt->bc_pa_lock);
if (req != NULL) {
/*
* The last remaining session was destroyed while this
* entry was in use. Free the entry and don't attempt
@ -260,14 +293,6 @@ void xprt_free_bc_request(struct rpc_rqst *req)
xprt_free_allocation(req);
return;
}
/*
* Return it to the list of preallocations so that it
* may be reused by a new callback request.
*/
spin_lock_bh(&xprt->bc_pa_lock);
list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list);
spin_unlock_bh(&xprt->bc_pa_lock);
}
/*
@ -311,6 +336,7 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied)
spin_lock(&xprt->bc_pa_lock);
list_del(&req->rq_bc_pa_list);
xprt->bc_alloc_count--;
spin_unlock(&xprt->bc_pa_lock);
req->rq_private_buf.len = copied;

View File

@ -1,63 +0,0 @@
/******************************************************************************
(c) 2007 Network Appliance, Inc. All Rights Reserved.
(c) 2009 NetApp. All Rights Reserved.
NetApp provides this source code under the GPL v2 License.
The GPL v2 license is available at
http://opensource.org/licenses/gpl-license.php.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
/*
* The NFSv4.1 callback service helper routines.
* They implement the transport level processing required to send the
* reply over an existing open connection previously established by the client.
*/
#include <linux/module.h>
#include <linux/sunrpc/xprt.h>
#include <linux/sunrpc/sched.h>
#include <linux/sunrpc/bc_xprt.h>
#define RPCDBG_FACILITY RPCDBG_SVCDSP
/* Empty callback ops */
static const struct rpc_call_ops nfs41_callback_ops = {
};
/*
 * Send the callback reply held in @req.
 *
 * Runs a backchannel RPC task for the request and returns either the
 * error encoded in the task pointer or the task's final tk_status.
 */
int bc_send(struct rpc_rqst *req)
{
	struct rpc_task *bc_task;
	int status;

	dprintk("RPC: bc_send req= %p\n", req);
	bc_task = rpc_run_bc_task(req, &nfs41_callback_ops);
	if (!IS_ERR(bc_task)) {
		/* We are expected to hold the only reference at this point. */
		WARN_ON_ONCE(atomic_read(&bc_task->tk_count) != 1);
		status = bc_task->tk_status;
		rpc_put_task(bc_task);
	} else {
		status = PTR_ERR(bc_task);
	}
	dprintk("RPC: bc_send ret= %d\n", status);
	return status;
}

View File

@ -891,15 +891,8 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
task->tk_flags |= RPC_TASK_SOFT;
if (clnt->cl_noretranstimeo)
task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT;
if (sk_memalloc_socks()) {
struct rpc_xprt *xprt;
rcu_read_lock();
xprt = rcu_dereference(clnt->cl_xprt);
if (xprt->swapper)
task->tk_flags |= RPC_TASK_SWAPPER;
rcu_read_unlock();
}
if (atomic_read(&clnt->cl_swapper))
task->tk_flags |= RPC_TASK_SWAPPER;
/* Add to the client's list of all tasks */
spin_lock(&clnt->cl_lock);
list_add_tail(&task->tk_task, &clnt->cl_tasks);
@ -1031,15 +1024,14 @@ EXPORT_SYMBOL_GPL(rpc_call_async);
* rpc_run_bc_task - Allocate a new RPC task for backchannel use, then run
* rpc_execute against it
* @req: RPC request
* @tk_ops: RPC call ops
*/
struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req,
const struct rpc_call_ops *tk_ops)
struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req)
{
struct rpc_task *task;
struct xdr_buf *xbufp = &req->rq_snd_buf;
struct rpc_task_setup task_setup_data = {
.callback_ops = tk_ops,
.callback_ops = &rpc_default_ops,
.flags = RPC_TASK_SOFTCONN,
};
dprintk("RPC: rpc_run_bc_task req= %p\n", req);
@ -1614,6 +1606,7 @@ call_allocate(struct rpc_task *task)
req->rq_callsize + req->rq_rcvsize);
if (req->rq_buffer != NULL)
return;
xprt_inject_disconnect(xprt);
dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid);
@ -1951,33 +1944,36 @@ call_bc_transmit(struct rpc_task *task)
{
struct rpc_rqst *req = task->tk_rqstp;
if (!xprt_prepare_transmit(task)) {
/*
* Could not reserve the transport. Try again after the
* transport is released.
*/
task->tk_status = 0;
task->tk_action = call_bc_transmit;
return;
}
if (!xprt_prepare_transmit(task))
goto out_retry;
task->tk_action = rpc_exit_task;
if (task->tk_status < 0) {
printk(KERN_NOTICE "RPC: Could not send backchannel reply "
"error: %d\n", task->tk_status);
return;
goto out_done;
}
if (req->rq_connect_cookie != req->rq_xprt->connect_cookie)
req->rq_bytes_sent = 0;
xprt_transmit(task);
if (task->tk_status == -EAGAIN)
goto out_nospace;
xprt_end_transmit(task);
dprint_status(task);
switch (task->tk_status) {
case 0:
/* Success */
break;
case -EHOSTDOWN:
case -EHOSTUNREACH:
case -ENETUNREACH:
case -ECONNRESET:
case -ECONNREFUSED:
case -EADDRINUSE:
case -ENOTCONN:
case -EPIPE:
break;
case -ETIMEDOUT:
/*
* Problem reaching the server. Disconnect and let the
@ -2002,6 +1998,13 @@ call_bc_transmit(struct rpc_task *task)
break;
}
rpc_wake_up_queued_task(&req->rq_xprt->pending, task);
out_done:
task->tk_action = rpc_exit_task;
return;
out_nospace:
req->rq_connect_cookie = req->rq_xprt->connect_cookie;
out_retry:
task->tk_status = 0;
}
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
@ -2476,3 +2479,59 @@ void rpc_show_tasks(struct net *net)
spin_unlock(&sn->rpc_client_lock);
}
#endif
#if IS_ENABLED(CONFIG_SUNRPC_SWAP)
/**
 * rpc_clnt_swap_activate - note that an RPC client is being used for swap
 * @clnt: RPC client to mark
 *
 * cl_swapper counts activations. Only the caller that takes the count
 * from zero to one asks the client's transport to enable swap; later
 * callers just bump the count.
 *
 * Returns zero on success, or the error from xprt_enable_swap(). On error
 * the count taken here is dropped again, so the client is not left marked
 * as swapping and a later activation will retry the transport enable.
 */
int
rpc_clnt_swap_activate(struct rpc_clnt *clnt)
{
	int ret = 0;
	struct rpc_xprt *xprt;

	if (atomic_inc_return(&clnt->cl_swapper) == 1) {
retry:
		rcu_read_lock();
		xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
		rcu_read_unlock();
		if (!xprt) {
			/*
			 * If we didn't get a reference, then we likely are
			 * racing with a migration event. Wait for a grace
			 * period and try again.
			 */
			synchronize_rcu();
			goto retry;
		}

		ret = xprt_enable_swap(xprt);
		xprt_put(xprt);

		/*
		 * The transport refused: undo our increment. The
		 * "if positive" form keeps the count from going negative
		 * should a deactivate have slipped in meanwhile.
		 */
		if (ret)
			atomic_dec_if_positive(&clnt->cl_swapper);
	}
	return ret;
}
EXPORT_SYMBOL_GPL(rpc_clnt_swap_activate);
/*
 * Drop one swap activation from @clnt. The caller that brings
 * cl_swapper down to zero asks the client's transport to disable swap;
 * every other call returns without touching the transport.
 */
void
rpc_clnt_swap_deactivate(struct rpc_clnt *clnt)
{
	struct rpc_xprt *xprt;

	if (atomic_dec_if_positive(&clnt->cl_swapper) != 0)
		return;

	for (;;) {
		rcu_read_lock();
		xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
		rcu_read_unlock();
		if (xprt)
			break;
		/*
		 * No reference obtained: most likely a migration event is
		 * in progress. Let a grace period pass, then look again.
		 */
		synchronize_rcu();
	}

	xprt_disable_swap(xprt);
	xprt_put(xprt);
}
EXPORT_SYMBOL_GPL(rpc_clnt_swap_deactivate);
#endif /* CONFIG_SUNRPC_SWAP */

View File

@ -10,9 +10,12 @@
#include "netns.h"
static struct dentry *topdir;
static struct dentry *rpc_fault_dir;
static struct dentry *rpc_clnt_dir;
static struct dentry *rpc_xprt_dir;
unsigned int rpc_inject_disconnect;
struct rpc_clnt_iter {
struct rpc_clnt *clnt;
loff_t pos;
@ -257,6 +260,8 @@ rpc_xprt_debugfs_register(struct rpc_xprt *xprt)
debugfs_remove_recursive(xprt->debugfs);
xprt->debugfs = NULL;
}
atomic_set(&xprt->inject_disconnect, rpc_inject_disconnect);
}
void
@ -266,11 +271,79 @@ rpc_xprt_debugfs_unregister(struct rpc_xprt *xprt)
xprt->debugfs = NULL;
}
/*
 * open() handler for the fault-injection files: attach a 128-byte scratch
 * buffer to the file. Returns -ENOMEM if the buffer cannot be allocated.
 */
static int
fault_open(struct inode *inode, struct file *filp)
{
	void *scratch = kmalloc(128, GFP_KERNEL);

	filp->private_data = scratch;
	return scratch ? 0 : -ENOMEM;
}
/*
 * release() handler for the fault-injection files: free the buffer kept
 * in the file's private_data. Always returns 0.
 */
static int
fault_release(struct inode *inode, struct file *filp)
{
	void *scratch = filp->private_data;

	kfree(scratch);
	return 0;
}
/*
 * read() handler for the "disconnect" file: format the current value of
 * rpc_inject_disconnect as a decimal line into the file's private buffer
 * and copy the requested part of it to user space.
 */
static ssize_t
fault_disconnect_read(struct file *filp, char __user *user_buf,
		      size_t len, loff_t *offset)
{
	char *text = (char *)filp->private_data;
	size_t text_len = sprintf(text, "%u\n", rpc_inject_disconnect);

	return simple_read_from_buffer(user_buf, len, offset, text, text_len);
}
/*
 * write() handler for the "disconnect" file: parse the user's text as an
 * unsigned decimal number and store it in rpc_inject_disconnect.
 *
 * Returns the number of bytes consumed (always @len) on success,
 * -EFAULT if the user buffer cannot be read, or -EINVAL if the text is
 * not a valid unsigned int or does not fit the local buffer.
 */
static ssize_t
fault_disconnect_write(struct file *filp, const char __user *user_buf,
		       size_t len, loff_t *offset)
{
	char buffer[16];

	/*
	 * Refuse over-long input instead of truncating it. A truncated
	 * write is reported as a short write, so user space would send
	 * the remainder in a second write and that tail would be parsed
	 * as a number of its own.
	 */
	if (len >= sizeof(buffer))
		return -EINVAL;
	if (copy_from_user(buffer, user_buf, len))
		return -EFAULT;
	buffer[len] = '\0';
	if (kstrtouint(buffer, 10, &rpc_inject_disconnect))
		return -EINVAL;
	return len;
}
/* File operations for the "disconnect" fault-injection file. */
static const struct file_operations fault_disconnect_fops = {
	.owner		= THIS_MODULE,
	/* The scratch buffer is set up on open and freed on release. */
	.open		= fault_open,
	.release	= fault_release,
	/* Report and update rpc_inject_disconnect. */
	.read		= fault_disconnect_read,
	.write		= fault_disconnect_write,
};
/*
 * Create the "inject_fault" directory under @topdir and populate it with
 * the "disconnect" file.
 *
 * Returns the new directory, or NULL if either the directory or the file
 * could not be created.
 */
static struct dentry *
inject_fault_dir(struct dentry *topdir)
{
	struct dentry *dir = debugfs_create_dir("inject_fault", topdir);

	if (dir == NULL)
		return NULL;

	if (debugfs_create_file("disconnect", S_IFREG | S_IRUSR, dir,
				NULL, &fault_disconnect_fops) == NULL)
		return NULL;

	return dir;
}
/*
 * Remove the whole sunrpc debugfs tree and forget every cached dentry
 * pointer into it.
 */
void __exit
sunrpc_debugfs_exit(void)
{
	debugfs_remove_recursive(topdir);

	/* The subdirectories were removed together with topdir. */
	rpc_xprt_dir = NULL;
	rpc_clnt_dir = NULL;
	rpc_fault_dir = NULL;
	topdir = NULL;
}
@ -282,6 +355,10 @@ sunrpc_debugfs_init(void)
if (!topdir)
return;
rpc_fault_dir = inject_fault_dir(topdir);
if (!rpc_fault_dir)
goto out_remove;
rpc_clnt_dir = debugfs_create_dir("rpc_clnt", topdir);
if (!rpc_clnt_dir)
goto out_remove;
@ -294,5 +371,6 @@ sunrpc_debugfs_init(void)
out_remove:
debugfs_remove_recursive(topdir);
topdir = NULL;
rpc_fault_dir = NULL;
rpc_clnt_dir = NULL;
}

View File

@ -1350,6 +1350,11 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
{
struct kvec *argv = &rqstp->rq_arg.head[0];
struct kvec *resv = &rqstp->rq_res.head[0];
struct rpc_task *task;
int proc_error;
int error;
dprintk("svc: %s(%p)\n", __func__, req);
/* Build the svc_rqst used by the common processing routine */
rqstp->rq_xprt = serv->sv_bc_xprt;
@ -1372,21 +1377,36 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
/*
* Skip the next two words because they've already been
* processed in the trasport
* processed in the transport
*/
svc_getu32(argv); /* XID */
svc_getnl(argv); /* CALLDIR */
/* Returns 1 for send, 0 for drop */
if (svc_process_common(rqstp, argv, resv)) {
memcpy(&req->rq_snd_buf, &rqstp->rq_res,
sizeof(req->rq_snd_buf));
return bc_send(req);
} else {
/* drop request */
/* Parse and execute the bc call */
proc_error = svc_process_common(rqstp, argv, resv);
atomic_inc(&req->rq_xprt->bc_free_slots);
if (!proc_error) {
/* Processing error: drop the request */
xprt_free_bc_request(req);
return 0;
}
/* Finally, send the reply synchronously */
memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf));
task = rpc_run_bc_task(req);
if (IS_ERR(task)) {
error = PTR_ERR(task);
goto out;
}
WARN_ON_ONCE(atomic_read(&task->tk_count) != 1);
error = task->tk_status;
rpc_put_task(task);
out:
dprintk("svc: %s(), error=%d\n", __func__, error);
return error;
}
EXPORT_SYMBOL_GPL(bc_svc_process);
#endif /* CONFIG_SUNRPC_BACKCHANNEL */

View File

@ -68,6 +68,7 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net);
static void xprt_request_init(struct rpc_task *, struct rpc_xprt *);
static void xprt_connect_status(struct rpc_task *task);
static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *);
static void __xprt_put_cong(struct rpc_xprt *, struct rpc_rqst *);
static void xprt_destroy(struct rpc_xprt *xprt);
static DEFINE_SPINLOCK(xprt_list_lock);
@ -250,6 +251,8 @@ int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task)
}
xprt_clear_locked(xprt);
out_sleep:
if (req)
__xprt_put_cong(xprt, req);
dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt);
task->tk_timeout = 0;
task->tk_status = -EAGAIN;
@ -608,8 +611,8 @@ static void xprt_autoclose(struct work_struct *work)
struct rpc_xprt *xprt =
container_of(work, struct rpc_xprt, task_cleanup);
xprt->ops->close(xprt);
clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
xprt->ops->close(xprt);
xprt_release_write(xprt, NULL);
}
@ -967,6 +970,7 @@ void xprt_transmit(struct rpc_task *task)
task->tk_status = status;
return;
}
xprt_inject_disconnect(xprt);
dprintk("RPC: %5u xmit complete\n", task->tk_pid);
task->tk_flags |= RPC_TASK_SENT;
@ -1285,6 +1289,7 @@ void xprt_release(struct rpc_task *task)
spin_unlock_bh(&xprt->transport_lock);
if (req->rq_buffer)
xprt->ops->buf_free(req->rq_buffer);
xprt_inject_disconnect(xprt);
if (req->rq_cred != NULL)
put_rpccred(req->rq_cred);
task->tk_rqstp = NULL;

View File

@ -11,6 +11,21 @@
* can take tens of usecs to complete.
*/
/* Normal operation
*
* A Memory Region is prepared for RDMA READ or WRITE using the
* ib_map_phys_fmr verb (fmr_op_map). When the RDMA operation is
* finished, the Memory Region is unmapped using the ib_unmap_fmr
* verb (fmr_op_unmap).
*/
/* Transport recovery
*
* After a transport reconnect, fmr_op_map re-uses the MR already
* allocated for the RPC, but generates a fresh rkey then maps the
* MR again. This process is synchronous.
*/
#include "xprt_rdma.h"
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@ -50,19 +65,28 @@ fmr_op_init(struct rpcrdma_xprt *r_xprt)
struct rpcrdma_mw *r;
int i, rc;
spin_lock_init(&buf->rb_mwlock);
INIT_LIST_HEAD(&buf->rb_mws);
INIT_LIST_HEAD(&buf->rb_all);
i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
dprintk("RPC: %s: initializing %d FMRs\n", __func__, i);
i = max_t(int, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES, 1);
i += 2; /* head + tail */
i *= buf->rb_max_requests; /* one set for each RPC slot */
dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i);
rc = -ENOMEM;
while (i--) {
r = kzalloc(sizeof(*r), GFP_KERNEL);
if (!r)
return -ENOMEM;
goto out;
r->r.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr);
if (IS_ERR(r->r.fmr))
r->r.fmr.physaddrs = kmalloc(RPCRDMA_MAX_FMR_SGES *
sizeof(u64), GFP_KERNEL);
if (!r->r.fmr.physaddrs)
goto out_free;
r->r.fmr.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr);
if (IS_ERR(r->r.fmr.fmr))
goto out_fmr_err;
list_add(&r->mw_list, &buf->rb_mws);
@ -71,12 +95,24 @@ fmr_op_init(struct rpcrdma_xprt *r_xprt)
return 0;
out_fmr_err:
rc = PTR_ERR(r->r.fmr);
rc = PTR_ERR(r->r.fmr.fmr);
dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc);
kfree(r->r.fmr.physaddrs);
out_free:
kfree(r);
out:
return rc;
}
static int
__fmr_unmap(struct rpcrdma_mw *r)
{
LIST_HEAD(l);
list_add(&r->r.fmr.fmr->list, &l);
return ib_unmap_fmr(&l);
}
/* Use the ib_map_phys_fmr() verb to register a memory region
* for remote access via RDMA READ or RDMA WRITE.
*/
@ -85,12 +121,24 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
int nsegs, bool writing)
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct ib_device *device = ia->ri_id->device;
struct ib_device *device = ia->ri_device;
enum dma_data_direction direction = rpcrdma_data_dir(writing);
struct rpcrdma_mr_seg *seg1 = seg;
struct rpcrdma_mw *mw = seg1->rl_mw;
u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
int len, pageoff, i, rc;
struct rpcrdma_mw *mw;
mw = seg1->rl_mw;
seg1->rl_mw = NULL;
if (!mw) {
mw = rpcrdma_get_mw(r_xprt);
if (!mw)
return -ENOMEM;
} else {
/* this is a retransmit; generate a fresh rkey */
rc = __fmr_unmap(mw);
if (rc)
return rc;
}
pageoff = offset_in_page(seg1->mr_offset);
seg1->mr_offset -= pageoff; /* start of page */
@ -100,7 +148,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
nsegs = RPCRDMA_MAX_FMR_SGES;
for (i = 0; i < nsegs;) {
rpcrdma_map_one(device, seg, direction);
physaddrs[i] = seg->mr_dma;
mw->r.fmr.physaddrs[i] = seg->mr_dma;
len += seg->mr_len;
++seg;
++i;
@ -110,11 +158,13 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
break;
}
rc = ib_map_phys_fmr(mw->r.fmr, physaddrs, i, seg1->mr_dma);
rc = ib_map_phys_fmr(mw->r.fmr.fmr, mw->r.fmr.physaddrs,
i, seg1->mr_dma);
if (rc)
goto out_maperr;
seg1->mr_rkey = mw->r.fmr->rkey;
seg1->rl_mw = mw;
seg1->mr_rkey = mw->r.fmr.fmr->rkey;
seg1->mr_base = seg1->mr_dma + pageoff;
seg1->mr_nsegs = i;
seg1->mr_len = len;
@ -137,48 +187,28 @@ fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct rpcrdma_mr_seg *seg1 = seg;
struct ib_device *device;
struct rpcrdma_mw *mw = seg1->rl_mw;
int rc, nsegs = seg->mr_nsegs;
LIST_HEAD(l);
list_add(&seg1->rl_mw->r.fmr->list, &l);
rc = ib_unmap_fmr(&l);
read_lock(&ia->ri_qplock);
device = ia->ri_id->device;
dprintk("RPC: %s: FMR %p\n", __func__, mw);
seg1->rl_mw = NULL;
while (seg1->mr_nsegs--)
rpcrdma_unmap_one(device, seg++);
read_unlock(&ia->ri_qplock);
rpcrdma_unmap_one(ia->ri_device, seg++);
rc = __fmr_unmap(mw);
if (rc)
goto out_err;
rpcrdma_put_mw(r_xprt, mw);
return nsegs;
out_err:
/* The FMR is abandoned, but remains in rb_all. fmr_op_destroy
* will attempt to release it when the transport is destroyed.
*/
dprintk("RPC: %s: ib_unmap_fmr status %i\n", __func__, rc);
return nsegs;
}
/* After a disconnect, unmap all FMRs.
*
* This is invoked only in the transport connect worker in order
* to serialize with rpcrdma_register_fmr_external().
*/
static void
fmr_op_reset(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_mw *r;
LIST_HEAD(list);
int rc;
list_for_each_entry(r, &buf->rb_all, mw_all)
list_add(&r->r.fmr->list, &list);
rc = ib_unmap_fmr(&list);
if (rc)
dprintk("RPC: %s: ib_unmap_fmr failed %i\n",
__func__, rc);
}
static void
fmr_op_destroy(struct rpcrdma_buffer *buf)
{
@ -188,10 +218,13 @@ fmr_op_destroy(struct rpcrdma_buffer *buf)
while (!list_empty(&buf->rb_all)) {
r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
list_del(&r->mw_all);
rc = ib_dealloc_fmr(r->r.fmr);
kfree(r->r.fmr.physaddrs);
rc = ib_dealloc_fmr(r->r.fmr.fmr);
if (rc)
dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
__func__, rc);
kfree(r);
}
}
@ -202,7 +235,6 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
.ro_open = fmr_op_open,
.ro_maxpages = fmr_op_maxpages,
.ro_init = fmr_op_init,
.ro_reset = fmr_op_reset,
.ro_destroy = fmr_op_destroy,
.ro_displayname = "fmr",
};

View File

@ -11,12 +11,136 @@
* but most complex memory registration mode.
*/
/* Normal operation
*
* A Memory Region is prepared for RDMA READ or WRITE using a FAST_REG
* Work Request (frwr_op_map). When the RDMA operation is finished, this
* Memory Region is invalidated using a LOCAL_INV Work Request
* (frwr_op_unmap).
*
* Typically these Work Requests are not signaled, and neither are RDMA
* SEND Work Requests (with the exception of signaling occasionally to
* prevent provider work queue overflows). This greatly reduces HCA
* interrupt workload.
*
* As an optimization, frwr_op_unmap marks MRs INVALID before the
* LOCAL_INV WR is posted. If posting succeeds, the MR is placed on
* rb_mws immediately so that no work (like managing a linked list
* under a spinlock) is needed in the completion upcall.
*
* But this means that frwr_op_map() can occasionally encounter an MR
* that is INVALID but the LOCAL_INV WR has not completed. Work Queue
* ordering prevents a subsequent FAST_REG WR from executing against
* that MR while it is still being invalidated.
*/
/* Transport recovery
*
* ->op_map and the transport connect worker cannot run at the same
* time, but ->op_unmap can fire while the transport connect worker
* is running. Thus MR recovery is handled in ->op_map, to guarantee
* that recovered MRs are owned by a sending RPC, and not one where
* ->op_unmap could fire at the same time transport reconnect is
* being done.
*
* When the underlying transport disconnects, MRs are left in one of
* three states:
*
* INVALID: The MR was not in use before the QP entered ERROR state.
* (Or, the LOCAL_INV WR has not completed or flushed yet).
*
* STALE: The MR was being registered or unregistered when the QP
* entered ERROR state, and the pending WR was flushed.
*
* VALID: The MR was registered before the QP entered ERROR state.
*
* When frwr_op_map encounters STALE and VALID MRs, they are recovered
* with ib_dereg_mr and then are re-initialized. Because MR recovery
* allocates fresh resources, it is deferred to a workqueue, and the
* recovered MRs are placed back on the rb_mws list when recovery is
* complete. frwr_op_map allocates another MR for the current RPC while
* the broken MR is reset.
*
* To ensure that frwr_op_map doesn't encounter an MR that is marked
* INVALID but that is about to be flushed due to a previous transport
* disconnect, the transport connect worker attempts to drain all
* pending send queue WRs before the transport is reconnected.
*/
#include "xprt_rdma.h"
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY RPCDBG_TRANS
#endif
static struct workqueue_struct *frwr_recovery_wq;
#define FRWR_RECOVERY_WQ_FLAGS (WQ_UNBOUND | WQ_MEM_RECLAIM)
/* Create the workqueue on which broken FRMRs are reset.
 *
 * The result is stored in the file-scope frwr_recovery_wq pointer.
 *
 * Returns 0 when the workqueue was allocated, -ENOMEM otherwise.
 */
int
frwr_alloc_recovery_wq(void)
{
	struct workqueue_struct *wq;

	wq = alloc_workqueue("frwr_recovery", FRWR_RECOVERY_WQ_FLAGS, 0);
	frwr_recovery_wq = wq;
	if (wq)
		return 0;
	return -ENOMEM;
}
/* Tear down the FRMR recovery workqueue, if one was ever allocated.
 *
 * The file-scope pointer is cleared before the workqueue itself is
 * destroyed. Calling this when no workqueue exists is a no-op.
 */
void
frwr_destroy_recovery_wq(void)
{
	struct workqueue_struct *doomed = frwr_recovery_wq;

	if (doomed == NULL)
		return;

	frwr_recovery_wq = NULL;
	destroy_workqueue(doomed);
}
/* Deferred reset of a single FRMR. Generate a fresh rkey by
* replacing the MR.
*
* There's no recovery if this fails. The FRMR is abandoned, but
* remains in rb_all. It will be cleaned up when the transport is
* destroyed.
*/
static void
__frwr_recovery_worker(struct work_struct *work)
{
struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw,
r.frmr.fr_work);
struct rpcrdma_xprt *r_xprt = r->r.frmr.fr_xprt;
unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
if (ib_dereg_mr(r->r.frmr.fr_mr))
goto out_fail;
r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(pd, depth);
if (IS_ERR(r->r.frmr.fr_mr))
goto out_fail;
dprintk("RPC: %s: recovered FRMR %p\n", __func__, r);
r->r.frmr.fr_state = FRMR_IS_INVALID;
rpcrdma_put_mw(r_xprt, r);
return;
out_fail:
pr_warn("RPC: %s: FRMR %p unrecovered\n",
__func__, r);
}
/* A broken MR was discovered in a context that can't sleep.
 * Defer recovery to the recovery worker.
 *
 * (Re)initializes the work item embedded in @r so that it runs
 * __frwr_recovery_worker, then queues it on frwr_recovery_wq.
 */
static void
__frwr_queue_recovery(struct rpcrdma_mw *r)
{
INIT_WORK(&r->r.frmr.fr_work, __frwr_recovery_worker);
queue_work(frwr_recovery_wq, &r->r.frmr.fr_work);
}
static int
__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
unsigned int depth)
@ -128,7 +252,7 @@ frwr_sendcompletion(struct ib_wc *wc)
/* WARNING: Only wr_id and status are reliable at this point */
r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
dprintk("RPC: %s: frmr %p (stale), status %s (%d)\n",
pr_warn("RPC: %s: frmr %p flushed, status %s (%d)\n",
__func__, r, ib_wc_status_msg(wc->status), wc->status);
r->r.frmr.fr_state = FRMR_IS_STALE;
}
@ -137,16 +261,19 @@ static int
frwr_op_init(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct ib_device *device = r_xprt->rx_ia.ri_id->device;
struct ib_device *device = r_xprt->rx_ia.ri_device;
unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
int i;
spin_lock_init(&buf->rb_mwlock);
INIT_LIST_HEAD(&buf->rb_mws);
INIT_LIST_HEAD(&buf->rb_all);
i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
dprintk("RPC: %s: initializing %d FRMRs\n", __func__, i);
i = max_t(int, RPCRDMA_MAX_DATA_SEGS / depth, 1);
i += 2; /* head + tail */
i *= buf->rb_max_requests; /* one set for each RPC slot */
dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i);
while (i--) {
struct rpcrdma_mw *r;
@ -165,6 +292,7 @@ frwr_op_init(struct rpcrdma_xprt *r_xprt)
list_add(&r->mw_list, &buf->rb_mws);
list_add(&r->mw_all, &buf->rb_all);
r->mw_sendcompletion = frwr_sendcompletion;
r->r.frmr.fr_xprt = r_xprt;
}
return 0;
@ -178,12 +306,12 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
int nsegs, bool writing)
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct ib_device *device = ia->ri_id->device;
struct ib_device *device = ia->ri_device;
enum dma_data_direction direction = rpcrdma_data_dir(writing);
struct rpcrdma_mr_seg *seg1 = seg;
struct rpcrdma_mw *mw = seg1->rl_mw;
struct rpcrdma_frmr *frmr = &mw->r.frmr;
struct ib_mr *mr = frmr->fr_mr;
struct rpcrdma_mw *mw;
struct rpcrdma_frmr *frmr;
struct ib_mr *mr;
struct ib_send_wr fastreg_wr, *bad_wr;
u8 key;
int len, pageoff;
@ -192,12 +320,25 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
u64 pa;
int page_no;
mw = seg1->rl_mw;
seg1->rl_mw = NULL;
do {
if (mw)
__frwr_queue_recovery(mw);
mw = rpcrdma_get_mw(r_xprt);
if (!mw)
return -ENOMEM;
} while (mw->r.frmr.fr_state != FRMR_IS_INVALID);
frmr = &mw->r.frmr;
frmr->fr_state = FRMR_IS_VALID;
pageoff = offset_in_page(seg1->mr_offset);
seg1->mr_offset -= pageoff; /* start of page */
seg1->mr_len += pageoff;
len = -pageoff;
if (nsegs > ia->ri_max_frmr_depth)
nsegs = ia->ri_max_frmr_depth;
for (page_no = i = 0; i < nsegs;) {
rpcrdma_map_one(device, seg, direction);
pa = seg->mr_dma;
@ -216,8 +357,6 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
dprintk("RPC: %s: Using frmr %p to map %d segments (%d bytes)\n",
__func__, mw, i, len);
frmr->fr_state = FRMR_IS_VALID;
memset(&fastreg_wr, 0, sizeof(fastreg_wr));
fastreg_wr.wr_id = (unsigned long)(void *)mw;
fastreg_wr.opcode = IB_WR_FAST_REG_MR;
@ -229,6 +368,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
fastreg_wr.wr.fast_reg.access_flags = writing ?
IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
IB_ACCESS_REMOTE_READ;
mr = frmr->fr_mr;
key = (u8)(mr->rkey & 0x000000FF);
ib_update_fast_reg_key(mr, ++key);
fastreg_wr.wr.fast_reg.rkey = mr->rkey;
@ -238,6 +378,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
if (rc)
goto out_senderr;
seg1->rl_mw = mw;
seg1->mr_rkey = mr->rkey;
seg1->mr_base = seg1->mr_dma + pageoff;
seg1->mr_nsegs = i;
@ -246,10 +387,9 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
out_senderr:
dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
ib_update_fast_reg_key(mr, --key);
frmr->fr_state = FRMR_IS_INVALID;
while (i--)
rpcrdma_unmap_one(device, --seg);
__frwr_queue_recovery(mw);
return rc;
}
@ -261,78 +401,46 @@ frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
{
struct rpcrdma_mr_seg *seg1 = seg;
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct rpcrdma_mw *mw = seg1->rl_mw;
struct ib_send_wr invalidate_wr, *bad_wr;
int rc, nsegs = seg->mr_nsegs;
struct ib_device *device;
seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID;
dprintk("RPC: %s: FRMR %p\n", __func__, mw);
seg1->rl_mw = NULL;
mw->r.frmr.fr_state = FRMR_IS_INVALID;
memset(&invalidate_wr, 0, sizeof(invalidate_wr));
invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw;
invalidate_wr.wr_id = (unsigned long)(void *)mw;
invalidate_wr.opcode = IB_WR_LOCAL_INV;
invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey;
invalidate_wr.ex.invalidate_rkey = mw->r.frmr.fr_mr->rkey;
DECR_CQCOUNT(&r_xprt->rx_ep);
read_lock(&ia->ri_qplock);
device = ia->ri_id->device;
while (seg1->mr_nsegs--)
rpcrdma_unmap_one(device, seg++);
rpcrdma_unmap_one(ia->ri_device, seg++);
read_lock(&ia->ri_qplock);
rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
read_unlock(&ia->ri_qplock);
if (rc)
goto out_err;
rpcrdma_put_mw(r_xprt, mw);
return nsegs;
out_err:
/* Force rpcrdma_buffer_get() to retry */
seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE;
dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
__frwr_queue_recovery(mw);
return nsegs;
}
/* Walk every FRMR on rb_all and rebuild each one that is not INVALID.
 *
 * After a disconnect, a flushed FAST_REG_MR can leave an FRMR in an
 * unusable state, and FRMRs that are VALID and attached to an
 * rpcrdma_req have to be torn down as well. Each such FRMR is
 * released and re-initialized, which gives it a fresh rkey, and is
 * then marked INVALID. When re-initialization fails, the FRMR keeps
 * its previous state and the walk continues with the next one.
 *
 * This is invoked only in the transport connect worker in order to
 * serialize with rpcrdma_register_frmr_external().
 */
static void
frwr_op_reset(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
	struct ib_device *device = r_xprt->rx_ia.ri_id->device;
	unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
	struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
	struct rpcrdma_mw *mw;

	list_for_each_entry(mw, &buf->rb_all, mw_all) {
		/* Nothing to repair on an FRMR that is already INVALID */
		if (mw->r.frmr.fr_state == FRMR_IS_INVALID)
			continue;

		__frwr_release(mw);
		if (__frwr_init(mw, pd, device, depth) == 0) {
			mw->r.frmr.fr_state = FRMR_IS_INVALID;
			continue;
		}

		dprintk("RPC: %s: mw %p left %s\n",
			__func__, mw,
			(mw->r.frmr.fr_state == FRMR_IS_STALE ?
				"stale" : "valid"));
	}
}
static void
frwr_op_destroy(struct rpcrdma_buffer *buf)
{
struct rpcrdma_mw *r;
/* Ensure stale MWs for "buf" are no longer in flight */
flush_workqueue(frwr_recovery_wq);
while (!list_empty(&buf->rb_all)) {
r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
list_del(&r->mw_all);
@ -347,7 +455,6 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
.ro_open = frwr_op_open,
.ro_maxpages = frwr_op_maxpages,
.ro_init = frwr_op_init,
.ro_reset = frwr_op_reset,
.ro_destroy = frwr_op_destroy,
.ro_displayname = "frwr",
};

View File

@ -50,8 +50,7 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
rpcrdma_map_one(ia->ri_id->device, seg,
rpcrdma_data_dir(writing));
rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing));
seg->mr_rkey = ia->ri_bind_mem->rkey;
seg->mr_base = seg->mr_dma;
seg->mr_nsegs = 1;
@ -65,18 +64,10 @@ physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
{
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
read_lock(&ia->ri_qplock);
rpcrdma_unmap_one(ia->ri_id->device, seg);
read_unlock(&ia->ri_qplock);
rpcrdma_unmap_one(ia->ri_device, seg);
return 1;
}
/* ->ro_reset handler for the "physical" memory registration ops.
 * The body is empty: nothing is reset for this strategy.
 */
static void
physical_op_reset(struct rpcrdma_xprt *r_xprt)
{
}
static void
physical_op_destroy(struct rpcrdma_buffer *buf)
{
@ -88,7 +79,6 @@ const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
.ro_open = physical_op_open,
.ro_maxpages = physical_op_maxpages,
.ro_init = physical_op_init,
.ro_reset = physical_op_reset,
.ro_destroy = physical_op_destroy,
.ro_displayname = "physical",
};

View File

@ -284,9 +284,6 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target,
return (unsigned char *)iptr - (unsigned char *)headerp;
out:
if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR)
return n;
for (pos = 0; nchunks--;)
pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt,
&req->rl_segments[pos]);
@ -732,8 +729,8 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
struct rpcrdma_msg *headerp;
struct rpcrdma_req *req;
struct rpc_rqst *rqst;
struct rpc_xprt *xprt = rep->rr_xprt;
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
struct rpc_xprt *xprt = &r_xprt->rx_xprt;
__be32 *iptr;
int rdmalen, status;
unsigned long cwnd;
@ -770,7 +767,6 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
rep->rr_len);
repost:
r_xprt->rx_stats.bad_reply_count++;
rep->rr_func = rpcrdma_reply_handler;
if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
rpcrdma_recv_buffer_put(rep);

View File

@ -240,6 +240,16 @@ xprt_rdma_connect_worker(struct work_struct *work)
xprt_clear_connecting(xprt);
}
/* ->inject_disconnect handler for the RDMA transport.
 *
 * Logs the request, then calls rdma_disconnect() on the connection
 * identifier of the rpcrdma_xprt that embeds @xprt.
 */
static void
xprt_rdma_inject_disconnect(struct rpc_xprt *xprt)
{
	struct rpcrdma_xprt *rdma_xprt;

	pr_info("rpcrdma: injecting transport disconnect on xprt=%p\n", xprt);

	rdma_xprt = container_of(xprt, struct rpcrdma_xprt, rx_xprt);
	rdma_disconnect(rdma_xprt->rx_ia.ri_id);
}
/*
* xprt_rdma_destroy
*
@ -612,12 +622,6 @@ xprt_rdma_send_request(struct rpc_task *task)
if (req->rl_reply == NULL) /* e.g. reconnection */
rpcrdma_recv_buffer_get(req);
if (req->rl_reply) {
req->rl_reply->rr_func = rpcrdma_reply_handler;
/* this need only be done once, but... */
req->rl_reply->rr_xprt = xprt;
}
/* Must suppress retransmit to maintain credits */
if (req->rl_connect_cookie == xprt->connect_cookie)
goto drop_connection;
@ -676,6 +680,17 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
r_xprt->rx_stats.bad_reply_count);
}
/* ->enable_swap handler for the RDMA transport.
 * Unconditionally returns -EINVAL; @xprt is not touched.
 */
static int
xprt_rdma_enable_swap(struct rpc_xprt *xprt)
{
return -EINVAL;
}
/* ->disable_swap handler for the RDMA transport.
 * Empty body: there is nothing to undo, since
 * xprt_rdma_enable_swap() always returns -EINVAL.
 */
static void
xprt_rdma_disable_swap(struct rpc_xprt *xprt)
{
}
/*
* Plumbing for rpc transport switch and kernel module
*/
@ -694,7 +709,10 @@ static struct rpc_xprt_ops xprt_rdma_procs = {
.send_request = xprt_rdma_send_request,
.close = xprt_rdma_close,
.destroy = xprt_rdma_destroy,
.print_stats = xprt_rdma_print_stats
.print_stats = xprt_rdma_print_stats,
.enable_swap = xprt_rdma_enable_swap,
.disable_swap = xprt_rdma_disable_swap,
.inject_disconnect = xprt_rdma_inject_disconnect
};
static struct xprt_class xprt_rdma = {
@ -720,17 +738,24 @@ void xprt_rdma_cleanup(void)
if (rc)
dprintk("RPC: %s: xprt_unregister returned %i\n",
__func__, rc);
frwr_destroy_recovery_wq();
}
int xprt_rdma_init(void)
{
int rc;
rc = xprt_register_transport(&xprt_rdma);
rc = frwr_alloc_recovery_wq();
if (rc)
return rc;
rc = xprt_register_transport(&xprt_rdma);
if (rc) {
frwr_destroy_recovery_wq();
return rc;
}
dprintk("RPCRDMA Module Init, register RPC RDMA transport\n");
dprintk("Defaults:\n");

View File

@ -80,7 +80,6 @@ static void
rpcrdma_run_tasklet(unsigned long data)
{
struct rpcrdma_rep *rep;
void (*func)(struct rpcrdma_rep *);
unsigned long flags;
data = data;
@ -89,14 +88,9 @@ rpcrdma_run_tasklet(unsigned long data)
rep = list_entry(rpcrdma_tasklets_g.next,
struct rpcrdma_rep, rr_list);
list_del(&rep->rr_list);
func = rep->rr_func;
rep->rr_func = NULL;
spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
if (func)
func(rep);
else
rpcrdma_recv_buffer_put(rep);
rpcrdma_reply_handler(rep);
spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
}
@ -236,7 +230,7 @@ rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
__func__, rep, wc->byte_len);
rep->rr_len = wc->byte_len;
ib_dma_sync_single_for_cpu(rdmab_to_ia(rep->rr_buffer)->ri_id->device,
ib_dma_sync_single_for_cpu(rep->rr_device,
rdmab_addr(rep->rr_rdmabuf),
rep->rr_len, DMA_FROM_DEVICE);
prefetch(rdmab_to_msg(rep->rr_rdmabuf));
@ -407,7 +401,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n",
sap, rpc_get_port(sap),
ia->ri_id->device->name,
ia->ri_device->name,
ia->ri_ops->ro_displayname,
xprt->rx_buf.rb_max_requests,
ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
@ -508,8 +502,9 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
rc = PTR_ERR(ia->ri_id);
goto out1;
}
ia->ri_device = ia->ri_id->device;
ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
ia->ri_pd = ib_alloc_pd(ia->ri_device);
if (IS_ERR(ia->ri_pd)) {
rc = PTR_ERR(ia->ri_pd);
dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
@ -517,7 +512,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
goto out2;
}
rc = ib_query_device(ia->ri_id->device, devattr);
rc = ib_query_device(ia->ri_device, devattr);
if (rc) {
dprintk("RPC: %s: ib_query_device failed %d\n",
__func__, rc);
@ -526,7 +521,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
ia->ri_have_dma_lkey = 1;
ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
ia->ri_dma_lkey = ia->ri_device->local_dma_lkey;
}
if (memreg == RPCRDMA_FRMR) {
@ -541,7 +536,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
}
}
if (memreg == RPCRDMA_MTHCAFMR) {
if (!ia->ri_id->device->alloc_fmr) {
if (!ia->ri_device->alloc_fmr) {
dprintk("RPC: %s: MTHCAFMR registration "
"not supported by HCA\n", __func__);
memreg = RPCRDMA_ALLPHYSICAL;
@ -590,9 +585,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
dprintk("RPC: %s: memory registration strategy is '%s'\n",
__func__, ia->ri_ops->ro_displayname);
/* Else will do memory reg/dereg for each chunk */
ia->ri_memreg_strategy = memreg;
rwlock_init(&ia->ri_qplock);
return 0;
@ -622,17 +614,17 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia)
dprintk("RPC: %s: ib_dereg_mr returned %i\n",
__func__, rc);
}
if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
if (ia->ri_id->qp)
rdma_destroy_qp(ia->ri_id);
rdma_destroy_id(ia->ri_id);
ia->ri_id = NULL;
}
if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
rc = ib_dealloc_pd(ia->ri_pd);
dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
__func__, rc);
}
/* If the pd is still busy, xprtrdma missed freeing a resource */
if (ia->ri_pd && !IS_ERR(ia->ri_pd))
WARN_ON(ib_dealloc_pd(ia->ri_pd));
}
/*
@ -693,8 +685,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
cq_attr.cqe = ep->rep_attr.cap.max_send_wr + 1;
sendcq = ib_create_cq(ia->ri_id->device, rpcrdma_sendcq_upcall,
rpcrdma_cq_async_error_upcall, ep, &cq_attr);
sendcq = ib_create_cq(ia->ri_device, rpcrdma_sendcq_upcall,
rpcrdma_cq_async_error_upcall, ep, &cq_attr);
if (IS_ERR(sendcq)) {
rc = PTR_ERR(sendcq);
dprintk("RPC: %s: failed to create send CQ: %i\n",
@ -710,8 +702,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
}
cq_attr.cqe = ep->rep_attr.cap.max_recv_wr + 1;
recvcq = ib_create_cq(ia->ri_id->device, rpcrdma_recvcq_upcall,
rpcrdma_cq_async_error_upcall, ep, &cq_attr);
recvcq = ib_create_cq(ia->ri_device, rpcrdma_recvcq_upcall,
rpcrdma_cq_async_error_upcall, ep, &cq_attr);
if (IS_ERR(recvcq)) {
rc = PTR_ERR(recvcq);
dprintk("RPC: %s: failed to create recv CQ: %i\n",
@ -817,8 +809,6 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
rpcrdma_flush_cqs(ep);
xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
ia->ri_ops->ro_reset(xprt);
id = rpcrdma_create_id(xprt, ia,
(struct sockaddr *)&xprt->rx_data.addr);
if (IS_ERR(id)) {
@ -832,7 +822,7 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
* More stuff I haven't thought of!
* Rrrgh!
*/
if (ia->ri_id->device != id->device) {
if (ia->ri_device != id->device) {
printk("RPC: %s: can't reconnect on "
"different device!\n", __func__);
rdma_destroy_id(id);
@ -974,7 +964,8 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
goto out_free;
}
rep->rr_buffer = &r_xprt->rx_buf;
rep->rr_device = ia->ri_device;
rep->rr_rxprt = r_xprt;
return rep;
out_free:
@ -1098,31 +1089,33 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
kfree(buf->rb_pool);
}
/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving
* some req segments uninitialized.
*/
static void
rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
struct rpcrdma_mw *
rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
{
if (*mw) {
list_add_tail(&(*mw)->mw_list, &buf->rb_mws);
*mw = NULL;
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_mw *mw = NULL;
spin_lock(&buf->rb_mwlock);
if (!list_empty(&buf->rb_mws)) {
mw = list_first_entry(&buf->rb_mws,
struct rpcrdma_mw, mw_list);
list_del_init(&mw->mw_list);
}
spin_unlock(&buf->rb_mwlock);
if (!mw)
pr_err("RPC: %s: no MWs available\n", __func__);
return mw;
}
/* Cycle mw's back in reverse order, and "spin" them.
* This delays and scrambles reuse as much as possible.
*/
static void
rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
void
rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
{
struct rpcrdma_mr_seg *seg = req->rl_segments;
struct rpcrdma_mr_seg *seg1 = seg;
int i;
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++)
rpcrdma_buffer_put_mr(&seg->rl_mw, buf);
rpcrdma_buffer_put_mr(&seg1->rl_mw, buf);
spin_lock(&buf->rb_mwlock);
list_add_tail(&mw->mw_list, &buf->rb_mws);
spin_unlock(&buf->rb_mwlock);
}
static void
@ -1132,115 +1125,10 @@ rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
req->rl_niovs = 0;
if (req->rl_reply) {
buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
req->rl_reply->rr_func = NULL;
req->rl_reply = NULL;
}
}
/* Re-post a LOCAL_INV Work Request for a stale FRMR.
 *
 * rpcrdma_unmap_one() was already done during deregistration.
 * Redo only the ib_post_send().
 *
 * @r:  the stale FRMR to invalidate
 * @ia: interface adapter whose QP receives the WR
 *
 * The FRMR is optimistically marked INVALID before the WR is posted;
 * if posting fails it is marked STALE again.
 */
static void
rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
{
struct rpcrdma_xprt *r_xprt =
container_of(ia, struct rpcrdma_xprt, rx_ia);
struct ib_send_wr invalidate_wr, *bad_wr;
int rc;
dprintk("RPC: %s: FRMR %p is stale\n", __func__, r);
/* When this FRMR is re-inserted into rb_mws, it is no longer stale */
r->r.frmr.fr_state = FRMR_IS_INVALID;
/* Build the LOCAL_INV WR; wr_id carries the mw pointer itself */
memset(&invalidate_wr, 0, sizeof(invalidate_wr));
invalidate_wr.wr_id = (unsigned long)(void *)r;
invalidate_wr.opcode = IB_WR_LOCAL_INV;
invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
DECR_CQCOUNT(&r_xprt->rx_ep);
dprintk("RPC: %s: frmr %p invalidating rkey %08x\n",
__func__, r, r->r.frmr.fr_mr->rkey);
/* The post is done under the read side of ri_qplock */
read_lock(&ia->ri_qplock);
rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
read_unlock(&ia->ri_qplock);
if (rc) {
/* Force rpcrdma_buffer_get() to retry */
r->r.frmr.fr_state = FRMR_IS_STALE;
dprintk("RPC: %s: ib_post_send failed, %i\n",
__func__, rc);
}
}
static void
rpcrdma_retry_flushed_linv(struct list_head *stale,
struct rpcrdma_buffer *buf)
{
struct rpcrdma_ia *ia = rdmab_to_ia(buf);
struct list_head *pos;
struct rpcrdma_mw *r;
unsigned long flags;
list_for_each(pos, stale) {
r = list_entry(pos, struct rpcrdma_mw, mw_list);
rpcrdma_retry_local_inv(r, ia);
}
spin_lock_irqsave(&buf->rb_lock, flags);
list_splice_tail(stale, &buf->rb_mws);
spin_unlock_irqrestore(&buf->rb_lock, flags);
}
/* Attach one FRMR from rb_mws to each of @req's RPCRDMA_MAX_SEGS
 * segments, filling rl_segments from the last slot down to slot 0.
 *
 * FRMRs found in the STALE state are not attached; they are moved
 * onto the caller's @stale list instead.
 *
 * Returns @req once every segment has an FRMR. If rb_mws runs dry
 * first, the send buffer and any MRs already attached are returned
 * to @buf and NULL is returned.
 */
static struct rpcrdma_req *
rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
			 struct list_head *stale)
{
	struct rpcrdma_mw *mw;
	int slot = RPCRDMA_MAX_SEGS - 1;

	while (!list_empty(&buf->rb_mws)) {
		mw = list_entry(buf->rb_mws.next,
				struct rpcrdma_mw, mw_list);
		list_del(&mw->mw_list);

		if (mw->r.frmr.fr_state != FRMR_IS_STALE) {
			req->rl_segments[slot].rl_mw = mw;
			if (unlikely(slot == 0))
				return req;	/* Success */
			slot--;
		} else {
			list_add(&mw->mw_list, stale);
		}
	}

	/* Not enough entries on rb_mws for this req */
	rpcrdma_buffer_put_sendbuf(req, buf);
	rpcrdma_buffer_put_mrs(req, buf);
	return NULL;
}
/* Attach one MW from rb_mws to each of @req's RPCRDMA_MAX_SEGS
 * segments, filling rl_segments from the last slot down to slot 0.
 *
 * Returns @req once every segment has an MW. If rb_mws runs dry
 * first, the send buffer and any MRs already attached are returned
 * to @buf and NULL is returned.
 */
static struct rpcrdma_req *
rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
{
	struct rpcrdma_mw *mw;
	int slot = RPCRDMA_MAX_SEGS - 1;

	while (!list_empty(&buf->rb_mws)) {
		mw = list_entry(buf->rb_mws.next,
				struct rpcrdma_mw, mw_list);
		list_del(&mw->mw_list);

		req->rl_segments[slot].rl_mw = mw;
		if (unlikely(slot == 0))
			return req;	/* Success */
		slot--;
	}

	/* Not enough entries on rb_mws for this req */
	rpcrdma_buffer_put_sendbuf(req, buf);
	rpcrdma_buffer_put_mrs(req, buf);
	return NULL;
}
/*
* Get a set of request/reply buffers.
*
@ -1253,12 +1141,11 @@ rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
{
struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
struct list_head stale;
struct rpcrdma_req *req;
unsigned long flags;
spin_lock_irqsave(&buffers->rb_lock, flags);
if (buffers->rb_send_index == buffers->rb_max_requests) {
spin_unlock_irqrestore(&buffers->rb_lock, flags);
dprintk("RPC: %s: out of request buffers\n", __func__);
@ -1277,20 +1164,7 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
}
buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
INIT_LIST_HEAD(&stale);
switch (ia->ri_memreg_strategy) {
case RPCRDMA_FRMR:
req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
break;
case RPCRDMA_MTHCAFMR:
req = rpcrdma_buffer_get_fmrs(req, buffers);
break;
default:
break;
}
spin_unlock_irqrestore(&buffers->rb_lock, flags);
if (!list_empty(&stale))
rpcrdma_retry_flushed_linv(&stale, buffers);
return req;
}
@ -1302,19 +1176,10 @@ void
rpcrdma_buffer_put(struct rpcrdma_req *req)
{
struct rpcrdma_buffer *buffers = req->rl_buffer;
struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
unsigned long flags;
spin_lock_irqsave(&buffers->rb_lock, flags);
rpcrdma_buffer_put_sendbuf(req, buffers);
switch (ia->ri_memreg_strategy) {
case RPCRDMA_FRMR:
case RPCRDMA_MTHCAFMR:
rpcrdma_buffer_put_mrs(req, buffers);
break;
default:
break;
}
spin_unlock_irqrestore(&buffers->rb_lock, flags);
}
@ -1344,10 +1209,9 @@ rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
void
rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
{
struct rpcrdma_buffer *buffers = rep->rr_buffer;
struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf;
unsigned long flags;
rep->rr_func = NULL;
spin_lock_irqsave(&buffers->rb_lock, flags);
buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
spin_unlock_irqrestore(&buffers->rb_lock, flags);
@ -1376,9 +1240,9 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
/*
* All memory passed here was kmalloc'ed, therefore phys-contiguous.
*/
iov->addr = ib_dma_map_single(ia->ri_id->device,
iov->addr = ib_dma_map_single(ia->ri_device,
va, len, DMA_BIDIRECTIONAL);
if (ib_dma_mapping_error(ia->ri_id->device, iov->addr))
if (ib_dma_mapping_error(ia->ri_device, iov->addr))
return -ENOMEM;
iov->length = len;
@ -1422,8 +1286,8 @@ rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
{
int rc;
ib_dma_unmap_single(ia->ri_id->device,
iov->addr, iov->length, DMA_BIDIRECTIONAL);
ib_dma_unmap_single(ia->ri_device,
iov->addr, iov->length, DMA_BIDIRECTIONAL);
if (NULL == mr)
return 0;
@ -1516,15 +1380,18 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
send_wr.num_sge = req->rl_niovs;
send_wr.opcode = IB_WR_SEND;
if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
ib_dma_sync_single_for_device(ia->ri_id->device,
req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
DMA_TO_DEVICE);
ib_dma_sync_single_for_device(ia->ri_id->device,
req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
DMA_TO_DEVICE);
ib_dma_sync_single_for_device(ia->ri_id->device,
req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
DMA_TO_DEVICE);
ib_dma_sync_single_for_device(ia->ri_device,
req->rl_send_iov[3].addr,
req->rl_send_iov[3].length,
DMA_TO_DEVICE);
ib_dma_sync_single_for_device(ia->ri_device,
req->rl_send_iov[1].addr,
req->rl_send_iov[1].length,
DMA_TO_DEVICE);
ib_dma_sync_single_for_device(ia->ri_device,
req->rl_send_iov[0].addr,
req->rl_send_iov[0].length,
DMA_TO_DEVICE);
if (DECR_CQCOUNT(ep) > 0)
send_wr.send_flags = 0;
@ -1557,7 +1424,7 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
recv_wr.num_sge = 1;
ib_dma_sync_single_for_cpu(ia->ri_id->device,
ib_dma_sync_single_for_cpu(ia->ri_device,
rdmab_addr(rep->rr_rdmabuf),
rdmab_length(rep->rr_rdmabuf),
DMA_BIDIRECTIONAL);

View File

@ -62,6 +62,7 @@
struct rpcrdma_ia {
const struct rpcrdma_memreg_ops *ri_ops;
rwlock_t ri_qplock;
struct ib_device *ri_device;
struct rdma_cm_id *ri_id;
struct ib_pd *ri_pd;
struct ib_mr *ri_bind_mem;
@ -69,7 +70,6 @@ struct rpcrdma_ia {
int ri_have_dma_lkey;
struct completion ri_done;
int ri_async_rc;
enum rpcrdma_memreg ri_memreg_strategy;
unsigned int ri_max_frmr_depth;
struct ib_device_attr ri_devattr;
struct ib_qp_attr ri_qp_attr;
@ -173,9 +173,8 @@ struct rpcrdma_buffer;
struct rpcrdma_rep {
unsigned int rr_len;
struct rpcrdma_buffer *rr_buffer;
struct rpc_xprt *rr_xprt;
void (*rr_func)(struct rpcrdma_rep *);
struct ib_device *rr_device;
struct rpcrdma_xprt *rr_rxprt;
struct list_head rr_list;
struct rpcrdma_regbuf *rr_rdmabuf;
};
@ -203,11 +202,18 @@ struct rpcrdma_frmr {
struct ib_fast_reg_page_list *fr_pgl;
struct ib_mr *fr_mr;
enum rpcrdma_frmr_state fr_state;
struct work_struct fr_work;
struct rpcrdma_xprt *fr_xprt;
};
/* FMR flavour of the per-MW registration state; it is one arm of the
 * union inside struct rpcrdma_mw.
 */
struct rpcrdma_fmr {
struct ib_fmr *fmr;	/* the FMR handle */
u64 *physaddrs;	/* NOTE(review): presumably the address array used when mapping the FMR - confirm */
};
struct rpcrdma_mw {
union {
struct ib_fmr *fmr;
struct rpcrdma_fmr fmr;
struct rpcrdma_frmr frmr;
} r;
void (*mw_sendcompletion)(struct ib_wc *);
@ -281,15 +287,17 @@ rpcr_to_rdmar(struct rpc_rqst *rqst)
* One of these is associated with a transport instance
*/
struct rpcrdma_buffer {
spinlock_t rb_lock; /* protects indexes */
u32 rb_max_requests;/* client max requests */
struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */
struct list_head rb_all;
int rb_send_index;
spinlock_t rb_mwlock; /* protect rb_mws list */
struct list_head rb_mws;
struct list_head rb_all;
char *rb_pool;
spinlock_t rb_lock; /* protect buf arrays */
u32 rb_max_requests;
int rb_send_index;
int rb_recv_index;
struct rpcrdma_req **rb_send_bufs;
int rb_recv_index;
struct rpcrdma_rep **rb_recv_bufs;
char *rb_pool;
};
#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
@ -350,7 +358,6 @@ struct rpcrdma_memreg_ops {
struct rpcrdma_create_data_internal *);
size_t (*ro_maxpages)(struct rpcrdma_xprt *);
int (*ro_init)(struct rpcrdma_xprt *);
void (*ro_reset)(struct rpcrdma_xprt *);
void (*ro_destroy)(struct rpcrdma_buffer *);
const char *ro_displayname;
};
@ -413,6 +420,8 @@ int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *,
int rpcrdma_buffer_create(struct rpcrdma_xprt *);
void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
struct rpcrdma_mw *rpcrdma_get_mw(struct rpcrdma_xprt *);
void rpcrdma_put_mw(struct rpcrdma_xprt *, struct rpcrdma_mw *);
struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
void rpcrdma_buffer_put(struct rpcrdma_req *);
void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
@ -425,6 +434,9 @@ void rpcrdma_free_regbuf(struct rpcrdma_ia *,
unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *);
int frwr_alloc_recovery_wq(void);
void frwr_destroy_recovery_wq(void);
/*
* Wrappers for chunk registration, shared by read/write chunk code.
*/

View File

@ -622,24 +622,6 @@ static int xs_udp_send_request(struct rpc_task *task)
return status;
}
/**
 * xs_tcp_shutdown - gracefully shut down a TCP socket
 * @xprt: transport
 *
 * Performs the equivalent of shutdown(SHUT_RDWR) on the transport's
 * socket and emits the rpc_socket_shutdown trace event. Does nothing
 * if the transport currently has no socket.
 */
static void xs_tcp_shutdown(struct rpc_xprt *xprt)
{
	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
	struct socket *sock = transport->sock;

	if (sock == NULL)
		return;

	kernel_sock_shutdown(sock, SHUT_RDWR);
	trace_rpc_socket_shutdown(xprt, sock);
}
/**
* xs_tcp_send_request - write an RPC request to a TCP socket
* @task: address of RPC task that manages the state of an RPC request
@ -786,6 +768,7 @@ static void xs_sock_mark_closed(struct rpc_xprt *xprt)
xs_sock_reset_connection_flags(xprt);
/* Mark transport as closed and wake up all pending tasks */
xprt_disconnect_done(xprt);
xprt_force_disconnect(xprt);
}
/**
@ -827,6 +810,9 @@ static void xs_reset_transport(struct sock_xprt *transport)
if (sk == NULL)
return;
if (atomic_read(&transport->xprt.swapper))
sk_clear_memalloc(sk);
write_lock_bh(&sk->sk_callback_lock);
transport->inet = NULL;
transport->sock = NULL;
@ -863,6 +849,13 @@ static void xs_close(struct rpc_xprt *xprt)
xprt_disconnect_done(xprt);
}
/* Disconnect-injection hook for socket transports: emits a dprintk
 * naming @xprt, then calls xprt_disconnect_done() on it.
 */
static void xs_inject_disconnect(struct rpc_xprt *xprt)
{
dprintk("RPC: injecting transport disconnect on xprt=%p\n",
xprt);
xprt_disconnect_done(xprt);
}
static void xs_xprt_free(struct rpc_xprt *xprt)
{
xs_free_peer_addresses(xprt);
@ -901,7 +894,6 @@ static int xs_local_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
/**
* xs_local_data_ready - "data ready" callback for AF_LOCAL sockets
* @sk: socket with data to read
* @len: how much data to read
*
* Currently this assumes we can read the whole reply in a single gulp.
*/
@ -965,7 +957,6 @@ static void xs_local_data_ready(struct sock *sk)
/**
* xs_udp_data_ready - "data ready" callback for UDP sockets
* @sk: socket with data to read
* @len: how much data to read
*
*/
static void xs_udp_data_ready(struct sock *sk)
@ -1389,7 +1380,6 @@ static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, uns
/**
* xs_tcp_data_ready - "data ready" callback for TCP sockets
* @sk: socket with data to read
* @bytes: how much data to read
*
*/
static void xs_tcp_data_ready(struct sock *sk)
@ -1886,9 +1876,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt,
/**
* xs_local_setup_socket - create AF_LOCAL socket, connect to a local endpoint
* @xprt: RPC transport to connect
* @transport: socket transport to connect
* @create_sock: function to create a socket of the correct type
*/
static int xs_local_setup_socket(struct sock_xprt *transport)
{
@ -1960,43 +1948,84 @@ static void xs_local_connect(struct rpc_xprt *xprt, struct rpc_task *task)
msleep_interruptible(15000);
}
#ifdef CONFIG_SUNRPC_SWAP
#if IS_ENABLED(CONFIG_SUNRPC_SWAP)
/*
* Note that this should be called with XPRT_LOCKED held (or when we otherwise
* know that we have exclusive access to the socket), to guard against
* races with xs_reset_transport.
*/
static void xs_set_memalloc(struct rpc_xprt *xprt)
{
/* Recover the socket transport that embeds @xprt. */
struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
xprt);
/*
 * NOTE(review): the bare xprt->swapper test on the next line and the
 * atomic_read(&xprt->swapper) test further down look like the
 * before/after forms of the same check (patch markers appear to have
 * been stripped from this text) -- confirm which one is live.
 */
if (xprt->swapper)
/*
* If there's no sock, then we have nothing to set. The
* reconnecting process will get it for us.
*/
if (!transport->inet)
return;
/* Tag the socket as memalloc only when the swapper count is non-zero. */
if (atomic_read(&xprt->swapper))
sk_set_memalloc(transport->inet);
}
/**
* xs_swapper - Tag this transport as being used for swap.
* xs_enable_swap - Tag this transport as being used for swap.
* @xprt: transport to tag
* @enable: enable/disable
*
* Take a reference to this transport on behalf of the rpc_clnt, and
* optionally mark it for swapping if it wasn't already.
*/
int xs_swapper(struct rpc_xprt *xprt, int enable)
static int
xs_enable_swap(struct rpc_xprt *xprt)
{
/*
 * NOTE(review): this span carries two signatures and two bodies back to
 * back -- the xs_swapper(xprt, enable) form, which counts with ++/--,
 * and the xs_enable_swap(xprt) form, which counts with atomic ops.
 * Patch markers appear to have been stripped; confirm against the
 * applied tree before relying on this text.
 */
struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
xprt);
int err = 0;
struct sock_xprt *xs = container_of(xprt, struct sock_xprt, xprt);
if (enable) {
xprt->swapper++;
xs_set_memalloc(xprt);
} else if (xprt->swapper) {
xprt->swapper--;
sk_clear_memalloc(transport->inet);
}
return err;
/*
 * Only the caller that moves the swapper count from 0 to 1 goes on to
 * tag the socket; every other caller just holds its reference.
 */
if (atomic_inc_return(&xprt->swapper) != 1)
return 0;
/*
 * Take XPRT_LOCKED with a killable wait. A non-zero result is reported
 * as -ERESTARTSYS; note that the increment above is not undone on this
 * path.
 */
if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE))
return -ERESTARTSYS;
/* With no socket attached there is nothing to tag here. */
if (xs->inet)
sk_set_memalloc(xs->inet);
xprt_release_xprt(xprt, NULL);
return 0;
}
/**
* xs_disable_swap - Untag this transport as being used for swap.
* @xprt: transport to untag
*
* Drop a "swapper" reference to this xprt on behalf of the rpc_clnt. If the
* swapper refcount goes to 0, untag the socket as a memalloc socket.
*/
static void
xs_disable_swap(struct rpc_xprt *xprt)
{
	struct sock_xprt *transport;

	/* Anyone but the holder of the last swapper reference is done here. */
	if (!atomic_dec_and_test(&xprt->swapper))
		return;

	/*
	 * Take XPRT_LOCKED with a killable wait. If the wait does not
	 * succeed, return without touching the socket; the count has
	 * already been dropped above.
	 */
	if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE) != 0)
		return;

	transport = container_of(xprt, struct sock_xprt, xprt);
	if (transport->inet != NULL)
		sk_clear_memalloc(transport->inet);

	xprt_release_xprt(xprt, NULL);
}
EXPORT_SYMBOL_GPL(xs_swapper);
#else
/*
 * Variant for the #else branch of the CONFIG_SUNRPC_SWAP check: the body is
 * empty, so calling it has no effect.
 */
static void xs_set_memalloc(struct rpc_xprt *xprt)
{
}
/*
 * Variant for the #else branch of the CONFIG_SUNRPC_SWAP check: @xprt is
 * never marked for swap and the call always returns -EINVAL.
 */
static int
xs_enable_swap(struct rpc_xprt *xprt)
{
return -EINVAL;
}
/*
 * Variant for the #else branch of the CONFIG_SUNRPC_SWAP check: the body is
 * empty, so calling it has no effect.
 */
static void
xs_disable_swap(struct rpc_xprt *xprt)
{
}
#endif
static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
@ -2057,6 +2086,27 @@ static void xs_udp_setup_socket(struct work_struct *work)
xprt_wake_pending_tasks(xprt, status);
}
/**
* xs_tcp_shutdown - gracefully shut down a TCP socket
* @xprt: transport
*
* Initiates a graceful shutdown of the TCP socket by calling the
* equivalent of shutdown(SHUT_RDWR);
*/
static void xs_tcp_shutdown(struct rpc_xprt *xprt)
{
	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
	struct socket *sock = transport->sock;

	/* No socket attached: nothing to shut down. */
	if (!sock)
		return;

	/* Not connected: reset the transport instead of shutting down. */
	if (!xprt_connected(xprt)) {
		xs_reset_transport(transport);
		return;
	}

	/* Connected: shut down both directions, then emit the tracepoint. */
	kernel_sock_shutdown(sock, SHUT_RDWR);
	trace_rpc_socket_shutdown(xprt, sock);
}
static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
{
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
@ -2067,6 +2117,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
unsigned int keepidle = xprt->timeout->to_initval / HZ;
unsigned int keepcnt = xprt->timeout->to_retries + 1;
unsigned int opt_on = 1;
unsigned int timeo;
/* TCP Keepalive options */
kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
@ -2078,6 +2129,12 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
(char *)&keepcnt, sizeof(keepcnt));
/* TCP user timeout (see RFC5482) */
timeo = jiffies_to_msecs(xprt->timeout->to_initval) *
(xprt->timeout->to_retries + 1);
kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
(char *)&timeo, sizeof(timeo));
write_lock_bh(&sk->sk_callback_lock);
xs_save_old_callbacks(transport, sk);
@ -2125,9 +2182,6 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
/**
* xs_tcp_setup_socket - create a TCP socket and connect to a remote endpoint
* @xprt: RPC transport to connect
* @transport: socket transport to connect
* @create_sock: function to create a socket of the correct type
*
* Invoked by a work queue tasklet.
*/
@ -2463,6 +2517,8 @@ static struct rpc_xprt_ops xs_local_ops = {
.close = xs_close,
.destroy = xs_destroy,
.print_stats = xs_local_print_stats,
.enable_swap = xs_enable_swap,
.disable_swap = xs_disable_swap,
};
static struct rpc_xprt_ops xs_udp_ops = {
@ -2482,6 +2538,9 @@ static struct rpc_xprt_ops xs_udp_ops = {
.close = xs_close,
.destroy = xs_destroy,
.print_stats = xs_udp_print_stats,
.enable_swap = xs_enable_swap,
.disable_swap = xs_disable_swap,
.inject_disconnect = xs_inject_disconnect,
};
static struct rpc_xprt_ops xs_tcp_ops = {
@ -2498,6 +2557,9 @@ static struct rpc_xprt_ops xs_tcp_ops = {
.close = xs_tcp_shutdown,
.destroy = xs_destroy,
.print_stats = xs_tcp_print_stats,
.enable_swap = xs_enable_swap,
.disable_swap = xs_disable_swap,
.inject_disconnect = xs_inject_disconnect,
};
/*
@ -2515,6 +2577,9 @@ static struct rpc_xprt_ops bc_tcp_ops = {
.close = bc_close,
.destroy = bc_destroy,
.print_stats = xs_tcp_print_stats,
.enable_swap = xs_enable_swap,
.disable_swap = xs_disable_swap,
.inject_disconnect = xs_inject_disconnect,
};
static int xs_init_anyaddr(const int family, struct sockaddr *sap)