nfsd: implement pNFS layout recalls

Add support to issue layout recalls to clients.  For now we only support
full-file recalls to get a simple and stable implementation.  This allows
to embedd a nfsd4_callback structure in the layout_state and thus avoid
any memory allocations under spinlocks during a recall.  For normal
use cases that do not intent to share a single file between multiple
clients this implementation is fully sufficient.

To ensure layouts are recalled on local filesystem access each layout
state registers a new FL_LAYOUT lease with the kernel file locking code,
which filesystems that support pNFS exports that require recalls need
to break on conflicting access patterns.

The XDR code is based on the old pNFS server implementation by
Andy Adamson, Benny Halevy, Boaz Harrosh, Dean Hildebrand, Fred Isaman,
Marc Eshel, Mike Sager and Ricardo Labiaga.

Signed-off-by: Christoph Hellwig <hch@lst.de>
This commit is contained in:
Christoph Hellwig 2014-09-23 12:38:48 +02:00
parent 9cf514ccfa
commit c5c707f96f
6 changed files with 330 additions and 1 deletions

View File

@ -546,6 +546,102 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
return status;
}
#ifdef CONFIG_NFSD_PNFS
/*
* CB_LAYOUTRECALL4args
*
* struct layoutrecall_file4 {
* nfs_fh4 lor_fh;
* offset4 lor_offset;
* length4 lor_length;
* stateid4 lor_stateid;
* };
*
* union layoutrecall4 switch(layoutrecall_type4 lor_recalltype) {
* case LAYOUTRECALL4_FILE:
* layoutrecall_file4 lor_layout;
* case LAYOUTRECALL4_FSID:
* fsid4 lor_fsid;
* case LAYOUTRECALL4_ALL:
* void;
* };
*
* struct CB_LAYOUTRECALL4args {
* layouttype4 clora_type;
* layoutiomode4 clora_iomode;
* bool clora_changed;
* layoutrecall4 clora_recall;
* };
*/
static void encode_cb_layout4args(struct xdr_stream *xdr,
const struct nfs4_layout_stateid *ls,
struct nfs4_cb_compound_hdr *hdr)
{
__be32 *p;
BUG_ON(hdr->minorversion == 0);
p = xdr_reserve_space(xdr, 5 * 4);
*p++ = cpu_to_be32(OP_CB_LAYOUTRECALL);
*p++ = cpu_to_be32(ls->ls_layout_type);
*p++ = cpu_to_be32(IOMODE_ANY);
*p++ = cpu_to_be32(1);
*p = cpu_to_be32(RETURN_FILE);
encode_nfs_fh4(xdr, &ls->ls_stid.sc_file->fi_fhandle);
p = xdr_reserve_space(xdr, 2 * 8);
p = xdr_encode_hyper(p, 0);
xdr_encode_hyper(p, NFS4_MAX_UINT64);
encode_stateid4(xdr, &ls->ls_recall_sid);
hdr->nops++;
}
static void nfs4_xdr_enc_cb_layout(struct rpc_rqst *req,
struct xdr_stream *xdr,
const struct nfsd4_callback *cb)
{
const struct nfs4_layout_stateid *ls =
container_of(cb, struct nfs4_layout_stateid, ls_recall);
struct nfs4_cb_compound_hdr hdr = {
.ident = 0,
.minorversion = cb->cb_minorversion,
};
encode_cb_compound4args(xdr, &hdr);
encode_cb_sequence4args(xdr, cb, &hdr);
encode_cb_layout4args(xdr, ls, &hdr);
encode_cb_nops(&hdr);
}
static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp,
struct xdr_stream *xdr,
struct nfsd4_callback *cb)
{
struct nfs4_cb_compound_hdr hdr;
enum nfsstat4 nfserr;
int status;
status = decode_cb_compound4res(xdr, &hdr);
if (unlikely(status))
goto out;
if (cb) {
status = decode_cb_sequence4res(xdr, cb);
if (unlikely(status))
goto out;
}
status = decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &nfserr);
if (unlikely(status))
goto out;
if (unlikely(nfserr != NFS4_OK))
status = nfs_cb_stat_to_errno(nfserr);
out:
return status;
}
#endif /* CONFIG_NFSD_PNFS */
/*
* RPC procedure tables
*/
@ -563,6 +659,9 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
static struct rpc_procinfo nfs4_cb_procedures[] = {
PROC(CB_NULL, NULL, cb_null, cb_null),
PROC(CB_RECALL, COMPOUND, cb_recall, cb_recall),
#ifdef CONFIG_NFSD_PNFS
PROC(CB_LAYOUT, COMPOUND, cb_layout, cb_layout),
#endif
};
static struct rpc_version nfs_cb_version4 = {

View File

@ -1,8 +1,11 @@
/*
* Copyright (c) 2014 Christoph Hellwig.
*/
#include <linux/kmod.h>
#include <linux/file.h>
#include <linux/jhash.h>
#include <linux/sched.h>
#include <linux/sunrpc/addr.h>
#include "pnfs.h"
#include "netns.h"
@ -18,6 +21,9 @@ struct nfs4_layout {
static struct kmem_cache *nfs4_layout_cache;
static struct kmem_cache *nfs4_layout_stateid_cache;
static struct nfsd4_callback_ops nfsd4_cb_layout_ops;
static const struct lock_manager_operations nfsd4_layouts_lm_ops;
const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = {
};
@ -127,9 +133,42 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid)
list_del_init(&ls->ls_perfile);
spin_unlock(&fp->fi_lock);
vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls);
fput(ls->ls_file);
if (ls->ls_recalled)
atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls);
kmem_cache_free(nfs4_layout_stateid_cache, ls);
}
static int
nfsd4_layout_setlease(struct nfs4_layout_stateid *ls)
{
struct file_lock *fl;
int status;
fl = locks_alloc_lock();
if (!fl)
return -ENOMEM;
locks_init_lock(fl);
fl->fl_lmops = &nfsd4_layouts_lm_ops;
fl->fl_flags = FL_LAYOUT;
fl->fl_type = F_RDLCK;
fl->fl_end = OFFSET_MAX;
fl->fl_owner = ls;
fl->fl_pid = current->tgid;
fl->fl_file = ls->ls_file;
status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL);
if (status) {
locks_free_lock(fl);
return status;
}
BUG_ON(fl != NULL);
return 0;
}
static struct nfs4_layout_stateid *
nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
struct nfs4_stid *parent, u32 layout_type)
@ -152,6 +191,20 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
spin_lock_init(&ls->ls_lock);
INIT_LIST_HEAD(&ls->ls_layouts);
ls->ls_layout_type = layout_type;
nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops,
NFSPROC4_CLNT_CB_LAYOUT);
if (parent->sc_type == NFS4_DELEG_STID)
ls->ls_file = get_file(fp->fi_deleg_file);
else
ls->ls_file = find_any_file(fp);
BUG_ON(!ls->ls_file);
if (nfsd4_layout_setlease(ls)) {
put_nfs4_file(fp);
kmem_cache_free(nfs4_layout_stateid_cache, ls);
return NULL;
}
spin_lock(&clp->cl_lock);
stp->sc_type = NFS4_LAYOUT_STID;
@ -215,6 +268,27 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
return status;
}
static void
nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls)
{
spin_lock(&ls->ls_lock);
if (ls->ls_recalled)
goto out_unlock;
ls->ls_recalled = true;
atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls);
if (list_empty(&ls->ls_layouts))
goto out_unlock;
atomic_inc(&ls->ls_stid.sc_count);
update_stateid(&ls->ls_stid.sc_stateid);
memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
nfsd4_run_cb(&ls->ls_recall);
out_unlock:
spin_unlock(&ls->ls_lock);
}
static inline u64
layout_end(struct nfsd4_layout_seg *seg)
{
@ -258,18 +332,44 @@ layouts_try_merge(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *new)
return true;
}
static __be32
nfsd4_recall_conflict(struct nfs4_layout_stateid *ls)
{
struct nfs4_file *fp = ls->ls_stid.sc_file;
struct nfs4_layout_stateid *l, *n;
__be32 nfserr = nfs_ok;
assert_spin_locked(&fp->fi_lock);
list_for_each_entry_safe(l, n, &fp->fi_lo_states, ls_perfile) {
if (l != ls) {
nfsd4_recall_file_layout(l);
nfserr = nfserr_recallconflict;
}
}
return nfserr;
}
__be32
nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls)
{
struct nfsd4_layout_seg *seg = &lgp->lg_seg;
struct nfs4_file *fp = ls->ls_stid.sc_file;
struct nfs4_layout *lp, *new = NULL;
__be32 nfserr;
spin_lock(&fp->fi_lock);
nfserr = nfsd4_recall_conflict(ls);
if (nfserr)
goto out;
spin_lock(&ls->ls_lock);
list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) {
if (layouts_try_merge(&lp->lo_seg, seg))
goto done;
}
spin_unlock(&ls->ls_lock);
spin_unlock(&fp->fi_lock);
new = kmem_cache_alloc(nfs4_layout_cache, GFP_KERNEL);
if (!new)
@ -277,6 +377,10 @@ nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls)
memcpy(&new->lo_seg, seg, sizeof(lp->lo_seg));
new->lo_state = ls;
spin_lock(&fp->fi_lock);
nfserr = nfsd4_recall_conflict(ls);
if (nfserr)
goto out;
spin_lock(&ls->ls_lock);
list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) {
if (layouts_try_merge(&lp->lo_seg, seg))
@ -290,9 +394,11 @@ nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls)
update_stateid(&ls->ls_stid.sc_stateid);
memcpy(&lgp->lg_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
spin_unlock(&ls->ls_lock);
out:
spin_unlock(&fp->fi_lock);
if (new)
kmem_cache_free(nfs4_layout_cache, new);
return nfs_ok;
return nfserr;
}
static void
@ -448,6 +554,112 @@ nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp)
nfsd4_free_layouts(&reaplist);
}
static void
nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
{
struct nfs4_client *clp = ls->ls_stid.sc_client;
char addr_str[INET6_ADDRSTRLEN];
static char *envp[] = {
"HOME=/",
"TERM=linux",
"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
NULL
};
char *argv[8];
int error;
rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str));
printk(KERN_WARNING
"nfsd: client %s failed to respond to layout recall. "
" Fencing..\n", addr_str);
argv[0] = "/sbin/nfsd-recall-failed";
argv[1] = addr_str;
argv[2] = ls->ls_file->f_path.mnt->mnt_sb->s_id;
argv[3] = NULL;
error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
if (error) {
printk(KERN_ERR "nfsd: fence failed for client %s: %d!\n",
addr_str, error);
}
}
static int
nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
{
struct nfs4_layout_stateid *ls =
container_of(cb, struct nfs4_layout_stateid, ls_recall);
LIST_HEAD(reaplist);
switch (task->tk_status) {
case 0:
return 1;
case -NFS4ERR_NOMATCHING_LAYOUT:
task->tk_status = 0;
return 1;
case -NFS4ERR_DELAY:
/* Poll the client until it's done with the layout */
/* FIXME: cap number of retries.
* The pnfs standard states that we need to only expire
* the client after at-least "lease time" .eg lease-time * 2
* when failing to communicate a recall
*/
rpc_delay(task, HZ/100); /* 10 mili-seconds */
return 0;
default:
/*
* Unknown error or non-responding client, we'll need to fence.
*/
nfsd4_cb_layout_fail(ls);
return -1;
}
}
static void
nfsd4_cb_layout_release(struct nfsd4_callback *cb)
{
struct nfs4_layout_stateid *ls =
container_of(cb, struct nfs4_layout_stateid, ls_recall);
LIST_HEAD(reaplist);
nfsd4_return_all_layouts(ls, &reaplist);
nfsd4_free_layouts(&reaplist);
nfs4_put_stid(&ls->ls_stid);
}
static struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
.done = nfsd4_cb_layout_done,
.release = nfsd4_cb_layout_release,
};
static bool
nfsd4_layout_lm_break(struct file_lock *fl)
{
/*
* We don't want the locks code to timeout the lease for us;
* we'll remove it ourself if a layout isn't returned
* in time:
*/
fl->fl_break_time = 0;
nfsd4_recall_file_layout(fl->fl_owner);
return false;
}
static int
nfsd4_layout_lm_change(struct file_lock *onlist, int arg,
struct list_head *dispose)
{
BUG_ON(!(arg & F_UNLCK));
return lease_modify(onlist, arg, dispose);
}
static const struct lock_manager_operations nfsd4_layouts_lm_ops = {
.lm_break = nfsd4_layout_lm_break,
.lm_change = nfsd4_layout_lm_change,
};
int
nfsd4_init_pnfs(void)
{

View File

@ -1301,6 +1301,10 @@ nfsd4_layoutget(struct svc_rqst *rqstp,
if (nfserr)
goto out;
nfserr = nfserr_recallconflict;
if (atomic_read(&ls->ls_stid.sc_file->fi_lo_recalls))
goto out_put_stid;
nfserr = ops->proc_layoutget(current_fh->fh_dentry->d_inode,
current_fh, lgp);
if (nfserr)

View File

@ -3065,6 +3065,7 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval,
memset(fp->fi_access, 0, sizeof(fp->fi_access));
#ifdef CONFIG_NFSD_PNFS
INIT_LIST_HEAD(&fp->fi_lo_states);
atomic_set(&fp->fi_lo_recalls, 0);
#endif
hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]);
}

View File

@ -502,6 +502,7 @@ struct nfs4_file {
bool fi_had_conflict;
#ifdef CONFIG_NFSD_PNFS
struct list_head fi_lo_states;
atomic_t fi_lo_recalls;
#endif
};
@ -542,6 +543,10 @@ struct nfs4_layout_stateid {
spinlock_t ls_lock;
struct list_head ls_layouts;
u32 ls_layout_type;
struct file *ls_file;
struct nfsd4_callback ls_recall;
stateid_t ls_recall_sid;
bool ls_recalled;
};
static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
@ -556,6 +561,7 @@ static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
enum nfsd4_cb_op {
NFSPROC4_CLNT_CB_NULL = 0,
NFSPROC4_CLNT_CB_RECALL,
NFSPROC4_CLNT_CB_LAYOUT,
NFSPROC4_CLNT_CB_SEQUENCE,
};

View File

@ -21,3 +21,10 @@
#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \
cb_sequence_dec_sz + \
op_dec_sz)
#define NFS4_enc_cb_layout_sz (cb_compound_enc_hdr_sz + \
cb_sequence_enc_sz + \
1 + 3 + \
enc_nfs4_fh_sz + 4)
#define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \
cb_sequence_dec_sz + \
op_dec_sz)