Merge branch 'for-linus' of git://oss.sgi.com/xfs/xfs

* 'for-linus' of git://oss.sgi.com/xfs/xfs:
  xfs: use proper interfaces for on-stack plugging
  xfs: fix xfs_debug warnings
  xfs: fix variable set but not used warnings
  xfs: convert log tail checking to a warning
  xfs: catch bad block numbers freeing extents.
  xfs: push the AIL from memory reclaim and periodic sync
  xfs: clean up code layout in xfs_trans_ail.c
  xfs: convert the xfsaild threads to a workqueue
  xfs: introduce background inode reclaim work
  xfs: convert ENOSPC inode flushing to use new syncd workqueue
  xfs: introduce a xfssyncd workqueue
  xfs: fix extent format buffer allocation size
  xfs: fix unreferenced var error in xfs_buf.c

Also, applied patch from Tony Luck that fixes ia64:
  xfs_destroy_workqueues() should not be tagged with__exit
in the branch before merging.
This commit is contained in:
Linus Torvalds
2011-04-11 15:48:57 -07:00
17 changed files with 531 additions and 507 deletions

View File

@ -22,6 +22,7 @@
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
@ -39,6 +40,8 @@
#include <linux/kthread.h>
#include <linux/freezer.h>
struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */
/*
* The inode lookup is done in batches to keep the amount of lock traffic and
* radix tree lookups to a minimum. The batch size is a trade off between
@ -431,62 +434,12 @@ xfs_quiesce_attr(
xfs_unmountfs_writesb(mp);
}
/*
* Enqueue a work item to be picked up by the vfs xfssyncd thread.
* Doing this has two advantages:
* - It saves on stack space, which is tight in certain situations
* - It can be used (with care) as a mechanism to avoid deadlocks.
* Flushing while allocating in a full filesystem requires both.
*/
STATIC void
xfs_syncd_queue_work(
struct xfs_mount *mp,
void *data,
void (*syncer)(struct xfs_mount *, void *),
struct completion *completion)
static void
xfs_syncd_queue_sync(
struct xfs_mount *mp)
{
struct xfs_sync_work *work;
work = kmem_alloc(sizeof(struct xfs_sync_work), KM_SLEEP);
INIT_LIST_HEAD(&work->w_list);
work->w_syncer = syncer;
work->w_data = data;
work->w_mount = mp;
work->w_completion = completion;
spin_lock(&mp->m_sync_lock);
list_add_tail(&work->w_list, &mp->m_sync_list);
spin_unlock(&mp->m_sync_lock);
wake_up_process(mp->m_sync_task);
}
/*
* Flush delayed allocate data, attempting to free up reserved space
* from existing allocations. At this point a new allocation attempt
* has failed with ENOSPC and we are in the process of scratching our
* heads, looking about for more room...
*/
STATIC void
xfs_flush_inodes_work(
struct xfs_mount *mp,
void *arg)
{
struct inode *inode = arg;
xfs_sync_data(mp, SYNC_TRYLOCK);
xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
iput(inode);
}
void
xfs_flush_inodes(
xfs_inode_t *ip)
{
struct inode *inode = VFS_I(ip);
DECLARE_COMPLETION_ONSTACK(completion);
igrab(inode);
xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
wait_for_completion(&completion);
xfs_log_force(ip->i_mount, XFS_LOG_SYNC);
queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
msecs_to_jiffies(xfs_syncd_centisecs * 10));
}
/*
@ -496,9 +449,10 @@ xfs_flush_inodes(
*/
STATIC void
xfs_sync_worker(
struct xfs_mount *mp,
void *unused)
struct work_struct *work)
{
struct xfs_mount *mp = container_of(to_delayed_work(work),
struct xfs_mount, m_sync_work);
int error;
if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
@ -508,73 +462,106 @@ xfs_sync_worker(
error = xfs_fs_log_dummy(mp);
else
xfs_log_force(mp, 0);
xfs_reclaim_inodes(mp, 0);
error = xfs_qm_sync(mp, SYNC_TRYLOCK);
/* start pushing all the metadata that is currently dirty */
xfs_ail_push_all(mp->m_ail);
}
mp->m_sync_seq++;
wake_up(&mp->m_wait_single_sync_task);
/* queue us up again */
xfs_syncd_queue_sync(mp);
}
STATIC int
xfssyncd(
void *arg)
/*
* Queue a new inode reclaim pass if there are reclaimable inodes and there
* isn't a reclaim pass already in progress. By default it runs every 5s based
* on the xfs syncd work default of 30s. Perhaps this should have it's own
* tunable, but that can be done if this method proves to be ineffective or too
* aggressive.
*/
static void
xfs_syncd_queue_reclaim(
struct xfs_mount *mp)
{
struct xfs_mount *mp = arg;
long timeleft;
xfs_sync_work_t *work, *n;
LIST_HEAD (tmp);
set_freezable();
timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
for (;;) {
if (list_empty(&mp->m_sync_list))
timeleft = schedule_timeout_interruptible(timeleft);
/* swsusp */
try_to_freeze();
if (kthread_should_stop() && list_empty(&mp->m_sync_list))
break;
/*
* We can have inodes enter reclaim after we've shut down the syncd
* workqueue during unmount, so don't allow reclaim work to be queued
* during unmount.
*/
if (!(mp->m_super->s_flags & MS_ACTIVE))
return;
spin_lock(&mp->m_sync_lock);
/*
* We can get woken by laptop mode, to do a sync -
* that's the (only!) case where the list would be
* empty with time remaining.
*/
if (!timeleft || list_empty(&mp->m_sync_list)) {
if (!timeleft)
timeleft = xfs_syncd_centisecs *
msecs_to_jiffies(10);
INIT_LIST_HEAD(&mp->m_sync_work.w_list);
list_add_tail(&mp->m_sync_work.w_list,
&mp->m_sync_list);
}
list_splice_init(&mp->m_sync_list, &tmp);
spin_unlock(&mp->m_sync_lock);
list_for_each_entry_safe(work, n, &tmp, w_list) {
(*work->w_syncer)(mp, work->w_data);
list_del(&work->w_list);
if (work == &mp->m_sync_work)
continue;
if (work->w_completion)
complete(work->w_completion);
kmem_free(work);
}
rcu_read_lock();
if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
}
rcu_read_unlock();
}
return 0;
/*
* This is a fast pass over the inode cache to try to get reclaim moving on as
* many inodes as possible in a short period of time. It kicks itself every few
* seconds, as well as being kicked by the inode cache shrinker when memory
* goes low. It scans as quickly as possible avoiding locked inodes or those
* already being flushed, and once done schedules a future pass.
*/
STATIC void
xfs_reclaim_worker(
struct work_struct *work)
{
struct xfs_mount *mp = container_of(to_delayed_work(work),
struct xfs_mount, m_reclaim_work);
xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
xfs_syncd_queue_reclaim(mp);
}
/*
* Flush delayed allocate data, attempting to free up reserved space
* from existing allocations. At this point a new allocation attempt
* has failed with ENOSPC and we are in the process of scratching our
* heads, looking about for more room.
*
* Queue a new data flush if there isn't one already in progress and
* wait for completion of the flush. This means that we only ever have one
* inode flush in progress no matter how many ENOSPC events are occurring and
* so will prevent the system from bogging down due to every concurrent
* ENOSPC event scanning all the active inodes in the system for writeback.
*/
void
xfs_flush_inodes(
struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
queue_work(xfs_syncd_wq, &mp->m_flush_work);
flush_work_sync(&mp->m_flush_work);
}
STATIC void
xfs_flush_worker(
struct work_struct *work)
{
struct xfs_mount *mp = container_of(work,
struct xfs_mount, m_flush_work);
xfs_sync_data(mp, SYNC_TRYLOCK);
xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
}
int
xfs_syncd_init(
struct xfs_mount *mp)
{
mp->m_sync_work.w_syncer = xfs_sync_worker;
mp->m_sync_work.w_mount = mp;
mp->m_sync_work.w_completion = NULL;
mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname);
if (IS_ERR(mp->m_sync_task))
return -PTR_ERR(mp->m_sync_task);
INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
xfs_syncd_queue_sync(mp);
xfs_syncd_queue_reclaim(mp);
return 0;
}
@ -582,7 +569,9 @@ void
xfs_syncd_stop(
struct xfs_mount *mp)
{
kthread_stop(mp->m_sync_task);
cancel_delayed_work_sync(&mp->m_sync_work);
cancel_delayed_work_sync(&mp->m_reclaim_work);
cancel_work_sync(&mp->m_flush_work);
}
void
@ -601,6 +590,10 @@ __xfs_inode_set_reclaim_tag(
XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
XFS_ICI_RECLAIM_TAG);
spin_unlock(&ip->i_mount->m_perag_lock);
/* schedule periodic background inode reclaim */
xfs_syncd_queue_reclaim(ip->i_mount);
trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
-1, _RET_IP_);
}
@ -1017,7 +1010,13 @@ xfs_reclaim_inodes(
}
/*
* Shrinker infrastructure.
* Inode cache shrinker.
*
* When called we make sure that there is a background (fast) inode reclaim in
* progress, while we will throttle the speed of reclaim via doiing synchronous
* reclaim of inodes. That means if we come across dirty inodes, we wait for
* them to be cleaned, which we hope will not be very long due to the
* background walker having already kicked the IO off on those dirty inodes.
*/
static int
xfs_reclaim_inode_shrink(
@ -1032,10 +1031,15 @@ xfs_reclaim_inode_shrink(
mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
if (nr_to_scan) {
/* kick background reclaimer and push the AIL */
xfs_syncd_queue_reclaim(mp);
xfs_ail_push_all(mp->m_ail);
if (!(gfp_mask & __GFP_FS))
return -1;
xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan);
xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT,
&nr_to_scan);
/* terminate if we don't exhaust the scan */
if (nr_to_scan > 0)
return -1;