Merge branch 'for-linus' of git://oss.sgi.com:8090/xfs/xfs-2.6
* 'for-linus' of git://oss.sgi.com:8090/xfs/xfs-2.6:
  [XFS] Fix memory corruption with small buffer reads
  [XFS] Fix inode list allocation size in writeback.
  [XFS] Don't allow memory reclaim to wait on the filesystem in inode
  [XFS] Fix fsync() b0rkage.
  [XFS] Include linux/random.h in all builds, not just debug builds.
fs/xfs/linux-2.6/xfs_buf.c

@@ -387,6 +387,8 @@ _xfs_buf_lookup_pages(
                 if (unlikely(page == NULL)) {
                         if (flags & XBF_READ_AHEAD) {
                                 bp->b_page_count = i;
+                                for (i = 0; i < bp->b_page_count; i++)
+                                        unlock_page(bp->b_pages[i]);
                                 return -ENOMEM;
                         }
 
@@ -416,17 +418,24 @@ _xfs_buf_lookup_pages(
                 ASSERT(!PagePrivate(page));
                 if (!PageUptodate(page)) {
                         page_count--;
-                        if (blocksize < PAGE_CACHE_SIZE && !PagePrivate(page)) {
+                        if (blocksize >= PAGE_CACHE_SIZE) {
+                                if (flags & XBF_READ)
+                                        bp->b_flags |= _XBF_PAGE_LOCKED;
+                        } else if (!PagePrivate(page)) {
                                 if (test_page_region(page, offset, nbytes))
                                         page_count++;
                         }
                 }
 
-                unlock_page(page);
                 bp->b_pages[i] = page;
                 offset = 0;
         }
 
+        if (!(bp->b_flags & _XBF_PAGE_LOCKED)) {
+                for (i = 0; i < bp->b_page_count; i++)
+                        unlock_page(bp->b_pages[i]);
+        }
+
         if (page_count == bp->b_page_count)
                 bp->b_flags |= XBF_DONE;
 
@@ -746,6 +755,7 @@ xfs_buf_associate_memory(
         bp->b_count_desired = len;
         bp->b_buffer_length = buflen;
         bp->b_flags |= XBF_MAPPED;
+        bp->b_flags &= ~_XBF_PAGE_LOCKED;
 
         return 0;
 }
@@ -1093,9 +1103,11 @@ _xfs_buf_ioend(
         xfs_buf_t               *bp,
         int                     schedule)
 {
-        if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
+        if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
+                bp->b_flags &= ~_XBF_PAGE_LOCKED;
                 xfs_buf_ioend(bp, schedule);
+        }
 }
 
 STATIC void
 xfs_buf_bio_end_io(
@@ -1125,6 +1137,9 @@ xfs_buf_bio_end_io(
 
                 if (--bvec >= bio->bi_io_vec)
                         prefetchw(&bvec->bv_page->flags);
+
+                if (bp->b_flags & _XBF_PAGE_LOCKED)
+                        unlock_page(page);
         } while (bvec >= bio->bi_io_vec);
 
         _xfs_buf_ioend(bp, 1);
@@ -1163,7 +1178,8 @@ _xfs_buf_ioapply(
          * filesystem block size is not smaller than the page size.
          */
         if ((bp->b_buffer_length < PAGE_CACHE_SIZE) &&
-            (bp->b_flags & XBF_READ) &&
+            ((bp->b_flags & (XBF_READ|_XBF_PAGE_LOCKED)) ==
+              (XBF_READ|_XBF_PAGE_LOCKED)) &&
             (blocksize >= PAGE_CACHE_SIZE)) {
                 bio = bio_alloc(GFP_NOIO, 1);
 
fs/xfs/linux-2.6/xfs_buf.h

@@ -66,6 +66,25 @@ typedef enum {
         _XBF_PAGES = (1 << 18),     /* backed by refcounted pages        */
         _XBF_RUN_QUEUES = (1 << 19),/* run block device task queue       */
         _XBF_DELWRI_Q = (1 << 21),  /* buffer on delwri queue            */
+
+        /*
+         * Special flag for supporting metadata blocks smaller than a FSB.
+         *
+         * In this case we can have multiple xfs_buf_t on a single page and
+         * need to lock out concurrent xfs_buf_t readers as they only
+         * serialise access to the buffer.
+         *
+         * In the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation
+         * between reads of the page. Hence we can have one thread read the
+         * page and modify it, but then race with another thread that thinks
+         * the page is not up-to-date and hence reads it again.
+         *
+         * The result is that the first modification to the page is lost.
+         * This sort of AGF/AGI reading race can happen when unlinking inodes
+         * that require truncation and results in the AGI unlinked list
+         * modifications being lost.
+         */
+        _XBF_PAGE_LOCKED = (1 << 22),
 } xfs_buf_flags_t;
 
 typedef enum {
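The race the new comment describes is worth seeing in isolation. Below is a minimal userspace sketch of the serialisation idea, with a pthread mutex standing in for the page lock; the names and structure are illustrative assumptions, not the kernel implementation.

#include <pthread.h>
#include <stdbool.h>
#include <string.h>

/* Hypothetical stand-in for a page backing several sub-page buffers. */
struct fake_page {
        pthread_mutex_t lock;           /* plays the role of the page lock */
        bool            uptodate;       /* PageUptodate() analogue */
        char            data[4096];
};

/*
 * Read and modify one sub-page buffer.  Holding the "page lock" across the
 * uptodate check, the (simulated) disk read, and the modification is what
 * stops a second reader from deciding the page is stale and re-reading it
 * over the first reader's changes -- the lost update described above.
 */
static void read_and_modify(struct fake_page *pg, size_t off, char v)
{
        pthread_mutex_lock(&pg->lock);          /* _XBF_PAGE_LOCKED analogue */
        if (!pg->uptodate) {
                memset(pg->data, 0, sizeof(pg->data));  /* "disk read" */
                pg->uptodate = true;
        }
        pg->data[off] = v;
        pthread_mutex_unlock(&pg->lock);        /* only after I/O completes */
}

int main(void)
{
        struct fake_page pg = { .lock = PTHREAD_MUTEX_INITIALIZER };

        read_and_modify(&pg, 100, 'A');
        read_and_modify(&pg, 200, 'B');         /* sees 'A'; does not wipe it */
        return 0;
}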
fs/xfs/linux-2.6/xfs_file.c

@@ -184,19 +184,24 @@ xfs_file_release(
         return -xfs_release(XFS_I(inode));
 }
 
+/*
+ * We ignore the datasync flag here because a datasync is effectively
+ * identical to an fsync. That is, datasync implies that we need to write
+ * only the metadata needed to be able to access the data that is written
+ * if we crash after the call completes. Hence if we are writing beyond
+ * EOF we have to log the inode size change as well, which makes it a
+ * full fsync. If we don't write beyond EOF, the inode core will be
+ * clean in memory and so we don't need to log the inode, just like
+ * fsync.
+ */
 STATIC int
 xfs_file_fsync(
         struct file             *filp,
         struct dentry           *dentry,
         int                     datasync)
 {
-        int                     flags = FSYNC_WAIT;
-
-        if (datasync)
-                flags |= FSYNC_DATA;
         xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED);
-        return -xfs_fsync(XFS_I(dentry->d_inode), flags,
-                        (xfs_off_t)0, (xfs_off_t)-1);
+        return -xfs_fsync(XFS_I(dentry->d_inode));
 }
 
 /*
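The reasoning in the new comment can be illustrated from the userspace side with nothing but standard POSIX calls; a minimal example (error handling elided for brevity):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        int fd = open("testfile", O_WRONLY | O_CREAT | O_APPEND, 0644);

        write(fd, "x", 1);      /* extends the file: the size changes */

        /*
         * fdatasync() must persist any metadata needed to read the data
         * back -- here, the new file size.  So for a write beyond EOF the
         * inode core has to be logged either way, and a datasync is
         * effectively a full fsync, which is why the flag can be ignored.
         */
        fdatasync(fd);
        close(fd);
        return 0;
}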
fs/xfs/linux-2.6/xfs_vnode.h

@@ -229,14 +229,6 @@ static inline void vn_atime_to_time_t(bhv_vnode_t *vp, time_t *tt)
 #define ATTR_NOLOCK     0x200   /* Don't grab any conflicting locks */
 #define ATTR_NOSIZETOK  0x400   /* Don't get the SIZE token */
 
-/*
- * Flags to vop_fsync/reclaim.
- */
-#define FSYNC_NOWAIT    0       /* asynchronous flush                   */
-#define FSYNC_WAIT      0x1     /* synchronous fsync or forced reclaim  */
-#define FSYNC_INVAL     0x2     /* flush and invalidate cached data     */
-#define FSYNC_DATA      0x4     /* synchronous fsync of data only       */
-
 /*
  * Tracking vnode activity.
  */
fs/xfs/xfs_inode.c

@@ -2974,6 +2974,7 @@ xfs_iflush_cluster(
         xfs_mount_t             *mp = ip->i_mount;
         xfs_perag_t             *pag = xfs_get_perag(mp, ip->i_ino);
         unsigned long           first_index, mask;
+        unsigned long           inodes_per_cluster;
         int                     ilist_size;
         xfs_inode_t             **ilist;
         xfs_inode_t             *iq;
@@ -2985,8 +2986,9 @@ xfs_iflush_cluster(
         ASSERT(pag->pagi_inodeok);
         ASSERT(pag->pag_ici_init);
 
-        ilist_size = XFS_INODE_CLUSTER_SIZE(mp) * sizeof(xfs_inode_t *);
-        ilist = kmem_alloc(ilist_size, KM_MAYFAIL);
+        inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog;
+        ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
+        ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
         if (!ilist)
                 return 0;
 
@@ -2995,8 +2997,7 @@ xfs_iflush_cluster(
         read_lock(&pag->pag_ici_lock);
         /* really need a gang lookup range call here */
         nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
-                                        first_index,
-                                        XFS_INODE_CLUSTER_SIZE(mp));
+                                        first_index, inodes_per_cluster);
         if (nr_found == 0)
                 goto out_free;
 
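To see what the size fix changes, here is the arithmetic under an assumed but typical geometry (8 KiB inode clusters, 256-byte inodes, so sb_inodelog = 8); the values below are illustrative, not taken from the commit:

#include <stdio.h>

int main(void)
{
        /* Assumed geometry: XFS_INODE_CLUSTER_SIZE(mp) = 8192 bytes,
         * mp->m_sb.sb_inodelog = 8 (log2 of a 256-byte inode). */
        unsigned long cluster_size = 8192;
        unsigned long inodelog = 8;
        unsigned long inodes_per_cluster = cluster_size >> inodelog;   /* 32 */

        /* Old code sized the list in bytes, i.e. 8192 pointers ... */
        printf("old ilist_size = %lu bytes\n", cluster_size * sizeof(void *));
        /* ... new code sizes it by inode count: 32 pointers. */
        printf("new ilist_size = %lu bytes\n",
               inodes_per_cluster * sizeof(void *));
        return 0;
}

The KM_NOFS addition in the same hunk belongs to the separate reclaim fix in this merge: it keeps the allocation from recursing into filesystem reclaim while writeback is in progress.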
fs/xfs/xfs_vnodeops.c

@@ -856,18 +856,14 @@ xfs_readlink(
 /*
  * xfs_fsync
  *
- * This is called to sync the inode and its data out to disk.
- * We need to hold the I/O lock while flushing the data, and
- * the inode lock while flushing the inode.  The inode lock CANNOT
- * be held while flushing the data, so acquire after we're done
- * with that.
+ * This is called to sync the inode and its data out to disk.  We need to hold
+ * the I/O lock while flushing the data, and the inode lock while flushing the
+ * inode.  The inode lock CANNOT be held while flushing the data, so acquire
+ * after we're done with that.
  */
 int
 xfs_fsync(
-        xfs_inode_t     *ip,
-        int             flag,
-        xfs_off_t       start,
-        xfs_off_t       stop)
+        xfs_inode_t     *ip)
 {
         xfs_trans_t     *tp;
         int             error;
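The lock-ordering rule in the comment above, as a self-contained sketch; the lock and helper names are hypothetical, not the kernel locks:

#include <pthread.h>

static pthread_mutex_t io_lock = PTHREAD_MUTEX_INITIALIZER;    /* "I/O lock" */
static pthread_mutex_t inode_lock = PTHREAD_MUTEX_INITIALIZER; /* "inode lock" */

static void flush_data(void)  { /* write back file data */ }
static void flush_inode(void) { /* write back the inode itself */ }

int main(void)
{
        /* Flush the data under the I/O lock ... */
        pthread_mutex_lock(&io_lock);
        flush_data();
        pthread_mutex_unlock(&io_lock);

        /* ... and only then take the inode lock: it must never be held
         * while the data flush is in progress. */
        pthread_mutex_lock(&inode_lock);
        flush_inode();
        pthread_mutex_unlock(&inode_lock);
        return 0;
}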
@@ -875,102 +871,78 @@ xfs_fsync(
 
         xfs_itrace_entry(ip);
 
-        ASSERT(start >= 0 && stop >= -1);
-
         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                 return XFS_ERROR(EIO);
 
-        if (flag & FSYNC_DATA)
-                filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping);
+        /* capture size updates in I/O completion before writing the inode. */
+        error = filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping);
+        if (error)
+                return XFS_ERROR(error);
 
         /*
-         * We always need to make sure that the required inode state
-         * is safe on disk.  The vnode might be clean but because
-         * of committed transactions that haven't hit the disk yet.
-         * Likewise, there could be unflushed non-transactional
-         * changes to the inode core that have to go to disk.
+         * We always need to make sure that the required inode state is safe on
+         * disk.  The vnode might be clean but we still might need to force the
+         * log because of committed transactions that haven't hit the disk yet.
+         * Likewise, there could be unflushed non-transactional changes to the
+         * inode core that have to go to disk and this requires us to issue
+         * a synchronous transaction to capture these changes correctly.
          *
-         * The following code depends on one assumption:  that
-         * any transaction that changes an inode logs the core
-         * because it has to change some field in the inode core
-         * (typically nextents or nblocks).  That assumption
-         * implies that any transactions against an inode will
-         * catch any non-transactional updates.  If inode-altering
-         * transactions exist that violate this assumption, the
-         * code breaks.  Right now, it figures that if the involved
-         * update_* field is clear and the inode is unpinned, the
-         * inode is clean.  Either it's been flushed or it's been
-         * committed and the commit has hit the disk unpinning the inode.
-         * (Note that xfs_inode_item_format() called at commit clears
-         * the update_* fields.)
+         * This code relies on the assumption that if the update_* fields
+         * of the inode are clear and the inode is unpinned then it is clean
+         * and no action is required.
          */
         xfs_ilock(ip, XFS_ILOCK_SHARED);
 
-        /* If we are flushing data then we care about update_size
-         * being set, otherwise we care about update_core
-         */
-        if ((flag & FSYNC_DATA) ?
-                        (ip->i_update_size == 0) :
-                        (ip->i_update_core == 0)) {
+        if (!(ip->i_update_size || ip->i_update_core)) {
                 /*
-                 * Timestamps/size haven't changed since last inode
-                 * flush or inode transaction commit.  That means
-                 * either nothing got written or a transaction
-                 * committed which caught the updates.  If the
-                 * latter happened and the transaction hasn't
-                 * hit the disk yet, the inode will be still
-                 * be pinned.  If it is, force the log.
+                 * Timestamps/size haven't changed since last inode flush or
+                 * inode transaction commit.  That means either nothing got
+                 * written or a transaction committed which caught the updates.
+                 * If the latter happened and the transaction hasn't hit the
+                 * disk yet, the inode will be still be pinned.  If it is,
+                 * force the log.
                  */
 
                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
                 if (xfs_ipincount(ip)) {
-                        _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
-                                      XFS_LOG_FORCE |
-                                      ((flag & FSYNC_WAIT)
-                                       ? XFS_LOG_SYNC : 0),
+                        error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
+                                      XFS_LOG_FORCE | XFS_LOG_SYNC,
                                       &log_flushed);
                 } else {
                         /*
-                         * If the inode is not pinned and nothing
-                         * has changed we don't need to flush the
-                         * cache.
+                         * If the inode is not pinned and nothing has changed
+                         * we don't need to flush the cache.
                          */
                         changed = 0;
                 }
-                error = 0;
         } else {
                 /*
-                 * Kick off a transaction to log the inode
-                 * core to get the updates.  Make it
-                 * sync if FSYNC_WAIT is passed in (which
-                 * is done by everybody but specfs).  The
-                 * sync transaction will also force the log.
+                 * Kick off a transaction to log the inode core to get the
+                 * updates.  The sync transaction will also force the log.
                  */
                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
                 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
-                if ((error = xfs_trans_reserve(tp, 0,
-                                XFS_FSYNC_TS_LOG_RES(ip->i_mount),
-                                0, 0, 0))) {
+                error = xfs_trans_reserve(tp, 0,
+                                XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
+                if (error) {
                         xfs_trans_cancel(tp, 0);
                         return error;
                 }
                 xfs_ilock(ip, XFS_ILOCK_EXCL);
 
                 /*
-                 * Note - it's possible that we might have pushed
-                 * ourselves out of the way during trans_reserve
-                 * which would flush the inode.  But there's no
-                 * guarantee that the inode buffer has actually
-                 * gone out yet (it's delwri).  Plus the buffer
-                 * could be pinned anyway if it's part of an
-                 * inode in another recent transaction.  So we
-                 * play it safe and fire off the transaction anyway.
+                 * Note - it's possible that we might have pushed ourselves out
+                 * of the way during trans_reserve which would flush the inode.
+                 * But there's no guarantee that the inode buffer has actually
+                 * gone out yet (it's delwri).  Plus the buffer could be pinned
+                 * anyway if it's part of an inode in another recent
+                 * transaction.  So we play it safe and fire off the
+                 * transaction anyway.
                  */
                 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
                 xfs_trans_ihold(tp, ip);
                 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-                if (flag & FSYNC_WAIT)
-                        xfs_trans_set_sync(tp);
+                xfs_trans_set_sync(tp);
                 error = _xfs_trans_commit(tp, 0, &log_flushed);
 
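Condensed, the rewritten function's decision logic reduces to the sketch below; the struct and helpers are hypothetical placeholders for the kernel code in the hunk above, not real XFS calls.

#include <stdbool.h>

/* Hypothetical stand-in for the inode state xfs_fsync() inspects. */
struct inode_state {
        bool update_size;       /* i_update_size */
        bool update_core;       /* i_update_core */
        int  pincount;          /* xfs_ipincount() */
};

/* Stub helpers standing in for the real kernel calls. */
static int wait_for_data_io(struct inode_state *ip)     { (void)ip; return 0; }
static int force_log_sync(struct inode_state *ip)       { (void)ip; return 0; }
static int commit_sync_inode_tx(struct inode_state *ip) { (void)ip; return 0; }

static int fsync_flow(struct inode_state *ip)
{
        int error = wait_for_data_io(ip);       /* filemap_fdatawait() */
        if (error)
                return error;

        if (!ip->update_size && !ip->update_core) {
                /* Clean core: only a committed-but-unflushed transaction
                 * can still matter.  If the inode is pinned, force the
                 * log synchronously; otherwise there is nothing to do. */
                if (ip->pincount)
                        return force_log_sync(ip);
                return 0;
        }

        /* Dirty core: a synchronous transaction logs the inode and forces
         * the log at commit.  There is no FSYNC_WAIT any more -- fsync
         * always waits. */
        return commit_sync_inode_tx(ip);
}

int main(void)
{
        struct inode_state ip = { 0 };
        return fsync_flow(&ip);
}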
fs/xfs/xfs_vnodeops.h

@@ -18,8 +18,7 @@ int xfs_open(struct xfs_inode *ip);
 int xfs_setattr(struct xfs_inode *ip, struct bhv_vattr *vap, int flags,
                 struct cred *credp);
 int xfs_readlink(struct xfs_inode *ip, char *link);
-int xfs_fsync(struct xfs_inode *ip, int flag, xfs_off_t start,
-                xfs_off_t stop);
+int xfs_fsync(struct xfs_inode *ip);
 int xfs_release(struct xfs_inode *ip);
 int xfs_inactive(struct xfs_inode *ip);
 int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,