Contained in this update:

- Fix race conditions in the CoW code
 - Fix some incorrect input validation checks
 - Avoid crashing fs by running out of space when freeing inodes
 - Fix toctou race wrt whether or not an inode has an attr
 - Fix build error on arm
 - Fix page refcount corruption when readahead fails
 - Don't corrupt userspace in the bmap ioctl
 -----BEGIN PGP SIGNATURE-----
 Version: GnuPG v1
 
 iQIcBAABCgAGBQJYi4RJAAoJEPh/dxk0SrTrsU4QAIZBUUSpvpwggyfbTcOb6QWb
 F/vBoj50f+cxZB2jxLrch4aRsvlhW6IkRnsKiciG4cQaDbjPbjhSMH4JEUUqrPvG
 TRTcBvT6rEbxnB3adIH2DVrDAaEEVpRSkPaV/vLjYEfJp8Nyv8yXb6U8zH//NeZQ
 Pwrwe0RX0bJRyAEBIRwBnTayMP6xIccCE9Ml7+sMG/UVnb0Fa8t5zg/9igfd0MrB
 xf3MdkW0fpFCcp1Bbby8cnDmTjjxEtB6OApL82UnSZ7l2/U5AHhiA0NgHreYuzt8
 47ezqEQfk+IWK5LY1c6V/vARKhVvM738jS2dG1tsFhnbTbq9yXA2yiCMdA+sKB7+
 wlRIuTq7tuhN4Lk/9eheXHR4xHKDbOKY+zWEWi/AlFRaWmld0otMykVC6wbp6soo
 1gYgbaCjJcJcResKYAdby92jqvIRONqknpUF2L0jOiGIPgz8rmjA6BIvymjaXEuO
 4MLfSjeVP4Ip2tDcaa0R3dSQ40lP778UQNiuqcKb1WODx1AyljJB+gemK0jE8kwN
 OEY7IBSs+wP/UBYN+XbYhoIGlJ4ckwyhIctl4bMvVOduQ40uASlyQS6hmqng5Df/
 NIFd+fCwuBCa45mwUJ2LPTzx3WndMyLv30z/ladtshV+WlUbu60yTzT4bIiQDcpZ
 CYALhDjBiCHzrs6rIhxW
 =Co7t
 -----END PGP SIGNATURE-----

Merge tag 'xfs-for-linus-4.10-rc6-5' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull xfs uodates from Darrick Wong:
 "I have some more fixes this week: better input validation, corruption
  avoidance, build fixes, memory leak fixes, and a couple from Christoph
  to avoid an ENOSPC failure.

  Summary:
   - Fix race conditions in the CoW code
   - Fix some incorrect input validation checks
   - Avoid crashing fs by running out of space when freeing inodes
   - Fix toctou race wrt whether or not an inode has an attr
   - Fix build error on arm
   - Fix page refcount corruption when readahead fails
   - Don't corrupt userspace in the bmap ioctl"

* tag 'xfs-for-linus-4.10-rc6-5' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux:
  xfs: prevent quotacheck from overloading inode lru
  xfs: fix bmv_count confusion w/ shared extents
  xfs: clear _XBF_PAGES from buffers when readahead page
  xfs: extsize hints are not unlikely in xfs_bmap_btalloc
  xfs: remove racy hasattr check from attr ops
  xfs: use per-AG reservations for the finobt
  xfs: only update mount/resv fields on success in __xfs_ag_resv_init
  xfs: verify dirblocklog correctly
  xfs: fix COW writeback race
This commit is contained in:
Linus Torvalds 2017-01-27 12:44:32 -08:00
commit 3365135d43
13 changed files with 220 additions and 63 deletions

View file

@ -39,6 +39,7 @@
#include "xfs_rmap_btree.h"
#include "xfs_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_ialloc_btree.h"
/*
* Per-AG Block Reservations
@ -200,22 +201,30 @@ __xfs_ag_resv_init(
struct xfs_mount *mp = pag->pag_mount;
struct xfs_ag_resv *resv;
int error;
xfs_extlen_t reserved;
resv = xfs_perag_resv(pag, type);
if (used > ask)
ask = used;
resv->ar_asked = ask;
resv->ar_reserved = resv->ar_orig_reserved = ask - used;
mp->m_ag_max_usable -= ask;
reserved = ask - used;
trace_xfs_ag_resv_init(pag, type, ask);
error = xfs_mod_fdblocks(mp, -(int64_t)resv->ar_reserved, true);
if (error)
error = xfs_mod_fdblocks(mp, -(int64_t)reserved, true);
if (error) {
trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
error, _RET_IP_);
xfs_warn(mp,
"Per-AG reservation for AG %u failed. Filesystem may run out of space.",
pag->pag_agno);
return error;
}
return error;
mp->m_ag_max_usable -= ask;
resv = xfs_perag_resv(pag, type);
resv->ar_asked = ask;
resv->ar_reserved = resv->ar_orig_reserved = reserved;
trace_xfs_ag_resv_init(pag, type, ask);
return 0;
}
/* Create a per-AG block reservation. */
@ -223,6 +232,8 @@ int
xfs_ag_resv_init(
struct xfs_perag *pag)
{
struct xfs_mount *mp = pag->pag_mount;
xfs_agnumber_t agno = pag->pag_agno;
xfs_extlen_t ask;
xfs_extlen_t used;
int error = 0;
@ -231,23 +242,45 @@ xfs_ag_resv_init(
if (pag->pag_meta_resv.ar_asked == 0) {
ask = used = 0;
error = xfs_refcountbt_calc_reserves(pag->pag_mount,
pag->pag_agno, &ask, &used);
error = xfs_refcountbt_calc_reserves(mp, agno, &ask, &used);
if (error)
goto out;
error = xfs_finobt_calc_reserves(mp, agno, &ask, &used);
if (error)
goto out;
error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
ask, used);
if (error)
goto out;
if (error) {
/*
* Because we didn't have per-AG reservations when the
* finobt feature was added we might not be able to
* reserve all needed blocks. Warn and fall back to the
* old and potentially buggy code in that case, but
* ensure we do have the reservation for the refcountbt.
*/
ask = used = 0;
mp->m_inotbt_nores = true;
error = xfs_refcountbt_calc_reserves(mp, agno, &ask,
&used);
if (error)
goto out;
error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
ask, used);
if (error)
goto out;
}
}
/* Create the AGFL metadata reservation */
if (pag->pag_agfl_resv.ar_asked == 0) {
ask = used = 0;
error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno,
&ask, &used);
error = xfs_rmapbt_calc_reserves(mp, agno, &ask, &used);
if (error)
goto out;
@ -256,9 +289,16 @@ xfs_ag_resv_init(
goto out;
}
#ifdef DEBUG
/* need to read in the AGF for the ASSERT below to work */
error = xfs_alloc_pagf_init(pag->pag_mount, NULL, pag->pag_agno, 0);
if (error)
return error;
ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
xfs_perag_resv(pag, XFS_AG_RESV_AGFL)->ar_reserved <=
pag->pagf_freeblks + pag->pagf_flcount);
#endif
out:
return error;
}

View file

@ -131,9 +131,6 @@ xfs_attr_get(
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
if (!xfs_inode_hasattr(ip))
return -ENOATTR;
error = xfs_attr_args_init(&args, ip, name, flags);
if (error)
return error;
@ -392,9 +389,6 @@ xfs_attr_remove(
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return -EIO;
if (!xfs_inode_hasattr(dp))
return -ENOATTR;
error = xfs_attr_args_init(&args, dp, name, flags);
if (error)
return error;

View file

@ -3629,7 +3629,7 @@ xfs_bmap_btalloc(
align = xfs_get_cowextsz_hint(ap->ip);
else if (xfs_alloc_is_userdata(ap->datatype))
align = xfs_get_extsz_hint(ap->ip);
if (unlikely(align)) {
if (align) {
error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
align, 0, ap->eof, 0, ap->conv,
&ap->offset, &ap->length);
@ -3701,7 +3701,7 @@ xfs_bmap_btalloc(
args.minlen = ap->minlen;
}
/* apply extent size hints if obtained earlier */
if (unlikely(align)) {
if (align) {
args.prod = align;
if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod)))
args.mod = (xfs_extlen_t)(args.prod - args.mod);
@ -4514,8 +4514,6 @@ xfs_bmapi_write(
int n; /* current extent index */
xfs_fileoff_t obno; /* old block number (offset) */
int whichfork; /* data or attr fork */
char inhole; /* current location is hole in file */
char wasdelay; /* old extent was delayed */
#ifdef DEBUG
xfs_fileoff_t orig_bno; /* original block number value */
@ -4603,22 +4601,44 @@ xfs_bmapi_write(
bma.firstblock = firstblock;
while (bno < end && n < *nmap) {
inhole = eof || bma.got.br_startoff > bno;
wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
bool need_alloc = false, wasdelay = false;
/*
* Make sure we only reflink into a hole.
*/
if (flags & XFS_BMAPI_REMAP)
ASSERT(inhole);
if (flags & XFS_BMAPI_COWFORK)
ASSERT(!inhole);
/* in hole or beyoned EOF? */
if (eof || bma.got.br_startoff > bno) {
if (flags & XFS_BMAPI_DELALLOC) {
/*
* For the COW fork we can reasonably get a
* request for converting an extent that races
* with other threads already having converted
* part of it, as there converting COW to
* regular blocks is not protected using the
* IOLOCK.
*/
ASSERT(flags & XFS_BMAPI_COWFORK);
if (!(flags & XFS_BMAPI_COWFORK)) {
error = -EIO;
goto error0;
}
if (eof || bno >= end)
break;
} else {
need_alloc = true;
}
} else {
/*
* Make sure we only reflink into a hole.
*/
ASSERT(!(flags & XFS_BMAPI_REMAP));
if (isnullstartblock(bma.got.br_startblock))
wasdelay = true;
}
/*
* First, deal with the hole before the allocated space
* that we found, if any.
*/
if (inhole || wasdelay) {
if (need_alloc || wasdelay) {
bma.eof = eof;
bma.conv = !!(flags & XFS_BMAPI_CONVERT);
bma.wasdel = wasdelay;

View file

@ -110,6 +110,9 @@ struct xfs_extent_free_item
/* Map something in the CoW fork. */
#define XFS_BMAPI_COWFORK 0x200
/* Only convert delalloc space, don't allocate entirely new extents */
#define XFS_BMAPI_DELALLOC 0x400
#define XFS_BMAPI_FLAGS \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
{ XFS_BMAPI_METADATA, "METADATA" }, \
@ -120,7 +123,8 @@ struct xfs_extent_free_item
{ XFS_BMAPI_CONVERT, "CONVERT" }, \
{ XFS_BMAPI_ZERO, "ZERO" }, \
{ XFS_BMAPI_REMAP, "REMAP" }, \
{ XFS_BMAPI_COWFORK, "COWFORK" }
{ XFS_BMAPI_COWFORK, "COWFORK" }, \
{ XFS_BMAPI_DELALLOC, "DELALLOC" }
static inline int xfs_bmapi_aflag(int w)

View file

@ -82,11 +82,12 @@ xfs_finobt_set_root(
}
STATIC int
xfs_inobt_alloc_block(
__xfs_inobt_alloc_block(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *start,
union xfs_btree_ptr *new,
int *stat)
int *stat,
enum xfs_ag_resv_type resv)
{
xfs_alloc_arg_t args; /* block allocation args */
int error; /* error return value */
@ -103,6 +104,7 @@ xfs_inobt_alloc_block(
args.maxlen = 1;
args.prod = 1;
args.type = XFS_ALLOCTYPE_NEAR_BNO;
args.resv = resv;
error = xfs_alloc_vextent(&args);
if (error) {
@ -122,6 +124,27 @@ xfs_inobt_alloc_block(
return 0;
}
STATIC int
xfs_inobt_alloc_block(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *start,
union xfs_btree_ptr *new,
int *stat)
{
return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE);
}
STATIC int
xfs_finobt_alloc_block(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *start,
union xfs_btree_ptr *new,
int *stat)
{
return __xfs_inobt_alloc_block(cur, start, new, stat,
XFS_AG_RESV_METADATA);
}
STATIC int
xfs_inobt_free_block(
struct xfs_btree_cur *cur,
@ -328,7 +351,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
.dup_cursor = xfs_inobt_dup_cursor,
.set_root = xfs_finobt_set_root,
.alloc_block = xfs_inobt_alloc_block,
.alloc_block = xfs_finobt_alloc_block,
.free_block = xfs_inobt_free_block,
.get_minrecs = xfs_inobt_get_minrecs,
.get_maxrecs = xfs_inobt_get_maxrecs,
@ -480,3 +503,64 @@ xfs_inobt_rec_check_count(
return 0;
}
#endif /* DEBUG */
static xfs_extlen_t
xfs_inobt_max_size(
struct xfs_mount *mp)
{
/* Bail out if we're uninitialized, which can happen in mkfs. */
if (mp->m_inobt_mxr[0] == 0)
return 0;
return xfs_btree_calc_size(mp, mp->m_inobt_mnr,
(uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock /
XFS_INODES_PER_CHUNK);
}
static int
xfs_inobt_count_blocks(
struct xfs_mount *mp,
xfs_agnumber_t agno,
xfs_btnum_t btnum,
xfs_extlen_t *tree_blocks)
{
struct xfs_buf *agbp;
struct xfs_btree_cur *cur;
int error;
error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
if (error)
return error;
cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, btnum);
error = xfs_btree_count_blocks(cur, tree_blocks);
xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
xfs_buf_relse(agbp);
return error;
}
/*
* Figure out how many blocks to reserve and how many are used by this btree.
*/
int
xfs_finobt_calc_reserves(
struct xfs_mount *mp,
xfs_agnumber_t agno,
xfs_extlen_t *ask,
xfs_extlen_t *used)
{
xfs_extlen_t tree_len = 0;
int error;
if (!xfs_sb_version_hasfinobt(&mp->m_sb))
return 0;
error = xfs_inobt_count_blocks(mp, agno, XFS_BTNUM_FINO, &tree_len);
if (error)
return error;
*ask += xfs_inobt_max_size(mp);
*used += tree_len;
return 0;
}

View file

@ -72,4 +72,7 @@ int xfs_inobt_rec_check_count(struct xfs_mount *,
#define xfs_inobt_rec_check_count(mp, rec) 0
#endif /* DEBUG */
int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_extlen_t *ask, xfs_extlen_t *used);
#endif /* __XFS_IALLOC_BTREE_H__ */

View file

@ -242,7 +242,7 @@ xfs_mount_validate_sb(
sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG ||
sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG ||
sbp->sb_blocksize != (1 << sbp->sb_blocklog) ||
sbp->sb_dirblklog > XFS_MAX_BLOCKSIZE_LOG ||
sbp->sb_dirblklog + sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG ||
sbp->sb_inodesize < XFS_DINODE_MIN_SIZE ||
sbp->sb_inodesize > XFS_DINODE_MAX_SIZE ||
sbp->sb_inodelog < XFS_DINODE_MIN_LOG ||

View file

@ -528,7 +528,6 @@ xfs_getbmap(
xfs_bmbt_irec_t *map; /* buffer for user's data */
xfs_mount_t *mp; /* file system mount point */
int nex; /* # of user extents can do */
int nexleft; /* # of user extents left */
int subnex; /* # of bmapi's can do */
int nmap; /* number of map entries */
struct getbmapx *out; /* output structure */
@ -686,10 +685,8 @@ xfs_getbmap(
goto out_free_map;
}
nexleft = nex;
do {
nmap = (nexleft > subnex) ? subnex : nexleft;
nmap = (nex> subnex) ? subnex : nex;
error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
XFS_BB_TO_FSB(mp, bmv->bmv_length),
map, &nmap, bmapi_flags);
@ -697,8 +694,8 @@ xfs_getbmap(
goto out_free_map;
ASSERT(nmap <= subnex);
for (i = 0; i < nmap && nexleft && bmv->bmv_length &&
cur_ext < bmv->bmv_count; i++) {
for (i = 0; i < nmap && bmv->bmv_length &&
cur_ext < bmv->bmv_count - 1; i++) {
out[cur_ext].bmv_oflags = 0;
if (map[i].br_state == XFS_EXT_UNWRITTEN)
out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
@ -760,16 +757,27 @@ xfs_getbmap(
continue;
}
/*
* In order to report shared extents accurately,
* we report each distinct shared/unshared part
* of a single bmbt record using multiple bmap
* extents. To make that happen, we iterate the
* same map array item multiple times, each
* time trimming out the subextent that we just
* reported.
*
* Because of this, we must check the out array
* index (cur_ext) directly against bmv_count-1
* to avoid overflows.
*/
if (inject_map.br_startblock != NULLFSBLOCK) {
map[i] = inject_map;
i--;
} else
nexleft--;
}
bmv->bmv_entries++;
cur_ext++;
}
} while (nmap && nexleft && bmv->bmv_length &&
cur_ext < bmv->bmv_count);
} while (nmap && bmv->bmv_length && cur_ext < bmv->bmv_count - 1);
out_free_map:
kmem_free(map);

View file

@ -422,6 +422,7 @@ retry:
out_free_pages:
for (i = 0; i < bp->b_page_count; i++)
__free_page(bp->b_pages[i]);
bp->b_flags &= ~_XBF_PAGES;
return error;
}

View file

@ -1792,22 +1792,23 @@ xfs_inactive_ifree(
int error;
/*
* The ifree transaction might need to allocate blocks for record
* insertion to the finobt. We don't want to fail here at ENOSPC, so
* allow ifree to dip into the reserved block pool if necessary.
*
* Freeing large sets of inodes generally means freeing inode chunks,
* directory and file data blocks, so this should be relatively safe.
* Only under severe circumstances should it be possible to free enough
* inodes to exhaust the reserve block pool via finobt expansion while
* at the same time not creating free space in the filesystem.
* We try to use a per-AG reservation for any block needed by the finobt
* tree, but as the finobt feature predates the per-AG reservation
* support a degraded file system might not have enough space for the
* reservation at mount time. In that case try to dip into the reserved
* pool and pray.
*
* Send a warning if the reservation does happen to fail, as the inode
* now remains allocated and sits on the unlinked list until the fs is
* repaired.
*/
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
if (unlikely(mp->m_inotbt_nores)) {
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
&tp);
} else {
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
}
if (error) {
if (error == -ENOSPC) {
xfs_warn_ratelimited(mp,

View file

@ -681,7 +681,7 @@ xfs_iomap_write_allocate(
xfs_trans_t *tp;
int nimaps;
int error = 0;
int flags = 0;
int flags = XFS_BMAPI_DELALLOC;
int nres;
if (whichfork == XFS_COW_FORK)

View file

@ -140,6 +140,7 @@ typedef struct xfs_mount {
int m_fixedfsid[2]; /* unchanged for life of FS */
uint m_dmevmask; /* DMI events for this FS */
__uint64_t m_flags; /* global mount flags */
bool m_inotbt_nores; /* no per-AG finobt resv. */
int m_ialloc_inos; /* inodes in inode allocation */
int m_ialloc_blks; /* blocks in inode allocation */
int m_ialloc_min_blks;/* min blocks in sparse inode

View file

@ -1177,7 +1177,8 @@ xfs_qm_dqusage_adjust(
* the case in all other instances. It's OK that we do this because
* quotacheck is done only at mount time.
*/
error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip);
error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, XFS_ILOCK_EXCL,
&ip);
if (error) {
*res = BULKSTAT_RV_NOTHING;
return error;