Date: Wed, 27 Feb 2013 07:31:24 +0000 (UTC) From: Konstantin Belousov <kib@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r247387 - head/sys/ufs/ffs Message-ID: <201302270731.r1R7VOrT056842@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: kib Date: Wed Feb 27 07:31:23 2013 New Revision: 247387 URL: http://svnweb.freebsd.org/changeset/base/247387 Log: An inode block must not be blockingly read while cg block is owned. The order is inode buffer lock -> snaplk -> cg buffer lock, reversing the order causes deadlocks. Inode block must not be written while cg block buffer is owned. The FFS copy on write needs to allocate a block to copy the content of the inode block, and the cylinder group selected for the allocation might be the same as the owned cg block. The reserved block detection code in the ffs_copyonwrite() and ffs_bp_snapblk() is unable to detect the situation, because the locked cg buffer is not exposed to it. In order to maintain the dependency between initialized inode block and the cg_initediblk pointer, look up the inode buffer in non-blocking mode. If succeeded, brelse cg block, initialize the inode block and write it. After the write is finished, reread cg block and update the cg_initediblk. If inode block is already locked by another thread, let the another thread initialize it. If another thread raced with us after we started writing inode block, the situation is detected by an update of cg_initediblk. Note that double-initialization of the inode block is harmless, the block cannot be used until cg_initediblk is incremented. Sponsored by: The FreeBSD Foundation In collaboration with: pho Reviewed by: mckusick MFC after: 1 month X-MFC-note: after r246877 Modified: head/sys/ufs/ffs/ffs_alloc.c Modified: head/sys/ufs/ffs/ffs_alloc.c ============================================================================== --- head/sys/ufs/ffs/ffs_alloc.c Wed Feb 27 06:53:15 2013 (r247386) +++ head/sys/ufs/ffs/ffs_alloc.c Wed Feb 27 07:31:23 2013 (r247387) @@ -1790,6 +1790,17 @@ fail: return (0); } +static inline struct buf * +getinobuf(struct inode *ip, u_int cg, u_int32_t cginoblk, int gbflags) +{ + struct fs *fs; + + fs = ip->i_fs; + return (getblk(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, + cg * fs->fs_ipg + cginoblk)), (int)fs->fs_bsize, 0, 0, + gbflags)); +} + /* * Determine whether an inode can be allocated. * @@ -1814,9 +1825,11 @@ ffs_nodealloccg(ip, cg, ipref, mode, unu u_int8_t *inosused, *loc; struct ufs2_dinode *dp2; int error, start, len, i; + u_int32_t old_initediblk; fs = ip->i_fs; ump = ip->i_ump; +check_nifree: if (fs->fs_cs(fs, cg).cs_nifree == 0) return (0); UFS_UNLOCK(ump); @@ -1828,13 +1841,13 @@ ffs_nodealloccg(ip, cg, ipref, mode, unu return (0); } cgp = (struct cg *)bp->b_data; +restart: if (!cg_chkmagic(cgp) || cgp->cg_cs.cs_nifree == 0) { brelse(bp); UFS_LOCK(ump); return (0); } bp->b_xflags |= BX_BKGRDWRITE; - cgp->cg_old_time = cgp->cg_time = time_second; inosused = cg_inosused(cgp); if (ipref) { ipref %= fs->fs_ipg; @@ -1856,7 +1869,6 @@ ffs_nodealloccg(ip, cg, ipref, mode, unu } } ipref = (loc - inosused) * NBBY + ffs(~*loc) - 1; - cgp->cg_irotor = ipref; gotit: /* * Check to see if we need to initialize more inodes. @@ -1864,9 +1876,37 @@ gotit: if (fs->fs_magic == FS_UFS2_MAGIC && ipref + INOPB(fs) > cgp->cg_initediblk && cgp->cg_initediblk < cgp->cg_niblk) { - ibp = getblk(ip->i_devvp, fsbtodb(fs, - ino_to_fsba(fs, cg * fs->fs_ipg + cgp->cg_initediblk)), - (int)fs->fs_bsize, 0, 0, 0); + old_initediblk = cgp->cg_initediblk; + + /* + * Free the cylinder group lock before writing the + * initialized inode block. Entering the + * babarrierwrite() with the cylinder group lock + * causes lock order violation between the lock and + * snaplk. + * + * Another thread can decide to initialize the same + * inode block, but whichever thread first gets the + * cylinder group lock after writing the newly + * allocated inode block will update it and the other + * will realize that it has lost and leave the + * cylinder group unchanged. + */ + ibp = getinobuf(ip, cg, old_initediblk, GB_LOCK_NOWAIT); + brelse(bp); + if (ibp == NULL) { + /* + * The inode block buffer is already owned by + * another thread, which must initialize it. + * Wait on the buffer to allow another thread + * to finish the updates, with dropped cg + * buffer lock, then retry. + */ + ibp = getinobuf(ip, cg, old_initediblk, 0); + brelse(ibp); + UFS_LOCK(ump); + goto check_nifree; + } bzero(ibp->b_data, (int)fs->fs_bsize); dp2 = (struct ufs2_dinode *)(ibp->b_data); for (i = 0; i < INOPB(fs); i++) { @@ -1883,8 +1923,29 @@ gotit: * loading of newly created filesystems. */ babarrierwrite(ibp); - cgp->cg_initediblk += INOPB(fs); + + /* + * After the inode block is written, try to update the + * cg initediblk pointer. If another thread beat us + * to it, then leave it unchanged as the other thread + * has already set it correctly. + */ + error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, &bp); + UFS_LOCK(ump); + ACTIVECLEAR(fs, cg); + UFS_UNLOCK(ump); + if (error != 0) { + brelse(bp); + return (error); + } + cgp = (struct cg *)bp->b_data; + if (cgp->cg_initediblk == old_initediblk) + cgp->cg_initediblk += INOPB(fs); + goto restart; } + cgp->cg_old_time = cgp->cg_time = time_second; + cgp->cg_irotor = ipref; UFS_LOCK(ump); ACTIVECLEAR(fs, cg); setbit(inosused, ipref);
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201302270731.r1R7VOrT056842>