Date:      Thu, 14 Jun 2018 18:13:37 -0500
From:      Alan Cox <alc@rice.edu>
To:        Steven Hartland <steven.hartland@multiplay.co.uk>
Cc:        Konstantin Belousov <kib@FreeBSD.org>, src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   Re: svn commit: r335171 - head/sys/vm
Message-ID:  <2DA5B980-1703-463C-80E1-B8430BFA2A38@rice.edu>
In-Reply-To: <F220E356-D55E-48B9-9AF0-ABBF41A74FC7@rice.edu>
References:  <201806141941.w5EJf2qa069373@repo.freebsd.org> <c241bd08-eb94-152e-c1f1-d77dc6987908@multiplay.co.uk> <F220E356-D55E-48B9-9AF0-ABBF41A74FC7@rice.edu>


> On Jun 14, 2018, at 6:07 PM, Alan Cox <alc@rice.edu> wrote:
> 
> 
>> On Jun 14, 2018, at 5:54 PM, Steven Hartland <steven.hartland@multiplay.co.uk> wrote:
>> 
>> Out of interest, how would this exhibit itself?
>> 
> 
> A panic in vm_page_insert_after().
> 

I should add that a non-debug kernel will panic a little later in vm_radix_insert().
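
For anyone chasing this in a crash dump: with INVARIANTS, the assertion that trips is the pindex ordering check in vm_page_insert_after(); without INVARIANTS, the duplicate pindex is only noticed once the page reaches the object's radix tree.  Roughly, paraphrasing the checks from memory (the exact messages may differ):

	/* vm_page_insert_after(), INVARIANTS only: the would-be
	 * successor already occupies the pindex being inserted. */
	if (msucc != NULL)
		KASSERT(msucc->pindex > pindex,
		    ("vm_page_insert_after: msucc doesn't succeed pindex"));

	/* vm_radix_insert(), unconditional: while walking the trie, a
	 * leaf with the same pindex is found to be already present. */
	if (vm_radix_topage(rnode)->pindex == index)
		panic("%s: key %jx is already present",
		    __func__, (uintmax_t)index);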

>> On 14/06/2018 20:41, Konstantin Belousov wrote:
>>> Author: kib
>>> Date: Thu Jun 14 19:41:02 2018
>>> New Revision: 335171
>>> URL: https://svnweb.freebsd.org/changeset/base/335171
>>>
>>> Log:
>>>   Handle the race between fork/vm_object_split() and faults.
>>>
>>>   If a fault started before vmspace_fork() locked the map, and then,
>>>   during the fork, vm_map_copy_entry()->vm_object_split() is executed,
>>>   it is possible that the fault instantiates the page in the original
>>>   object after the page was already copied into the new object (see
>>>   vm_object_split() for the orig/new objects terminology).  This can
>>>   happen if the split found a busy page (e.g. from the fault) and slept
>>>   dropping the object lock, which allows the swap pager to instantiate
>>>   read-behind pages for the fault.  Then the restart of the scan can see
>>>   a page in the scanned range that was already copied to the upper
>>>   object.
>>>
>>>   Fix it by instantiating the read-behind and read-ahead pages before
>>>   the swap_pager_getpages() method drops the lock to allocate the pbuf.
>>>   The object scan then sees the whole range prefilled with busy pages
>>>   and does not process the range again.
>>>
>>>   Note that vm_fault rechecks the map generation count after the object
>>>   unlock, so that it restarts the handling if it raced with the split,
>>>   and re-looks up the right page from the upper object.
>>>
>>>   In collaboration with:	alc
>>>   Tested by:	pho
>>>   Sponsored by:	The FreeBSD Foundation
>>>   MFC after:	1 week
>>>
>>> Modified:
>>>   head/sys/vm/swap_pager.c
>>>
>>> Modified: head/sys/vm/swap_pager.c
>>> ==============================================================================
>>> --- head/sys/vm/swap_pager.c	Thu Jun 14 19:01:40 2018	(r335170)
>>> +++ head/sys/vm/swap_pager.c	Thu Jun 14 19:41:02 2018	(r335171)
>>> @@ -1096,21 +1096,24 @@ swap_pager_getpages(vm_object_t object, vm_page_t *ma,
>>>      int *rahead)
>>>  {
>>>  	struct buf *bp;
>>> -	vm_page_t mpred, msucc, p;
>>> +	vm_page_t bm, mpred, msucc, p;
>>>  	vm_pindex_t pindex;
>>>  	daddr_t blk;
>>> -	int i, j, maxahead, maxbehind, reqcount, shift;
>>> +	int i, maxahead, maxbehind, reqcount;
>>> 
>>>  	reqcount = count;
>>> 
>>> -	VM_OBJECT_WUNLOCK(object);
>>> -	bp = getpbuf(&nsw_rcount);
>>> -	VM_OBJECT_WLOCK(object);
>>> -
>>> -	if (!swap_pager_haspage(object, ma[0]->pindex, &maxbehind, &maxahead)) {
>>> -		relpbuf(bp, &nsw_rcount);
>>> +	/*
>>> +	 * Determine the final number of read-behind pages and
>>> +	 * allocate them BEFORE releasing the object lock.  Otherwise,
>>> +	 * there can be a problematic race with vm_object_split().
>>> +	 * Specifically, vm_object_split() might first transfer pages
>>> +	 * that precede ma[0] in the current object to a new object,
>>> +	 * and then this function incorrectly recreates those pages as
>>> +	 * read-behind pages in the current object.
>>> +	 */
>>> +	if (!swap_pager_haspage(object, ma[0]->pindex, &maxbehind, &maxahead))
>>>  		return (VM_PAGER_FAIL);
>>> -	}
>>> 
>>>  	/*
>>>  	 * Clip the readahead and readbehind ranges to exclude resident pages.
>>> @@ -1132,35 +1135,31 @@ swap_pager_getpages(vm_object_t object, vm_page_t *ma,
>>>  			*rbehind = pindex - mpred->pindex - 1;
>>>  	}
>>> 
>>> +	bm = ma[0];
>>> +	for (i = 0; i < count; i++)
>>> +		ma[i]->oflags |= VPO_SWAPINPROG;
>>> +
>>>  	/*
>>>  	 * Allocate readahead and readbehind pages.
>>>  	 */
>>> -	shift = rbehind != NULL ? *rbehind : 0;
>>> -	if (shift != 0) {
>>> -		for (i = 1; i <= shift; i++) {
>>> +	if (rbehind != NULL) {
>>> +		for (i = 1; i <= *rbehind; i++) {
>>>  			p = vm_page_alloc(object, ma[0]->pindex - i,
>>>  			    VM_ALLOC_NORMAL);
>>> -			if (p == NULL) {
>>> -				/* Shift allocated pages to the left. */
>>> -				for (j = 0; j < i - 1; j++)
>>> -					bp->b_pages[j] =
>>> -					    bp->b_pages[j + shift - i + 1];
>>> +			if (p == NULL)
>>>  				break;
>>> -			}
>>> -			bp->b_pages[shift - i] = p;
>>> +			p->oflags |= VPO_SWAPINPROG;
>>> +			bm = p;
>>>  		}
>>> -		shift = i - 1;
>>> -		*rbehind = shift;
>>> +		*rbehind = i - 1;
>>>  	}
>>> -	for (i = 0; i < reqcount; i++)
>>> -		bp->b_pages[i + shift] = ma[i];
>>>  	if (rahead != NULL) {
>>>  		for (i = 0; i < *rahead; i++) {
>>>  			p = vm_page_alloc(object,
>>>  			    ma[reqcount - 1]->pindex + i + 1, VM_ALLOC_NORMAL);
>>>  			if (p == NULL)
>>>  				break;
>>> -			bp->b_pages[shift + reqcount + i] = p;
>>> +			p->oflags |= VPO_SWAPINPROG;
>>>  		}
>>>  		*rahead = i;
>>>  	}
>>> @@ -1171,15 +1170,18 @@ swap_pager_getpages(vm_object_t object, vm_page_t *ma,
>>> 
>>>  	vm_object_pip_add(object, count);
>>> 
>>> -	for (i = 0; i < count; i++)
>>> -		bp->b_pages[i]->oflags |= VPO_SWAPINPROG;
>>> -
>>> -	pindex = bp->b_pages[0]->pindex;
>>> +	pindex = bm->pindex;
>>>  	blk = swp_pager_meta_ctl(object, pindex, 0);
>>>  	KASSERT(blk != SWAPBLK_NONE,
>>>  	    ("no swap blocking containing %p(%jx)", object, (uintmax_t)pindex));
>>> 
>>>  	VM_OBJECT_WUNLOCK(object);
>>> +	bp = getpbuf(&nsw_rcount);
>>> +	/* Pages cannot leave the object while busy. */
>>> +	for (i = 0, p = bm; i < count; i++, p = TAILQ_NEXT(p, listq)) {
>>> +		MPASS(p->pindex == bm->pindex + i);
>>> +		bp->b_pages[i] = p;
>>> +	}
>>> 
>>>  	bp->b_flags |= B_PAGING;
>>>  	bp->b_iocmd = BIO_READ;
>>> 
>> 
> 
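
To illustrate the last paragraph of the log message: once the object lock has been dropped for the pager I/O, vm_fault rechecks the map generation and redoes the lookup, so a fault that raced with the split ends up finding the page in the new (upper) object instead of re-creating it in the original one.  In rough outline (condensed and paraphrased from sys/vm/vm_fault.c around this revision, not the literal code, which re-runs vm_map_lookup_locked() and only restarts when the entry actually changed):

	/* vm_fault_hold(): the object lock was dropped for pager I/O,
	 * so the map may have changed underneath us.  A generation
	 * mismatch (e.g. a concurrent fork running vm_object_split())
	 * forces the lookup to be redone before the page is mapped. */
	if (fs.map->timestamp != fs.map_generation) {
		release_page(&fs);
		unlock_and_deallocate(&fs);
		goto RetryFault;
	}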



