Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 8 Dec 2012 04:03:04 +0000 (UTC)
From:      "Kenneth D. Merry" <ken@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r244014 - head/sys/cam/scsi
Message-ID:  <201212080403.qB8434EP057889@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: ken
Date: Sat Dec  8 04:03:04 2012
New Revision: 244014
URL: http://svnweb.freebsd.org/changeset/base/244014

Log:
  Fix a device departure bug for the the pass(4), enc(4), sg(4) and ch(4)
  drivers.
  
  The bug occurrs when a userland process has the driver instance
  open and the underlying device goes away.  We get the devfs
  callback that the device node has been destroyed, but not all of
  the closes necessary to fully decrement the reference count on the
  CAM peripheral.
  
  The reason is that once devfs calls back and says the device has
  been destroyed, it is moved off to deadfs, and devfs guarantees
  that there will be no more open or close calls.  So the solution
  is to keep track of how many outstanding open calls there are on
  the device, and just release that many references when we get the
  callback from devfs.
  
  scsi_pass.c,
  scsi_enc.c,
  scsi_enc_internal.h:	Add an open count to the softc in these
  			drivers.  Increment it on open and
  			decrement it on close.
  
  			When we get a devfs callback to say that
  			the device node has gone away, decrement
  			the peripheral reference count by the
  			number of still outstanding opens.
  
  			Make sure we don't access the peripheral
  			with cam_periph_unlock() after what might
  			be the final call to
  			cam_periph_release_locked().  The
  			peripheral might have been freed, and we
  			will be dereferencing freed memory.
  
  scsi_ch.c,
  scsi_sg.c:		For the ch(4) and sg(4) drivers, add the
  			same changes described above, and in
  			addition, fix another bug that was
  			previously fixed in the pass(4) and enc(4)
  			drivers.
  
  			These drivers were calling destroy_dev()
  			from their cleanup routine, but that could
  			cause a deadlock because the cleanup
  			routine could be indirectly called from
  			the driver's close routine.  This would
  			cause a deadlock, because the device node
  			is being held open by the active close
  			call, and can't be destroyed.
  
  Sponsored by:	Spectra Logic Corporation
  MFC after:	1 week

Modified:
  head/sys/cam/scsi/scsi_ch.c
  head/sys/cam/scsi/scsi_enc.c
  head/sys/cam/scsi/scsi_enc_internal.h
  head/sys/cam/scsi/scsi_pass.c
  head/sys/cam/scsi/scsi_sg.c

Modified: head/sys/cam/scsi/scsi_ch.c
==============================================================================
--- head/sys/cam/scsi/scsi_ch.c	Sat Dec  8 02:37:18 2012	(r244013)
+++ head/sys/cam/scsi/scsi_ch.c	Sat Dec  8 04:03:04 2012	(r244014)
@@ -144,7 +144,8 @@ struct ch_softc {
 	ch_quirks	quirks;
 	union ccb	saved_ccb;
 	struct devstat	*device_stats;
-	struct cdev *dev;
+	struct cdev     *dev;
+	int		open_count;
 
 	int		sc_picker;	/* current picker */
 
@@ -237,6 +238,48 @@ chinit(void)
 }
 
 static void
+chdevgonecb(void *arg)
+{
+	struct cam_sim	  *sim;
+	struct ch_softc   *softc;
+	struct cam_periph *periph;
+	int i;
+
+	periph = (struct cam_periph *)arg;
+	sim = periph->sim;
+	softc = (struct ch_softc *)periph->softc;
+
+	KASSERT(softc->open_count >= 0, ("Negative open count %d",
+		softc->open_count));
+
+	mtx_lock(sim->mtx);
+
+	/*
+	 * When we get this callback, we will get no more close calls from
+	 * devfs.  So if we have any dangling opens, we need to release the
+	 * reference held for that particular context.
+	 */
+	for (i = 0; i < softc->open_count; i++)
+		cam_periph_release_locked(periph);
+
+	softc->open_count = 0;
+
+	/*
+	 * Release the reference held for the device node, it is gone now.
+	 */
+	cam_periph_release_locked(periph);
+
+	/*
+	 * We reference the SIM lock directly here, instead of using
+	 * cam_periph_unlock().  The reason is that the final call to
+	 * cam_periph_release_locked() above could result in the periph
+	 * getting freed.  If that is the case, dereferencing the periph
+	 * with a cam_periph_unlock() call would cause a page fault.
+	 */
+	mtx_unlock(sim->mtx);
+}
+
+static void
 choninvalidate(struct cam_periph *periph)
 {
 	struct ch_softc *softc;
@@ -250,6 +293,12 @@ choninvalidate(struct cam_periph *periph
 
 	softc->flags |= CH_FLAG_INVALID;
 
+	/*
+	 * Tell devfs this device has gone away, and ask for a callback
+	 * when it has cleaned up its state.
+	 */
+	destroy_dev_sched_cb(softc->dev, chdevgonecb, periph);
+
 	xpt_print(periph->path, "lost device\n");
 
 }
@@ -262,10 +311,9 @@ chcleanup(struct cam_periph *periph)
 	softc = (struct ch_softc *)periph->softc;
 
 	xpt_print(periph->path, "removing device entry\n");
+
 	devstat_remove_entry(softc->device_stats);
-	cam_periph_unlock(periph);
-	destroy_dev(softc->dev);
-	cam_periph_lock(periph);
+
 	free(softc, M_DEVBUF);
 }
 
@@ -359,6 +407,19 @@ chregister(struct cam_periph *periph, vo
 			  XPORT_DEVSTAT_TYPE(cpi.transport),
 			  DEVSTAT_PRIORITY_OTHER);
 
+	/*
+	 * Acquire a reference to the periph before we create the devfs
+	 * instance for it.  We'll release this reference once the devfs
+	 * instance has been freed.
+	 */
+	if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
+		xpt_print(periph->path, "%s: lost periph during "
+			  "registration!\n", __func__);
+		cam_periph_lock(periph);
+		return (CAM_REQ_CMP_ERR);
+	}
+
+
 	/* Register the device */
 	softc->dev = make_dev(&ch_cdevsw, periph->unit_number, UID_ROOT,
 			      GID_OPERATOR, 0600, "%s%d", periph->periph_name,
@@ -419,6 +480,9 @@ chopen(struct cdev *dev, int flags, int 
 	}
 
 	cam_periph_unhold(periph);
+
+	softc->open_count++;
+
 	cam_periph_unlock(periph);
 
 	return(error);
@@ -427,13 +491,36 @@ chopen(struct cdev *dev, int flags, int 
 static int
 chclose(struct cdev *dev, int flag, int fmt, struct thread *td)
 {
+	struct	cam_sim *sim;
 	struct	cam_periph *periph;
+	struct  ch_softc *softc;
 
 	periph = (struct cam_periph *)dev->si_drv1;
 	if (periph == NULL)
 		return(ENXIO);
 
-	cam_periph_release(periph);
+	sim = periph->sim;
+	softc = (struct ch_softc *)periph->softc;
+
+	mtx_lock(sim->mtx);
+
+	softc->open_count--;
+
+	cam_periph_release_locked(periph);
+
+	/*
+	 * We reference the SIM lock directly here, instead of using
+	 * cam_periph_unlock().  The reason is that the call to
+	 * cam_periph_release_locked() above could result in the periph
+	 * getting freed.  If that is the case, dereferencing the periph
+	 * with a cam_periph_unlock() call would cause a page fault.
+	 *
+	 * cam_periph_release() avoids this problem using the same method,
+	 * but we're manually acquiring and dropping the lock here to
+	 * protect the open count and avoid another lock acquisition and
+	 * release.
+	 */
+	mtx_unlock(sim->mtx);
 
 	return(0);
 }

Modified: head/sys/cam/scsi/scsi_enc.c
==============================================================================
--- head/sys/cam/scsi/scsi_enc.c	Sat Dec  8 02:37:18 2012	(r244013)
+++ head/sys/cam/scsi/scsi_enc.c	Sat Dec  8 04:03:04 2012	(r244014)
@@ -111,11 +111,40 @@ enc_init(void)
 static void
 enc_devgonecb(void *arg)
 {
+	struct cam_sim    *sim;
 	struct cam_periph *periph;
+	struct enc_softc  *enc;
+	int i;
 
 	periph = (struct cam_periph *)arg;
+	sim = periph->sim;
+	enc = (struct enc_softc *)periph->softc;
+
+	mtx_lock(sim->mtx);
+
+	/*
+	 * When we get this callback, we will get no more close calls from
+	 * devfs.  So if we have any dangling opens, we need to release the
+	 * reference held for that particular context.
+	 */
+	for (i = 0; i < enc->open_count; i++)
+		cam_periph_release_locked(periph);
 
-	cam_periph_release(periph);
+	enc->open_count = 0;
+
+	/*
+	 * Release the reference held for the device node, it is gone now.
+	 */
+	cam_periph_release_locked(periph);
+
+	/*
+	 * We reference the SIM lock directly here, instead of using
+	 * cam_periph_unlock().  The reason is that the final call to
+	 * cam_periph_release_locked() above could result in the periph
+	 * getting freed.  If that is the case, dereferencing the periph
+	 * with a cam_periph_unlock() call would cause a page fault.
+	 */
+	mtx_unlock(sim->mtx);
 }
 
 static void
@@ -262,6 +291,8 @@ enc_open(struct cdev *dev, int flags, in
 out:
 	if (error != 0)
 		cam_periph_release_locked(periph);
+	else
+		softc->open_count++;
 
 	cam_periph_unlock(periph);
 
@@ -271,13 +302,36 @@ out:
 static int
 enc_close(struct cdev *dev, int flag, int fmt, struct thread *td)
 {
+	struct cam_sim    *sim;
 	struct cam_periph *periph;
+	struct enc_softc  *enc;
 
 	periph = (struct cam_periph *)dev->si_drv1;
 	if (periph == NULL)
 		return (ENXIO);
 
-	cam_periph_release(periph);
+	sim = periph->sim;
+	enc = periph->softc;
+
+	mtx_lock(sim->mtx);
+
+	enc->open_count--;
+
+	cam_periph_release_locked(periph);
+
+	/*
+	 * We reference the SIM lock directly here, instead of using
+	 * cam_periph_unlock().  The reason is that the call to
+	 * cam_periph_release_locked() above could result in the periph
+	 * getting freed.  If that is the case, dereferencing the periph
+	 * with a cam_periph_unlock() call would cause a page fault.
+	 *
+	 * cam_periph_release() avoids this problem using the same method,
+	 * but we're manually acquiring and dropping the lock here to
+	 * protect the open count and avoid another lock acquisition and
+	 * release.
+	 */
+	mtx_unlock(sim->mtx);
 
 	return (0);
 }
@@ -946,6 +1000,11 @@ enc_ctor(struct cam_periph *periph, void
 		}
 	}
 
+	/*
+	 * Acquire a reference to the periph before we create the devfs
+	 * instance for it.  We'll release this reference once the devfs
+	 * instance has been freed.
+	 */
 	if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
 		xpt_print(periph->path, "%s: lost periph during "
 			  "registration!\n", __func__);

Modified: head/sys/cam/scsi/scsi_enc_internal.h
==============================================================================
--- head/sys/cam/scsi/scsi_enc_internal.h	Sat Dec  8 02:37:18 2012	(r244013)
+++ head/sys/cam/scsi/scsi_enc_internal.h	Sat Dec  8 04:03:04 2012	(r244014)
@@ -148,6 +148,7 @@ struct enc_softc {
 	union ccb		 saved_ccb;
 	struct cdev		*enc_dev;
 	struct cam_periph	*periph;
+	int			 open_count;
 
 	/* Bitmap of pending operations. */
 	uint32_t		 pending_actions;

Modified: head/sys/cam/scsi/scsi_pass.c
==============================================================================
--- head/sys/cam/scsi/scsi_pass.c	Sat Dec  8 02:37:18 2012	(r244013)
+++ head/sys/cam/scsi/scsi_pass.c	Sat Dec  8 04:03:04 2012	(r244014)
@@ -76,6 +76,7 @@ struct pass_softc {
 	pass_flags	 flags;
 	u_int8_t	 pd_type;
 	union ccb	 saved_ccb;
+	int		 open_count;
 	struct devstat	*device_stats;
 	struct cdev	*dev;
 	struct cdev	*alias_dev;
@@ -140,12 +141,43 @@ passinit(void)
 static void
 passdevgonecb(void *arg)
 {
+	struct cam_sim    *sim;
 	struct cam_periph *periph;
+	struct pass_softc *softc;
+	int i;
 
 	periph = (struct cam_periph *)arg;
+	sim = periph->sim;
+	softc = (struct pass_softc *)periph->softc;
+
+	KASSERT(softc->open_count >= 0, ("Negative open count %d",
+		softc->open_count));
+
+	mtx_lock(sim->mtx);
+
+	/*
+	 * When we get this callback, we will get no more close calls from
+	 * devfs.  So if we have any dangling opens, we need to release the
+	 * reference held for that particular context.
+	 */
+	for (i = 0; i < softc->open_count; i++)
+		cam_periph_release_locked(periph);
+
+	softc->open_count = 0;
+
+	/*
+	 * Release the reference held for the device node, it is gone now.
+	 */
+	cam_periph_release_locked(periph);
 
-	xpt_print(periph->path, "%s: devfs entry is gone\n", __func__);
-	cam_periph_release(periph);
+	/*
+	 * We reference the SIM lock directly here, instead of using
+	 * cam_periph_unlock().  The reason is that the final call to
+	 * cam_periph_release_locked() above could result in the periph
+	 * getting freed.  If that is the case, dereferencing the periph
+	 * with a cam_periph_unlock() call would cause a page fault.
+	 */
+	mtx_unlock(sim->mtx);
 }
 
 static void
@@ -368,7 +400,7 @@ passregister(struct cam_periph *periph, 
 	if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
 		xpt_print(periph->path, "%s: lost periph during "
 			  "registration!\n", __func__);
-		mtx_lock(periph->sim->mtx);
+		cam_periph_lock(periph);
 		return (CAM_REQ_CMP_ERR);
 	}
 
@@ -461,6 +493,8 @@ passopen(struct cdev *dev, int flags, in
 		return(EINVAL);
 	}
 
+	softc->open_count++;
+
 	cam_periph_unlock(periph);
 
 	return (error);
@@ -469,13 +503,36 @@ passopen(struct cdev *dev, int flags, in
 static int
 passclose(struct cdev *dev, int flag, int fmt, struct thread *td)
 {
+	struct  cam_sim    *sim;
 	struct 	cam_periph *periph;
+	struct  pass_softc *softc;
 
 	periph = (struct cam_periph *)dev->si_drv1;
 	if (periph == NULL)
 		return (ENXIO);	
 
-	cam_periph_release(periph);
+	sim = periph->sim;
+	softc = periph->softc;
+
+	mtx_lock(sim->mtx);
+
+	softc->open_count--;
+
+	cam_periph_release_locked(periph);
+
+	/*
+	 * We reference the SIM lock directly here, instead of using
+	 * cam_periph_unlock().  The reason is that the call to
+	 * cam_periph_release_locked() above could result in the periph
+	 * getting freed.  If that is the case, dereferencing the periph
+	 * with a cam_periph_unlock() call would cause a page fault.
+	 *
+	 * cam_periph_release() avoids this problem using the same method,
+	 * but we're manually acquiring and dropping the lock here to
+	 * protect the open count and avoid another lock acquisition and
+	 * release.
+	 */
+	mtx_unlock(sim->mtx);
 
 	return (0);
 }

Modified: head/sys/cam/scsi/scsi_sg.c
==============================================================================
--- head/sys/cam/scsi/scsi_sg.c	Sat Dec  8 02:37:18 2012	(r244013)
+++ head/sys/cam/scsi/scsi_sg.c	Sat Dec  8 04:03:04 2012	(r244014)
@@ -99,6 +99,7 @@ struct sg_rdwr {
 struct sg_softc {
 	sg_state		state;
 	sg_flags		flags;
+	int			open_count;
 	struct devstat		*device_stats;
 	TAILQ_HEAD(, sg_rdwr)	rdwr_done;
 	struct cdev		*dev;
@@ -169,6 +170,49 @@ sginit(void)
 }
 
 static void
+sgdevgonecb(void *arg)
+{
+	struct cam_sim    *sim;
+	struct cam_periph *periph;
+	struct sg_softc *softc;
+	int i;
+
+	periph = (struct cam_periph *)arg;
+	sim = periph->sim;
+	softc = (struct sg_softc *)periph->softc;
+
+	KASSERT(softc->open_count >= 0, ("Negative open count %d",
+		softc->open_count));
+
+	mtx_lock(sim->mtx);
+
+	/*
+	 * When we get this callback, we will get no more close calls from
+	 * devfs.  So if we have any dangling opens, we need to release the
+	 * reference held for that particular context.
+	 */
+	for (i = 0; i < softc->open_count; i++)
+		cam_periph_release_locked(periph);
+
+	softc->open_count = 0;
+
+	/*
+	 * Release the reference held for the device node, it is gone now.
+	 */
+	cam_periph_release_locked(periph);
+
+	/*
+	 * We reference the SIM lock directly here, instead of using
+	 * cam_periph_unlock().  The reason is that the final call to
+	 * cam_periph_release_locked() above could result in the periph
+	 * getting freed.  If that is the case, dereferencing the periph
+	 * with a cam_periph_unlock() call would cause a page fault.
+	 */
+	mtx_unlock(sim->mtx);
+}
+
+
+static void
 sgoninvalidate(struct cam_periph *periph)
 {
 	struct sg_softc *softc;
@@ -183,6 +227,12 @@ sgoninvalidate(struct cam_periph *periph
 	softc->flags |= SG_FLAG_INVALID;
 
 	/*
+	 * Tell devfs this device has gone away, and ask for a callback
+	 * when it has cleaned up its state.
+	 */
+	destroy_dev_sched_cb(softc->dev, sgdevgonecb, periph);
+
+	/*
 	 * XXX Return all queued I/O with ENXIO.
 	 * XXX Handle any transactions queued to the card
 	 *     with XPT_ABORT_CCB.
@@ -201,10 +251,9 @@ sgcleanup(struct cam_periph *periph)
 	softc = (struct sg_softc *)periph->softc;
 	if (bootverbose)
 		xpt_print(periph->path, "removing device entry\n");
+
 	devstat_remove_entry(softc->device_stats);
-	cam_periph_unlock(periph);
-	destroy_dev(softc->dev);
-	cam_periph_lock(periph);
+
 	free(softc, M_DEVBUF);
 }
 
@@ -299,6 +348,18 @@ sgregister(struct cam_periph *periph, vo
 			DEVSTAT_TYPE_PASS,
 			DEVSTAT_PRIORITY_PASS);
 
+	/*
+	 * Acquire a reference to the periph before we create the devfs
+	 * instance for it.  We'll release this reference once the devfs
+	 * instance has been freed.
+	 */
+	if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
+		xpt_print(periph->path, "%s: lost periph during "
+			  "registration!\n", __func__);
+		cam_periph_lock(periph);
+		return (CAM_REQ_CMP_ERR);
+	}
+
 	/* Register the device */
 	softc->dev = make_dev(&sg_cdevsw, periph->unit_number,
 			      UID_ROOT, GID_OPERATOR, 0600, "%s%d",
@@ -414,6 +475,8 @@ sgopen(struct cdev *dev, int flags, int 
 		return (ENXIO);
 	}
 
+	softc->open_count++;
+
 	cam_periph_unlock(periph);
 
 	return (error);
@@ -422,13 +485,36 @@ sgopen(struct cdev *dev, int flags, int 
 static int
 sgclose(struct cdev *dev, int flag, int fmt, struct thread *td)
 {
+	struct cam_sim    *sim;
 	struct cam_periph *periph;
+	struct sg_softc   *softc;
 
 	periph = (struct cam_periph *)dev->si_drv1;
 	if (periph == NULL)
 		return (ENXIO);
 
-	cam_periph_release(periph);
+	sim = periph->sim;
+	softc = periph->softc;
+
+	mtx_lock(sim->mtx);
+
+	softc->open_count--;
+
+	cam_periph_release_locked(periph);
+
+	/*
+	 * We reference the SIM lock directly here, instead of using
+	 * cam_periph_unlock().  The reason is that the call to
+	 * cam_periph_release_locked() above could result in the periph
+	 * getting freed.  If that is the case, dereferencing the periph
+	 * with a cam_periph_unlock() call would cause a page fault.
+	 *
+	 * cam_periph_release() avoids this problem using the same method,
+	 * but we're manually acquiring and dropping the lock here to
+	 * protect the open count and avoid another lock acquisition and
+	 * release.
+	 */
+	mtx_unlock(sim->mtx);
 
 	return (0);
 }



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201212080403.qB8434EP057889>