Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 6 Jul 2015 07:51:35 +0200
From:      Mateusz Guzik <mjguzik@gmail.com>
To:        Konstantin Belousov <kostikbel@gmail.com>
Cc:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   Re: svn commit: r267760 - head/sys/kern
Message-ID:  <20150706055135.GA28898@dft-labs.eu>
In-Reply-To: <20140717005638.GF93733@kib.kiev.ua>
References:  <20140623080501.GB27040@dft-labs.eu> <20140623081823.GG93733@kib.kiev.ua> <20140623131653.GC27040@dft-labs.eu> <20140623163523.GK93733@kib.kiev.ua> <20140711024351.GA18214@dft-labs.eu> <20140711095551.GA93733@kib.kiev.ua> <20140711111925.GB18214@dft-labs.eu> <20140713132652.GZ93733@kib.kiev.ua> <20140713213623.GA13241@dft-labs.eu> <20140717005638.GF93733@kib.kiev.ua>

next in thread | previous in thread | raw e-mail | index | archive | help
On Thu, Jul 17, 2014 at 03:56:38AM +0300, Konstantin Belousov wrote:
> On Sun, Jul 13, 2014 at 11:36:24PM +0200, Mateusz Guzik wrote:
> > There is an additional problem with slocked case: witness report a lor
> > with devfs vnodes.
> > 
> > lock order reversal:
> >  1st 0xfffff80006fe6ab0 process imagelock (process imagelock) @ /usr/src/sys/kern/kern_proc.c:287
> >  2nd 0xfffff80018c88240 devfs (devfs) @ /usr/src/sys/kern/vfs_cache.c:1241
> > KDB: stack backtrace:
> > db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame 0xfffffe012324f120
> > kdb_backtrace() at kdb_backtrace+0x39/frame 0xfffffe012324f1d0
> > witness_checkorder() at witness_checkorder+0xdc2/frame 0xfffffe012324f260
> > __lockmgr_args() at __lockmgr_args+0x588/frame 0xfffffe012324f3a0
> > vop_stdlock() at vop_stdlock+0x3c/frame 0xfffffe012324f3c0
> > VOP_LOCK1_APV() at VOP_LOCK1_APV+0xfc/frame 0xfffffe012324f3f0
> > _vn_lock() at _vn_lock+0xaa/frame 0xfffffe012324f460
> > vn_vptocnp_locked() at vn_vptocnp_locked+0xe8/frame 0xfffffe012324f4d0
> > vn_fullpath1() at vn_fullpath1+0xb0/frame 0xfffffe012324f530
> > vn_fullpath() at vn_fullpath+0xc1/frame 0xfffffe012324f580
> > export_fd_to_sb() at export_fd_to_sb+0x489/frame 0xfffffe012324f7b0
> > kern_proc_filedesc_out() at kern_proc_filedesc_out+0x1c6/frame 0xfffffe012324f840
> > sysctl_kern_proc_filedesc() at sysctl_kern_proc_filedesc+0x84/frame 0xfffffe012324f900
> > sysctl_root_handler_locked() at sysctl_root_handler_locked+0x68/frame 0xfffffe012324f940
> > sysctl_root() at sysctl_root+0x18e/frame 0xfffffe012324f990
> > userland_sysctl() at userland_sysctl+0x192/frame 0xfffffe012324fa30
> > 
> > witness detected the following lock orders:
> > devfs -> proctree
> Where is the dependency catched comes from ?  I suspect it might be tty.
> 
> I consider this case as an advantage of using sx over the hand-rolled lock,
> since the issue is/must be present with the counter as well, or the LOR
> is false positive, possibly due to sx taken in shared mode.  But debugging
> features of sx give the warning, while counter is silent.
> 
> That said, if the issue above is analyzed, I do not have any preference
> and will not object strongly against you decision.
> 

[snip]

This patch is about providing a way to block execs and exits of
processes so that it is safe to inspect their state. For instance it
deals with a case where the target process executes a setuid binary,
where the process inspecting it could have its permission checked prior
to exec but is reading data after setuid completed.

LOR mentioned above indeed comes form tty handling and is handled by
relocking the vnode.

Patch below is the variant with sx lock.

diff --git a/sys/fs/devfs/devfs_vnops.c b/sys/fs/devfs/devfs_vnops.c
index fe8e9ef..31f43a8 100644
--- a/sys/fs/devfs/devfs_vnops.c
+++ b/sys/fs/devfs/devfs_vnops.c
@@ -574,6 +574,8 @@ devfs_close(struct vop_close_args *ap)
 	if (vp->v_data == NULL)
 		return (0);
 
+	vp_locked = VOP_ISLOCKED(vp);
+
 	/*
 	 * Hack: a tty device that is a controlling terminal
 	 * has a reference from the session structure.
@@ -589,6 +591,7 @@ devfs_close(struct vop_close_args *ap)
 		if (vp == p->p_session->s_ttyvp) {
 			PROC_UNLOCK(p);
 			oldvp = NULL;
+			VOP_UNLOCK(vp, 0);
 			sx_xlock(&proctree_lock);
 			if (vp == p->p_session->s_ttyvp) {
 				SESS_LOCK(p->p_session);
@@ -603,6 +606,7 @@ devfs_close(struct vop_close_args *ap)
 				SESS_UNLOCK(p->p_session);
 			}
 			sx_xunlock(&proctree_lock);
+			vn_lock(vp, vp_locked | LK_RETRY);
 			if (oldvp != NULL)
 				vrele(oldvp);
 		} else
@@ -632,7 +636,6 @@ devfs_close(struct vop_close_args *ap)
 	}
 	vholdl(vp);
 	VI_UNLOCK(vp);
-	vp_locked = VOP_ISLOCKED(vp);
 	VOP_UNLOCK(vp, 0);
 	KASSERT(dev->si_refcount > 0,
 	    ("devfs_close() on un-referenced struct cdev *(%s)", devtoname(dev)));
diff --git a/sys/fs/nfsclient/nfs_clport.c b/sys/fs/nfsclient/nfs_clport.c
index d3bac30..118a7d9 100644
--- a/sys/fs/nfsclient/nfs_clport.c
+++ b/sys/fs/nfsclient/nfs_clport.c
@@ -1188,7 +1188,7 @@ nfscl_procdoesntexist(u_int8_t *own)
 	tl.cval[2] = *own++;
 	tl.cval[3] = *own++;
 	pid = tl.lval;
-	p = pfind_locked(pid);
+	p = pfind_locked(pid, 0);
 	if (p == NULL)
 		return (1);
 	if (p->p_stats == NULL) {
diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c
index 0675128..2ad09aa 100644
--- a/sys/kern/imgact_elf.c
+++ b/sys/kern/imgact_elf.c
@@ -1887,6 +1887,7 @@ note_procstat_files(void *arg, struct sbuf *sb, size_t *sizep)
 		sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN);
 		sbuf_set_drain(sb, sbuf_drain_count, &size);
 		sbuf_bcat(sb, &structsize, sizeof(structsize));
+		sx_slock(&p->p_imagelock);
 		PROC_LOCK(p);
 		kern_proc_filedesc_out(p, sb, -1);
 		sbuf_finish(sb);
@@ -1895,6 +1896,7 @@ note_procstat_files(void *arg, struct sbuf *sb, size_t *sizep)
 	} else {
 		structsize = sizeof(struct kinfo_file);
 		sbuf_bcat(sb, &structsize, sizeof(structsize));
+		sx_slock(&p->p_imagelock);
 		PROC_LOCK(p);
 		kern_proc_filedesc_out(p, sb, -1);
 	}
diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c
index 37539c4..9364dcb 100644
--- a/sys/kern/init_main.c
+++ b/sys/kern/init_main.c
@@ -480,6 +480,7 @@ proc0_init(void *dummy __unused)
 	p->p_flag2 = 0;
 	p->p_state = PRS_NORMAL;
 	knlist_init_mtx(&p->p_klist, &p->p_mtx);
+	sx_init_flags(&p->p_imagelock, "process imagelock", SX_NEW);
 	STAILQ_INIT(&p->p_ktr);
 	p->p_nice = NZERO;
 	/* pid_max cannot be greater than PID_MAX */
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index 85cda01..bbdcbf5 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -3271,6 +3271,7 @@ kern_proc_filedesc_out(struct proc *p,  struct sbuf *sb, ssize_t maxlen)
 	FILEDESC_SUNLOCK(fdp);
 	fddrop(fdp);
 fail:
+	sx_sunlock(&p->p_imagelock);
 	free(efbuf, M_TEMP);
 	return (error);
 }
@@ -3292,7 +3293,7 @@ sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
 
 	sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req);
 	sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
-	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
+	error = pget((pid_t)name[0], PGET_LOCK, &p);
 	if (error != 0) {
 		sbuf_delete(&sb);
 		return (error);
@@ -3359,7 +3360,7 @@ sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
 	struct proc *p;
 
 	name = (int *)arg1;
-	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
+	error = pget((pid_t)name[0], PGET_LOCK, &p);
 	if (error != 0)
 		return (error);
 	fdp = fdhold(p);
@@ -3391,6 +3392,7 @@ sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
 	}
 	FILEDESC_SUNLOCK(fdp);
 	fddrop(fdp);
+	sx_sunlock(&p->p_imagelock);
 	free(kif, M_TEMP);
 	free(okif, M_TEMP);
 	return (0);
diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c
index 859b2e3..0d9aac1 100644
--- a/sys/kern/kern_exec.c
+++ b/sys/kern/kern_exec.c
@@ -381,6 +381,7 @@ do_execve(td, args, mac_p)
 	 * that might allow a local user to illicitly obtain elevated
 	 * privileges.
 	 */
+	sx_xlock(&p->p_imagelock);
 	PROC_LOCK(p);
 	KASSERT((p->p_flag & P_INEXEC) == 0,
 	    ("%s(): process already has P_INEXEC flag", __func__));
@@ -907,6 +908,7 @@ exec_fail:
 	SDT_PROBE(proc, kernel, , exec__failure, error, 0, 0, 0, 0);
 
 done2:
+	sx_xunlock(&p->p_imagelock);
 #ifdef MAC
 	mac_execve_exit(imgp);
 	mac_execve_interpreter_exit(interpvplabel);
diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
index 9ce6d34..27fd764 100644
--- a/sys/kern/kern_exit.c
+++ b/sys/kern/kern_exit.c
@@ -213,6 +213,7 @@ exit1(struct thread *td, int rv)
 	/*
 	 * MUST abort all other threads before proceeding past here.
 	 */
+	sx_xlock(&p->p_imagelock);
 	PROC_LOCK(p);
 	/*
 	 * First check if some other thread or external request got
@@ -279,6 +280,7 @@ exit1(struct thread *td, int rv)
 	 * decided to wait again after we told them we are exiting.
 	 */
 	p->p_flag |= P_WEXIT;
+	sx_xunlock(&p->p_imagelock);
 	wakeup(&p->p_stype);
 
 	/*
diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c
index 27c6f40..ac6b441 100644
--- a/sys/kern/kern_proc.c
+++ b/sys/kern/kern_proc.c
@@ -230,6 +230,7 @@ proc_init(void *mem, int size, int flags)
 	mtx_init(&p->p_statmtx, "pstatl", NULL, MTX_SPIN | MTX_NEW);
 	mtx_init(&p->p_itimmtx, "pitiml", NULL, MTX_SPIN | MTX_NEW);
 	mtx_init(&p->p_profmtx, "pprofl", NULL, MTX_SPIN | MTX_NEW);
+	sx_init_flags(&p->p_imagelock, "process imagelock", SX_NEW);
 	cv_init(&p->p_pwait, "ppwait");
 	cv_init(&p->p_dbgwait, "dbgwait");
 	TAILQ_INIT(&p->p_threads);	     /* all threads in proc */
@@ -278,16 +279,20 @@ inferior(struct proc *p)
 }
 
 struct proc *
-pfind_locked(pid_t pid)
+pfind_locked(pid_t pid, int lockimage)
 {
 	struct proc *p;
 
 	sx_assert(&allproc_lock, SX_LOCKED);
 	LIST_FOREACH(p, PIDHASH(pid), p_hash) {
 		if (p->p_pid == pid) {
+			if (lockimage)
+				sx_slock(&p->p_imagelock);
 			PROC_LOCK(p);
 			if (p->p_state == PRS_NEW) {
 				PROC_UNLOCK(p);
+				if (lockimage)
+					sx_sunlock(&p->p_imagelock);
 				p = NULL;
 			}
 			break;
@@ -308,7 +313,7 @@ pfind(pid_t pid)
 	struct proc *p;
 
 	sx_slock(&allproc_lock);
-	p = pfind_locked(pid);
+	p = pfind_locked(pid, 0);
 	sx_sunlock(&allproc_lock);
 	return (p);
 }
@@ -364,11 +369,17 @@ int
 pget(pid_t pid, int flags, struct proc **pp)
 {
 	struct proc *p;
-	int error;
+	int error, lockimage;
+
+	lockimage = ((flags & PGET_IMAGELOCK) != 0);
+	if (lockimage) {
+		MPASS((flags & PGET_NOTWEXIT) != 0);
+		MPASS((flags & PGET_HOLD) == 0);
+	}
 
 	sx_slock(&allproc_lock);
 	if (pid <= PID_MAX) {
-		p = pfind_locked(pid);
+		p = pfind_locked(pid, lockimage);
 		if (p == NULL && (flags & PGET_NOTWEXIT) == 0)
 			p = zpfind_locked(pid);
 	} else if ((flags & PGET_NOTID) == 0) {
@@ -413,6 +424,8 @@ pget(pid_t pid, int flags, struct proc **pp)
 	return (0);
 errout:
 	PROC_UNLOCK(p);
+	if (lockimage)
+		sx_sunlock(&p->p_imagelock);
 	return (error);
 }
 
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index e6c83b4..5df817a 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -522,6 +522,7 @@ struct proc {
 					       (if I am reaper). */
 	LIST_ENTRY(proc) p_reapsibling;	/* (e) List of siblings - descendants of
 					       the same reaper. */
+	struct sx	p_imagelock;	/* Lock blocking exec/exit. */
 	struct mtx	p_mtx;		/* (n) Lock for this struct. */
 	struct mtx	p_statmtx;	/* Lock for the stats */
 	struct mtx	p_itimmtx;	/* Lock for the virt/prof timers */
@@ -886,7 +887,7 @@ extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */
 extern struct uma_zone *proc_zone;
 
 struct	proc *pfind(pid_t);		/* Find process by id. */
-struct	proc *pfind_locked(pid_t pid);
+struct	proc *pfind_locked(pid_t pid, int lockimage);
 struct	pgrp *pgfind(pid_t);		/* Find process group by id. */
 struct	proc *zpfind(pid_t);		/* Find zombie process by id. */
 
@@ -900,8 +901,10 @@ struct	proc *zpfind(pid_t);		/* Find zombie process by id. */
 #define	PGET_NOTWEXIT	0x00010	/* Check that the process is not in P_WEXIT. */
 #define	PGET_NOTINEXEC	0x00020	/* Check that the process is not in P_INEXEC. */
 #define	PGET_NOTID	0x00040	/* Do not assume tid if pid > PID_MAX. */
+#define	PGET_IMAGELOCK	0x00080 /* Prevent execs and exits. */
 
 #define	PGET_WANTREAD	(PGET_HOLD | PGET_CANDEBUG | PGET_NOTWEXIT)
+#define	PGET_LOCK	(PGET_CANDEBUG | PGET_NOTWEXIT | PGET_IMAGELOCK)
 
 int	pget(pid_t pid, int flags, struct proc **pp);

-- 
Mateusz Guzik <mjguzik gmail.com>



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20150706055135.GA28898>