Date: Wed, 13 Jul 2005 11:03:33 GMT From: Robert Watson <rwatson@FreeBSD.org> To: Perforce Change Reviews <perforce@freebsd.org> Subject: PERFORCE change 80064 for review Message-ID: <200507131103.j6DB3Xlh051032@repoman.freebsd.org>
next in thread | raw e-mail | index | archive | help
http://perforce.freebsd.org/chv.cgi?CH=80064 Change 80064 by rwatson@rwatson_paprika on 2005/07/13 11:03:29 A number of annotations for consideration in future cleanup: - Might be good to use UMA for per-thread and per-record storage, rather than malloc. - Active credential and vnode are thread-local variables. - Annotate some error choices. - Minor style tweaks. - Annotate odd, fixable, improper, or otherwise interesting use of error handling. FreeBSD does not permit M_WAITOK to fail for some memory types, and that lack of that failure can clean up quite a bit of logic here. - Annotate possible spots to remove Giant due to MPSAFE VFS. Annotate lack of locking in the netinet interactions. - Comment about calculation of free space. - Summary comments for audit_worker(), audit_init(), audit_rotate_vnode(), currecord(), audit_syscall_enter(), audit_syscall_exit(), as well as potential issues with them. - Comments about places to possibly add assertions. Affected files ... .. //depot/projects/trustedbsd/audit3/sys/security/audit/kern_audit.c#26 edit Differences ... ==== //depot/projects/trustedbsd/audit3/sys/security/audit/kern_audit.c#26 (text+ko) ==== @@ -55,6 +55,10 @@ #include <bsm/audit_kernel.h> #include <security/audit/audit_klib.h> +/* + * XXXAUDIT: Might be nice to use a UMA zone for records, and malloc(9) only + * for things like triggers, temporary storage, etc. + */ MALLOC_DEFINE(M_AUDIT, "audit", "Audit event records"); #ifdef AUDIT @@ -134,6 +138,8 @@ * replacement. When a replacement is completed, this cv is signalled * by the worker thread so a waiting thread can start another replacement. * We also store a credential to perform audit log write operations with. + * + * The current credential and vnode are thread-local to audit_worker. */ static struct cv audit_replacement_cv; @@ -211,6 +217,9 @@ error = 0; audit_isopen = 1; } else { + /* + * XXXRW: Why not EBUSY? + */ error = EOPNOTSUPP; } mtx_unlock(&audit_mtx); @@ -263,10 +272,15 @@ static int audit_write(struct cdev *dev, struct uio *uio, int ioflag) { + /* Communication is kernel->userspace only */ return EOPNOTSUPP; } +/* + * XXXAUDIT: Since we can't fail this call, we should not have a return + * value. + */ static int send_trigger(unsigned int trigger) { @@ -276,6 +290,9 @@ if (!audit_isopen) return 0; + /* + * XXXAUDIT: Use a condition variable instead of msleep/wakeup? + */ ti = malloc(sizeof *ti, M_AUDIT, M_WAITOK); mtx_lock(&audit_mtx); ti->trigger = trigger; @@ -294,9 +311,13 @@ .d_name = "audit" }; +/* + * XXXAUDIT: For consistency, perhaps audit_record_free()? + */ static void audit_free(struct kaudit_record *ar) { + if (ar->k_ar.ar_arg_upath1 != NULL) { free(ar->k_ar.ar_arg_upath1, M_AUDIT); } @@ -318,6 +339,13 @@ free(ar, M_AUDIT); } +/* + * XXXAUDIT: Should adjust comments below to make it clear that we get to + * this point only if we believe we have storage, so not having space here + * is a violation of invariants derived from administrative procedures. + * I.e., someone else has written to the audit partition, leaving less space + * than we accounted for. + */ static int audit_record_write(struct vnode *vp, struct kaudit_record *ar, struct ucred *cred, struct thread *td) @@ -328,6 +356,9 @@ struct vattr vattr; struct statfs *mnt_stat = &vp->v_mount->mnt_stat; + /* + * XXXAUDIT: In the world of MPSAFE VFS, this may not be necessary. + */ mtx_assert(&Giant, MA_OWNED); /* @@ -385,6 +416,8 @@ /* * Send a message to the audit daemon that disk space * is getting low. + * + * XXXAUDIT: Check math and block size calculation here. */ if (audit_qctrl.aq_minfree != 0) { temp = mnt_stat->f_blocks / (100 / @@ -463,6 +496,10 @@ goto out; } + /* + * XXXAUDIT: Should we actually allow this conversion to fail? With + * sleeping memory allocation and invariants checks, perhaps not. + */ ret = kaudit_to_bsm(ar, &bsm); if (ret == BSM_NOAUDIT) { ret = 0; @@ -486,6 +523,8 @@ * away from the BSM record generation and have the BSM generation * done before this function is called. This function will then * take the BSM record as a parameter. + * + * XXXRW: In the new world order, this is no longer true. */ ret = (vn_rdwr(UIO_WRITE, vp, (void *)bsm->data, bsm->len, (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, cred, NULL, NULL, td)); @@ -510,6 +549,16 @@ return (ret); } +/* + * The audit_worker thread is responsible for watching the event queue, + * dequeueing records, converting them to BSM format, and committing them to + * disk. In order to minimize lock thrashing, records are dequeued in sets + * to a thread-local work queue. In addition, the audit_work performs the + * actual exchange of audit log vnode pointer, as audit_vp is a thread-local + * variable. + * + * XXXAUDIT: Giant is now less required here. + */ static void audit_worker(void *arg) { @@ -649,6 +698,8 @@ * with the audit_mtx held, to avoid a lock order reversal * as free() may grab Giant. This should be fixed at * some point. + * + * XXXAUDIT: free() no longer grabs Giant. */ while ((ar = TAILQ_FIRST(&audit_q))) { TAILQ_REMOVE(&audit_q, ar, k_q); @@ -691,6 +742,11 @@ } } +/* + * Initialize the Audit subsystem: configuration state, work queue, + * synchronization primitives, worker thread, and trigger device node. Also + * call into the BSM assembly code to initialize it. + */ void audit_init(void) { @@ -735,6 +791,26 @@ SYSINIT(audit_init, SI_SUB_AUDIT, SI_ORDER_FIRST, audit_init, NULL) +/* + * audit_rotate_vnode() is called by a user or kernel thread to configure or + * de-configure auditing on a vnode. The arguments are the replacement + * credential and vnode to substitute for the current credential and vnode, + * if any. If either is set to NULL, both should be NULL, and this is used + * to indicate that audit is being disabled. The real work is done in the + * audit_worker thread, but audit_rotate_vnode() waits synchronously for that + * to complete. + * + * The vnode should be referenced and opened by the caller. The credential + * should be referenced. audit_rotate_vnode() will own both references as of + * this call, so the caller should not release either. + * + * XXXAUDIT: Review synchronize communication logic. Really, this is a + * message queue of depth 1. + * + * XXXAUDIT: Enhance the comments below to indicate that we are basically + * acquiring ownership of the communications queue, inserting our message, + * and waiting for an acknowledgement. + */ static void audit_rotate_vnode(struct ucred *cred, struct vnode *vp) { @@ -775,6 +851,8 @@ /* XXX Need to figure out how the kernel->userspace file full * signalling will take place. + * + * XXXAUDIT: This comment may now be obsolete. */ audit_file_rotate_wait = 0; /* We can now request another rotation */ } @@ -785,12 +863,17 @@ void audit_shutdown(void) { + audit_rotate_vnode(NULL, NULL); } +/* + * Return the current thread's audit record, if any. + */ static __inline__ struct kaudit_record * currecord(void) { + return (curthread->td_ar); } @@ -833,9 +916,12 @@ /* This is not very efficient; we're required to allocate * a complete kernel audit record just so the user record * can tag along. + * + * XXXAUDIT: Maybe AUE_AUDIT in the system call context and + * special pre-select handling? */ td->td_ar = audit_new(AUE_NULL, td); - if (td->td_ar == NULL) /*auditing not on, or memory error */ + if (td->td_ar == NULL) return (ENOTSUP); ar = td->td_ar; } @@ -858,6 +944,9 @@ /* Attach the user audit record to the kernel audit record. Because * this system call is an auditable event, we will write the user * record along with the record for this audit event. + * + * XXXAUDIT: KASSERT appropriate starting values of k_udata, k_ulen, + * k_ar_commit & AR_COMMIT_USER? */ ar->k_udata = rec; ar->k_ulen = uap->length; @@ -921,6 +1010,8 @@ /* XXX Need to implement these commands by accessing the global * values associated with the commands. + * + * XXXAUDIT: Locking? */ switch (uap->cmd) { case A_GETPOLICY: @@ -1004,6 +1095,8 @@ case A_GETPINFO: if (udata.au_aupinfo.ap_pid < 1) return (EINVAL); + + /* XXXAUDIT: p_cansee()? */ if ((tp = pfind(udata.au_aupinfo.ap_pid)) == NULL) return (EINVAL); @@ -1022,6 +1115,8 @@ case A_SETPMASK: if (udata.au_aupinfo.ap_pid < 1) return (EINVAL); + + /* XXXAUDIT: p_cansee()? */ if ((tp = pfind(udata.au_aupinfo.ap_pid)) == NULL) return (EINVAL); @@ -1129,6 +1224,10 @@ /* * XXX: * Integer write on static pointer dereference: doesn't need locking? + * + * XXXAUDIT: Might need locking to serialize audit events in the same + * order as change events? Or maybe that's an under-solveable + * problem. */ PROC_LOCK(td->td_proc); td->td_proc->p_au->ai_auid = id; @@ -1237,10 +1336,15 @@ * If a path is specified, open the replacement vnode, perform * validity checks, and grab another reference to the current * credential. + * + * XXXAUDIT: On Darwin, a NULL path is used to disable audit. */ if (uap->path == NULL) return (EINVAL); + /* + * XXXAUDIT: Giant may no longer be required here. + */ mtx_lock(&Giant); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->path, td); flags = audit_open_flags; @@ -1259,6 +1363,11 @@ } cred = td->td_ucred; crhold(cred); + + /* + * XXXAUDIT: Should audit_suspended actually be cleared by + * audit_worker? + */ audit_suspended = 0; mtx_unlock(&Giant); @@ -1274,6 +1383,13 @@ /* * MPSAFE + * + * XXXAUDIT: There are a number of races present in the code below due to + * release and re-grab of the mutex. The code should be revised to become + * slightly less racy. + * + * XXXAUDIT: Shouldn't there be logic here to sleep waiting on available + * pre_q space, suspending the system call until there is room? */ struct kaudit_record * audit_new(int event, struct thread *td) @@ -1306,7 +1422,7 @@ * XXX: We may want to fail-stop if allocation fails. * XXX: The number of outstanding uncommitted audit records is * limited by the number of concurrent threads servicing system - * calls in the kernel. + * calls in the kernel. */ ar = malloc(sizeof(*ar), M_AUDIT, M_WAITOK); @@ -1322,7 +1438,12 @@ ar->k_ar.ar_event = event; nanotime(&ar->k_ar.ar_starttime); - /* Export the subject credential. */ + /* + * Export the subject credential. + * + * XXXAUDIT: td_ucred access is OK without proc lock, but some other + * fields here may require the proc lock. + */ cru2x(td->td_ucred, &ar->k_ar.ar_subj_cred); ar->k_ar.ar_subj_ruid = td->td_ucred->cr_ruid; ar->k_ar.ar_subj_rgid = td->td_ucred->cr_rgid; @@ -1332,6 +1453,7 @@ ar->k_ar.ar_subj_pid = td->td_proc->p_pid; ar->k_ar.ar_subj_amask = td->td_proc->p_au->ai_mask; ar->k_ar.ar_subj_term = td->td_proc->p_au->ai_termid; + bcopy(td->td_proc->p_comm, ar->k_ar.ar_subj_comm, MAXCOMLEN); return (ar); @@ -1366,7 +1488,9 @@ /* * Decide whether to commit the audit record by checking the * error value from the system call and using the appropriate - * audit mask. + * audit mask. + * + * XXXAUDIT: Synchronize access to audit_nae_mask? */ if (ar->k_ar.ar_subj_auid == AU_DEFAUDITID) aumask = &audit_nae_mask; @@ -1457,8 +1581,10 @@ } /* - * Calls to set up and tear down audit structures associated with - * each system call. + * audit_syscall_enter() is called on entry to eatch system call. It is + * responsible for deciding whether or not to audit the call (preselection), + * and if so, allocating a per-thread audit record. audit_new() will fill in + * basic thread/credential properties. */ void audit_syscall_enter(unsigned short code, struct thread *td) @@ -1466,12 +1592,17 @@ int audit_event; struct au_mask *aumask; + KASSERT(td->td_ar == NULL, ("audit_syscall_enter: td->td_ar != NULL")); + /* * In FreeBSD, each ABI has its own system call table, and hence * mapping of system call codes to audit events. Convert the code to * an audit event identifier using the process system call table * reference. In Darwin, there's only one, so we use the global * symbol for the system call table. + * + * XXXAUDIT: Should we audit that a bad system call was made, and if + * so, how? */ if (code >= td->td_proc->p_sysent->sv_size) return; @@ -1480,9 +1611,8 @@ if (audit_event == AUE_NULL) return; - KASSERT(td->td_ar == NULL, ("audit_syscall_enter: td->td_ar != NULL")); - - /* Check which audit mask to use; either the kernel non-attributable + /* + * Check which audit mask to use; either the kernel non-attributable * event mask or the process audit mask. */ if (td->td_proc->p_au->ai_auid == AU_DEFAUDITID) @@ -1500,6 +1630,13 @@ * If we're out of space and need to suspend unprivileged * processes, do that here rather than trying to allocate * another audit record. + * + * XXXRW: We might wish to be able to continue here in the + * future, if the system recovers. That should be possible + * by means of checking the condition in a loop around + * cv_wait(). It might be desirable to reevaluate whether an + * audit record is still required for this event by + * re-calling au_preselect(). */ if (audit_in_failure && suser(td) != 0) { cv_wait(&audit_fail_cv, &audit_mtx); @@ -1510,6 +1647,11 @@ td->td_ar = NULL; } +/* + * audit_syscall_exit() is called from the return of every system call, or in + * the event of exit1(), during the execution of exit1(). It is responsible + * for committing the audit record, if any, along with return condition. + */ void audit_syscall_exit(int error, struct thread *td) { @@ -1543,6 +1685,9 @@ * check the thread audit record pointer anyway, as the audit condition * could change, and pre-selection may not have allocated an audit * record for this event. + * + * XXXAUDIT: Should we assert, in each case, that this field of the record + * hasn't already been filled in? */ void audit_arg_addr(void * addr) @@ -1768,7 +1913,9 @@ if ((ar == NULL) || (p == NULL)) return; - /* XXX May need to lock the credentials structures */ + /* + * XXXAUDIT: PROC_LOCK_ASSERT(p); + */ ar->k_ar.ar_arg_auid = p->p_au->ai_auid; ar->k_ar.ar_arg_euid = p->p_ucred->cr_uid; ar->k_ar.ar_arg_egid = p->p_ucred->cr_groups[0]; @@ -1811,6 +1958,10 @@ ar->k_ar.ar_valid_arg |= ARG_SOCKINFO; } +/* + * XXXAUDIT: Argument here should be 'sa' not 'so'. Caller is responsible + * for synchronizing access to the source of the address. + */ void audit_arg_sockaddr(struct thread *td, struct sockaddr *so) { @@ -1820,9 +1971,6 @@ if (ar == NULL || td == NULL || so == NULL) return; - /* - * XXX: Do we need to lock the socket? - */ bcopy(so, &ar->k_ar.ar_arg_sockaddr, sizeof(ar->k_ar.ar_arg_sockaddr)); switch (so->sa_family) { case AF_INET: @@ -1836,6 +1984,7 @@ ARG_UPATH1); ar->k_ar.ar_valid_arg |= ARG_SADDRUNIX; break; + /* XXXAUDIT: default:? */ } } @@ -1997,6 +2146,9 @@ struct socket *so; struct inpcb *pcb; + /* + * XXXAUDIT: Why is the (ar == NULL) test only in the socket case? + */ if (fp->f_type == DTYPE_VNODE) { audit_arg_vnpath((struct vnode *)fp->f_data, ARG_VNODE1); return; @@ -2006,6 +2158,10 @@ ar = currecord(); if (ar == NULL) return; + + /* + * XXXAUDIT: Socket locking? Inpcb locking? + */ so = (struct socket *)fp->f_data; if (INP_CHECK_SOCKAF(so, PF_INET)) { if (so->so_pcb == NULL) @@ -2027,6 +2183,7 @@ pcb->inp_lport; ar->k_ar.ar_valid_arg |= ARG_SOCKINFO; } + /* XXXAUDIT: else? */ } } @@ -2041,6 +2198,7 @@ KASSERT(p->p_au == NULL, ("audit_proc_alloc: p->p_au != NULL (%d)", p->p_pid)); p->p_au = malloc(sizeof(*(p->p_au)), M_AUDIT, M_WAITOK); + /* XXXAUDIT: Zero? Slab allocate? */ //printf("audit_proc_alloc: pid %d p_au %p\n", p->p_pid, p->p_au); } @@ -2089,6 +2247,10 @@ //printf("audit_proc_fork: child pid %d p_au %p\n", child->p_pid, // child->p_au); bcopy(parent->p_au, child->p_au, sizeof(*child->p_au)); + /* + * XXXAUDIT: Zero pointers to external memory, or assert they are + * zero? + */ } /* @@ -2100,6 +2262,9 @@ KASSERT(p->p_au != NULL, ("p->p_au == NULL (%d)", p->p_pid)); //printf("audit_proc_free: pid %d p_au %p\n", p->p_pid, p->p_au); + /* + * XXXAUDIT: Assert that external memory pointers are NULL? + */ free(p->p_au, M_AUDIT); p->p_au = NULL; } @@ -2109,6 +2274,8 @@ * record stored on the user thread. This function will allocate the memory to * store the path info if not already available. This memory will be * freed when the audit record is freed. + * + * XXXAUDIT: Possibly assert that the memory isn't already allocated? */ void audit_arg_upath(struct thread *td, char *upath, u_int64_t flags) @@ -2119,18 +2286,24 @@ if (td == NULL || upath == NULL) return; /* nothing to do! */ + /* + * XXXAUDIT: Witness warning for possible sleep here? + */ + + /* + * XXXAUDIT: KASSERT argument validity? + */ if ((flags & (ARG_UPATH1 | ARG_UPATH2)) == 0) return; ar = currecord(); - if (ar == NULL) /* This will be the case for unaudited system calls */ + if (ar == NULL) return; if (flags & ARG_UPATH1) { ar->k_ar.ar_valid_arg &= (ARG_ALL ^ ARG_UPATH1); pathp = &ar->k_ar.ar_arg_upath1; - } - else { + } else { ar->k_ar.ar_valid_arg &= (ARG_ALL ^ ARG_UPATH2); pathp = &ar->k_ar.ar_arg_upath2; } @@ -2158,6 +2331,8 @@ * XXX: We should accept the process argument from the caller, since * it's very likely they already have a reference. * XXX: Error handling in this function is poor. + * + * XXXAUDIT: Possibly KASSERT the path pointer is NULL? */ void audit_arg_vnpath(struct vnode *vp, u_int64_t flags) @@ -2169,9 +2344,15 @@ struct vnode_au_info *vnp; struct thread *td; + /* + * XXXAUDIT: Why is vp possibly NULL here? + */ if (vp == NULL) return; + /* + * XXXAUDIT: Less Giant needed here. + */ mtx_assert(&Giant, MA_OWNED); ASSERT_VOP_LOCKED(vp, "audit_arg_vnpath") @@ -2179,6 +2360,9 @@ if (ar == NULL) /* This will be the case for unaudited system calls */ return; + /* + * XXXAUDIT: KASSERT argument validity instead? + */ if ((flags & (ARG_VNODE1 | ARG_VNODE2)) == 0) return; @@ -2189,8 +2373,7 @@ ar->k_ar.ar_valid_arg &= (ARG_ALL ^ ARG_VNODE1); pathp = &ar->k_ar.ar_arg_kpath1; vnp = &ar->k_ar.ar_arg_vnode1; - } - else { + } else { ar->k_ar.ar_valid_arg &= (ARG_ALL ^ ARG_KPATH2); ar->k_ar.ar_valid_arg &= (ARG_ALL ^ ARG_VNODE2); pathp = &ar->k_ar.ar_arg_kpath2;
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200507131103.j6DB3Xlh051032>