Date: Mon, 21 Aug 2006 07:18:35 GMT From: Chris Jones <cdjones@FreeBSD.org> To: Perforce Change Reviews <perforce@FreeBSD.org> Subject: PERFORCE change 104658 for review Message-ID: <200608210718.k7L7IZoK036654@repoman.freebsd.org>
next in thread | raw e-mail | index | archive | help
http://perforce.freebsd.org/chv.cgi?CH=104658 Change 104658 by cdjones@cdjones-impulse on 2006/08/21 07:18:05 Introduce security.jail.limit_jail_memory and security.jail.jail_pager_interval sysctls. Bring jpager_td back into the build, running iff limit_jail_memory sysctl set. Get rid of old scheduler td cruft. Add jail_set_resource_limits syscall. Affected files ... .. //depot/projects/soc2006/cdjones_jail/src/sys/kern/kern_jail.c#23 edit Differences ... ==== //depot/projects/soc2006/cdjones_jail/src/sys/kern/kern_jail.c#23 (text+ko) ==== @@ -5,6 +5,35 @@ * can do whatever you want with this stuff. If we meet some day, and you think * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp * ---------------------------------------------------------------------------- + * + * Portions copyright (c) 2006 Chris Jones + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Chris Jones + * thanks to the support of Google's Summer of Code program and + * mentoring by Kip Macy. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * */ #include <sys/cdefs.h> @@ -78,6 +107,17 @@ &jail_chflags_allowed, 0, "Processes in jail can alter system file flags"); +int jail_limit_memory = 0; +SYSCTL_INT(_security_jail, OID_AUTO, limit_jail_memory, CTLFLAG_RW, + &jail_limit_memory, 0, + "Limit jails' memory usage"); + +int jail_memory_pager_interval = 5; +SYSCTL_INT(_security_jail, OID_AUTO, jail_pager_interval, + CTLTYPE_INT | CTLFLAG_RW, + &jail_memory_pager_interval, 0, + "Interval between jail memory limit checks"); + /* allprison, lastprid, and prisoncount are protected by allprison_mtx. 
*/ struct prisonlist allprison; struct mtx allprison_mtx; @@ -99,111 +139,104 @@ SYSINIT(prison, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_prison, NULL); -#if 0 static void jpager_td(void *arg) { - struct proc *p; - struct prison *pr = arg; - struct thread *td; - long limit, cursize, newsize, usage; - int breakout; - int pr_id = pr->pr_id; - int flags = J_SCHED_TD_ACTIVE; - pr->pr_scheduler_flags_ptr = &flags; - - printf("Starting jpager/%d with memory limit %ld bytes\n", - pr_id, (long) prison_memory_limit(pr)); - - for (;;) { - if (flags & J_PAGER_TD_DIE) - break; + struct proc *p; + struct prison *pr = arg; + struct thread *td; + long limit, cursize, newsize, usage; + int breakout; + int pr_id = pr->pr_id; + int flags = J_PAGER_TD_ACTIVE; + pr->pr_pager_flags_ptr = &flags; + + for (;;) { + if (flags & J_PAGER_TD_DIE) + break; + + if (jail_limit_memory && pr->pr_mem_limit) { + /* TODO: consider whether it might be better to start + * pushing back when we approach the limit, rather than + * when we hit it. + */ + limit = (long) prison_memory_limit(pr); + usage = (long) prison_memory(pr); + + /* The logic from vm_daemon() really needs to go here. + * Problem: we want to push things below their rlimits. + * + * TODO: refactor vm_daemon to optionally act on specific jails? 
+ */ + + printf("jpager/%d: memory %ld / %ld bytes\n", + pr_id, usage, limit); + + if ((usage - limit) > 0) { + printf("jpager/%d: overcommitted by %ld bytes (%f percent)\n", + pr_id, usage - limit, + (double) 100 * ((double) (usage - limit) / (double) limit)); + sx_slock(&allproc_lock); + LIST_FOREACH(p, &allproc, p_list) { + + if (pr != p->p_ucred->cr_prison) + continue; + + PROC_LOCK(p); + if (p->p_flag & (P_SYSTEM | P_WEXIT)) { + PROC_UNLOCK(p); + continue; + } + + mtx_lock_spin(&sched_lock); + breakout = 0; + FOREACH_THREAD_IN_PROC(p, td) { + if (!TD_ON_RUNQ(td) && + !TD_IS_RUNNING(td) && + !TD_IS_SLEEPING(td)) { + breakout = 1; + break; + } + } + mtx_unlock_spin(&sched_lock); + if (breakout) { + PROC_UNLOCK(p); + continue; + } + + /* NOTE: we differ here from vm_daemon b/c we don't + * care about the rlimit; things that are exceeding that will + * get caught in due course. We need, however, to decrease + * the pressure on our permitted memory allocation. Fortunately, + * we only care about eventually hitting the limit, so if we + * don't get there right away, it's okay. + */ + + /* TODO: this arbitrarily reduces each process's space by + * 5% (until it's completely swapped out) while + * we're under memory pressure. A better way would be + * to either hit large processes first, or to hit the + * least-active processes first, or go proportionally, + * or .... + */ + newsize = cursize = (long) vmspace_resident_count(p->p_vmspace); + newsize -= newsize / 20; + if (cursize < 0) + newsize = 0; + PROC_UNLOCK(p); + printf("jpager/%d: squeezing process %d from %ld to %ld\n", + pr_id, p->p_pid, cursize, newsize); + vm_pageout_map_deactivate_pages(&p->p_vmspace->vm_map, newsize); + } /* end LIST_FOREACH procs */ + sx_sunlock(&allproc_lock); + } + } - /* TODO: consider whether it might be better to start - * pushing back when we approach the limit, rather than - * when we hit it. 
- */ - limit = (long) prison_memory_limit(pr); - usage = (long) prison_memory(pr); - - /* The logic from vm_daemon() really needs to go here. - * Problem: we want to push things below their rlimits. - * - * TODO: refactor vm_daemon to optionally act on specific jails? - */ - - printf("jpager/%d: memory %ld / %ld bytes\n", - pr_id, usage, limit); - - if ((usage - limit) > 0) { - printf("jpager/%d: overcommitted by %ld bytes (%f percent)\n", - pr_id, usage - limit, - (double) 100 * ((double) (usage - limit) / (double) limit)); - sx_slock(&allproc_lock); - LIST_FOREACH(p, &allproc, p_list) { - - if (pr != p->p_ucred->cr_prison) - continue; - - PROC_LOCK(p); - if (p->p_flag & (P_SYSTEM | P_WEXIT)) { - PROC_UNLOCK(p); - continue; + tsleep(pr, 0, "-", jail_memory_pager_interval * hz); } - mtx_lock_spin(&sched_lock); - breakout = 0; - FOREACH_THREAD_IN_PROC(p, td) { - if (!TD_ON_RUNQ(td) && - !TD_IS_RUNNING(td) && - !TD_IS_SLEEPING(td)) { - breakout = 1; - break; - } - } - mtx_unlock_spin(&sched_lock); - if (breakout) { - PROC_UNLOCK(p); - continue; - } - - /* NOTE: we differ here from vm_daemon b/c we don't - * care about the rlimit; things that are exceeding that will - * get caught in due course. We need, however, to decrease - * the pressure on our permitted memory allocation. Fortunately, - * we only care about eventually hitting the limit, so if we - * don't get there right away, it's okay. - */ - - /* TODO: this arbitrarily reduces each process's space by - * 5% (until it's completely swapped out) while - * we're under memory pressure. A better way would be - * to either hit large processes first, or to hit the - * least-active processes first, or go proportionally, - * or .... 
- */ - newsize = cursize = (long) vmspace_resident_count(p->p_vmspace); - newsize -= newsize / 20; - if (cursize < 0) - newsize = 0; - PROC_UNLOCK(p); - printf("jpager/%d: squeezing process %d from %ld to %ld\n", - pr_id, p->p_pid, cursize, newsize); - vm_pageout_map_deactivate_pages(&p->p_vmspace->vm_map, newsize); - } /* end LIST_FOREACH procs */ - sx_sunlock(&allproc_lock); - } - - /* TODO --- make interval into a sysctl. */ - /* 6 seconds because VM recomputes totals every 5. */ - printf("jpager_td sleeping\n"); - tsleep(pr, 0, "-", 6 * hz); - } - - printf("Exiting jpager_td\n"); - kthread_exit(0); + kthread_exit(0); } -#endif /* * MPSAFE @@ -219,7 +252,7 @@ struct prison *pr, *tpr; struct jail j; struct jail_attach_args jaa; - /* struct proc *j_pager_proc = NULL; */ + struct proc *j_pager_proc = NULL; int vfslocked, error, tryprid; error = copyin(uap->jail, &j, sizeof(j)); @@ -275,10 +308,10 @@ prisoncount++; mtx_unlock(&allprison_mtx); - /* if (kthread_create(jpager_td, pr, (void *) j_pager_proc, 0, 0, "jpager %d", pr->pr_id)) - goto e_dropprref; + if (kthread_create(jpager_td, pr, (void *) j_pager_proc, 0, 0, "jpager %d", pr->pr_id)) + goto e_dropprref; KASSERT(j_pager_proc != NULL, ("NULL j_pager_proc")); - pr->pr_pager = j_pager_proc; */ + pr->pr_pager = j_pager_proc; error = jail_attach(td, &jaa); if (error) @@ -404,8 +437,7 @@ mtx_unlock(&allprison_mtx); /* Tell scheduler, pager to die. No need to wait. */ -/* *pr->pr_scheduler_flags_ptr = J_SCHED_TD_DIE; - *pr->pr_pager_flags_ptr = J_PAGER_TD_DIE; */ + *pr->pr_pager_flags_ptr = J_PAGER_TD_DIE; wakeup(pr); TASK_INIT(&pr->pr_task, 0, prison_complete, pr); @@ -523,40 +555,36 @@ vm_pindex_t prison_memory(struct prison *pr) { - struct proc *p; - u_int mem_used = 0; - - /* TODO: cut this to search only procs in given jail. */ - FOREACH_PROC_IN_SYSTEM(p) { - if (!jailed(p->p_ucred) || - (pr != p->p_ucred->cr_prison)) { - continue; - } - - /* Get memory usage (see vm/vm_map.h). */ - /* TODO maybe use vm_swrss? 
*/ - mem_used += (p->p_vmspace)->vm_tsize; /* text size (pages) */ - mem_used += (p->p_vmspace)->vm_dsize; /* data size (pages) */ - mem_used += (p->p_vmspace)->vm_ssize; /* stack size (pages) */ - } - - /* Convert to bytes, cache (maybe unncessary?). */ - mem_used *= PAGE_SIZE; - /* mtx_lock(&pr->pr_mtx); - pr->pr_mem_usage = mem_used; - mtx_unlock(&pr->pr_mtx); */ - return mem_used; + struct proc *p; + u_int mem_used = 0; + + /* TODO: cut this to search only procs in given jail. */ + FOREACH_PROC_IN_SYSTEM(p) { + if (!jailed(p->p_ucred) || + (pr != p->p_ucred->cr_prison)) { + continue; + } + + /* Get memory usage (see vm/vm_map.h). */ + /* TODO maybe use vm_swrss? */ + mem_used += (p->p_vmspace)->vm_tsize; /* text size (pages) */ + mem_used += (p->p_vmspace)->vm_dsize; /* data size (pages) */ + mem_used += (p->p_vmspace)->vm_ssize; /* stack size (pages) */ + } + + mem_used *= PAGE_SIZE; + return mem_used; } /* Given credential, return permitted memory usage in bytes. */ vm_pindex_t prison_memory_limit(struct prison *pr) { - vm_pindex_t memlimit; - mtx_lock(&pr->pr_mtx); - memlimit = (vm_pindex_t) pr->pr_mem_limit; - mtx_unlock(&pr->pr_mtx); - return memlimit; + vm_pindex_t memlimit; + mtx_lock(&pr->pr_mtx); + memlimit = (vm_pindex_t) pr->pr_mem_limit; + mtx_unlock(&pr->pr_mtx); + return memlimit; } /* @@ -689,6 +717,52 @@ } } +/* + * Change resource limit for a prison. + * + * unsigned int jid: id of jail to mess with + * + * int cpushares: 0 -> remove prison from cpu limits + * -1 -> don't change existing shares + * >0 -> set cpu shares + * + * int memlimit: 0 -> remove prison from mem limits + * -1 -> don't change existing limit + * >0 -> set memory limit (bytes) + * + * TODO: might this be better handled via a writable + * sysctl than with a new syscall? 
+ */ +int +jail_set_resource_limits(struct thread *td, struct jail_set_resource_limits_args *uap) +{ + struct prison *pr; + int error; + + error = suser(td); + if (error) + return (error); + + mtx_lock(&allprison_mtx); + LIST_FOREACH(pr, &allprison, pr_list) { + if (pr->pr_id == uap->jid) + break; + } + if (NULL == pr) { + mtx_unlock(&allprison_mtx); + return 1; + } + + mtx_lock(&pr->pr_mtx); + if (-1 != uap->cpushares) + pr->pr_sched_shares = uap->cpushares; + if (-1 != uap->memlimit) + pr->pr_mem_limit = uap->memlimit; + mtx_unlock(&pr->pr_mtx); + mtx_unlock(&allprison_mtx); + return 0; +} + static int sysctl_jail_list(SYSCTL_HANDLER_ARGS) {
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200608210718.k7L7IZoK036654>