Date: Tue, 1 Aug 2006 14:16:46 -0600 From: Chris Jones <cdjones-freebsd-hackers@novusordo.net> To: freebsd-hackers@freebsd.org Subject: [PATCH] Jail Memory Limits Message-ID: <F61BB1C8-E979-4AEA-81C4-A570CE7A2AE8@novusordo.net>
next in thread | raw e-mail | index | archive | help
--Apple-Mail-3-62547150 Content-Transfer-Encoding: 7bit Content-Type: text/plain; charset=US-ASCII; delsp=yes; format=flowed Hi, folks --- I have a beta patch to add memory limits (on the basis of RSS) to jails, and would love to get some people to test it out. The patch files (below) are against RELENG_6 from earlier this morning, but should work against anything recent. They should be applied under /usr/src. The patch creates a kernel process for each jail that intermittently 1) checks whether the jail is overcommitted on RSS and 2) if so, partially pages out the jail's processes in the same way that is used when the system is short on memory. This permits short periods of over-use, with a tendency back to the limit. (Aside: this is the same way Solaris handles it.) To test, use the new '-m MEM_LIMIT_IN_MB' flag to jail to set the memory limit for the jail; I've also included a trivial program that consumes and holds memory and can be run inside the jail. Take a look at the console for the debugging information, which is rather verbose at the moment. I'm expecting patches for jail scheduling to be coming down the pipe soon.
Cheers, Chris --Apple-Mail-3-62547150 Content-Transfer-Encoding: 7bit Content-Type: application/octet-stream; x-unix-mode=0644; name=kern.patch Content-Disposition: attachment; filename=kern.patch Only in /usr/src/sys/kern: CVS diff -u /usr/src/sys/kern/kern_jail.c sys/kern/kern_jail.c --- /usr/src/sys/kern/kern_jail.c Sat Nov 12 20:12:32 2005 +++ sys/kern/kern_jail.c Tue Aug 1 12:18:07 2006 @@ -15,12 +15,19 @@ #include <sys/param.h> #include <sys/types.h> #include <sys/kernel.h> +#include <sys/kthread.h> #include <sys/systm.h> #include <sys/errno.h> #include <sys/sysproto.h> #include <sys/mac.h> #include <sys/malloc.h> #include <sys/proc.h> +#include <vm/vm.h> +#include <vm/vm_extern.h> +#include <vm/vm_page.h> +#include <vm/vm_object.h> +#include <vm/vm_map.h> +#include <vm/vm_pageout.h> #include <sys/taskqueue.h> #include <sys/jail.h> #include <sys/lock.h> @@ -92,6 +99,134 @@ SYSINIT(prison, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_prison, NULL); +static void +jsched_td(void *arg) +{ + struct prison *pr; + pr = arg; + +/* printf("Starting jsched_td\n"); */ + + for (;;) { + if (pr->pr_scheduler_flags & J_SCHED_TD_DIE) + break; + + /* Scheduling stuff goes here. */ +/* printf("jsched_td running\n"); */ + tsleep(pr, 0, "-", hz); + } + +/* printf("Exiting jsched_td\n"); */ + + pr->pr_scheduler_flags = J_SCHED_TD_DEAD; + kthread_exit(0); +} + +static void +jpager_td(void *arg) +{ + struct proc *p; + struct prison *pr; + struct thread *td; + long limit, cursize, newsize, usage; + int breakout; + + pr = arg; + + printf("Starting jpager/%d with memory limit %ld bytes\n", + pr->pr_id, (long) prison_memory_limit(pr)); + + for (;;) { + if (pr->pr_pager_flags & J_PAGER_TD_DIE) + break; + + /* TODO: consider whether it might be better to start + * pushing back when we approach the limit, rather than + * when we hit it. + */ + limit = (long) prison_memory_limit(pr); + usage = (long) prison_memory(pr); + + /* The logic from vm_daemon() really needs to go here. 
+ * Problem: we want to push things below their rlimits. + * + * TODO: refactor vm_daemon to optionally act on specific jails? + */ + + printf("jpager/%d: memory %ld / %ld bytes\n", + pr->pr_id, usage, limit); + + if ((usage - limit) > 0) { + printf("jpager/%d: overcommitted by %ld bytes (%lf percent)\n", + pr->pr_id, usage - limit, + (double) 100 * ((double) (usage - limit) / (double) limit)); + sx_slock(&allproc_lock); + LIST_FOREACH(p, &allproc, p_list) { + + if (pr != p->p_ucred->cr_prison) + continue; + + PROC_LOCK(p); + if (p->p_flag & (P_SYSTEM | P_WEXIT)) { + PROC_UNLOCK(p); + continue; + } + + mtx_lock_spin(&sched_lock); + breakout = 0; + FOREACH_THREAD_IN_PROC(p, td) { + if (!TD_ON_RUNQ(td) && + !TD_IS_RUNNING(td) && + !TD_IS_SLEEPING(td)) { + breakout = 1; + break; + } + } + mtx_unlock_spin(&sched_lock); + if (breakout) { + PROC_UNLOCK(p); + continue; + } + + /* NOTE: we differ here from vm_daemon b/c we don't + * care about the rlimit; things that are exceeding that will + * get caught in due course. We need, however, to decrease + * the pressure on our permitted memory allocation. Fortunately, + * we only care about eventually hitting the limit, so if we + * don't get there right away, it's okay. + */ + + /* TODO: this arbitrarily reduces each process's space by + * 5% (until it's completely swapped out) while + * we're under memory pressure. A better way would be + * to either hit large processes first, or to hit the + * least-active processes first, or go proportionally, + * or .... + */ + newsize = cursize = (long) vmspace_resident_count(p->p_vmspace); + newsize -= newsize / 20; + if (cursize < 0) + newsize = 0; + PROC_UNLOCK(p); + printf("jpager/%d: squeezing process %d from %ld to %ld\n", + pr->pr_id, p->p_pid, cursize, newsize); + vm_pageout_map_deactivate_pages(&p->p_vmspace->vm_map, newsize); + } /* end LIST_FOREACH procs */ + sx_sunlock(&allproc_lock); + } + + /* TODO --- make interval into a sysctl. 
*/ + /* 6 seconds because VM recomputes totals every 5. */ + printf("jpager_td sleeping\n"); + tsleep(pr, 0, "-", 6 * hz); + } + + printf("Exiting jpager_td\n"); + + pr->pr_pager_flags = J_PAGER_TD_DEAD; + kthread_exit(0); +} + /* * MPSAFE * @@ -106,6 +241,8 @@ struct prison *pr, *tpr; struct jail j; struct jail_attach_args jaa; + struct proc *j_sched_proc = NULL; + struct proc *j_pager_proc = NULL; int vfslocked, error, tryprid; error = copyin(uap->jail, &j, sizeof(j)); @@ -135,7 +272,9 @@ goto e_dropvnref; pr->pr_ip = j.ip_number; pr->pr_linux = NULL; + pr->pr_priority = j.priority; pr->pr_securelevel = securelevel; + pr->pr_mem_limit = j.mem_limit; /* Determine next pr_id and add prison to allprison list. */ mtx_lock(&allprison_mtx); @@ -159,6 +298,19 @@ prisoncount++; mtx_unlock(&allprison_mtx); + /* TODO #ifdef SCHED_HIER */ + pr->pr_scheduler_flags = J_SCHED_TD_ACTIVE; + if (kthread_create(jsched_td, pr, (void *) j_sched_proc, 0, 0, "jsched %d", pr->pr_id)) + goto e_dropprref; + KASSERT(j_sched_proc != NULL, ("NULL j_sched_proc")); + pr->pr_scheduler = j_sched_proc; + pr->pr_pager_flags = J_PAGER_TD_ACTIVE; + if (kthread_create(jpager_td, pr, (void *) j_pager_proc, 0, 0, "jpager %d", pr->pr_id)) + goto e_dropprref; + KASSERT(j_pager_proc != NULL, ("NULL j_pager_proc")); + pr->pr_pager = j_pager_proc; + /* TODO #endif */ + error = jail_attach(td, &jaa); if (error) goto e_dropprref; @@ -282,6 +434,11 @@ prisoncount--; mtx_unlock(&allprison_mtx); + /* Tell scheduler to die. No need to wait for it. */ + pr->pr_scheduler_flags |= J_SCHED_TD_DIE; + pr->pr_pager_flags |= J_PAGER_TD_DIE; + wakeup(pr); + TASK_INIT(&pr->pr_task, 0, prison_complete, pr); taskqueue_enqueue(taskqueue_thread, &pr->pr_task); return; @@ -391,6 +548,42 @@ else ok = 0; return (ok); +} + +/* Given credential, return memory usage in bytes. */ +vm_pindex_t +prison_memory(struct prison *pr) +{ + struct proc *p; + u_int mem_used = 0; + + /* TODO: cut this to search only procs in given jail. 
*/ + FOREACH_PROC_IN_SYSTEM(p) { + if (!jailed(p->p_ucred) || + (pr != p->p_ucred->cr_prison)) { + continue; + } + + /* Get memory usage (see vm/vm_map.h). */ + /* TODO maybe use vm_swrss? */ + mem_used += (p->p_vmspace)->vm_tsize; /* text size (pages) */ + mem_used += (p->p_vmspace)->vm_dsize; /* data size (pages) */ + mem_used += (p->p_vmspace)->vm_ssize; /* stack size (pages) */ + } + + /* Convert to bytes, cache (maybe unncessary?). */ + mem_used *= PAGE_SIZE; + /* mtx_lock(&pr->pr_mtx); + pr->pr_mem_usage = mem_used; + mtx_unlock(&pr->pr_mtx); */ + return mem_used; +} + +/* Given credential, return permitted memory usage in bytes. */ +vm_pindex_t +prison_memory_limit(struct prison *pr) +{ + return pr->pr_mem_limit; } /* --Apple-Mail-3-62547150 Content-Transfer-Encoding: 7bit Content-Type: application/octet-stream; x-unix-mode=0644; name=sys.patch Content-Disposition: attachment; filename=sys.patch Only in /usr/src/sys/sys: CVS diff -u /usr/src/sys/sys/jail.h sys/sys/jail.h --- /usr/src/sys/sys/jail.h Thu Jun 9 12:49:19 2005 +++ sys/sys/jail.h Fri Jul 28 12:03:26 2006 @@ -18,6 +18,10 @@ char *path; char *hostname; u_int32_t ip_number; + unsigned int priority; + unsigned int mem_limit; +/* struct thread *scheduler; + CJ TODO --- add reference to preferred scheduler, e.g. by name? 
*/ }; struct xprison { @@ -26,9 +30,26 @@ char pr_path[MAXPATHLEN]; char pr_host[MAXHOSTNAMELEN]; u_int32_t pr_ip; + unsigned int priority; + unsigned int mem_limit; + /* struct thread *scheduler; */ }; #define XPRISON_VERSION 1 +#define JAIL_DEFAULT_PRIORITY 10 +#define JAIL_MINIMUM_PRIORITY 1 +#define JAIL_MAXIMUM_PRIORITY 100 + +#define JAIL_DEFAULT_MEM_LIMIT 256 * 1024 * 1024 + +#define J_SCHED_TD_ACTIVE 0x01 +#define J_SCHED_TD_DIE 0x02 +#define J_SCHED_TD_DEAD 0x04 + +#define J_PAGER_TD_ACTIVE 0x01 +#define J_PAGER_TD_DIE 0x02 +#define J_PAGER_TD_DEAD 0x04 + #ifndef _KERNEL int jail(struct jail *); @@ -61,6 +82,11 @@ * (d) set only during destruction of jail, no mutex needed */ #if defined(_KERNEL) || defined(_WANT_PRISON) + +#include <sys/proc.h> +/*struct proc; */ + + struct prison { LIST_ENTRY(prison) pr_list; /* (a) all prisons */ int pr_id; /* (c) prison id */ @@ -73,6 +99,13 @@ int pr_securelevel; /* (p) securelevel */ struct task pr_task; /* (d) destroy task */ struct mtx pr_mtx; + unsigned int pr_priority; /* (p) jail priority */ + struct proc *pr_scheduler; /* (c) scheduler pid */ + int pr_scheduler_flags; /* (p) communication to scheduler */ + struct proc *pr_pager; /* (c) pager pid */ + int pr_pager_flags; /* (p) communication to pager */ + size_t pr_mem_limit; /* (p) memory allocation limit */ + size_t pr_mem_usage; /* (p) memory in use */ }; #endif /* _KERNEL || _WANT_PRISON */ @@ -110,6 +143,8 @@ void prison_hold(struct prison *pr); int prison_if(struct ucred *cred, struct sockaddr *sa); int prison_ip(struct ucred *cred, int flag, u_int32_t *ip); +vm_pindex_t prison_memory(struct prison *pr); +vm_pindex_t prison_memory_limit(struct prison *pr); void prison_remote_ip(struct ucred *cred, int flags, u_int32_t *ip); #endif /* _KERNEL */ --Apple-Mail-3-62547150 Content-Transfer-Encoding: 7bit Content-Type: application/octet-stream; x-unix-mode=0644; name=usr.sbin.patch Content-Disposition: attachment; filename=usr.sbin.patch Only in 
/usr/src/usr.sbin/jail: CVS diff -u /usr/src/usr.sbin/jail/jail.c usr.sbin/jail/jail.c --- /usr/src/usr.sbin/jail/jail.c Tue Aug 1 13:50:48 2006 +++ usr.sbin/jail/jail.c Tue Aug 1 12:18:07 2006 @@ -56,6 +56,7 @@ struct in_addr in; gid_t groups[NGROUPS]; int ch, i, iflag, Jflag, lflag, ngroups, securelevel, uflag, Uflag; + unsigned int mem_limit, priority; char path[PATH_MAX], *ep, *username, *JidFile; static char *cleanenv; const char *shell, *p = NULL; @@ -63,11 +64,13 @@ FILE *fp; iflag = Jflag = lflag = uflag = Uflag = 0; + mem_limit = JAIL_DEFAULT_MEM_LIMIT; + priority = JAIL_DEFAULT_PRIORITY; securelevel = -1; username = JidFile = cleanenv = NULL; fp = NULL; - while ((ch = getopt(argc, argv, "ils:u:U:J:")) != -1) { + while ((ch = getopt(argc, argv, "ilp:m:s:u:U:J:")) != -1) { switch (ch) { case 'i': iflag = 1; @@ -76,6 +79,17 @@ JidFile = optarg; Jflag = 1; break; + case 'm': + /* TODO --- should this be specified in MB? */ + mem_limit = atoi(optarg); + mem_limit *= 1024 * 1024; + break; + case 'p': + priority = atoi(optarg); + if (priority < JAIL_MINIMUM_PRIORITY || + priority > JAIL_MAXIMUM_PRIORITY) + errx(1, "invalid priority: `%s'", optarg); + break; case 's': ltmp = strtol(optarg, &ep, 0); if (*ep || ep == optarg || ltmp > INT_MAX || !ltmp) @@ -118,6 +132,8 @@ if (inet_aton(argv[2], &in) == 0) errx(1, "Could not make sense of ip-number: %s", argv[2]); j.ip_number = ntohl(in.s_addr); + j.mem_limit = mem_limit; + j.priority = priority; if (Jflag) { fp = fopen(JidFile, "w"); if (fp == NULL) @@ -182,8 +198,10 @@ usage(void) { - (void)fprintf(stderr, "%s%s%s\n", - "usage: jail [-i] [-J jid_file] [-s securelevel] [-l -u ", + (void)fprintf(stderr, "%s%s%s%s%s\n", + "usage: jail [-i] [-J jid_file] [-m mem_limit] ", + "[-p priority] [-s securelevel]", + " [-l -u ", "username | -U username]", " path hostname ip-number command ..."); exit(1); --Apple-Mail-3-62547150 Content-Transfer-Encoding: 7bit Content-Type: application/octet-stream; x-unix-mode=0444; 
name=useMemory.c Content-Disposition: attachment; filename=useMemory.c #include <sys/cdefs.h> #include <err.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <unistd.h> static void usage(void); extern char **environ; int main(int argc, char **argv) { unsigned int memsize; unsigned int p = 0; /* offset from beginning of boundary */ char *memstart; int ch; char testdata = 0xff; while ((ch = getopt(argc, argv, "m:")) != -1) { switch (ch) { case 'm': memsize = atoi(optarg); memsize *= 1024 * 1024; break; default: usage(); } } argc -= optind; argv += optind; /* Allocate memory. */ memstart = malloc(memsize * sizeof(char)); if (NULL == memstart) { printf("useMemory: couldn't allocate memory!"); exit(2); } printf("useMemory: allocated %ld bytes of memory\n", memsize); while (p < memsize) { memstart[p] = 0xde; memstart[p+1] = 0xad; memstart[p+2] = 0xbe; memstart[p+3] = 0xef; if (0 == (p % 1048576)) printf("useMemory: writing to %ld / %ld bytes at %p (%dddd)\n", p, memsize, &memstart[p], memstart[p], memstart[p+1], memstart[p+2], memstart[p+3]); p += 1024; /* this really should be set to the page size */ } for (;;) sleep(10); exit(0); } static void usage(void) { (void) fprintf(stderr, "%s\n", "usage: useMemory [-m memsize]"); exit(1); } --Apple-Mail-3-62547150--
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?F61BB1C8-E979-4AEA-81C4-A570CE7A2AE8>