Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 1 Aug 2006 14:16:46 -0600
From:      Chris Jones <cdjones-freebsd-hackers@novusordo.net>
To:        freebsd-hackers@freebsd.org
Subject:   [PATCH] Jail Memory Limits
Message-ID:  <F61BB1C8-E979-4AEA-81C4-A570CE7A2AE8@novusordo.net>

next in thread | raw e-mail | index | archive | help

--Apple-Mail-3-62547150
Content-Transfer-Encoding: 7bit
Content-Type: text/plain;
	charset=US-ASCII;
	delsp=yes;
	format=flowed


Hi, folks --- I have a beta patch to add memory limits (on the basis  
of RSS) to jails, and would love to get some people to test it out.   
The patch files (below) are against RELENG_6 from earlier this  
morning, but should work against anything recent.  They should be  
applied under /usr/src.

This creates a kernel process for each jail that periodically 1)
checks whether the jail is overcommitted on RSS and 2) if so,
partially pages out the jail's processes, using the same mechanism
the system uses when it is short on memory.  This permits short
periods of overuse, with a tendency back toward the limit.  (Aside:
this is the same way Solaris handles it.)

To test, use the new '-m MEM_LIMIT_IN_MB' flag to jail to set the
memory limit for the jail.  I've also included a trivial program
(useMemory.c) that consumes and holds memory; it can be run inside
the jail.  Watch the console for the debugging information, which is
rather verbose at the moment.

I'm expecting patches for jail scheduling to be coming down the pipe  
soon.

Cheers,

Chris




--Apple-Mail-3-62547150
Content-Transfer-Encoding: 7bit
Content-Type: application/octet-stream;
	x-unix-mode=0644;
	name=kern.patch
Content-Disposition: attachment;
	filename=kern.patch

Only in /usr/src/sys/kern: CVS
diff -u /usr/src/sys/kern/kern_jail.c sys/kern/kern_jail.c
--- /usr/src/sys/kern/kern_jail.c	Sat Nov 12 20:12:32 2005
+++ sys/kern/kern_jail.c	Tue Aug  1 12:18:07 2006
@@ -15,12 +15,19 @@
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/kernel.h>
+#include <sys/kthread.h>
 #include <sys/systm.h>
 #include <sys/errno.h>
 #include <sys/sysproto.h>
 #include <sys/mac.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#include <vm/vm_map.h>
+#include <vm/vm_pageout.h>
 #include <sys/taskqueue.h>
 #include <sys/jail.h>
 #include <sys/lock.h>
@@ -92,6 +99,134 @@
 
 SYSINIT(prison, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_prison, NULL);
 
+static void
+jsched_td(void *arg)
+{
+  struct prison *pr;
+  pr = arg;
+  
+/*  printf("Starting jsched_td\n"); */
+  
+  for (;;) {
+    if (pr->pr_scheduler_flags & J_SCHED_TD_DIE) 
+      break;
+    
+    /* Scheduling stuff goes here. */
+/*    printf("jsched_td running\n"); */
+    tsleep(pr, 0, "-", hz);
+  }
+
+/*  printf("Exiting jsched_td\n"); */
+
+  pr->pr_scheduler_flags = J_SCHED_TD_DEAD;
+  kthread_exit(0);
+}
+
+static void
+jpager_td(void *arg)
+{
+  struct proc *p;
+  struct prison *pr;
+  struct thread *td;
+  long limit, cursize, newsize, usage;
+  int breakout;
+  
+  pr = arg;
+  
+  printf("Starting jpager/%d with memory limit %ld bytes\n", 
+         pr->pr_id, (long) prison_memory_limit(pr));
+  
+  for (;;) {
+    if (pr->pr_pager_flags & J_PAGER_TD_DIE)
+      break;
+    
+    /* TODO: consider whether it might be better to start
+     * pushing back when we approach the limit, rather than
+     * when we hit it.
+     */
+    limit = (long) prison_memory_limit(pr);
+    usage = (long) prison_memory(pr);
+
+    /* The logic from vm_daemon() really needs to go here.
+     * Problem: we want to push things below their rlimits.
+     *
+     * TODO: refactor vm_daemon to optionally act on specific jails?
+     */
+
+    printf("jpager/%d: memory %ld / %ld bytes\n", 
+           pr->pr_id, usage, limit);
+
+    if ((usage - limit) > 0) {
+      printf("jpager/%d: overcommitted by %ld bytes (%lf percent)\n",
+             pr->pr_id, usage - limit,
+             (double) 100 * ((double) (usage - limit) / (double) limit)); 
+      sx_slock(&allproc_lock);
+      LIST_FOREACH(p, &allproc, p_list) {
+	
+	if (pr != p->p_ucred->cr_prison)
+	  continue;
+	
+	PROC_LOCK(p);
+	if (p->p_flag & (P_SYSTEM | P_WEXIT)) {
+	  PROC_UNLOCK(p);
+	  continue;
+	}
+	
+	mtx_lock_spin(&sched_lock);
+	breakout = 0;
+	FOREACH_THREAD_IN_PROC(p, td) {
+	  if (!TD_ON_RUNQ(td) &&
+	      !TD_IS_RUNNING(td) &&
+	      !TD_IS_SLEEPING(td)) {
+	    breakout = 1;
+	    break;
+	  }
+	}
+	mtx_unlock_spin(&sched_lock);
+	if (breakout) {
+	  PROC_UNLOCK(p);
+	  continue;
+	}
+	
+	/* NOTE: we differ here from vm_daemon b/c we don't 
+	 * care about the rlimit; things that are exceeding that will
+	 * get caught in due course.  We need, however, to decrease
+	 * the pressure on our permitted memory allocation.  Fortunately, 
+	 * we only care about eventually hitting the limit, so if we
+	 * don't get there right away, it's okay.
+	 */      
+	
+	/* TODO: this arbitrarily reduces each process's space by
+	 * 5% (until it's completely swapped out) while
+	 * we're under memory pressure.  A better way would be 
+	 * to either hit large processes first, or to hit the
+	 * least-active processes first, or go proportionally,
+         * or .... 
+	 */
+	newsize = cursize = (long) vmspace_resident_count(p->p_vmspace);
+	newsize -= newsize / 20;
+	if (cursize < 0)
+	  newsize = 0;
+	PROC_UNLOCK(p);
+	printf("jpager/%d: squeezing process %d from %ld to %ld\n", 
+               pr->pr_id, p->p_pid, cursize, newsize);
+	vm_pageout_map_deactivate_pages(&p->p_vmspace->vm_map, newsize);
+      } /* end LIST_FOREACH procs */
+      sx_sunlock(&allproc_lock);
+    }
+    
+    /* TODO --- make interval into a sysctl. */
+    /* 6 seconds because VM recomputes totals every 5. */
+    printf("jpager_td sleeping\n");
+    tsleep(pr, 0, "-", 6 * hz); 
+  }
+
+  printf("Exiting jpager_td\n");
+  
+  pr->pr_pager_flags = J_PAGER_TD_DEAD;
+  kthread_exit(0);
+}
+
 /*
  * MPSAFE
  *
@@ -106,6 +241,8 @@
 	struct prison *pr, *tpr;
 	struct jail j;
 	struct jail_attach_args jaa;
+	struct proc *j_sched_proc = NULL;
+	struct proc *j_pager_proc = NULL;
 	int vfslocked, error, tryprid;
 
 	error = copyin(uap->jail, &j, sizeof(j));
@@ -135,7 +272,9 @@
 		goto e_dropvnref;
 	pr->pr_ip = j.ip_number;
 	pr->pr_linux = NULL;
+	pr->pr_priority = j.priority;
 	pr->pr_securelevel = securelevel;
+	pr->pr_mem_limit = j.mem_limit;
 
 	/* Determine next pr_id and add prison to allprison list. */
 	mtx_lock(&allprison_mtx);
@@ -159,6 +298,19 @@
 	prisoncount++;
 	mtx_unlock(&allprison_mtx);
 
+	/* TODO #ifdef SCHED_HIER */
+	pr->pr_scheduler_flags = J_SCHED_TD_ACTIVE;
+	if (kthread_create(jsched_td, pr, (void *) j_sched_proc, 0, 0, "jsched %d", pr->pr_id))
+	  goto e_dropprref;
+	KASSERT(j_sched_proc != NULL, ("NULL j_sched_proc"));
+	pr->pr_scheduler = j_sched_proc;
+	pr->pr_pager_flags = J_PAGER_TD_ACTIVE;
+	if (kthread_create(jpager_td, pr, (void *) j_pager_proc, 0, 0, "jpager %d", pr->pr_id))
+	  goto e_dropprref;
+	KASSERT(j_pager_proc != NULL, ("NULL j_pager_proc"));
+	pr->pr_pager = j_pager_proc;
+	/* TODO #endif */
+
 	error = jail_attach(td, &jaa);
 	if (error)
 		goto e_dropprref;
@@ -282,6 +434,11 @@
 		prisoncount--;
 		mtx_unlock(&allprison_mtx);
 
+		/* Tell scheduler to die.  No need to wait for it. */
+		pr->pr_scheduler_flags |= J_SCHED_TD_DIE;
+		pr->pr_pager_flags |= J_PAGER_TD_DIE;
+		wakeup(pr);
+
 		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
 		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
 		return;
@@ -391,6 +548,42 @@
 	else
 		ok = 0;
 	return (ok);
+}
+
+/* Given a prison, return its memory usage in bytes. */
+vm_pindex_t
+prison_memory(struct prison *pr)
+{
+  struct proc *p;
+  u_int mem_used = 0;
+
+  /* TODO: cut this to search only procs in given jail. */
+  FOREACH_PROC_IN_SYSTEM(p) {
+    if (!jailed(p->p_ucred) ||
+	(pr != p->p_ucred->cr_prison)) {
+      continue;
+    }
+
+    /* Get memory usage (see vm/vm_map.h). */
+    /* TODO maybe use vm_swrss? */
+    mem_used += (p->p_vmspace)->vm_tsize; /* text size (pages) */
+    mem_used += (p->p_vmspace)->vm_dsize; /* data size (pages) */
+    mem_used += (p->p_vmspace)->vm_ssize; /* stack size (pages) */
+  }
+
+  /* Convert to bytes, cache (maybe unnecessary?). */
+  mem_used *= PAGE_SIZE;
+  /*  mtx_lock(&pr->pr_mtx);
+  pr->pr_mem_usage = mem_used;
+  mtx_unlock(&pr->pr_mtx); */
+  return mem_used;
+}
+
+/* Given a prison, return its permitted memory usage in bytes. */
+vm_pindex_t
+prison_memory_limit(struct prison *pr)
+{
+  return pr->pr_mem_limit;
 }
 
 /*

--Apple-Mail-3-62547150
Content-Transfer-Encoding: 7bit
Content-Type: application/octet-stream;
	x-unix-mode=0644;
	name=sys.patch
Content-Disposition: attachment;
	filename=sys.patch

Only in /usr/src/sys/sys: CVS
diff -u /usr/src/sys/sys/jail.h sys/sys/jail.h
--- /usr/src/sys/sys/jail.h	Thu Jun  9 12:49:19 2005
+++ sys/sys/jail.h	Fri Jul 28 12:03:26 2006
@@ -18,6 +18,10 @@
 	char		*path;
 	char		*hostname;
 	u_int32_t	ip_number;
+	unsigned int	priority;
+        unsigned int    mem_limit;
+/*        struct thread   *scheduler;
+ CJ TODO --- add reference to preferred scheduler, e.g. by name? */
 };
 
 struct xprison {
@@ -26,9 +30,26 @@
 	char		 pr_path[MAXPATHLEN];
 	char 		 pr_host[MAXHOSTNAMELEN];
 	u_int32_t	 pr_ip;
+        unsigned int     priority;
+        unsigned int     mem_limit;
+  /*        struct thread    *scheduler; */
 };
 #define	XPRISON_VERSION	1
 
+#define JAIL_DEFAULT_PRIORITY 10
+#define JAIL_MINIMUM_PRIORITY 1
+#define JAIL_MAXIMUM_PRIORITY 100
+
+#define JAIL_DEFAULT_MEM_LIMIT 256 * 1024 * 1024
+
+#define J_SCHED_TD_ACTIVE 0x01
+#define J_SCHED_TD_DIE    0x02
+#define J_SCHED_TD_DEAD   0x04
+
+#define J_PAGER_TD_ACTIVE 0x01
+#define J_PAGER_TD_DIE    0x02
+#define J_PAGER_TD_DEAD   0x04
+
 #ifndef _KERNEL
 
 int jail(struct jail *);
@@ -61,6 +82,11 @@
  *   (d) set only during destruction of jail, no mutex needed
  */
 #if defined(_KERNEL) || defined(_WANT_PRISON)
+
+#include <sys/proc.h>
+/*struct proc; */
+
+
 struct prison {
 	LIST_ENTRY(prison) pr_list;			/* (a) all prisons */
 	int		 pr_id;				/* (c) prison id */
@@ -73,6 +99,13 @@
 	int		 pr_securelevel;		/* (p) securelevel */
 	struct task	 pr_task;			/* (d) destroy task */
 	struct mtx	 pr_mtx;
+	unsigned int	 pr_priority;			/* (p) jail priority */
+        struct proc     *pr_scheduler;                  /* (c) scheduler pid */
+        int              pr_scheduler_flags;            /* (p) communication to scheduler */
+        struct proc     *pr_pager;                      /* (c) pager pid */
+        int              pr_pager_flags;                /* (p) communication to pager */
+        size_t           pr_mem_limit;                  /* (p) memory allocation limit */
+        size_t           pr_mem_usage;                  /* (p) memory in use */
 };
 #endif /* _KERNEL || _WANT_PRISON */
 
@@ -110,6 +143,8 @@
 void prison_hold(struct prison *pr);
 int prison_if(struct ucred *cred, struct sockaddr *sa);
 int prison_ip(struct ucred *cred, int flag, u_int32_t *ip);
+vm_pindex_t prison_memory(struct prison *pr);
+vm_pindex_t prison_memory_limit(struct prison *pr);
 void prison_remote_ip(struct ucred *cred, int flags, u_int32_t *ip);
 
 #endif /* _KERNEL */

--Apple-Mail-3-62547150
Content-Transfer-Encoding: 7bit
Content-Type: application/octet-stream; x-unix-mode=0644; name=usr.sbin.patch
Content-Disposition: attachment;
	filename=usr.sbin.patch

Only in /usr/src/usr.sbin/jail: CVS
diff -u /usr/src/usr.sbin/jail/jail.c usr.sbin/jail/jail.c
--- /usr/src/usr.sbin/jail/jail.c	Tue Aug  1 13:50:48 2006
+++ usr.sbin/jail/jail.c	Tue Aug  1 12:18:07 2006
@@ -56,6 +56,7 @@
 	struct in_addr in;
 	gid_t groups[NGROUPS];
 	int ch, i, iflag, Jflag, lflag, ngroups, securelevel, uflag, Uflag;
+	unsigned int mem_limit, priority;
 	char path[PATH_MAX], *ep, *username, *JidFile;
 	static char *cleanenv;
 	const char *shell, *p = NULL;
@@ -63,11 +64,13 @@
 	FILE *fp;
 
 	iflag = Jflag = lflag = uflag = Uflag = 0;
+	mem_limit = JAIL_DEFAULT_MEM_LIMIT;
+	priority = JAIL_DEFAULT_PRIORITY;
 	securelevel = -1;
 	username = JidFile = cleanenv = NULL;
 	fp = NULL;
 
-	while ((ch = getopt(argc, argv, "ils:u:U:J:")) != -1) {
+	while ((ch = getopt(argc, argv, "ilp:m:s:u:U:J:")) != -1) {
 		switch (ch) {
 		case 'i':
 			iflag = 1;
@@ -76,6 +79,17 @@
 			JidFile = optarg;
 			Jflag = 1;
 			break;
+		case 'm':
+			/* TODO --- should this be specified in MB? */
+			mem_limit = atoi(optarg);
+			mem_limit *= 1024 * 1024;
+			break;
+		case 'p':
+			priority = atoi(optarg);
+			if (priority < JAIL_MINIMUM_PRIORITY || 
+                            priority > JAIL_MAXIMUM_PRIORITY)
+				errx(1, "invalid priority: `%s'", optarg);
+			break;
 		case 's':
 			ltmp = strtol(optarg, &ep, 0);
 			if (*ep || ep == optarg || ltmp > INT_MAX || !ltmp)
@@ -118,6 +132,8 @@
 	if (inet_aton(argv[2], &in) == 0)
 		errx(1, "Could not make sense of ip-number: %s", argv[2]);
 	j.ip_number = ntohl(in.s_addr);
+	j.mem_limit = mem_limit;
+	j.priority = priority;
 	if (Jflag) {
 		fp = fopen(JidFile, "w");
 		if (fp == NULL)
@@ -182,8 +198,10 @@
 usage(void)
 {
 
-	(void)fprintf(stderr, "%s%s%s\n",
-	     "usage: jail [-i] [-J jid_file] [-s securelevel] [-l -u ",
+	(void)fprintf(stderr, "%s%s%s%s%s\n",
+	     "usage: jail [-i] [-J jid_file] [-m mem_limit] ",
+             "[-p priority] [-s securelevel]",
+             " [-l -u ",
 	     "username | -U username]",
 	     " path hostname ip-number command ...");
 	exit(1);

--Apple-Mail-3-62547150
Content-Transfer-Encoding: 7bit
Content-Type: application/octet-stream;
	x-unix-mode=0444;
	name=useMemory.c
Content-Disposition: attachment;
	filename=useMemory.c

#include <sys/cdefs.h>

#include <err.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

static void usage(void);
extern char **environ;


/*
 * useMemory: allocate a block of memory, write to it at regular
 * intervals so that it is actually touched, then hold it forever.
 *
 * Options:
 *   -m memsize   amount of memory to allocate, in megabytes.  Must be
 *                given and must be non-zero; otherwise the usage
 *                message is printed.
 *
 * Exit status: 1 on a usage or argument error, 2 if the allocation
 * fails.  On success the program never returns; it sleeps until killed.
 */
int
main(int argc, char **argv)
{
  const size_t mb = 1024 * 1024;
  const size_t stride = 1024;   /* this really should be the page size */
  size_t memsize = 0;           /* bytes to allocate; 0 == not specified */
  size_t p;                     /* offset from beginning of allocation */
  unsigned long val;
  unsigned char *memstart;      /* unsigned so 0xde etc. store and print as-is */
  char *ep;
  int ch;

  while ((ch = getopt(argc, argv, "m:")) != -1) {
    switch (ch) {
    case 'm':
      /*
       * Reject empty/garbage input and anything that would overflow
       * when converted to bytes.  A negative number is wrapped by
       * strtoul() to a huge value and is caught by the range check.
       */
      val = strtoul(optarg, &ep, 10);
      if (ep == optarg || *ep != '\0' || val > (size_t)-1 / mb)
        errx(1, "invalid memory size: `%s'", optarg);
      memsize = (size_t)val * mb;
      break;

    default:
      usage();
    }

  }
  argc -= optind;
  argv += optind;

  /* Without a size there is nothing to do (previously: uninitialized read). */
  if (memsize == 0)
    usage();

  /* Allocate memory. */
  memstart = malloc(memsize);
  if (NULL == memstart)
    errx(2, "couldn't allocate memory!");
  printf("useMemory: allocated %zu bytes of memory\n", memsize);

  /*
   * Touch the allocation every `stride' bytes.  memsize is a non-zero
   * multiple of 1 MB (and therefore of stride), so p + 3 always stays
   * inside the buffer.  Progress is reported once per megabyte.
   */
  for (p = 0; p < memsize; p += stride) {
    memstart[p] = 0xde;
    memstart[p+1] = 0xad;
    memstart[p+2] = 0xbe;
    memstart[p+3] = 0xef;
    if (0 == (p % mb))
      printf("useMemory: writing to %zu / %zu bytes at %p (%02x%02x%02x%02x)\n",
	     p, memsize, (void *)&memstart[p],
	     memstart[p], memstart[p+1], memstart[p+2], memstart[p+3]);
  }

  /* Hold on to the memory until the process is killed. */
  for (;;)
    sleep(10);

  /* NOTREACHED */
}

/*
 * Write the command-line synopsis to stderr and terminate the
 * program with exit status 1.  Does not return.
 */
static void
usage(void)
{
  static const char synopsis[] = "usage: useMemory [-m memsize]";

  (void) fprintf(stderr, "%s\n", synopsis);
  exit(1);
}

--Apple-Mail-3-62547150--



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?F61BB1C8-E979-4AEA-81C4-A570CE7A2AE8>