Date: Tue, 21 Apr 2009 09:13:35 +0400 From: =?UTF-8?B?0JzQtdC90YzRiNC40LrQvtCyINCa0L7QvdGB0YLQsNC90YLQuNC9?= <k.menshikov@peterhost.ru> To: freebsd-hackers@freebsd.org Subject: CPU limit for Jails(patch for ULE scheduler) Message-ID: <49ED55FF.5080306@peterhost.ru>
next in thread | raw e-mail | index | archive | help
[-- Attachment #1 --] Hello all! Many users want to have resource limits for jails, for example CPU and memory limits. I have rewritten the original cdjones patch for the jail CPU limit for the ULE scheduler. It works simply. We count CPU usage for all jails, and if a jail uses more CPU than its CPU share allows, we move its threads to the IDLE queue, and return them to TIMESHARE in the reverse case. A jailed thread can use all available CPU time if the system has CPU available. If the system is under heavy load, a jailed thread cannot use the CPU as long as the ratio (CPU shares for the jail / all CPU shares) < (estimated CPU usage of the jail / total CPU usage). Unjailed threads and interactive threads are not subject to this regime. The patch adds 2 sysctls: kern.sched.total_sched_shares - total number of CPU shares in the system, increase it if you have more CPUs; kern.sched.flush_estcpu_interval - estcpu flush interval in ticks, default is 2560 = 2 * 128 * 10, NCPU*stathz*sec, increase it if you have more CPUs. To use the CPU limit, you need to pass the flag -S NSharedCPU to the /usr/sbin/jail program. My example: jail -S100 /usr/jails/root/ root.kostjn.pht 192.168.0.245 /bin/csh I tested this with 10 simultaneous processes in the jail and in the main system. The test program is an infinite loop, run on an 8-core Xeon, using RELENG_7. The processes were started first in the jail, and afterwards in the main system. 
Here is the CPU usage tracked for one process. Jail root 1052 0.0 0.0 3692 784 p1 RJ 7:38PM 0:00.39 /test.o root 1052 21.2 0.0 3692 784 p1 RJ 7:38PM 0:02.40 /test.o root 1052 35.6 0.0 3692 784 p1 RJ 7:38PM 0:04.40 /test.o root 1052 47.5 0.0 3692 784 p1 RJ 7:38PM 0:06.41 /test.o root 1052 39.9 0.0 3692 784 p1 RJ 7:38PM 0:06.62 /test.o root 1052 33.2 0.0 3692 784 p1 RJ 7:38PM 0:06.62 /test.o root 1052 27.6 0.0 3692 784 p1 RJ 7:38PM 0:06.62 /test.o root 1052 22.9 0.0 3692 784 p1 RJ 7:38PM 0:06.62 /test.o root 1052 19.0 0.0 3692 784 p1 RJ 7:38PM 0:06.62 /test.o root 1052 15.8 0.0 3692 784 p1 RJ 7:38PM 0:06.62 /test.o root 1052 13.0 0.0 3692 784 p1 RJ 7:38PM 0:06.62 /test.o root 1052 10.8 0.0 3692 784 p1 RJ 7:38PM 0:06.62 /test.o root 1052 8.9 0.0 3692 784 p1 RJ 7:38PM 0:06.62 /tes Main system root 1088 14.9 0.0 3692 780 p0 R 7:38PM 0:01.57 /root/test.o root 1088 30.8 0.0 3692 780 p0 R 7:38PM 0:03.60 /root/test.o root 1088 43.8 0.0 3692 780 p0 R 7:38PM 0:05.60 /root/test.o root 1088 51.0 0.0 3692 780 p0 R 7:38PM 0:07.25 /root/test.o root 1088 50.8 0.0 3692 780 p0 R 7:38PM 0:08.28 /root/test.o root 1088 49.1 0.0 3692 780 p0 R 7:38PM 0:09.21 /root/test.o root 1088 48.1 0.0 3692 780 p0 R 7:38PM 0:10.24 /root/test.o root 1088 46.2 0.0 3692 780 p0 R 7:38PM 0:11.17 /root/test.o root 1088 42.9 0.0 3692 780 p0 R 7:38PM 0:11.95 /root/test.o So we see that once the processes are started in the main system, the jailed process can no longer use the CPU. Please let me know about any problems in this patch. This is an initial version, without the ability to tune jail parameters at runtime. So, this works, but I am not sure that it is the best way. Attempting to raise the (numerical) priority value of jailed threads does not work, because non-interactive threads (which use a lot of CPU) already have a low priority (numerically high). Attempting to decrease the number of ticks in the CPU time slice is also not a good idea, because this increases the number of context switches under high load. Maybe you see another way to do this? Please share your ideas. Thanks. 
Original cdjones cpu and memory limit patch http://wiki.freebsd.org/JailResourceLimits [-- Attachment #2 --] diff -U3 -r --show-c-function --ignore-all-space --ignore-tab-expansion --ignore-blank-lines sys/kern/kern_jail.c sys.new/kern/kern_jail.c --- sys/kern/kern_jail.c 2009-03-10 22:33:50.000000000 +0300 +++ sys.new/kern/kern_jail.c 2009-04-17 18:51:34.000000000 +0400 @@ -531,6 +532,7 @@ kern_jail(struct thread *td, struct jail } #endif pr->pr_linux = NULL; + pr->pr_sched_shares = j->sched_shares; pr->pr_securelevel = securelevel; if (prison_service_slots == 0) pr->pr_slots = NULL; diff -U3 -r --show-c-function --ignore-all-space --ignore-tab-expansion --ignore-blank-lines sys/kern/sched_ule.c sys.new/kern/sched_ule.c --- sys/kern/sched_ule.c 2009-03-30 23:20:56.000000000 +0400 +++ sys.new/kern/sched_ule.c 2009-04-17 19:10:07.000000000 +0400 @@ -61,6 +61,7 @@ __FBSDID("$FreeBSD: src/sys/kern/sched_u #include <sys/umtx.h> #include <sys/vmmeter.h> #include <sys/cpuset.h> +#include <sys/jail.h> #ifdef KTRACE #include <sys/uio.h> #include <sys/ktrace.h> @@ -186,6 +187,22 @@ static int sched_interact = SCHED_INTERA static int realstathz; static int tickincr; static int sched_slice; + +#define ESTCPU_SHIFT 10 +/* + * estcpu: Global counter ticks from stat timer + * flush_estcpu_interval: Number ticks, after that we to zero estcpu, + * flush_estcpu_interval = mp_ncpus*stathz*10, + * default 2*128*10 = 2560 + * total_sched_shares: Total count shares cpu, 1000 per core, + * default 2*1000 = 2000 +*/ + + +static int estcpu; +static int flush_estcpu_interval = 2560; +static int total_sched_shares = 2000; + #ifdef PREEMPTION #ifdef FULL_PREEMPTION static int preempt_thresh = PRI_MAX_IDLE; @@ -2200,6 +2219,7 @@ sched_clock(struct thread *td) { struct tdq *tdq; struct td_sched *ts; + struct prison *pr = td->td_proc->p_ucred->cr_prison; THREAD_LOCK_ASSERT(td, MA_OWNED); tdq = TDQ_SELF(); @@ -2234,6 +2254,20 @@ sched_clock(struct thread *td) td->td_sched->ts_runtime += 
tickincr; sched_interact_update(td); } + + /* Increase counter and flush if need */ + estcpu++; + if (pr != NULL) + pr->pr_estcpu++; + + if (estcpu > flush_estcpu_interval){ + estcpu = 0; + LIST_FOREACH(pr, &allprison, pr_list) { + pr->pr_estcpu = 0; + } + CTR0(KTR_SCHED,"Flush estcpu and pr_estcpu for all jails"); + } + /* * We used up one time slice. */ @@ -2375,6 +2409,8 @@ tdq_add(struct tdq *tdq, struct thread * int cpumask; #endif + struct prison *pr = td->td_proc->p_ucred->cr_prison; + TDQ_LOCK_ASSERT(tdq, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("sched_add: trying to run inhibited thread")); @@ -2383,6 +2419,32 @@ tdq_add(struct tdq *tdq, struct thread * KASSERT(td->td_flags & TDF_INMEM, ("sched_add: thread swapped out")); + /* We move thread in IDLE queue if prison estimate cpu more than shares + * cpu and thread is not interactive. Use ESTCPU_SHIFT to avoid + * rounding away results */ + if(pr != NULL) + CTR6(KTR_SCHED,"pid %i, prison %i, pr_estcpu %i,\ + estcpu %i shares %i interact %i", + td->td_proc->p_pid,pr->pr_id,pr->pr_estcpu, + estcpu, pr->pr_sched_shares, sched_interact_score(td)); + if (pr != NULL && pr->pr_sched_shares != 0 && + sched_interact_score(td) > sched_interact && + estcpu != 0 && total_sched_shares != 0){ + + if ((pr->pr_estcpu << ESTCPU_SHIFT) / (estcpu) > + (pr->pr_sched_shares << ESTCPU_SHIFT) / (total_sched_shares)) + { + td->td_priority = PRI_MIN_IDLE; + td->td_pri_class = PRI_IDLE; + CTR2(KTR_SCHED,"prison %i excess cpu limit!!! 
new pri = %i ",pr->pr_id,td->td_priority); + + } else { + CTR1(KTR_SCHED,"prison %i use cpu less limit",pr->pr_id); + sched_priority(td); + td->td_pri_class = PRI_TIMESHARE; + } + } + ts = td->td_sched; class = PRI_BASE(td->td_pri_class); TD_SET_RUNQ(td); @@ -2746,6 +2808,10 @@ SYSCTL_INT(_kern_sched, OID_AUTO, intera "Interactivity score threshold"); SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh, 0,"Min priority for preemption, lower priorities have greater precedence"); +SYSCTL_INT(_kern_sched, OID_AUTO, flush_estcpu_interval, CTLFLAG_RW, &flush_estcpu_interval, + 0,"Number ticks stat timer after thar we zero estcpu counter"); +SYSCTL_INT(_kern_sched, OID_AUTO, total_sched_shares, CTLFLAG_RW, &total_sched_shares, + 0,"Total number shared cpu for system"); #ifdef SMP SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri, CTLFLAG_RW, &pick_pri, 0, "Pick the target cpu based on priority rather than load."); diff -U3 -r --show-c-function --ignore-all-space --ignore-tab-expansion --ignore-blank-lines sys/sys/jail.h sys.new/sys/jail.h --- sys/sys/jail.h 2009-02-18 23:12:08.000000000 +0300 +++ sys.new/sys/jail.h 2009-04-17 18:53:43.000000000 +0400 @@ -31,6 +31,7 @@ struct jail { uint32_t ip6s; struct in_addr *ip4; struct in6_addr *ip6; + uint32_t sched_shares; }; #define JAIL_API_VERSION 2 @@ -132,6 +133,9 @@ struct prison { struct task pr_task; /* (d) destroy task */ struct mtx pr_mtx; void **pr_slots; /* (p) additional data */ + uint32_t pr_estcpu; /* (p) cpu usage */ + uint32_t pr_sched_shares; /* (c) number virtual cpu */ + int pr_ip4s; /* (c) number of v4 IPs */ struct in_addr *pr_ip4; /* (c) v4 IPs of jail */ int pr_ip6s; /* (c) number of v6 IPs */ diff -U3 -r --show-c-function --ignore-all-space --ignore-tab-expansion --ignore-blank-lines usr.sbin/jail/jail.c usr.sbin.new/jail/jail.c --- usr.sbin/jail/jail.c 2009-02-07 16:19:08.000000000 +0300 +++ usr.sbin.new/jail/jail.c 2009-04-17 18:57:15.000000000 +0400 @@ -83,6 +83,7 @@ main(int argc, 
char **argv) int ch, error, i, ngroups, securelevel; int hflag, iflag, Jflag, lflag, uflag, Uflag; char path[PATH_MAX], *jailname, *ep, *username, *JidFile, *ip; + uint32_t sched_shares = 0; static char *cleanenv; const char *shell, *p = NULL; long ltmp; @@ -94,7 +95,7 @@ main(int argc, char **argv) jailname = username = JidFile = cleanenv = NULL; fp = NULL; - while ((ch = getopt(argc, argv, "hiln:s:u:U:J:")) != -1) { + while ((ch = getopt(argc, argv, "hilS:n:s:u:U:J:")) != -1) { switch (ch) { case 'h': hflag = 1; @@ -115,6 +116,9 @@ main(int argc, char **argv) errx(1, "invalid securelevel: `%s'", optarg); securelevel = ltmp; break; + case 'S': + sched_shares = (uint32_t)strtol(optarg,NULL,10); + break; case 'u': username = optarg; uflag = 1; @@ -152,6 +156,8 @@ main(int argc, char **argv) if (jailname != NULL) j.jailname = jailname; + j.sched_shares = sched_shares; + /* Handle IP addresses. If requested resolve hostname too. */ bzero(&hints, sizeof(struct addrinfo)); hints.ai_protocol = IPPROTO_TCP; @@ -264,9 +270,10 @@ static void usage(void) { - (void)fprintf(stderr, "%s%s%s\n", + (void)fprintf(stderr, "%s%s%s%s\n", "usage: jail [-hi] [-n jailname] [-J jid_file] ", "[-s securelevel] [-l -u username | -U username] ", + "[-S number shared cpu] ", "path hostname [ip[,..]] command ..."); exit(1); } [-- Attachment #3 --] diff -U3 -r --show-c-function --ignore-all-space --ignore-tab-expansion --ignore-blank-lines sys/kern/kern_jail.c sys.new/kern/kern_jail.c --- sys/kern/kern_jail.c 2008-11-25 05:59:29.000000000 +0300 +++ sys.new/kern/kern_jail.c 2009-04-17 20:23:40.000000000 +0400 @@ -156,6 +156,7 @@ jail(struct thread *td, struct jail_args goto e_dropvnref; pr->pr_ip = j.ip_number; pr->pr_linux = NULL; + pr->pr_sched_shares = j->sched_shares; pr->pr_securelevel = securelevel; if (prison_service_slots == 0) pr->pr_slots = NULL; diff -U3 -r --show-c-function --ignore-all-space --ignore-tab-expansion --ignore-blank-lines sys/kern/sched_ule.c sys.new/kern/sched_ule.c 
--- sys/kern/sched_ule.c 2008-11-25 05:59:29.000000000 +0300 +++ sys.new/kern/sched_ule.c 2009-04-17 20:23:40.000000000 +0400 @@ -61,6 +61,7 @@ __FBSDID("$FreeBSD: src/sys/kern/sched_u #include <sys/umtx.h> #include <sys/vmmeter.h> #include <sys/cpuset.h> +#include <sys/jail.h> #ifdef KTRACE #include <sys/uio.h> #include <sys/ktrace.h> @@ -186,6 +187,22 @@ static int sched_interact = SCHED_INTERA static int realstathz; static int tickincr; static int sched_slice; + +#define ESTCPU_SHIFT 10 +/* + * estcpu: Global counter ticks from stat timer + * flush_estcpu_interval: Number ticks, after that we to zero estcpu, + * flush_estcpu_interval = mp_ncpus*stathz*10, + * default 2*128*10 = 2560 + * total_sched_shares: Total count shares cpu, 1000 per core, + * default 2*1000 = 2000 +*/ + + +static int estcpu; +static int flush_estcpu_interval = 2560; +static int total_sched_shares = 2000; + #ifdef PREEMPTION #ifdef FULL_PREEMPTION static int preempt_thresh = PRI_MAX_IDLE; @@ -2200,6 +2217,7 @@ sched_clock(struct thread *td) { struct tdq *tdq; struct td_sched *ts; + struct prison *pr = td->td_proc->p_ucred->cr_prison; THREAD_LOCK_ASSERT(td, MA_OWNED); tdq = TDQ_SELF(); @@ -2234,6 +2252,20 @@ sched_clock(struct thread *td) td->td_sched->ts_runtime += tickincr; sched_interact_update(td); } + + /* Increase counter and flush if need */ + estcpu++; + if (pr != NULL) + pr->pr_estcpu++; + + if (estcpu > flush_estcpu_interval){ + estcpu = 0; + LIST_FOREACH(pr, &allprison, pr_list) { + pr->pr_estcpu = 0; + } + CTR0(KTR_SCHED,"Flush estcpu and pr_estcpu for all jails"); + } + /* * We used up one time slice. 
*/ @@ -2375,6 +2407,8 @@ tdq_add(struct tdq *tdq, struct thread * int cpumask; #endif + struct prison *pr = td->td_proc->p_ucred->cr_prison; + TDQ_LOCK_ASSERT(tdq, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("sched_add: trying to run inhibited thread")); @@ -2383,6 +2417,32 @@ tdq_add(struct tdq *tdq, struct thread * KASSERT(td->td_flags & TDF_INMEM, ("sched_add: thread swapped out")); + /* We move thread in IDLE queue if prison estimate cpu more than shares + * cpu and thread is not interactive. Use ESTCPU_SHIFT to avoid + * rounding away results */ + if(pr != NULL) + CTR6(KTR_SCHED,"pid %i, prison %i, pr_estcpu %i,\ + estcpu %i shares %i interact %i", + td->td_proc->p_pid,pr->pr_id,pr->pr_estcpu, + estcpu, pr->pr_sched_shares, sched_interact_score(td)); + if (pr != NULL && pr->pr_sched_shares != 0 && + sched_interact_score(td) > sched_interact && + estcpu != 0 && total_sched_shares != 0){ + + if ((pr->pr_estcpu << ESTCPU_SHIFT) / (estcpu) > + (pr->pr_sched_shares << ESTCPU_SHIFT) / (total_sched_shares)) + { + td->td_priority = PRI_MIN_IDLE; + td->td_pri_class = PRI_IDLE; + CTR2(KTR_SCHED,"prison %i excess cpu limit!!! 
new pri = %i ",pr->pr_id,td->td_priority); + + } else { + CTR1(KTR_SCHED,"prison %i use cpu less limit",pr->pr_id); + sched_priority(td); + td->td_pri_class = PRI_TIMESHARE; + } + } + ts = td->td_sched; class = PRI_BASE(td->td_pri_class); TD_SET_RUNQ(td); @@ -2741,6 +2801,10 @@ SYSCTL_INT(_kern_sched, OID_AUTO, intera "Interactivity score threshold"); SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh, 0,"Min priority for preemption, lower priorities have greater precedence"); +SYSCTL_INT(_kern_sched, OID_AUTO, flush_estcpu_interval, CTLFLAG_RW, &flush_estcpu_interval, + 0,"Number ticks stat timer after thar we zero estcpu counter"); +SYSCTL_INT(_kern_sched, OID_AUTO, total_sched_shares, CTLFLAG_RW, &total_sched_shares, + 0,"Total number shared cpu for system"); #ifdef SMP SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri, CTLFLAG_RW, &pick_pri, 0, "Pick the target cpu based on priority rather than load."); diff -U3 -r --show-c-function --ignore-all-space --ignore-tab-expansion --ignore-blank-lines sys/sys/jail.h sys.new/sys/jail.h --- sys/sys/jail.h 2008-11-25 05:59:29.000000000 +0300 +++ sys.new/sys/jail.h 2009-04-17 20:26:54.000000000 +0400 @@ -18,6 +18,7 @@ struct jail { char *path; char *hostname; u_int32_t ip_number; + uint32_t sched_shares; }; struct xprison { @@ -74,6 +75,8 @@ struct prison { struct task pr_task; /* (d) destroy task */ struct mtx pr_mtx; void **pr_slots; /* (p) additional data */ + uint32_t pr_estcpu; /* (p) cpu usage */ + uint32_t pr_sched_shares; /* (c) number virtual cpu */ }; #endif /* _KERNEL || _WANT_PRISON */ diff -U3 -r --show-c-function --ignore-all-space --ignore-tab-expansion --ignore-blank-lines usr.sbin/jail/jail.c usr.sbin.new/jail/jail.c --- usr.sbin/jail/jail.c 2008-11-25 05:59:29.000000000 +0300 +++ usr.sbin.new/jail/jail.c 2009-04-17 20:31:17.000000000 +0400 @@ -57,6 +57,7 @@ main(int argc, char **argv) gid_t groups[NGROUPS]; int ch, i, iflag, Jflag, lflag, ngroups, securelevel, uflag, Uflag; char 
path[PATH_MAX], *ep, *username, *JidFile; + uint32_t sched_shares = 0; static char *cleanenv; const char *shell, *p = NULL; long ltmp; @@ -67,7 +68,7 @@ main(int argc, char **argv) username = JidFile = cleanenv = NULL; fp = NULL; - while ((ch = getopt(argc, argv, "ils:u:U:J:")) != -1) { + while ((ch = getopt(argc, argv, "ilS:s:u:U:J:")) != -1) { switch (ch) { case 'i': iflag = 1; @@ -82,6 +83,9 @@ main(int argc, char **argv) errx(1, "invalid securelevel: `%s'", optarg); securelevel = ltmp; break; + case 'S': + sched_shares = (uint32_t)strtol(optarg,NULL,10); + break; case 'u': username = optarg; uflag = 1; @@ -115,6 +119,7 @@ main(int argc, char **argv) j.version = 0; j.path = path; j.hostname = argv[1]; + j.sched_shares = sched_shares; if (inet_aton(argv[2], &in) == 0) errx(1, "Could not make sense of ip-number: %s", argv[2]); j.ip_number = ntohl(in.s_addr); @@ -182,9 +187,10 @@ static void usage(void) { - (void)fprintf(stderr, "%s%s%s\n", + (void)fprintf(stderr, "%s%s%s%s\n", "usage: jail [-i] [-J jid_file] [-s securelevel] [-l -u ", "username | -U username]", + "[-S number shared cpu] ", " path hostname ip-number command ..."); exit(1); }
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?49ED55FF.5080306>
