Date: Wed, 30 May 2007 12:53:30 +1000 (EST) From: Bruce Evans <brde@optusnet.com.au> To: Jeff Roberson <jroberson@chesapeake.net> Cc: arch@FreeBSD.org Subject: Re: initial rusage patch. Message-ID: <20070530094538.D11725@besplex.bde.org> In-Reply-To: <20070529152059.Y661@10.0.0.1> References: <20070529152059.Y661@10.0.0.1>
next in thread | previous in thread | raw e-mail | index | archive | help
On Tue, 29 May 2007, Jeff Roberson wrote: > http://people.freebsd.org/~jeff/rusage3.diff % Index: compat/svr4/svr4_misc.c % =================================================================== % RCS file: /usr/home/ncvs/src/sys/compat/svr4/svr4_misc.c,v % retrieving revision 1.92 % diff -u -p -r1.92 svr4_misc.c % --- compat/svr4/svr4_misc.c 18 May 2007 07:10:44 -0000 1.92 % +++ compat/svr4/svr4_misc.c 29 May 2007 14:42:19 -0000 % @@ -1238,7 +1238,7 @@ loop: % /* Found a zombie, so cache info in local variables. */ % pid = p->p_pid; % status = p->p_xstat; % - ru = *p->p_ru; % + rufetch(p, &ru); % calcru(p, &ru.ru_utime, &ru.ru_stime); % PROC_UNLOCK(p); % sx_sunlock(&proctree_lock); This and similar changes seem to be wrong. *p->p_ru is where things have been accumulated already. At this point (with a zombie in wait*()), I think all of the threads have gone away so rufetch() will find nothing. Accumulation into *p->p_ru must occur at thread exit time, and wait*() doesn't need changing to access it. The non-compat wait*() does this correctly. % Index: kern/kern_acct.c % =================================================================== % RCS file: /usr/home/ncvs/src/sys/kern/kern_acct.c,v % retrieving revision 1.89 % diff -u -p -r1.89 kern_acct.c % --- kern/kern_acct.c 22 May 2007 06:51:37 -0000 1.89 % +++ kern/kern_acct.c 29 May 2007 14:43:21 -0000 % @@ -383,19 +383,19 @@ acct_process(struct thread *td) % acct.ac_etime = encode_timeval(tmp); % % /* (4) The average amount of memory used */ % - r = &p->p_stats->p_ru; % + rufetch(p, &ru); Should be same null change as as for wait*()? If not, an rufetch() call is probably needed before the calcru() in (2). % Index: kern/kern_clock.c % =================================================================== % RCS file: /usr/home/ncvs/src/sys/kern/kern_clock.c,v % retrieving revision 1.198 % diff -u -p -r1.198 kern_clock.c % --- kern/kern_clock.c 28 May 2007 21:50:54 -0000 1.198 % +++ kern/kern_clock.c 29 May 2007 13:15:14 -0000 % @@ -396,8 +396,8 @@ stopprofclock(p) % /* % * Statistics clock. Grab profile sample, and if divider reaches 0, % * do process and kernel statistics. Most of the statistics are only % - * used by user-level statistics programs. The main exceptions are % - * ke->ke_uticks, p->p_rux.rux_sticks, p->p_rux.rux_iticks, and p->p_estcpu. % + * used by user-level statistics programs. % + * % * This should be called by all active processors. % */ % void Better just remove third sentence. Statistics involving tick counters are still done here, and now just not in p_rux. Kernel-only statistics not mentioned in the comment were always done here. p_estcpu rotted earlier. It is now spelled td_estcpu and is only maintained by SCHED_4BSD. It is still spelled p_estcpu in comments near ttyinfo() and supposed to be maintained by schedulers for sorting in ttyinfo(). The part of the comment about the divider rotted even earlier so it should be removed too. % Index: kern/kern_exit.c % =================================================================== % RCS file: /usr/home/ncvs/src/sys/kern/kern_exit.c,v % retrieving revision 1.298 % diff -u -p -r1.298 kern_exit.c % --- kern/kern_exit.c 14 May 2007 22:21:58 -0000 1.298 % +++ kern/kern_exit.c 29 May 2007 15:02:48 -0000 % @@ -229,7 +229,7 @@ retry: % */ % EVENTHANDLER_INVOKE(process_exit, p); % % - MALLOC(p->p_ru, struct rusage *, sizeof(struct rusage), % + MALLOC(p->p_exitru, struct rusage *, sizeof(struct rusage), % M_ZOMBIE, M_WAITOK); % /* % * If parent is waiting for us to exit or exec, This is a better name, but more churn. % @@ -445,8 +445,8 @@ retry: % PROC_LOCK(p); % p->p_xstat = rv; % p->p_xthread = td; % - p->p_stats->p_ru.ru_nvcsw++; % - *p->p_ru = p->p_stats->p_ru; % + rufetch(p, p->p_exitru); % + p->p_exitru->ru_nvcsw++; % % /* % * Notify interested parties of our demise. This fixes 2 races (needed sched_lock here). If possible this should be moved later, together with the times finalization in thread_exit(), to pick up any changes that occur during exit. % @@ -721,7 +721,7 @@ loop: % if (status) % *status = p->p_xstat; /* convert to int */ % if (rusage) { % - *rusage = *p->p_ru; % + *rusage = *p->p_exitru; % calcru(p, &rusage->ru_utime, &rusage->ru_stime); % } % This is correct code for wait*() -- don't call rufetch(), but just use the result of the rufetch() to p_exitru. % Index: kern/kern_resource.c % =================================================================== % RCS file: /usr/home/ncvs/src/sys/kern/kern_resource.c,v % retrieving revision 1.171 % diff -u -p -r1.171 kern_resource.c % --- kern/kern_resource.c 27 May 2007 20:50:23 -0000 1.171 % +++ kern/kern_resource.c 29 May 2007 15:39:33 -0000 % @@ -802,10 +802,12 @@ calcru(struct proc *p, struct timeval *u % * We reset the thread and CPU state as if we had performed a context % * switch right here. % */ % - if (curthread->td_proc == p) { % - td = curthread; % + td = curthread; % + if (td->td_proc == p) { % u = cpu_ticks(); % p->p_rux.rux_runtime += u - PCPU_GET(switchtime); % + p->p_rux.rux_runtime += td->td_runtime; % + td->td_runtime = 0; % PCPU_SET(switchtime, u); % p->p_rux.rux_uticks += td->td_uticks; % td->td_uticks = 0; The old code here was broken in rev.1.153-1.154. This function (calcru()) is called quite often on non-current threads (mainly from fill_kinfo*()). That used to be handled by doing read-only accesses to the times on the other CPUs and non-current threads. I think the tick counts were only per-process before 1.153, so no special handling was needed for them. Now it is even more necessary to look at times on other CPUs, since rux_runtime is missing not only the current timeslice for threads running on other CPUs, but also the runtime accumulated in td_runtime but not yet copied to rux_runtime for all threads in the process other than curthread for the current CPU. Something like rufetch() for the times and tick counts only is needed here. % @@ -950,22 +952,21 @@ kern_getrusage(td, who, rup) % } % % void % -ruadd(ru, rux, ru2, rux2) % - struct rusage *ru; % - struct rusage_ext *rux; % - struct rusage *ru2; % - struct rusage_ext *rux2; % -{ % - register long *ip, *ip2; % - register int i; % - % - rux->rux_runtime += rux2->rux_runtime; % - rux->rux_uticks += rux2->rux_uticks; % - rux->rux_sticks += rux2->rux_sticks; % - rux->rux_iticks += rux2->rux_iticks; % - rux->rux_uu += rux2->rux_uu; % - rux->rux_su += rux2->rux_su; % - rux->rux_tu += rux2->rux_tu; % +ruadd(struct rusage *ru, struct rusage_ext *rux, struct rusage *ru2, % + struct rusage_ext *rux2) % +{ % + long *ip, *ip2; % + int i; % + % + if (rux) { % + rux->rux_runtime += rux2->rux_runtime; % + rux->rux_uticks += rux2->rux_uticks; % + rux->rux_sticks += rux2->rux_sticks; % + rux->rux_iticks += rux2->rux_iticks; % + rux->rux_uu += rux2->rux_uu; % + rux->rux_su += rux2->rux_su; % + rux->rux_tu += rux2->rux_tu; % + } % if (ru->ru_maxrss < ru2->ru_maxrss) % ru->ru_maxrss = ru2->ru_maxrss; % ip = &ru->ru_first; Unrelated style changes. Style bug checking (rux != NULL). I don't like the interface chaange. % @@ -975,6 +976,33 @@ ruadd(ru, rux, ru2, rux2) % } % % /* % + * Update the rusage_ext structure and fetch a valid aggregate rusage % + * for proc p if storage for one is supplied. % + */ % +void % +rufetch(struct proc *p, struct rusage *ru) ru should be spelled rup when it is a pointer. % +{ % + struct thread *td; % + % + if (ru) % + memset(ru, 0, sizeof(*ru)); I think (ru != NULL) shouldn't be supported and doesn't occur in this patch. The pointer is always &foo or p->p_exitru. % + mtx_lock_spin(&sched_lock); % + FOREACH_THREAD_IN_PROC(p, td) { % + p->p_rux.rux_runtime += td->td_runtime; % + td->td_runtime = 0; % + p->p_rux.rux_uticks += td->td_uticks; % + td->td_uticks = 0; % + p->p_rux.rux_iticks += td->td_iticks; % + td->td_iticks = 0; % + p->p_rux.rux_sticks += td->td_sticks; % + td->td_sticks = 0; % + if (ru) % + ruadd(ru, NULL, &td->td_ru, NULL); % + } % + mtx_unlock_spin(&sched_lock); % +} This is unaware of other CPUs and the per-CPU data switchticks and switchtime on them, but probably should be. Some callers will get stale times by not adding in the delta from switchtime to the current time on all CPUs. These callers also typically do: rufetch(p, rup); calcru(p, &rup->ru_utime, &rup->ru_stime); This gives a lot of duplication and some races. rufetch() adds in the times for other threads, so problems from not doing this addition in calcru() are reduced. calcru() repeats most of the code in the above loop for curthread only. calcru() handles switchtime etc. where the above doesn't, but again for curthread only. Races are caused by dropping the lock between rufetch() and calcru(). It's bogus that rufetch() copies p's rusage to *rup and then we use p's rusage but not not *rup in the calcru() call. Calcru() also uses rusage_ext which belongs only to p. These scatterings of the data give unnecessary races. For wait*() the races probably don't occur because p's rusage is not updated after we copy it, but for getrusage() it results in the times being slightly more up to date than the rest of the results. This bogusness is mostly old. % Index: sys/proc.h % =================================================================== % RCS file: /usr/home/ncvs/src/sys/sys/proc.h,v % retrieving revision 1.477 % diff -u -p -r1.477 proc.h % --- sys/proc.h 24 Apr 2007 10:59:21 -0000 1.477 % +++ sys/proc.h 29 May 2007 15:14:31 -0000 % @@ -49,6 +49,7 @@ % #include <sys/priority.h> % #include <sys/rtprio.h> /* XXX. */ % #include <sys/runq.h> % +#include <sys/resource.h> More namespace pollution. rusage_ext exists partly to avoid this include. The include of rtprio is XXXed because it is pollution. % @@ -255,10 +256,12 @@ struct thread { % struct kse_upcall *td_upcall; /* (k + j) Upcall structure. */ % u_int td_estcpu; /* (j) Sum of the same field in KSEs. */ % u_int td_slptime; /* (j) How long completely blocked. */ % - u_int td_pticks; /* (k) Statclock hits for profiling */ % - u_int td_sticks; /* (k) Statclock hits in system mode. */ % - u_int td_iticks; /* (k) Statclock hits in intr mode. */ % - u_int td_uticks; /* (k) Statclock hits in user mode. */ % + struct rusage td_ru; /* (j) rusage information */ % + uint64_t td_runtime; /* (j) How many cpu ticks we've run. */ % + uint64_t td_pticks; /* (j) Statclock hits for profiling */ % + uint64_t td_sticks; /* (j) Statclock hits in system mode. */ % + uint64_t td_iticks; /* (j) Statclock hits in intr mode. */ % + u_int td_uticks; /* (j) Statclock hits in user mode. */ u_int is the the correct type for these tick counters. This is enough for 388 days worth of ticks unless stathz is broken (> 128 Hz). 64-bis never actually worked even for the accumulated tick counters, since the algorithms in calcru() depend on them only using 32 bits for things like 32x32 -> 64 bit multiplications to not overflow. Thus rusage times never worked for processes with more than 388 days user or sys time. It is easy to arrange for accumulation of the per-thread counters more often than every 388 days so that they don't overflow. % Index: ufs/ufs/ufs_bmap.c % =================================================================== % RCS file: /usr/home/ncvs/src/sys/ufs/ufs/ufs_bmap.c,v % retrieving revision 1.65 % diff -u -p -r1.65 ufs_bmap.c % --- ufs/ufs/ufs_bmap.c 25 Oct 2005 19:46:15 -0000 1.65 % +++ ufs/ufs/ufs_bmap.c 29 May 2007 14:21:47 -0000 % @@ -226,7 +226,7 @@ ufs_bmaparray(vp, bn, bnp, nbp, runp, ru % vfs_busy_pages(bp, 0); % bp->b_iooffset = dbtob(bp->b_blkno); % bstrategy(bp); % - curproc->p_stats->p_ru.ru_inblock++; /* XXX */ % + curthread->td_ru.ru_inblock++; % error = bufwait(bp); % if (error) { % brelse(bp); Was the XXX because it was locked by (n) (nothing)? Is it locked now? The XXX was cloned to ext2fs but was not attached to most accesses to ru_inblock. % Index: vm/vm_fault.c % =================================================================== % RCS file: /usr/home/ncvs/src/sys/vm/vm_fault.c,v % retrieving revision 1.228 % diff -u -p -r1.228 vm_fault.c % --- vm/vm_fault.c 22 May 2007 04:45:59 -0000 1.228 % +++ vm/vm_fault.c 29 May 2007 15:21:57 -0000 % @@ -918,15 +918,10 @@ readrest: % * Unlock everything, and return % */ % unlock_and_deallocate(&fs); % - PROC_LOCK(curproc); % - if ((curproc->p_sflag & PS_INMEM) && curproc->p_stats) { % - if (hardfault) { % - curproc->p_stats->p_ru.ru_majflt++; % - } else { % - curproc->p_stats->p_ru.ru_minflt++; % - } % - } % - PROC_UNLOCK(curproc); % + if (hardfault) % + curthread->td_ru.ru_majflt++; % + else % + curthread->td_ru.ru_minflt++; % % return (KERN_SUCCESS); % } What locks these now? The locking legend still says (c) (proc). Bruce
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20070530094538.D11725>