Date: Thu, 4 Sep 2025 18:59:58 GMT
From: Jamie Gritton <jamie@FreeBSD.org>
To: src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org
Subject: git: 1bd74d201a53 - main - jail: add kqueue(2) support for jails
Message-ID: <202509041859.584Ixw2G014955@gitrepo.freebsd.org>
next in thread | raw e-mail | index | archive | help
The branch main has been updated by jamie: URL: https://cgit.FreeBSD.org/src/commit/?id=1bd74d201a534540614663686890ab96a3bbe2c7 commit 1bd74d201a534540614663686890ab96a3bbe2c7 Author: Jamie Gritton <jamie@FreeBSD.org> AuthorDate: 2025-09-04 18:56:56 +0000 Commit: Jamie Gritton <jamie@FreeBSD.org> CommitDate: 2025-09-04 18:56:56 +0000 jail: add kqueue(2) support for jails Add kqueue tracking to jails, inspired by how it's done with processes. EVFILT_JAIL takes a jail ID, and tracks with NOTE_JAIL_SET, NOTE_JAIL_ATTACH, NOTE_JAIL_REMOVE, and NOTE_JAIL_CHILD. It also uses the NOTE_TRACK mechanism that EVFILT_PROC uses, using the same result flags (NOTE_CHILD and NOTE_TRACKERR). Relnotes: yes Differential Revision: https://reviews.freebsd.org/D51940 --- lib/libsys/kqueue.2 | 58 ++++++++++++++++++++- sys/kern/kern_event.c | 139 ++++++++++++++++++++++++++++++++++++++++++++++++-- sys/kern/kern_jail.c | 64 ++++++++++++++++++++--- sys/sys/event.h | 19 +++++-- sys/sys/jail.h | 7 ++- 5 files changed, 270 insertions(+), 17 deletions(-) diff --git a/lib/libsys/kqueue.2 b/lib/libsys/kqueue.2 index d6e949baa24c..e413f7d4fbca 100644 --- a/lib/libsys/kqueue.2 +++ b/lib/libsys/kqueue.2 @@ -22,7 +22,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd March 26, 2023 +.Dd September 4, 2025 .Dt KQUEUE 2 .Os .Sh NAME @@ -593,6 +593,62 @@ returns the number of times the signal has occurred since the last call to This filter automatically sets the .Dv EV_CLEAR flag internally. +.It Dv EVFILT_JAIL +Takes the jail ID to monitor as the identifier and the events to watch for +in +.Va fflags , +and returns when the jail performs one or more of the requested events. +If a process can normally see a jail, it can attach an event to it. +An identifier of zero will watch the process's own jail. +The events to monitor are: +.Bl -tag -width "Dv NOTE_JAIL_ATTACH" +.It Dv NOTE_JAIL_SET +The jail has been changed via +.Xr jail_set 2 . 
+.It Dv NOTE_JAIL_ATTACH +A process has attached to the jail via +.Xr jail_attach 2 +or a similar call. +The process ID will be stored in +.Va data . +If more than one process has attached since the last call to +.Fn kevent , +.Va data +will contain the most recently attached process ID, +with +.Dv NOTE_JAIL_ATTACH_MULTI +set in +.Va fflags . +.It Dv NOTE_JAIL_REMOVE +The jail has been removed. +.It Dv NOTE_JAIL_CHILD +A child of the watched jail has been created. +.It Dv NOTE_TRACK +Follow child jails created under this jail. +Register a new kevent to monitor the child jail using the same +.Va fflags +as the original event. +The child jail will signal an event with +.Dv NOTE_CHILD +set in +.Va fflags +and the parent JID in +.Va data . +.Pp +If registering a new kevent fails +.Pq usually due to resource limitations , +it will signal an event with +.Dv NOTE_TRACKERR +set in +.Va fflags , +and the child jail will not signal a +.Dv NOTE_CHILD +event. +.El +.Pp +On return, +.Va fflags +contains the events which triggered the filter. .It Dv EVFILT_TIMER Establishes an arbitrary timer identified by .Va ident . 
diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c index eb77a5064113..501adc151d44 100644 --- a/sys/kern/kern_event.c +++ b/sys/kern/kern_event.c @@ -50,6 +50,7 @@ #include <sys/filedesc.h> #include <sys/filio.h> #include <sys/fcntl.h> +#include <sys/jail.h> #include <sys/kthread.h> #include <sys/selinfo.h> #include <sys/queue.h> @@ -163,6 +164,9 @@ static int filt_kqueue(struct knote *kn, long hint); static int filt_procattach(struct knote *kn); static void filt_procdetach(struct knote *kn); static int filt_proc(struct knote *kn, long hint); +static int filt_jailattach(struct knote *kn); +static void filt_jaildetach(struct knote *kn); +static int filt_jail(struct knote *kn, long hint); static int filt_fileattach(struct knote *kn); static void filt_timerexpire(void *knx); static void filt_timerexpire_l(struct knote *kn, bool proc_locked); @@ -195,6 +199,12 @@ static const struct filterops proc_filtops = { .f_detach = filt_procdetach, .f_event = filt_proc, }; +static const struct filterops jail_filtops = { + .f_isfd = 0, + .f_attach = filt_jailattach, + .f_detach = filt_jaildetach, + .f_event = filt_jail, +}; static const struct filterops timer_filtops = { .f_isfd = 0, .f_attach = filt_timerattach, @@ -365,6 +375,7 @@ static struct { [~EVFILT_USER] = { &user_filtops, 1 }, [~EVFILT_SENDFILE] = { &null_filtops }, [~EVFILT_EMPTY] = { &file_filtops, 1 }, + [~EVFILT_JAIL] = { &jail_filtops, 1 }, }; /* @@ -528,7 +539,8 @@ filt_proc(struct knote *kn, long hint) * process forked. Additionally, for each knote attached to the * parent, check whether user wants to track the new process. If so * attach a new knote to it, and immediately report an event with the - * child's pid. + * child's pid. This is also called on jail creation, which is treated + * the same way by jail events. */ void knote_fork(struct knlist *list, int pid) @@ -555,6 +567,8 @@ knote_fork(struct knlist *list, int pid) /* * The same as knote(), activate the event. 
*/ + _Static_assert(NOTE_JAIL_CHILD == NOTE_FORK, + "NOTE_JAIL_CHILD should be the same as NOTE_FORK"); if ((kn->kn_sfflags & NOTE_TRACK) == 0) { if (kn->kn_fop->f_event(kn, NOTE_FORK)) KNOTE_ACTIVATE(kn, 1); @@ -614,6 +628,124 @@ knote_fork(struct knlist *list, int pid) } } +int +filt_jailattach(struct knote *kn) +{ + struct prison *pr; + bool immediate; + + immediate = false; + if (kn->kn_id == 0) { + /* Let jid=0 watch the current prison (including prison0). */ + pr = curthread->td_ucred->cr_prison; + mtx_lock(&pr->pr_mtx); + } else if (kn->kn_flags & (EV_FLAG1 | EV_FLAG2)) { + /* + * The kernel registers prisons before they are valid, + * so prison_find_child will fail. + */ + TAILQ_FOREACH(pr, &allprison, pr_list) { + if (pr->pr_id < kn->kn_id) + continue; + if (pr->pr_id > kn->kn_id) { + pr = NULL; + break; + } + mtx_lock(&pr->pr_mtx); + break; + } + if (pr == NULL) + return (ENOENT); + } else { + sx_slock(&allprison_lock); + pr = prison_find_child(curthread->td_ucred->cr_prison, + kn->kn_id); + sx_sunlock(&allprison_lock); + if (pr == NULL) + return (ENOENT); + if (!prison_isalive(pr)) { + mtx_unlock(&pr->pr_mtx); + return (ENOENT); + } + } + kn->kn_ptr.p_prison = pr; + kn->kn_flags |= EV_CLEAR; + + /* + * Internal flag indicating registration done by kernel for the + * purposes of getting a NOTE_CHILD notification. + */ + if (kn->kn_flags & EV_FLAG2) { + kn->kn_flags &= ~EV_FLAG2; + kn->kn_data = kn->kn_sdata; /* parent id */ + kn->kn_fflags = NOTE_CHILD; + kn->kn_sfflags &= ~NOTE_JAIL_CTRLMASK; + immediate = true; /* Force immediate activation of child note. */ + } + /* + * Internal flag indicating registration done by kernel (for other than + * NOTE_CHILD). + */ + if (kn->kn_flags & EV_FLAG1) { + kn->kn_flags &= ~EV_FLAG1; + } + + knlist_add(pr->pr_klist, kn, 1); + + /* Immediately activate any child notes. 
*/ + if (immediate) + KNOTE_ACTIVATE(kn, 0); + + mtx_unlock(&pr->pr_mtx); + return (0); +} + +void +filt_jaildetach(struct knote *kn) +{ + if (kn->kn_ptr.p_prison != NULL) { + knlist_remove(kn->kn_knlist, kn, 0); + kn->kn_ptr.p_prison = NULL; + } else + kn->kn_status |= KN_DETACHED; +} + +int +filt_jail(struct knote *kn, long hint) +{ + struct prison *pr; + u_int event; + + pr = kn->kn_ptr.p_prison; + if (pr == NULL) /* already activated, from attach filter */ + return (0); + + /* Mask off extra data. */ + event = (u_int)hint & NOTE_JAIL_CTRLMASK; + + /* If the user is interested in this event, record it. */ + if (kn->kn_sfflags & event) + kn->kn_fflags |= event; + + /* Report the attached process id. */ + if (event == NOTE_JAIL_ATTACH) { + if (kn->kn_data != 0) + kn->kn_fflags |= NOTE_JAIL_ATTACH_MULTI; + kn->kn_data = hint & NOTE_JAIL_DATAMASK; + } + + /* Prison is gone, so flag the event as finished. */ + if (event == NOTE_JAIL_REMOVE) { + kn->kn_flags |= EV_EOF | EV_ONESHOT; + kn->kn_ptr.p_prison = NULL; + if (kn->kn_fflags == 0) + kn->kn_flags |= EV_DROP; + return (1); + } + + return (kn->kn_fflags != 0); +} + /* * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the * interval timer support code. @@ -1597,8 +1729,8 @@ findkn: /* * If possible, find an existing knote to use for this kevent. */ - if (kev->filter == EVFILT_PROC && - (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) { + if ((kev->filter == EVFILT_PROC || kev->filter == EVFILT_JAIL) + && (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) { /* This is an internal creation of a process tracking * note. Don't attempt to coalesce this with an * existing note. 
@@ -2800,6 +2932,7 @@ knote_init(void) knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); ast_register(TDA_KQUEUE, ASTR_ASTF_REQUIRED, 0, ast_kqueue); + prison0.pr_klist = knlist_alloc(&prison0.pr_mtx); } SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL); diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c index 7c9a15ae18f3..52210553016b 100644 --- a/sys/kern/kern_jail.c +++ b/sys/kern/kern_jail.c @@ -45,6 +45,7 @@ #include <sys/priv.h> #include <sys/proc.h> #include <sys/epoch.h> +#include <sys/event.h> #include <sys/taskqueue.h> #include <sys/fcntl.h> #include <sys/jail.h> @@ -154,7 +155,8 @@ static void prison_complete(void *context, int pending); static void prison_deref(struct prison *pr, int flags); static void prison_deref_kill(struct prison *pr, struct prisonlist *freeprison); static int prison_lock_xlock(struct prison *pr, int flags); -static void prison_cleanup(struct prison *pr); +static void prison_cleanup_locked(struct prison *pr); +static void prison_cleanup_unlocked(struct prison *pr); static void prison_free_not_last(struct prison *pr); static void prison_proc_free_not_last(struct prison *pr); static void prison_proc_relink(struct prison *opr, struct prison *npr, @@ -167,6 +169,7 @@ static void prison_racct_attach(struct prison *pr); static void prison_racct_modify(struct prison *pr); static void prison_racct_detach(struct prison *pr); #endif +static void prison_knote(struct prison *pr, long hint); /* Flags for prison_deref */ #define PD_DEREF 0x01 /* Decrement pr_ref */ @@ -1018,6 +1021,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) int ip6s; bool redo_ip6; #endif + bool maybe_changed; uint64_t pr_allow, ch_allow, pr_flags, ch_flags; uint64_t pr_allow_diff; unsigned tallow; @@ -1422,6 +1426,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) pr = NULL; inspr = NULL; deadpr = NULL; + maybe_changed = false; if (cuflags == JAIL_CREATE && jid == 0 
&& name != NULL) { namelc = strrchr(name, '.'); jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10); @@ -1643,6 +1648,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling); for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent) tpr->pr_childcount++; + pr->pr_klist = knlist_alloc(&pr->pr_mtx); /* Set some default values, and inherit some from the parent. */ if (namelc == NULL) @@ -1880,6 +1886,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) goto done_deref; } } + maybe_changed = true; /* Set the parameters of the prison. */ #ifdef INET @@ -2112,7 +2119,12 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) * reference via persistence, or is about to gain one via attachment. */ if (created) { - drflags = prison_lock_xlock(pr, drflags); + sx_assert(&allprison_lock, SX_XLOCKED); + mtx_lock(&ppr->pr_mtx); + knote_fork(ppr->pr_klist, pr->pr_id); + mtx_unlock(&ppr->pr_mtx); + mtx_lock(&pr->pr_mtx); + drflags |= PD_LOCKED; pr->pr_state = PRISON_STATE_ALIVE; } @@ -2150,6 +2162,13 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) td->td_retval[0] = pr->pr_id; done_deref: + /* + * Report changes to kevent. This can happen even if the + * system call fails, as changes might have been made before + * the failure. + */ + if (maybe_changed && !created) + prison_knote(pr, NOTE_JAIL_SET); /* Release any temporary prison holds and/or locks. 
*/ if (pr != NULL) prison_deref(pr, drflags); @@ -2755,6 +2774,7 @@ do_jail_attach(struct thread *td, struct prison *pr, int drflags) prison_proc_relink(oldcred->cr_prison, pr, p); prison_deref(oldcred->cr_prison, drflags); crfree(oldcred); + prison_knote(pr, NOTE_JAIL_ATTACH | td->td_proc->p_pid); /* * If the prison was killed while changing credentials, die along @@ -3182,9 +3202,10 @@ prison_deref(struct prison *pr, int flags) refcount_load(&prison0.pr_uref) > 0, ("prison0 pr_uref=0")); pr->pr_state = PRISON_STATE_DYING; + prison_cleanup_locked(pr); mtx_unlock(&pr->pr_mtx); flags &= ~PD_LOCKED; - prison_cleanup(pr); + prison_cleanup_unlocked(pr); } } } @@ -3327,8 +3348,9 @@ prison_deref_kill(struct prison *pr, struct prisonlist *freeprison) } if (!(cpr->pr_flags & PR_REMOVE)) continue; - prison_cleanup(cpr); + prison_cleanup_unlocked(cpr); mtx_lock(&cpr->pr_mtx); + prison_cleanup_locked(cpr); cpr->pr_flags &= ~PR_REMOVE; if (cpr->pr_flags & PR_PERSIST) { cpr->pr_flags &= ~PR_PERSIST; @@ -3363,8 +3385,9 @@ prison_deref_kill(struct prison *pr, struct prisonlist *freeprison) if (rpr != NULL) LIST_REMOVE(rpr, pr_sibling); - prison_cleanup(pr); + prison_cleanup_unlocked(pr); mtx_lock(&pr->pr_mtx); + prison_cleanup_locked(pr); if (pr->pr_flags & PR_PERSIST) { pr->pr_flags &= ~PR_PERSIST; prison_proc_free_not_last(pr); @@ -3411,10 +3434,21 @@ prison_lock_xlock(struct prison *pr, int flags) /* * Release a prison's resources when it starts dying (when the last user - * reference is dropped, or when it is killed). + * reference is dropped, or when it is killed). Two functions are called, + * for work that requires a locked prison or an unlocked one. 
*/ static void -prison_cleanup(struct prison *pr) +prison_cleanup_locked(struct prison *pr) +{ + sx_assert(&allprison_lock, SA_XLOCKED); + mtx_assert(&pr->pr_mtx, MA_OWNED); + prison_knote(pr, NOTE_JAIL_REMOVE); + knlist_detach(pr->pr_klist); + pr->pr_klist = NULL; +} + +static void +prison_cleanup_unlocked(struct prison *pr) { sx_assert(&allprison_lock, SA_XLOCKED); mtx_assert(&pr->pr_mtx, MA_NOTOWNED); @@ -5039,6 +5073,22 @@ prison_racct_detach(struct prison *pr) } #endif /* RACCT */ +/* + * Submit a knote for a prison, locking if necessary. + */ +static void +prison_knote(struct prison *pr, long hint) +{ + int locked; + + locked = mtx_owned(&pr->pr_mtx); + if (!locked) + mtx_lock(&pr->pr_mtx); + KNOTE_LOCKED(pr->pr_klist, hint); + if (!locked) + mtx_unlock(&pr->pr_mtx); +} + #ifdef DDB static void diff --git a/sys/sys/event.h b/sys/sys/event.h index 1b30e4292de8..f161d2c938c1 100644 --- a/sys/sys/event.h +++ b/sys/sys/event.h @@ -45,7 +45,8 @@ #define EVFILT_USER (-11) /* User events */ #define EVFILT_SENDFILE (-12) /* attached to sendfile requests */ #define EVFILT_EMPTY (-13) /* empty send socket buf */ -#define EVFILT_SYSCOUNT 13 +#define EVFILT_JAIL (-14) /* attached to struct prison */ +#define EVFILT_SYSCOUNT 14 #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L #define EV_SET(kevp_, a, b, c, d, e, f) do { \ @@ -204,10 +205,19 @@ struct freebsd11_kevent32 { #define NOTE_PCTRLMASK 0xf0000000 /* mask for hint bits */ #define NOTE_PDATAMASK 0x000fffff /* mask for pid */ -/* additional flags for EVFILT_PROC */ -#define NOTE_TRACK 0x00000001 /* follow across forks */ +/* data/hint flags for EVFILT_JAIL */ +#define NOTE_JAIL_SET 0x80000000 /* jail was modified */ +#define NOTE_JAIL_CHILD 0x40000000 /* child jail was created */ +#define NOTE_JAIL_ATTACH 0x20000000 /* jail was attached to */ +#define NOTE_JAIL_REMOVE 0x10000000 /* jail was removed */ +#define NOTE_JAIL_ATTACH_MULTI 0x08000000 /* multiple procs attached */ +#define NOTE_JAIL_CTRLMASK 
0xf0000000 /* mask for hint bits */ +#define NOTE_JAIL_DATAMASK 0x000fffff /* mask for pid */ + +/* additional flags for EVFILT_PROC and EVFILT_JAIL */ +#define NOTE_TRACK 0x00000001 /* follow across fork/create */ #define NOTE_TRACKERR 0x00000002 /* could not track child */ -#define NOTE_CHILD 0x00000004 /* am a child process */ +#define NOTE_CHILD 0x00000004 /* am a child process/jail */ /* additional flags for EVFILT_TIMER */ #define NOTE_SECONDS 0x00000001 /* data is seconds */ @@ -309,6 +319,7 @@ struct knote { struct proc *p_proc; /* proc pointer */ struct kaiocb *p_aio; /* AIO job pointer */ struct aioliojob *p_lio; /* LIO job pointer */ + struct prison *p_prison; /* prison pointer */ void *p_v; /* generic other pointer */ } kn_ptr; const struct filterops *kn_fop; diff --git a/sys/sys/jail.h b/sys/sys/jail.h index d2655c52e832..4512e43a2866 100644 --- a/sys/sys/jail.h +++ b/sys/sys/jail.h @@ -144,6 +144,7 @@ MALLOC_DECLARE(M_PRISON); #define JAIL_META_PRIVATE "meta" #define JAIL_META_SHARED "env" +struct knlist; struct racct; struct prison_racct; @@ -189,7 +190,8 @@ struct prison { struct vnode *pr_root; /* (c) vnode to rdir */ struct prison_ip *pr_addrs[PR_FAMILY_MAX]; /* (p,n) IPs of jail */ struct prison_racct *pr_prison_racct; /* (c) racct jail proxy */ - void *pr_sparep[3]; + struct knlist *pr_klist; /* (m) attached knotes */ + void *pr_sparep[2]; int pr_childcount; /* (a) number of child jails */ int pr_childmax; /* (p) maximum child jails */ unsigned pr_allow; /* (p) PR_ALLOW_* flags */ @@ -425,10 +427,11 @@ SYSCTL_DECL(_security_jail_param); /* * Kernel support functions for jail(). */ -struct ucred; +struct knote; struct mount; struct sockaddr; struct statfs; +struct ucred; struct vfsconf; /*
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202509041859.584Ixw2G014955>