From owner-svn-src-head@freebsd.org Wed Apr 19 13:03:31 2017
From: Hans Petter Selasky <hselasky@FreeBSD.org>
Date: Wed, 19 Apr 2017 13:03:30 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject: svn commit: r317137 - in head/sys/compat/linuxkpi/common: include/linux src
Message-Id: <201704191303.v3JD3UZV093313@repo.freebsd.org>

Author: hselasky
Date: Wed Apr 19 13:03:29 2017
New Revision: 317137
URL: https://svnweb.freebsd.org/changeset/base/317137

Log:
  Fix a priority-inversion problem when using the Concurrency Kit, CK,
  in the LinuxKPI.

  When threads are pinned to a CPU core, or when there is only one CPU,
  a higher-priority thread can call the CK synchronize function while a
  lower-priority thread holds the read lock. Because CK's synchronize is
  a simple wait loop, this can lead to a deadlock.

  To solve this problem, use CK's recently introduced wait callback
  function. When a CK blocking condition is detected, find the lowest
  priority among the blockers, update the calling thread's priority
  accordingly, and yield. If another CPU core is holding the read lock,
  pin the calling thread to that CPU core and update its priority. The
  calling thread's priority and CPU bindings are restored before return.
  If a thread holding a CK read lock is detected to be sleeping, pause()
  is used instead of yield().
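  As an illustration of the mechanism only (this is not code from the
  commit), the userspace sketch below shows how CK invokes a wait
  callback whenever ck_epoch_synchronize_wait() finds a record that is
  blocking the grace period. It assumes a CK version that provides
  ck_epoch_synchronize_wait() (the same API the diff below uses), and
  the names demo_wait_cb, reader_record and the timing are invented for
  the example:

/*
 * Userspace sketch of CK's wait-callback API.
 * Build (assumption): cc ck_wait_demo.c -lck -lpthread
 */
#include <ck_epoch.h>
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

static ck_epoch_t epoch;
static ck_epoch_record_t reader_record;

/* Invoked for every record that is currently blocking the grace period. */
static void
demo_wait_cb(ck_epoch_t *e, ck_epoch_record_t *blocked, void *arg)
{
	(void)e; (void)blocked; (void)arg;
	/*
	 * The kernel version inspects the blocking readers and adjusts
	 * priority and CPU binding; userspace can only yield the CPU here.
	 */
	sched_yield();
}

static void *
reader(void *arg)
{
	(void)arg;
	ck_epoch_begin(&reader_record, NULL);	/* enter read-side section */
	usleep(10000);				/* hold it for a while */
	ck_epoch_end(&reader_record, NULL);	/* leave read-side section */
	return (NULL);
}

int
main(void)
{
	pthread_t td;

	ck_epoch_init(&epoch);
	ck_epoch_register(&epoch, &reader_record, NULL);

	pthread_create(&td, NULL, reader, NULL);
	usleep(1000);	/* timing-based for brevity: let the reader enter first */

	/* Instead of spinning, demo_wait_cb runs while readers block us. */
	ck_epoch_synchronize_wait(&epoch, demo_wait_cb, NULL);
	printf("grace period complete\n");

	pthread_join(td, NULL);
	return (0);
}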
  MFC after:	1 week
  Sponsored by:	Mellanox Technologies

Modified:
  head/sys/compat/linuxkpi/common/include/linux/sched.h
  head/sys/compat/linuxkpi/common/include/linux/srcu.h
  head/sys/compat/linuxkpi/common/src/linux_rcu.c

Modified: head/sys/compat/linuxkpi/common/include/linux/sched.h
==============================================================================
--- head/sys/compat/linuxkpi/common/include/linux/sched.h	Wed Apr 19 12:39:45 2017	(r317136)
+++ head/sys/compat/linuxkpi/common/include/linux/sched.h	Wed Apr 19 13:03:29 2017	(r317137)
@@ -37,7 +37,7 @@
 #include
 #include
-#include
+#include
 #include
 #include
 #include
@@ -72,6 +72,8 @@ struct task_struct {
 	unsigned bsd_ioctl_len;
 	struct completion parked;
 	struct completion exited;
+	TAILQ_ENTRY(task_struct) rcu_entry;
+	int rcu_recurse;
 };
 
 #define	current	({ \

Modified: head/sys/compat/linuxkpi/common/include/linux/srcu.h
==============================================================================
--- head/sys/compat/linuxkpi/common/include/linux/srcu.h	Wed Apr 19 12:39:45 2017	(r317136)
+++ head/sys/compat/linuxkpi/common/include/linux/srcu.h	Wed Apr 19 13:03:29 2017	(r317137)
@@ -29,9 +29,7 @@
 #ifndef _LINUX_SRCU_H_
 #define	_LINUX_SRCU_H_
 
-struct srcu_epoch_record;
 struct srcu_struct {
-	struct srcu_epoch_record *ss_epoch_record;
 };
 
 #define	srcu_dereference(ptr,srcu)	((__typeof(*(ptr)) *)(ptr))

Modified: head/sys/compat/linuxkpi/common/src/linux_rcu.c
==============================================================================
--- head/sys/compat/linuxkpi/common/src/linux_rcu.c	Wed Apr 19 12:39:45 2017	(r317136)
+++ head/sys/compat/linuxkpi/common/src/linux_rcu.c	Wed Apr 19 13:03:29 2017	(r317137)
@@ -38,6 +38,7 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
+#include
 
 #include
 
@@ -45,26 +46,34 @@ __FBSDID("$FreeBSD$");
 #include
 #include
 #include
+#include
 
-struct callback_head;
-struct writer_epoch_record {
-	ck_epoch_record_t epoch_record;
-	struct mtx head_lock;
-	struct mtx sync_lock;
-	struct task task;
-	STAILQ_HEAD(, callback_head) head;
-} __aligned(CACHE_LINE_SIZE);
+/*
+ * By defining CONFIG_NO_RCU_SKIP LinuxKPI RCU locks and asserts will
+ * not be skipped during panic().
+ */
+#ifdef CONFIG_NO_RCU_SKIP
+#define	RCU_SKIP(void) 0
+#else
+#define	RCU_SKIP(void) unlikely(SCHEDULER_STOPPED() || kdb_active)
+#endif
 
 struct callback_head {
 	STAILQ_ENTRY(callback_head) entry;
 	rcu_callback_t func;
 };
 
-struct srcu_epoch_record {
+struct linux_epoch_head {
+	STAILQ_HEAD(, callback_head) cb_head;
+	struct mtx lock;
+	struct task task;
+} __aligned(CACHE_LINE_SIZE);
+
+struct linux_epoch_record {
 	ck_epoch_record_t epoch_record;
-	struct mtx read_lock;
-	struct mtx sync_lock;
-};
+	TAILQ_HEAD(, task_struct) ts_head;
+	int cpuid;
+} __aligned(CACHE_LINE_SIZE);
 
 /*
  * Verify that "struct rcu_head" is big enough to hold "struct
@@ -72,57 +81,42 @@ struct srcu_epoch_record {
  * compile flags for including ck_epoch.h to all clients of the
 * LinuxKPI.
 */
-CTASSERT(sizeof(struct rcu_head) >= sizeof(struct callback_head));
+CTASSERT(sizeof(struct rcu_head) == sizeof(struct callback_head));
 
 /*
  * Verify that "epoch_record" is at beginning of "struct
- * writer_epoch_record":
+ * linux_epoch_record":
  */
-CTASSERT(offsetof(struct writer_epoch_record, epoch_record) == 0);
-
-/*
- * Verify that "epoch_record" is at beginning of "struct
- * srcu_epoch_record":
- */
-CTASSERT(offsetof(struct srcu_epoch_record, epoch_record) == 0);
+CTASSERT(offsetof(struct linux_epoch_record, epoch_record) == 0);
 
 static ck_epoch_t linux_epoch;
-static MALLOC_DEFINE(M_LRCU, "lrcu", "Linux RCU");
-static DPCPU_DEFINE(ck_epoch_record_t *, linux_reader_epoch_record);
-static DPCPU_DEFINE(struct writer_epoch_record *, linux_writer_epoch_record);
+static struct linux_epoch_head linux_epoch_head;
+static DPCPU_DEFINE(struct linux_epoch_record, linux_epoch_record);
 
 static void linux_rcu_cleaner_func(void *, int);
 
 static void
 linux_rcu_runtime_init(void *arg __unused)
 {
+	struct linux_epoch_head *head;
 	int i;
 
 	ck_epoch_init(&linux_epoch);
 
-	/* setup reader records */
-	CPU_FOREACH(i) {
-		ck_epoch_record_t *record;
-
-		record = malloc(sizeof(*record), M_LRCU, M_WAITOK | M_ZERO);
-		ck_epoch_register(&linux_epoch, record, NULL);
+	head = &linux_epoch_head;
 
-		DPCPU_ID_SET(i, linux_reader_epoch_record, record);
-	}
+	mtx_init(&head->lock, "LRCU-HEAD", NULL, MTX_DEF);
+	TASK_INIT(&head->task, 0, linux_rcu_cleaner_func, NULL);
+	STAILQ_INIT(&head->cb_head);
 
-	/* setup writer records */
 	CPU_FOREACH(i) {
-		struct writer_epoch_record *record;
+		struct linux_epoch_record *record;
 
-		record = malloc(sizeof(*record), M_LRCU, M_WAITOK | M_ZERO);
+		record = &DPCPU_ID_GET(i, linux_epoch_record);
+		record->cpuid = i;
 		ck_epoch_register(&linux_epoch, &record->epoch_record, NULL);
-		mtx_init(&record->head_lock, "LRCU-HEAD", NULL, MTX_DEF);
-		mtx_init(&record->sync_lock, "LRCU-SYNC", NULL, MTX_DEF);
-		TASK_INIT(&record->task, 0, linux_rcu_cleaner_func, record);
-		STAILQ_INIT(&record->head);
-
-		DPCPU_ID_SET(i, linux_writer_epoch_record, record);
+		TAILQ_INIT(&record->ts_head);
 	}
 }
 SYSINIT(linux_rcu_runtime, SI_SUB_LOCK, SI_ORDER_SECOND, linux_rcu_runtime_init, NULL);
@@ -130,91 +124,40 @@ SYSINIT(linux_rcu_runtime, SI_SUB_LOCK,
 
 static void
 linux_rcu_runtime_uninit(void *arg __unused)
 {
-	ck_stack_entry_t *cursor;
-	ck_stack_entry_t *next;
-	int i;
+	struct linux_epoch_head *head;
 
-	/* make sure all callbacks have been called */
-	linux_rcu_barrier();
+	head = &linux_epoch_head;
 
-	/* destroy all writer record mutexes */
-	CPU_FOREACH(i) {
-		struct writer_epoch_record *record;
-
-		record = DPCPU_ID_GET(i, linux_writer_epoch_record);
-
-		mtx_destroy(&record->head_lock);
-		mtx_destroy(&record->sync_lock);
-	}
-
-	/* free all registered reader and writer records */
-	CK_STACK_FOREACH_SAFE(&linux_epoch.records, cursor, next) {
-		ck_epoch_record_t *record;
-
-		record = container_of(cursor,
-		    struct ck_epoch_record, record_next);
-		free(record, M_LRCU);
-	}
+	/* destroy head lock */
+	mtx_destroy(&head->lock);
 }
 SYSUNINIT(linux_rcu_runtime, SI_SUB_LOCK, SI_ORDER_SECOND, linux_rcu_runtime_uninit, NULL);
 
-static inline struct srcu_epoch_record *
-linux_srcu_get_record(void)
-{
-	struct srcu_epoch_record *record;
-
-	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
-	    "linux_srcu_get_record() might sleep");
-
-	/*
-	 * NOTE: The only records that are unregistered and can be
-	 * recycled are srcu_epoch_records.
-	 */
-	record = (struct srcu_epoch_record *)ck_epoch_recycle(&linux_epoch, NULL);
-	if (__predict_true(record != NULL))
-		return (record);
-
-	record = malloc(sizeof(*record), M_LRCU, M_WAITOK | M_ZERO);
-	mtx_init(&record->read_lock, "SRCU-READ", NULL, MTX_DEF | MTX_NOWITNESS);
-	mtx_init(&record->sync_lock, "SRCU-SYNC", NULL, MTX_DEF | MTX_NOWITNESS);
-	ck_epoch_register(&linux_epoch, &record->epoch_record, NULL);
-
-	return (record);
-}
-
-static inline void
-linux_rcu_synchronize_sub(struct writer_epoch_record *record)
-{
-
-	/* protect access to epoch_record */
-	mtx_lock(&record->sync_lock);
-	ck_epoch_synchronize(&record->epoch_record);
-	mtx_unlock(&record->sync_lock);
-}
-
 static void
-linux_rcu_cleaner_func(void *context, int pending __unused)
+linux_rcu_cleaner_func(void *context __unused, int pending __unused)
 {
-	struct writer_epoch_record *record;
+	struct linux_epoch_head *head;
 	struct callback_head *rcu;
-	STAILQ_HEAD(, callback_head) head;
+	STAILQ_HEAD(, callback_head) tmp_head;
+
+	linux_set_current(curthread);
 
-	record = context;
+	head = &linux_epoch_head;
 
 	/* move current callbacks into own queue */
-	mtx_lock(&record->head_lock);
-	STAILQ_INIT(&head);
-	STAILQ_CONCAT(&head, &record->head);
-	mtx_unlock(&record->head_lock);
+	mtx_lock(&head->lock);
+	STAILQ_INIT(&tmp_head);
+	STAILQ_CONCAT(&tmp_head, &head->cb_head);
+	mtx_unlock(&head->lock);
 
 	/* synchronize */
-	linux_rcu_synchronize_sub(record);
+	linux_synchronize_rcu();
 
 	/* dispatch all callbacks, if any */
-	while ((rcu = STAILQ_FIRST(&head)) != NULL) {
+	while ((rcu = STAILQ_FIRST(&tmp_head)) != NULL) {
 		uintptr_t offset;
 
-		STAILQ_REMOVE_HEAD(&head, entry);
+		STAILQ_REMOVE_HEAD(&tmp_head, entry);
 
 		offset = (uintptr_t)rcu->func;
 
@@ -228,148 +171,217 @@ linux_rcu_cleaner_func(void *context, in
 void
 linux_rcu_read_lock(void)
 {
-	ck_epoch_record_t *record;
+	struct linux_epoch_record *record;
+	struct task_struct *ts;
+
+	if (RCU_SKIP())
+		return;
 
 	/*
 	 * Pin thread to current CPU so that the unlock code gets the
-	 * same per-CPU reader epoch record:
+	 * same per-CPU epoch record:
 	 */
 	sched_pin();
 
-	record = DPCPU_GET(linux_reader_epoch_record);
+	record = &DPCPU_GET(linux_epoch_record);
+	ts = current;
 
 	/*
 	 * Use a critical section to prevent recursion inside
 	 * ck_epoch_begin(). Else this function supports recursion.
 	 */
	critical_enter();
-	ck_epoch_begin(record, NULL);
+	ck_epoch_begin(&record->epoch_record, NULL);
+	ts->rcu_recurse++;
+	if (ts->rcu_recurse == 1)
+		TAILQ_INSERT_TAIL(&record->ts_head, ts, rcu_entry);
 	critical_exit();
 }
 
 void
 linux_rcu_read_unlock(void)
 {
-	ck_epoch_record_t *record;
+	struct linux_epoch_record *record;
+	struct task_struct *ts;
+
+	if (RCU_SKIP())
+		return;
 
-	record = DPCPU_GET(linux_reader_epoch_record);
+	record = &DPCPU_GET(linux_epoch_record);
+	ts = current;
 
 	/*
 	 * Use a critical section to prevent recursion inside
 	 * ck_epoch_end(). Else this function supports recursion.
 	 */
 	critical_enter();
-	ck_epoch_end(record, NULL);
+	ck_epoch_end(&record->epoch_record, NULL);
+	ts->rcu_recurse--;
+	if (ts->rcu_recurse == 0)
+		TAILQ_REMOVE(&record->ts_head, ts, rcu_entry);
 	critical_exit();
 	sched_unpin();
 }
 
+static void
+linux_synchronize_rcu_cb(ck_epoch_t *epoch __unused, ck_epoch_record_t *epoch_record, void *arg __unused)
+{
+	struct linux_epoch_record *record =
+	    container_of(epoch_record, struct linux_epoch_record, epoch_record);
+	struct thread *td = curthread;
+	struct task_struct *ts;
+
+	/* check if blocked on the current CPU */
+	if (record->cpuid == PCPU_GET(cpuid)) {
+		bool is_sleeping = 0;
+		u_char prio = 0;
+		u_char old_prio;
+
+		/*
+		 * Find the lowest priority or sleeping thread which
+		 * is blocking synchronization on this CPU core. All
+		 * the threads in the queue are CPU-pinned and cannot
+		 * go anywhere while the current thread is locked.
+		 */
+		TAILQ_FOREACH(ts, &record->ts_head, rcu_entry) {
+			if (ts->task_thread->td_priority > prio)
+				prio = ts->task_thread->td_priority;
+			is_sleeping |= (ts->task_thread->td_inhibitors != 0);
+		}
+
+		if (is_sleeping) {
+			thread_unlock(td);
+			pause("W", 1);
+			thread_lock(td);
+		} else {
+			old_prio = td->td_priority;
+			/* set new thread priority */
+			sched_prio(td, prio);
+			/* task switch */
+			mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
+			/* restore thread priority */
+			sched_prio(td, old_prio);
+		}
+	} else {
+		/*
+		 * To avoid spinning move execution to the other CPU
+		 * which is blocking synchronization. Set highest
+		 * thread priority so that code gets run. The thread
+		 * priority will be restored later.
+		 */
+		sched_prio(td, 0);
+		sched_bind(td, record->cpuid);
+	}
+}
+
 void
 linux_synchronize_rcu(void)
 {
-	linux_rcu_synchronize_sub(DPCPU_GET(linux_writer_epoch_record));
+	struct thread *td;
+	int was_bound;
+	int old_cpu;
+	int old_pinned;
+
+	if (RCU_SKIP())
+		return;
+
+	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
+	    "linux_synchronize_rcu() can sleep");
+
+	td = curthread;
+
+	DROP_GIANT();
+
+	/*
+	 * Synchronizing RCU might change the CPU core this function
+	 * is running on. Save current values:
+	 */
+	thread_lock(td);
+
+	old_cpu = PCPU_GET(cpuid);
+	old_pinned = td->td_pinned;
+	td->td_pinned = 0;
+	was_bound = sched_is_bound(td);
+	sched_bind(td, old_cpu);
+
+	ck_epoch_synchronize_wait(&linux_epoch,
+	    &linux_synchronize_rcu_cb, NULL);
+
+	/* restore CPU binding, if any */
+	if (was_bound != 0) {
+		sched_bind(td, old_cpu);
+	} else {
+		/* get thread back to initial CPU, if any */
+		if (old_pinned != 0)
+			sched_bind(td, old_cpu);
+		sched_unbind(td);
+	}
+	/* restore pinned after bind */
+	td->td_pinned = old_pinned;
+	thread_unlock(td);
+
+	PICKUP_GIANT();
 }
 
 void
 linux_rcu_barrier(void)
 {
-	int i;
-
-	CPU_FOREACH(i) {
-		struct writer_epoch_record *record;
+	struct linux_epoch_head *head;
 
-		record = DPCPU_ID_GET(i, linux_writer_epoch_record);
+	linux_synchronize_rcu();
 
-		linux_rcu_synchronize_sub(record);
+	head = &linux_epoch_head;
 
-		/* wait for callbacks to complete */
-		taskqueue_drain(taskqueue_fast, &record->task);
-	}
+	/* wait for callbacks to complete */
+	taskqueue_drain(taskqueue_fast, &head->task);
 }
 
 void
 linux_call_rcu(struct rcu_head *context, rcu_callback_t func)
 {
 	struct callback_head *rcu = (struct callback_head *)context;
-	struct writer_epoch_record *record;
-
-	record = DPCPU_GET(linux_writer_epoch_record);
+	struct linux_epoch_head *head = &linux_epoch_head;
 
-	mtx_lock(&record->head_lock);
+	mtx_lock(&head->lock);
 	rcu->func = func;
-	STAILQ_INSERT_TAIL(&record->head, rcu, entry);
-	taskqueue_enqueue(taskqueue_fast, &record->task);
-	mtx_unlock(&record->head_lock);
+	STAILQ_INSERT_TAIL(&head->cb_head, rcu, entry);
+	taskqueue_enqueue(taskqueue_fast, &head->task);
+	mtx_unlock(&head->lock);
 }
 
 int
 init_srcu_struct(struct srcu_struct *srcu)
 {
-	struct srcu_epoch_record *record;
-
-	record = linux_srcu_get_record();
-	srcu->ss_epoch_record = record;
 	return (0);
 }
 
 void
 cleanup_srcu_struct(struct srcu_struct *srcu)
 {
-	struct srcu_epoch_record *record;
-
-	record = srcu->ss_epoch_record;
-	srcu->ss_epoch_record = NULL;
-
-	ck_epoch_unregister(&record->epoch_record);
 }
 
 int
 srcu_read_lock(struct srcu_struct *srcu)
 {
-	struct srcu_epoch_record *record;
-
-	record = srcu->ss_epoch_record;
-
-	mtx_lock(&record->read_lock);
-	ck_epoch_begin(&record->epoch_record, NULL);
-	mtx_unlock(&record->read_lock);
-
+	linux_rcu_read_lock();
 	return (0);
 }
 
 void
 srcu_read_unlock(struct srcu_struct *srcu, int key __unused)
 {
-	struct srcu_epoch_record *record;
-
-	record = srcu->ss_epoch_record;
-
-	mtx_lock(&record->read_lock);
-	ck_epoch_end(&record->epoch_record, NULL);
-	mtx_unlock(&record->read_lock);
+	linux_rcu_read_unlock();
 }
 
 void
 synchronize_srcu(struct srcu_struct *srcu)
 {
-	struct srcu_epoch_record *record;
-
-	record = srcu->ss_epoch_record;
-
-	mtx_lock(&record->sync_lock);
-	ck_epoch_synchronize(&record->epoch_record);
-	mtx_unlock(&record->sync_lock);
+	linux_synchronize_rcu();
 }
 
 void
 srcu_barrier(struct srcu_struct *srcu)
 {
-	struct srcu_epoch_record *record;
-
-	record = srcu->ss_epoch_record;
-
-	mtx_lock(&record->sync_lock);
-	ck_epoch_barrier(&record->epoch_record);
-	mtx_unlock(&record->sync_lock);
+	linux_rcu_barrier();
 }
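For reference, a hypothetical LinuxKPI consumer fragment (not part of this
commit; the demo_* names are invented) showing the paths above: the Linux
rcu_read_lock()/rcu_read_unlock(), call_rcu() and synchronize_rcu() macros
are assumed to map, via the LinuxKPI headers, onto linux_rcu_read_lock(),
linux_rcu_read_unlock(), linux_call_rcu() and linux_synchronize_rcu(); the
deferred free then runs from linux_rcu_cleaner_func() once a grace period
has elapsed. The CTASSERT above requires sizeof(struct rcu_head) ==
sizeof(struct callback_head) because linux_call_rcu() reuses the rcu_head
storage as a callback_head queue entry.

/* Hypothetical Linux-side consumer fragment; names are invented. */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>

struct demo_cfg {
	struct rcu_head rcu;	/* storage reused by call_rcu() below */
	int value;
};

static struct demo_cfg *demo_active;

/* Read side: enters the per-CPU epoch section and pins the CPU. */
static int
demo_read(void)
{
	struct demo_cfg *cfg;
	int value = 0;

	rcu_read_lock();
	cfg = rcu_dereference(demo_active);
	if (cfg != NULL)
		value = cfg->value;
	rcu_read_unlock();
	return (value);
}

/* Deferred-free callback, dispatched after a grace period. */
static void
demo_free_cb(struct rcu_head *head)
{
	kfree(container_of(head, struct demo_cfg, rcu));
}

/* Update side: publish a new version, defer freeing the old one. */
static void
demo_replace(int value)
{
	struct demo_cfg *new_cfg, *old_cfg;

	new_cfg = kzalloc(sizeof(*new_cfg), GFP_KERNEL);
	if (new_cfg == NULL)
		return;
	new_cfg->value = value;

	old_cfg = demo_active;
	rcu_assign_pointer(demo_active, new_cfg);

	if (old_cfg != NULL)
		call_rcu(&old_cfg->rcu, demo_free_cb);
}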