Date: Sun, 14 Jan 2024 21:53:51 GMT From: Arthur Kiyanovski <akiyano@FreeBSD.org> To: src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-branches@FreeBSD.org Subject: git: 71cd348d2454 - stable/14 - ena: Add sysctl support for spreading IRQs Message-ID: <202401142153.40ELrp4J041414@gitrepo.freebsd.org>
next in thread | raw e-mail | index | archive | help
The branch stable/14 has been updated by akiyano: URL: https://cgit.FreeBSD.org/src/commit/?id=71cd348d245404b17c08c69b2aa8be3190e68732 commit 71cd348d245404b17c08c69b2aa8be3190e68732 Author: Osama Abboud <osamaabb@amazon.com> AuthorDate: 2023-10-30 11:27:03 +0000 Commit: Arthur Kiyanovski <akiyano@FreeBSD.org> CommitDate: 2024-01-14 21:18:10 +0000 ena: Add sysctl support for spreading IRQs This commit allows spreading IO IRQs over different CPUs through sysctl. Two sysctl nodes are introduced: 1- base_cpu: servers as the first CPU to which the first IO IRQ will be bound. 2- cpu_stride: sets the distance between every two CPUs to which every two consecutive IO IRQs are bound. For example for doing the following IO IRQs / CPU binding: IRQ idx | CPU ---------------- 1 | 0 2 | 2 3 | 4 4 | 6 Run the following commands: sysctl dev.ena.<device index>.irq_affinity.base_cpu=0 sysctl dev.ena.<device_index>.irq_affinity.cpu_stride=2 Also introduced rss_enabled field, which is intended to replace '#ifdef RSS' in multiple places, in order to prevent code duplication. We want to bind interrupts to CPUs in case of rss set OR in case the newly defined sysctl paremeter is set. This requires to remove a couple of '#ifdef RSS' as well in the structs, since we'll be using the relevant parameters in the CPU binding code. Approved by: cperciva (mentor) MFC after: 2 weeks Sponsored by: Amazon, Inc. (cherry picked from commit f9e1d9471077109c19fd7d6dc9c1d35432efdede) --- sys/dev/ena/ena.c | 126 ++++++++++++++++++++++++++++++++++------ sys/dev/ena/ena.h | 14 +++-- sys/dev/ena/ena_sysctl.c | 145 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 263 insertions(+), 22 deletions(-) diff --git a/sys/dev/ena/ena.c b/sys/dev/ena/ena.c index 455386a4f152..067c8a6f3a08 100644 --- a/sys/dev/ena/ena.c +++ b/sys/dev/ena/ena.c @@ -1237,6 +1237,84 @@ ena_update_io_rings(struct ena_adapter *adapter, uint32_t num) ena_init_io_rings(adapter); } +int +ena_update_base_cpu(struct ena_adapter *adapter, int new_num) +{ + int old_num; + int rc = 0; + bool dev_was_up; + + dev_was_up = ENA_FLAG_ISSET(ENA_FLAG_DEV_UP, adapter); + old_num = adapter->irq_cpu_base; + + ena_down(adapter); + + adapter->irq_cpu_base = new_num; + + if (dev_was_up) { + rc = ena_up(adapter); + if (unlikely(rc != 0)) { + ena_log(adapter->pdev, ERR, + "Failed to configure device %d IRQ base CPU. " + "Reverting to previous value: %d\n", + new_num, old_num); + + adapter->irq_cpu_base = old_num; + + rc = ena_up(adapter); + if (unlikely(rc != 0)) { + ena_log(adapter->pdev, ERR, + "Failed to revert to previous setup." + "Triggering device reset.\n"); + ENA_FLAG_SET_ATOMIC( + ENA_FLAG_DEV_UP_BEFORE_RESET, adapter); + ena_trigger_reset(adapter, + ENA_REGS_RESET_OS_TRIGGER); + } + } + } + return (rc); +} + +int +ena_update_cpu_stride(struct ena_adapter *adapter, uint32_t new_num) +{ + uint32_t old_num; + int rc = 0; + bool dev_was_up; + + dev_was_up = ENA_FLAG_ISSET(ENA_FLAG_DEV_UP, adapter); + old_num = adapter->irq_cpu_stride; + + ena_down(adapter); + + adapter->irq_cpu_stride = new_num; + + if (dev_was_up) { + rc = ena_up(adapter); + if (unlikely(rc != 0)) { + ena_log(adapter->pdev, ERR, + "Failed to configure device %d IRQ CPU stride. " + "Reverting to previous value: %d\n", + new_num, old_num); + + adapter->irq_cpu_stride = old_num; + + rc = ena_up(adapter); + if (unlikely(rc != 0)) { + ena_log(adapter->pdev, ERR, + "Failed to revert to previous setup." + "Triggering device reset.\n"); + ENA_FLAG_SET_ATOMIC( + ENA_FLAG_DEV_UP_BEFORE_RESET, adapter); + ena_trigger_reset(adapter, + ENA_REGS_RESET_OS_TRIGGER); + } + } + } + return (rc); +} + /* Caller should sanitize new_num */ int ena_update_io_queue_nb(struct ena_adapter *adapter, uint32_t new_num) @@ -1683,6 +1761,13 @@ ena_setup_io_intr(struct ena_adapter *adapter) ena_log(adapter->pdev, DBG, "ena_setup_io_intr vector: %d\n", adapter->msix_entries[irq_idx].vector); + if (adapter->irq_cpu_base > ENA_BASE_CPU_UNSPECIFIED) { + adapter->que[i].cpu = adapter->irq_tbl[irq_idx].cpu = + (unsigned)(adapter->irq_cpu_base + + i * adapter->irq_cpu_stride) % (unsigned)mp_ncpus; + CPU_SETOF(adapter->que[i].cpu, &adapter->que[i].cpu_mask); + } + #ifdef RSS adapter->que[i].cpu = adapter->irq_tbl[irq_idx].cpu = rss_getcpu(cur_bind); @@ -1790,20 +1875,19 @@ ena_request_io_irq(struct ena_adapter *adapter) } irq->requested = true; -#ifdef RSS - rc = bus_bind_intr(adapter->pdev, irq->res, irq->cpu); - if (unlikely(rc != 0)) { - ena_log(pdev, ERR, - "failed to bind interrupt handler for irq %ju to cpu %d: %d\n", - rman_get_start(irq->res), irq->cpu, rc); - goto err; - } + if (adapter->rss_enabled || adapter->irq_cpu_base > ENA_BASE_CPU_UNSPECIFIED) { + rc = bus_bind_intr(adapter->pdev, irq->res, irq->cpu); + if (unlikely(rc != 0)) { + ena_log(pdev, ERR, + "failed to bind interrupt handler for irq %ju to cpu %d: %d\n", + rman_get_start(irq->res), irq->cpu, rc); + goto err; + } - ena_log(pdev, INFO, "queue %d - cpu %d\n", - i - ENA_IO_IRQ_FIRST_IDX, irq->cpu); -#endif + ena_log(pdev, INFO, "queue %d - cpu %d\n", + i - ENA_IO_IRQ_FIRST_IDX, irq->cpu); + } } - return (rc); err: @@ -1814,13 +1898,14 @@ err: /* Once we entered err: section and irq->requested is true we free both intr and resources */ - if (irq->requested) + if (irq->requested) { rcc = bus_teardown_intr(adapter->pdev, irq->res, irq->cookie); - if (unlikely(rcc != 0)) - ena_log(pdev, ERR, - "could not release irq: %d, error: %d\n", - irq->vector, rcc); + if (unlikely(rcc != 0)) + ena_log(pdev, ERR, + "could not release irq: %d, error: %d\n", + irq->vector, rcc); + } /* If we entered err: section without irq->requested set we know it was bus_alloc_resource_any() that needs cleanup, provided @@ -3523,6 +3608,13 @@ ena_attach(device_t pdev) adapter->missing_tx_max_queues = ENA_DEFAULT_TX_MONITORED_QUEUES; adapter->missing_tx_threshold = ENA_DEFAULT_TX_CMP_THRESHOLD; + adapter->irq_cpu_base = ENA_BASE_CPU_UNSPECIFIED; + adapter->irq_cpu_stride = 0; + +#ifdef RSS + adapter->rss_enabled = 1; +#endif + if (version_printed++ == 0) ena_log(pdev, INFO, "%s\n", ena_version); diff --git a/sys/dev/ena/ena.h b/sys/dev/ena/ena.h index b6e791d6ff6a..3687e9b2522e 100644 --- a/sys/dev/ena/ena.h +++ b/sys/dev/ena/ena.h @@ -69,6 +69,7 @@ #define ENA_DEFAULT_RING_SIZE 1024 #define ENA_MIN_RING_SIZE 256 +#define ENA_BASE_CPU_UNSPECIFIED -1 /* * Refill Rx queue when number of required descriptors is above * QUEUE_SIZE / ENA_RX_REFILL_THRESH_DIVIDER or ENA_RX_REFILL_THRESH_PACKET @@ -201,9 +202,7 @@ struct ena_irq { void *cookie; unsigned int vector; bool requested; -#ifdef RSS int cpu; -#endif char name[ENA_IRQNAME_SIZE]; }; @@ -216,10 +215,8 @@ struct ena_que { struct taskqueue *cleanup_tq; uint32_t id; -#ifdef RSS int cpu; cpuset_t cpu_mask; -#endif int domain; struct sysctl_oid *oid; }; @@ -448,6 +445,12 @@ struct ena_adapter { ena_state_t flags; + /* IRQ CPU affinity */ + int irq_cpu_base; + uint32_t irq_cpu_stride; + + uint8_t rss_enabled; + /* Queue will represent one TX and one RX ring */ struct ena_que que[ENA_MAX_NUM_IO_QUEUES] __aligned(CACHE_LINE_SIZE); @@ -524,7 +527,8 @@ int ena_update_buf_ring_size(struct ena_adapter *adapter, int ena_update_queue_size(struct ena_adapter *adapter, uint32_t new_tx_size, uint32_t new_rx_size); int ena_update_io_queue_nb(struct ena_adapter *adapter, uint32_t new_num); - +int ena_update_base_cpu(struct ena_adapter *adapter, int new_num); +int ena_update_cpu_stride(struct ena_adapter *adapter, uint32_t new_num); static inline int ena_mbuf_count(struct mbuf *mbuf) { diff --git a/sys/dev/ena/ena_sysctl.c b/sys/dev/ena/ena_sysctl.c index d450fcf48d15..6668f9924787 100644 --- a/sys/dev/ena/ena_sysctl.c +++ b/sys/dev/ena/ena_sysctl.c @@ -38,6 +38,7 @@ static void ena_sysctl_add_wd(struct ena_adapter *); static void ena_sysctl_add_stats(struct ena_adapter *); static void ena_sysctl_add_eni_metrics(struct ena_adapter *); static void ena_sysctl_add_tuneables(struct ena_adapter *); +static void ena_sysctl_add_irq_affinity(struct ena_adapter *); /* Kernel option RSS prevents manipulation of key hash and indirection table. */ #ifndef RSS static void ena_sysctl_add_rss(struct ena_adapter *); @@ -45,6 +46,8 @@ static void ena_sysctl_add_rss(struct ena_adapter *); static int ena_sysctl_buf_ring_size(SYSCTL_HANDLER_ARGS); static int ena_sysctl_rx_queue_size(SYSCTL_HANDLER_ARGS); static int ena_sysctl_io_queues_nb(SYSCTL_HANDLER_ARGS); +static int ena_sysctl_irq_base_cpu(SYSCTL_HANDLER_ARGS); +static int ena_sysctl_irq_cpu_stride(SYSCTL_HANDLER_ARGS); static int ena_sysctl_eni_metrics_interval(SYSCTL_HANDLER_ARGS); #ifndef RSS static int ena_sysctl_rss_key(SYSCTL_HANDLER_ARGS); @@ -102,6 +105,7 @@ ena_sysctl_add_nodes(struct ena_adapter *adapter) ena_sysctl_add_stats(adapter); ena_sysctl_add_eni_metrics(adapter); ena_sysctl_add_tuneables(adapter); + ena_sysctl_add_irq_affinity(adapter); #ifndef RSS ena_sysctl_add_rss(adapter); #endif @@ -448,6 +452,36 @@ ena_sysctl_add_rss(struct ena_adapter *adapter) } #endif /* RSS */ +static void +ena_sysctl_add_irq_affinity(struct ena_adapter *adapter) +{ + device_t dev; + + struct sysctl_ctx_list *ctx; + struct sysctl_oid *tree; + struct sysctl_oid_list *child; + + dev = adapter->pdev; + + ctx = device_get_sysctl_ctx(dev); + tree = device_get_sysctl_tree(dev); + child = SYSCTL_CHILDREN(tree); + + tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "irq_affinity", + CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Decide base CPU and stride for irqs affinity."); + child = SYSCTL_CHILDREN(tree); + + /* Add base cpu leaf */ + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "base_cpu", + CTLTYPE_S32 | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0, + ena_sysctl_irq_base_cpu, "I", "Base cpu index for setting irq affinity."); + + /* Add cpu stride leaf */ + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "cpu_stride", + CTLTYPE_S32 | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0, + ena_sysctl_irq_cpu_stride, "I", "Distance between irqs when setting affinity."); +} + /* * ena_sysctl_update_queue_node_nb - Register/unregister sysctl queue nodes. @@ -707,6 +741,117 @@ unlock: return (0); } +static int +ena_sysctl_irq_base_cpu(SYSCTL_HANDLER_ARGS) +{ + struct ena_adapter *adapter = arg1; + int irq_base_cpu = 0; + int error; + + ENA_LOCK_LOCK(); + if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) { + error = ENODEV; + goto unlock; + } + + error = sysctl_wire_old_buffer(req, sizeof(irq_base_cpu)); + if (error == 0) { + irq_base_cpu = adapter->irq_cpu_base; + error = sysctl_handle_int(oidp, &irq_base_cpu, 0, req); + } + if (error != 0 || req->newptr == NULL) + goto unlock; + + if (irq_base_cpu <= ENA_BASE_CPU_UNSPECIFIED) { + ena_log(adapter->pdev, ERR, + "Requested base CPU is less than zero.\n"); + error = EINVAL; + goto unlock; + } + + if (irq_base_cpu > mp_ncpus) { + ena_log(adapter->pdev, INFO, + "Requested base CPU is larger than the number of available CPUs. \n"); + error = EINVAL; + goto unlock; + + } + + if (irq_base_cpu == adapter->irq_cpu_base) { + ena_log(adapter->pdev, INFO, + "Requested IRQ base CPU is equal to current value " + "(%d)\n", + adapter->irq_cpu_base); + goto unlock; + } + + ena_log(adapter->pdev, INFO, + "Requested new IRQ base CPU: %d, current value: %d\n", + irq_base_cpu, adapter->irq_cpu_base); + + error = ena_update_base_cpu(adapter, irq_base_cpu); + +unlock: + ENA_LOCK_UNLOCK(); + + return (error); +} + +static int +ena_sysctl_irq_cpu_stride(SYSCTL_HANDLER_ARGS) +{ + struct ena_adapter *adapter = arg1; + int32_t irq_cpu_stride = 0; + int error; + + ENA_LOCK_LOCK(); + if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) { + error = ENODEV; + goto unlock; + } + + error = sysctl_wire_old_buffer(req, sizeof(irq_cpu_stride)); + if (error == 0) { + irq_cpu_stride = adapter->irq_cpu_stride; + error = sysctl_handle_int(oidp, &irq_cpu_stride, 0, req); + } + if (error != 0 || req->newptr == NULL) + goto unlock; + + if (irq_cpu_stride < 0) { + ena_log(adapter->pdev, ERR, + "Requested IRQ stride is less than zero.\n"); + error = EINVAL; + goto unlock; + } + + if (irq_cpu_stride > mp_ncpus) { + ena_log(adapter->pdev, INFO, + "Warning: Requested IRQ stride is larger than the number of available CPUs.\n"); + } + + if (irq_cpu_stride == adapter->irq_cpu_stride) { + ena_log(adapter->pdev, INFO, + "Requested IRQ CPU stride is equal to current value " + "(%u)\n", + adapter->irq_cpu_stride); + goto unlock; + } + + ena_log(adapter->pdev, INFO, + "Requested new IRQ CPU stride: %u, current value: %u\n", + irq_cpu_stride, adapter->irq_cpu_stride); + + error = ena_update_cpu_stride(adapter, irq_cpu_stride); + if (error != 0) + goto unlock; + +unlock: + ENA_LOCK_UNLOCK(); + + return (error); +} + #ifndef RSS /* * Change the Receive Side Scaling hash key.
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202401142153.40ELrp4J041414>