From owner-svn-src-all@FreeBSD.ORG Wed Feb 8 11:43:30 2012
From: Luigi Rizzo <luigi@FreeBSD.org>
Date: Wed, 8 Feb 2012 11:43:30 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-all@freebsd.org,
        svn-src-head@freebsd.org
Subject: svn commit: r231198 - in head: sys/dev/netmap sys/net
        tools/tools/netmap

Author: luigi
Date: Wed Feb 8 11:43:29 2012
New Revision: 231198

URL: http://svn.freebsd.org/changeset/base/231198

Log:
  - change the buffer size from a constant to a TUNABLE variable
    (hw.netmap.buf_size) so we can experiment with values different
    from 2048, which may give better cache performance.

  - rearrange the memory allocation code so it will be easier to
    replace it with a different implementation. The current code
    relies on a single large contiguous chunk of memory obtained
    through contigmalloc. The new implementation (not committed yet)
    uses multiple smaller chunks, which are easier to fit in a
    fragmented address space.
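A note for readers experimenting with the new knob: hw.netmap.buf_size is a
boot-time loader tunable (set, for instance, in /boot/loader.conf before the
module initializes), while the matching dev.netmap.buf_size sysctl added by
this commit is read-only at runtime. The fragment below is a minimal
userspace sketch, not part of the commit, showing how the value can be read
back through sysctlbyname(3):

/* Minimal sketch (not from the commit): read back the netmap buffer size.
 * Assumes the netmap module is loaded; the tunable itself would be set at
 * boot, e.g. hw.netmap.buf_size="4096" in /boot/loader.conf.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
    int bufsz;
    size_t len = sizeof(bufsz);

    /* SYSCTL_NODE(_dev, ..., netmap) puts the OID under dev.netmap */
    if (sysctlbyname("dev.netmap.buf_size", &bufsz, &len, NULL, 0) == -1) {
        perror("sysctlbyname");
        return (1);
    }
    printf("netmap buffer size: %d bytes\n", bufsz);
    return (0);
}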
Modified:
  head/sys/dev/netmap/if_em_netmap.h
  head/sys/dev/netmap/if_igb_netmap.h
  head/sys/dev/netmap/if_lem_netmap.h
  head/sys/dev/netmap/if_re_netmap.h
  head/sys/dev/netmap/ixgbe_netmap.h
  head/sys/dev/netmap/netmap.c
  head/sys/dev/netmap/netmap_kern.h
  head/sys/net/netmap.h
  head/sys/net/netmap_user.h
  head/tools/tools/netmap/pkt-gen.c

Modified: head/sys/dev/netmap/if_em_netmap.h
==============================================================================
--- head/sys/dev/netmap/if_em_netmap.h  Wed Feb 8 11:33:46 2012  (r231197)
+++ head/sys/dev/netmap/if_em_netmap.h  Wed Feb 8 11:43:29 2012  (r231198)
@@ -61,7 +61,6 @@ em_netmap_attach(struct adapter *adapter
     na.nm_rxsync = em_netmap_rxsync;
     na.nm_lock = em_netmap_lock_wrapper;
     na.nm_register = em_netmap_reg;
-    na.buff_size = NETMAP_BUF_SIZE;
     netmap_attach(&na, adapter->num_queues);
 }

Modified: head/sys/dev/netmap/if_igb_netmap.h
==============================================================================
--- head/sys/dev/netmap/if_igb_netmap.h  Wed Feb 8 11:33:46 2012  (r231197)
+++ head/sys/dev/netmap/if_igb_netmap.h  Wed Feb 8 11:43:29 2012  (r231198)
@@ -58,7 +58,6 @@ igb_netmap_attach(struct adapter *adapte
     na.nm_rxsync = igb_netmap_rxsync;
     na.nm_lock = igb_netmap_lock_wrapper;
     na.nm_register = igb_netmap_reg;
-    na.buff_size = NETMAP_BUF_SIZE;
     netmap_attach(&na, adapter->num_queues);
 }

Modified: head/sys/dev/netmap/if_lem_netmap.h
==============================================================================
--- head/sys/dev/netmap/if_lem_netmap.h  Wed Feb 8 11:33:46 2012  (r231197)
+++ head/sys/dev/netmap/if_lem_netmap.h  Wed Feb 8 11:43:29 2012  (r231198)
@@ -62,7 +62,6 @@ lem_netmap_attach(struct adapter *adapte
     na.nm_rxsync = lem_netmap_rxsync;
     na.nm_lock = lem_netmap_lock_wrapper;
     na.nm_register = lem_netmap_reg;
-    na.buff_size = NETMAP_BUF_SIZE;
     netmap_attach(&na, 1);
 }
 
@@ -247,6 +246,7 @@ lem_netmap_txsync(void *a, u_int ring_nr
             ring->avail = kring->nr_hwavail;
         }
     }
+
     if (do_lock)
         EM_TX_UNLOCK(adapter);
     return 0;
@@ -351,9 +351,9 @@ lem_netmap_rxsync(void *a, u_int ring_nr
 
     /* tell userspace that there are new packets */
     ring->avail = kring->nr_hwavail ;
+
     if (do_lock)
         EM_RX_UNLOCK(adapter);
     return 0;
 }
-
-
+/* end of file */

Modified: head/sys/dev/netmap/if_re_netmap.h
==============================================================================
--- head/sys/dev/netmap/if_re_netmap.h  Wed Feb 8 11:33:46 2012  (r231197)
+++ head/sys/dev/netmap/if_re_netmap.h  Wed Feb 8 11:43:29 2012  (r231198)
@@ -56,7 +56,6 @@ re_netmap_attach(struct rl_softc *sc)
     na.nm_rxsync = re_netmap_rxsync;
     na.nm_lock = re_netmap_lock_wrapper;
     na.nm_register = re_netmap_reg;
-    na.buff_size = NETMAP_BUF_SIZE;
     netmap_attach(&na, 1);
 }

Modified: head/sys/dev/netmap/ixgbe_netmap.h
==============================================================================
--- head/sys/dev/netmap/ixgbe_netmap.h  Wed Feb 8 11:33:46 2012  (r231197)
+++ head/sys/dev/netmap/ixgbe_netmap.h  Wed Feb 8 11:43:29 2012  (r231198)
@@ -82,13 +82,6 @@ ixgbe_netmap_attach(struct adapter *adap
     na.nm_rxsync = ixgbe_netmap_rxsync;
     na.nm_lock = ixgbe_netmap_lock_wrapper;
     na.nm_register = ixgbe_netmap_reg;
-    /*
-     * XXX where do we put this comment ?
-     * adapter->rx_mbuf_sz is set by SIOCSETMTU, but in netmap mode
-     * we allocate the buffers on the first register. So we must
-     * disallow a SIOCSETMTU when if_capenable & IFCAP_NETMAP is set.
-     */
-    na.buff_size = NETMAP_BUF_SIZE;
     netmap_attach(&na, adapter->num_queues);
 }
 
@@ -354,7 +347,8 @@ ring_reset:
         * otherwise we go to sleep (in netmap_poll()) and will be
         * woken up when slot nr_kflags will be ready.
         */
-        struct ixgbe_legacy_tx_desc *txd = (struct ixgbe_legacy_tx_desc *)txr->tx_base;
+        struct ixgbe_legacy_tx_desc *txd =
+            (struct ixgbe_legacy_tx_desc *)txr->tx_base;
 
         j = txr->next_to_clean + kring->nkr_num_slots/2;
         if (j >= kring->nkr_num_slots)
@@ -365,9 +359,7 @@ ring_reset:
         kring->nr_kflags = j; /* the slot to check */
         j = txd[j].upper.fields.status & IXGBE_TXD_STAT_DD;
     }
-    if (!j) {
-        netmap_skip_txsync++;
-    } else {
+    if (j) {
         int delta;
 
         /*
@@ -471,7 +463,7 @@ ixgbe_netmap_rxsync(void *a, u_int ring_
     if (j > lim)
         j -= lim + 1;
 
-    if (force_update) {
+    if (netmap_no_pendintr || force_update) {
         for (n = 0; ; n++) {
             union ixgbe_adv_rx_desc *curr = &rxr->rx_base[l];
             uint32_t staterr = le32toh(curr->wb.upper.status_error);
@@ -548,6 +540,7 @@ ixgbe_netmap_rxsync(void *a, u_int ring_
     }
     /* tell userspace that there are new packets */
     ring->avail = kring->nr_hwavail ;
+
     if (do_lock)
         IXGBE_RX_UNLOCK(rxr);
     return 0;

Modified: head/sys/dev/netmap/netmap.c
==============================================================================
--- head/sys/dev/netmap/netmap.c  Wed Feb 8 11:33:46 2012  (r231197)
+++ head/sys/dev/netmap/netmap.c  Wed Feb 8 11:43:29 2012  (r231198)
@@ -92,6 +92,39 @@ MALLOC_DEFINE(M_NETMAP, "netmap", "Netwo
  */
 #define NMA_LOCK()    mtx_lock(&netmap_mem_d->nm_mtx);
 #define NMA_UNLOCK()  mtx_unlock(&netmap_mem_d->nm_mtx);
+struct netmap_mem_d;
+static struct netmap_mem_d *netmap_mem_d;  /* Our memory allocator. */
+
+u_int netmap_total_buffers;
+char *netmap_buffer_base;  /* address of an invalid buffer */
+
+/* user-controlled variables */
+int netmap_verbose;
+
+static int netmap_no_timestamp; /* don't timestamp on rxsync */
+
+SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
+SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
+    CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
+SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
+    CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
+int netmap_buf_size = 2048;
+TUNABLE_INT("hw.netmap.buf_size", &netmap_buf_size);
+SYSCTL_INT(_dev_netmap, OID_AUTO, buf_size,
+    CTLFLAG_RD, &netmap_buf_size, 0, "Size of packet buffers");
+int netmap_mitigate = 1;
+SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
+int netmap_no_pendintr;
+SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
+    CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
+
+
+
+/*----- memory allocator -----------------*/
+/*
+ * Here we have the low level routines for memory allocator
+ * and its primary users.
+ */
 
 /*
  * Default amount of memory pre-allocated by the module.
@@ -128,30 +161,13 @@ struct netmap_buf_pool {
     uint32_t *bitmap; /* one bit per buffer, 1 means free */
 };
 struct netmap_buf_pool nm_buf_pool;
-/* XXX move these two vars back into netmap_buf_pool */
-u_int netmap_total_buffers;
-char *netmap_buffer_base;  /* address of an invalid buffer */
-
-/* user-controlled variables */
-int netmap_verbose;
-
-static int no_timestamp; /* don't timestamp on rxsync */
-
-SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
-SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
-    CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
-SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
-    CTLFLAG_RW, &no_timestamp, 0, "no_timestamp");
 SYSCTL_INT(_dev_netmap, OID_AUTO, total_buffers,
     CTLFLAG_RD, &nm_buf_pool.total_buffers, 0, "total_buffers");
 SYSCTL_INT(_dev_netmap, OID_AUTO, free_buffers,
     CTLFLAG_RD, &nm_buf_pool.free, 0, "free_buffers");
-int netmap_mitigate = 1;
-SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
-int netmap_skip_txsync;
-SYSCTL_INT(_dev_netmap, OID_AUTO, skip_txsync, CTLFLAG_RW, &netmap_skip_txsync, 0, "");
-int netmap_skip_rxsync;
-SYSCTL_INT(_dev_netmap, OID_AUTO, skip_rxsync, CTLFLAG_RW, &netmap_skip_rxsync, 0, "");
+
+
+
 
 /*
  * Allocate n buffers from the ring, and fill the slot.
@@ -165,79 +181,446 @@ netmap_new_bufs(struct netmap_if *nifp _
     uint32_t bi = 0;          /* index in the bitmap */
     uint32_t mask, j, i = 0;  /* slot counter */
 
-    if (n > p->free) {
-        D("only %d out of %d buffers available", i, n);
-        return;
-    }
-    /* termination is guaranteed by p->free */
-    while (i < n && p->free > 0) {
-        uint32_t cur = p->bitmap[bi];
-        if (cur == 0) { /* bitmask is fully used */
-            bi++;
-            continue;
-        }
-        /* locate a slot */
-        for (j = 0, mask = 1; (cur & mask) == 0; j++, mask <<= 1) ;
-        p->bitmap[bi] &= ~mask; /* slot in use */
-        p->free--;
-        slot[i].buf_idx = bi*32+j;
-        slot[i].len = p->bufsize;
-        slot[i].flags = NS_BUF_CHANGED;
-        i++;
-    }
-    ND("allocated %d buffers, %d available", n, p->free);
+    if (n > p->free) {
+        D("only %d out of %d buffers available", i, n);
+        return;
+    }
+    /* termination is guaranteed by p->free */
+    while (i < n && p->free > 0) {
+        uint32_t cur = p->bitmap[bi];
+        if (cur == 0) { /* bitmask is fully used */
+            bi++;
+            continue;
+        }
+        /* locate a slot */
+        for (j = 0, mask = 1; (cur & mask) == 0; j++, mask <<= 1) ;
+        p->bitmap[bi] &= ~mask; /* slot in use */
+        p->free--;
+        slot[i].buf_idx = bi*32+j;
+        slot[i].len = p->bufsize;
+        slot[i].flags = NS_BUF_CHANGED;
+        i++;
+    }
+    ND("allocated %d buffers, %d available", n, p->free);
+}
+
+
+static void
+netmap_free_buf(struct netmap_if *nifp __unused, uint32_t i)
+{
+    struct netmap_buf_pool *p = &nm_buf_pool;
+
+    uint32_t pos, mask;
+    if (i >= p->total_buffers) {
+        D("invalid free index %d", i);
+        return;
+    }
+    pos = i / 32;
+    mask = 1 << (i % 32);
+    if (p->bitmap[pos] & mask) {
+        D("slot %d already free", i);
+        return;
+    }
+    p->bitmap[pos] |= mask;
+    p->free++;
+}
+
+
+/* Descriptor of the memory objects handled by our memory allocator. */
+struct netmap_mem_obj {
+    TAILQ_ENTRY(netmap_mem_obj) nmo_next; /* next object in the
+                                             chain. */
+    int nmo_used; /* flag set on used memory objects. */
+    size_t nmo_size; /* size of the memory area reserved for the
+                        object. */
+    void *nmo_data; /* pointer to the memory area. */
+};
+
+/* Wrap our memory objects to make them ``chainable``. */
+TAILQ_HEAD(netmap_mem_obj_h, netmap_mem_obj);
+
+
+/* Descriptor of our custom memory allocator.
+ */
+struct netmap_mem_d {
+    struct mtx nm_mtx; /* lock used to handle the chain of memory
+                          objects. */
+    struct netmap_mem_obj_h nm_molist; /* list of memory objects */
+    size_t nm_size; /* total amount of memory used for rings etc. */
+    size_t nm_totalsize; /* total amount of allocated memory
+                            (the difference is used for buffers) */
+    size_t nm_buf_start; /* offset of packet buffers.
+                            This is page-aligned. */
+    size_t nm_buf_len; /* total memory for buffers */
+    void *nm_buffer; /* pointer to the whole pre-allocated memory
+                        area. */
+};
+
+/* Shorthand to compute a netmap interface offset. */
+#define netmap_if_offset(v)                             \
+    ((char *) (v) - (char *) netmap_mem_d->nm_buffer)
+/* .. and get a physical address given a memory offset */
+#define netmap_ofstophys(o)                             \
+    (vtophys(netmap_mem_d->nm_buffer) + (o))
+
+
+/*------ netmap memory allocator -------*/
+/*
+ * Request for a chunk of memory.
+ *
+ * Memory objects are arranged into a list, hence we need to walk this
+ * list until we find an object with the needed amount of data free.
+ * This sounds like a completely inefficient implementation, but given
+ * the fact that data allocation is done once, we can handle it
+ * flawlessly.
+ *
+ * Return NULL on failure.
+ */
+static void *
+netmap_malloc(size_t size, __unused const char *msg)
+{
+    struct netmap_mem_obj *mem_obj, *new_mem_obj;
+    void *ret = NULL;
+
+    NMA_LOCK();
+    TAILQ_FOREACH(mem_obj, &netmap_mem_d->nm_molist, nmo_next) {
+        if (mem_obj->nmo_used != 0 || mem_obj->nmo_size < size)
+            continue;
+
+        new_mem_obj = malloc(sizeof(struct netmap_mem_obj), M_NETMAP,
+                             M_WAITOK | M_ZERO);
+        TAILQ_INSERT_BEFORE(mem_obj, new_mem_obj, nmo_next);
+
+        new_mem_obj->nmo_used = 1;
+        new_mem_obj->nmo_size = size;
+        new_mem_obj->nmo_data = mem_obj->nmo_data;
+        memset(new_mem_obj->nmo_data, 0, new_mem_obj->nmo_size);
+
+        mem_obj->nmo_size -= size;
+        mem_obj->nmo_data = (char *) mem_obj->nmo_data + size;
+        if (mem_obj->nmo_size == 0) {
+            TAILQ_REMOVE(&netmap_mem_d->nm_molist, mem_obj,
+                         nmo_next);
+            free(mem_obj, M_NETMAP);
+        }
+
+        ret = new_mem_obj->nmo_data;
+
+        break;
+    }
+    NMA_UNLOCK();
+    ND("%s: %d bytes at %p", msg, size, ret);
+
+    return (ret);
+}
+
+/*
+ * Return the memory to the allocator.
+ *
+ * While freeing a memory object, we try to merge adjacent chunks in
+ * order to reduce memory fragmentation.
+ */
+static void
+netmap_free(void *addr, const char *msg)
+{
+    size_t size;
+    struct netmap_mem_obj *cur, *prev, *next;
+
+    if (addr == NULL) {
+        D("NULL addr for %s", msg);
+        return;
+    }
+
+    NMA_LOCK();
+    TAILQ_FOREACH(cur, &netmap_mem_d->nm_molist, nmo_next) {
+        if (cur->nmo_data == addr && cur->nmo_used)
+            break;
+    }
+    if (cur == NULL) {
+        NMA_UNLOCK();
+        D("invalid addr %s %p", msg, addr);
+        return;
+    }
+
+    size = cur->nmo_size;
+    cur->nmo_used = 0;
+
+    /* merge current chunk of memory with the previous one,
+       if present. */
+    prev = TAILQ_PREV(cur, netmap_mem_obj_h, nmo_next);
+    if (prev && prev->nmo_used == 0) {
+        TAILQ_REMOVE(&netmap_mem_d->nm_molist, cur, nmo_next);
+        prev->nmo_size += cur->nmo_size;
+        free(cur, M_NETMAP);
+        cur = prev;
+    }
+
+    /* merge with the next one */
+    next = TAILQ_NEXT(cur, nmo_next);
+    if (next && next->nmo_used == 0) {
+        TAILQ_REMOVE(&netmap_mem_d->nm_molist, next, nmo_next);
+        cur->nmo_size += next->nmo_size;
+        free(next, M_NETMAP);
+    }
+    NMA_UNLOCK();
+    ND("freed %s %d bytes at %p", msg, size, addr);
+}
+
+
+/*
+ * Create and return a new ``netmap_if`` object, and possibly also
+ * rings and packet buffers.
+ *
+ * Return NULL on failure.
+ */
+static void *
+netmap_if_new(const char *ifname, struct netmap_adapter *na)
+{
+    struct netmap_if *nifp;
+    struct netmap_ring *ring;
+    char *buff;
+    u_int i, len, ofs;
+    u_int n = na->num_queues + 1; /* shorthand, include stack queue */
+
+    /*
+     * the descriptor is followed inline by an array of offsets
+     * to the tx and rx rings in the shared memory region.
+     */
+    len = sizeof(struct netmap_if) + 2 * n * sizeof(ssize_t);
+    nifp = netmap_if_malloc(len);
+    if (nifp == NULL)
+        return (NULL);
+
+    /* initialize base fields */
+    *(int *)(uintptr_t)&nifp->ni_num_queues = na->num_queues;
+    strncpy(nifp->ni_name, ifname, IFNAMSIZ);
+
+    (na->refcount)++;  /* XXX atomic ? we are under lock */
+    if (na->refcount > 1)
+        goto final;
+
+    /*
+     * If this is the first instance, allocate the shadow rings and
+     * buffers for this card (one for each hw queue, one for the host).
+     * The rings are contiguous, but have variable size.
+     * The entire block is reachable at
+     *     na->tx_rings[0].ring
+     */
+    len = n * (2 * sizeof(struct netmap_ring) +
+           (na->num_tx_desc + na->num_rx_desc) *
+            sizeof(struct netmap_slot) );
+    buff = netmap_ring_malloc(len);
+    if (buff == NULL) {
+        D("failed to allocate %d bytes for %s shadow ring",
+            len, ifname);
+error:
+        (na->refcount)--;
+        netmap_if_free(nifp);
+        return (NULL);
+    }
+    /* do we have the buffers ? we need num_tx_desc buffers for
+     * each tx ring and num_rx_desc buffers for each rx ring. */
+    len = n * (na->num_tx_desc + na->num_rx_desc);
+    NMA_LOCK();
+    if (nm_buf_pool.free < len) {
+        NMA_UNLOCK();
+        netmap_free(buff, "not enough bufs");
+        goto error;
+    }
+    /*
+     * in the kring, store the pointers to the shared rings
+     * and initialize the rings. We are under NMA_LOCK().
+     */
+    ofs = 0;
+    for (i = 0; i < n; i++) {
+        struct netmap_kring *kring;
+        int numdesc;
+
+        /* Transmit rings */
+        kring = &na->tx_rings[i];
+        numdesc = na->num_tx_desc;
+        bzero(kring, sizeof(*kring));
+        kring->na = na;
+
+        ring = kring->ring = (struct netmap_ring *)(buff + ofs);
+        *(ssize_t *)(uintptr_t)&ring->buf_ofs =
+            nm_buf_pool.base - (char *)ring;
+        ND("txring[%d] at %p ofs %d", i, ring, ring->buf_ofs);
+        *(uint32_t *)(uintptr_t)&ring->num_slots =
+            kring->nkr_num_slots = numdesc;
+
+        /*
+         * IMPORTANT:
+         * Always keep one slot empty, so we can detect new
+         * transmissions comparing cur and nr_hwcur (they are
+         * the same only if there are no new transmissions).
+         */
+        ring->avail = kring->nr_hwavail = numdesc - 1;
+        ring->cur = kring->nr_hwcur = 0;
+        *(uint16_t *)(uintptr_t)&ring->nr_buf_size = NETMAP_BUF_SIZE;
+        netmap_new_bufs(nifp, ring->slot, numdesc);
+
+        ofs += sizeof(struct netmap_ring) +
+            numdesc * sizeof(struct netmap_slot);
+
+        /* Receive rings */
+        kring = &na->rx_rings[i];
+        numdesc = na->num_rx_desc;
+        bzero(kring, sizeof(*kring));
+        kring->na = na;
+
+        ring = kring->ring = (struct netmap_ring *)(buff + ofs);
+        *(ssize_t *)(uintptr_t)&ring->buf_ofs =
+            nm_buf_pool.base - (char *)ring;
+        ND("rxring[%d] at %p offset %d", i, ring, ring->buf_ofs);
+        *(uint32_t *)(uintptr_t)&ring->num_slots =
+            kring->nkr_num_slots = numdesc;
+        ring->cur = kring->nr_hwcur = 0;
+        ring->avail = kring->nr_hwavail = 0; /* empty */
+        *(uint16_t *)(uintptr_t)&ring->nr_buf_size = NETMAP_BUF_SIZE;
+        netmap_new_bufs(nifp, ring->slot, numdesc);
+        ofs += sizeof(struct netmap_ring) +
+            numdesc * sizeof(struct netmap_slot);
+    }
+    NMA_UNLOCK();
+    for (i = 0; i < n+1; i++) {
+        // XXX initialize the selrecord structs.
+    }
+final:
+    /*
+     * fill the slots for the rx and tx queues.
+     * They contain the offset between the ring and nifp, so the
+     * information is usable in userspace to reach the ring from
+     * the nifp.
+     */
+    for (i = 0; i < n; i++) {
+        char *base = (char *)nifp;
+        *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i] =
+            (char *)na->tx_rings[i].ring - base;
+        *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i+n] =
+            (char *)na->rx_rings[i].ring - base;
+    }
+    return (nifp);
+}
+
+/*
+ * Initialize the memory allocator.
+ *
+ * Create the descriptor for the memory, allocate the pool of memory
+ * and initialize the list of memory objects with a single chunk
+ * containing the whole pre-allocated memory marked as free.
+ *
+ * Start with a large size, then halve as needed if we fail to
+ * allocate the block. While halving, always add one extra page
+ * because buffers 0 and 1 are used for special purposes.
+ * Return 0 on success, errno otherwise.
+ */
+static int
+netmap_memory_init(void)
+{
+    struct netmap_mem_obj *mem_obj;
+    void *buf = NULL;
+    int i, n, sz = NETMAP_MEMORY_SIZE;
+    int extra_sz = 0; // space for rings and two spare buffers
+
+    for (; sz >= 1<<20; sz >>=1) {
+        extra_sz = sz/200;
+        extra_sz = (extra_sz + 2*PAGE_SIZE - 1) & ~(PAGE_SIZE-1);
+        buf = contigmalloc(sz + extra_sz,
+                 M_NETMAP,
+                 M_WAITOK | M_ZERO,
+                 0, /* low address */
+                 -1UL, /* high address */
+                 PAGE_SIZE, /* alignment */
+                 0 /* boundary */
+                );
+        if (buf)
+            break;
+    }
+    if (buf == NULL)
+        return (ENOMEM);
+    sz += extra_sz;
+    netmap_mem_d = malloc(sizeof(struct netmap_mem_d), M_NETMAP,
+                  M_WAITOK | M_ZERO);
+    mtx_init(&netmap_mem_d->nm_mtx, "netmap memory allocator lock", NULL,
+         MTX_DEF);
+    TAILQ_INIT(&netmap_mem_d->nm_molist);
+    netmap_mem_d->nm_buffer = buf;
+    netmap_mem_d->nm_totalsize = sz;
+
+    /*
+     * A buffer takes 2k, a slot takes 8 bytes + ring overhead,
+     * so the ratio is 200:1. In other words, we can use 1/200 of
+     * the memory for the rings, and the rest for the buffers,
+     * and be sure we never run out.
+     */
+    netmap_mem_d->nm_size = sz/200;
+    netmap_mem_d->nm_buf_start =
+        (netmap_mem_d->nm_size + PAGE_SIZE - 1) & ~(PAGE_SIZE-1);
+    netmap_mem_d->nm_buf_len = sz - netmap_mem_d->nm_buf_start;
+
+    nm_buf_pool.base = netmap_mem_d->nm_buffer;
+    nm_buf_pool.base += netmap_mem_d->nm_buf_start;
+    netmap_buffer_base = nm_buf_pool.base;
+    D("netmap_buffer_base %p (offset %d)",
+        netmap_buffer_base, (int)netmap_mem_d->nm_buf_start);
+    /* number of buffers, they all start as free */
+
+    netmap_total_buffers = nm_buf_pool.total_buffers =
+        netmap_mem_d->nm_buf_len / NETMAP_BUF_SIZE;
+    nm_buf_pool.bufsize = NETMAP_BUF_SIZE;
+
+    D("Have %d MB, use %dKB for rings, %d buffers at %p",
+        (sz >> 20), (int)(netmap_mem_d->nm_size >> 10),
+        nm_buf_pool.total_buffers, nm_buf_pool.base);
+
+    /* allocate and initialize the bitmap. Entry 0 is considered
+     * always busy (used as default when there are no buffers left).
+     */
+    n = (nm_buf_pool.total_buffers + 31) / 32;
+    nm_buf_pool.bitmap = malloc(sizeof(uint32_t) * n, M_NETMAP,
+                 M_WAITOK | M_ZERO);
+    nm_buf_pool.bitmap[0] = ~3; /* slot 0 and 1 always busy */
+    for (i = 1; i < n; i++)
+        nm_buf_pool.bitmap[i] = ~0;
+    nm_buf_pool.free = nm_buf_pool.total_buffers - 2;
+
+    mem_obj = malloc(sizeof(struct netmap_mem_obj), M_NETMAP,
+             M_WAITOK | M_ZERO);
+    TAILQ_INSERT_HEAD(&netmap_mem_d->nm_molist, mem_obj, nmo_next);
+    mem_obj->nmo_used = 0;
+    mem_obj->nmo_size = netmap_mem_d->nm_size;
+    mem_obj->nmo_data = netmap_mem_d->nm_buffer;
+
+    return (0);
 }
 
+/*
+ * Finalize the memory allocator.
+ *
+ * Free all the memory objects contained inside the list, and deallocate
+ * the pool of memory; finally free the memory allocator descriptor.
+ */
 static void
-netmap_free_buf(struct netmap_if *nifp __unused, uint32_t i)
+netmap_memory_fini(void)
 {
-    struct netmap_buf_pool *p = &nm_buf_pool;
+    struct netmap_mem_obj *mem_obj;
 
-    uint32_t pos, mask;
-    if (i >= p->total_buffers) {
-        D("invalid free index %d", i);
-        return;
-    }
-    pos = i / 32;
-    mask = 1 << (i % 32);
-    if (p->bitmap[pos] & mask) {
-        D("slot %d already free", i);
-        return;
+    while (!TAILQ_EMPTY(&netmap_mem_d->nm_molist)) {
+        mem_obj = TAILQ_FIRST(&netmap_mem_d->nm_molist);
+        TAILQ_REMOVE(&netmap_mem_d->nm_molist, mem_obj, nmo_next);
+        if (mem_obj->nmo_used == 1) {
+            printf("netmap: leaked %d bytes at %p\n",
+                   (int)mem_obj->nmo_size,
+                   mem_obj->nmo_data);
+        }
+        free(mem_obj, M_NETMAP);
     }
-    p->bitmap[pos] |= mask;
-    p->free++;
+    contigfree(netmap_mem_d->nm_buffer, netmap_mem_d->nm_totalsize, M_NETMAP);
+    // XXX mutex_destroy(nm_mtx);
+    free(netmap_mem_d, M_NETMAP);
 }
-
-
-/* Descriptor of the memory objects handled by our memory allocator. */
-struct netmap_mem_obj {
-    TAILQ_ENTRY(netmap_mem_obj) nmo_next; /* next object in the
-                                             chain. */
-    int nmo_used; /* flag set on used memory objects. */
-    size_t nmo_size; /* size of the memory area reserved for the
-                        object. */
-    void *nmo_data; /* pointer to the memory area. */
-};
-
-/* Wrap our memory objects to make them ``chainable``. */
-TAILQ_HEAD(netmap_mem_obj_h, netmap_mem_obj);
-
-
-/* Descriptor of our custom memory allocator. */
-struct netmap_mem_d {
-    struct mtx nm_mtx; /* lock used to handle the chain of memory
-                          objects. */
-    struct netmap_mem_obj_h nm_molist; /* list of memory objects */
-    size_t nm_size; /* total amount of memory used for rings etc. */
-    size_t nm_totalsize; /* total amount of allocated memory
-                            (the difference is used for buffers) */
-    size_t nm_buf_start; /* offset of packet buffers.
-                            This is page-aligned. */
-    size_t nm_buf_len; /* total memory for buffers */
-    void *nm_buffer; /* pointer to the whole pre-allocated memory
-                        area. */
-};
+/*------------- end of memory allocator -----------------*/
 
 
 /* Structure associated to each thread which registered an interface. */
@@ -250,15 +633,8 @@ struct netmap_priv_d {
     uint16_t np_txpoll;
 };
 
-/* Shorthand to compute a netmap interface offset. */
-#define netmap_if_offset(v)                             \
-    ((char *) (v) - (char *) netmap_mem_d->nm_buffer)
-/* .. and get a physical address given a memory offset */
-#define netmap_ofstophys(o)                             \
-    (vtophys(netmap_mem_d->nm_buffer) + (o))
 
 static struct cdev *netmap_dev; /* /dev/netmap character device. */
-static struct netmap_mem_d *netmap_mem_d;  /* Our memory allocator. */
 
 static d_mmap_t netmap_mmap;
 
@@ -351,6 +727,7 @@ netmap_kqfilter(struct cdev *dev, struct
 }
 #endif /* NETMAP_KEVENT */
 
+
 /*
 * File descriptor's private data destructor.
 *
@@ -358,19 +735,13 @@ netmap_kqfilter(struct cdev *dev, struct
 * revert to normal operation. We expect that np_ifp has not gone.
 */
 static void
-netmap_dtor(void *data)
+netmap_dtor_locked(void *data)
 {
     struct netmap_priv_d *priv = data;
     struct ifnet *ifp = priv->np_ifp;
    struct netmap_adapter *na = NA(ifp);
    struct netmap_if *nifp = priv->np_nifp;
 
-    if (0)
-        printf("%s starting for %p ifp %p\n", __FUNCTION__, priv,
-            priv ?
-            priv->np_ifp : NULL);
-
-    na->nm_lock(ifp->if_softc, NETMAP_CORE_LOCK, 0);
-
     na->refcount--;
     if (na->refcount <= 0) {  /* last instance */
         u_int i;
@@ -411,159 +782,34 @@ netmap_dtor(void *data)
             for (j = 0; j < lim; j++)
                 netmap_free_buf(nifp, ring->slot[j].buf_idx);
 
-            ND("rx queue %d", i);
-            ring = na->rx_rings[i].ring;
-            lim = na->rx_rings[i].nkr_num_slots;
-            for (j = 0; j < lim; j++)
-                netmap_free_buf(nifp, ring->slot[j].buf_idx);
-        }
-        NMA_UNLOCK();
-        netmap_free_rings(na);
-        wakeup(na);
-    }
-    netmap_if_free(nifp);
-
-    na->nm_lock(ifp->if_softc, NETMAP_CORE_UNLOCK, 0);
-
-    if_rele(ifp);
-
-    bzero(priv, sizeof(*priv));  /* XXX for safety */
-    free(priv, M_DEVBUF);
-}
-
-
-/*
- * Create and return a new ``netmap_if`` object, and possibly also
- * rings and packet buffors.
- *
- * Return NULL on failure.
- */
-static void *
-netmap_if_new(const char *ifname, struct netmap_adapter *na)
-{
-    struct netmap_if *nifp;
-    struct netmap_ring *ring;
-    char *buff;
-    u_int i, len, ofs;
-    u_int n = na->num_queues + 1; /* shorthand, include stack queue */
-
-    /*
-     * the descriptor is followed inline by an array of offsets
-     * to the tx and rx rings in the shared memory region.
-     */
-    len = sizeof(struct netmap_if) + 2 * n * sizeof(ssize_t);
-    nifp = netmap_if_malloc(len);
-    if (nifp == NULL)
-        return (NULL);
-
-    /* initialize base fields */
-    *(int *)(uintptr_t)&nifp->ni_num_queues = na->num_queues;
-    strncpy(nifp->ni_name, ifname, IFNAMSIZ);
-
-    (na->refcount)++;  /* XXX atomic ? we are under lock */
-    if (na->refcount > 1)
-        goto final;
-
-    /*
-     * If this is the first instance, allocate the shadow rings and
-     * buffers for this card (one for each hw queue, one for the host).
-     * The rings are contiguous, but have variable size.
-     * The entire block is reachable at
-     *     na->tx_rings[0].ring
-     */
-
-    len = n * (2 * sizeof(struct netmap_ring) +
-           (na->num_tx_desc + na->num_rx_desc) *
-            sizeof(struct netmap_slot) );
-    buff = netmap_ring_malloc(len);
-    if (buff == NULL) {
-        D("failed to allocate %d bytes for %s shadow ring",
-            len, ifname);
-error:
-        (na->refcount)--;
-        netmap_if_free(nifp);
-        return (NULL);
-    }
-    /* do we have the bufers ? we are in need of num_tx_desc buffers for
-     * each tx ring and num_tx_desc buffers for each rx ring. */
-    len = n * (na->num_tx_desc + na->num_rx_desc);
-    NMA_LOCK();
-    if (nm_buf_pool.free < len) {
-        NMA_UNLOCK();
-        netmap_free(buff, "not enough bufs");
-        goto error;
-    }
-    /*
-     * in the kring, store the pointers to the shared rings
-     * and initialize the rings. We are under NMA_LOCK().
-     */
-    ofs = 0;
-    for (i = 0; i < n; i++) {
-        struct netmap_kring *kring;
-        int numdesc;
-
-        /* Transmit rings */
-        kring = &na->tx_rings[i];
-        numdesc = na->num_tx_desc;
-        bzero(kring, sizeof(*kring));
-        kring->na = na;
-
-        ring = kring->ring = (struct netmap_ring *)(buff + ofs);
-        *(ssize_t *)(uintptr_t)&ring->buf_ofs =
-            nm_buf_pool.base - (char *)ring;
-        ND("txring[%d] at %p ofs %d", i, ring, ring->buf_ofs);
-        *(int *)(int *)(uintptr_t)&ring->num_slots =
-            kring->nkr_num_slots = numdesc;
-
-        /*
-         * IMPORTANT:
-         * Always keep one slot empty, so we can detect new
-         * transmissions comparing cur and nr_hwcur (they are
-         * the same only if there are no new transmissions).
-         */
-        ring->avail = kring->nr_hwavail = numdesc - 1;
-        ring->cur = kring->nr_hwcur = 0;
-        netmap_new_bufs(nifp, ring->slot, numdesc);
-
-        ofs += sizeof(struct netmap_ring) +
-            numdesc * sizeof(struct netmap_slot);
-
-        /* Receive rings */
-        kring = &na->rx_rings[i];
-        numdesc = na->num_rx_desc;
-        bzero(kring, sizeof(*kring));
-        kring->na = na;
-
-        ring = kring->ring = (struct netmap_ring *)(buff + ofs);
-        *(ssize_t *)(uintptr_t)&ring->buf_ofs =
-            nm_buf_pool.base - (char *)ring;
-        ND("rxring[%d] at %p offset %d", i, ring, ring->buf_ofs);
-        *(int *)(int *)(uintptr_t)&ring->num_slots =
-            kring->nkr_num_slots = numdesc;
-        ring->cur = kring->nr_hwcur = 0;
-        ring->avail = kring->nr_hwavail = 0; /* empty */
-        netmap_new_bufs(nifp, ring->slot, numdesc);
-        ofs += sizeof(struct netmap_ring) +
-            numdesc * sizeof(struct netmap_slot);
-    }
-    NMA_UNLOCK();
-    for (i = 0; i < n+1; i++) {
-        // XXX initialize the selrecord structs.
-    }
-final:
-    /*
-     * fill the slots for the rx and tx queues. They contain the offset
-     * between the ring and nifp, so the information is usable in
-     * userspace to reach the ring from the nifp.
-     */
-    for (i = 0; i < n; i++) {
-        char *base = (char *)nifp;
-        *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i] =
-            (char *)na->tx_rings[i].ring - base;
-        *(ssize_t *)(uintptr_t)&nifp->ring_ofs[i+n] =
-            (char *)na->rx_rings[i].ring - base;
+            ND("rx queue %d", i);
+            ring = na->rx_rings[i].ring;
+            lim = na->rx_rings[i].nkr_num_slots;
+            for (j = 0; j < lim; j++)
+                netmap_free_buf(nifp, ring->slot[j].buf_idx);
+        }
+        NMA_UNLOCK();
+        netmap_free_rings(na);
+        wakeup(na);
     }
-    return (nifp);
+    netmap_if_free(nifp);
+}
+
+
+static void
+netmap_dtor(void *data)
+{
+    struct netmap_priv_d *priv = data;
+    struct ifnet *ifp = priv->np_ifp;
+    struct netmap_adapter *na = NA(ifp);
+
+    na->nm_lock(ifp->if_softc, NETMAP_CORE_LOCK, 0);
+    netmap_dtor_locked(data);
+    na->nm_lock(ifp->if_softc, NETMAP_CORE_UNLOCK, 0);
+
+    if_rele(ifp);
+    bzero(priv, sizeof(*priv));  /* XXX for safety */
+    free(priv, M_DEVBUF);
 }
 
@@ -894,7 +1140,6 @@ netmap_ioctl(__unused struct cdev *dev,
             break;
         }
 
-
         for (i = 10; i > 0; i--) {
             na->nm_lock(adapter, NETMAP_CORE_LOCK, 0);
             if (!NETMAP_DELETING(na))
@@ -924,25 +1169,16 @@ netmap_ioctl(__unused struct cdev *dev,
          * and make it use the shared buffers.
          */
         error = na->nm_register(ifp, 1); /* mode on */
-        if (error) {
-            /*
-             * do something similar to netmap_dtor().
-             */
-            netmap_free_rings(na);
-            // XXX tx_rings is inline, must not be freed.
-            // free(na->tx_rings, M_DEVBUF); // XXX wrong ?
-            na->tx_rings = na->rx_rings = NULL;
-            na->refcount--;
-            netmap_if_free(nifp);
-            nifp = NULL;
-        }
+        if (error)
+            netmap_dtor_locked(priv);
     }
 
     if (error) {  /* reg. failed, release priv and ref */
error:
         na->nm_lock(adapter, NETMAP_CORE_UNLOCK, 0);
-        free(priv, M_DEVBUF);
         if_rele(ifp);  /* return the refcount */
+        bzero(priv, sizeof(*priv));
+        free(priv, M_DEVBUF);
         break;
     }
@@ -1166,6 +1402,12 @@ netmap_poll(__unused struct cdev *dev, i
     if (priv->np_txpoll || want_tx) {
         for (i = priv->np_qfirst; i < priv->np_qlast; i++) {
             kring = &na->tx_rings[i];
+            /*
+             * Skip the current ring if want_tx == 0
+             * (we have already done a successful sync on
+             * a previous ring) AND kring->cur == kring->hwcur
+             * (there are no pending transmissions for this ring).
+             */
             if (!want_tx && kring->ring->cur == kring->nr_hwcur)
                 continue;
             if (core_lock == NEED_CL) {
@@ -1181,6 +1423,7 @@ netmap_poll(__unused struct cdev *dev, i
             if (na->nm_txsync(adapter, i, 0 /* no lock */))
                 revents |= POLLERR;
 
+            /* Check avail/call selrecord only if called with POLLOUT */
             if (want_tx) {
                 if (kring->ring->avail > 0) {
                     /* stop at the first ring. We don't risk
@@ -1212,9 +1455,10 @@ netmap_poll(__unused struct cdev *dev, i
             if (na->nm_rxsync(adapter, i, 0 /* no lock */))
                 revents |= POLLERR;
 
-            if (no_timestamp == 0 ||
-                kring->ring->flags & NR_TIMESTAMP)
+            if (netmap_no_timestamp == 0 ||

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
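The first-fit allocator added above (netmap_malloc()/netmap_free()) is
easiest to study in isolation. Below is a self-contained userspace reduction
of the same idea, under stated assumptions: one contiguous arena, a TAILQ of
chunk descriptors, split-on-allocate, and coalescing with both neighbours on
free. The arena_* names are illustrative and not part of the kernel source;
locking, the M_NETMAP malloc type and the diagnostics are omitted.

/*
 * Sketch of first-fit allocation over a contiguous arena, modelled on
 * netmap_malloc()/netmap_free() in this commit. Compiles as a normal
 * userspace program; sys/queue.h provides the TAILQ macros.
 */
#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct chunk {
    TAILQ_ENTRY(chunk) next;
    int    used;   /* 1 while the range is allocated */
    size_t size;   /* bytes covered by this descriptor */
    char   *data;  /* start of the range inside the arena */
};
TAILQ_HEAD(chunk_list, chunk);

static struct chunk_list arena = TAILQ_HEAD_INITIALIZER(arena);

static void
arena_init(char *base, size_t len)
{
    struct chunk *c = calloc(1, sizeof(*c));

    /* the whole arena starts as a single free chunk */
    c->size = len;
    c->data = base;
    TAILQ_INSERT_HEAD(&arena, c, next);
}

/* First fit: split the first free chunk that is large enough. */
static void *
arena_malloc(size_t size)
{
    struct chunk *c, *n;

    TAILQ_FOREACH(c, &arena, next) {
        if (c->used || c->size < size)
            continue;
        n = calloc(1, sizeof(*n));
        n->used = 1;
        n->size = size;
        n->data = c->data;
        TAILQ_INSERT_BEFORE(c, n, next);
        c->size -= size;
        c->data += size;
        if (c->size == 0) {          /* exact fit: drop empty remainder */
            TAILQ_REMOVE(&arena, c, next);
            free(c);
        }
        return (n->data);
    }
    return (NULL);                   /* no free chunk is big enough */
}

/* Free: mark the chunk unused, then merge with free neighbours. */
static void
arena_free(void *addr)
{
    struct chunk *c, *prev, *nxt;

    TAILQ_FOREACH(c, &arena, next)
        if (c->data == addr && c->used)
            break;
    if (c == NULL)                   /* not an allocated chunk */
        return;
    c->used = 0;
    prev = TAILQ_PREV(c, chunk_list, next);
    if (prev && !prev->used) {       /* absorb into previous free chunk */
        prev->size += c->size;
        TAILQ_REMOVE(&arena, c, next);
        free(c);
        c = prev;
    }
    nxt = TAILQ_NEXT(c, next);
    if (nxt && !nxt->used) {         /* absorb following free chunk */
        c->size += nxt->size;
        TAILQ_REMOVE(&arena, nxt, next);
        free(nxt);
    }
}

int
main(void)
{
    static char pool[4096];
    void *a, *b;

    arena_init(pool, sizeof(pool));
    a = arena_malloc(512);
    b = arena_malloc(1024);
    arena_free(a);                   /* leaves a hole at the front */
    arena_free(b);                   /* coalesces back to one free chunk */
    printf("a=%p b=%p\n", a, b);
    return (0);
}

As in the kernel code, a linear walk is acceptable because allocations happen
once, at interface registration. netmap_memory_init() additionally reserves
only sz/200 of the contigmalloc'd region for rings (a buffer is 2 KB and a
slot 8 bytes, so 1/200 is a conservative upper bound for ring metadata) and
leaves the page-aligned remainder to the packet buffer pool.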