Date:      Fri, 19 Oct 2012 04:13:12 +0000 (UTC)
From:      Luigi Rizzo <luigi@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r241719 - head/sys/dev/netmap
Message-ID:  <201210190413.q9J4DC0V091484@svn.freebsd.org>

Author: luigi
Date: Fri Oct 19 04:13:12 2012
New Revision: 241719
URL: http://svn.freebsd.org/changeset/base/241719

Log:
  This is an import of code, mostly from Giuseppe Lettieri,
  that revises the netmap memory allocator so that the
  various parameters (number and size of buffers, rings, descriptors)
  can be modified at runtime through sysctl variables.
  The changes become effective when no netmap clients are active.
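
  For example, the new pool parameters can be tuned from a C program
  with sysctlbyname(3) while no netmap client is active. A minimal
  sketch follows; the exact sysctl name ("dev.netmap.ring_size") is
  an assumption, since the part of the diff that defines the sysctls
  is truncated below:

	#include <sys/types.h>
	#include <sys/sysctl.h>
	#include <stdio.h>

	int
	main(void)
	{
		int val = 9 * 4096;	/* nine 4 KB pages, the default ring size */

		/* "dev.netmap.ring_size" is an assumed sysctl name */
		if (sysctlbyname("dev.netmap.ring_size", NULL, NULL,
		    &val, sizeof(val)) == -1)
			perror("sysctlbyname");
		return (0);
	}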
  
  The API is mostly unchanged, although the NIOCUNREGIF ioctl no
  longer brings the interface back to normal mode; you now need to
  close the file descriptor for that.
  This change was necessary to track who is using the mapped region,
  and since it simplifies the API there was no incentive to try to
  preserve NIOCUNREGIF.
  We will remove the ioctl from the kernel next time we need
  a real API change (and version bump).
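
  Under the new semantics, a typical client lifecycle looks like the
  sketch below (includes and error handling omitted; "em0" is just a
  placeholder interface name):

	struct nmreq req;
	void *mem;
	int fd;

	fd = open("/dev/netmap", O_RDWR);
	bzero(&req, sizeof(req));
	req.nr_version = NETMAP_API;
	strncpy(req.nr_name, "em0", sizeof(req.nr_name));
	ioctl(fd, NIOCREGIF, &req);		/* put em0 into netmap mode */
	mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
	    MAP_SHARED, fd, 0);			/* map rings and buffers */
	/* ... move packets through the rings ... */
	close(fd);	/* this, not NIOCUNREGIF, restores normal mode */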
  
  Among other things, buffer allocation when opening devices is
  now much faster: it used to take O(N^2) time; now it is linear.
  
  Submitted by:	Giuseppe Lettieri

Modified:
  head/sys/dev/netmap/netmap.c
  head/sys/dev/netmap/netmap_kern.h
  head/sys/dev/netmap/netmap_mem2.c

Modified: head/sys/dev/netmap/netmap.c
==============================================================================
--- head/sys/dev/netmap/netmap.c	Fri Oct 19 03:01:25 2012	(r241718)
+++ head/sys/dev/netmap/netmap.c	Fri Oct 19 04:13:12 2012	(r241719)
@@ -98,15 +98,8 @@ MALLOC_DEFINE(M_NETMAP, "netmap", "Netwo
 #include <net/netmap.h>
 #include <dev/netmap/netmap_kern.h>
 
-/*
- * lock and unlock for the netmap memory allocator
- */
-#define NMA_LOCK()	mtx_lock(&nm_mem->nm_mtx);
-#define NMA_UNLOCK()	mtx_unlock(&nm_mem->nm_mtx);
-struct netmap_mem_d;
-static struct netmap_mem_d *nm_mem;	/* Our memory allocator. */
-
 u_int netmap_total_buffers;
+u_int netmap_buf_size;
 char *netmap_buffer_base;	/* address of an invalid buffer */
 
 /* user-controlled variables */
@@ -119,10 +112,6 @@ SYSCTL_INT(_dev_netmap, OID_AUTO, verbos
     CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
     CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
-u_int netmap_buf_size = 2048;
-TUNABLE_INT("hw.netmap.buf_size", (u_int *)&netmap_buf_size);
-SYSCTL_INT(_dev_netmap, OID_AUTO, buf_size,
-    CTLFLAG_RD, &netmap_buf_size, 0, "Size of packet buffers");
 int netmap_mitigate = 1;
 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
 int netmap_no_pendintr = 1;
@@ -294,23 +283,62 @@ nm_find_bridge(const char *name)
 #endif /* !NETMAP_MEM2 */
 /*------------ end of memory allocator ----------*/
 
-/* Structure associated to each thread which registered an interface. */
+
+/* Structure associated with each thread that registered an interface.
+ *
+ * The first 4 fields of this structure are written by NIOCREGIF and
+ * read by poll() and NIOC?XSYNC.
+ * There is low contention among writers (actually, a correct user program
+ * should have no contention among writers) and among writers and readers,
+ * so we use a single global lock to protect the structure initialization.
+ * Since initialization involves the allocation of memory, we reuse the memory
+ * allocator lock.
+ * Read access to the structure is lock free. Readers must check that
+ * np_nifp is not NULL before using the other fields.
+ * If np_nifp is NULL, initialization has not been performed, so they should
+ * return an error to userlevel.
+ *
+ * The ref_done field is used to regulate access to the refcount in the
+ * memory allocator. The refcount must be incremented at most once for
+ * each open("/dev/netmap"). The increment is performed by the first
+ * function that calls netmap_get_memory() (currently called by
+ * mmap(), NIOCGINFO and NIOCREGIF).
+ * If the refcount is incremented, it is then decremented when the
+ * private structure is destroyed.
+ */
 struct netmap_priv_d {
-	struct netmap_if *np_nifp;	/* netmap interface descriptor. */
+	struct netmap_if * volatile np_nifp;	/* netmap interface descriptor. */
 
 	struct ifnet	*np_ifp;	/* device for which we hold a reference */
 	int		np_ringid;	/* from the ioctl */
 	u_int		np_qfirst, np_qlast;	/* range of rings to scan */
 	uint16_t	np_txpoll;
+
+	unsigned long	ref_done;	/* use with NMA_LOCK held */
 };
 
 
+static int
+netmap_get_memory(struct netmap_priv_d* p)
+{
+	int error = 0;
+	NMA_LOCK();
+	if (!p->ref_done) {
+		error = netmap_memory_finalize();
+		if (!error)
+			p->ref_done = 1;
+	}
+	NMA_UNLOCK();
+	return error;
+}
+
 /*
  * File descriptor's private data destructor.
  *
  * Call nm_register(ifp,0) to stop netmap mode on the interface and
  * revert to normal operation. We expect that np_ifp has not gone.
  */
+/* call with NMA_LOCK held */
 static void
 netmap_dtor_locked(void *data)
 {
@@ -350,7 +378,6 @@ netmap_dtor_locked(void *data)
 		selwakeuppri(&na->tx_si, PI_NET);
 		selwakeuppri(&na->rx_si, PI_NET);
 		/* release all buffers */
-		NMA_LOCK();
 		for (i = 0; i < na->num_tx_rings + 1; i++) {
 			struct netmap_ring *ring = na->tx_rings[i].ring;
 			lim = na->tx_rings[i].nkr_num_slots;
@@ -370,7 +397,6 @@ netmap_dtor_locked(void *data)
 		/* XXX kqueue(9) needed; these will mirror knlist_init. */
 		/* knlist_destroy(&na->tx_si.si_note); */
 		/* knlist_destroy(&na->rx_si.si_note); */
-		NMA_UNLOCK();
 		netmap_free_rings(na);
 		wakeup(na);
 	}
@@ -403,7 +429,7 @@ nm_if_rele(struct ifnet *ifp)
 			bzero(ifp, sizeof(*ifp));
 			free(ifp, M_DEVBUF);
 			break;
-		} 
+		}
 		else if (b->bdg_ports[i] != NULL)
 			full = 1;
 	}
@@ -423,17 +449,83 @@ netmap_dtor(void *data)
 {
 	struct netmap_priv_d *priv = data;
 	struct ifnet *ifp = priv->np_ifp;
-	struct netmap_adapter *na = NA(ifp);
+	struct netmap_adapter *na;
 
-	na->nm_lock(ifp, NETMAP_REG_LOCK, 0);
-	netmap_dtor_locked(data);
-	na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0);
+	NMA_LOCK();
+	if (ifp) {
+		na = NA(ifp);
+		na->nm_lock(ifp, NETMAP_REG_LOCK, 0);
+		netmap_dtor_locked(data);
+		na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0);
 
-	nm_if_rele(ifp);
+		nm_if_rele(ifp);
+	}
+	if (priv->ref_done) {
+		netmap_memory_deref();
+	}
+	NMA_UNLOCK();
 	bzero(priv, sizeof(*priv));	/* XXX for safety */
 	free(priv, M_DEVBUF);
 }
 
+#ifdef __FreeBSD__
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+#include <vm/uma.h>
+
+static struct cdev_pager_ops saved_cdev_pager_ops;
+
+static int
+netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
+    vm_ooffset_t foff, struct ucred *cred, u_short *color)
+{
+	D("first mmap for %p", handle);
+	return saved_cdev_pager_ops.cdev_pg_ctor(handle,
+			size, prot, foff, cred, color);
+}
+
+static void
+netmap_dev_pager_dtor(void *handle)
+{
+	saved_cdev_pager_ops.cdev_pg_dtor(handle);
+	D("ready to release memory for %p", handle);
+}
+
+
+static struct cdev_pager_ops netmap_cdev_pager_ops = {
+        .cdev_pg_ctor = netmap_dev_pager_ctor,
+        .cdev_pg_dtor = netmap_dev_pager_dtor,
+        .cdev_pg_fault = NULL,
+};
+
+static int
+netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff,
+	vm_size_t objsize,  vm_object_t *objp, int prot)
+{
+	vm_object_t obj;
+
+	D("cdev %p foff %d size %d objp %p prot %d", cdev, *foff,
+		objsize, objp, prot);
+	obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
+            curthread->td_ucred);
+	ND("returns obj %p", obj);
+	if (obj == NULL)
+		return EINVAL;
+	if (saved_cdev_pager_ops.cdev_pg_fault == NULL) {
+		D("initialize cdev_pager_ops");
+		saved_cdev_pager_ops = *(obj->un_pager.devp.ops);
+		netmap_cdev_pager_ops.cdev_pg_fault =
+			saved_cdev_pager_ops.cdev_pg_fault;
+	};
+	obj->un_pager.devp.ops = &netmap_cdev_pager_ops;
+	*objp = obj;
+	return 0;
+}
+#endif /* __FreeBSD__ */
+
 
 /*
  * mmap(2) support for the "netmap" device.
@@ -456,13 +548,50 @@ netmap_mmap(__unused struct cdev *dev,
 #endif
 	)
 {
+	int error = 0;
+	struct netmap_priv_d *priv;
+
 	if (nprot & PROT_EXEC)
 		return (-1);	// XXX -1 or EINVAL ?
 
+	error = devfs_get_cdevpriv((void **)&priv);
+	if (error == EBADF) {	/* called on fault, memory is initialized */
+		ND(5, "handling fault at ofs 0x%x", offset);
+		error = 0;
+	} else if (error == 0)	/* make sure memory is set */
+		error = netmap_get_memory(priv);
+	if (error)
+		return (error);
+
 	ND("request for offset 0x%x", (uint32_t)offset);
 	*paddr = netmap_ofstophys(offset);
 
-	return (0);
+	return (*paddr ? 0 : ENOMEM);
+}
+
+static int
+netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
+{
+	D("dev %p fflag 0x%x devtype %d td %p", dev, fflag, devtype, td);
+	return 0;
+}
+
+static int
+netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
+{
+	struct netmap_priv_d *priv;
+	int error;
+
+	priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF,
+			      M_NOWAIT | M_ZERO);
+	if (priv == NULL)
+		return ENOMEM;
+
+	error = devfs_set_cdevpriv(priv, netmap_dtor);
+	if (error)
+	        return error;
+
+	return 0;
 }
 #endif /* __FreeBSD__ */
 
@@ -650,7 +779,7 @@ no_port:
 	/* can do this if the capability exists and if_pspare[0]
 	 * points to the netmap descriptor.
 	 */
-	if ((*ifp)->if_capabilities & IFCAP_NETMAP && NA(*ifp))
+	if (NETMAP_CAPABLE(*ifp))
 		return 0;	/* valid pointer, we hold the refcount */
 	nm_if_rele(*ifp);
 	return EINVAL;	// not NETMAP capable
@@ -676,7 +805,7 @@ netmap_ring_reinit(struct netmap_kring *
 	u_int i, lim = kring->nkr_num_slots - 1;
 	int errors = 0;
 
-	D("called for %s", kring->na->ifp->if_xname);
+	RD(10, "called for %s", kring->na->ifp->if_xname);
 	if (ring->cur > lim)
 		errors++;
 	for (i = 0; i <= lim; i++) {
@@ -698,9 +827,9 @@ netmap_ring_reinit(struct netmap_kring *
 		int pos = kring - kring->na->tx_rings;
 		int n = kring->na->num_tx_rings + 1;
 
-		D("total %d errors", errors);
+		RD(10, "total %d errors", errors);
 		errors++;
-		D("%s %s[%d] reinit, cur %d -> %d avail %d -> %d",
+		RD(10, "%s %s[%d] reinit, cur %d -> %d avail %d -> %d",
 			kring->na->ifp->if_xname,
 			pos < n ?  "TX" : "RX", pos < n ? pos : pos - n,
 			ring->cur, kring->nr_hwcur,
@@ -803,20 +932,16 @@ netmap_ioctl(struct cdev *dev, u_long cm
 	CURVNET_SET(TD_TO_VNET(td));
 
 	error = devfs_get_cdevpriv((void **)&priv);
-	if (error != ENOENT && error != 0) {
+	if (error) {
 		CURVNET_RESTORE();
-		return (error);
+		/* XXX ENOENT should be impossible, since the priv
+		 * is now created in the open */
+		return (error == ENOENT ? ENXIO : error);
 	}
 
-	error = 0;	/* Could be ENOENT */
 	nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0';	/* truncate name */
 	switch (cmd) {
 	case NIOCGINFO:		/* return capabilities etc */
-		/* memsize is always valid */
-		nmr->nr_memsize = nm_mem->nm_totalsize;
-		nmr->nr_offset = 0;
-		nmr->nr_rx_rings = nmr->nr_tx_rings = 0;
-		nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
 		if (nmr->nr_version != NETMAP_API) {
 			D("API mismatch got %d have %d",
 				nmr->nr_version, NETMAP_API);
@@ -824,6 +949,16 @@ netmap_ioctl(struct cdev *dev, u_long cm
 			error = EINVAL;
 			break;
 		}
+		/* update configuration */
+		error = netmap_get_memory(priv);
+		ND("get_memory returned %d", error);
+		if (error)
+			break;
+		/* memsize is always valid */
+		nmr->nr_memsize = nm_mem.nm_totalsize;
+		nmr->nr_offset = 0;
+		nmr->nr_rx_rings = nmr->nr_tx_rings = 0;
+		nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
 		if (nmr->nr_name[0] == '\0')	/* just get memory info */
 			break;
 		error = get_ifp(nmr->nr_name, &ifp); /* get a refcount */
@@ -843,26 +978,26 @@ netmap_ioctl(struct cdev *dev, u_long cm
 			error = EINVAL;
 			break;
 		}
-		if (priv != NULL) {	/* thread already registered */
+		/* ensure allocators are ready */
+		error = netmap_get_memory(priv);
+		ND("get_memory returned %d", error);
+		if (error)
+			break;
+
+		/* protect access to priv from concurrent NIOCREGIF */
+		NMA_LOCK();
+		if (priv->np_ifp != NULL) {	/* thread already registered */
 			error = netmap_set_ringid(priv, nmr->nr_ringid);
+			NMA_UNLOCK();
 			break;
 		}
 		/* find the interface and a reference */
 		error = get_ifp(nmr->nr_name, &ifp); /* keep reference */
-		if (error)
-			break;
-		na = NA(ifp); /* retrieve netmap adapter */
-		/*
-		 * Allocate the private per-thread structure.
-		 * XXX perhaps we can use a blocking malloc ?
-		 */
-		priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF,
-			      M_NOWAIT | M_ZERO);
-		if (priv == NULL) {
-			error = ENOMEM;
-			nm_if_rele(ifp);   /* return the refcount */
+		if (error) {
+			NMA_UNLOCK();
 			break;
 		}
+		na = NA(ifp); /* retrieve netmap adapter */
 
 		for (i = 10; i > 0; i--) {
 			na->nm_lock(ifp, NETMAP_REG_LOCK, 0);
@@ -874,8 +1009,8 @@ netmap_ioctl(struct cdev *dev, u_long cm
 		if (i == 0) {
 			D("too many NIOCREGIF attempts, give up");
 			error = EINVAL;
-			free(priv, M_DEVBUF);
 			nm_if_rele(ifp);	/* return the refcount */
+			NMA_UNLOCK();
 			break;
 		}
 
@@ -883,7 +1018,7 @@ netmap_ioctl(struct cdev *dev, u_long cm
 		error = netmap_set_ringid(priv, nmr->nr_ringid);
 		if (error)
 			goto error;
-		priv->np_nifp = nifp = netmap_if_new(nmr->nr_name, na);
+		nifp = netmap_if_new(nmr->nr_name, na);
 		if (nifp == NULL) { /* allocation failed */
 			error = ENOMEM;
 		} else if (ifp->if_capenable & IFCAP_NETMAP) {
@@ -898,57 +1033,66 @@ netmap_ioctl(struct cdev *dev, u_long cm
 				mtx_init(&na->rx_rings[i].q_lock, "nm_rxq_lock", MTX_NETWORK_LOCK, MTX_DEF);
 			}
 			error = na->nm_register(ifp, 1); /* mode on */
-			if (error)
+			if (error) {
 				netmap_dtor_locked(priv);
+				netmap_if_free(nifp);
+			}
 		}
 
 		if (error) {	/* reg. failed, release priv and ref */
 error:
 			na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0);
 			nm_if_rele(ifp);	/* return the refcount */
-			bzero(priv, sizeof(*priv));
-			free(priv, M_DEVBUF);
+			priv->np_ifp = NULL;
+			priv->np_nifp = NULL;
+			NMA_UNLOCK();
 			break;
 		}
 
 		na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0);
-		error = devfs_set_cdevpriv(priv, netmap_dtor);
 
-		if (error != 0) {
-			/* could not assign the private storage for the
-			 * thread, call the destructor explicitly.
-			 */
-			netmap_dtor(priv);
-			break;
-		}
+		/* the following assignment is a commitment.
+		 * Readers (i.e., poll and *SYNC) check for
+		 * np_nifp != NULL without locking
+		 */
+		wmb(); /* make sure previous writes are visible to all CPUs */
+		priv->np_nifp = nifp;
+		NMA_UNLOCK();
 
 		/* return the offset of the netmap_if object */
 		nmr->nr_rx_rings = na->num_rx_rings;
 		nmr->nr_tx_rings = na->num_tx_rings;
 		nmr->nr_rx_slots = na->num_rx_desc;
 		nmr->nr_tx_slots = na->num_tx_desc;
-		nmr->nr_memsize = nm_mem->nm_totalsize;
+		nmr->nr_memsize = nm_mem.nm_totalsize;
 		nmr->nr_offset = netmap_if_offset(nifp);
 		break;
 
 	case NIOCUNREGIF:
-		if (priv == NULL) {
+		// XXX we have no data here ?
+		D("deprecated, data is %p", nmr);
+		error = EINVAL;
+		break;
+
+	case NIOCTXSYNC:
+	case NIOCRXSYNC:
+		nifp = priv->np_nifp;
+
+		if (nifp == NULL) {
 			error = ENXIO;
 			break;
 		}
+		rmb(); /* make sure following reads are not from cache */
 
-		/* the interface is unregistered inside the
-		   destructor of the private data. */
-		devfs_clear_cdevpriv();
-		break;
 
-	case NIOCTXSYNC:
-        case NIOCRXSYNC:
-		if (priv == NULL) {
+		ifp = priv->np_ifp;	/* we have a reference */
+
+		if (ifp == NULL) {
+			D("Internal error: nifp != NULL && ifp == NULL");
 			error = ENXIO;
 			break;
 		}
-		ifp = priv->np_ifp;	/* we have a reference */
+
 		na = NA(ifp); /* retrieve netmap adapter */
 		if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */
 			if (cmd == NIOCTXSYNC)
@@ -1047,6 +1191,12 @@ netmap_poll(struct cdev *dev, int events
 	if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL)
 		return POLLERR;
 
+	if (priv->np_nifp == NULL) {
+		D("No if registered");
+		return POLLERR;
+	}
+	rmb(); /* make sure following reads are not from cache */
+
 	ifp = priv->np_ifp;
 	// XXX check for deleting() ?
 	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0)
@@ -1322,7 +1472,7 @@ netmap_attach(struct netmap_adapter *na,
 		na->tx_rings = (void *)((char *)buf + sizeof(*na));
 		na->rx_rings = na->tx_rings + na->num_tx_rings + 1;
 		bcopy(na, buf, sizeof(*na));
-		ifp->if_capabilities |= IFCAP_NETMAP;
+		NETMAP_SET_CAPABLE(ifp);
 
 		na = buf;
 		/* Core lock initialized here.  Others are initialized after
@@ -1337,7 +1487,7 @@ netmap_attach(struct netmap_adapter *na,
 	}
 #ifdef linux
 	if (ifp->netdev_ops) {
-		D("netdev_ops %p", ifp->netdev_ops);
+		ND("netdev_ops %p", ifp->netdev_ops);
 		/* prepare a clone of the netdev ops */
 		na->nm_ndo = *ifp->netdev_ops;
 	}
@@ -1440,9 +1590,13 @@ netmap_reset(struct netmap_adapter *na, 
 		return NULL;	/* nothing to reinitialize */
 
 	if (tx == NR_TX) {
+		if (n >= na->num_tx_rings)
+			return NULL;
 		kring = na->tx_rings + n;
 		new_hwofs = kring->nr_hwcur - new_cur;
 	} else {
+		if (n >= na->num_rx_rings)
+			return NULL;
 		kring = na->rx_rings + n;
 		new_hwofs = kring->nr_hwcur + kring->nr_hwavail - new_cur;
 	}
@@ -1454,7 +1608,7 @@ netmap_reset(struct netmap_adapter *na, 
 	kring->nkr_hwofs = new_hwofs;
 	if (tx == NR_TX)
 		kring->nr_hwavail = kring->nkr_num_slots - 1;
-	D("new hwofs %d on %s %s[%d]",
+	ND(10, "new hwofs %d on %s %s[%d]",
 			kring->nkr_hwofs, na->ifp->if_xname,
 			tx == NR_TX ? "TX" : "RX", n);
 
@@ -1501,12 +1655,22 @@ netmap_rx_irq(struct ifnet *ifp, int q, 
 
 	if (!(ifp->if_capenable & IFCAP_NETMAP))
 		return 0;
+	ND(5, "received %s queue %d", work_done ? "RX" : "TX" , q);
 	na = NA(ifp);
+	if (na->na_flags & NAF_SKIP_INTR) {
+		ND("use regular interrupt");
+		return 0;
+	}
+
 	if (work_done) { /* RX path */
+		if (q >= na->num_rx_rings)
+			return 0;	// regular queue
 		r = na->rx_rings + q;
 		r->nr_kflags |= NKR_PENDINTR;
 		main_wq = (na->num_rx_rings > 1) ? &na->rx_si : NULL;
 	} else { /* tx path */
+		if (q >= na->num_tx_rings)
+			return 0;	// regular queue
 		r = na->tx_rings + q;
 		main_wq = (na->num_tx_rings > 1) ? &na->tx_si : NULL;
 		work_done = &q; /* dummy */
@@ -1560,38 +1724,65 @@ linux_netmap_mmap(struct file *f, struct
 	int lut_skip, i, j;
 	int user_skip = 0;
 	struct lut_entry *l_entry;
-	const struct netmap_obj_pool *p[] = {
-		nm_mem->nm_if_pool,
-		nm_mem->nm_ring_pool,
-		nm_mem->nm_buf_pool };
+	int error = 0;
+	unsigned long off, tomap;
 	/*
 	 * vma->vm_start: start of mapping user address space
 	 * vma->vm_end: end of the mapping user address space
+	 * vma->vm_pgoff: offset of first page in the device
 	 */
 
-	(void)f;	/* UNUSED */
 	// XXX security checks
 
-	for (i = 0; i < 3; i++) {  /* loop through obj_pools */
+	error = netmap_get_memory(f->private_data);
+	ND("get_memory returned %d", error);
+	if (error)
+	    return -error;
+
+	off = vma->vm_pgoff << PAGE_SHIFT; /* offset in bytes */
+	tomap = vma->vm_end - vma->vm_start;
+	for (i = 0; i < NETMAP_POOLS_NR; i++) {  /* loop through obj_pools */
+		const struct netmap_obj_pool *p = &nm_mem.pools[i];
 		/*
 		 * In each pool memory is allocated in clusters
-		 * of size _clustsize , each containing clustentries
+		 * of size _clustsize, each containing clustentries
 		 * entries. For each object k we already store the
-		 * vtophys malling in lut[k] so we use that, scanning
+		 * vtophys mapping in lut[k] so we use that, scanning
 		 * the lut[] array in steps of clustentries,
 		 * and we map each cluster (not individual pages,
 		 * it would be overkill).
 		 */
-		for (lut_skip = 0, j = 0; j < p[i]->_numclusters; j++) {
-			l_entry = &p[i]->lut[lut_skip];
+
+		/*
+		 * We interpret vm_pgoff as an offset into the whole
+		 * netmap memory, as if all clusters were contiguous.
+		 */
+		for (lut_skip = 0, j = 0; j < p->_numclusters; j++, lut_skip += p->clustentries) {
+			unsigned long paddr, mapsize;
+			if (p->_clustsize <= off) {
+				off -= p->_clustsize;
+				continue;
+			}
+			l_entry = &p->lut[lut_skip]; /* first obj in the cluster */
+			paddr = l_entry->paddr + off;
+			mapsize = p->_clustsize - off;
+			off = 0;
+			if (mapsize > tomap)
+				mapsize = tomap;
+			ND("remap_pfn_range(%lx, %lx, %lx)",
+				vma->vm_start + user_skip,
+				paddr >> PAGE_SHIFT, mapsize);
 			if (remap_pfn_range(vma, vma->vm_start + user_skip,
-					l_entry->paddr >> PAGE_SHIFT, p[i]->_clustsize,
+					paddr >> PAGE_SHIFT, mapsize,
 					vma->vm_page_prot))
 				return -EAGAIN; // XXX check return value
-			lut_skip += p[i]->clustentries;
-			user_skip += p[i]->_clustsize;
+			user_skip += mapsize;
+			tomap -= mapsize;
+			if (tomap == 0)
+				goto done;
 		}
 	}
+done:
 
 	return 0;
 }
@@ -1636,8 +1827,24 @@ netmap_release(struct inode *inode, stru
 	return (0);
 }
 
+static int
+linux_netmap_open(struct inode *inode, struct file *file)
+{
+	struct netmap_priv_d *priv;
+	(void)inode;	/* UNUSED */
+
+	priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF,
+			      M_NOWAIT | M_ZERO);
+	if (priv == NULL)
+		return -ENOMEM;
+
+	file->private_data = priv;
+
+	return (0);
+}
 
 static struct file_operations netmap_fops = {
+    .open = linux_netmap_open,
     .mmap = linux_netmap_mmap,
     LIN_IOCTL_NAME = linux_netmap_ioctl,
     .poll = linux_netmap_poll,
@@ -1683,9 +1890,12 @@ MODULE_LICENSE("Dual BSD/GPL"); /* the c
 static struct cdevsw netmap_cdevsw = {
 	.d_version = D_VERSION,
 	.d_name = "netmap",
+	.d_open = netmap_open,
 	.d_mmap = netmap_mmap,
+	.d_mmap_single = netmap_mmap_single,
 	.d_ioctl = netmap_ioctl,
 	.d_poll = netmap_poll,
+	.d_close = netmap_close,
 };
 #endif /* __FreeBSD__ */
 
@@ -2048,8 +2258,7 @@ netmap_init(void)
 		printf("netmap: unable to initialize the memory allocator.\n");
 		return (error);
 	}
-	printf("netmap: loaded module with %d Mbytes\n",
-		(int)(nm_mem->nm_totalsize >> 20));
+	printf("netmap: loaded module\n");
 	netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
 			      "netmap");
 

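The netmap.c changes above hinge on a lock-free publication protocol
for priv->np_nifp: the writer (NIOCREGIF) initializes every other
field, issues a write barrier, and only then stores the pointer;
readers (poll() and the *SYNC ioctls) load the pointer, bail out if
it is NULL, and issue a read barrier before touching the other
fields. A condensed sketch of the pattern (not a literal excerpt
from the diff):

	/* writer, in NIOCREGIF, under NMA_LOCK */
	priv->np_ifp = ifp;		/* ... and all the other fields ... */
	wmb();				/* field writes visible before the publish */
	priv->np_nifp = nifp;		/* publish: readers may now proceed */

	/* reader, lock free, in poll() and NIOC?XSYNC */
	nifp = priv->np_nifp;
	if (nifp == NULL)
		return (ENXIO);		/* registration not complete yet */
	rmb();				/* order the pointer read before field reads */
	ifp = priv->np_ifp;		/* safe: written before the publish */
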
Modified: head/sys/dev/netmap/netmap_kern.h
==============================================================================
--- head/sys/dev/netmap/netmap_kern.h	Fri Oct 19 03:01:25 2012	(r241718)
+++ head/sys/dev/netmap/netmap_kern.h	Fri Oct 19 04:13:12 2012	(r241719)
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2011-2012 Matteo Landi, Luigi Rizzo. All rights reserved.
- * 
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -9,7 +9,7 @@
  *   2. Redistributions in binary form must reproduce the above copyright
  *      notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
- * 
+ *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@@ -25,7 +25,7 @@
 
 /*
  * $FreeBSD$
- * $Id: netmap_kern.h 11343 2012-07-03 09:08:38Z luigi $
+ * $Id: netmap_kern.h 11829 2012-09-26 04:06:34Z luigi $
  *
  * The header contains the definitions of constants and function
  * prototypes used only in kernelspace.
@@ -55,11 +55,10 @@
 #endif
 
 /*
- * IFCAP_NETMAP goes into net_device's flags (if_capabilities)
- * and priv_flags (if_capenable). The latter used to be 16 bits
- * up to linux 2.6.36, so we need to use a 16 bit value on older
+ * IFCAP_NETMAP goes into net_device's priv_flags (if_capenable).
+ * This was 16 bits up to linux 2.6.36, so we need a 16 bit value on older
  * platforms and tolerate the clash with IFF_DYNAMIC and IFF_BRIDGE_PORT.
- * For the 32-bit value, 0x100000 (bit 20) has no clashes up to 3.3.1
+ * For the 32-bit value, 0x100000 has no clashes until at least 3.5.1
  */
 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37)
 #define IFCAP_NETMAP	0x8000
@@ -68,7 +67,7 @@
 #endif
 
 #elif defined (__APPLE__)
-#warning apple support is experimental
+#warning apple support is incomplete.
 #define likely(x)	__builtin_expect(!!(x), 1)
 #define unlikely(x)	__builtin_expect(!!(x), 0)
 #define	NM_LOCK_T	IOLock *
@@ -89,7 +88,19 @@
 		(int)__xxts.tv_sec % 1000, (int)__xxts.tv_usec,	\
 		__FUNCTION__, __LINE__, ##__VA_ARGS__);		\
 	} while (0)
- 
+
+/* rate limited, lps indicates how many per second */
+#define RD(lps, format, ...)					\
+	do {							\
+		static int t0, __cnt;				\
+		if (t0 != time_second) {			\
+			t0 = time_second;			\
+			__cnt = 0;				\
+		}						\
+		if (__cnt++ < lps)				\
+			D(format, ##__VA_ARGS__);		\
+	} while (0)
+
 struct netmap_adapter;
 
 /*
@@ -129,6 +140,18 @@ struct netmap_kring {
  * support netmap operation.
  */
 struct netmap_adapter {
+	/*
+	 * On linux we do not have a good way to tell if an interface
+	 * is netmap-capable. So we use the following trick:
+	 * NA(ifp) points here, and the first entry (which hopefully
+	 * always exists and is at least 32 bits) contains a magic
+	 * value which we can use to detect that the interface is good.
+	 */
+	uint32_t magic;
+	uint32_t na_flags;	/* future place for IFCAP_NETMAP */
+#define NAF_SKIP_INTR	1	/* use the regular interrupt handler.
+				 * useful during initialization
+				 */
 	int refcount; /* number of user-space descriptors using this
 			 interface, which is equal to the number of
 			 struct netmap_if objs in the mapped region. */
@@ -149,7 +172,6 @@ struct netmap_adapter {
 
 	u_int num_tx_desc; /* number of descriptor in each queue */
 	u_int num_rx_desc;
-	//u_int buff_size;	// XXX deprecate, use NETMAP_BUF_SIZE
 
 	/* tx_rings and rx_rings are private but allocated
 	 * as a contiguous chunk of memory. Each array has
@@ -185,7 +207,7 @@ struct netmap_adapter {
 };
 
 /*
- * The combination of "enable" (ifp->if_capabilities &IFCAP_NETMAP)
+ * The combination of "enable" (ifp->if_capenable & IFCAP_NETMAP)
  * and refcount gives the status of the interface, namely:
  *
  *	enable	refcount	Status
@@ -268,6 +290,36 @@ enum {                                  
 #endif
 #define	NA(_ifp)	((struct netmap_adapter *)WNA(_ifp))
 
+/*
+ * Macros to determine if an interface is netmap capable or netmap enabled.
+ * See the magic field in struct netmap_adapter.
+ */
+#ifdef __FreeBSD__
+/*
+ * on FreeBSD just use if_capabilities and if_capenable.
+ */
+#define NETMAP_CAPABLE(ifp)	(NA(ifp) &&		\
+	(ifp)->if_capabilities & IFCAP_NETMAP )
+
+#define	NETMAP_SET_CAPABLE(ifp)				\
+	(ifp)->if_capabilities |= IFCAP_NETMAP
+
+#else	/* linux */
+
+/*
+ * on linux:
+ * we check if NA(ifp) is set and its first element has a related
+ * magic value. The capenable is within the struct netmap_adapter.
+ */
+#define	NETMAP_MAGIC	0x52697a7a
+
+#define NETMAP_CAPABLE(ifp)	(NA(ifp) &&		\
+	((uint32_t)(uintptr_t)NA(ifp) ^ NA(ifp)->magic) == NETMAP_MAGIC )
+
+#define	NETMAP_SET_CAPABLE(ifp)				\
+	NA(ifp)->magic = ((uint32_t)(uintptr_t)NA(ifp)) ^ NETMAP_MAGIC
+
+#endif	/* linux */
 
 #ifdef __FreeBSD__
 /* Callback invoked by the dma machinery after a successful dmamap_load */
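
On the Linux side, the NETMAP_SET_CAPABLE()/NETMAP_CAPABLE() pair
above encodes the capability as an XOR self-check. A hedged
restatement (assuming struct netmap_adapter from this header):

	/* NETMAP_SET_CAPABLE stored
	 *	na->magic = (uint32_t)(uintptr_t)na ^ NETMAP_MAGIC
	 * so XORing the pointer back in recovers the constant; a stale or
	 * unrelated pointer in NA(ifp) is very unlikely to pass this test.
	 */
	static inline int
	nm_magic_ok(struct netmap_adapter *na)
	{
		return (na != NULL &&
		    ((uint32_t)(uintptr_t)na ^ na->magic) == NETMAP_MAGIC);
	}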

Modified: head/sys/dev/netmap/netmap_mem2.c
==============================================================================
--- head/sys/dev/netmap/netmap_mem2.c	Fri Oct 19 03:01:25 2012	(r241718)
+++ head/sys/dev/netmap/netmap_mem2.c	Fri Oct 19 04:13:12 2012	(r241719)
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2012 Matteo Landi, Luigi Rizzo. All rights reserved.
+ * Copyright (C) 2012 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -25,19 +25,19 @@
 
 /*
  * $FreeBSD$
- * $Id: netmap_mem2.c 11445 2012-07-30 10:49:07Z luigi $
+ * $Id: netmap_mem2.c 11881 2012-10-18 23:24:15Z luigi $
  *
- * New memory allocator for netmap
+ * (New) memory allocator for netmap
  */
 
 /*
- * The new version allocates three regions:
- *	nm_if_pool      for the struct netmap_if
- *	nm_ring_pool    for the struct netmap_ring
- *	nm_buf_pool    for the packet buffers.
+ * This allocator creates three memory regions:
+ *	nm_if_pool	for the struct netmap_if
+ *	nm_ring_pool	for the struct netmap_ring
+ *	nm_buf_pool	for the packet buffers.
  *
- * All regions need to be page-sized as we export them to
- * userspace through mmap. Only the latter need to be dma-able,
+ * All regions need to be a multiple of the page size as we export them to
+ * userspace through mmap. Only the latter needs to be dma-able,
  * but for convenience use the same type of allocator for all.
  *
  * Once mapped, the three regions are exported to userspace
@@ -51,58 +51,97 @@
  * of the object, and from there locate the offset from the beginning
  * of the region.
  *
- * Allocator for a pool of memory objects of the same size.
+ * The individual allocators manage a pool of memory for objects of
+ * the same size.
  * The pool is split into smaller clusters, whose size is a
  * multiple of the page size. The cluster size is chosen
  * to minimize the waste for a given max cluster size
 * (we do it by brute force, as we have relatively few objects
  * per cluster).
  *
- * To be polite with the cache, objects are aligned to
- * the cache line, or 64 bytes. Sizes are rounded to multiple of 64.
- * For each object we have
- * one entry in the bitmap to signal the state. Allocation scans
- * the bitmap, but since this is done only on attach, we are not
+ * Objects are aligned to the cache line (64 bytes), rounding up object
+ * sizes when needed. A bitmap contains the state of each object.
+ * Allocation scans the bitmap; this is done only on attach, so we are not
  * too worried about performance
- */
-
-/*
- *	MEMORY SIZES:
  *
- * (all the parameters below will become tunables)
- *
- * struct netmap_if is variable size but small.
- * Assuming each NIC has 8+2 rings, (4+1 tx, 4+1 rx) the netmap_if
- * uses 120 bytes on a 64-bit machine.
- * We allocate NETMAP_IF_MAX_SIZE  (1024) which should work even for
- * cards with 48 ring pairs.
- * The total number of 'struct netmap_if' could be slightly larger
- * that the total number of rings on all interfaces on the system.
+ * For each allocator we can define (through sysctl) the size and
+ * number of each object. Memory is allocated at the first use of a
+ * netmap file descriptor, and can be freed when all such descriptors
+ * have been released (including unmapping the memory).
+ * If memory is scarce, the system tries to get as much as possible
+ * and the sysctl values reflect the actual allocation.
+ * Together with the desired values, the sysctls also export the
+ * absolute minimum and maximum values, which cannot be overridden.
+ *
+ * struct netmap_if:
+ *	variable size, max 16 bytes per ring pair plus some fixed amount.
+ *	1024 bytes should be large enough in practice.
+ *
+ *	In the worst case we have one netmap_if per ring in the system.
+ *
+ * struct netmap_ring
+ *	variable size too, 8 bytes per slot plus some fixed amount.
+ *	Rings can be large (e.g. 4k slots, or >32Kbytes).
+ *	We default to 36 KB (9 pages), and a few hundred rings.
+ *
+ * struct netmap_buffer
+ *	The more the better, both because fast interfaces tend to have
+ *	many slots, and because we may want to use buffers to store
+ *	packets in userspace, avoiding copies.
+ *	Buffers must contain a full frame (e.g. 1518 bytes, or more for
+ *	VLANs or jumbo frames), should be nicely aligned, and some NICs
+ *	restrict the size to a multiple of 1K or so. Default is 2K.
  */
-#define NETMAP_IF_MAX_SIZE      1024
-#define NETMAP_IF_MAX_NUM       512
 
-/*
- * netmap rings are up to 2..4k descriptors, 8 bytes each,
- * plus some glue at the beginning (32 bytes).
- * We set the default ring size to 9 pages (36K) and enable
- * a few hundreds of them.
- */
-#define NETMAP_RING_MAX_SIZE    (9*PAGE_SIZE)
-#define NETMAP_RING_MAX_NUM     200	/* approx 8MB */
-
-/*
- * Buffers: the more the better. Buffer size is NETMAP_BUF_SIZE,
- * 2k or slightly less, aligned to 64 bytes.
- * A large 10G interface can have 2k*18 = 36k buffers per interface,
- * or about 72MB of memory. Up to us to use more.
- */
 #ifndef CONSERVATIVE
-#define NETMAP_BUF_MAX_NUM      100000  /* 200MB */
+#define NETMAP_BUF_MAX_NUM	20*4096*2	/* large machine */
 #else /* CONSERVATIVE */
 #define NETMAP_BUF_MAX_NUM      20000   /* 40MB */
 #endif
 
+#ifdef linux
+#define NMA_LOCK_T		struct semaphore
+#define NMA_LOCK_INIT()		sema_init(&nm_mem.nm_mtx, 1)
+#define NMA_LOCK_DESTROY()	
+#define NMA_LOCK()		down(&nm_mem.nm_mtx)
+#define NMA_UNLOCK()		up(&nm_mem.nm_mtx)
+#else /* !linux */
+#define NMA_LOCK_T		struct mtx
+#define NMA_LOCK_INIT()		mtx_init(&nm_mem.nm_mtx, "netmap memory allocator lock", NULL, MTX_DEF)
+#define NMA_LOCK_DESTROY()	mtx_destroy(&nm_mem.nm_mtx)
+#define NMA_LOCK()		mtx_lock(&nm_mem.nm_mtx)
+#define NMA_UNLOCK()		mtx_unlock(&nm_mem.nm_mtx)
+#endif /* linux */
+
+enum {
+	NETMAP_IF_POOL   = 0,
+	NETMAP_RING_POOL,
+	NETMAP_BUF_POOL,
+	NETMAP_POOLS_NR
+};
+
+
+struct netmap_obj_params {
+	u_int size;
+	u_int num;
+};
+
+
+struct netmap_obj_params netmap_params[NETMAP_POOLS_NR] = {
+	[NETMAP_IF_POOL] = {
+		.size = 1024,
+		.num  = 100,
+	},
+	[NETMAP_RING_POOL] = {
+		.size = 9*PAGE_SIZE,
+		.num  = 200,
+	},
+	[NETMAP_BUF_POOL] = {
+		.size = 2048,
+		.num  = NETMAP_BUF_MAX_NUM,
+	},
+};
+
 

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
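
The cluster sizing mentioned in the comments above ("we do it by
brute force") amounts to trying every feasible object count and
keeping the one that wastes the least padding once the cluster is
rounded up to whole pages. A sketch of that search (a hypothetical
helper, not the committed code, which is cut off by the truncation):

	#include <sys/param.h>	/* roundup(), PAGE_SIZE, u_int */

	/* objsize is assumed already rounded up to a 64-byte multiple */
	static u_int
	nm_pick_clustentries(u_int objsize, u_int max_clustsize)
	{
		u_int n, clust, waste;
		u_int best_n = 1, best_waste = max_clustsize;

		for (n = 1; n * objsize <= max_clustsize; n++) {
			clust = roundup(n * objsize, PAGE_SIZE);
			if (clust > max_clustsize)
				break;
			waste = clust - n * objsize;
			if (waste <= best_waste) {	/* <=: prefer larger clusters */
				best_waste = waste;
				best_n = n;
			}
		}
		return (best_n);
	}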


