Date:      Sat, 14 May 2011 21:03:44 +0000 (UTC)
From:      Marius Strobl <marius@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-8@freebsd.org
Subject:   svn commit: r221918 - in stable/8/sys/sparc64: include sparc64
Message-ID:  <201105142103.p4EL3iCc015665@svn.freebsd.org>

Author: marius
Date: Sat May 14 21:03:44 2011
New Revision: 221918
URL: http://svn.freebsd.org/changeset/base/221918

Log:
  MFC: r216803, r217058, r217514, r218457
  
  On UltraSPARC-III+ and greater, take advantage of ASI_ATOMIC_QUAD_LDD_PHYS,
  which takes a physical address instead of a virtual one, for loading TTEs
  of the kernel TSB, so we no longer need to lock the kernel TSB into the
  dTLB, which has only a very limited number of lockable slots. The net
  result is that we can now handle a kernel TSB of essentially any size and
  no longer need to limit the kernel address space based on the number of
  dTLB slots available for locked entries. Consequently, other parts of the
  trap handlers now also access the kernel TSB only via its physical address
  in order to avoid nested traps, as does the PMAP bootstrap code, since we
  haven't taken over the trap table at that point yet. Apart from that, the
  kernel TSB is now accessed via a direct mapping when we are otherwise
  taking advantage of ASI_ATOMIC_QUAD_LDD_PHYS, so no further code changes
  are needed. Most of this is implemented by extending the patching of the
  TSB addresses and mask, as well as the ASIs used to load it, into the trap
  table, so the runtime overhead of this change is rather low.
  Theoretically, it should be possible to use the same approach for the user
  TSB as well, which already is not locked into the dTLB, thus avoiding
  nested traps. However, for reasons I don't understand yet, OpenSolaris
  only does that with SPARC64 CPUs. On the other hand, I think that also
  addressing the user TSB physically, and thus avoiding nested traps, would
  get us closer to sharing this code with sun4v, which only supports trap
  levels 0 and 1, so eventually we could have a single kernel that runs on
  both sun4u and sun4v (as do Linux and OpenBSD).
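
To illustrate the startup patching the log refers to (implemented by the
PATCH_ASI/PATCH_LDD/PATCH_TSB/PATCH_TSB_MASK macros in the pmap.c hunk
below): the trap-table code is assembled against placeholder constants
(TSB_ASI, TSB_KERNEL, etc. defined as 0x0 in exception.S), and
pmap_bootstrap() later ORs the real values into the immediate fields of
those instructions. The following standalone C sketch is not the kernel's
actual macros and is simplified to a 32-bit value in a plain sethi/or pair;
the real code handles 64-bit addresses via a sethi/or/sllx/sethi/or
sequence and first verifies that the placeholder instructions are
unchanged.

	#include <stdint.h>

	/*
	 * Minimal sketch: OR a 32-bit value into the immediate fields of a
	 * "sethi %hi(0), %rN" / "or %rN, %lo(0), %rN" pair that was
	 * assembled against a placeholder value of 0.
	 */
	static void
	patch_sethi_or(uint32_t insn[2], uint32_t value)
	{
		insn[0] |= (value >> 10) & 0x3fffff; /* sethi: bits 31..10 -> imm22 */
		insn[1] |= value & 0x3ff;            /* or:    bits  9..0  -> simm13 (%lo) */
		/*
		 * The kernel then flush()es each patched word so the CPU
		 * refetches the modified instructions.
		 */
	}

The same idea, with a 13-bit immediate, is used by PATCH_ASI to rewrite the
"wr %g0, TSB_ASI, %asi" instructions, and by PATCH_LDD to rewrite the ASI
field of the ldda instructions.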

Modified:
  stable/8/sys/sparc64/include/pmap.h
  stable/8/sys/sparc64/include/tsb.h
  stable/8/sys/sparc64/sparc64/exception.S
  stable/8/sys/sparc64/sparc64/genassym.c
  stable/8/sys/sparc64/sparc64/mp_machdep.c
  stable/8/sys/sparc64/sparc64/pmap.c
  stable/8/sys/sparc64/sparc64/tsb.c
Directory Properties:
  stable/8/sys/   (props changed)
  stable/8/sys/amd64/include/xen/   (props changed)
  stable/8/sys/cddl/contrib/opensolaris/   (props changed)
  stable/8/sys/contrib/dev/acpica/   (props changed)
  stable/8/sys/contrib/pf/   (props changed)

Modified: stable/8/sys/sparc64/include/pmap.h
==============================================================================
--- stable/8/sys/sparc64/include/pmap.h	Sat May 14 20:51:19 2011	(r221917)
+++ stable/8/sys/sparc64/include/pmap.h	Sat May 14 21:03:44 2011	(r221918)
@@ -61,16 +61,16 @@ struct pmap {
 	struct	mtx pm_mtx;
 	struct	tte *pm_tsb;
 	vm_object_t pm_tsb_obj;
-	cpumask_t	pm_active;
+	cpumask_t pm_active;
 	u_int	pm_context[MAXCPU];
 	struct	pmap_statistics pm_stats;
 };
 
 #define	PMAP_LOCK(pmap)		mtx_lock(&(pmap)->pm_mtx)
-#define	PMAP_LOCK_ASSERT(pmap, type) \
+#define	PMAP_LOCK_ASSERT(pmap, type)					\
 				mtx_assert(&(pmap)->pm_mtx, (type))
 #define	PMAP_LOCK_DESTROY(pmap)	mtx_destroy(&(pmap)->pm_mtx)
-#define	PMAP_LOCK_INIT(pmap)	mtx_init(&(pmap)->pm_mtx, "pmap", \
+#define	PMAP_LOCK_INIT(pmap)	mtx_init(&(pmap)->pm_mtx, "pmap",	\
 				    NULL, MTX_DEF | MTX_DUPOK)
 #define	PMAP_LOCKED(pmap)	mtx_owned(&(pmap)->pm_mtx)
 #define	PMAP_MTX(pmap)		(&(pmap)->pm_mtx)
@@ -97,6 +97,7 @@ int	pmap_protect_tte(struct pmap *pm1, s
 			 vm_offset_t va);
 
 void	pmap_map_tsb(void);
+void	pmap_set_kctx(void);
 
 #define	vtophys(va)	pmap_kextract((vm_offset_t)(va))
 
@@ -112,7 +113,7 @@ SYSCTL_DECL(_debug_pmap_stats);
 
 #define	PMAP_STATS_VAR(name) \
 	static long name; \
-	SYSCTL_LONG(_debug_pmap_stats, OID_AUTO, name, CTLFLAG_RW, \
+	SYSCTL_LONG(_debug_pmap_stats, OID_AUTO, name, CTLFLAG_RW,	\
 	    &name, 0, "")
 
 #define	PMAP_STATS_INC(var) \

Modified: stable/8/sys/sparc64/include/tsb.h
==============================================================================
--- stable/8/sys/sparc64/include/tsb.h	Sat May 14 20:51:19 2011	(r221917)
+++ stable/8/sys/sparc64/include/tsb.h	Sat May 14 21:03:44 2011	(r221918)
@@ -50,6 +50,7 @@ extern struct tte *tsb_kernel;
 extern vm_size_t tsb_kernel_mask;
 extern vm_size_t tsb_kernel_size;
 extern vm_paddr_t tsb_kernel_phys;
+extern u_int tsb_kernel_ldd_phys;
 
 static __inline struct tte *
 tsb_vpntobucket(pmap_t pm, vm_offset_t vpn)

Modified: stable/8/sys/sparc64/sparc64/exception.S
==============================================================================
--- stable/8/sys/sparc64/sparc64/exception.S	Sat May 14 20:51:19 2011	(r221917)
+++ stable/8/sys/sparc64/sparc64/exception.S	Sat May 14 21:03:44 2011	(r221918)
@@ -75,8 +75,12 @@ __FBSDID("$FreeBSD$");
 
 #include "assym.s"
 
-#define	TSB_KERNEL_MASK	0x0
-#define	TSB_KERNEL	0x0
+#define	TSB_ASI			0x0
+#define	TSB_KERNEL		0x0
+#define	TSB_KERNEL_MASK		0x0
+#define	TSB_KERNEL_PHYS		0x0
+#define	TSB_KERNEL_PHYS_END	0x0
+#define	TSB_QUAD_LDD		0x0
 
 	.register %g2,#ignore
 	.register %g3,#ignore
@@ -84,19 +88,19 @@ __FBSDID("$FreeBSD$");
 	.register %g7,#ignore
 
 /*
- * Atomically set the reference bit in a TTE.
+ * Atomically set a bit in a TTE.
  */
-#define	TTE_SET_BIT(r1, r2, r3, bit) \
+#define	TTE_SET_BIT(r1, r2, r3, bit, a, asi) \
 	add	r1, TTE_DATA, r1 ; \
-	ldx	[r1], r2 ; \
+	LD(x, a) [r1] asi, r2 ; \
 9:	or	r2, bit, r3 ; \
-	casxa	[r1] ASI_N, r2, r3 ; \
+	CAS(x, a) [r1] asi, r2, r3 ; \
 	cmp	r2, r3 ; \
 	bne,pn	%xcc, 9b ; \
 	 mov	r3, r2
 
-#define	TTE_SET_REF(r1, r2, r3)		TTE_SET_BIT(r1, r2, r3, TD_REF)
-#define	TTE_SET_W(r1, r2, r3)		TTE_SET_BIT(r1, r2, r3, TD_W)
+#define	TTE_SET_REF(r1, r2, r3, a, asi)	TTE_SET_BIT(r1, r2, r3, TD_REF, a, asi)
+#define	TTE_SET_W(r1, r2, r3, a, asi)	TTE_SET_BIT(r1, r2, r3, TD_W, a, asi)
 
 /*
  * Macros for spilling and filling live windows.
@@ -691,7 +695,7 @@ ENTRY(tl0_immu_miss_set_ref)
 	/*
 	 * Set the reference bit.
 	 */
-	TTE_SET_REF(%g4, %g2, %g3)
+	TTE_SET_REF(%g4, %g2, %g3, a, ASI_N)
 
 	/*
 	 * May have become invalid during casxa, in which case start over.
@@ -849,7 +853,7 @@ ENTRY(tl0_dmmu_miss_set_ref)
 	/*
 	 * Set the reference bit.
 	 */
-	TTE_SET_REF(%g4, %g2, %g3)
+	TTE_SET_REF(%g4, %g2, %g3, a, ASI_N)
 
 	/*
 	 * May have become invalid during casxa, in which case start over.
@@ -997,7 +1001,7 @@ tl1_dmmu_prot_user:
 	/*
 	 * Set the hardware write bit.
 	 */
-	TTE_SET_W(%g4, %g2, %g3)
+	TTE_SET_W(%g4, %g2, %g3, a, ASI_N)
 
 	/*
 	 * Delete the old TLB entry and clear the SFSR.
@@ -1327,11 +1331,17 @@ END(tl1_sfsr_trap)
 	 * Compute the address of the TTE.  The TSB mask and address of the
 	 * TSB are patched at startup.
 	 */
-	.globl	tl1_immu_miss_patch_1
-tl1_immu_miss_patch_1:
+	.globl	tl1_immu_miss_patch_tsb_1
+tl1_immu_miss_patch_tsb_1:
+	sethi	%uhi(TSB_KERNEL), %g6
+	or	%g6, %ulo(TSB_KERNEL), %g6
+	sllx	%g6, 32, %g6
+	sethi	%hi(TSB_KERNEL), %g7
+	or	%g7, %g6, %g7
+	.globl	tl1_immu_miss_patch_tsb_mask_1
+tl1_immu_miss_patch_tsb_mask_1:
 	sethi	%hi(TSB_KERNEL_MASK), %g6
 	or	%g6, %lo(TSB_KERNEL_MASK), %g6
-	sethi	%hi(TSB_KERNEL), %g7
 
 	srlx	%g5, TAR_VPN_SHIFT, %g5
 	and	%g5, %g6, %g6
@@ -1341,7 +1351,9 @@ tl1_immu_miss_patch_1:
 	/*
 	 * Load the TTE.
 	 */
-	ldda	[%g6] ASI_NUCLEUS_QUAD_LDD, %g6 /*, %g7 */
+	.globl	tl1_immu_miss_patch_quad_ldd_1
+tl1_immu_miss_patch_quad_ldd_1:
+	ldda	[%g6] TSB_QUAD_LDD, %g6 /*, %g7 */
 
 	/*
 	 * Check that it's valid and executable and that the virtual page
@@ -1375,11 +1387,17 @@ ENTRY(tl1_immu_miss_set_ref)
 	 * Recompute the TTE address, which we clobbered loading the TTE.
 	 * The TSB mask and address of the TSB are patched at startup.
 	 */
-	.globl	tl1_immu_miss_patch_2
-tl1_immu_miss_patch_2:
+	.globl	tl1_immu_miss_patch_tsb_2
+tl1_immu_miss_patch_tsb_2:
+	sethi	%uhi(TSB_KERNEL), %g6
+	or	%g6, %ulo(TSB_KERNEL), %g6
+	sllx	%g6, 32, %g6
+	sethi	%hi(TSB_KERNEL), %g7
+	or	%g7, %g6, %g7
+	.globl	tl1_immu_miss_patch_tsb_mask_2
+tl1_immu_miss_patch_tsb_mask_2:
 	sethi	%hi(TSB_KERNEL_MASK), %g6
 	or	%g6, %lo(TSB_KERNEL_MASK), %g6
-	sethi	%hi(TSB_KERNEL), %g7
 
 	and	%g5, %g6, %g5
 	sllx	%g5, TTE_SHIFT, %g5
@@ -1388,7 +1406,10 @@ tl1_immu_miss_patch_2:
 	/*
 	 * Set the reference bit.
 	 */
-	TTE_SET_REF(%g5, %g6, %g7)
+	.globl	tl1_immu_miss_patch_asi_1
+tl1_immu_miss_patch_asi_1:
+	wr	%g0, TSB_ASI, %asi
+	TTE_SET_REF(%g5, %g6, %g7, a, %asi)
 
 	/*
 	 * May have become invalid during casxa, in which case start over.
@@ -1447,11 +1468,17 @@ END(tl1_immu_miss_trap)
 	 * Compute the address of the TTE.  The TSB mask and address of the
 	 * TSB are patched at startup.
 	 */
-	.globl	tl1_dmmu_miss_patch_1
-tl1_dmmu_miss_patch_1:
+	.globl	tl1_dmmu_miss_patch_tsb_1
+tl1_dmmu_miss_patch_tsb_1:
+	sethi	%uhi(TSB_KERNEL), %g6
+	or	%g6, %ulo(TSB_KERNEL), %g6
+	sllx	%g6, 32, %g6
+	sethi	%hi(TSB_KERNEL), %g7
+	or	%g7, %g6, %g7
+	.globl	tl1_dmmu_miss_patch_tsb_mask_1
+tl1_dmmu_miss_patch_tsb_mask_1:
 	sethi	%hi(TSB_KERNEL_MASK), %g6
 	or	%g6, %lo(TSB_KERNEL_MASK), %g6
-	sethi	%hi(TSB_KERNEL), %g7
 
 	srlx	%g5, TAR_VPN_SHIFT, %g5
 	and	%g5, %g6, %g6
@@ -1461,7 +1488,9 @@ tl1_dmmu_miss_patch_1:
 	/*
 	 * Load the TTE.
 	 */
-	ldda	[%g6] ASI_NUCLEUS_QUAD_LDD, %g6 /*, %g7 */
+	.globl	tl1_dmmu_miss_patch_quad_ldd_1
+tl1_dmmu_miss_patch_quad_ldd_1:
+	ldda	[%g6] TSB_QUAD_LDD, %g6 /*, %g7 */
 
 	/*
 	 * Check that it's valid and that the virtual page numbers match.
@@ -1492,11 +1521,17 @@ ENTRY(tl1_dmmu_miss_set_ref)
 	 * Recompute the TTE address, which we clobbered loading the TTE.
 	 * The TSB mask and address of the TSB are patched at startup.
 	 */
-	.globl	tl1_dmmu_miss_patch_2
-tl1_dmmu_miss_patch_2:
+	.globl	tl1_dmmu_miss_patch_tsb_mask_2
+tl1_dmmu_miss_patch_tsb_2:
+	sethi	%uhi(TSB_KERNEL), %g6
+	or	%g6, %ulo(TSB_KERNEL), %g6
+	sllx	%g6, 32, %g6
+	sethi	%hi(TSB_KERNEL), %g7
+	or	%g7, %g6, %g7
+	.globl	tl1_dmmu_miss_patch_tsb_2
+tl1_dmmu_miss_patch_tsb_mask_2:
 	sethi	%hi(TSB_KERNEL_MASK), %g6
 	or	%g6, %lo(TSB_KERNEL_MASK), %g6
-	sethi	%hi(TSB_KERNEL), %g7
 
 	and	%g5, %g6, %g5
 	sllx	%g5, TTE_SHIFT, %g5
@@ -1505,7 +1540,10 @@ tl1_dmmu_miss_patch_2:
 	/*
 	 * Set the reference bit.
 	 */
-	TTE_SET_REF(%g5, %g6, %g7)
+	.globl	tl1_dmmu_miss_patch_asi_1
+tl1_dmmu_miss_patch_asi_1:
+	wr	%g0, TSB_ASI, %asi
+	TTE_SET_REF(%g5, %g6, %g7, a, %asi)
 
 	/*
 	 * May have become invalid during casxa, in which case start over.
@@ -1545,15 +1583,42 @@ ENTRY(tl1_dmmu_miss_direct)
 	 * correspond to the TTE valid and page size bits are left set, so
 	 * they don't have to be included in the TTE bits below.  We know they
 	 * are set because the virtual address is in the upper va hole.
+	 * NB: if we are taking advantage of the ASI_ATOMIC_QUAD_LDD_PHYS
+	 * and we get a miss on the directly accessed kernel TSB we must not
+	 * set TD_CV in order to access it uniformly bypassing the D$.
 	 */
+	setx	TLB_DIRECT_ADDRESS_MASK, %g7, %g4
+	and	%g5, %g4, %g4
 	setx	TLB_DIRECT_TO_TTE_MASK, %g7, %g6
 	and	%g5, %g6, %g5
-	or	%g5, TD_CP | TD_CV | TD_W, %g5
+	.globl	tl1_dmmu_miss_direct_patch_tsb_phys_1
+tl1_dmmu_miss_direct_patch_tsb_phys_1:
+	sethi	%uhi(TSB_KERNEL_PHYS), %g3
+	or	%g3, %ulo(TSB_KERNEL_PHYS), %g3
+	sllx	%g3, 32, %g3
+	sethi	%hi(TSB_KERNEL_PHYS), %g3
+	or	%g7, %g3, %g7
+	cmp	%g4, %g7
+	bl,pt	%xcc, 1f
+	 or	%g5, TD_CP | TD_W, %g5
+	.globl	tl1_dmmu_miss_direct_patch_tsb_phys_end_1
+tl1_dmmu_miss_direct_patch_tsb_phys_end_1:
+	sethi	%uhi(TSB_KERNEL_PHYS_END), %g3
+	or	%g3, %ulo(TSB_KERNEL_PHYS_END), %g3
+	sllx	%g3, 32, %g3
+	sethi	%hi(TSB_KERNEL_PHYS_END), %g7
+	or	%g7, %g3, %g7
+	cmp	%g4, %g7
+	bg,a,pt	%xcc, 1f
+	 nop
+	ba,pt	%xcc, 2f
+	 nop
+1:	or	%g5, TD_CV, %g5
 
 	/*
 	 * Load the TTE data into the TLB and retry the instruction.
 	 */
-	stxa	%g5, [%g0] ASI_DTLB_DATA_IN_REG
+2:	stxa	%g5, [%g0] ASI_DTLB_DATA_IN_REG
 	retry
 END(tl1_dmmu_miss_direct)
 
@@ -1584,11 +1649,17 @@ ENTRY(tl1_dmmu_prot_1)
 	 * Compute the address of the TTE.  The TSB mask and address of the
 	 * TSB are patched at startup.
 	 */
-	.globl	tl1_dmmu_prot_patch_1
-tl1_dmmu_prot_patch_1:
+	.globl	tl1_dmmu_prot_patch_tsb_1
+tl1_dmmu_prot_patch_tsb_1:
+	sethi	%uhi(TSB_KERNEL), %g6
+	or	%g6, %ulo(TSB_KERNEL), %g6
+	sllx	%g6, 32, %g6
+	sethi	%hi(TSB_KERNEL), %g7
+	or	%g7, %g6, %g7
+	.globl	tl1_dmmu_prot_patch_tsb_mask_1
+tl1_dmmu_prot_patch_tsb_mask_1:
 	sethi	%hi(TSB_KERNEL_MASK), %g6
 	or	%g6, %lo(TSB_KERNEL_MASK), %g6
-	sethi	%hi(TSB_KERNEL), %g7
 
 	srlx	%g5, TAR_VPN_SHIFT, %g5
 	and	%g5, %g6, %g6
@@ -1598,7 +1669,9 @@ tl1_dmmu_prot_patch_1:
 	/*
 	 * Load the TTE.
 	 */
-	ldda	[%g6] ASI_NUCLEUS_QUAD_LDD, %g6 /*, %g7 */
+	.globl	tl1_dmmu_prot_patch_quad_ldd_1
+tl1_dmmu_prot_patch_quad_ldd_1:
+	ldda	[%g6] TSB_QUAD_LDD, %g6 /*, %g7 */
 
 	/*
 	 * Check that it's valid and writeable and that the virtual page
@@ -1625,12 +1698,17 @@ tl1_dmmu_prot_patch_1:
 	 * Recompute the TTE address, which we clobbered loading the TTE.
 	 * The TSB mask and address of the TSB are patched at startup.
 	 */
-	.globl	tl1_dmmu_prot_patch_2
-tl1_dmmu_prot_patch_2:
+	.globl	tl1_dmmu_prot_patch_tsb_2
+tl1_dmmu_prot_patch_tsb_2:
+	sethi	%uhi(TSB_KERNEL), %g6
+	or	%g6, %ulo(TSB_KERNEL), %g6
+	sllx	%g6, 32, %g6
+	sethi	%hi(TSB_KERNEL), %g7
+	or	%g7, %g6, %g7
+	.globl	tl1_dmmu_prot_patch_tsb_mask_2
+tl1_dmmu_prot_patch_tsb_mask_2:
 	sethi	%hi(TSB_KERNEL_MASK), %g6
 	or	%g6, %lo(TSB_KERNEL_MASK), %g6
-	sethi	%hi(TSB_KERNEL), %g7
-
 	and	%g5, %g6, %g5
 	sllx	%g5, TTE_SHIFT, %g5
 	add	%g5, %g7, %g5
@@ -1638,7 +1716,10 @@ tl1_dmmu_prot_patch_2:
 	/*
 	 * Set the hardware write bit.
 	 */
-	TTE_SET_W(%g5, %g6, %g7)
+	.globl	tl1_dmmu_prot_patch_asi_1
+tl1_dmmu_prot_patch_asi_1:
+	wr	%g0, TSB_ASI, %asi
+	TTE_SET_W(%g5, %g6, %g7, a, %asi)
 
 	/*
 	 * May have become invalid during casxa, in which case start over.

Modified: stable/8/sys/sparc64/sparc64/genassym.c
==============================================================================
--- stable/8/sys/sparc64/sparc64/genassym.c	Sat May 14 20:51:19 2011	(r221917)
+++ stable/8/sys/sparc64/sparc64/genassym.c	Sat May 14 21:03:44 2011	(r221918)
@@ -136,6 +136,7 @@ ASSYM(TS_MIN, TS_MIN);
 ASSYM(TS_MAX, TS_MAX);
 ASSYM(TLB_DAR_SLOT_SHIFT, TLB_DAR_SLOT_SHIFT);
 ASSYM(TLB_CXR_PGSZ_MASK, TLB_CXR_PGSZ_MASK);
+ASSYM(TLB_DIRECT_ADDRESS_MASK, TLB_DIRECT_ADDRESS_MASK);
 ASSYM(TLB_DIRECT_TO_TTE_MASK, TLB_DIRECT_TO_TTE_MASK);
 ASSYM(TV_SIZE_BITS, TV_SIZE_BITS);
 #endif

Modified: stable/8/sys/sparc64/sparc64/mp_machdep.c
==============================================================================
--- stable/8/sys/sparc64/sparc64/mp_machdep.c	Sat May 14 20:51:19 2011	(r221917)
+++ stable/8/sys/sparc64/sparc64/mp_machdep.c	Sat May 14 21:03:44 2011	(r221918)
@@ -89,6 +89,7 @@ __FBSDID("$FreeBSD$");
 #include <machine/smp.h>
 #include <machine/tick.h>
 #include <machine/tlb.h>
+#include <machine/tsb.h>
 #include <machine/tte.h>
 #include <machine/ver.h>
 
@@ -437,8 +438,12 @@ cpu_mp_bootstrap(struct pcpu *pc)
 	tick_clear(pc->pc_impl);
 	tick_stop(pc->pc_impl);
 
-	/* Lock the kernel TSB in the TLB. */
-	pmap_map_tsb();
+	/* Set the kernel context. */
+	pmap_set_kctx();
+
+	/* Lock the kernel TSB in the TLB if necessary. */
+	if (tsb_kernel_ldd_phys == 0)
+		pmap_map_tsb();
 
 	/*
 	 * Flush all non-locked TLB entries possibly left over by the

Modified: stable/8/sys/sparc64/sparc64/pmap.c
==============================================================================
--- stable/8/sys/sparc64/sparc64/pmap.c	Sat May 14 20:51:19 2011	(r221917)
+++ stable/8/sys/sparc64/sparc64/pmap.c	Sat May 14 21:03:44 2011	(r221918)
@@ -151,6 +151,8 @@ struct pmap kernel_pmap_store;
  */
 static vm_paddr_t pmap_bootstrap_alloc(vm_size_t size, uint32_t colors);
 
+static void pmap_bootstrap_set_tte(struct tte *tp, u_long vpn, u_long data);
+
 /*
  * Map the given physical page at the specified virtual address in the
  * target pmap with the protection requested.  If specified the page
@@ -161,12 +163,26 @@ static vm_paddr_t pmap_bootstrap_alloc(v
 static void pmap_enter_locked(pmap_t pm, vm_offset_t va, vm_page_t m,
     vm_prot_t prot, boolean_t wired);
 
-extern int tl1_immu_miss_patch_1[];
-extern int tl1_immu_miss_patch_2[];
-extern int tl1_dmmu_miss_patch_1[];
-extern int tl1_dmmu_miss_patch_2[];
-extern int tl1_dmmu_prot_patch_1[];
-extern int tl1_dmmu_prot_patch_2[];
+extern int tl1_dmmu_miss_direct_patch_tsb_phys_1[];
+extern int tl1_dmmu_miss_direct_patch_tsb_phys_end_1[];
+extern int tl1_dmmu_miss_patch_asi_1[];
+extern int tl1_dmmu_miss_patch_quad_ldd_1[];
+extern int tl1_dmmu_miss_patch_tsb_1[];
+extern int tl1_dmmu_miss_patch_tsb_2[];
+extern int tl1_dmmu_miss_patch_tsb_mask_1[];
+extern int tl1_dmmu_miss_patch_tsb_mask_2[];
+extern int tl1_dmmu_prot_patch_asi_1[];
+extern int tl1_dmmu_prot_patch_quad_ldd_1[];
+extern int tl1_dmmu_prot_patch_tsb_1[];
+extern int tl1_dmmu_prot_patch_tsb_2[];
+extern int tl1_dmmu_prot_patch_tsb_mask_1[];
+extern int tl1_dmmu_prot_patch_tsb_mask_2[];
+extern int tl1_immu_miss_patch_asi_1[];
+extern int tl1_immu_miss_patch_quad_ldd_1[];
+extern int tl1_immu_miss_patch_tsb_1[];
+extern int tl1_immu_miss_patch_tsb_2[];
+extern int tl1_immu_miss_patch_tsb_mask_1[];
+extern int tl1_immu_miss_patch_tsb_mask_2[];
 
 /*
  * If user pmap is processed with pmap_remove and with pmap_remove and the
@@ -297,13 +313,21 @@ pmap_bootstrap(u_int cpu_impl)
 	vm_size_t physsz;
 	vm_size_t virtsz;
 	u_long data;
+	u_long vpn;
 	phandle_t pmem;
 	phandle_t vmem;
 	u_int dtlb_slots_avail;
 	int i;
 	int j;
 	int sz;
+	uint32_t asi;
 	uint32_t colors;
+	uint32_t ldd;
+
+	/*
+	 * Set the kernel context.
+	 */
+	pmap_set_kctx();
 
 	colors = dcache_color_ignore != 0 ? 1 : DCACHE_COLORS;
 
@@ -350,40 +374,57 @@ pmap_bootstrap(u_int cpu_impl)
 	/*
 	 * Calculate the size of kernel virtual memory, and the size and mask
 	 * for the kernel TSB based on the phsyical memory size but limited
-	 * by the amount of dTLB slots available for locked entries (given
-	 * that for spitfire-class CPUs all of the dt64 slots can hold locked
-	 * entries but there is no large dTLB for unlocked ones, we don't use
-	 * more than half of it for locked entries).
-	 */
-	dtlb_slots_avail = 0;
-	for (i = 0; i < dtlb_slots; i++) {
-		data = dtlb_get_data(i);
-		if ((data & (TD_V | TD_L)) != (TD_V | TD_L))
-			dtlb_slots_avail++;
-	}
+	 * by the amount of dTLB slots available for locked entries if we have
+	 * to lock the TSB in the TLB (given that for spitfire-class CPUs all
+	 * of the dt64 slots can hold locked entries but there is no large
+	 * dTLB for unlocked ones, we don't use more than half of it for the
+	 * TSB).
+	 * Note that for reasons unknown OpenSolaris doesn't take advantage of
+	 * ASI_ATOMIC_QUAD_LDD_PHYS on UltraSPARC-III.  However, given that no
+	 * public documentation is available for these, the latter just might
+	 * not support it, yet.
+	 */
+	virtsz = roundup(physsz, PAGE_SIZE_4M << (PAGE_SHIFT - TTE_SHIFT));
+	if (cpu_impl == CPU_IMPL_SPARC64V ||
+	    cpu_impl >= CPU_IMPL_ULTRASPARCIIIp)
+		tsb_kernel_ldd_phys = 1;
+	else {
+		dtlb_slots_avail = 0;
+		for (i = 0; i < dtlb_slots; i++) {
+			data = dtlb_get_data(i);
+			if ((data & (TD_V | TD_L)) != (TD_V | TD_L))
+				dtlb_slots_avail++;
+		}
 #ifdef SMP
-	dtlb_slots_avail -= PCPU_PAGES;
+		dtlb_slots_avail -= PCPU_PAGES;
 #endif
-	if (cpu_impl >= CPU_IMPL_ULTRASPARCI &&
-	    cpu_impl < CPU_IMPL_ULTRASPARCIII)
-		dtlb_slots_avail /= 2;
-	virtsz = roundup(physsz, PAGE_SIZE_4M << (PAGE_SHIFT - TTE_SHIFT));
-	virtsz = MIN(virtsz,
-	    (dtlb_slots_avail * PAGE_SIZE_4M) << (PAGE_SHIFT - TTE_SHIFT));
+		if (cpu_impl >= CPU_IMPL_ULTRASPARCI &&
+		    cpu_impl < CPU_IMPL_ULTRASPARCIII)
+			dtlb_slots_avail /= 2;
+		virtsz = MIN(virtsz, (dtlb_slots_avail * PAGE_SIZE_4M) <<
+		    (PAGE_SHIFT - TTE_SHIFT));
+	}
 	vm_max_kernel_address = VM_MIN_KERNEL_ADDRESS + virtsz;
 	tsb_kernel_size = virtsz >> (PAGE_SHIFT - TTE_SHIFT);
 	tsb_kernel_mask = (tsb_kernel_size >> TTE_SHIFT) - 1;
 
 	/*
-	 * Allocate the kernel TSB and lock it in the TLB.
+	 * Allocate the kernel TSB and lock it in the TLB if necessary.
 	 */
 	pa = pmap_bootstrap_alloc(tsb_kernel_size, colors);
 	if (pa & PAGE_MASK_4M)
-		panic("pmap_bootstrap: tsb unaligned\n");
+		panic("pmap_bootstrap: TSB unaligned\n");
 	tsb_kernel_phys = pa;
-	tsb_kernel = (struct tte *)(VM_MIN_KERNEL_ADDRESS - tsb_kernel_size);
-	pmap_map_tsb();
-	bzero(tsb_kernel, tsb_kernel_size);
+	if (tsb_kernel_ldd_phys == 0) {
+		tsb_kernel =
+		    (struct tte *)(VM_MIN_KERNEL_ADDRESS - tsb_kernel_size);
+		pmap_map_tsb();
+		bzero(tsb_kernel, tsb_kernel_size);
+	} else {
+		tsb_kernel =
+		    (struct tte *)TLB_PHYS_TO_DIRECT(tsb_kernel_phys);
+		aszero(ASI_PHYS_USE_EC, tsb_kernel_phys, tsb_kernel_size);
+	}
 
 	/*
 	 * Allocate and map the dynamic per-CPU area for the BSP.
@@ -398,35 +439,99 @@ pmap_bootstrap(u_int cpu_impl)
 	msgbufp = (struct msgbuf *)TLB_PHYS_TO_DIRECT(pa);
 
 	/*
-	 * Patch the virtual address and the tsb mask into the trap table.
+	 * Patch the TSB addresses and mask as well as the ASIs used to load
+	 * it into the trap table.
 	 */
 
-#define	SETHI(rd, imm22) \
-	(EIF_OP(IOP_FORM2) | EIF_F2_RD(rd) | EIF_F2_OP2(INS0_SETHI) | \
+#define	LDDA_R_I_R(rd, imm_asi, rs1, rs2)				\
+	(EIF_OP(IOP_LDST) | EIF_F3_RD(rd) | EIF_F3_OP3(INS3_LDDA) |	\
+	    EIF_F3_RS1(rs1) | EIF_F3_I(0) | EIF_F3_IMM_ASI(imm_asi) |	\
+	    EIF_F3_RS2(rs2))
+#define	OR_R_I_R(rd, imm13, rs1)					\
+	(EIF_OP(IOP_MISC) | EIF_F3_RD(rd) | EIF_F3_OP3(INS2_OR) |	\
+	    EIF_F3_RS1(rs1) | EIF_F3_I(1) | EIF_IMM(imm13, 13))
+#define	SETHI(rd, imm22)						\
+	(EIF_OP(IOP_FORM2) | EIF_F2_RD(rd) | EIF_F2_OP2(INS0_SETHI) |	\
 	    EIF_IMM((imm22) >> 10, 22))
-#define	OR_R_I_R(rd, imm13, rs1) \
-	(EIF_OP(IOP_MISC) | EIF_F3_RD(rd) | EIF_F3_OP3(INS2_OR) | \
+#define	WR_R_I(rd, imm13, rs1)						\
+	(EIF_OP(IOP_MISC) | EIF_F3_RD(rd) | EIF_F3_OP3(INS2_WR) |	\
 	    EIF_F3_RS1(rs1) | EIF_F3_I(1) | EIF_IMM(imm13, 13))
 
-#define	PATCH(addr) do { \
-	if (addr[0] != SETHI(IF_F2_RD(addr[0]), 0x0) || \
-	    addr[1] != OR_R_I_R(IF_F3_RD(addr[1]), 0x0, IF_F3_RS1(addr[1])) || \
-	    addr[2] != SETHI(IF_F2_RD(addr[2]), 0x0)) \
-		panic("pmap_boostrap: patched instructions have changed"); \
-	addr[0] |= EIF_IMM((tsb_kernel_mask) >> 10, 22); \
-	addr[1] |= EIF_IMM(tsb_kernel_mask, 10); \
-	addr[2] |= EIF_IMM(((vm_offset_t)tsb_kernel) >> 10, 22); \
-	flush(addr); \
-	flush(addr + 1); \
-	flush(addr + 2); \
+#define	PATCH_ASI(addr, asi) do {					\
+	if (addr[0] != WR_R_I(IF_F3_RD(addr[0]), 0x0,			\
+	    IF_F3_RS1(addr[0])))					\
+		panic("%s: patched instructions have changed",		\
+		    __func__);						\
+	addr[0] |= EIF_IMM((asi), 13);					\
+	flush(addr);							\
+} while (0)
+
+#define	PATCH_LDD(addr, asi) do {					\
+	if (addr[0] != LDDA_R_I_R(IF_F3_RD(addr[0]), 0x0,		\
+	    IF_F3_RS1(addr[0]), IF_F3_RS2(addr[0])))			\
+		panic("%s: patched instructions have changed",		\
+		    __func__);						\
+	addr[0] |= EIF_F3_IMM_ASI(asi);					\
+	flush(addr);							\
 } while (0)
 
-	PATCH(tl1_immu_miss_patch_1);
-	PATCH(tl1_immu_miss_patch_2);
-	PATCH(tl1_dmmu_miss_patch_1);
-	PATCH(tl1_dmmu_miss_patch_2);
-	PATCH(tl1_dmmu_prot_patch_1);
-	PATCH(tl1_dmmu_prot_patch_2);
+#define	PATCH_TSB(addr, val) do {					\
+	if (addr[0] != SETHI(IF_F2_RD(addr[0]), 0x0) ||			\
+	    addr[1] != OR_R_I_R(IF_F3_RD(addr[1]), 0x0,			\
+	    IF_F3_RS1(addr[1]))	||					\
+	    addr[3] != SETHI(IF_F2_RD(addr[3]), 0x0))			\
+		panic("%s: patched instructions have changed",		\
+		    __func__);						\
+	addr[0] |= EIF_IMM((val) >> 42, 22);				\
+	addr[1] |= EIF_IMM((val) >> 32, 10);				\
+	addr[3] |= EIF_IMM((val) >> 10, 22);				\
+	flush(addr);							\
+	flush(addr + 1);						\
+	flush(addr + 3);						\
+} while (0)
+
+#define	PATCH_TSB_MASK(addr, val) do {					\
+	if (addr[0] != SETHI(IF_F2_RD(addr[0]), 0x0) ||			\
+	    addr[1] != OR_R_I_R(IF_F3_RD(addr[1]), 0x0,			\
+	    IF_F3_RS1(addr[1])))					\
+		panic("%s: patched instructions have changed",		\
+		    __func__);						\
+	addr[0] |= EIF_IMM((val) >> 10, 22);				\
+	addr[1] |= EIF_IMM((val), 10);					\
+	flush(addr);							\
+	flush(addr + 1);						\
+} while (0)
+
+	if (tsb_kernel_ldd_phys == 0) {
+		asi = ASI_N;
+		ldd = ASI_NUCLEUS_QUAD_LDD;
+		off = (vm_offset_t)tsb_kernel;
+	} else {
+		asi = ASI_PHYS_USE_EC;
+		ldd = ASI_ATOMIC_QUAD_LDD_PHYS;
+		off = (vm_offset_t)tsb_kernel_phys;
+	}
+	PATCH_TSB(tl1_dmmu_miss_direct_patch_tsb_phys_1, tsb_kernel_phys);
+	PATCH_TSB(tl1_dmmu_miss_direct_patch_tsb_phys_end_1,
+	    tsb_kernel_phys + tsb_kernel_size - 1);
+	PATCH_ASI(tl1_dmmu_miss_patch_asi_1, asi);
+	PATCH_LDD(tl1_dmmu_miss_patch_quad_ldd_1, ldd);
+	PATCH_TSB(tl1_dmmu_miss_patch_tsb_1, off);
+	PATCH_TSB(tl1_dmmu_miss_patch_tsb_2, off);
+	PATCH_TSB_MASK(tl1_dmmu_miss_patch_tsb_mask_1, tsb_kernel_mask);
+	PATCH_TSB_MASK(tl1_dmmu_miss_patch_tsb_mask_2, tsb_kernel_mask);
+	PATCH_ASI(tl1_dmmu_prot_patch_asi_1, asi);
+	PATCH_LDD(tl1_dmmu_prot_patch_quad_ldd_1, ldd);
+	PATCH_TSB(tl1_dmmu_prot_patch_tsb_1, off);
+	PATCH_TSB(tl1_dmmu_prot_patch_tsb_2, off);
+	PATCH_TSB_MASK(tl1_dmmu_prot_patch_tsb_mask_1, tsb_kernel_mask);
+	PATCH_TSB_MASK(tl1_dmmu_prot_patch_tsb_mask_2, tsb_kernel_mask);
+	PATCH_ASI(tl1_immu_miss_patch_asi_1, asi);
+	PATCH_LDD(tl1_immu_miss_patch_quad_ldd_1, ldd);
+	PATCH_TSB(tl1_immu_miss_patch_tsb_1, off);
+	PATCH_TSB(tl1_immu_miss_patch_tsb_2, off);
+	PATCH_TSB_MASK(tl1_immu_miss_patch_tsb_mask_1, tsb_kernel_mask);
+	PATCH_TSB_MASK(tl1_immu_miss_patch_tsb_mask_2, tsb_kernel_mask);
 
 	/*
 	 * Enter fake 8k pages for the 4MB kernel pages, so that
@@ -437,9 +542,10 @@ pmap_bootstrap(u_int cpu_impl)
 		va = kernel_tlbs[i].te_va;
 		for (off = 0; off < PAGE_SIZE_4M; off += PAGE_SIZE) {
 			tp = tsb_kvtotte(va + off);
-			tp->tte_vpn = TV_VPN(va + off, TS_8K);
-			tp->tte_data = TD_V | TD_8K | TD_PA(pa + off) |
-			    TD_REF | TD_SW | TD_CP | TD_CV | TD_P | TD_W;
+			vpn = TV_VPN(va + off, TS_8K);
+			data = TD_V | TD_8K | TD_PA(pa + off) | TD_REF |
+			    TD_SW | TD_CP | TD_CV | TD_P | TD_W;
+			pmap_bootstrap_set_tte(tp, vpn, data);
 		}
 	}
 
@@ -480,9 +586,10 @@ pmap_bootstrap(u_int cpu_impl)
 		pa = kstack0_phys + i * PAGE_SIZE;
 		va = kstack0 + i * PAGE_SIZE;
 		tp = tsb_kvtotte(va);
-		tp->tte_vpn = TV_VPN(va, TS_8K);
-		tp->tte_data = TD_V | TD_8K | TD_PA(pa) | TD_REF | TD_SW |
-		    TD_CP | TD_CV | TD_P | TD_W;
+		vpn = TV_VPN(va, TS_8K);
+		data = TD_V | TD_8K | TD_PA(pa) | TD_REF | TD_SW | TD_CP |
+		    TD_CV | TD_P | TD_W;
+		pmap_bootstrap_set_tte(tp, vpn, data);
 	}
 
 	/*
@@ -522,9 +629,8 @@ pmap_bootstrap(u_int cpu_impl)
 		    off += PAGE_SIZE) {
 			va = translations[i].om_start + off;
 			tp = tsb_kvtotte(va);
-			tp->tte_vpn = TV_VPN(va, TS_8K);
-			tp->tte_data =
-			    ((translations[i].om_tte &
+			vpn = TV_VPN(va, TS_8K);
+			data = ((translations[i].om_tte &
 			    ~((TD_SOFT2_MASK << TD_SOFT2_SHIFT) |
 			    (cpu_impl >= CPU_IMPL_ULTRASPARCI &&
 			    cpu_impl < CPU_IMPL_ULTRASPARCIII ?
@@ -532,6 +638,7 @@ pmap_bootstrap(u_int cpu_impl)
 			    (TD_RSVD_CH_MASK << TD_RSVD_CH_SHIFT)) |
 			    (TD_SOFT_MASK << TD_SOFT_SHIFT))) | TD_EXEC) +
 			    off;
+			pmap_bootstrap_set_tte(tp, vpn, data);
 		}
 	}
 
@@ -566,20 +673,17 @@ pmap_bootstrap(u_int cpu_impl)
 	tlb_flush_nonlocked();
 }
 
+/*
+ * Map the 4MB kernel TSB pages.
+ */
 void
 pmap_map_tsb(void)
 {
 	vm_offset_t va;
 	vm_paddr_t pa;
 	u_long data;
-	register_t s;
 	int i;
 
-	s = intr_disable();
-
-	/*
-	 * Map the 4MB TSB pages.
-	 */
 	for (i = 0; i < tsb_kernel_size; i += PAGE_SIZE_4M) {
 		va = (vm_offset_t)tsb_kernel + i;
 		pa = tsb_kernel_phys + i;
@@ -589,16 +693,19 @@ pmap_map_tsb(void)
 		    TLB_TAR_CTX(TLB_CTX_KERNEL));
 		stxa_sync(0, ASI_DTLB_DATA_IN_REG, data);
 	}
+}
+
+/*
+ * Set the secondary context to be the kernel context (needed for FP block
+ * operations in the kernel).
+ */
+void
+pmap_set_kctx(void)
+{
 
-	/*
-	 * Set the secondary context to be the kernel context (needed for
-	 * FP block operations in the kernel).
-	 */
 	stxa(AA_DMMU_SCXR, ASI_DMMU, (ldxa(AA_DMMU_SCXR, ASI_DMMU) &
 	    TLB_CXR_PGSZ_MASK) | TLB_CTX_KERNEL);
 	flush(KERNBASE);
-
-	intr_restore(s);
 }
 
 /*
@@ -624,6 +731,27 @@ pmap_bootstrap_alloc(vm_size_t size, uin
 }
 
 /*
+ * Set a TTE.  This function is intended as a helper when tsb_kernel is
+ * direct-mapped but we haven't taken over the trap table, yet, as it's the
+ * case when we are taking advantage of ASI_ATOMIC_QUAD_LDD_PHYS to access
+ * the kernel TSB.
+ */
+void
+pmap_bootstrap_set_tte(struct tte *tp, u_long vpn, u_long data)
+{
+
+	if (tsb_kernel_ldd_phys == 0) {
+		tp->tte_vpn = vpn;
+		tp->tte_data = data;
+	} else {
+		stxa((vm_paddr_t)tp + offsetof(struct tte, tte_vpn),
+		    ASI_PHYS_USE_EC, vpn);
+		stxa((vm_paddr_t)tp + offsetof(struct tte, tte_data),
+		    ASI_PHYS_USE_EC, data);
+	}
+}
+
+/*
  * Initialize a vm_page's machine-dependent fields.
  */
 void

Modified: stable/8/sys/sparc64/sparc64/tsb.c
==============================================================================
--- stable/8/sys/sparc64/sparc64/tsb.c	Sat May 14 20:51:19 2011	(r221917)
+++ stable/8/sys/sparc64/sparc64/tsb.c	Sat May 14 21:03:44 2011	(r221918)
@@ -26,9 +26,11 @@
  * SUCH DAMAGE.
  *
  *	from BSDI: pmap.c,v 1.28.2.15 2000/04/27 03:10:31 cp Exp
- * $FreeBSD$
  */
 
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
 #include "opt_ddb.h"
 #include "opt_pmap.h"
 
@@ -42,7 +44,7 @@
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
-#include <vm/vm.h> 
+#include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
@@ -77,6 +79,7 @@ struct tte *tsb_kernel;
 vm_size_t tsb_kernel_mask;
 vm_size_t tsb_kernel_size;
 vm_paddr_t tsb_kernel_phys;
+u_int tsb_kernel_ldd_phys;
 
 struct tte *
 tsb_tte_lookup(pmap_t pm, vm_offset_t va)


