Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 10 Jan 2001 21:35:52 +0900 (JST)
From:      NAKAMURA Kazushi <kaz@kobe1995.net>
To:        FreeBSD-gnats-submit@freebsd.org
Cc:        kaz@ns.kobe1995.net
Subject:   kern/24219: Pentium3 SSE patch for 4.2R
Message-ID:  <200101101235.f0ACZq000695@beauty.kobe1995.net>

next in thread | raw e-mail | index | archive | help

>Number:         24219
>Category:       kern
>Synopsis:       <4.2R can't use Pentium3 SSE instructions>
>Confidential:   no
>Severity:       serious
>Priority:       medium
>Responsible:    freebsd-bugs
>State:          open
>Quarter:        
>Keywords:       
>Date-Required:
>Class:          sw-bug
>Submitter-Id:   current-users
>Arrival-Date:   Wed Jan 10 04:40:01 PST 2001
>Closed-Date:
>Last-Modified:
>Originator:     NAKAMURA Kazushi
>Release:        FreeBSD 4.2-RELEASE i386
>Organization:
PCshop LABBIT
>Environment:

	FreeBSD4.2R/Pentium3

>Description:

	FreeBSD 4.2R's gcc 2.95.2 and gas 2.10.0 can compile and assemble MMX2/SSE
	instructions, but SSE instructions cannot be executed under the 4.2R kernel.
	I don't know about 5-current, but I want to use SSE instructions on 4-stable.
	Please merge this patch.
	This patch originated from titech.ac.jp (sorry, I have forgotten the URL).

	Caution: This patch has NOT been tested on a Mendocino Celeron.

>How-To-Repeat:

	Any code that uses SSE instructions (XMM registers).

>Fix:

	Apply this patch and append "option CPU_ENABLE_SSE" to the kernel
	configuration file, then recompile the kernel.

diff -ruN -9 sys/conf/options.i386 sys+/conf/options.i386
--- sys/conf/options.i386	Sat Sep 30 11:49:30 2000
+++ sys+/conf/options.i386	Tue Jan  9 13:44:18 2001
@@ -52,18 +52,19 @@
 CPU_LOOP_EN			opt_cpu.h
 CPU_PPRO2CELERON		opt_cpu.h
 CPU_RSTK_EN			opt_cpu.h
 CPU_SUSP_HLT			opt_cpu.h
 CPU_UPGRADE_HW_CACHE		opt_cpu.h
 CPU_WT_ALLOC			opt_cpu.h
 CYRIX_CACHE_WORKS		opt_cpu.h
 CYRIX_CACHE_REALLY_WORKS	opt_cpu.h
 NO_MEMORY_HOLE			opt_cpu.h
+CPU_ENABLE_SSE			opt_cpu.h
 
 # The CPU type affects the endian conversion functions all over the kernel.
 I386_CPU		opt_global.h
 I486_CPU		opt_global.h
 I586_CPU		opt_global.h
 I686_CPU		opt_global.h
 
 MAXCONS			opt_syscons.h
 SC_ALT_MOUSE_IMAGE	opt_syscons.h
diff -ruN -9 sys/i386/conf/LINT sys+/i386/conf/LINT
--- sys/i386/conf/LINT	Sat Nov 18 18:22:07 2000
+++ sys+/i386/conf/LINT	Tue Jan  9 13:45:36 2001
@@ -158,18 +158,20 @@
 #
 # CPU_CYRIX_NO_LOCK enables weak locking for the entire address space
 # of Cyrix 6x86 and 6x86MX CPUs by setting the NO_LOCK bit of CCR1.
 # Otherwise, the NO_LOCK bit of CCR1 is cleared.  (NOTE 3)
 #
 # CPU_DISABLE_5X86_LSSER disables load store serialize (i.e. enables
 # reorder).  This option should not be used if you use memory mapped
 # I/O device(s).
 #
+# CPU_ENABLE_SSE enables SSE/MMX2 instruction support.
+#
 # CPU_FASTER_5X86_FPU enables faster FPU exception handler.
 #
 # CPU_I486_ON_386 enables CPU cache on i486 based CPU upgrade products
 # for i386 machines.
 #
 # CPU_IORT defines I/O clock delay time (NOTE 1).  Default values of
 # I/O clock delay time on Cyrix 5x86 and 6x86 are 0 and 7,respectively
 # (no clock delay).
 #
diff -ruN -9 sys/i386/i386/genassym.c sys+/i386/i386/genassym.c
--- sys/i386/i386/genassym.c	Tue May 16 15:58:06 2000
+++ sys+/i386/i386/genassym.c	Tue Jan  9 13:49:18 2001
@@ -126,20 +126,21 @@
 ASSYM(PCB_DBREGS, PCB_DBREGS);
 ASSYM(PCB_EXT, offsetof(struct pcb, pcb_ext));
 
 #ifdef SMP
 ASSYM(PCB_MPNEST, offsetof(struct pcb, pcb_mpnest));
 #endif
 
 ASSYM(PCB_SPARE, offsetof(struct pcb, __pcb_spare));
 ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags));
-ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_savefpu));
-ASSYM(PCB_SAVEFPU_SIZE, sizeof(struct save87));
+ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save));
+ASSYM(PCB_SAVEFPU_SIZE, sizeof(union savefpu));
+ASSYM(PCB_SAVE87_SIZE, sizeof(struct save87));
 ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault));
 
 #ifdef SMP
 ASSYM(PCB_SIZE, sizeof(struct pcb));
 #endif
 
 ASSYM(TF_TRAPNO, offsetof(struct trapframe, tf_trapno));
 ASSYM(TF_ERR, offsetof(struct trapframe, tf_err));
 ASSYM(TF_CS, offsetof(struct trapframe, tf_cs));
diff -ruN -9 sys/i386/i386/initcpu.c sys+/i386/i386/initcpu.c
--- sys/i386/i386/initcpu.c	Sun Oct 15 12:09:32 2000
+++ sys+/i386/i386/initcpu.c	Tue Jan  9 14:04:38 2001
@@ -28,18 +28,19 @@
  *
  * $FreeBSD: src/sys/i386/i386/initcpu.c,v 1.19.2.2 2000/10/15 03:09:32 nyan Exp $
  */
 
 #include "opt_cpu.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
+#include <sys/sysctl.h>
 
 #include <machine/cputypes.h>
 #include <machine/md_var.h>
 #include <machine/specialreg.h>
 
 void initializecpu(void);
 #if defined(I586_CPU) && defined(CPU_WT_ALLOC)
 void	enable_K5_wt_alloc(void);
 void	enable_K6_wt_alloc(void);
@@ -55,19 +56,27 @@
 static void init_i486_on_386(void);
 #endif
 static void init_6x86(void);
 #endif /* I486_CPU */
 
 #ifdef I686_CPU
 static void	init_6x86MX(void);
 static void	init_ppro(void);
 static void	init_mendocino(void);
+#if defined(CPU_ENABLE_SSE)
+void	init_sse(void);
 #endif
+#endif /* I686_CPU */
+
+int	hw_instruction_sse = 0;
+SYSCTL_INT(_hw, OID_AUTO, instruction_sse, CTLFLAG_RD,
+	&hw_instruction_sse, 0,
+	"SSE/MMX2 instructions available in CPU");
 
 #ifdef I486_CPU
 /*
  * IBM Blue Lightning
  */
 static void
 init_bluelightning(void)
 {
 	u_long	eflags;
@@ -494,19 +503,28 @@
 		bbl_cr_ctl3 |= 5 << 1;
 #endif
 		wrmsr(0x11e, bbl_cr_ctl3);
 	}
 
 	load_cr0(rcr0() & ~(CR0_CD | CR0_NW));
 	write_eflags(eflags);
 #endif /* CPU_PPRO2CELERON */
 }
-	
+#if defined(CPU_ENABLE_SSE)
+void
+init_sse(void)
+{
+	if ((cpu_feature & CPUID_XMM) && (cpu_feature & CPUID_FXSR)) {
+		load_cr4(rcr4() | CR4_FXSR | CR4_XMM);
+		cpu_fxsr = hw_instruction_sse = 1;
+	}
+}
+#endif
 #endif /* I686_CPU */
 
 void
 initializecpu(void)
 {
 
 	switch (cpu) {
 #ifdef I486_CPU
 	case CPU_BLUE:
@@ -538,18 +556,21 @@
 		if (strcmp(cpu_vendor, "GenuineIntel") == 0) {
 			switch (cpu_id & 0xff0) {
 			case 0x610:
 				init_ppro();
 				break;
 			case 0x660:
 				init_mendocino();
 				break;
 			}
+#if defined(CPU_ENABLE_SSE)
+			init_sse();
+#endif
 		}
 		break;
 #endif
 	default:
 		break;
 	}
 
 #if defined(PC98) && !defined(CPU_UPGRADE_HW_CACHE)
 	/*
diff -ruN -9 sys/i386/i386/locore.s sys+/i386/i386/locore.s
--- sys/i386/i386/locore.s	Fri Jul  7 09:38:46 2000
+++ sys+/i386/i386/locore.s	Tue Jan  9 14:07:36 2001
@@ -90,24 +90,25 @@
 	ALIGN_DATA		/* just to be sure */
 
 	.globl	HIDENAME(tmpstk)
 	.space	0x2000		/* space for tmpstk - temporary stack */
 HIDENAME(tmpstk):
 
 	.globl	_boothowto,_bootdev
 
 	.globl	_cpu,_cpu_vendor,_cpu_id,_bootinfo
-	.globl	_cpu_high, _cpu_feature
+	.globl	_cpu_high, _cpu_feature, _cpu_fxsr
 
 _cpu:		.long	0			/* are we 386, 386sx, or 486 */
 _cpu_id:	.long	0			/* stepping ID */
 _cpu_high:	.long	0			/* highest arg to CPUID */
 _cpu_feature:	.long	0			/* features */
+_cpu_fxsr:	.long	0			/* use fxsave/fxrstor instruction */
 _cpu_vendor:	.space	20			/* CPU origin code */
 _bootinfo:	.space	BOOTINFO_SIZE		/* bootinfo that we can handle */
 
 _KERNend:	.long	0			/* phys addr end of kernel (just after bss) */
 physfree:	.long	0			/* phys addr of next free page */
 
 #ifdef SMP
 		.globl	_cpu0prvpage
 cpu0pp:		.long	0			/* phys addr cpu0 private pg */
diff -ruN -9 sys/i386/i386/machdep.c sys+/i386/i386/machdep.c
--- sys/i386/i386/machdep.c	Fri Oct 27 18:07:22 2000
+++ sys+/i386/i386/machdep.c	Tue Jan  9 14:21:51 2001
@@ -119,18 +119,22 @@
 extern void dblfault_handler __P((void));
 
 extern void printcpuinfo(void);	/* XXX header file */
 extern void earlysetcpuclass(void);	/* same header file */
 extern void finishidentcpu(void);
 extern void panicifcpuunsupported(void);
 extern void initializecpu(void);
 
 static void cpu_startup __P((void *));
+#ifdef CPU_ENABLE_SSE
+static void set_fpregs_xmm __P((struct save87 *, struct savexmm *));
+static void fill_fpregs_xmm __P((struct savexmm *, struct save87 *));
+#endif /* CPU_ENABLE_SSE */
 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL)
 
 static MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf");
 
 int	_udatasel, _ucodesel;
 u_int	atdevbase;
 
 #if defined(SWTCH_OPTIM_STATS)
 extern int swtch_optim_stats;
@@ -2076,20 +2080,20 @@
 		tp = p->p_md.md_regs;
 		frame_copy = *tp;
 		*(int *)((char *)&frame_copy + (off - min)) = data;
 		if (!EFL_SECURE(frame_copy.tf_eflags, tp->tf_eflags) ||
 		    !CS_SECURE(frame_copy.tf_cs))
 			return (EINVAL);
 		*(int*)((char *)p->p_addr + off) = data;
 		return (0);
 	}
-	min = offsetof(struct user, u_pcb) + offsetof(struct pcb, pcb_savefpu);
-	if (off >= min && off <= min + sizeof(struct save87) - sizeof(int)) {
+	min = offsetof(struct user, u_pcb) + offsetof(struct pcb, pcb_save);
+	if (off >= min && off <= min + sizeof(union savefpu) - sizeof(int)) {
 		*(int*)((char *)p->p_addr + off) = data;
 		return (0);
 	}
 	return (EFAULT);
 }
 
 int
 fill_regs(p, regs)
 	struct proc *p;
@@ -2145,33 +2149,101 @@
 	tp->tf_cs = regs->r_cs;
 	tp->tf_eflags = regs->r_eflags;
 	tp->tf_esp = regs->r_esp;
 	tp->tf_ss = regs->r_ss;
 	pcb = &p->p_addr->u_pcb;
 	pcb->pcb_gs = regs->r_gs;
 	return (0);
 }
 
+#ifdef CPU_ENABLE_SSE
+static void
+fill_fpregs_xmm(sv_xmm, sv_87)
+	struct savexmm *sv_xmm;
+	struct save87 *sv_87;
+{
+	register struct env87 *penv_87 = &sv_87->sv_env;
+	register struct envxmm *penv_xmm = &sv_xmm->sv_env;
+	int i;
+
+	/* FPU control/status */
+	penv_87->en_cw = penv_xmm->en_cw;
+	penv_87->en_sw = penv_xmm->en_sw;
+	penv_87->en_tw = penv_xmm->en_tw;
+	penv_87->en_fip = penv_xmm->en_fip;
+	penv_87->en_fcs = penv_xmm->en_fcs;
+	penv_87->en_opcode = penv_xmm->en_opcode;
+	penv_87->en_foo = penv_xmm->en_foo;
+	penv_87->en_fos = penv_xmm->en_fos;
+
+	/* FPU registers */
+	for (i = 0; i < 8; ++i)
+		sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
+
+	sv_87->sv_ex_sw = sv_xmm->sv_ex_sw;
+}
+
+static void
+set_fpregs_xmm(sv_87, sv_xmm)
+	struct save87 *sv_87;
+	struct savexmm *sv_xmm;
+{
+	register struct env87 *penv_87 = &sv_87->sv_env;
+	register struct envxmm *penv_xmm = &sv_xmm->sv_env;
+	int i;
+
+/* FPU control/status */
+	penv_xmm->en_cw = penv_87->en_cw;
+	penv_xmm->en_sw = penv_87->en_sw;
+	penv_xmm->en_tw = penv_87->en_tw;
+	penv_xmm->en_fip = penv_87->en_fip;
+	penv_xmm->en_fcs = penv_87->en_fcs;
+	penv_xmm->en_opcode = penv_87->en_opcode;
+	penv_xmm->en_foo = penv_87->en_foo;
+	penv_xmm->en_fos = penv_87->en_fos;
+
+	/* FPU registers */
+	for (i = 0; i < 8; ++i)
+		sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
+
+	sv_xmm->sv_ex_sw = sv_87->sv_ex_sw;
+}
+#endif /* CPU_ENABLE_SSE */
+
 int
 fill_fpregs(p, fpregs)
 	struct proc *p;
 	struct fpreg *fpregs;
 {
-	bcopy(&p->p_addr->u_pcb.pcb_savefpu, fpregs, sizeof *fpregs);
+#ifdef CPU_ENABLE_SSE
+	if (cpu_fxsr) {
+		fill_fpregs_xmm(&p->p_addr->u_pcb.pcb_save.sv_xmm,
+						(struct save87 *)fpregs);
+		return (0);
+	}
+#endif /* CPU_ENABLE_SSE */
+	bcopy(&p->p_addr->u_pcb.pcb_save.sv_87, fpregs, sizeof *fpregs);
 	return (0);
 }
 
 int
 set_fpregs(p, fpregs)
 	struct proc *p;
 	struct fpreg *fpregs;
 {
-	bcopy(fpregs, &p->p_addr->u_pcb.pcb_savefpu, sizeof *fpregs);
+#ifdef CPU_ENABLE_SSE
+	if (cpu_fxsr) {
+		set_fpregs_xmm((struct save87 *)fpregs,
+					&p->p_addr->u_pcb.pcb_save.sv_xmm);
+		return (0);
+	}
+#endif /* CPU_ENABLE_SSE */
+	bcopy(fpregs, &p->p_addr->u_pcb.pcb_save.sv_87, sizeof *fpregs);
 	return (0);
 }
 
 int
 fill_dbregs(p, dbregs)
 	struct proc *p;
 	struct dbreg *dbregs;
 {
 	struct pcb *pcb;
diff -ruN -9 sys/i386/i386/mp_machdep.c sys+/i386/i386/mp_machdep.c
--- sys/i386/i386/mp_machdep.c	Sat Sep 30 11:49:32 2000
+++ sys+/i386/i386/mp_machdep.c	Tue Jan  9 14:26:16 2001
@@ -229,18 +229,22 @@
 #define MP_ENABLE_POST		0x14
 #define MPTABLE_PASS2_POST	0x15
 
 #define START_ALL_APS_POST	0x16
 #define INSTALL_AP_TRAMP_POST	0x17
 #define START_AP_POST		0x18
 
 #define MP_ANNOUNCE_POST	0x19
 
+#if defined(CPU_ENABLE_SSE)
+extern void    init_sse(void);
+#endif
+
 
 /** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */
 int	current_postcode;
 
 /** XXX FIXME: what system files declare these??? */
 extern struct region_descriptor r_gdt, r_idt;
 
 int	bsp_apic_ready = 0;	/* flags useability of BSP apic */
 int	mp_ncpus;		/* # of CPUs, including BSP */
@@ -2392,18 +2396,23 @@
 	other_cpus = all_cpus & ~(1 << cpuid);
 
 	printf("SMP: AP CPU #%d Launched!\n", cpuid);
 
 	/* XXX FIXME: i386 specific, and redundant: Setup the FPU. */
 	load_cr0((rcr0() & ~CR0_EM) | CR0_MP | CR0_NE | CR0_TS);
 
 	/* set up FPU state on the AP */
 	npxinit(__INITIAL_NPXCW__);
+
+	/* Setup the SSE */
+#if defined(CPU_ENABLE_SSE)
+	init_sse();
+#endif
 
 	/* A quick check from sanity claus */
 	apic_id = (apic_id_to_logical[(lapic.id & 0x0f000000) >> 24]);
 	if (cpuid != apic_id) {
 		printf("SMP: cpuid = %d\n", cpuid);
 		printf("SMP: apic_id = %d\n", apic_id);
 		printf("PTD[MPPTDI] = %p\n", (void *)PTD[MPPTDI]);
 		panic("cpuid mismatch! boom!!");
 	}
diff -ruN -9 sys/i386/i386/support.s sys+/i386/i386/support.s
--- sys/i386/i386/support.s	Sat Sep 30 11:49:33 2000
+++ sys+/i386/i386/support.s	Tue Jan  9 14:29:32 2001
@@ -948,19 +948,19 @@
 	src in %esi
 	dst in %edi
 	len in %ecx		XXX changed to on stack for profiling
 	uses %eax and %edx for tmp. storage
  */
 /* XXX use ENTRY() to get profiling.  fastmove() is actually a non-entry. */
 ENTRY(fastmove)
 	pushl	%ebp
 	movl	%esp,%ebp
-	subl	$PCB_SAVEFPU_SIZE+3*4,%esp
+	subl	$PCB_SAVE87_SIZE+3*4,%esp
 
 	movl	8(%ebp),%ecx
 	cmpl	$63,%ecx
 	jbe	fastmove_tail
 
 	testl	$7,%esi	/* check if src addr is multiple of 8 */
 	jnz	fastmove_tail
 
 	testl	$7,%edi	/* check if dst addr is multiple of 8 */
@@ -987,19 +987,19 @@
  */
 /* tmp = curpcb->pcb_savefpu; */
 	movl	%ecx,-12(%ebp)
 	movl	%esi,-8(%ebp)
 	movl	%edi,-4(%ebp)
 	movl	%esp,%edi
 	movl	_curpcb,%esi
 	addl	$PCB_SAVEFPU,%esi
 	cld
-	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
+	movl	$PCB_SAVE87_SIZE>>2,%ecx
 	rep
 	movsl
 	movl	-12(%ebp),%ecx
 	movl	-8(%ebp),%esi
 	movl	-4(%ebp),%edi
 /* stop_emulating(); */
 	clts
 /* npxproc = curproc; */
 	movl	_curproc,%eax
@@ -1064,19 +1064,19 @@
 
 /* curpcb->pcb_savefpu = tmp; */
 	movl	%ecx,-12(%ebp)
 	movl	%esi,-8(%ebp)
 	movl	%edi,-4(%ebp)
 	movl	_curpcb,%edi
 	addl	$PCB_SAVEFPU,%edi
 	movl	%esp,%esi
 	cld
-	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
+	movl	$PCB_SAVE87_SIZE>>2,%ecx
 	rep
 	movsl
 	movl	-12(%ebp),%ecx
 	movl	-8(%ebp),%esi
 	movl	-4(%ebp),%edi
 
 /* start_emulating(); */
 	smsw	%ax
 	orb	$CR0_TS,%al
@@ -1103,19 +1103,19 @@
 	popl	%ebp
 	ret
 
 	ALIGN_TEXT
 fastmove_fault:
 	movl	_curpcb,%edi
 	addl	$PCB_SAVEFPU,%edi
 	movl	%esp,%esi
 	cld
-	movl	$PCB_SAVEFPU_SIZE>>2,%ecx
+	movl	$PCB_SAVE87_SIZE>>2,%ecx
 	rep
 	movsl
 
 	smsw	%ax
 	orb	$CR0_TS,%al
 	lmsw	%ax
 	movl	$0,_npxproc
 
 fastmove_tail_fault:
diff -ruN -9 sys/i386/i386/vm_machdep.c sys+/i386/i386/vm_machdep.c
--- sys/i386/i386/vm_machdep.c	Sat Aug 26 13:19:26 2000
+++ sys+/i386/i386/vm_machdep.c	Tue Jan  9 14:30:56 2001
@@ -135,19 +135,19 @@
 			}
 		}
 #endif
 		return;
 	}
 
 #if NNPX > 0
 	/* Ensure that p1's pcb is up to date. */
 	if (npxproc == p1)
-		npxsave(&p1->p_addr->u_pcb.pcb_savefpu);
+		npxsave(&p1->p_addr->u_pcb.pcb_save);
 #endif
 
 	/* Copy p1's pcb. */
 	p2->p_addr->u_pcb = p1->p_addr->u_pcb;
 	pcb2 = &p2->p_addr->u_pcb;
 
 	/*
 	 * Create a new fresh stack for the new process.
 	 * Copy the trap frame for the return to user mode as if from a
diff -ruN -9 sys/i386/include/asnames.h sys+/i386/include/asnames.h
--- sys/i386/include/asnames.h	Tue May 16 15:58:10 2000
+++ sys+/i386/include/asnames.h	Tue Jan  9 14:32:12 2001
@@ -185,18 +185,19 @@
 #define _copyin_vector			copyin_vector
 #define _copyout_vector			copyout_vector
 #define _cpl				cpl
 #define _cpl_lock			cpl_lock
 #define _cpu				cpu
 #define _cpu0prvpage			cpu0prvpage
 #define _cpu_apic_versions		cpu_apic_versions
 #define _cpu_class			cpu_class
 #define _cpu_feature			cpu_feature
+#define _cpu_fxsr			cpu_fxsr
 #define _cpu_high			cpu_high
 #define _cpu_id				cpu_id
 #define _cpu_num_to_apic_id		cpu_num_to_apic_id
 #define _cpu_switch			cpu_switch
 #define _cpu_vendor			cpu_vendor
 #define _default_halt			default_halt
 #define _denormal_operand		denormal_operand
 #define _div_small			div_small
 #define _divide_by_zero			divide_by_zero
diff -ruN -9 sys/i386/include/md_var.h sys+/i386/include/md_var.h
--- sys/i386/include/md_var.h	Mon Feb 21 05:51:23 2000
+++ sys+/i386/include/md_var.h	Tue Jan  9 14:33:24 2001
@@ -41,18 +41,19 @@
 extern	void	(*bcopy_vector) __P((const void *from, void *to, size_t len));
 extern	int	busdma_swi_pending;
 extern	int	(*copyin_vector) __P((const void *udaddr, void *kaddr,
 				      size_t len));
 extern	int	(*copyout_vector) __P((const void *kaddr, void *udaddr,
 				       size_t len));
 extern	u_int	cpu_feature;
 extern	u_int	cpu_high;
 extern	u_int	cpu_id;
+extern	u_int	cpu_fxsr;
 extern	char	cpu_vendor[];
 extern	u_int	cyrix_did;
 extern	char	kstack[];
 #ifdef PC98
 extern	int	need_pre_dma_flush;
 extern	int	need_post_dma_flush;
 #endif
 extern	void	(*netisrs[32]) __P((void));
 extern	int	nfs_diskless_valid;
diff -ruN -9 sys/i386/include/npx.h sys+/i386/include/npx.h
--- sys/i386/include/npx.h	Sat Mar 11 02:56:33 2000
+++ sys+/i386/include/npx.h	Tue Jan  9 14:40:14 2001
@@ -81,18 +81,54 @@
 	 * struct and arrange to store into this struct (ending here)
 	 * before it is inspected for ptracing or for core dumps.  Some
 	 * emulators overwrite the whole struct.  We have no good way of
 	 * knowing how much padding to leave.  Leave just enough for the
 	 * GPL emulator's i387_union (176 bytes total).
 	 */
 	u_char	sv_pad[64];	/* padding; used by emulators */
 };
 
+struct  envxmm {
+	u_int16_t	en_cw;		/* control word (16bits) */
+	u_int16_t	en_sw;		/* status word (16bits) */
+	u_int16_t	en_tw;		/* tag word (16bits) */
+	u_int16_t	en_opcode;	/* opcode last executed (11 bits ) */
+	u_int32_t	en_fip;		/* floating point instruction pointer */
+	u_int16_t	en_fcs;		/* floating code segment selector */
+	u_int16_t	en_pad0;	/* padding */
+	u_int32_t	en_foo;		/* floating operand offset */
+	u_int16_t	en_fos;		/* floating operand segment selector */
+	u_int16_t	en_pad1;	/* padding */
+	u_int32_t	en_mxcsr;	/* SSE control/status register */
+	u_int32_t	en_pad2;	/* padding */
+};
+
+/* Contents of each SSE extended accumulator */
+struct	xmmacc {
+	u_char	xmm_bytes[16];
+};
+
+struct  savexmm {
+	struct	envxmm	sv_env;
+	struct {
+		struct fpacc87	fp_acc;
+		u_char		fp_pad[6];	/* padding */
+	} sv_fp[8];
+	struct xmmacc	sv_xmm[8];
+	u_long sv_ex_sw;	/* status word for last exception */
+	u_char sv_pad[220];
+};
+
+union	savefpu {
+	struct	save87	sv_87;
+	struct	savexmm	sv_xmm;
+};
+
 /*
  * The hardware default control word for i387's and later coprocessors is
  * 0x37F, giving:
  *
  *	round to nearest
  *	64-bit precision
  *	all exceptions masked.
  *
  * We modify the affine mode bit and precision bits in this to give:
@@ -108,13 +144,13 @@
 
 #ifdef _KERNEL
 #ifndef npxproc
 extern struct proc *npxproc;
 #endif
 
 int	npxdna __P((void));
 void	npxexit __P((struct proc *p));
 void	npxinit __P((int control));
-void	npxsave __P((struct save87 *addr));
+void	npxsave __P((union savefpu *addr));
 #endif
 
 #endif /* !_MACHINE_NPX_H_ */
diff -ruN -9 sys/i386/include/pcb.h sys+/i386/include/pcb.h
--- sys/i386/include/pcb.h	Wed Dec 29 13:33:03 1999
+++ sys+/i386/include/pcb.h	Tue Jan  9 14:43:07 2001
@@ -61,19 +61,20 @@
 	int     pcb_dr3;
 	int     pcb_dr6;
 	int     pcb_dr7;
 
 #ifdef USER_LDT
 	struct	pcb_ldt *pcb_ldt;	/* per process (user) LDT */
 #else
 	struct	pcb_ldt	*pcb_ldt_dontuse;
 #endif
-	struct	save87	pcb_savefpu;	/* floating point state for 287/387 */
+	union	savefpu	pcb_save;
+#define pcb_savefpu	pcb_save.sv_87
 	u_char	pcb_flags;
 #define	FP_SOFTFP	0x01	/* process using software fltng pnt emulator */
 #define	PCB_DBREGS	0x02	/* process using debug registers */
 	caddr_t	pcb_onfault;	/* copyin/out fault recovery */
 #ifdef SMP
 	u_long	pcb_mpnest;
 #else
 	u_long	pcb_mpnest_dontuse;
 #endif
diff -ruN -9 sys/i386/include/specialreg.h sys+/i386/include/specialreg.h
--- sys/i386/include/specialreg.h	Sat Sep 11 00:51:44 1999
+++ sys+/i386/include/specialreg.h	Tue Jan  9 14:44:25 2001
@@ -87,18 +87,20 @@
 #define	CPUID_MCE	0x0080
 #define	CPUID_CX8	0x0100
 #define	CPUID_APIC	0x0200
 #define	CPUID_B10	0x0400
 #define	CPUID_B11	0x0800
 #define	CPUID_MTRR	0x1000
 #define	CPUID_PGE	0x2000
 #define	CPUID_MCA	0x4000
 #define	CPUID_CMOV	0x8000
+#define	CPUID_FXSR	0x01000000
+#define	CPUID_XMM	0x02000000
 
 /*
  * Model-specific registers for the i386 family
  */
 #define MSR_P5_MC_ADDR		0x000
 #define MSR_P5_MC_TYPE		0x001
 #define MSR_TSC			0x010
 #define MSR_APICBASE		0x01b
 #define MSR_EBL_CR_POWERON	0x02a
diff -ruN -9 sys/i386/isa/npx.c sys+/i386/isa/npx.c
--- sys/i386/isa/npx.c	Sun Jan 30 01:17:36 2000
+++ sys+/i386/isa/npx.c	Tue Jan  9 15:06:48 2001
@@ -29,18 +29,19 @@
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)npx.c	7.2 (Berkeley) 5/12/91
  * $FreeBSD: src/sys/i386/isa/npx.c,v 1.80 2000/01/29 16:17:36 peter Exp $
  */
 
+#include "opt_cpu.h"
 #include "opt_debug_npx.h"
 #include "opt_math_emulate.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
@@ -90,45 +91,60 @@
 #define	fldcw(addr)		__asm("fldcw %0" : : "m" (*(addr)))
 #define	fnclex()		__asm("fnclex")
 #define	fninit()		__asm("fninit")
 #define	fnop()			__asm("fnop")
 #define	fnsave(addr)		__asm __volatile("fnsave %0" : "=m" (*(addr)))
 #define	fnstcw(addr)		__asm __volatile("fnstcw %0" : "=m" (*(addr)))
 #define	fnstsw(addr)		__asm __volatile("fnstsw %0" : "=m" (*(addr)))
 #define	fp_divide_by_0()	__asm("fldz; fld1; fdiv %st,%st(1); fnop")
 #define	frstor(addr)		__asm("frstor %0" : : "m" (*(addr)))
+#define	fxrstor(addr)		__asm("fxrstor %0" : : "m" (*(addr)))
+#define	fxsave(addr)		__asm __volatile("fxsave %0" : "=m" (*(addr)))
 #define	start_emulating()	__asm("smsw %%ax; orb %0,%%al; lmsw %%ax" \
 				      : : "n" (CR0_TS) : "ax")
 #define	stop_emulating()	__asm("clts")
 
 #else	/* not __GNUC__ */
 
 void	fldcw		__P((caddr_t addr));
 void	fnclex		__P((void));
 void	fninit		__P((void));
 void	fnop		__P((void));
 void	fnsave		__P((caddr_t addr));
 void	fnstcw		__P((caddr_t addr));
 void	fnstsw		__P((caddr_t addr));
 void	fp_divide_by_0	__P((void));
 void	frstor		__P((caddr_t addr));
+void	fxsave		__P((caddr_t addr));
+void	fxrstor		__P((caddr_t addr));
 void	start_emulating	__P((void));
 void	stop_emulating	__P((void));
 
 #endif	/* __GNUC__ */
 
+#ifdef CPU_ENABLE_SSE
+#define FPU_STATUS_EX(pcb) \
+	(cpu_fxsr ? \
+		&(pcb)->pcb_save.sv_xmm.sv_ex_sw : \
+		&(pcb)->pcb_save.sv_87.sv_ex_sw)
+#else /* CPU_ENABLE_SSE */
+#define FPU_STATUS_EX(pcb)	(&(pcb)->pcb_save.sv_87.sv_ex_sw)
+#endif /* CPU_ENABLE_SSE */
+
 typedef u_char bool_t;
 
 static	int	npx_attach	__P((device_t dev));
 	void	npx_intr	__P((void *));
 static	void	npx_identify	__P((driver_t *driver, device_t parent));
 static	int	npx_probe	__P((device_t dev));
 static	int	npx_probe1	__P((device_t dev));
+static	void	fpusave		__P((union savefpu *));
+static	void	fpurstor	__P((union savefpu *));
 #ifdef I586_CPU
 static	long	timezero	__P((const char *funcname,
 				     void (*func)(void *buf, size_t len)));
 #endif /* I586_CPU */
 
 int	hw_float;		/* XXX currently just alias for npx_exists */
 
 SYSCTL_INT(_hw,HW_FLOATINGPT, floatingpoint,
 	CTLFLAG_RD, &hw_float, 0, 
@@ -468,45 +484,45 @@
 }
 
 /*
  * Initialize floating point unit.
  */
 void
 npxinit(control)
 	u_short control;
 {
-	struct save87 dummy;
+	union savefpu dummy;
 
 	if (!npx_exists)
 		return;
 	/*
 	 * fninit has the same h/w bugs as fnsave.  Use the detoxified
 	 * fnsave to throw away any junk in the fpu.  npxsave() initializes
 	 * the fpu and sets npxproc = NULL as important side effects.
 	 */
 	npxsave(&dummy);
 	stop_emulating();
 	fldcw(&control);
 	if (curpcb != NULL)
-		fnsave(&curpcb->pcb_savefpu);
+		fpusave(&curpcb->pcb_save);
 	start_emulating();
 }
 
 /*
  * Free coprocessor (if we have it).
  */
 void
 npxexit(p)
 	struct proc *p;
 {
 
 	if (p == npxproc)
-		npxsave(&curpcb->pcb_savefpu);
+		npxsave(&curpcb->pcb_save);
 #ifdef NPX_DEBUG
 	if (npx_exists) {
 		u_int	masked_exceptions;
 
 		masked_exceptions = curpcb->pcb_savefpu.sv_env.en_cw
 				    & curpcb->pcb_savefpu.sv_env.en_sw & 0x7f;
 		/*
 		 * Log exceptions that would have trapped with the old
 		 * control word (overflow, divide by 0, and invalid operand).
@@ -708,32 +724,35 @@
  * solution for signals other than SIGFPE.
  */
 void
 npx_intr(dummy)
 	void *dummy;
 {
 	int code;
 	u_short control;
 	struct intrframe *frame;
+	u_long *pstatus;
 
 	if (npxproc == NULL || !npx_exists) {
 		printf("npxintr: npxproc = %p, curproc = %p, npx_exists = %d\n",
 		       npxproc, curproc, npx_exists);
 		panic("npxintr from nowhere");
 	}
 	if (npxproc != curproc) {
 		printf("npxintr: npxproc = %p, curproc = %p, npx_exists = %d\n",
 		       npxproc, curproc, npx_exists);
 		panic("npxintr from non-current process");
 	}
 
+	pstatus = FPU_STATUS_EX(curpcb);
+
 	outb(0xf0, 0);
-	fnstsw(&curpcb->pcb_savefpu.sv_ex_sw);
+	fnstsw(pstatus);
 	fnstcw(&control);
 	fnclex();
 
 	/*
 	 * Pass exception to process.
 	 */
 	frame = (struct intrframe *)&dummy;	/* XXX */
 	if ((ISPL(frame->if_cs) == SEL_UPL) || (frame->if_eflags & PSL_VM)) {
 		/*
@@ -747,20 +766,19 @@
 		 * in doreti, and the frame for that could easily be set up
 		 * just before it is used).
 		 */
 		curproc->p_md.md_regs = INTR_TO_TRAPFRAME(frame);
 		/*
 		 * Encode the appropriate code for detailed information on
 		 * this exception.
 		 */
 		code = 
-		    fpetable[(curpcb->pcb_savefpu.sv_ex_sw & ~control & 0x3f) |
-			(curpcb->pcb_savefpu.sv_ex_sw & 0x40)];
+		    fpetable[(*pstatus & ~control & 0x3f) | (*pstatus & 0x40)];
 		trapsignal(curproc, SIGFPE, code);
 	} else {
 		/*
 		 * Nested interrupt.  These losers occur when:
 		 *	o an IRQ13 is bogusly generated at a bogus time, e.g.:
 		 *		o immediately after an fnsave or frstor of an
 		 *		  error state.
 		 *		o a couple of 386 instructions after
 		 *		  "fstpl _memvar" causes a stack overflow.
@@ -779,69 +797,73 @@
  * Implement device not available (DNA) exception
  *
  * It would be better to switch FP context here (if curproc != npxproc)
  * and not necessarily for every context switch, but it is too hard to
  * access foreign pcb's.
  */
 int
 npxdna()
 {
+	u_long *pstatus;
+
 	if (!npx_exists)
 		return (0);
 	if (npxproc != NULL) {
 		printf("npxdna: npxproc = %p, curproc = %p\n",
 		       npxproc, curproc);
 		panic("npxdna");
 	}
 	stop_emulating();
 	/*
 	 * Record new context early in case frstor causes an IRQ13.
 	 */
 	npxproc = curproc;
-	curpcb->pcb_savefpu.sv_ex_sw = 0;
+
+	pstatus = FPU_STATUS_EX(curpcb);
+	*pstatus = 0;
+
 	/*
 	 * The following frstor may cause an IRQ13 when the state being
 	 * restored has a pending error.  The error will appear to have been
 	 * triggered by the current (npx) user instruction even when that
 	 * instruction is a no-wait instruction that should not trigger an
 	 * error (e.g., fnclex).  On at least one 486 system all of the
 	 * no-wait instructions are broken the same as frstor, so our
 	 * treatment does not amplify the breakage.  On at least one
 	 * 386/Cyrix 387 system, fnclex works correctly while frstor and
 	 * fnsave are broken, so our treatment breaks fnclex if it is the
 	 * first FPU instruction after a context switch.
 	 */
-	frstor(&curpcb->pcb_savefpu);
+	fpurstor(&curpcb->pcb_save);
 
 	return (1);
 }
 
 /*
  * Wrapper for fnsave instruction to handle h/w bugs.  If there is an error
  * pending, then fnsave generates a bogus IRQ13 on some systems.  Force
  * any IRQ13 to be handled immediately, and then ignore it.  This routine is
  * often called at splhigh so it must not use many system services.  In
  * particular, it's much easier to install a special handler than to
  * guarantee that it's safe to use npxintr() and its supporting code.
  */
 void
 npxsave(addr)
-	struct save87 *addr;
+	union savefpu *addr;
 {
-#ifdef SMP
-
+#if defined(SMP) || defined(CPU_ENABLE_SSE)
 	stop_emulating();
-	fnsave(addr);
+	fpusave(addr);
 	/* fnop(); */
 	start_emulating();
 	npxproc = NULL;
 
-#else /* SMP */
+#else /* SMP or CPU_ENABLE_SSE */
 
 	u_char	icu1_mask;
 	u_char	icu2_mask;
 	u_char	old_icu1_mask;
 	u_char	old_icu2_mask;
 	struct gate_descriptor	save_idt_npxintr;
 
 	disable_intr();
 	old_icu1_mask = inb(IO_ICU1 + 1);
@@ -862,18 +884,50 @@
 	outb(IO_ICU1 + 1,
 	     (icu1_mask & ~npx0_imask) | (old_icu1_mask & npx0_imask));
 	outb(IO_ICU2 + 1,
 	     (icu2_mask & ~(npx0_imask >> 8))
 	     | (old_icu2_mask & (npx0_imask >> 8)));
 	idt[npx_intrno] = save_idt_npxintr;
 	enable_intr();		/* back to usual state */
 
 #endif /* SMP */
+}
+
+static void
+fpusave(addr)
+	union savefpu *addr;
+{
+#ifdef CPU_ENABLE_SSE
+	static struct savexmm svxmm __attribute__((aligned(16)));
+
+	if (cpu_fxsr) {
+		fxsave(&svxmm);
+		bcopy(&svxmm, addr, sizeof(struct savexmm));
+		return;
+	}
+#endif /* CPU_ENABLE_SSE */
+	fnsave(addr);
+}
+
+static void
+fpurstor(addr)
+	union savefpu *addr;
+{
+#ifdef CPU_ENABLE_SSE
+	static struct savexmm svxmm __attribute__((aligned(16)));
+
+	if (cpu_fxsr) {
+		bcopy(addr, &svxmm, sizeof (struct savexmm));
+		fxrstor(&svxmm);
+		return;
+	}
+#endif /* CPU_ENABLE_SSE */
+	frstor(addr);
 }
 
 #ifdef I586_CPU
 static long
 timezero(funcname, func)
 	const char *funcname;
 	void (*func) __P((void *buf, size_t len));
 
 {

>Release-Note:
>Audit-Trail:
>Unformatted:


To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-bugs" in the body of the message




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200101101235.f0ACZq000695>