Checking in Intel's contribution for SSE2 JPEG optimization, bug 125762, r=cathleen sr=scc

git-svn-id: svn://10.0.0.236/trunk@120252 18797224-902f-48f8-a5cc-f745e15eee43
2002-04-29 23:27:33 +00:00
parent 55d22709ee
commit adea6ff786
4 changed files with 649 additions and 6 deletions
--- a/mozilla/jpeg/jdapimin.c
+++ b/mozilla/jpeg/jdapimin.c
@@ -25,6 +25,12 @@ int MMXAvailable;
 int mmxsupport();
 #endif

+#ifdef HAVE_SSE2_INTEL_MNEMONICS
+/* SSE2 detection state: 0 until sse2support() has been run, then its
+ * return value (1 = SSE2 present, 2 = SSE2 absent). */
+int SSE2Available = 0;
+/* Queries CPUID for the SSE2 feature bit; defined later in this file. */
+int sse2support();
+#endif
+
+
 /*
 * Initialization of a JPEG decompression object.
 * The error manager must already be set up (in case memory manager fails).
@@ -41,6 +47,11 @@ jpeg_CreateDecompress (j_decompress_ptr cinfo, int version, size_t structsize)
  if(!cpuidDetected)
  {
 	MMXAvailable = mmxsupport();
+
+#ifdef HAVE_SSE2_INTEL_MNEMONICS
+	SSE2Available = sse2support();
+#endif
+
 	cpuidDetected = 1;
  }
 #endif
@@ -462,10 +473,26 @@ NOT_SUPPORTED:

 	return mmx_supported;		
 }
+#endif

+#ifdef HAVE_SSE2_INTEL_MNEMONICS

+/*
+ * Report whether the CPU advertises SSE2.
+ *
+ * Executes CPUID leaf 1 and tests bit 26 of the feature flags returned
+ * in EDX.  Returns 1 when the bit is set and 2 when it is clear; the
+ * result is never 0, so a caller can keep 0 to mean "not probed yet".
+ */
+int sse2support()
+{
+	int sse2available = 0;
+	int my_edx;
+	_asm
+	{
+		push ebx                ; CPUID overwrites EAX, EBX, ECX and EDX.
+		                        ; EBX is callee-saved and is not otherwise
+		                        ; named in this block, so keep it explicitly.
+		mov eax, 01             ; leaf 1: processor info and feature bits
+		cpuid
+		mov my_edx, edx         ; EDX carries the feature flags
+		pop ebx
+	}
+	if (my_edx & (0x1 << 26)) {	/* bit 26 of EDX = SSE2 */
+		sse2available = 1;
+	} else {
+		sse2available = 2;
+	}

-
+	return sse2available;
+}

 #endif

--- a/mozilla/jpeg/jddctmgr.c
+++ b/mozilla/jpeg/jddctmgr.c
@@ -19,7 +19,7 @@
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jdct.h"		/* Private declarations for DCT subsystem */
-
+extern int SSE2Available;

 /*
 * The decompressor input side (jdinput.c) saves away the appropriate
@@ -78,6 +78,14 @@ typedef union {
 #endif
 #endif

+/*
+ * SSE2 inverse DCT routine; start_pass installs it as the IDCT method
+ * when SSE2Available == 1.
+ * NOTE(review): this prototype is not wrapped in
+ * HAVE_SSE2_INTEL_MNEMONICS although every use of it in this file is -
+ * confirm that is intentional.
+ */
+GLOBAL(void)
+jpeg_idct_islow_sse2 (
+	j_decompress_ptr cinfo, 
+	jpeg_component_info * compptr,
+	JCOEFPTR coef_block,
+	JSAMPARRAY output_buf, 
+	JDIMENSION output_col);
+

 /*
 * Prepare for an output pass.
@@ -117,15 +125,43 @@ start_pass (j_decompress_ptr cinfo)
      switch (cinfo->dct_method) {
 #ifdef DCT_ISLOW_SUPPORTED
      case JDCT_ISLOW:
-	method_ptr = jpeg_idct_islow;
-	method = JDCT_ISLOW;
+#ifdef HAVE_SSE2_INTEL_MNEMONICS
+		if(SSE2Available == 1)
+		{
+			method_ptr = jpeg_idct_islow_sse2;
+			method = JDCT_ISLOW;
+		}
+		else
+		{
+			method_ptr = jpeg_idct_islow;
+			method = JDCT_ISLOW;
+		}
+#else
+		method_ptr = jpeg_idct_islow;
+		method = JDCT_ISLOW;
+		  
+#endif /* HAVE_SSE2_INTEL_MNEMONICS */
 	break;
 #endif
 #ifdef DCT_IFAST_SUPPORTED
      case JDCT_IFAST:
-	method_ptr = jpeg_idct_ifast;
-	method = JDCT_IFAST;
+#ifdef HAVE_SSE2_INTEL_MNEMONICS
+		if (SSE2Available==1) 
+		{
+			method_ptr = jpeg_idct_islow_sse2;
+			method = JDCT_ISLOW;
+		}
+		else
+		{
+			method_ptr = jpeg_idct_ifast;
+			method = JDCT_IFAST;
+		}
+#else
+		method_ptr = jpeg_idct_ifast;
+		method = JDCT_IFAST;
+#endif /* HAVE_SSE2_INTEL_MNEMONICS */
 	break;
+
 #endif
 #ifdef DCT_FLOAT_SUPPORTED
      case JDCT_FLOAT:
--- a/mozilla/jpeg/jidctint.c
+++ b/mozilla/jpeg/jidctint.c
@@ -386,4 +386,578 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
  }
 }

+
+#ifdef HAVE_SSE2_INTEL_MNEMONICS
+
+/*
+* Intel SSE2 optimized Inverse Discrete Cosine Transform
+*
+*
+* Copyright (c) 2001-2002 Intel Corporation
+* All Rights Reserved
+*
+*
+*  Authors:
+*      Danilov G.
+*
+*
+*-----------------------------------------------------------------------------
+*
+* References:
+*    K.R. Rao and P. Yip
+*       Discrete Cosine Transform.
+*       Algorithms, Advantages, Applications.
+*       Academic Press, Inc, London, 1990.
+*    JPEG Group's software.
+*       This implementation is based on Appendix A.2 of the book (R&Y) ...
+*
+*-----------------------------------------------------------------------------
+*/
+
+typedef unsigned char   Ipp8u;
+typedef unsigned short  Ipp16u;
+typedef unsigned int    Ipp32u;
+
+typedef signed char    Ipp8s;
+typedef signed short   Ipp16s;
+typedef signed int     Ipp32s;
+
+/*
+ * Fixed-point precision of the SSE2 inverse DCT.
+ * SHIFT_INV_ROW and SHIFT_INV_COL are pasted into __asm operands
+ * (psrad / psraw shift counts), so they are kept as bare constant
+ * expressions; do not use them inside larger C expressions.
+ */
+#define BITS_INV_ACC  4			
+#define SHIFT_INV_ROW  16 - BITS_INV_ACC
+#define SHIFT_INV_COL 1 + BITS_INV_ACC
+
+/* Rounding terms; parenthesised so they expand safely in any expression. */
+#define RND_INV_ROW  (1024 * (6 - BITS_INV_ACC))	/* 1 << (SHIFT_INV_ROW-1)		*/
+#define RND_INV_COL  (16 * (BITS_INV_ACC - 3))   /* 1 << (SHIFT_INV_COL-1)		*/
+#define RND_INV_CORR (RND_INV_COL - 1)          /* correction -1.0 and round	*/
+
+#define c_inv_corr_0 -1024 * (6 - BITS_INV_ACC) + 65536		/* -0.5 + (16.0 or 32.0)	*/
+#define c_inv_corr_1 1877 * (6 - BITS_INV_ACC)				/* 0.9167	*/	
+#define c_inv_corr_2 1236 * (6 - BITS_INV_ACC)				/* 0.6035	*/					
+#define c_inv_corr_3 680  * (6 - BITS_INV_ACC)				/* 0.3322	*/
+#define c_inv_corr_4 0    * (6 - BITS_INV_ACC)				/* 0.0		*/	
+#define c_inv_corr_5 -569  * (6 - BITS_INV_ACC)				/* -0.278	*/
+#define c_inv_corr_6 -512  * (6 - BITS_INV_ACC)				/* -0.25	*/	
+#define c_inv_corr_7 -651  * (6 - BITS_INV_ACC)				/* -0.3176	*/	
+
+#define RND_INV_ROW_0 RND_INV_ROW + c_inv_corr_0
+#define RND_INV_ROW_1 RND_INV_ROW + c_inv_corr_1
+#define RND_INV_ROW_2 RND_INV_ROW + c_inv_corr_2
+#define RND_INV_ROW_3 RND_INV_ROW + c_inv_corr_3
+#define RND_INV_ROW_4 RND_INV_ROW + c_inv_corr_4
+#define RND_INV_ROW_5 RND_INV_ROW + c_inv_corr_5
+#define RND_INV_ROW_6 RND_INV_ROW + c_inv_corr_6
+#define RND_INV_ROW_7 RND_INV_ROW + c_inv_corr_7
+
+/* Table for rows 0,4 - constants are multiplied on cos_4_16 */
+
+/* Must be 16-byte aligned like the other coefficient tables:
+ * DCT_8_INV_ROW_2R reads it with movdqa and as a pmaddwd memory operand,
+ * both of which fault on an unaligned address. */
+__declspec(align(16)) short tab_i_04[] = { 
+	16384, 21407, 16384, 8867,		
+	-16384, 21407, 16384, -8867,	
+	16384,  -8867,  16384, -21407,  
+    16384,   8867, -16384, -21407,  
+    22725,  19266,  19266,  -4520,  
+    4520,  19266,  19266, -22725,   
+    12873, -22725,   4520, -12873,  
+    12873,   4520, -22725, -12873}; 
+
+/* Table for rows 1,7 - constants are multiplied on cos_1_16 */
+
+__declspec(align(16)) short tab_i_17[] = {
+	22725,  29692,  22725,  12299,   
+    -22725,  29692,  22725, -12299,  
+    22725, -12299,  22725, -29692,   
+    22725,  12299, -22725, -29692,   
+    31521,  26722,  26722,  -6270,   
+    6270,  26722,  26722, -31521,    
+    17855, -31521,   6270, -17855,   
+    17855,   6270, -31521, -17855};  
+
+/* Table for rows 2,6 - constants are multiplied on cos_2_16 */
+
+__declspec(align(16)) short tab_i_26[] = {
+	21407,  27969,  21407,  11585,	
+    -21407,  27969,  21407, -11585,	
+    21407, -11585,  21407, -27969,	
+    21407,  11585, -21407, -27969,	
+    29692,  25172,  25172,  -5906,	
+    5906,  25172,  25172, -29692,	
+    16819, -29692,   5906, -16819,	
+    16819,   5906, -29692, -16819};	
+
+/* Table for rows 3,5 - constants are multiplied on cos_3_16 */
+
+__declspec(align(16)) short tab_i_35[] = {
+	19266,  25172,  19266,  10426,	
+    -19266,  25172,  19266, -10426,	
+    19266, -10426,  19266, -25172,	
+    19266,  10426, -19266, -25172,	
+    26722,  22654,  22654,  -5315,	
+    5315,  22654,  22654, -26722,	
+    15137, -26722,   5315, -15137,	
+    15137,   5315, -26722, -15137};	
+	
+__declspec(align(16)) long round_i_0[] = {RND_INV_ROW_0,RND_INV_ROW_0,
+	RND_INV_ROW_0,RND_INV_ROW_0};
+__declspec(align(16)) long round_i_1[] = {RND_INV_ROW_1,RND_INV_ROW_1,
+	RND_INV_ROW_1,RND_INV_ROW_1};
+__declspec(align(16)) long round_i_2[] = {RND_INV_ROW_2,RND_INV_ROW_2,
+	RND_INV_ROW_2,RND_INV_ROW_2};
+__declspec(align(16)) long round_i_3[] = {RND_INV_ROW_3,RND_INV_ROW_3,
+	RND_INV_ROW_3,RND_INV_ROW_3};
+__declspec(align(16)) long round_i_4[] = {RND_INV_ROW_4,RND_INV_ROW_4,
+	RND_INV_ROW_4,RND_INV_ROW_4};
+__declspec(align(16)) long round_i_5[] = {RND_INV_ROW_5,RND_INV_ROW_5,
+	RND_INV_ROW_5,RND_INV_ROW_5};
+__declspec(align(16)) long round_i_6[] = {RND_INV_ROW_6,RND_INV_ROW_6,
+	RND_INV_ROW_6,RND_INV_ROW_6};
+__declspec(align(16)) long round_i_7[] = {RND_INV_ROW_7,RND_INV_ROW_7,
+	RND_INV_ROW_7,RND_INV_ROW_7};
+
+/*
+ * Column-pass multipliers for pmulhw (which keeps the high 16 bits of the
+ * product), scaled by 1<<16.  tg_3_16 and cos_4_16 hold the value minus
+ * 1.0 so that it fits in a signed short; the column macro adds the
+ * multiplicand back after the multiply.
+ */
+__declspec(align(16)) short tg_1_16[] = {
+	13036,  13036,  13036,  13036,	/* tan(1*pi/16) * (1<<16) + 0.5 */
+	13036,  13036,  13036,  13036};
+__declspec(align(16)) short tg_2_16[] = {
+	27146,  27146,  27146,  27146,	/* tan(2*pi/16) * (1<<16) + 0.5 */
+	27146,  27146,  27146,  27146};
+__declspec(align(16)) short tg_3_16[] = {
+	-21746, -21746, -21746, -21746,	/* about (tan(3*pi/16) - 1) * (1<<16) */
+	-21746, -21746, -21746, -21746};
+__declspec(align(16)) short cos_4_16[] = {
+	-19195, -19195, -19195, -19195,	/* about (cos(4*pi/16) - 1) * (1<<16) */
+	-19195, -19195, -19195, -19195};
+
+/*
+* In this implementation the outputs of the iDCT-1D are multiplied
+*    for rows 0,4 - by cos_4_16,
+*    for rows 1,7 - by cos_1_16,
+*    for rows 2,6 - by cos_2_16,
+*    for rows 3,5 - by cos_3_16
+* and are shifted left to increase accuracy
+*
+* For used constants
+*    FIX(float_const) = (short) (float_const * (1<<15) + 0.5)
+*
+*-----------------------------------------------------------------------------
+*
+* On the first stage the calculation is executed at once for two rows.
+* The permutation for each output row is done on second stage
+*    t7 t6 t5 t4 t3 t2 t1 t0 -> t4 t5 t6 t7 t3 t2 t1 t0
+*
+*-----------------------------------------------------------------------------
+*/
+	
+/*
+ * DCT_8_INV_ROW_2R(TABLE, ROUND1, ROUND2)
+ *
+ * Row pass for two rows at once.
+ *   in:   xmm0 = first row, xmm4 = second row (the caller loads them)
+ *   out:  xmm1 = first row result, xmm5 = second row result
+ *         (packed to 16-bit with signed saturation)
+ *   TABLE          coefficient table, read at byte offsets 0, 16, 32, 48
+ *   ROUND1/ROUND2  four 32-bit rounding terms added to the first/second
+ *                  row's sums before the SHIFT_INV_ROW right shift
+ * All memory operands are accessed with aligned SSE2 instructions, and
+ * every register xmm0..xmm7 is overwritten.
+ */
+#define DCT_8_INV_ROW_2R(TABLE, ROUND1, ROUND2) __asm {	\
+	__asm pshuflw  xmm1, xmm0, 10001000b				\
+    __asm pshuflw  xmm0, xmm0, 11011101b    			\
+    __asm pshufhw  xmm1, xmm1, 10001000b    			\
+	__asm pshufhw  xmm0, xmm0, 11011101b				\
+	__asm movdqa   xmm2, XMMWORD PTR [TABLE]			\
+	__asm pmaddwd  xmm2, xmm1							\
+	__asm movdqa   xmm3, XMMWORD PTR [TABLE + 32]		\
+	__asm pmaddwd  xmm3, xmm0               			\
+	__asm pmaddwd  xmm1, XMMWORD PTR [TABLE + 16]		\
+	__asm pmaddwd  xmm0, XMMWORD PTR [TABLE + 48]		\
+	__asm pshuflw  xmm5, xmm4, 10001000b				\
+	__asm pshuflw  xmm4, xmm4, 11011101b    			\
+	__asm pshufhw  xmm5, xmm5, 10001000b    			\
+	__asm pshufhw  xmm4, xmm4, 11011101b    			\
+	__asm movdqa   xmm6, XMMWORD PTR [TABLE]			\
+	__asm pmaddwd  xmm6, xmm5               			\
+	__asm movdqa   xmm7, XMMWORD PTR [TABLE + 32]		\
+	__asm pmaddwd  xmm7, xmm4               			\
+	__asm pmaddwd  xmm5, XMMWORD PTR [TABLE + 16]		\
+	__asm pmaddwd  xmm4, XMMWORD PTR [TABLE + 48]		\
+	__asm pshufd   xmm1, xmm1, 01001110b    			\
+	__asm pshufd   xmm0, xmm0, 01001110b    			\
+	__asm paddd    xmm2, XMMWORD PTR [ROUND1]			\
+	__asm paddd    xmm3, xmm0							\
+	__asm paddd    xmm1, xmm2							\
+	__asm pshufd   xmm5, xmm5, 01001110b    			\
+	__asm pshufd   xmm4, xmm4, 01001110b    			\
+	__asm movdqa   xmm2, xmm1             				\
+	__asm psubd    xmm2, xmm3             				\
+	__asm psrad    xmm2, SHIFT_INV_ROW    				\
+	__asm paddd    xmm1, xmm3							\
+	__asm psrad    xmm1, SHIFT_INV_ROW      			\
+	__asm packssdw xmm1, xmm2							\
+	__asm paddd    xmm6, XMMWORD PTR [ROUND2]			\
+	__asm paddd    xmm7, xmm4							\
+	__asm paddd    xmm5, xmm6							\
+	__asm movdqa   xmm6, xmm5	            			\
+	__asm psubd    xmm6, xmm7               			\
+	__asm psrad    xmm6, SHIFT_INV_ROW      			\
+	__asm paddd    xmm5, xmm7							\
+	__asm psrad    xmm5, SHIFT_INV_ROW      			\
+	__asm packssdw xmm5, xmm6							\
+	}
+
+/*
+*
+* The second stage - inverse DCTs of columns
+*
+* The inputs are multiplied
+*    for rows 0,4 - by cos_4_16,
+*    for rows 1,7 - by cos_1_16,
+*    for rows 2,6 - by cos_2_16,
+*    for rows 3,5 - by cos_3_16
+* and are shifted left to increase accuracy
+*/
+
+/*
+ * DCT_8_INV_COL_8R(INP, OUTP)
+ *
+ * Column pass over a whole 8x8 block of 16-bit values.
+ *   INP   address expression of eight 16-byte rows (row k at INP + k*16)
+ *   OUTP  address expression of the eight 16-byte result rows
+ * Every row access uses movdqa or an SSE2 memory operand, so both
+ * addresses must be 16-byte aligned.  OUTP rows 7 and 3 are used as
+ * scratch storage part-way through (stored, reloaded, then overwritten
+ * with their final values); INP rows 3 and 7 are read before those
+ * scratch stores, and the caller in this file passes the same address
+ * for INP and OUTP.  Results are shifted right by SHIFT_INV_COL.
+ * Every register xmm0..xmm7 is overwritten.
+ */
+#define DCT_8_INV_COL_8R(INP, OUTP) __asm {		\
+	__asm movdqa   xmm0, [INP + 5*16]			\
+    __asm movdqa   xmm1, XMMWORD PTR tg_3_16	\
+    __asm movdqa   xmm2, xmm0            		\
+    __asm movdqa   xmm3, [INP + 3*16]   		\
+    __asm pmulhw   xmm0, xmm1           		\
+    __asm movdqa   xmm4, [INP + 7*16]   		\
+    __asm pmulhw   xmm1, xmm3           		\
+    __asm movdqa   xmm5, XMMWORD PTR tg_1_16   	\
+    __asm movdqa   xmm6, xmm4            		\
+    __asm pmulhw   xmm4, xmm5           		\
+    __asm paddsw   xmm0, xmm2           		\
+    __asm pmulhw   xmm5, [INP + 1*16]   		\
+    __asm paddsw   xmm1, xmm3           		\
+    __asm movdqa   xmm7, [INP + 6*16]    		\
+    __asm paddsw   xmm0, xmm3					\
+    __asm movdqa   xmm3, XMMWORD PTR tg_2_16	\
+    __asm psubsw   xmm2, xmm1					\
+    __asm pmulhw   xmm7, xmm3            		\
+    __asm movdqa   xmm1, xmm0            		\
+    __asm pmulhw   xmm3, [INP + 2*16]   		\
+    __asm psubsw   xmm5, xmm6					\
+    __asm paddsw   xmm4, [INP + 1*16]    		\
+    __asm paddsw   xmm0, xmm4            		\
+    __asm psubsw   xmm4, xmm1					\
+    __asm pshufhw  xmm0, xmm0, 00011011b		\
+    __asm paddsw   xmm7, [INP + 2*16]    		\
+    __asm movdqa   xmm6, xmm5					\
+    __asm psubsw   xmm3, [INP + 6*16]    		\
+    __asm psubsw   xmm5, xmm2            		\
+    __asm paddsw   xmm6, xmm2					\
+	__asm movdqa   [OUTP + 7*16], xmm0    		\
+    __asm movdqa   xmm1, xmm4            		\
+    __asm movdqa   xmm2, XMMWORD PTR cos_4_16  	\
+    __asm paddsw   xmm4, xmm5            		\
+    __asm movdqa   xmm0, XMMWORD PTR cos_4_16  	\
+    __asm pmulhw   xmm2, xmm4					\
+    __asm pshufhw  xmm6, xmm6, 00011011b		\
+    __asm movdqa   [OUTP + 3*16], xmm6    		\
+    __asm psubsw   xmm1, xmm5            		\
+    __asm movdqa   xmm6, [INP + 0*16]   		\
+    __asm pmulhw   xmm0, xmm1					\
+    __asm movdqa   xmm5, [INP + 4*16]    		\
+    __asm paddsw   xmm4, xmm2					\
+    __asm paddsw   xmm5, xmm6       			\
+    __asm psubsw   xmm6, [INP + 4*16]   		\
+    __asm paddsw   xmm0, xmm1					\
+    __asm pshufhw  xmm4, xmm4, 00011011b		\
+    __asm movdqa   xmm2, xmm5            		\
+    __asm paddsw   xmm5, xmm7            		\
+    __asm movdqa   xmm1, xmm6					\
+    __asm psubsw   xmm2, xmm7					\
+    __asm movdqa   xmm7, [OUTP + 7*16]    		\
+    __asm paddsw   xmm6, xmm3            		\
+    __asm pshufhw  xmm5, xmm5, 00011011b		\
+	__asm paddsw   xmm7, xmm5					\
+    __asm psubsw   xmm1, xmm3					\
+    __asm pshufhw  xmm6, xmm6, 00011011b		\
+	__asm movdqa   xmm3, xmm6					\
+    __asm paddsw   xmm6, xmm4            		\
+    __asm pshufhw  xmm2, xmm2, 00011011b		\
+    __asm psraw    xmm7, SHIFT_INV_COL   		\
+    __asm movdqa   [OUTP + 0*16], xmm7    		\
+    __asm movdqa   xmm7, xmm1            		\
+    __asm paddsw   xmm1, xmm0					\
+    __asm psraw    xmm6, SHIFT_INV_COL			\
+    __asm movdqa   [OUTP + 1*16], xmm6    		\
+    __asm pshufhw  xmm1, xmm1, 00011011b		\
+	__asm movdqa   xmm6, [OUTP + 3*16]			\
+    __asm psubsw   xmm7, xmm0            		\
+    __asm psraw    xmm1, SHIFT_INV_COL   		\
+    __asm movdqa   [OUTP + 2*16], xmm1    		\
+    __asm psubsw   xmm5, [OUTP + 7*16]			\
+    __asm paddsw   xmm6, xmm2            		\
+    __asm psubsw   xmm2, [OUTP + 3*16]			\
+    __asm psubsw   xmm3, xmm4            		\
+    __asm psraw    xmm7, SHIFT_INV_COL  		\
+    __asm pshufhw  xmm7, xmm7, 00011011b		\
+    __asm movdqa   [OUTP + 5*16], xmm7    		\
+    __asm psraw    xmm5, SHIFT_INV_COL			\
+    __asm movdqa   [OUTP + 7*16], xmm5    		\
+    __asm psraw    xmm6, SHIFT_INV_COL			\
+    __asm movdqa   [OUTP + 3*16], xmm6    		\
+    __asm psraw    xmm2, SHIFT_INV_COL			\
+    __asm movdqa   [OUTP + 4*16], xmm2    		\
+    __asm psraw    xmm3, SHIFT_INV_COL			\
+    __asm movdqa   [OUTP + 6*16], xmm3    		\
+	}
+
+/*
+*
+*  Name:      dct_8x8_inv_16s
+*  Purpose:   Inverse Discrete Cosine Transform 8x8 with
+*             2D buffer of short int data
+*  Context:
+*      void dct_8x8_inv_16s ( short *src, short *dst )
+*  Parameters:
+*      src  - Pointer to the source buffer
+*      dst  - Pointer to the destination buffer
+*
+*/
+
+/*
+ * 8x8 inverse DCT on 16-bit data: four row passes (two rows each) write
+ * their results to dst, then one column pass runs in place on dst.
+ * Both src and dst are accessed with movdqa, so each must be 16-byte
+ * aligned; the caller in this file passes the same buffer for both.
+ * Uses ecx, edx and xmm0..xmm7.
+ */
+GLOBAL(void)
+dct_8x8_inv_16s ( short *src, short *dst ) {
+	
+	__asm {
+
+		mov     ecx,  src
+		mov     edx,  dst
+
+		/* rows 0 and 4 */
+		movdqa  xmm0, [ecx+0*16]
+		movdqa  xmm4, [ecx+4*16]
+		DCT_8_INV_ROW_2R(tab_i_04, round_i_0, round_i_4)
+		movdqa     [edx+0*16], xmm1 
+		movdqa     [edx+4*16], xmm5 
+
+		/* rows 1 and 7 */
+		movdqa  xmm0, [ecx+1*16]
+		movdqa  xmm4, [ecx+7*16]
+		DCT_8_INV_ROW_2R(tab_i_17, round_i_1, round_i_7)
+		movdqa     [edx+1*16], xmm1 
+		movdqa     [edx+7*16], xmm5 
+
+		/* rows 3 and 5 */
+		movdqa  xmm0, [ecx+3*16]
+		movdqa  xmm4, [ecx+5*16]
+		DCT_8_INV_ROW_2R(tab_i_35, round_i_3, round_i_5);
+		movdqa     [edx+3*16], xmm1 
+		movdqa     [edx+5*16], xmm5 
+
+		/* rows 2 and 6 */
+		movdqa  xmm0, [ecx+2*16]
+		movdqa  xmm4, [ecx+6*16]
+		DCT_8_INV_ROW_2R(tab_i_26, round_i_2, round_i_6);
+		movdqa     [edx+2*16], xmm1
+		movdqa     [edx+6*16], xmm5    
+
+		/* column pass, in place on the row-pass output */
+		DCT_8_INV_COL_8R(edx+0, edx+0);
+	}
+}
+
+
+/*
+*  Name:
+*    ownpj_QuantInv_8x8_16s
+*
+*  Purpose:
+*    Dequantize 8x8 block of DCT coefficients
+*
+*  Context:
+*    void ownpj_QuantInv_8x8_16s
+*            Ipp16s*  pSrc,
+*            Ipp16s*  pDst,
+*      const Ipp16u*  pQTbl)*
+*
+*/
+
+/*
+ * Dequantize one 8x8 block: pDst[i] = low 16 bits of pSrc[i] * pQTbl[i]
+ * for the 64 16-bit entries.  Works in four iterations of 16 values
+ * (32 bytes) using MMX registers mm0..mm3, and executes emms before
+ * returning.  The movq accesses impose no alignment requirement.
+ */
+GLOBAL(void)
+ownpj_QuantInv_8x8_16s(short * pSrc, short * pDst, const unsigned short * pQTbl)
+{
+	__asm {
+
+		push        ebx
+		push        ecx
+		push        edx
+		push        esi
+		push        edi
+
+		mov         esi, pSrc
+		mov         edi, pDst
+		mov         edx, pQTbl
+		mov         ecx, 4              ; iteration count
+		mov         ebx, 32             ; bytes handled per iteration
+
+	again:
+
+		movq        mm0, QWORD PTR [esi+0]
+		movq        mm1, QWORD PTR [esi+8]
+		movq        mm2, QWORD PTR [esi+16]
+		movq        mm3, QWORD PTR [esi+24]
+
+		; on the last iteration this address is just past the source block
+		prefetcht0  [esi+ebx] ; fetch next cache line
+
+		; pmullw keeps the low 16 bits of each 16x16 product
+		pmullw      mm0, QWORD PTR [edx+0]
+		pmullw      mm1, QWORD PTR [edx+8]
+		pmullw      mm2, QWORD PTR [edx+16]
+		pmullw      mm3, QWORD PTR [edx+24]
+
+		movq        QWORD PTR [edi+0], mm0
+		movq        QWORD PTR [edi+8], mm1
+		movq        QWORD PTR [edi+16], mm2
+		movq        QWORD PTR [edi+24], mm3
+
+		add         esi, ebx
+		add         edi, ebx
+		add         edx, ebx
+		dec         ecx
+		jnz         again
+
+		emms
+
+		pop         edi
+		pop         esi
+		pop         edx
+		pop         ecx
+		pop         ebx
+	}
+}
+
+
+/*
+*  Name:
+*    ownpj_Add128_8x8_16s8u
+*
+*  Purpose:
+*    signed to unsigned conversion (level shift)
+*    for 8x8 block of DCT coefficients
+*
+*  Context:
+*    void ownpj_Add128_8x8_16s8u
+*      const Ipp16s* pSrc,
+*            Ipp8u*  pDst,
+*            int     DstStep);
+*
+*/
+
+__declspec(align(16)) long const_128[]= {0x00800080, 0x00800080, 0x00800080, 0x00800080};
+
+/*
+ * Level-shift and narrow one 8x8 block: adds 128 to each of the 64
+ * 16-bit values at pSrc, packs them to bytes with unsigned saturation,
+ * and stores 8 bytes per row at pDst + row * DstStep.
+ * pSrc is read with movdqa and therefore must be 16-byte aligned; the
+ * movq stores place no alignment requirement on pDst.
+ * Runs two iterations of four rows each.
+ */
+GLOBAL(void)
+ownpj_Add128_8x8_16s8u(const short * pSrc, unsigned char * pDst, int DstStep)
+{
+	__asm {
+		push        eax
+		push        ebx
+		push        ecx
+		push        edx
+		push        esi
+		push        edi
+
+		mov         esi, pSrc
+		mov         edi, pDst
+		mov         edx, DstStep
+		mov         ecx, 2              ; iteration count
+		mov         ebx, edx
+		mov         eax, edx
+		sal         ebx, 1              ; ebx = 2*DstStep
+		add         eax, ebx            ; eax = 3*DstStep
+		movdqa      xmm7, XMMWORD PTR const_128
+
+	again:
+
+		movdqa      xmm0, XMMWORD PTR [esi+0]  ; line 0
+		movdqa      xmm1, XMMWORD PTR [esi+16] ; line 1
+		movdqa      xmm2, XMMWORD PTR [esi+32] ; line 2
+		movdqa      xmm3, XMMWORD PTR [esi+48] ; line 3
+
+		paddw     xmm0, xmm7
+		paddw     xmm1, xmm7
+		paddw     xmm2, xmm7
+		paddw     xmm3, xmm7
+
+		; after packing: xmm0 = lines 0|1, xmm2 = lines 2|3 (8 bytes each)
+		packuswb  xmm0, xmm1
+		packuswb  xmm2, xmm3
+
+		movq      QWORD PTR [edi], xmm0      ;0*DstStep
+		movq      QWORD PTR [edi+ebx], xmm2  ;2*DstStep
+
+		; bring the upper 8 bytes (lines 1 and 3) down to the low half
+		psrldq      xmm0, 8
+		psrldq      xmm2, 8
+
+		movq      QWORD PTR [edi+edx], xmm0  ;1*DstStep
+		movq      QWORD PTR [edi+eax], xmm2  ;3*DstStep
+
+		; advance the destination by 4*DstStep and the source by 4 rows
+		add         edi, ebx
+		add         esi, 64
+		add         edi, ebx
+		dec         ecx
+		jnz         again
+
+		pop         edi
+		pop         esi
+		pop         edx
+		pop         ecx
+		pop         ebx
+		pop         eax
+	}
+}
+
+
+/* 
+*  Name:
+*    ippiDCTQuantInv8x8LS_JPEG_16s8u_C1R
+*
+*  Purpose:
+*    Inverse DCT transform, de-quantization and level shift
+*
+*  Parameters:
+*    pSrc               - pointer to source
+*    pDst               - pointer to output array
+*    DstStep            - line offset for output data
+*    pEncoderQuantTable - pointer to Quantization table
+*
+*/
+
+/*
+ * Full decode of one block: dequantize pSrc with pQuantInvTable into a
+ * 16-byte-aligned scratch block, run the inverse DCT on it in place,
+ * then level-shift and narrow the result into pDst using a row pitch of
+ * DstStep bytes.
+ */
+GLOBAL(void)
+ippiDCTQuantInv8x8LS_JPEG_16s8u_C1R(
+  short * pSrc,
+  unsigned char *  pDst,
+  int     DstStep,
+  const unsigned short * pQuantInvTable)
+{
+	/* DCTSIZE2 16-bit coefficients, aligned for the movdqa accesses */
+	__declspec(align(16)) Ipp16s coefs[DCTSIZE2];
+
+	ownpj_QuantInv_8x8_16s(pSrc, coefs, pQuantInvTable);
+	dct_8x8_inv_16s(coefs, coefs);
+	ownpj_Add128_8x8_16s8u(coefs, pDst, DstStep);
+}
+
+/*
+ * IDCT method entry point backed by the SSE2 code above.
+ *
+ * Decodes coef_block into a local 8x8 byte block (row pitch 8) using
+ * compptr->dct_table as the dequantization table, then copies each row
+ * to output_buf[row] starting at column output_col.  cinfo is unused.
+ * NOTE(review): dct_table is reinterpreted as an array of 16-bit
+ * multipliers - confirm the table is built with that element size.
+ */
+GLOBAL(void)
+jpeg_idct_islow_sse2 (
+	j_decompress_ptr cinfo, 
+	jpeg_component_info * compptr,
+	JCOEFPTR coef_block,
+	JSAMPARRAY output_buf, 
+	JDIMENSION output_col)
+{
+	__declspec(align(16)) Ipp8u pixels[DCTSIZE2];
+	Ipp8u*		src_row = pixels;
+	int			row;
+	int			col;
+
+	ippiDCTQuantInv8x8LS_JPEG_16s8u_C1R(coef_block, pixels, 8,
+		(Ipp16u*)compptr->dct_table);
+
+	for (row = 0; row < DCTSIZE; row++, src_row += DCTSIZE)
+	{
+		JSAMPROW dst_row = output_buf[row] + output_col;
+
+		/* eight samples per row, as produced above */
+		for (col = 0; col < 8; col++)
+			dst_row[col] = src_row[col];
+	}
+}
+#endif /* HAVE_SSE2_INTEL_MNEMONICS */
+
 #endif /* DCT_ISLOW_SUPPORTED */
--- a/mozilla/jpeg/jmorecfg.h
+++ b/mozilla/jpeg/jmorecfg.h
@@ -111,6 +111,12 @@ typedef short JCOEF;
 #define HAVE_MMX_INTEL_MNEMONICS 
 #endif

+/* Defines for SSE2 support. */
+/* NOTE(review): defined() only tests preprocessor macros.  If __m128i is
+ * provided purely as a type name (not a #define), this condition is
+ * never true and all SSE2 code is compiled out - confirm how __m128i is
+ * supplied in the intended build. */
+#if defined(XP_WIN32) && defined(_M_IX86) && defined(__m128i)
+#define HAVE_SSE2_INTEL_MNEMONICS 
+#endif
+
+
 /* Compressed datastreams are represented as arrays of JOCTET.
 * These must be EXACTLY 8 bits wide, at least once they are written to
 * external storage.  Note that when using the stdio data source/destination