From adea6ff7861ba19128ec1248cf219a58002d73a8 Mon Sep 17 00:00:00 2001
From: "cathleen%netscape.com"
Date: Mon, 29 Apr 2002 23:27:33 +0000
Subject: [PATCH] Checking in Intel's contribution for SSE2 JPEG optimization, bug 125762, r=cathleen sr=scc

git-svn-id: svn://10.0.0.236/trunk@120252 18797224-902f-48f8-a5cc-f745e15eee43
---
 mozilla/jpeg/jdapimin.c |  29 +-
 mozilla/jpeg/jddctmgr.c |  46 +++-
 mozilla/jpeg/jidctint.c | 574 ++++++++++++++++++++++++++++++++++++++++
 mozilla/jpeg/jmorecfg.h |   6 +
 4 files changed, 649 insertions(+), 6 deletions(-)

diff --git a/mozilla/jpeg/jdapimin.c b/mozilla/jpeg/jdapimin.c
index ab57dc688fe..3f6b1cce600 100644
--- a/mozilla/jpeg/jdapimin.c
+++ b/mozilla/jpeg/jdapimin.c
@@ -25,6 +25,12 @@
 int MMXAvailable;
 int mmxsupport();
 #endif
+#ifdef HAVE_SSE2_INTEL_MNEMONICS
+int SSE2Available = 0;
+int sse2support();
+#endif
+
+
 /*
  * Initialization of a JPEG decompression object.
  * The error manager must already be set up (in case memory manager fails).
@@ -41,6 +47,11 @@ jpeg_CreateDecompress (j_decompress_ptr cinfo, int version, size_t structsize)
   if(!cpuidDetected) {
     MMXAvailable = mmxsupport();
+
+#ifdef HAVE_SSE2_INTEL_MNEMONICS
+    SSE2Available = sse2support();
+#endif
+
     cpuidDetected = 1;
   }
 #endif
@@ -462,10 +473,26 @@ NOT_SUPPORTED:
   return mmx_supported;
 }
+#endif
+#ifdef HAVE_SSE2_INTEL_MNEMONICS
+int sse2support()
+{
+    int sse2available = 0;
+    int my_edx;
+    _asm
+    {
+        mov eax, 01
+        cpuid
+        mov my_edx, edx
+    }
+    if (my_edx & (0x1 << 26))
+        sse2available = 1;
+    else sse2available = 2;
-
+    return sse2available;
+}
 #endif
diff --git a/mozilla/jpeg/jddctmgr.c b/mozilla/jpeg/jddctmgr.c
index bbf8d0e92fd..afdb180af13 100644
--- a/mozilla/jpeg/jddctmgr.c
+++ b/mozilla/jpeg/jddctmgr.c
@@ -19,7 +19,7 @@
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jdct.h"    /* Private declarations for DCT subsystem */
-
+extern int SSE2Available;
 
 /*
  * The decompressor input side (jdinput.c) saves away the appropriate
@@ -78,6 +78,14 @@ typedef union {
 #endif
 #endif
 
+GLOBAL(void)
+jpeg_idct_islow_sse2 (
+    j_decompress_ptr cinfo,
+    jpeg_component_info * compptr,
+    JCOEFPTR coef_block,
+    JSAMPARRAY output_buf,
+    JDIMENSION output_col);
+
 /*
  * Prepare for an output pass.
@@ -117,15 +125,43 @@ start_pass (j_decompress_ptr cinfo)
     switch (cinfo->dct_method) {
 #ifdef DCT_ISLOW_SUPPORTED
     case JDCT_ISLOW:
-      method_ptr = jpeg_idct_islow;
-      method = JDCT_ISLOW;
+#ifdef HAVE_SSE2_INTEL_MNEMONICS
+      if(SSE2Available == 1)
+      {
+        method_ptr = jpeg_idct_islow_sse2;
+        method = JDCT_ISLOW;
+      }
+      else
+      {
+        method_ptr = jpeg_idct_islow;
+        method = JDCT_ISLOW;
+      }
+#else
+      method_ptr = jpeg_idct_islow;
+      method = JDCT_ISLOW;
+
+#endif /* HAVE_SSE2_INTEL_MNEMONICS */
       break;
 #endif
 #ifdef DCT_IFAST_SUPPORTED
     case JDCT_IFAST:
-      method_ptr = jpeg_idct_ifast;
-      method = JDCT_IFAST;
+#ifdef HAVE_SSE2_INTEL_MNEMONICS
+      if (SSE2Available==1)
+      {
+        method_ptr = jpeg_idct_islow_sse2;
+        method = JDCT_ISLOW;
+      }
+      else
+      {
+        method_ptr = jpeg_idct_ifast;
+        method = JDCT_IFAST;
+      }
+#else
+      method_ptr = jpeg_idct_ifast;
+      method = JDCT_IFAST;
+#endif /* HAVE_SSE2_INTEL_MNEMONICS */
       break;
+
 #endif
 #ifdef DCT_FLOAT_SUPPORTED
     case JDCT_FLOAT:
diff --git a/mozilla/jpeg/jidctint.c b/mozilla/jpeg/jidctint.c
index a72b3207caf..0e5cf8833b6 100644
--- a/mozilla/jpeg/jidctint.c
+++ b/mozilla/jpeg/jidctint.c
@@ -386,4 +386,578 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   }
 }
 
+
+#ifdef HAVE_SSE2_INTEL_MNEMONICS
+
+/*
+* Intel SSE2 optimized Inverse Discrete Cosine Transform
+*
+*
+* Copyright (c) 2001-2002 Intel Corporation
+* All Rights Reserved
+*
+*
+* Authors:
+*   Danilov G.
+*
+*
+*-----------------------------------------------------------------------------
+*
+* References:
+*   K.R. Rao and P. Yip
+*   Discrete Cosine Transform.
+*   Algorithms, Advantages, Applications.
+*   Academic Press, Inc, London, 1990.
+*   JPEG Group's software.
+*   This implementation is based on Appendix A.2 of the book (R&Y) ...
+*
+*-----------------------------------------------------------------------------
+*/
+
+typedef unsigned char Ipp8u;
+typedef unsigned short Ipp16u;
+typedef unsigned int Ipp32u;
+
+typedef signed char Ipp8s;
+typedef signed short Ipp16s;
+typedef signed int Ipp32s;
+
+#define BITS_INV_ACC 4
+#define SHIFT_INV_ROW 16 - BITS_INV_ACC
+#define SHIFT_INV_COL 1 + BITS_INV_ACC
+
+#define RND_INV_ROW 1024 * (6 - BITS_INV_ACC)  /* 1 << (SHIFT_INV_ROW-1) */
+#define RND_INV_COL 16 * (BITS_INV_ACC - 3)    /* 1 << (SHIFT_INV_COL-1) */
+#define RND_INV_CORR RND_INV_COL - 1           /* correction -1.0 and round */
+
+#define c_inv_corr_0 -1024 * (6 - BITS_INV_ACC) + 65536  /* -0.5 + (16.0 or 32.0) */
+#define c_inv_corr_1 1877 * (6 - BITS_INV_ACC)           /* 0.9167 */
+#define c_inv_corr_2 1236 * (6 - BITS_INV_ACC)           /* 0.6035 */
+#define c_inv_corr_3 680 * (6 - BITS_INV_ACC)            /* 0.3322 */
+#define c_inv_corr_4 0 * (6 - BITS_INV_ACC)              /* 0.0 */
+#define c_inv_corr_5 -569 * (6 - BITS_INV_ACC)           /* -0.278 */
+#define c_inv_corr_6 -512 * (6 - BITS_INV_ACC)           /* -0.25 */
+#define c_inv_corr_7 -651 * (6 - BITS_INV_ACC)           /* -0.3176 */
+
+#define RND_INV_ROW_0 RND_INV_ROW + c_inv_corr_0
+#define RND_INV_ROW_1 RND_INV_ROW + c_inv_corr_1
+#define RND_INV_ROW_2 RND_INV_ROW + c_inv_corr_2
+#define RND_INV_ROW_3 RND_INV_ROW + c_inv_corr_3
+#define RND_INV_ROW_4 RND_INV_ROW + c_inv_corr_4
+#define RND_INV_ROW_5 RND_INV_ROW + c_inv_corr_5
+#define RND_INV_ROW_6 RND_INV_ROW + c_inv_corr_6
+#define RND_INV_ROW_7 RND_INV_ROW + c_inv_corr_7
+
+/* Table for rows 0,4 - constants are multiplied on cos_4_16 */
+
+__declspec(align(16)) short tab_i_04[] = {
+    16384, 21407, 16384, 8867,
+    -16384, 21407, 16384, -8867,
+    16384, -8867, 16384, -21407,
+    16384, 8867, -16384, -21407,
+    22725, 19266, 19266, -4520,
+    4520, 19266, 19266, -22725,
+    12873, -22725, 4520, -12873,
+    12873, 4520, -22725, -12873};
+
+/* Table for rows 1,7 - constants are multiplied on cos_1_16 */
+
+__declspec(align(16)) short tab_i_17[] = {
+    22725, 29692, 22725, 12299,
+    -22725, 29692, 22725, -12299,
+    22725, -12299, 22725, -29692,
+    22725, 12299, -22725, -29692,
+    31521, 26722, 26722, -6270,
+    6270, 26722, 26722, -31521,
+    17855, -31521, 6270, -17855,
+    17855, 6270, -31521, -17855};
+
+/* Table for rows 2,6 - constants are multiplied on cos_2_16 */
+
+__declspec(align(16)) short tab_i_26[] = {
+    21407, 27969, 21407, 11585,
+    -21407, 27969, 21407, -11585,
+    21407, -11585, 21407, -27969,
+    21407, 11585, -21407, -27969,
+    29692, 25172, 25172, -5906,
+    5906, 25172, 25172, -29692,
+    16819, -29692, 5906, -16819,
+    16819, 5906, -29692, -16819};
+
+/* Table for rows 3,5 - constants are multiplied on cos_3_16 */
+
+__declspec(align(16)) short tab_i_35[] = {
+    19266, 25172, 19266, 10426,
+    -19266, 25172, 19266, -10426,
+    19266, -10426, 19266, -25172,
+    19266, 10426, -19266, -25172,
+    26722, 22654, 22654, -5315,
+    5315, 22654, 22654, -26722,
+    15137, -26722, 5315, -15137,
+    15137, 5315, -26722, -15137};
+
+__declspec(align(16)) long round_i_0[] = {RND_INV_ROW_0,RND_INV_ROW_0,
+    RND_INV_ROW_0,RND_INV_ROW_0};
+__declspec(align(16)) long round_i_1[] = {RND_INV_ROW_1,RND_INV_ROW_1,
+    RND_INV_ROW_1,RND_INV_ROW_1};
+__declspec(align(16)) long round_i_2[] = {RND_INV_ROW_2,RND_INV_ROW_2,
+    RND_INV_ROW_2,RND_INV_ROW_2};
+__declspec(align(16)) long round_i_3[] = {RND_INV_ROW_3,RND_INV_ROW_3,
+    RND_INV_ROW_3,RND_INV_ROW_3};
+__declspec(align(16)) long round_i_4[] = {RND_INV_ROW_4,RND_INV_ROW_4,
+    RND_INV_ROW_4,RND_INV_ROW_4};
+__declspec(align(16)) long round_i_5[] = {RND_INV_ROW_5,RND_INV_ROW_5,
+    RND_INV_ROW_5,RND_INV_ROW_5};
+__declspec(align(16)) long round_i_6[] = {RND_INV_ROW_6,RND_INV_ROW_6,
+    RND_INV_ROW_6,RND_INV_ROW_6};
+__declspec(align(16)) long round_i_7[] = {RND_INV_ROW_7,RND_INV_ROW_7,
+    RND_INV_ROW_7,RND_INV_ROW_7};
+
+__declspec(align(16)) short tg_1_16[] = {
+    13036, 13036, 13036, 13036,      /* tg * (2<<16) + 0.5 */
+    13036, 13036, 13036, 13036};
+__declspec(align(16)) short tg_2_16[] = {
+    27146, 27146, 27146, 27146,      /* tg * (2<<16) + 0.5 */
+    27146, 27146, 27146, 27146};
+__declspec(align(16)) short tg_3_16[] = {
+    -21746, -21746, -21746, -21746,  /* tg * (2<<16) + 0.5 */
+    -21746, -21746, -21746, -21746};
+__declspec(align(16)) short cos_4_16[] = {
+    -19195, -19195, -19195, -19195,  /* cos * (2<<16) + 0.5 */
+    -19195, -19195, -19195, -19195};
+
+/*
+* In this implementation the outputs of the iDCT-1D are multiplied
+*    for rows 0,4 - on cos_4_16,
+*    for rows 1,7 - on cos_1_16,
+*    for rows 2,6 - on cos_2_16,
+*    for rows 3,5 - on cos_3_16
+* and are shifted to the left for rise of accuracy
+*
+* For used constants
+*    FIX(float_const) = (short) (float_const * (1<<15) + 0.5)
+*
+*-----------------------------------------------------------------------------
+*
+* On the first stage the calculation is executed at once for two rows.
+* The permutation for each output row is done on second stage
+*    t7 t6 t5 t4 t3 t2 t1 t0 -> t4 t5 t6 t7 t3 t2 t1 t0
+*
+*-----------------------------------------------------------------------------
+*/
+
+#define DCT_8_INV_ROW_2R(TABLE, ROUND1, ROUND2) __asm { \
+    __asm pshuflw xmm1, xmm0, 10001000b \
+    __asm pshuflw xmm0, xmm0, 11011101b \
+    __asm pshufhw xmm1, xmm1, 10001000b \
+    __asm pshufhw xmm0, xmm0, 11011101b \
+    __asm movdqa xmm2, XMMWORD PTR [TABLE] \
+    __asm pmaddwd xmm2, xmm1 \
+    __asm movdqa xmm3, XMMWORD PTR [TABLE + 32] \
+    __asm pmaddwd xmm3, xmm0 \
+    __asm pmaddwd xmm1, XMMWORD PTR [TABLE + 16] \
+    __asm pmaddwd xmm0, XMMWORD PTR [TABLE + 48] \
+    __asm pshuflw xmm5, xmm4, 10001000b \
+    __asm pshuflw xmm4, xmm4, 11011101b \
+    __asm pshufhw xmm5, xmm5, 10001000b \
+    __asm pshufhw xmm4, xmm4, 11011101b \
+    __asm movdqa xmm6, XMMWORD PTR [TABLE] \
+    __asm pmaddwd xmm6, xmm5 \
+    __asm movdqa xmm7, XMMWORD PTR [TABLE + 32] \
+    __asm pmaddwd xmm7, xmm4 \
+    __asm pmaddwd xmm5, XMMWORD PTR [TABLE + 16] \
+    __asm pmaddwd xmm4, XMMWORD PTR [TABLE + 48] \
+    __asm pshufd xmm1, xmm1, 01001110b \
+    __asm pshufd xmm0, xmm0, 01001110b \
+    __asm paddd xmm2, XMMWORD PTR [ROUND1] \
+    __asm paddd xmm3, xmm0 \
+    __asm paddd xmm1, xmm2 \
+    __asm pshufd xmm5, xmm5, 01001110b \
+    __asm pshufd xmm4, xmm4, 01001110b \
+    __asm movdqa xmm2, xmm1 \
+    __asm psubd xmm2, xmm3 \
+    __asm psrad xmm2, SHIFT_INV_ROW \
+    __asm paddd xmm1, xmm3 \
+    __asm psrad xmm1, SHIFT_INV_ROW \
+    __asm packssdw xmm1, xmm2 \
+    __asm paddd xmm6, XMMWORD PTR [ROUND2] \
+    __asm paddd xmm7, xmm4 \
+    __asm paddd xmm5, xmm6 \
+    __asm movdqa xmm6, xmm5 \
+    __asm psubd xmm6, xmm7 \
+    __asm psrad xmm6, SHIFT_INV_ROW \
+    __asm paddd xmm5, xmm7 \
+    __asm psrad xmm5, SHIFT_INV_ROW \
+    __asm packssdw xmm5, xmm6 \
+    }
+
+/*
+*
+* The second stage - inverse DCTs of columns
+*
+* The inputs are multiplied
+*    for rows 0,4 - on cos_4_16,
+*    for rows 1,7 - on cos_1_16,
+*    for rows 2,6 - on cos_2_16,
+*    for rows 3,5 - on cos_3_16
+* and are shifted to the left for rise of accuracy
+*/
+
+#define DCT_8_INV_COL_8R(INP, OUTP) __asm { \
+    __asm movdqa xmm0, [INP + 5*16] \
+    __asm movdqa xmm1, XMMWORD PTR tg_3_16 \
+    __asm movdqa xmm2, xmm0 \
+    __asm movdqa xmm3, [INP + 3*16] \
+    __asm pmulhw xmm0, xmm1 \
+    __asm movdqa xmm4, [INP + 7*16] \
+    __asm pmulhw xmm1, xmm3 \
+    __asm movdqa xmm5, XMMWORD PTR tg_1_16 \
+    __asm movdqa xmm6, xmm4 \
+    __asm pmulhw xmm4, xmm5 \
+    __asm paddsw xmm0, xmm2 \
+    __asm pmulhw xmm5, [INP + 1*16] \
+    __asm paddsw xmm1, xmm3 \
+    __asm movdqa xmm7, [INP + 6*16] \
+    __asm paddsw xmm0, xmm3 \
+    __asm movdqa xmm3, XMMWORD PTR tg_2_16 \
+    __asm psubsw xmm2, xmm1 \
+    __asm pmulhw xmm7, xmm3 \
+    __asm movdqa xmm1, xmm0 \
+    __asm pmulhw xmm3, [INP + 2*16] \
+    __asm psubsw xmm5, xmm6 \
+    __asm paddsw xmm4, [INP + 1*16] \
+    __asm paddsw xmm0, xmm4 \
+    __asm psubsw xmm4, xmm1 \
+    __asm pshufhw xmm0, xmm0, 00011011b \
+    __asm paddsw xmm7, [INP + 2*16] \
+    __asm movdqa xmm6, xmm5 \
+    __asm psubsw xmm3, [INP + 6*16] \
+    __asm psubsw xmm5, xmm2 \
+    __asm paddsw xmm6, xmm2 \
+    __asm movdqa [OUTP + 7*16], xmm0 \
+    __asm movdqa xmm1, xmm4 \
+    __asm movdqa xmm2, XMMWORD PTR cos_4_16 \
+    __asm paddsw xmm4, xmm5 \
+    __asm movdqa xmm0, XMMWORD PTR cos_4_16 \
+    __asm pmulhw xmm2, xmm4 \
+    __asm pshufhw xmm6, xmm6, 00011011b \
+    __asm movdqa [OUTP + 3*16], xmm6 \
+    __asm psubsw xmm1, xmm5 \
+    __asm movdqa xmm6, [INP + 0*16] \
+    __asm pmulhw xmm0, xmm1 \
+    __asm movdqa xmm5, [INP + 4*16] \
+    __asm paddsw xmm4, xmm2 \
+    __asm paddsw xmm5, xmm6 \
+    __asm psubsw xmm6, [INP + 4*16] \
+    __asm paddsw xmm0, xmm1 \
+    __asm pshufhw xmm4, xmm4, 00011011b \
+    __asm movdqa xmm2, xmm5 \
+    __asm paddsw xmm5, xmm7 \
+    __asm movdqa xmm1, xmm6 \
+    __asm psubsw xmm2, xmm7 \
+    __asm movdqa xmm7, [OUTP + 7*16] \
+    __asm paddsw xmm6, xmm3 \
+    __asm pshufhw xmm5, xmm5, 00011011b \
+    __asm paddsw xmm7, xmm5 \
+    __asm psubsw xmm1, xmm3 \
+    __asm pshufhw xmm6, xmm6, 00011011b \
+    __asm movdqa xmm3, xmm6 \
+    __asm paddsw xmm6, xmm4 \
+    __asm pshufhw xmm2, xmm2, 00011011b \
+    __asm psraw xmm7, SHIFT_INV_COL \
+    __asm movdqa [OUTP + 0*16], xmm7 \
+    __asm movdqa xmm7, xmm1 \
+    __asm paddsw xmm1, xmm0 \
+    __asm psraw xmm6, SHIFT_INV_COL \
+    __asm movdqa [OUTP + 1*16], xmm6 \
+    __asm pshufhw xmm1, xmm1, 00011011b \
+    __asm movdqa xmm6, [OUTP + 3*16] \
+    __asm psubsw xmm7, xmm0 \
+    __asm psraw xmm1, SHIFT_INV_COL \
+    __asm movdqa [OUTP + 2*16], xmm1 \
+    __asm psubsw xmm5, [OUTP + 7*16] \
+    __asm paddsw xmm6, xmm2 \
+    __asm psubsw xmm2, [OUTP + 3*16] \
+    __asm psubsw xmm3, xmm4 \
+    __asm psraw xmm7, SHIFT_INV_COL \
+    __asm pshufhw xmm7, xmm7, 00011011b \
+    __asm movdqa [OUTP + 5*16], xmm7 \
+    __asm psraw xmm5, SHIFT_INV_COL \
+    __asm movdqa [OUTP + 7*16], xmm5 \
+    __asm psraw xmm6, SHIFT_INV_COL \
+    __asm movdqa [OUTP + 3*16], xmm6 \
+    __asm psraw xmm2, SHIFT_INV_COL \
+    __asm movdqa [OUTP + 4*16], xmm2 \
+    __asm psraw xmm3, SHIFT_INV_COL \
+    __asm movdqa [OUTP + 6*16], xmm3 \
+    }
+
+/*
+*
+* Name:      dct_8x8_inv_16s
+* Purpose:   Inverse Discrete Cosine Transform 8x8 with
+*            2D buffer of short int data
+* Context:
+*            void dct_8x8_inv_16s ( short *src, short *dst )
+* Parameters:
+*            src - Pointer to the source buffer
+*            dst - Pointer to the destination buffer
+*
+*/
+
+GLOBAL(void)
+dct_8x8_inv_16s ( short *src, short *dst ) {
+
+    __asm {
+
+    mov ecx, src
+    mov edx, dst
+
+    movdqa xmm0, [ecx+0*16]
+    movdqa xmm4, [ecx+4*16]
+    DCT_8_INV_ROW_2R(tab_i_04, round_i_0, round_i_4)
+    movdqa [edx+0*16], xmm1
+    movdqa [edx+4*16], xmm5
+
+    movdqa xmm0, [ecx+1*16]
+    movdqa xmm4, [ecx+7*16]
+    DCT_8_INV_ROW_2R(tab_i_17, round_i_1, round_i_7)
+    movdqa [edx+1*16], xmm1
+    movdqa [edx+7*16], xmm5
+
+    movdqa xmm0, [ecx+3*16]
+    movdqa xmm4, [ecx+5*16]
+    DCT_8_INV_ROW_2R(tab_i_35, round_i_3, round_i_5);
+    movdqa [edx+3*16], xmm1
+    movdqa [edx+5*16], xmm5
+
+    movdqa xmm0, [ecx+2*16]
+    movdqa xmm4, [ecx+6*16]
+    DCT_8_INV_ROW_2R(tab_i_26, round_i_2, round_i_6);
+    movdqa [edx+2*16], xmm1
+    movdqa [edx+6*16], xmm5
+
+    DCT_8_INV_COL_8R(edx+0, edx+0);
+    }
+}
+
+
+/*
+* Name:
+*   ownpj_QuantInv_8x8_16s
+*
+* Purpose:
+*   Dequantize 8x8 block of DCT coefficients
+*
+* Context:
+*   void ownpj_QuantInv_8x8_16s
+*        Ipp16s* pSrc,
+*        Ipp16s* pDst,
+*        const Ipp16u* pQTbl)
+*
+*/
+
+GLOBAL(void)
+ownpj_QuantInv_8x8_16s(short * pSrc, short * pDst, const unsigned short * pQTbl)
+{
+    __asm {
+
+    push ebx
+    push ecx
+    push edx
+    push esi
+    push edi
+
+    mov esi, pSrc
+    mov edi, pDst
+    mov edx, pQTbl
+    mov ecx, 4
+    mov ebx, 32
+
+again:
+
+    movq mm0, QWORD PTR [esi+0]
+    movq mm1, QWORD PTR [esi+8]
+    movq mm2, QWORD PTR [esi+16]
+    movq mm3, QWORD PTR [esi+24]
+
+    prefetcht0 [esi+ebx]    ; fetch next cache line
+
+    pmullw mm0, QWORD PTR [edx+0]
+    pmullw mm1, QWORD PTR [edx+8]
+    pmullw mm2, QWORD PTR [edx+16]
+    pmullw mm3, QWORD PTR [edx+24]
+
+    movq QWORD PTR [edi+0], mm0
+    movq QWORD PTR [edi+8], mm1
+    movq QWORD PTR [edi+16], mm2
+    movq QWORD PTR [edi+24], mm3
+
+    add esi, ebx
+    add edi, ebx
+    add edx, ebx
+    dec ecx
+    jnz again
+
+    emms
+
+    pop edi
+    pop esi
+    pop edx
+    pop ecx
+    pop ebx
+    }
+}
+
+
+/*
+* Name:
+*   ownpj_Add128_8x8_16s8u
+*
+* Purpose:
+*   signed to unsigned conversion (level shift)
+*   for 8x8 block of DCT coefficients
+*
+* Context:
+*   void ownpj_Add128_8x8_16s8u
+*        const Ipp16s* pSrc,
+*        Ipp8u* pDst,
+*        int DstStep);
+*
+*/
+
+__declspec(align(16)) long const_128[]= {0x00800080, 0x00800080, 0x00800080, 0x00800080};
+
+GLOBAL(void)
+ownpj_Add128_8x8_16s8u(const short * pSrc, unsigned char * pDst, int DstStep)
+{
+    __asm {
+    push eax
+    push ebx
+    push ecx
+    push edx
+    push esi
+    push edi
+
+    mov esi, pSrc
+    mov edi, pDst
+    mov edx, DstStep
+    mov ecx, 2
+    mov ebx, edx
+    mov eax, edx
+    sal ebx, 1
+    add eax, ebx
+    movdqa xmm7, XMMWORD PTR const_128
+
+again:
+
+    movdqa xmm0, XMMWORD PTR [esi+0]     ; line 0
+    movdqa xmm1, XMMWORD PTR [esi+16]    ; line 1
+    movdqa xmm2, XMMWORD PTR [esi+32]    ; line 2
+    movdqa xmm3, XMMWORD PTR [esi+48]    ; line 3
+
+    paddw xmm0, xmm7
+    paddw xmm1, xmm7
+    paddw xmm2, xmm7
+    paddw xmm3, xmm7
+
+    packuswb xmm0, xmm1
+    packuswb xmm2, xmm3
+
+    movq QWORD PTR [edi], xmm0           ;0*DstStep
+    movq QWORD PTR [edi+ebx], xmm2       ;2*DstStep
+
+    psrldq xmm0, 8
+    psrldq xmm2, 8
+
+    movq QWORD PTR [edi+edx], xmm0       ;1*DstStep
+    movq QWORD PTR [edi+eax], xmm2       ;3*DstStep
+
+    add edi, ebx
+    add esi, 64
+    add edi, ebx
+    dec ecx
+    jnz again
+
+    pop edi
+    pop esi
+    pop edx
+    pop ecx
+    pop ebx
+    pop eax
+    }
+}
+
+
+/*
+* Name:
+*   ippiDCTQuantInv8x8LS_JPEG_16s8u_C1R
+*
+* Purpose:
+*   Inverse DCT transform, de-quantization and level shift
+*
+* Parameters:
+*   pSrc                - pointer to source
+*   pDst                - pointer to output array
+*   DstStep             - line offset for output data
+*   pEncoderQuantTable  - pointer to Quantization table
+*
+*/
+
+GLOBAL(void)
+ippiDCTQuantInv8x8LS_JPEG_16s8u_C1R(
+    short * pSrc,
+    unsigned char * pDst,
+    int DstStep,
+    const unsigned short * pQuantInvTable)
+{
+
+    __declspec(align(16)) Ipp8u buf[DCTSIZE2*sizeof(Ipp16s)];
+    Ipp16s * workbuf = (Ipp16s *)buf;
+
+    ownpj_QuantInv_8x8_16s(pSrc,workbuf,pQuantInvTable);
+    dct_8x8_inv_16s(workbuf,workbuf);
+    ownpj_Add128_8x8_16s8u(workbuf,pDst,DstStep);
+
+}
+
+GLOBAL(void)
+jpeg_idct_islow_sse2 (
+    j_decompress_ptr cinfo,
+    jpeg_component_info * compptr,
+    JCOEFPTR coef_block,
+    JSAMPARRAY output_buf,
+    JDIMENSION output_col)
+{
+    int ctr;
+    JCOEFPTR inptr;
+    Ipp16u* quantptr;
+    Ipp8u* wsptr;
+    __declspec(align(16)) Ipp8u workspace[DCTSIZE2];
+    JSAMPROW outptr;
+
+    inptr = coef_block;
+    quantptr = (Ipp16u*)compptr->dct_table;
+    wsptr = workspace;
+
+    ippiDCTQuantInv8x8LS_JPEG_16s8u_C1R(inptr, workspace, 8, quantptr);
+
+    for(ctr = 0; ctr < DCTSIZE; ctr++)
+    {
+        outptr = output_buf[ctr] + output_col;
+
+        outptr[0] = wsptr[0];
+        outptr[1] = wsptr[1];
+        outptr[2] = wsptr[2];
+        outptr[3] = wsptr[3];
+        outptr[4] = wsptr[4];
+        outptr[5] = wsptr[5];
+        outptr[6] = wsptr[6];
+        outptr[7] = wsptr[7];
+
+        wsptr += DCTSIZE;
+    }
+}
+#endif /* HAVE_SSE2_INTEL_MNEMONICS */
+
 #endif /* DCT_ISLOW_SUPPORTED */
diff --git a/mozilla/jpeg/jmorecfg.h b/mozilla/jpeg/jmorecfg.h
index 9a9b3bc7e02..3a1379c025d 100644
--- a/mozilla/jpeg/jmorecfg.h
+++ b/mozilla/jpeg/jmorecfg.h
@@ -111,6 +111,12 @@ typedef short JCOEF;
 #define HAVE_MMX_INTEL_MNEMONICS
 #endif
 
+/* Defines for SSE2 support. */
+#if defined(XP_WIN32) && defined(_M_IX86) && defined(__m128i)
+#define HAVE_SSE2_INTEL_MNEMONICS
+#endif
+
+
 /* Compressed datastreams are represented as arrays of JOCTET.
  * These must be EXACTLY 8 bits wide, at least once they are written to
  * external storage.  Note that when using the stdio data source/destination
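
For reference (not part of the patch): the sse2support() routine added to jdapimin.c keys off bit 26 of EDX returned by CPUID leaf 1, which is the SSE2 feature flag. A minimal standalone sketch of the same check, assuming the MSVC __cpuid intrinsic from <intrin.h> (a later-era intrinsic used here for illustration instead of the patch's inline _asm block):

    #include <intrin.h>

    /* Return 1 when CPUID.1:EDX bit 26 (SSE2) is set, otherwise 0.
       Illustrative only; the patch's sse2support() returns 2, not 0,
       when SSE2 is absent, and start_pass() tests SSE2Available == 1. */
    static int sse2_supported(void)
    {
        int regs[4];           /* EAX, EBX, ECX, EDX */
        __cpuid(regs, 1);      /* leaf 1: processor feature flags */
        return (regs[3] & (1 << 26)) ? 1 : 0;
    }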