X-Git-Url: https://main.carlh.net/gitweb/?a=blobdiff_plain;f=src%2Flib%2Fopenjp2%2Fmct.c;h=a0da0996604048f26e9740596eb8001280c7045c;hb=ee0f3138482fc5ef44f92e6164206bc8df34efe4;hp=8d3c59cef8b3bbf0aa37efb1de906c2b248224b9;hpb=ec0fe091380c2de75584fdfc3d914eee5b89e731;p=openjpeg.git diff --git a/src/lib/openjp2/mct.c b/src/lib/openjp2/mct.c index 8d3c59ce..a0da0996 100644 --- a/src/lib/openjp2/mct.c +++ b/src/lib/openjp2/mct.c @@ -1,10 +1,18 @@ /* - * Copyright (c) 2002-2007, Communications and Remote Sensing Laboratory, Universite catholique de Louvain (UCL), Belgium - * Copyright (c) 2002-2007, Professor Benoit Macq + * The copyright in this software is being made available under the 2-clauses + * BSD License, included below. This software may be subject to other third + * party and contributor rights, including patent rights, and no such rights + * are granted under this license. + * + * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium + * Copyright (c) 2002-2014, Professor Benoit Macq * Copyright (c) 2001-2003, David Janssens * Copyright (c) 2002-2003, Yannick Verschueren - * Copyright (c) 2003-2007, Francois-Olivier Devaux and Antonin Descampe + * Copyright (c) 2003-2007, Francois-Olivier Devaux + * Copyright (c) 2003-2014, Antonin Descampe * Copyright (c) 2005, Herve Drolon, FreeImage Team + * Copyright (c) 2008, 2011-2012, Centre National d'Etudes Spatiales (CNES), FR + * Copyright (c) 2012, CS Systemes d'Information, France * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -32,6 +40,12 @@ #ifdef __SSE__ #include #endif +#ifdef __SSE2__ +#include +#endif +#ifdef __SSE4_1__ +#include +#endif #include "opj_includes.h" @@ -58,14 +72,33 @@ const OPJ_FLOAT64 * opj_mct_get_mct_norms_real () /* */ /* Foward reversible MCT. */ /* */ +#ifdef __SSE2__ void opj_mct_encode( OPJ_INT32* restrict c0, OPJ_INT32* restrict c1, OPJ_INT32* restrict c2, OPJ_UINT32 n) { - OPJ_UINT32 i; - for(i = 0; i < n; ++i) { + OPJ_SIZE_T i; + const OPJ_SIZE_T len = n; + + for(i = 0; i < (len & ~3U); i += 4) { + __m128i y, u, v; + __m128i r = _mm_load_si128((const __m128i *)&(c0[i])); + __m128i g = _mm_load_si128((const __m128i *)&(c1[i])); + __m128i b = _mm_load_si128((const __m128i *)&(c2[i])); + y = _mm_add_epi32(g, g); + y = _mm_add_epi32(y, b); + y = _mm_add_epi32(y, r); + y = _mm_srai_epi32(y, 2); + u = _mm_sub_epi32(b, g); + v = _mm_sub_epi32(r, g); + _mm_store_si128((__m128i *)&(c0[i]), y); + _mm_store_si128((__m128i *)&(c1[i]), u); + _mm_store_si128((__m128i *)&(c2[i]), v); + } + + for(; i < len; ++i) { OPJ_INT32 r = c0[i]; OPJ_INT32 g = c1[i]; OPJ_INT32 b = c2[i]; @@ -77,10 +110,69 @@ void opj_mct_encode( c2[i] = v; } } +#else +void opj_mct_encode( + OPJ_INT32* restrict c0, + OPJ_INT32* restrict c1, + OPJ_INT32* restrict c2, + OPJ_UINT32 n) +{ + OPJ_SIZE_T i; + const OPJ_SIZE_T len = n; + + for(i = 0; i < len; ++i) { + OPJ_INT32 r = c0[i]; + OPJ_INT32 g = c1[i]; + OPJ_INT32 b = c2[i]; + OPJ_INT32 y = (r + (g * 2) + b) >> 2; + OPJ_INT32 u = b - g; + OPJ_INT32 v = r - g; + c0[i] = y; + c1[i] = u; + c2[i] = v; + } +} +#endif /* */ /* Inverse reversible MCT. */ /* */ +#ifdef __SSE2__ +void opj_mct_decode( + OPJ_INT32* restrict c0, + OPJ_INT32* restrict c1, + OPJ_INT32* restrict c2, + OPJ_UINT32 n) +{ + OPJ_SIZE_T i; + const OPJ_SIZE_T len = n; + + for(i = 0; i < (len & ~3U); i += 4) { + __m128i r, g, b; + __m128i y = _mm_load_si128((const __m128i *)&(c0[i])); + __m128i u = _mm_load_si128((const __m128i *)&(c1[i])); + __m128i v = _mm_load_si128((const __m128i *)&(c2[i])); + g = y; + g = _mm_sub_epi32(g, _mm_srai_epi32(_mm_add_epi32(u, v), 2)); + r = _mm_add_epi32(v, g); + b = _mm_add_epi32(u, g); + _mm_store_si128((__m128i *)&(c0[i]), r); + _mm_store_si128((__m128i *)&(c1[i]), g); + _mm_store_si128((__m128i *)&(c2[i]), b); + } + for (; i < len; ++i) { + OPJ_INT32 y = c0[i]; + OPJ_INT32 u = c1[i]; + OPJ_INT32 v = c2[i]; + OPJ_INT32 g = y - ((u + v) >> 2); + OPJ_INT32 r = v + g; + OPJ_INT32 b = u + g; + c0[i] = r; + c1[i] = g; + c2[i] = b; + } +} +#else void opj_mct_decode( OPJ_INT32* restrict c0, OPJ_INT32* restrict c1, @@ -100,6 +192,7 @@ void opj_mct_decode( c2[i] = b; } } +#endif /* */ /* Get norm of basis function of reversible MCT. */ @@ -111,6 +204,148 @@ OPJ_FLOAT64 opj_mct_getnorm(OPJ_UINT32 compno) { /* */ /* Foward irreversible MCT. */ /* */ +#ifdef __SSE4_1__ +void opj_mct_encode_real( + OPJ_INT32* restrict c0, + OPJ_INT32* restrict c1, + OPJ_INT32* restrict c2, + OPJ_UINT32 n) +{ + OPJ_SIZE_T i; + const OPJ_SIZE_T len = n; + + const __m128i ry = _mm_set1_epi32(2449); + const __m128i gy = _mm_set1_epi32(4809); + const __m128i by = _mm_set1_epi32(934); + const __m128i ru = _mm_set1_epi32(1382); + const __m128i gu = _mm_set1_epi32(2714); + /* const __m128i bu = _mm_set1_epi32(4096); */ + /* const __m128i rv = _mm_set1_epi32(4096); */ + const __m128i gv = _mm_set1_epi32(3430); + const __m128i bv = _mm_set1_epi32(666); + const __m128i mulround = _mm_shuffle_epi32(_mm_cvtsi32_si128(4096), _MM_SHUFFLE(1, 0, 1, 0)); + + for(i = 0; i < (len & ~3U); i += 4) { + __m128i lo, hi; + __m128i y, u, v; + __m128i r = _mm_load_si128((const __m128i *)&(c0[i])); + __m128i g = _mm_load_si128((const __m128i *)&(c1[i])); + __m128i b = _mm_load_si128((const __m128i *)&(c2[i])); + + lo = r; + hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); + lo = _mm_mul_epi32(lo, ry); + hi = _mm_mul_epi32(hi, ry); + lo = _mm_add_epi64(lo, mulround); + hi = _mm_add_epi64(hi, mulround); + lo = _mm_srli_epi64(lo, 13); + hi = _mm_slli_epi64(hi, 32-13); + y = _mm_blend_epi16(lo, hi, 0xCC); + + lo = g; + hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1)); + lo = _mm_mul_epi32(lo, gy); + hi = _mm_mul_epi32(hi, gy); + lo = _mm_add_epi64(lo, mulround); + hi = _mm_add_epi64(hi, mulround); + lo = _mm_srli_epi64(lo, 13); + hi = _mm_slli_epi64(hi, 32-13); + y = _mm_add_epi32(y, _mm_blend_epi16(lo, hi, 0xCC)); + + lo = b; + hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)); + lo = _mm_mul_epi32(lo, by); + hi = _mm_mul_epi32(hi, by); + lo = _mm_add_epi64(lo, mulround); + hi = _mm_add_epi64(hi, mulround); + lo = _mm_srli_epi64(lo, 13); + hi = _mm_slli_epi64(hi, 32-13); + y = _mm_add_epi32(y, _mm_blend_epi16(lo, hi, 0xCC)); + _mm_store_si128((__m128i *)&(c0[i]), y); + + /*lo = b; + hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)); + lo = _mm_mul_epi32(lo, mulround); + hi = _mm_mul_epi32(hi, mulround);*/ + lo = _mm_cvtepi32_epi64(_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 2, 0))); + hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 3, 1))); + lo = _mm_slli_epi64(lo, 12); + hi = _mm_slli_epi64(hi, 12); + lo = _mm_add_epi64(lo, mulround); + hi = _mm_add_epi64(hi, mulround); + lo = _mm_srli_epi64(lo, 13); + hi = _mm_slli_epi64(hi, 32-13); + u = _mm_blend_epi16(lo, hi, 0xCC); + + lo = r; + hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); + lo = _mm_mul_epi32(lo, ru); + hi = _mm_mul_epi32(hi, ru); + lo = _mm_add_epi64(lo, mulround); + hi = _mm_add_epi64(hi, mulround); + lo = _mm_srli_epi64(lo, 13); + hi = _mm_slli_epi64(hi, 32-13); + u = _mm_sub_epi32(u, _mm_blend_epi16(lo, hi, 0xCC)); + + lo = g; + hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1)); + lo = _mm_mul_epi32(lo, gu); + hi = _mm_mul_epi32(hi, gu); + lo = _mm_add_epi64(lo, mulround); + hi = _mm_add_epi64(hi, mulround); + lo = _mm_srli_epi64(lo, 13); + hi = _mm_slli_epi64(hi, 32-13); + u = _mm_sub_epi32(u, _mm_blend_epi16(lo, hi, 0xCC)); + _mm_store_si128((__m128i *)&(c1[i]), u); + + /*lo = r; + hi = _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); + lo = _mm_mul_epi32(lo, mulround); + hi = _mm_mul_epi32(hi, mulround);*/ + lo = _mm_cvtepi32_epi64(_mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 2, 0))); + hi = _mm_cvtepi32_epi64(_mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 3, 1))); + lo = _mm_slli_epi64(lo, 12); + hi = _mm_slli_epi64(hi, 12); + lo = _mm_add_epi64(lo, mulround); + hi = _mm_add_epi64(hi, mulround); + lo = _mm_srli_epi64(lo, 13); + hi = _mm_slli_epi64(hi, 32-13); + v = _mm_blend_epi16(lo, hi, 0xCC); + + lo = g; + hi = _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 3, 1, 1)); + lo = _mm_mul_epi32(lo, gv); + hi = _mm_mul_epi32(hi, gv); + lo = _mm_add_epi64(lo, mulround); + hi = _mm_add_epi64(hi, mulround); + lo = _mm_srli_epi64(lo, 13); + hi = _mm_slli_epi64(hi, 32-13); + v = _mm_sub_epi32(v, _mm_blend_epi16(lo, hi, 0xCC)); + + lo = b; + hi = _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)); + lo = _mm_mul_epi32(lo, bv); + hi = _mm_mul_epi32(hi, bv); + lo = _mm_add_epi64(lo, mulround); + hi = _mm_add_epi64(hi, mulround); + lo = _mm_srli_epi64(lo, 13); + hi = _mm_slli_epi64(hi, 32-13); + v = _mm_sub_epi32(v, _mm_blend_epi16(lo, hi, 0xCC)); + _mm_store_si128((__m128i *)&(c2[i]), v); + } + for(; i < len; ++i) { + OPJ_INT32 r = c0[i]; + OPJ_INT32 g = c1[i]; + OPJ_INT32 b = c2[i]; + OPJ_INT32 y = opj_int_fix_mul(r, 2449) + opj_int_fix_mul(g, 4809) + opj_int_fix_mul(b, 934); + OPJ_INT32 u = -opj_int_fix_mul(r, 1382) - opj_int_fix_mul(g, 2714) + opj_int_fix_mul(b, 4096); + OPJ_INT32 v = opj_int_fix_mul(r, 4096) - opj_int_fix_mul(g, 3430) - opj_int_fix_mul(b, 666); + c0[i] = y; + c1[i] = u; + c2[i] = v; + } +} +#else void opj_mct_encode_real( OPJ_INT32* restrict c0, OPJ_INT32* restrict c1, @@ -130,6 +365,7 @@ void opj_mct_encode_real( c2[i] = v; } } +#endif /* */ /* Inverse irreversible MCT. */ @@ -228,7 +464,7 @@ OPJ_BOOL opj_mct_encode_custom( lCurrentMatrix = lCurrentData + pNbComp; for (i =0;i