src/lib/openjp2/mct.c

   1 /*
   2  * The copyright in this software is being made available under the 2-clauses
   3  * BSD License, included below. This software may be subject to other third
   4  * party and contributor rights, including patent rights, and no such rights
   5  * are granted under this license.
   6  *
   7  * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium
   8  * Copyright (c) 2002-2014, Professor Benoit Macq
   9  * Copyright (c) 2001-2003, David Janssens
  10  * Copyright (c) 2002-2003, Yannick Verschueren
  11  * Copyright (c) 2003-2007, Francois-Olivier Devaux
  12  * Copyright (c) 2003-2014, Antonin Descampe
  13  * Copyright (c) 2005, Herve Drolon, FreeImage Team
  14  * Copyright (c) 2008, 2011-2012, Centre National d'Etudes Spatiales (CNES), FR
  15  * Copyright (c) 2012, CS Systemes d'Information, France
  16  * All rights reserved.
  17  *
  18  * Redistribution and use in source and binary forms, with or without
  19  * modification, are permitted provided that the following conditions
  20  * are met:
  21  * 1. Redistributions of source code must retain the above copyright
  22  *    notice, this list of conditions and the following disclaimer.
  23  * 2. Redistributions in binary form must reproduce the above copyright
  24  *    notice, this list of conditions and the following disclaimer in the
  25  *    documentation and/or other materials provided with the distribution.
  26  *
  27  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS'
  28  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  30  * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  31  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  32  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  33  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  34  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  35  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  36  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  37  * POSSIBILITY OF SUCH DAMAGE.
  38  */
  39
  40 #ifdef __SSE__
  41 #include <xmmintrin.h>
  42 #endif
  43 #ifdef __SSE2__
  44 #include <emmintrin.h>
  45 #endif
  46 #ifdef __SSE4_1__
  47 #include <smmintrin.h>
  48 #endif
  49
  50 #include "opj_includes.h"
  51
  52 /* <summary> */
  53 /* This table contains the norms of the basis function of the reversible MCT. */
  54 /* </summary> */
  55 static const OPJ_FLOAT64 opj_mct_norms[3] = { 1.732, .8292, .8292 };
  56
  57 /* <summary> */
  58 /* This table contains the norms of the basis function of the irreversible MCT. */
  59 /* </summary> */
  60 static const OPJ_FLOAT64 opj_mct_norms_real[3] = { 1.732, 1.805, 1.573 };
  61
  62 const OPJ_FLOAT64 * opj_mct_get_mct_norms()
  63 {
  64     return opj_mct_norms;
  65 }
  66
  67 const OPJ_FLOAT64 * opj_mct_get_mct_norms_real()
  68 {
  69     return opj_mct_norms_real;
  70 }
  71
  72 /* <summary> */
  73 /* Forward reversible MCT. */
  74 /* </summary> */
  75 #ifdef __SSE2__
  76 void opj_mct_encode(
  77     OPJ_INT32* OPJ_RESTRICT c0,
  78     OPJ_INT32* OPJ_RESTRICT c1,
  79     OPJ_INT32* OPJ_RESTRICT c2,
  80     OPJ_SIZE_T n)
  81 {
  82     OPJ_SIZE_T i;
  83     const OPJ_SIZE_T len = n;
  84     /* buffer are aligned on 16 bytes */
  85     assert(((size_t)c0 & 0xf) == 0);
  86     assert(((size_t)c1 & 0xf) == 0);
  87     assert(((size_t)c2 & 0xf) == 0);
  88
  89     for (i = 0; i < (len & ~3U); i += 4) {
  90         __m128i y, u, v;
  91         __m128i r = _mm_load_si128((const __m128i *) & (c0[i]));
  92         __m128i g = _mm_load_si128((const __m128i *) & (c1[i]));
  93         __m128i b = _mm_load_si128((const __m128i *) & (c2[i]));
  94         y = _mm_add_epi32(g, g);
  95         y = _mm_add_epi32(y, b);
  96         y = _mm_add_epi32(y, r);
  97         y = _mm_srai_epi32(y, 2);
  98         u = _mm_sub_epi32(b, g);
  99         v = _mm_sub_epi32(r, g);
 100         _mm_store_si128((__m128i *) & (c0[i]), y);
 101         _mm_store_si128((__m128i *) & (c1[i]), u);
 102         _mm_store_si128((__m128i *) & (c2[i]), v);
 103     }
 104
 105     for (; i < len; ++i) {
 106         OPJ_INT32 r = c0[i];
 107         OPJ_INT32 g = c1[i];
 108         OPJ_INT32 b = c2[i];
 109         OPJ_INT32 y = (r + (g * 2) + b) >> 2;
 110         OPJ_INT32 u = b - g;
 111         OPJ_INT32 v = r - g;
 112         c0[i] = y;
 113         c1[i] = u;
 114         c2[i] = v;
 115     }
 116 }
 117 #else
 118 void opj_mct_encode(
 119     OPJ_INT32* OPJ_RESTRICT c0,
 120     OPJ_INT32* OPJ_RESTRICT c1,
 121     OPJ_INT32* OPJ_RESTRICT c2,
 122     OPJ_SIZE_T n)
 123 {
 124     OPJ_SIZE_T i;
 125     const OPJ_SIZE_T len = n;
 126
 127     for (i = 0; i < len; ++i) {
 128         OPJ_INT32 r = c0[i];
 129         OPJ_INT32 g = c1[i];
 130         OPJ_INT32 b = c2[i];
 131         OPJ_INT32 y = (r + (g * 2) + b) >> 2;
 132         OPJ_INT32 u = b - g;
 133         OPJ_INT32 v = r - g;
 134         c0[i] = y;
 135         c1[i] = u;
 136         c2[i] = v;
 137     }
 138 }
 139 #endif
 140
 141 /* <summary> */
 142 /* Inverse reversible MCT. */
 143 /* </summary> */
 144 #ifdef __SSE2__
 145 void opj_mct_decode(
 146     OPJ_INT32* OPJ_RESTRICT c0,
 147     OPJ_INT32* OPJ_RESTRICT c1,
 148     OPJ_INT32* OPJ_RESTRICT c2,
 149     OPJ_SIZE_T n)
 150 {
 151     OPJ_SIZE_T i;
 152     const OPJ_SIZE_T len = n;
 153
 154     for (i = 0; i < (len & ~3U); i += 4) {
 155         __m128i r, g, b;
 156         __m128i y = _mm_load_si128((const __m128i *) & (c0[i]));
 157         __m128i u = _mm_load_si128((const __m128i *) & (c1[i]));
 158         __m128i v = _mm_load_si128((const __m128i *) & (c2[i]));
 159         g = y;
 160         g = _mm_sub_epi32(g, _mm_srai_epi32(_mm_add_epi32(u, v), 2));
 161         r = _mm_add_epi32(v, g);
 162         b = _mm_add_epi32(u, g);
 163         _mm_store_si128((__m128i *) & (c0[i]), r);
 164         _mm_store_si128((__m128i *) & (c1[i]), g);
 165         _mm_store_si128((__m128i *) & (c2[i]), b);
 166     }
 167     for (; i < len; ++i) {
 168         OPJ_INT32 y = c0[i];
 169         OPJ_INT32 u = c1[i];
 170         OPJ_INT32 v = c2[i];
 171         OPJ_INT32 g = y - ((u + v) >> 2);
 172         OPJ_INT32 r = v + g;
 173         OPJ_INT32 b = u + g;
 174         c0[i] = r;
 175         c1[i] = g;
 176         c2[i] = b;
 177     }
 178 }
 179 #else
 180 void opj_mct_decode(
 181     OPJ_INT32* OPJ_RESTRICT c0,
 182     OPJ_INT32* OPJ_RESTRICT c1,
 183     OPJ_INT32* OPJ_RESTRICT c2,
 184     OPJ_SIZE_T n)
 185 {
 186     OPJ_SIZE_T i;
 187     for (i = 0; i < n; ++i) {
 188         OPJ_INT32 y = c0[i];
 189         OPJ_INT32 u = c1[i];
 190         OPJ_INT32 v = c2[i];
 191         OPJ_INT32 g = y - ((u + v) >> 2);
 192         OPJ_INT32 r = v + g;
 193         OPJ_INT32 b = u + g;
 194         c0[i] = r;
 195         c1[i] = g;
 196         c2[i] = b;
 197     }
 198 }
 199 #endif
 200
 201 /* <summary> */
 202 /* Get norm of basis function of reversible MCT. */
 203 /* </summary> */
 204 OPJ_FLOAT64 opj_mct_getnorm(OPJ_UINT32 compno)
 205 {
 206     return opj_mct_norms[compno];
 207 }
 208
 209 /* <summary> */
 210 /* Forward irreversible MCT. */
 211 /* </summary> */
 212 void opj_mct_encode_real(
 213     OPJ_FLOAT32* OPJ_RESTRICT c0,
 214     OPJ_FLOAT32* OPJ_RESTRICT c1,
 215     OPJ_FLOAT32* OPJ_RESTRICT c2,
 216     OPJ_SIZE_T n)
 217 {
 218     OPJ_SIZE_T i;
 219 #ifdef __SSE__
 220     const __m128 YR = _mm_set1_ps(0.299f);
 221     const __m128 YG = _mm_set1_ps(0.587f);
 222     const __m128 YB = _mm_set1_ps(0.114f);
 223     const __m128 UR = _mm_set1_ps(-0.16875f);
 224     const __m128 UG = _mm_set1_ps(-0.331260f);
 225     const __m128 UB = _mm_set1_ps(0.5f);
 226     const __m128 VR = _mm_set1_ps(0.5f);
 227     const __m128 VG = _mm_set1_ps(-0.41869f);
 228     const __m128 VB = _mm_set1_ps(-0.08131f);
 229     for (i = 0; i < (n >> 3); i ++) {
 230         __m128 r, g, b, y, u, v;
 231
 232         r = _mm_load_ps(c0);
 233         g = _mm_load_ps(c1);
 234         b = _mm_load_ps(c2);
 235         y = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, YR), _mm_mul_ps(g, YG)),
 236                        _mm_mul_ps(b, YB));
 237         u = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, UR), _mm_mul_ps(g, UG)),
 238                        _mm_mul_ps(b, UB));
 239         v = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, VR), _mm_mul_ps(g, VG)),
 240                        _mm_mul_ps(b, VB));
 241         _mm_store_ps(c0, y);
 242         _mm_store_ps(c1, u);
 243         _mm_store_ps(c2, v);
 244         c0 += 4;
 245         c1 += 4;
 246         c2 += 4;
 247
 248         r = _mm_load_ps(c0);
 249         g = _mm_load_ps(c1);
 250         b = _mm_load_ps(c2);
 251         y = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, YR), _mm_mul_ps(g, YG)),
 252                        _mm_mul_ps(b, YB));
 253         u = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, UR), _mm_mul_ps(g, UG)),
 254                        _mm_mul_ps(b, UB));
 255         v = _mm_add_ps(_mm_add_ps(_mm_mul_ps(r, VR), _mm_mul_ps(g, VG)),
 256                        _mm_mul_ps(b, VB));
 257         _mm_store_ps(c0, y);
 258         _mm_store_ps(c1, u);
 259         _mm_store_ps(c2, v);
 260         c0 += 4;
 261         c1 += 4;
 262         c2 += 4;
 263     }
 264     n &= 7;
 265 #endif
 266     for (i = 0; i < n; ++i) {
 267         OPJ_FLOAT32 r = c0[i];
 268         OPJ_FLOAT32 g = c1[i];
 269         OPJ_FLOAT32 b = c2[i];
 270         OPJ_FLOAT32 y = 0.299f * r + 0.587f * g + 0.114f * b;
 271         OPJ_FLOAT32 u = -0.16875f * r - 0.331260f * g + 0.5f * b;
 272         OPJ_FLOAT32 v = 0.5f * r - 0.41869f * g - 0.08131f * b;
 273         c0[i] = y;
 274         c1[i] = u;
 275         c2[i] = v;
 276     }
 277 }
 278
 279 /* <summary> */
 280 /* Inverse irreversible MCT. */
 281 /* </summary> */
 282 void opj_mct_decode_real(
 283     OPJ_FLOAT32* OPJ_RESTRICT c0,
 284     OPJ_FLOAT32* OPJ_RESTRICT c1,
 285     OPJ_FLOAT32* OPJ_RESTRICT c2,
 286     OPJ_SIZE_T n)
 287 {
 288     OPJ_SIZE_T i;
 289 #ifdef __SSE__
 290     __m128 vrv, vgu, vgv, vbu;
 291     vrv = _mm_set1_ps(1.402f);
 292     vgu = _mm_set1_ps(0.34413f);
 293     vgv = _mm_set1_ps(0.71414f);
 294     vbu = _mm_set1_ps(1.772f);
 295     for (i = 0; i < (n >> 3); ++i) {
 296         __m128 vy, vu, vv;
 297         __m128 vr, vg, vb;
 298
 299         vy = _mm_load_ps(c0);
 300         vu = _mm_load_ps(c1);
 301         vv = _mm_load_ps(c2);
 302         vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv));
 303         vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)), _mm_mul_ps(vv, vgv));
 304         vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu));
 305         _mm_store_ps(c0, vr);
 306         _mm_store_ps(c1, vg);
 307         _mm_store_ps(c2, vb);
 308         c0 += 4;
 309         c1 += 4;
 310         c2 += 4;
 311
 312         vy = _mm_load_ps(c0);
 313         vu = _mm_load_ps(c1);
 314         vv = _mm_load_ps(c2);
 315         vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv));
 316         vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)), _mm_mul_ps(vv, vgv));
 317         vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu));
 318         _mm_store_ps(c0, vr);
 319         _mm_store_ps(c1, vg);
 320         _mm_store_ps(c2, vb);
 321         c0 += 4;
 322         c1 += 4;
 323         c2 += 4;
 324     }
 325     n &= 7;
 326 #endif
 327     for (i = 0; i < n; ++i) {
 328         OPJ_FLOAT32 y = c0[i];
 329         OPJ_FLOAT32 u = c1[i];
 330         OPJ_FLOAT32 v = c2[i];
 331         OPJ_FLOAT32 r = y + (v * 1.402f);
 332         OPJ_FLOAT32 g = y - (u * 0.34413f) - (v * (0.71414f));
 333         OPJ_FLOAT32 b = y + (u * 1.772f);
 334         c0[i] = r;
 335         c1[i] = g;
 336         c2[i] = b;
 337     }
 338 }
 339
 340 /* <summary> */
 341 /* Get norm of basis function of irreversible MCT. */
 342 /* </summary> */
 343 OPJ_FLOAT64 opj_mct_getnorm_real(OPJ_UINT32 compno)
 344 {
 345     return opj_mct_norms_real[compno];
 346 }
 347
 348
 349 OPJ_BOOL opj_mct_encode_custom(
 350     OPJ_BYTE * pCodingdata,
 351     OPJ_SIZE_T n,
 352     OPJ_BYTE ** pData,
 353     OPJ_UINT32 pNbComp,
 354     OPJ_UINT32 isSigned)
 355 {
 356     OPJ_FLOAT32 * lMct = (OPJ_FLOAT32 *) pCodingdata;
 357     OPJ_SIZE_T i;
 358     OPJ_UINT32 j;
 359     OPJ_UINT32 k;
 360     OPJ_UINT32 lNbMatCoeff = pNbComp * pNbComp;
 361     OPJ_INT32 * lCurrentData = 00;
 362     OPJ_INT32 * lCurrentMatrix = 00;
 363     OPJ_INT32 ** lData = (OPJ_INT32 **) pData;
 364     OPJ_UINT32 lMultiplicator = 1 << 13;
 365     OPJ_INT32 * lMctPtr;
 366
 367     OPJ_ARG_NOT_USED(isSigned);
 368
 369     lCurrentData = (OPJ_INT32 *) opj_malloc((pNbComp + lNbMatCoeff) * sizeof(
 370             OPJ_INT32));
 371     if (! lCurrentData) {
 372         return OPJ_FALSE;
 373     }
 374
 375     lCurrentMatrix = lCurrentData + pNbComp;
 376
 377     for (i = 0; i < lNbMatCoeff; ++i) {
 378         lCurrentMatrix[i] = (OPJ_INT32)(*(lMct++) * (OPJ_FLOAT32)lMultiplicator);
 379     }
 380
 381     for (i = 0; i < n; ++i)  {
 382         lMctPtr = lCurrentMatrix;
 383         for (j = 0; j < pNbComp; ++j) {
 384             lCurrentData[j] = (*(lData[j]));
 385         }
 386
 387         for (j = 0; j < pNbComp; ++j) {
 388             *(lData[j]) = 0;
 389             for (k = 0; k < pNbComp; ++k) {
 390                 *(lData[j]) += opj_int_fix_mul(*lMctPtr, lCurrentData[k]);
 391                 ++lMctPtr;
 392             }
 393
 394             ++lData[j];
 395         }
 396     }
 397
 398     opj_free(lCurrentData);
 399
 400     return OPJ_TRUE;
 401 }
 402
 403 OPJ_BOOL opj_mct_decode_custom(
 404     OPJ_BYTE * pDecodingData,
 405     OPJ_SIZE_T n,
 406     OPJ_BYTE ** pData,
 407     OPJ_UINT32 pNbComp,
 408     OPJ_UINT32 isSigned)
 409 {
 410     OPJ_FLOAT32 * lMct;
 411     OPJ_SIZE_T i;
 412     OPJ_UINT32 j;
 413     OPJ_UINT32 k;
 414
 415     OPJ_FLOAT32 * lCurrentData = 00;
 416     OPJ_FLOAT32 * lCurrentResult = 00;
 417     OPJ_FLOAT32 ** lData = (OPJ_FLOAT32 **) pData;
 418
 419     OPJ_ARG_NOT_USED(isSigned);
 420
 421     lCurrentData = (OPJ_FLOAT32 *) opj_malloc(2 * pNbComp * sizeof(OPJ_FLOAT32));
 422     if (! lCurrentData) {
 423         return OPJ_FALSE;
 424     }
 425     lCurrentResult = lCurrentData + pNbComp;
 426
 427     for (i = 0; i < n; ++i) {
 428         lMct = (OPJ_FLOAT32 *) pDecodingData;
 429         for (j = 0; j < pNbComp; ++j) {
 430             lCurrentData[j] = (OPJ_FLOAT32)(*(lData[j]));
 431         }
 432         for (j = 0; j < pNbComp; ++j) {
 433             lCurrentResult[j] = 0;
 434             for (k = 0; k < pNbComp; ++k) {
 435                 lCurrentResult[j] += *(lMct++) * lCurrentData[k];
 436             }
 437             *(lData[j]++) = (OPJ_FLOAT32)(lCurrentResult[j]);
 438         }
 439     }
 440     opj_free(lCurrentData);
 441     return OPJ_TRUE;
 442 }
 443
 444 void opj_calculate_norms(OPJ_FLOAT64 * pNorms,
 445                          OPJ_UINT32 pNbComps,
 446                          OPJ_FLOAT32 * pMatrix)
 447 {
 448     OPJ_UINT32 i, j, lIndex;
 449     OPJ_FLOAT32 lCurrentValue;
 450     OPJ_FLOAT64 * lNorms = (OPJ_FLOAT64 *) pNorms;
 451     OPJ_FLOAT32 * lMatrix = (OPJ_FLOAT32 *) pMatrix;
 452
 453     for (i = 0; i < pNbComps; ++i) {
 454         lNorms[i] = 0;
 455         lIndex = i;
 456
 457         for (j = 0; j < pNbComps; ++j) {
 458             lCurrentValue = lMatrix[lIndex];
 459             lIndex += pNbComps;
 460             lNorms[i] += (OPJ_FLOAT64) lCurrentValue * lCurrentValue;
 461         }
 462         lNorms[i] = sqrt(lNorms[i]);
 463     }
 464 }