libopenjpeg/dwt.c

   1 /*
   2  * Copyright (c) 2002-2007, Communications and Remote Sensing Laboratory, Universite catholique de Louvain (UCL), Belgium
   3  * Copyright (c) 2002-2007, Professor Benoit Macq
   4  * Copyright (c) 2001-2003, David Janssens
   5  * Copyright (c) 2002-2003, Yannick Verschueren
   6  * Copyright (c) 2003-2007, Francois-Olivier Devaux and Antonin Descampe
   7  * Copyright (c) 2005, Herve Drolon, FreeImage Team
   8  * Copyright (c) 2007, Jonathan Ballard <dzonatas@dzonux.net>
   9  * Copyright (c) 2007, Callum Lerwick <seg@haxxed.com>
  10  * Copyright (c) 2008, Jerome Fimes, Communications & Systemes <jerome.fimes@c-s.fr>
  11  * All rights reserved.
  12  *
  13  * Redistribution and use in source and binary forms, with or without
  14  * modification, are permitted provided that the following conditions
  15  * are met:
  16  * 1. Redistributions of source code must retain the above copyright
  17  *    notice, this list of conditions and the following disclaimer.
  18  * 2. Redistributions in binary form must reproduce the above copyright
  19  *    notice, this list of conditions and the following disclaimer in the
  20  *    documentation and/or other materials provided with the distribution.
  21  *
  22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS'
  23  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  26  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  27  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  28  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  29  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  30  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  31  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  32  * POSSIBILITY OF SUCH DAMAGE.
  33  */
  34
  35 #ifdef __SSE__
  36 #include <xmmintrin.h>
  37 #endif
  38
  39 #include "dwt.h"
  40 #include "j2k.h"
  41 #include "tcd.h"
  42 #include "fix.h"
  43 #include "opj_malloc.h"
  44 #include "int.h"
  45
  46 /** @defgroup DWT DWT - Implementation of a discrete wavelet transform */
  47 /*@{*/
  48
  49 #define WS(i) v->mem[(i)*2]
  50 #define WD(i) v->mem[(1+(i)*2)]
  51
  52 /** @name Local data structures */
  53 /*@{*/
  54
  55 typedef struct dwt_local {
  56         OPJ_INT32* mem;
  57         OPJ_INT32 dn;
  58         OPJ_INT32 sn;
  59         OPJ_INT32 cas;
  60 } dwt_t;
  61
  62 typedef union {
  63         OPJ_FLOAT32     f[4];
  64 } v4;
  65
  66 typedef struct v4dwt_local {
  67         v4*     wavelet ;
  68         OPJ_INT32               dn ;
  69         OPJ_INT32               sn ;
  70         OPJ_INT32               cas ;
  71 } v4dwt_t ;
  72
  73 static const OPJ_FLOAT32 dwt_alpha =  1.586134342f; //  12994
  74 static const OPJ_FLOAT32 dwt_beta  =  0.052980118f; //    434
  75 static const OPJ_FLOAT32 dwt_gamma = -0.882911075f; //  -7233
  76 static const OPJ_FLOAT32 delta = -0.443506852f; //  -3633
  77
  78 static const OPJ_FLOAT32 K      = 1.230174105f; //  10078
  79 /* FIXME: What is this constant? */
  80 static const OPJ_FLOAT32 c13318 = 1.625732422f;
  81
  82 /*@}*/
  83
  84 /**
  85 Virtual function type for wavelet transform in 1-D
  86 */
  87 typedef void (*DWT1DFN)(dwt_t* v);
  88
  89 /** @name Local static functions */
  90 /*@{*/
  91
  92 /**
  93 Forward lazy transform (horizontal)
  94 */
  95 static void dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas);
  96 /**
  97 Forward lazy transform (vertical)
  98 */
  99 static void dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas);
 100 /**
 101 Inverse lazy transform (horizontal)
 102 */
 103 static void dwt_interleave_h(dwt_t* h, OPJ_INT32 *a);
 104 /**
 105 Inverse lazy transform (vertical)
 106 */
 107 static void dwt_interleave_v(dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x);
 108 /**
 109 Forward 5-3 wavelet transform in 1-D
 110 */
 111 static void dwt_encode_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas);
 112 /**
 113 Inverse 5-3 wavelet transform in 1-D
 114 */
 115 static void dwt_decode_1(dwt_t *v);
 116 /**
 117 Forward 9-7 wavelet transform in 1-D
 118 */
 119 static void dwt_encode_1_real(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas);
 120 /**
 121 Explicit calculation of the Quantization Stepsizes
 122 */
 123 static void dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps, opj_stepsize_t *bandno_stepsize);
 124 /**
 125 Inverse wavelet transform in 2-D.
 126 */
 127 static bool dwt_decode_tile(opj_tcd_tilecomp_t* tilec, OPJ_UINT32 i, DWT1DFN fn);
 128
 129 static OPJ_UINT32 dwt_max_resolution(opj_tcd_resolution_t* restrict r, OPJ_UINT32 i);
 130
 131 static INLINE bool dwt_encode_procedure(opj_tcd_tilecomp_t * tilec,void (*p_function)(OPJ_INT32 *, OPJ_INT32,OPJ_INT32,OPJ_INT32) );
 132 /*@}*/
 133
 134 /*@}*/
 135
 136 #define S(i) a[(i)*2]
 137 #define D(i) a[(1+(i)*2)]
 138 #define S_(i) ((i)<0?S(0):((i)>=sn?S(sn-1):S(i)))
 139 #define D_(i) ((i)<0?D(0):((i)>=dn?D(dn-1):D(i)))
 140 /* new */
 141 #define SS_(i) ((i)<0?S(0):((i)>=dn?S(dn-1):S(i)))
 142 #define DD_(i) ((i)<0?D(0):((i)>=sn?D(sn-1):D(i)))
 143
 144 /* <summary>                                                              */
 145 /* This table contains the norms of the 5-3 wavelets for different bands. */
 146 /* </summary>                                                             */
 147 static const OPJ_FLOAT64 dwt_norms[4][10] = {
 148         {1.000, 1.500, 2.750, 5.375, 10.68, 21.34, 42.67, 85.33, 170.7, 341.3},
 149         {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9},
 150         {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9},
 151         {.7186, .9218, 1.586, 3.043, 6.019, 12.01, 24.00, 47.97, 95.93}
 152 };
 153
 154 /* <summary>                                                              */
 155 /* This table contains the norms of the 9-7 wavelets for different bands. */
 156 /* </summary>                                                             */
 157 static const OPJ_FLOAT64 dwt_norms_real[4][10] = {
 158         {1.000, 1.965, 4.177, 8.403, 16.90, 33.84, 67.69, 135.3, 270.6, 540.9},
 159         {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0},
 160         {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0},
 161         {2.080, 3.865, 8.307, 17.18, 34.71, 69.59, 139.3, 278.6, 557.2}
 162 };
 163
 164 /*
 165 ==========================================================
 166    local functions
 167 ==========================================================
 168 */
 169
 170 /* <summary>                                     */
 171 /* Forward lazy transform (horizontal).  */
 172 /* </summary>                            */
 173 static void dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas) {
 174         OPJ_INT32 i;
 175
 176         OPJ_INT32 * l_dest = b;
 177         OPJ_INT32 * l_src = a+cas;
 178     for
 179                 (i=0; i<sn; ++i)
 180         {
 181                 *l_dest++ = *l_src;
 182                 l_src += 2;
 183         }
 184         l_dest = b + sn;
 185         l_src = a + 1 - cas;
 186     for
 187                 (i=0; i<dn; ++i)
 188         {
 189                 *l_dest++=*l_src;
 190                 l_src += 2;
 191         }
 192 }
 193
 194 /* <summary>                             */
 195 /* Forward lazy transform (vertical).    */
 196 /* </summary>                            */
 197 static void dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas) {
 198     OPJ_INT32 i = sn;
 199         OPJ_INT32 * l_dest = b;
 200         OPJ_INT32 * l_src = a+cas;
 201
 202     while
 203                 (i--)
 204         {
 205                 *l_dest = *l_src;
 206                 l_dest += x;
 207                 l_src += 2;
 208                 /* b[i*x]=a[2*i+cas]; */
 209         }
 210         l_dest = b + sn * x;
 211         l_src = a + 1 - cas;
 212
 213         i = dn;
 214     while
 215                 (i--)
 216         {
 217                 *l_dest = *l_src;
 218                 l_dest += x;
 219                 l_src += 2;
 220                 /*b[(sn+i)*x]=a[(2*i+1-cas)];*/
 221         }
 222 }
 223
 224 /* <summary>                             */
 225 /* Inverse lazy transform (horizontal).  */
 226 /* </summary>                            */
 227 static void dwt_interleave_h(dwt_t* h, OPJ_INT32 *a) {
 228     OPJ_INT32 *ai = a;
 229     OPJ_INT32 *bi = h->mem + h->cas;
 230     OPJ_INT32  i        = h->sn;
 231     while
 232                 ( i-- )
 233         {
 234                 *bi = *(ai++);
 235                 bi += 2;
 236     }
 237     ai  = a + h->sn;
 238     bi  = h->mem + 1 - h->cas;
 239     i   = h->dn ;
 240     while
 241                 ( i-- )
 242         {
 243                 *bi = *(ai++);
 244                 bi += 2;
 245     }
 246 }
 247
 248 /* <summary>                             */
 249 /* Inverse lazy transform (vertical).    */
 250 /* </summary>                            */
 251 static void dwt_interleave_v(dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x) {
 252     OPJ_INT32 *ai = a;
 253     OPJ_INT32 *bi = v->mem + v->cas;
 254     OPJ_INT32  i = v->sn;
 255     while( i-- ) {
 256       *bi = *ai;
 257           bi += 2;
 258           ai += x;
 259     }
 260     ai = a + (v->sn * x);
 261     bi = v->mem + 1 - v->cas;
 262     i = v->dn ;
 263     while( i-- ) {
 264       *bi = *ai;
 265           bi += 2;
 266           ai += x;
 267     }
 268 }
 269
 270
 271 /* <summary>                            */
 272 /* Forward 5-3 wavelet transform in 1-D. */
 273 /* </summary>                           */
 274 static void dwt_encode_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas) {
 275         OPJ_INT32 i;
 276
 277         if (!cas) {
 278                 if ((dn > 0) || (sn > 1)) {     /* NEW :  CASE ONE ELEMENT */
 279                         for (i = 0; i < dn; i++) D(i) -= (S_(i) + S_(i + 1)) >> 1;
 280                         for (i = 0; i < sn; i++) S(i) += (D_(i - 1) + D_(i) + 2) >> 2;
 281                 }
 282         } else {
 283                 if (!sn && dn == 1)                 /* NEW :  CASE ONE ELEMENT */
 284                         S(0) *= 2;
 285                 else {
 286                         for (i = 0; i < dn; i++) S(i) -= (DD_(i) + DD_(i - 1)) >> 1;
 287                         for (i = 0; i < sn; i++) D(i) += (SS_(i) + SS_(i + 1) + 2) >> 2;
 288                 }
 289         }
 290 }
 291
 292 /* <summary>                            */
 293 /* Inverse 5-3 wavelet transform in 1-D. */
 294 /* </summary>                           */
 295 static void dwt_decode_1_(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas) {
 296         OPJ_INT32 i;
 297
 298         if (!cas) {
 299                 if ((dn > 0) || (sn > 1)) { /* NEW :  CASE ONE ELEMENT */
 300                         for (i = 0; i < sn; i++) S(i) -= (D_(i - 1) + D_(i) + 2) >> 2;
 301                         for (i = 0; i < dn; i++) D(i) += (S_(i) + S_(i + 1)) >> 1;
 302                 }
 303         } else {
 304                 if (!sn  && dn == 1)          /* NEW :  CASE ONE ELEMENT */
 305                         S(0) /= 2;
 306                 else {
 307                         for (i = 0; i < sn; i++) D(i) -= (SS_(i) + SS_(i + 1) + 2) >> 2;
 308                         for (i = 0; i < dn; i++) S(i) += (DD_(i) + DD_(i - 1)) >> 1;
 309                 }
 310         }
 311 }
 312
 313 /* <summary>                            */
 314 /* Inverse 5-3 wavelet transform in 1-D. */
 315 /* </summary>                           */
 316 static void dwt_decode_1(dwt_t *v) {
 317         dwt_decode_1_(v->mem, v->dn, v->sn, v->cas);
 318 }
 319
 320 /* <summary>                             */
 321 /* Forward 9-7 wavelet transform in 1-D. */
 322 /* </summary>                            */
 323 static void dwt_encode_1_real(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas) {
 324         OPJ_INT32 i;
 325         if (!cas) {
 326                 if ((dn > 0) || (sn > 1)) {     /* NEW :  CASE ONE ELEMENT */
 327                         for (i = 0; i < dn; i++)
 328                                 D(i) -= fix_mul(S_(i) + S_(i + 1), 12993);
 329                         for (i = 0; i < sn; i++)
 330                                 S(i) -= fix_mul(D_(i - 1) + D_(i), 434);
 331                         for (i = 0; i < dn; i++)
 332                                 D(i) += fix_mul(S_(i) + S_(i + 1), 7233);
 333                         for (i = 0; i < sn; i++)
 334                                 S(i) += fix_mul(D_(i - 1) + D_(i), 3633);
 335                         for (i = 0; i < dn; i++)
 336                                 D(i) = fix_mul(D(i), 5038);     /*5038 */
 337                         for (i = 0; i < sn; i++)
 338                                 S(i) = fix_mul(S(i), 6659);     /*6660 */
 339                 }
 340         } else {
 341                 if ((sn > 0) || (dn > 1)) {     /* NEW :  CASE ONE ELEMENT */
 342                         for (i = 0; i < dn; i++)
 343                                 S(i) -= fix_mul(DD_(i) + DD_(i - 1), 12993);
 344                         for (i = 0; i < sn; i++)
 345                                 D(i) -= fix_mul(SS_(i) + SS_(i + 1), 434);
 346                         for (i = 0; i < dn; i++)
 347                                 S(i) += fix_mul(DD_(i) + DD_(i - 1), 7233);
 348                         for (i = 0; i < sn; i++)
 349                                 D(i) += fix_mul(SS_(i) + SS_(i + 1), 3633);
 350                         for (i = 0; i < dn; i++)
 351                                 S(i) = fix_mul(S(i), 5038);     /*5038 */
 352                         for (i = 0; i < sn; i++)
 353                                 D(i) = fix_mul(D(i), 6659);     /*6660 */
 354                 }
 355         }
 356 }
 357
 358 static void dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps, opj_stepsize_t *bandno_stepsize) {
 359         OPJ_INT32 p, n;
 360         p = int_floorlog2(stepsize) - 13;
 361         n = 11 - int_floorlog2(stepsize);
 362         bandno_stepsize->mant = (n < 0 ? stepsize >> -n : stepsize << n) & 0x7ff;
 363         bandno_stepsize->expn = numbps - p;
 364 }
 365
 366 /*
 367 ==========================================================
 368    DWT interface
 369 ==========================================================
 370 */
 371
 372 /* <summary>                            */
 373 /* Forward 5-3 wavelet transform in 2-D. */
 374 /* </summary>                           */
 375 INLINE bool dwt_encode_procedure(opj_tcd_tilecomp_t * tilec,void (*p_function)(OPJ_INT32 *, OPJ_INT32,OPJ_INT32,OPJ_INT32) )
 376 {
 377         OPJ_INT32 i, j, k;
 378         OPJ_INT32 *a = 00;
 379         OPJ_INT32 *aj = 00;
 380         OPJ_INT32 *bj = 00;
 381         OPJ_INT32 w, l;
 382
 383         OPJ_INT32 rw;                   /* width of the resolution level computed   */
 384         OPJ_INT32 rh;                   /* height of the resolution level computed  */
 385         OPJ_INT32 l_data_size;
 386
 387         opj_tcd_resolution_t * l_cur_res = 0;
 388         opj_tcd_resolution_t * l_last_res = 0;
 389
 390         w = tilec->x1-tilec->x0;
 391         l = tilec->numresolutions-1;
 392         a = tilec->data;
 393
 394         l_cur_res = tilec->resolutions + l;
 395         l_last_res = l_cur_res - 1;
 396
 397         rw = l_cur_res->x1 - l_cur_res->x0;
 398         rh = l_cur_res->y1 - l_cur_res->y0;
 399
 400         l_data_size = dwt_max_resolution( tilec->resolutions,tilec->numresolutions) * sizeof(OPJ_INT32);
 401         bj = opj_malloc(l_data_size);
 402         if
 403                 (! bj)
 404         {
 405                 return false;
 406         }
 407         i = l;
 408
 409         while
 410                 (i--)
 411         {
 412                 OPJ_INT32 rw1;          /* width of the resolution level once lower than computed one                                       */
 413                 OPJ_INT32 rh1;          /* height of the resolution level once lower than computed one                                      */
 414                 OPJ_INT32 cas_col;      /* 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering */
 415                 OPJ_INT32 cas_row;      /* 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering   */
 416                 OPJ_INT32 dn, sn;
 417
 418                 rw  = l_cur_res->x1 - l_cur_res->x0;
 419                 rh  = l_cur_res->y1 - l_cur_res->y0;
 420                 rw1 = l_last_res->x1 - l_last_res->x0;
 421                 rh1 = l_last_res->y1 - l_last_res->y0;
 422
 423                 cas_row = l_cur_res->x0 & 1;
 424                 cas_col = l_cur_res->y0 & 1;
 425
 426                 sn = rh1;
 427                 dn = rh - rh1;
 428                 for
 429                         (j = 0; j < rw; ++j)
 430                 {
 431                         aj = a + j;
 432                         for
 433                                 (k = 0; k < rh; ++k)
 434                         {
 435                                 bj[k] = aj[k*w];
 436                         }
 437                         (*p_function) (bj, dn, sn, cas_col);
 438                         dwt_deinterleave_v(bj, aj, dn, sn, w, cas_col);
 439                 }
 440                 sn = rw1;
 441                 dn = rw - rw1;
 442                 for (j = 0; j < rh; j++)
 443                 {
 444                         aj = a + j * w;
 445                         for (k = 0; k < rw; k++)  bj[k] = aj[k];
 446                         (*p_function) (bj, dn, sn, cas_row);
 447                         dwt_deinterleave_h(bj, aj, dn, sn, cas_row);
 448                 }
 449                 l_cur_res = l_last_res;
 450                 --l_last_res;
 451         }
 452         opj_free(bj);
 453         return true;
 454 }
 455 /* Forward 5-3 wavelet transform in 2-D. */
 456 /* </summary>                           */
 457 bool dwt_encode(opj_tcd_tilecomp_t * tilec)
 458 {
 459         return dwt_encode_procedure(tilec,dwt_encode_1);
 460 }
 461
 462 /* <summary>                            */
 463 /* Inverse 5-3 wavelet transform in 2-D. */
 464 /* </summary>                           */
 465 bool dwt_decode(opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres) {
 466         return dwt_decode_tile(tilec, numres, &dwt_decode_1);
 467 }
 468
 469
 470 /* <summary>                          */
 471 /* Get gain of 5-3 wavelet transform. */
 472 /* </summary>                         */
 473 OPJ_UINT32 dwt_getgain(OPJ_UINT32 orient) {
 474         if (orient == 0)
 475                 return 0;
 476         if (orient == 1 || orient == 2)
 477                 return 1;
 478         return 2;
 479 }
 480
 481 /* <summary>                */
 482 /* Get norm of 5-3 wavelet. */
 483 /* </summary>               */
 484 OPJ_FLOAT64 dwt_getnorm(OPJ_UINT32 level, OPJ_UINT32 orient) {
 485         return dwt_norms[orient][level];
 486 }
 487
 488 /* <summary>                             */
 489 /* Forward 9-7 wavelet transform in 2-D. */
 490 /* </summary>                            */
 491 bool dwt_encode_real(opj_tcd_tilecomp_t * tilec)
 492 {
 493         return dwt_encode_procedure(tilec,dwt_encode_1_real);
 494 }
 495
 496
 497
 498 /* <summary>                          */
 499 /* Get gain of 9-7 wavelet transform. */
 500 /* </summary>                         */
 501 OPJ_UINT32 dwt_getgain_real(OPJ_UINT32 orient) {
 502         (void)orient;
 503         return 0;
 504 }
 505
 506 /* <summary>                */
 507 /* Get norm of 9-7 wavelet. */
 508 /* </summary>               */
 509 OPJ_FLOAT64 dwt_getnorm_real(OPJ_UINT32 level, OPJ_UINT32 orient) {
 510         return dwt_norms_real[orient][level];
 511 }
 512
 513 void dwt_calc_explicit_stepsizes(opj_tccp_t * tccp, OPJ_UINT32 prec) {
 514         OPJ_UINT32 numbands, bandno;
 515         numbands = 3 * tccp->numresolutions - 2;
 516         for (bandno = 0; bandno < numbands; bandno++) {
 517                 OPJ_FLOAT64 stepsize;
 518                 OPJ_UINT32 resno, level, orient, gain;
 519
 520                 resno = (bandno == 0) ? 0 : ((bandno - 1) / 3 + 1);
 521                 orient = (bandno == 0) ? 0 : ((bandno - 1) % 3 + 1);
 522                 level = tccp->numresolutions - 1 - resno;
 523                 gain = (tccp->qmfbid == 0) ? 0 : ((orient == 0) ? 0 : (((orient == 1) || (orient == 2)) ? 1 : 2));
 524                 if (tccp->qntsty == J2K_CCP_QNTSTY_NOQNT) {
 525                         stepsize = 1.0;
 526                 } else {
 527                         OPJ_FLOAT64 norm = dwt_norms_real[orient][level];
 528                         stepsize = (1 << (gain)) / norm;
 529                 }
 530                 dwt_encode_stepsize((OPJ_INT32) floor(stepsize * 8192.0), prec + gain, &tccp->stepsizes[bandno]);
 531         }
 532 }
 533
 534
 535 /* <summary>                             */
 536 /* Determine maximum computed resolution level for inverse wavelet transform */
 537 /* </summary>                            */
 538 static OPJ_UINT32 dwt_max_resolution(opj_tcd_resolution_t* restrict r, OPJ_UINT32 i) {
 539         OPJ_UINT32 mr   = 0;
 540         OPJ_UINT32 w;
 541         while( --i ) {
 542                 ++r;
 543                 if( mr < ( w = r->x1 - r->x0 ) )
 544                         mr = w ;
 545                 if( mr < ( w = r->y1 - r->y0 ) )
 546                         mr = w ;
 547         }
 548         return mr ;
 549 }
 550
 551
 552 /* <summary>                            */
 553 /* Inverse wavelet transform in 2-D.     */
 554 /* </summary>                           */
 555 static bool dwt_decode_tile(opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres, DWT1DFN dwt_1D) {
 556         dwt_t h;
 557         dwt_t v;
 558
 559         opj_tcd_resolution_t* tr = tilec->resolutions;
 560
 561         OPJ_UINT32 rw = tr->x1 - tr->x0;        /* width of the resolution level computed */
 562         OPJ_UINT32 rh = tr->y1 - tr->y0;        /* height of the resolution level computed */
 563
 564         OPJ_UINT32 w = tilec->x1 - tilec->x0;
 565
 566         h.mem = opj_aligned_malloc(dwt_max_resolution(tr, numres) * sizeof(OPJ_INT32));
 567         if
 568                 (! h.mem)
 569         {
 570                 return false;
 571         }
 572
 573         v.mem = h.mem;
 574
 575         while( --numres) {
 576                 OPJ_INT32 * restrict tiledp = tilec->data;
 577                 OPJ_UINT32 j;
 578
 579                 ++tr;
 580                 h.sn = rw;
 581                 v.sn = rh;
 582
 583                 rw = tr->x1 - tr->x0;
 584                 rh = tr->y1 - tr->y0;
 585
 586                 h.dn = rw - h.sn;
 587                 h.cas = tr->x0 % 2;
 588
 589                 for(j = 0; j < rh; ++j) {
 590                         dwt_interleave_h(&h, &tiledp[j*w]);
 591                         (dwt_1D)(&h);
 592                         memcpy(&tiledp[j*w], h.mem, rw * sizeof(OPJ_INT32));
 593                 }
 594
 595                 v.dn = rh - v.sn;
 596                 v.cas = tr->y0 % 2;
 597
 598                 for(j = 0; j < rw; ++j){
 599                         OPJ_UINT32 k;
 600                         dwt_interleave_v(&v, &tiledp[j], w);
 601                         (dwt_1D)(&v);
 602                         for(k = 0; k < rh; ++k) {
 603                                 tiledp[k * w + j] = v.mem[k];
 604                         }
 605                 }
 606         }
 607         opj_aligned_free(h.mem);
 608         return true;
 609 }
 610
 611 static void v4dwt_interleave_h(v4dwt_t* restrict w, OPJ_FLOAT32* restrict a, OPJ_INT32 x, OPJ_INT32 size){
 612         OPJ_FLOAT32* restrict bi = (OPJ_FLOAT32*) (w->wavelet + w->cas);
 613         OPJ_INT32 count = w->sn;
 614         OPJ_INT32 i, k;
 615         for(k = 0; k < 2; ++k){
 616                 for(i = 0; i < count; ++i){
 617                         OPJ_INT32 j = i;
 618                         bi[i*8    ] = a[j];
 619                         j += x;
 620                         if(j >= size) continue;
 621                         bi[i*8 + 1] = a[j];
 622                         j += x;
 623                         if(j >= size) continue;
 624                         bi[i*8 + 2] = a[j];
 625                         j += x;
 626                         if(j >= size) continue;
 627                         bi[i*8 + 3] = a[j];
 628                 }
 629                 bi = (OPJ_FLOAT32*) (w->wavelet + 1 - w->cas);
 630                 a += w->sn;
 631                 size -= w->sn;
 632                 count = w->dn;
 633         }
 634 }
 635
 636 static void v4dwt_interleave_v(v4dwt_t* restrict v , OPJ_FLOAT32* restrict a , OPJ_INT32 x){
 637         v4* restrict bi = v->wavelet + v->cas;
 638         OPJ_INT32 i;
 639         for(i = 0; i < v->sn; ++i){
 640                 memcpy(&bi[i*2], &a[i*x], 4 * sizeof(OPJ_FLOAT32));
 641         }
 642         a += v->sn * x;
 643         bi = v->wavelet + 1 - v->cas;
 644         for(i = 0; i < v->dn; ++i){
 645                 memcpy(&bi[i*2], &a[i*x], 4 * sizeof(OPJ_FLOAT32));
 646         }
 647 }
 648
 649 #ifdef __SSE__
 650
 651 static void v4dwt_decode_step1_sse(v4* w, OPJ_INT32 count, const __m128 c){
 652         __m128* restrict vw = (__m128*) w;
 653         OPJ_INT32 i;
 654         for(i = 0; i < count; ++i){
 655                 __m128 tmp = vw[i*2];
 656                 vw[i*2] = tmp * c;
 657         }
 658 }
 659
 660 static void v4dwt_decode_step2_sse(v4* l, v4* w, OPJ_INT32 k, OPJ_INT32 m, __m128 c){
 661         __m128* restrict vl = (__m128*) l;
 662         __m128* restrict vw = (__m128*) w;
 663         OPJ_INT32 i;
 664         for(i = 0; i < m; ++i){
 665                 __m128 tmp1 = vl[ 0];
 666                 __m128 tmp2 = vw[-1];
 667                 __m128 tmp3 = vw[ 0];
 668                 vw[-1] = tmp2 + ((tmp1 + tmp3) * c);
 669                 vl = vw;
 670                 vw += 2;
 671         }
 672         if(m >= k){
 673                 return;
 674         }
 675         c += c;
 676         c *= vl[0];
 677         for(; m < k; ++m){
 678                 __m128 tmp = vw[-1];
 679                 vw[-1] = tmp + c;
 680                 vw += 2;
 681         }
 682 }
 683
 684 #else
 685
 686 static void v4dwt_decode_step1(v4* w, OPJ_INT32 count, const OPJ_FLOAT32 c){
 687         OPJ_FLOAT32* restrict fw = (OPJ_FLOAT32*) w;
 688         OPJ_INT32 i;
 689         for(i = 0; i < count; ++i){
 690                 OPJ_FLOAT32 tmp1 = fw[i*8    ];
 691                 OPJ_FLOAT32 tmp2 = fw[i*8 + 1];
 692                 OPJ_FLOAT32 tmp3 = fw[i*8 + 2];
 693                 OPJ_FLOAT32 tmp4 = fw[i*8 + 3];
 694                 fw[i*8    ] = tmp1 * c;
 695                 fw[i*8 + 1] = tmp2 * c;
 696                 fw[i*8 + 2] = tmp3 * c;
 697                 fw[i*8 + 3] = tmp4 * c;
 698         }
 699 }
 700
 701 static void v4dwt_decode_step2(v4* l, v4* w, OPJ_INT32 k, OPJ_INT32 m, OPJ_FLOAT32 c){
 702         OPJ_FLOAT32* restrict fl = (OPJ_FLOAT32*) l;
 703         OPJ_FLOAT32* restrict fw = (OPJ_FLOAT32*) w;
 704         OPJ_INT32 i;
 705         for(i = 0; i < m; ++i){
 706                 OPJ_FLOAT32 tmp1_1 = fl[0];
 707                 OPJ_FLOAT32 tmp1_2 = fl[1];
 708                 OPJ_FLOAT32 tmp1_3 = fl[2];
 709                 OPJ_FLOAT32 tmp1_4 = fl[3];
 710                 OPJ_FLOAT32 tmp2_1 = fw[-4];
 711                 OPJ_FLOAT32 tmp2_2 = fw[-3];
 712                 OPJ_FLOAT32 tmp2_3 = fw[-2];
 713                 OPJ_FLOAT32 tmp2_4 = fw[-1];
 714                 OPJ_FLOAT32 tmp3_1 = fw[0];
 715                 OPJ_FLOAT32 tmp3_2 = fw[1];
 716                 OPJ_FLOAT32 tmp3_3 = fw[2];
 717                 OPJ_FLOAT32 tmp3_4 = fw[3];
 718                 fw[-4] = tmp2_1 + ((tmp1_1 + tmp3_1) * c);
 719                 fw[-3] = tmp2_2 + ((tmp1_2 + tmp3_2) * c);
 720                 fw[-2] = tmp2_3 + ((tmp1_3 + tmp3_3) * c);
 721                 fw[-1] = tmp2_4 + ((tmp1_4 + tmp3_4) * c);
 722                 fl = fw;
 723                 fw += 8;
 724         }
 725         if(m < k){
 726                 OPJ_FLOAT32 c1;
 727                 OPJ_FLOAT32 c2;
 728                 OPJ_FLOAT32 c3;
 729                 OPJ_FLOAT32 c4;
 730                 c += c;
 731                 c1 = fl[0] * c;
 732                 c2 = fl[1] * c;
 733                 c3 = fl[2] * c;
 734                 c4 = fl[3] * c;
 735                 for(; m < k; ++m){
 736                         OPJ_FLOAT32 tmp1 = fw[-4];
 737                         OPJ_FLOAT32 tmp2 = fw[-3];
 738                         OPJ_FLOAT32 tmp3 = fw[-2];
 739                         OPJ_FLOAT32 tmp4 = fw[-1];
 740                         fw[-4] = tmp1 + c1;
 741                         fw[-3] = tmp2 + c2;
 742                         fw[-2] = tmp3 + c3;
 743                         fw[-1] = tmp4 + c4;
 744                         fw += 8;
 745                 }
 746         }
 747 }
 748
 749 #endif
 750
 751 /* <summary>                             */
 752 /* Inverse 9-7 wavelet transform in 1-D. */
 753 /* </summary>                            */
 754 static void v4dwt_decode(v4dwt_t* restrict dwt){
 755         OPJ_INT32 a, b;
 756         if(dwt->cas == 0) {
 757                 if(!((dwt->dn > 0) || (dwt->sn > 1))){
 758                         return;
 759                 }
 760                 a = 0;
 761                 b = 1;
 762         }else{
 763                 if(!((dwt->sn > 0) || (dwt->dn > 1))) {
 764                         return;
 765                 }
 766                 a = 1;
 767                 b = 0;
 768         }
 769 #ifdef __SSE__
 770         v4dwt_decode_step1_sse(dwt->wavelet+a, dwt->sn, _mm_set1_ps(K));
 771         v4dwt_decode_step1_sse(dwt->wavelet+b, dwt->dn, _mm_set1_ps(c13318));
 772         v4dwt_decode_step2_sse(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), _mm_set1_ps(delta));
 773         v4dwt_decode_step2_sse(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), _mm_set1_ps(dwt_gamma));
 774         v4dwt_decode_step2_sse(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), _mm_set1_ps(dwt_beta));
 775         v4dwt_decode_step2_sse(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), _mm_set1_ps(dwt_alpha));
 776 #else
 777         v4dwt_decode_step1(dwt->wavelet+a, dwt->sn, K);
 778         v4dwt_decode_step1(dwt->wavelet+b, dwt->dn, c13318);
 779         v4dwt_decode_step2(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), delta);
 780         v4dwt_decode_step2(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), dwt_gamma);
 781         v4dwt_decode_step2(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), dwt_beta);
 782         v4dwt_decode_step2(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), dwt_alpha);
 783 #endif
 784 }
 785
 786 /* <summary>                             */
 787 /* Inverse 9-7 wavelet transform in 2-D. */
 788 /* </summary>                            */
 789 bool dwt_decode_real(opj_tcd_tilecomp_t* restrict tilec, OPJ_UINT32 numres){
 790         v4dwt_t h;
 791         v4dwt_t v;
 792
 793         opj_tcd_resolution_t* res = tilec->resolutions;
 794
 795         OPJ_UINT32 rw = res->x1 - res->x0;      /* width of the resolution level computed */
 796         OPJ_UINT32 rh = res->y1 - res->y0;      /* height of the resolution level computed */
 797
 798         OPJ_UINT32 w = tilec->x1 - tilec->x0;
 799
 800         h.wavelet = (v4*) opj_aligned_malloc((dwt_max_resolution(res, numres)+5) * sizeof(v4));
 801         v.wavelet = h.wavelet;
 802
 803         while( --numres) {
 804                 OPJ_FLOAT32 * restrict aj = (OPJ_FLOAT32*) tilec->data;
 805                 OPJ_UINT32 bufsize = (tilec->x1 - tilec->x0) * (tilec->y1 - tilec->y0);
 806                 OPJ_INT32 j;
 807
 808                 h.sn = rw;
 809                 v.sn = rh;
 810
 811                 ++res;
 812
 813                 rw = res->x1 - res->x0; /* width of the resolution level computed */
 814                 rh = res->y1 - res->y0; /* height of the resolution level computed */
 815
 816                 h.dn = rw - h.sn;
 817                 h.cas = res->x0 & 1;
 818
 819                 for(j = rh; j > 0; j -= 4){
 820                         v4dwt_interleave_h(&h, aj, w, bufsize);
 821                         v4dwt_decode(&h);
 822                         if(j >= 4){
 823                                 OPJ_INT32 k = rw;
 824                                 while
 825                                         (--k >= 0)
 826                                 {
 827                                         aj[k    ] = h.wavelet[k].f[0];
 828                                         aj[k+w  ] = h.wavelet[k].f[1];
 829                                         aj[k+w*2] = h.wavelet[k].f[2];
 830                                         aj[k+w*3] = h.wavelet[k].f[3];
 831                                 }
 832                         }else{
 833                                 OPJ_INT32 k = rw;
 834                                 while
 835                                         (--k >= 0)
 836                                 {
 837                                         switch(j) {
 838                                                 case 3: aj[k+w*2] = h.wavelet[k].f[2];
 839                                                 case 2: aj[k+w  ] = h.wavelet[k].f[1];
 840                                                 case 1: aj[k    ] = h.wavelet[k].f[0];
 841                                         }
 842                                 }
 843                         }
 844                         aj += w*4;
 845                         bufsize -= w*4;
 846                 }
 847
 848                 v.dn = rh - v.sn;
 849                 v.cas = res->y0 % 2;
 850
 851                 aj = (OPJ_FLOAT32*) tilec->data;
 852                 for(j = rw; j > 0; j -= 4){
 853                         v4dwt_interleave_v(&v, aj, w);
 854                         v4dwt_decode(&v);
 855                         if(j >= 4){
 856                                 OPJ_UINT32 k;
 857                                 for(k = 0; k < rh; ++k){
 858                                         memcpy(&aj[k*w], &v.wavelet[k], 4 * sizeof(OPJ_FLOAT32));
 859                                 }
 860                         }else{
 861                                 OPJ_UINT32 k;
 862                                 for(k = 0; k < rh; ++k){
 863                                         memcpy(&aj[k*w], &v.wavelet[k], j * sizeof(OPJ_FLOAT32));
 864                                 }
 865                         }
 866                         aj += 4;
 867                 }
 868         }
 869
 870         opj_aligned_free(h.wavelet);
 871         return true;
 872 }
 873