src/lib/openjp2/dwt.c

   1 /*
   2  * The copyright in this software is being made available under the 2-clauses
   3  * BSD License, included below. This software may be subject to other third
   4  * party and contributor rights, including patent rights, and no such rights
   5  * are granted under this license.
   6  *
   7  * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium
   8  * Copyright (c) 2002-2014, Professor Benoit Macq
   9  * Copyright (c) 2001-2003, David Janssens
  10  * Copyright (c) 2002-2003, Yannick Verschueren
  11  * Copyright (c) 2003-2007, Francois-Olivier Devaux
  12  * Copyright (c) 2003-2014, Antonin Descampe
  13  * Copyright (c) 2005, Herve Drolon, FreeImage Team
  14  * Copyright (c) 2007, Jonathan Ballard <dzonatas@dzonux.net>
  15  * Copyright (c) 2007, Callum Lerwick <seg@haxxed.com>
  16  * Copyright (c) 2017, IntoPIX SA <support@intopix.com>
  17  * All rights reserved.
  18  *
  19  * Redistribution and use in source and binary forms, with or without
  20  * modification, are permitted provided that the following conditions
  21  * are met:
  22  * 1. Redistributions of source code must retain the above copyright
  23  *    notice, this list of conditions and the following disclaimer.
  24  * 2. Redistributions in binary form must reproduce the above copyright
  25  *    notice, this list of conditions and the following disclaimer in the
  26  *    documentation and/or other materials provided with the distribution.
  27  *
  28  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS'
  29  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  32  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  33  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  34  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  35  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  36  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  37  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38  * POSSIBILITY OF SUCH DAMAGE.
  39  */
  40
  41 #include <assert.h>
  42
  43 #define OPJ_SKIP_POISON
  44 #include "opj_includes.h"
  45
  46 #ifdef __SSE__
  47 #include <xmmintrin.h>
  48 #endif
  49 #ifdef __SSE2__
  50 #include <emmintrin.h>
  51 #endif
  52 #ifdef __SSSE3__
  53 #include <tmmintrin.h>
  54 #endif
  55 #ifdef __AVX2__
  56 #include <immintrin.h>
  57 #endif
  58
  59 #if defined(__GNUC__)
  60 #pragma GCC poison malloc calloc realloc free
  61 #endif
  62
  63 /** @defgroup DWT DWT - Implementation of a discrete wavelet transform */
  64 /*@{*/
  65
  66 #define OPJ_WS(i) v->mem[(i)*2]
  67 #define OPJ_WD(i) v->mem[(1+(i)*2)]
  68
  69 #ifdef __AVX2__
  70 /** Number of int32 values in a AVX2 register */
  71 #define VREG_INT_COUNT       8
  72 #else
  73 /** Number of int32 values in a SSE2 register */
  74 #define VREG_INT_COUNT       4
  75 #endif
  76
  77 /** Number of columns that we can process in parallel in the vertical pass */
  78 #define PARALLEL_COLS_53     (2*VREG_INT_COUNT)
  79
  80 /** @name Local data structures */
  81 /*@{*/
  82
  83 typedef struct dwt_local {
  84     OPJ_INT32* mem;
  85     OPJ_INT32 dn;   /* number of elements in high pass band */
  86     OPJ_INT32 sn;   /* number of elements in low pass band */
  87     OPJ_INT32 cas;  /* 0 = start on even coord, 1 = start on odd coord */
  88 } opj_dwt_t;
  89
  90 #define NB_ELTS_V8  8
  91
  92 typedef union {
  93     OPJ_FLOAT32 f[NB_ELTS_V8];
  94 } opj_v8_t;
  95
  96 typedef struct v8dwt_local {
  97     opj_v8_t*   wavelet ;
  98     OPJ_INT32       dn ;  /* number of elements in high pass band */
  99     OPJ_INT32       sn ;  /* number of elements in low pass band */
 100     OPJ_INT32       cas ; /* 0 = start on even coord, 1 = start on odd coord */
 101     OPJ_UINT32      win_l_x0; /* start coord in low pass band */
 102     OPJ_UINT32      win_l_x1; /* end coord in low pass band */
 103     OPJ_UINT32      win_h_x0; /* start coord in high pass band */
 104     OPJ_UINT32      win_h_x1; /* end coord in high pass band */
 105 } opj_v8dwt_t ;
 106
 107 /* From table F.4 from the standard */
 108 static const OPJ_FLOAT32 opj_dwt_alpha =  -1.586134342f;
 109 static const OPJ_FLOAT32 opj_dwt_beta  =  -0.052980118f;
 110 static const OPJ_FLOAT32 opj_dwt_gamma = 0.882911075f;
 111 static const OPJ_FLOAT32 opj_dwt_delta = 0.443506852f;
 112
 113 static const OPJ_FLOAT32 opj_K      = 1.230174105f;
 114 static const OPJ_FLOAT32 opj_invK   = (OPJ_FLOAT32)(1.0 / 1.230174105);
 115
 116 /*@}*/
 117
 118 /** @name Local static functions */
 119 /*@{*/
 120
 121 /**
 122 Forward lazy transform (horizontal)
 123 */
 124 static void opj_dwt_deinterleave_h(const OPJ_INT32 * OPJ_RESTRICT a,
 125                                    OPJ_INT32 * OPJ_RESTRICT b,
 126                                    OPJ_INT32 dn,
 127                                    OPJ_INT32 sn, OPJ_INT32 cas);
 128 /**
 129 Forward lazy transform (vertical)
 130 */
 131 static void opj_dwt_deinterleave_v(const OPJ_INT32 * OPJ_RESTRICT a,
 132                                    OPJ_INT32 * OPJ_RESTRICT b,
 133                                    OPJ_INT32 dn,
 134                                    OPJ_INT32 sn, OPJ_UINT32 x, OPJ_INT32 cas);
 135 /**
 136 Forward 5-3 wavelet transform in 1-D
 137 */
 138 static void opj_dwt_encode_1(void *a, OPJ_INT32 dn, OPJ_INT32 sn,
 139                              OPJ_INT32 cas);
 140 /**
 141 Forward 9-7 wavelet transform in 1-D
 142 */
 143 static void opj_dwt_encode_1_real(void *a, OPJ_INT32 dn, OPJ_INT32 sn,
 144                                   OPJ_INT32 cas);
 145 /**
 146 Explicit calculation of the Quantization Stepsizes
 147 */
 148 static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps,
 149                                     opj_stepsize_t *bandno_stepsize);
 150 /**
 151 Inverse wavelet transform in 2-D.
 152 */
 153 static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp,
 154                                     opj_tcd_tilecomp_t* tilec, OPJ_UINT32 i);
 155
 156 static OPJ_BOOL opj_dwt_decode_partial_tile(
 157     opj_tcd_tilecomp_t* tilec,
 158     OPJ_UINT32 numres);
 159
 160 /* Forward transform, for the vertical pass, processing cols columns */
 161 /* where cols <= NB_ELTS_V8 */
 162 /* Where void* is a OPJ_INT32* for 5x3 and OPJ_FLOAT32* for 9x7 */
 163 typedef void (*opj_encode_and_deinterleave_v_fnptr_type)(
 164     void *array,
 165     void *tmp,
 166     OPJ_UINT32 height,
 167     OPJ_BOOL even,
 168     OPJ_UINT32 stride_width,
 169     OPJ_UINT32 cols);
 170
 171 /* Where void* is a OPJ_INT32* for 5x3 and OPJ_FLOAT32* for 9x7 */
 172 typedef void (*opj_encode_and_deinterleave_h_one_row_fnptr_type)(
 173     void *row,
 174     void *tmp,
 175     OPJ_UINT32 width,
 176     OPJ_BOOL even);
 177
 178 static OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp,
 179         opj_tcd_tilecomp_t * tilec,
 180         opj_encode_and_deinterleave_v_fnptr_type p_encode_and_deinterleave_v,
 181         opj_encode_and_deinterleave_h_one_row_fnptr_type
 182         p_encode_and_deinterleave_h_one_row);
 183
 184 static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r,
 185         OPJ_UINT32 i);
 186
 187 /* <summary>                             */
 188 /* Inverse 9-7 wavelet transform in 1-D. */
 189 /* </summary>                            */
 190
 191 /*@}*/
 192
 193 /*@}*/
 194
 195 #define OPJ_S(i) a[(i)*2]
 196 #define OPJ_D(i) a[(1+(i)*2)]
 197 #define OPJ_S_(i) ((i)<0?OPJ_S(0):((i)>=sn?OPJ_S(sn-1):OPJ_S(i)))
 198 #define OPJ_D_(i) ((i)<0?OPJ_D(0):((i)>=dn?OPJ_D(dn-1):OPJ_D(i)))
 199 /* new */
 200 #define OPJ_SS_(i) ((i)<0?OPJ_S(0):((i)>=dn?OPJ_S(dn-1):OPJ_S(i)))
 201 #define OPJ_DD_(i) ((i)<0?OPJ_D(0):((i)>=sn?OPJ_D(sn-1):OPJ_D(i)))
 202
 203 /* <summary>                                                              */
 204 /* This table contains the norms of the 5-3 wavelets for different bands. */
 205 /* </summary>                                                             */
 206 /* FIXME! the array should really be extended up to 33 resolution levels */
 207 /* See https://github.com/uclouvain/openjpeg/issues/493 */
 208 static const OPJ_FLOAT64 opj_dwt_norms[4][10] = {
 209     {1.000, 1.500, 2.750, 5.375, 10.68, 21.34, 42.67, 85.33, 170.7, 341.3},
 210     {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9},
 211     {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9},
 212     {.7186, .9218, 1.586, 3.043, 6.019, 12.01, 24.00, 47.97, 95.93}
 213 };
 214
 215 /* <summary>                                                              */
 216 /* This table contains the norms of the 9-7 wavelets for different bands. */
 217 /* </summary>                                                             */
 218 /* FIXME! the array should really be extended up to 33 resolution levels */
 219 /* See https://github.com/uclouvain/openjpeg/issues/493 */
 220 static const OPJ_FLOAT64 opj_dwt_norms_real[4][10] = {
 221     {1.000, 1.965, 4.177, 8.403, 16.90, 33.84, 67.69, 135.3, 270.6, 540.9},
 222     {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0},
 223     {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0},
 224     {2.080, 3.865, 8.307, 17.18, 34.71, 69.59, 139.3, 278.6, 557.2}
 225 };
 226
 227 /*
 228 ==========================================================
 229    local functions
 230 ==========================================================
 231 */
 232
 233 /* <summary>                             */
 234 /* Forward lazy transform (horizontal).  */
 235 /* </summary>                            */
 236 static void opj_dwt_deinterleave_h(const OPJ_INT32 * OPJ_RESTRICT a,
 237                                    OPJ_INT32 * OPJ_RESTRICT b,
 238                                    OPJ_INT32 dn,
 239                                    OPJ_INT32 sn, OPJ_INT32 cas)
 240 {
 241     OPJ_INT32 i;
 242     OPJ_INT32 * OPJ_RESTRICT l_dest = b;
 243     const OPJ_INT32 * OPJ_RESTRICT l_src = a + cas;
 244
 245     for (i = 0; i < sn; ++i) {
 246         *l_dest++ = *l_src;
 247         l_src += 2;
 248     }
 249
 250     l_dest = b + sn;
 251     l_src = a + 1 - cas;
 252
 253     for (i = 0; i < dn; ++i)  {
 254         *l_dest++ = *l_src;
 255         l_src += 2;
 256     }
 257 }
 258
 259 /* <summary>                             */
 260 /* Forward lazy transform (vertical).    */
 261 /* </summary>                            */
 262 static void opj_dwt_deinterleave_v(const OPJ_INT32 * OPJ_RESTRICT a,
 263                                    OPJ_INT32 * OPJ_RESTRICT b,
 264                                    OPJ_INT32 dn,
 265                                    OPJ_INT32 sn, OPJ_UINT32 x, OPJ_INT32 cas)
 266 {
 267     OPJ_INT32 i = sn;
 268     OPJ_INT32 * OPJ_RESTRICT l_dest = b;
 269     const OPJ_INT32 * OPJ_RESTRICT l_src = a + cas;
 270
 271     while (i--) {
 272         *l_dest = *l_src;
 273         l_dest += x;
 274         l_src += 2;
 275     } /* b[i*x]=a[2*i+cas]; */
 276
 277     l_dest = b + (OPJ_SIZE_T)sn * (OPJ_SIZE_T)x;
 278     l_src = a + 1 - cas;
 279
 280     i = dn;
 281     while (i--) {
 282         *l_dest = *l_src;
 283         l_dest += x;
 284         l_src += 2;
 285     } /*b[(sn+i)*x]=a[(2*i+1-cas)];*/
 286 }
 287
 288 #ifdef STANDARD_SLOW_VERSION
 289 /* <summary>                             */
 290 /* Inverse lazy transform (horizontal).  */
 291 /* </summary>                            */
 292 static void opj_dwt_interleave_h(const opj_dwt_t* h, OPJ_INT32 *a)
 293 {
 294     const OPJ_INT32 *ai = a;
 295     OPJ_INT32 *bi = h->mem + h->cas;
 296     OPJ_INT32  i    = h->sn;
 297     while (i--) {
 298         *bi = *(ai++);
 299         bi += 2;
 300     }
 301     ai  = a + h->sn;
 302     bi  = h->mem + 1 - h->cas;
 303     i   = h->dn ;
 304     while (i--) {
 305         *bi = *(ai++);
 306         bi += 2;
 307     }
 308 }
 309
 310 /* <summary>                             */
 311 /* Inverse lazy transform (vertical).    */
 312 /* </summary>                            */
 313 static void opj_dwt_interleave_v(const opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x)
 314 {
 315     const OPJ_INT32 *ai = a;
 316     OPJ_INT32 *bi = v->mem + v->cas;
 317     OPJ_INT32  i = v->sn;
 318     while (i--) {
 319         *bi = *ai;
 320         bi += 2;
 321         ai += x;
 322     }
 323     ai = a + (v->sn * (OPJ_SIZE_T)x);
 324     bi = v->mem + 1 - v->cas;
 325     i = v->dn ;
 326     while (i--) {
 327         *bi = *ai;
 328         bi += 2;
 329         ai += x;
 330     }
 331 }
 332
 333 #endif /* STANDARD_SLOW_VERSION */
 334
 335 /* <summary>                            */
 336 /* Forward 5-3 wavelet transform in 1-D. */
 337 /* </summary>                           */
 338 static void opj_dwt_encode_1(void *aIn, OPJ_INT32 dn, OPJ_INT32 sn,
 339                              OPJ_INT32 cas)
 340 {
 341     OPJ_INT32 i;
 342     OPJ_INT32* a = (OPJ_INT32*)aIn;
 343
 344     if (!cas) {
 345         if (sn + dn > 1) {
 346             for (i = 0; i < sn - 1; i++) {
 347                 OPJ_D(i) -= (OPJ_S(i) + OPJ_S(i + 1)) >> 1;
 348             }
 349             if (((sn + dn) % 2) == 0) {
 350                 OPJ_D(i) -= OPJ_S(i);
 351             }
 352             OPJ_S(0) += (OPJ_D(0) + OPJ_D(0) + 2) >> 2;
 353             for (i = 1; i < dn; i++) {
 354                 OPJ_S(i) += (OPJ_D(i - 1) + OPJ_D(i) + 2) >> 2;
 355             }
 356             if (((sn + dn) % 2) == 1) {
 357                 OPJ_S(i) += (OPJ_D(i - 1) + OPJ_D(i - 1) + 2) >> 2;
 358             }
 359         }
 360     } else {
 361         if (sn + dn == 1) {
 362             a[0] *= 2;
 363         } else {
 364             OPJ_S(0) -= OPJ_D(0);
 365             for (i = 1; i < sn; i++) {
 366                 OPJ_S(i) -= (OPJ_D(i) + OPJ_D(i - 1)) >> 1;
 367             }
 368             if (((sn + dn) % 2) == 1) {
 369                 OPJ_S(i) -= OPJ_D(i - 1);
 370             }
 371             for (i = 0; i < dn - 1; i++) {
 372                 OPJ_D(i) += (OPJ_S(i) + OPJ_S(i + 1) + 2) >> 2;
 373             }
 374             if (((sn + dn) % 2) == 0) {
 375                 OPJ_D(i) += (OPJ_S(i) + OPJ_S(i) + 2) >> 2;
 376             }
 377         }
 378     }
 379 }
 380
 381 #ifdef STANDARD_SLOW_VERSION
 382 /* <summary>                            */
 383 /* Inverse 5-3 wavelet transform in 1-D. */
 384 /* </summary>                           */
 385 static void opj_dwt_decode_1_(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
 386                               OPJ_INT32 cas)
 387 {
 388     OPJ_INT32 i;
 389
 390     if (!cas) {
 391         if ((dn > 0) || (sn > 1)) { /* NEW :  CASE ONE ELEMENT */
 392             for (i = 0; i < sn; i++) {
 393                 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
 394             }
 395             for (i = 0; i < dn; i++) {
 396                 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
 397             }
 398         }
 399     } else {
 400         if (!sn  && dn == 1) {        /* NEW :  CASE ONE ELEMENT */
 401             OPJ_S(0) /= 2;
 402         } else {
 403             for (i = 0; i < sn; i++) {
 404                 OPJ_D(i) -= (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2;
 405             }
 406             for (i = 0; i < dn; i++) {
 407                 OPJ_S(i) += (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1;
 408             }
 409         }
 410     }
 411 }
 412
 413 static void opj_dwt_decode_1(const opj_dwt_t *v)
 414 {
 415     opj_dwt_decode_1_(v->mem, v->dn, v->sn, v->cas);
 416 }
 417
 418 #endif /* STANDARD_SLOW_VERSION */
 419
 420 #if !defined(STANDARD_SLOW_VERSION)
 421 static void  opj_idwt53_h_cas0(OPJ_INT32* tmp,
 422                                const OPJ_INT32 sn,
 423                                const OPJ_INT32 len,
 424                                OPJ_INT32* tiledp)
 425 {
 426     OPJ_INT32 i, j;
 427     const OPJ_INT32* in_even = &tiledp[0];
 428     const OPJ_INT32* in_odd = &tiledp[sn];
 429
 430 #ifdef TWO_PASS_VERSION
 431     /* For documentation purpose: performs lifting in two iterations, */
 432     /* but without explicit interleaving */
 433
 434     assert(len > 1);
 435
 436     /* Even */
 437     tmp[0] = in_even[0] - ((in_odd[0] + 1) >> 1);
 438     for (i = 2, j = 0; i <= len - 2; i += 2, j++) {
 439         tmp[i] = in_even[j + 1] - ((in_odd[j] + in_odd[j + 1] + 2) >> 2);
 440     }
 441     if (len & 1) { /* if len is odd */
 442         tmp[len - 1] = in_even[(len - 1) / 2] - ((in_odd[(len - 2) / 2] + 1) >> 1);
 443     }
 444
 445     /* Odd */
 446     for (i = 1, j = 0; i < len - 1; i += 2, j++) {
 447         tmp[i] = in_odd[j] + ((tmp[i - 1] + tmp[i + 1]) >> 1);
 448     }
 449     if (!(len & 1)) { /* if len is even */
 450         tmp[len - 1] = in_odd[(len - 1) / 2] + tmp[len - 2];
 451     }
 452 #else
 453     OPJ_INT32 d1c, d1n, s1n, s0c, s0n;
 454
 455     assert(len > 1);
 456
 457     /* Improved version of the TWO_PASS_VERSION: */
 458     /* Performs lifting in one single iteration. Saves memory */
 459     /* accesses and explicit interleaving. */
 460     s1n = in_even[0];
 461     d1n = in_odd[0];
 462     s0n = s1n - ((d1n + 1) >> 1);
 463
 464     for (i = 0, j = 1; i < (len - 3); i += 2, j++) {
 465         d1c = d1n;
 466         s0c = s0n;
 467
 468         s1n = in_even[j];
 469         d1n = in_odd[j];
 470
 471         s0n = s1n - ((d1c + d1n + 2) >> 2);
 472
 473         tmp[i  ] = s0c;
 474         tmp[i + 1] = d1c + ((s0c + s0n) >> 1);
 475     }
 476
 477     tmp[i] = s0n;
 478
 479     if (len & 1) {
 480         tmp[len - 1] = in_even[(len - 1) / 2] - ((d1n + 1) >> 1);
 481         tmp[len - 2] = d1n + ((s0n + tmp[len - 1]) >> 1);
 482     } else {
 483         tmp[len - 1] = d1n + s0n;
 484     }
 485 #endif
 486     memcpy(tiledp, tmp, (OPJ_UINT32)len * sizeof(OPJ_INT32));
 487 }
 488
 489 static void  opj_idwt53_h_cas1(OPJ_INT32* tmp,
 490                                const OPJ_INT32 sn,
 491                                const OPJ_INT32 len,
 492                                OPJ_INT32* tiledp)
 493 {
 494     OPJ_INT32 i, j;
 495     const OPJ_INT32* in_even = &tiledp[sn];
 496     const OPJ_INT32* in_odd = &tiledp[0];
 497
 498 #ifdef TWO_PASS_VERSION
 499     /* For documentation purpose: performs lifting in two iterations, */
 500     /* but without explicit interleaving */
 501
 502     assert(len > 2);
 503
 504     /* Odd */
 505     for (i = 1, j = 0; i < len - 1; i += 2, j++) {
 506         tmp[i] = in_odd[j] - ((in_even[j] + in_even[j + 1] + 2) >> 2);
 507     }
 508     if (!(len & 1)) {
 509         tmp[len - 1] = in_odd[len / 2 - 1] - ((in_even[len / 2 - 1] + 1) >> 1);
 510     }
 511
 512     /* Even */
 513     tmp[0] = in_even[0] + tmp[1];
 514     for (i = 2, j = 1; i < len - 1; i += 2, j++) {
 515         tmp[i] = in_even[j] + ((tmp[i + 1] + tmp[i - 1]) >> 1);
 516     }
 517     if (len & 1) {
 518         tmp[len - 1] = in_even[len / 2] + tmp[len - 2];
 519     }
 520 #else
 521     OPJ_INT32 s1, s2, dc, dn;
 522
 523     assert(len > 2);
 524
 525     /* Improved version of the TWO_PASS_VERSION: */
 526     /* Performs lifting in one single iteration. Saves memory */
 527     /* accesses and explicit interleaving. */
 528
 529     s1 = in_even[1];
 530     dc = in_odd[0] - ((in_even[0] + s1 + 2) >> 2);
 531     tmp[0] = in_even[0] + dc;
 532
 533     for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
 534
 535         s2 = in_even[j + 1];
 536
 537         dn = in_odd[j] - ((s1 + s2 + 2) >> 2);
 538         tmp[i  ] = dc;
 539         tmp[i + 1] = s1 + ((dn + dc) >> 1);
 540
 541         dc = dn;
 542         s1 = s2;
 543     }
 544
 545     tmp[i] = dc;
 546
 547     if (!(len & 1)) {
 548         dn = in_odd[len / 2 - 1] - ((s1 + 1) >> 1);
 549         tmp[len - 2] = s1 + ((dn + dc) >> 1);
 550         tmp[len - 1] = dn;
 551     } else {
 552         tmp[len - 1] = s1 + dc;
 553     }
 554 #endif
 555     memcpy(tiledp, tmp, (OPJ_UINT32)len * sizeof(OPJ_INT32));
 556 }
 557
 558
 559 #endif /* !defined(STANDARD_SLOW_VERSION) */
 560
 561 /* <summary>                            */
 562 /* Inverse 5-3 wavelet transform in 1-D for one row. */
 563 /* </summary>                           */
 564 /* Performs interleave, inverse wavelet transform and copy back to buffer */
 565 static void opj_idwt53_h(const opj_dwt_t *dwt,
 566                          OPJ_INT32* tiledp)
 567 {
 568 #ifdef STANDARD_SLOW_VERSION
 569     /* For documentation purpose */
 570     opj_dwt_interleave_h(dwt, tiledp);
 571     opj_dwt_decode_1(dwt);
 572     memcpy(tiledp, dwt->mem, (OPJ_UINT32)(dwt->sn + dwt->dn) * sizeof(OPJ_INT32));
 573 #else
 574     const OPJ_INT32 sn = dwt->sn;
 575     const OPJ_INT32 len = sn + dwt->dn;
 576     if (dwt->cas == 0) { /* Left-most sample is on even coordinate */
 577         if (len > 1) {
 578             opj_idwt53_h_cas0(dwt->mem, sn, len, tiledp);
 579         } else {
 580             /* Unmodified value */
 581         }
 582     } else { /* Left-most sample is on odd coordinate */
 583         if (len == 1) {
 584             tiledp[0] /= 2;
 585         } else if (len == 2) {
 586             OPJ_INT32* out = dwt->mem;
 587             const OPJ_INT32* in_even = &tiledp[sn];
 588             const OPJ_INT32* in_odd = &tiledp[0];
 589             out[1] = in_odd[0] - ((in_even[0] + 1) >> 1);
 590             out[0] = in_even[0] + out[1];
 591             memcpy(tiledp, dwt->mem, (OPJ_UINT32)len * sizeof(OPJ_INT32));
 592         } else if (len > 2) {
 593             opj_idwt53_h_cas1(dwt->mem, sn, len, tiledp);
 594         }
 595     }
 596 #endif
 597 }
 598
 599 #if (defined(__SSE2__) || defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION)
 600
 601 /* Conveniency macros to improve the readabilty of the formulas */
 602 #if __AVX2__
 603 #define VREG        __m256i
 604 #define LOAD_CST(x) _mm256_set1_epi32(x)
 605 #define LOAD(x)     _mm256_load_si256((const VREG*)(x))
 606 #define LOADU(x)    _mm256_loadu_si256((const VREG*)(x))
 607 #define STORE(x,y)  _mm256_store_si256((VREG*)(x),(y))
 608 #define STOREU(x,y) _mm256_storeu_si256((VREG*)(x),(y))
 609 #define ADD(x,y)    _mm256_add_epi32((x),(y))
 610 #define SUB(x,y)    _mm256_sub_epi32((x),(y))
 611 #define SAR(x,y)    _mm256_srai_epi32((x),(y))
 612 #else
 613 #define VREG        __m128i
 614 #define LOAD_CST(x) _mm_set1_epi32(x)
 615 #define LOAD(x)     _mm_load_si128((const VREG*)(x))
 616 #define LOADU(x)    _mm_loadu_si128((const VREG*)(x))
 617 #define STORE(x,y)  _mm_store_si128((VREG*)(x),(y))
 618 #define STOREU(x,y) _mm_storeu_si128((VREG*)(x),(y))
 619 #define ADD(x,y)    _mm_add_epi32((x),(y))
 620 #define SUB(x,y)    _mm_sub_epi32((x),(y))
 621 #define SAR(x,y)    _mm_srai_epi32((x),(y))
 622 #endif
 623 #define ADD3(x,y,z) ADD(ADD(x,y),z)
 624
 625 static
 626 void opj_idwt53_v_final_memcpy(OPJ_INT32* tiledp_col,
 627                                const OPJ_INT32* tmp,
 628                                OPJ_INT32 len,
 629                                OPJ_SIZE_T stride)
 630 {
 631     OPJ_INT32 i;
 632     for (i = 0; i < len; ++i) {
 633         /* A memcpy(&tiledp_col[i * stride + 0],
 634                     &tmp[PARALLEL_COLS_53 * i + 0],
 635                     PARALLEL_COLS_53 * sizeof(OPJ_INT32))
 636            would do but would be a tiny bit slower.
 637            We can take here advantage of our knowledge of alignment */
 638         STOREU(&tiledp_col[(OPJ_SIZE_T)i * stride + 0],
 639                LOAD(&tmp[PARALLEL_COLS_53 * i + 0]));
 640         STOREU(&tiledp_col[(OPJ_SIZE_T)i * stride + VREG_INT_COUNT],
 641                LOAD(&tmp[PARALLEL_COLS_53 * i + VREG_INT_COUNT]));
 642     }
 643 }
 644
 645 /** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
 646  * 16 in AVX2, when top-most pixel is on even coordinate */
 647 static void opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(
 648     OPJ_INT32* tmp,
 649     const OPJ_INT32 sn,
 650     const OPJ_INT32 len,
 651     OPJ_INT32* tiledp_col,
 652     const OPJ_SIZE_T stride)
 653 {
 654     const OPJ_INT32* in_even = &tiledp_col[0];
 655     const OPJ_INT32* in_odd = &tiledp_col[(OPJ_SIZE_T)sn * stride];
 656
 657     OPJ_INT32 i;
 658     OPJ_SIZE_T j;
 659     VREG d1c_0, d1n_0, s1n_0, s0c_0, s0n_0;
 660     VREG d1c_1, d1n_1, s1n_1, s0c_1, s0n_1;
 661     const VREG two = LOAD_CST(2);
 662
 663     assert(len > 1);
 664 #if __AVX2__
 665     assert(PARALLEL_COLS_53 == 16);
 666     assert(VREG_INT_COUNT == 8);
 667 #else
 668     assert(PARALLEL_COLS_53 == 8);
 669     assert(VREG_INT_COUNT == 4);
 670 #endif
 671
 672     /* Note: loads of input even/odd values must be done in a unaligned */
 673     /* fashion. But stores in tmp can be done with aligned store, since */
 674     /* the temporary buffer is properly aligned */
 675     assert((OPJ_SIZE_T)tmp % (sizeof(OPJ_INT32) * VREG_INT_COUNT) == 0);
 676
 677     s1n_0 = LOADU(in_even + 0);
 678     s1n_1 = LOADU(in_even + VREG_INT_COUNT);
 679     d1n_0 = LOADU(in_odd);
 680     d1n_1 = LOADU(in_odd + VREG_INT_COUNT);
 681
 682     /* s0n = s1n - ((d1n + 1) >> 1); <==> */
 683     /* s0n = s1n - ((d1n + d1n + 2) >> 2); */
 684     s0n_0 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), 2));
 685     s0n_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), 2));
 686
 687     for (i = 0, j = 1; i < (len - 3); i += 2, j++) {
 688         d1c_0 = d1n_0;
 689         s0c_0 = s0n_0;
 690         d1c_1 = d1n_1;
 691         s0c_1 = s0n_1;
 692
 693         s1n_0 = LOADU(in_even + j * stride);
 694         s1n_1 = LOADU(in_even + j * stride + VREG_INT_COUNT);
 695         d1n_0 = LOADU(in_odd + j * stride);
 696         d1n_1 = LOADU(in_odd + j * stride + VREG_INT_COUNT);
 697
 698         /*s0n = s1n - ((d1c + d1n + 2) >> 2);*/
 699         s0n_0 = SUB(s1n_0, SAR(ADD3(d1c_0, d1n_0, two), 2));
 700         s0n_1 = SUB(s1n_1, SAR(ADD3(d1c_1, d1n_1, two), 2));
 701
 702         STORE(tmp + PARALLEL_COLS_53 * (i + 0), s0c_0);
 703         STORE(tmp + PARALLEL_COLS_53 * (i + 0) + VREG_INT_COUNT, s0c_1);
 704
 705         /* d1c + ((s0c + s0n) >> 1) */
 706         STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 0,
 707               ADD(d1c_0, SAR(ADD(s0c_0, s0n_0), 1)));
 708         STORE(tmp + PARALLEL_COLS_53 * (i + 1) + VREG_INT_COUNT,
 709               ADD(d1c_1, SAR(ADD(s0c_1, s0n_1), 1)));
 710     }
 711
 712     STORE(tmp + PARALLEL_COLS_53 * (i + 0) + 0, s0n_0);
 713     STORE(tmp + PARALLEL_COLS_53 * (i + 0) + VREG_INT_COUNT, s0n_1);
 714
 715     if (len & 1) {
 716         VREG tmp_len_minus_1;
 717         s1n_0 = LOADU(in_even + (OPJ_SIZE_T)((len - 1) / 2) * stride);
 718         /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */
 719         tmp_len_minus_1 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), 2));
 720         STORE(tmp + PARALLEL_COLS_53 * (len - 1), tmp_len_minus_1);
 721         /* d1n + ((s0n + tmp_len_minus_1) >> 1) */
 722         STORE(tmp + PARALLEL_COLS_53 * (len - 2),
 723               ADD(d1n_0, SAR(ADD(s0n_0, tmp_len_minus_1), 1)));
 724
 725         s1n_1 = LOADU(in_even + (OPJ_SIZE_T)((len - 1) / 2) * stride + VREG_INT_COUNT);
 726         /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */
 727         tmp_len_minus_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), 2));
 728         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT,
 729               tmp_len_minus_1);
 730         /* d1n + ((s0n + tmp_len_minus_1) >> 1) */
 731         STORE(tmp + PARALLEL_COLS_53 * (len - 2) + VREG_INT_COUNT,
 732               ADD(d1n_1, SAR(ADD(s0n_1, tmp_len_minus_1), 1)));
 733
 734
 735     } else {
 736         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0,
 737               ADD(d1n_0, s0n_0));
 738         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT,
 739               ADD(d1n_1, s0n_1));
 740     }
 741
 742     opj_idwt53_v_final_memcpy(tiledp_col, tmp, len, stride);
 743 }
 744
 745
 746 /** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
 747  * 16 in AVX2, when top-most pixel is on odd coordinate */
 748 static void opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(
 749     OPJ_INT32* tmp,
 750     const OPJ_INT32 sn,
 751     const OPJ_INT32 len,
 752     OPJ_INT32* tiledp_col,
 753     const OPJ_SIZE_T stride)
 754 {
 755     OPJ_INT32 i;
 756     OPJ_SIZE_T j;
 757
 758     VREG s1_0, s2_0, dc_0, dn_0;
 759     VREG s1_1, s2_1, dc_1, dn_1;
 760     const VREG two = LOAD_CST(2);
 761
 762     const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride];
 763     const OPJ_INT32* in_odd = &tiledp_col[0];
 764
 765     assert(len > 2);
 766 #if __AVX2__
 767     assert(PARALLEL_COLS_53 == 16);
 768     assert(VREG_INT_COUNT == 8);
 769 #else
 770     assert(PARALLEL_COLS_53 == 8);
 771     assert(VREG_INT_COUNT == 4);
 772 #endif
 773
 774     /* Note: loads of input even/odd values must be done in a unaligned */
 775     /* fashion. But stores in tmp can be done with aligned store, since */
 776     /* the temporary buffer is properly aligned */
 777     assert((OPJ_SIZE_T)tmp % (sizeof(OPJ_INT32) * VREG_INT_COUNT) == 0);
 778
 779     s1_0 = LOADU(in_even + stride);
 780     /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */
 781     dc_0 = SUB(LOADU(in_odd + 0),
 782                SAR(ADD3(LOADU(in_even + 0), s1_0, two), 2));
 783     STORE(tmp + PARALLEL_COLS_53 * 0, ADD(LOADU(in_even + 0), dc_0));
 784
 785     s1_1 = LOADU(in_even + stride + VREG_INT_COUNT);
 786     /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */
 787     dc_1 = SUB(LOADU(in_odd + VREG_INT_COUNT),
 788                SAR(ADD3(LOADU(in_even + VREG_INT_COUNT), s1_1, two), 2));
 789     STORE(tmp + PARALLEL_COLS_53 * 0 + VREG_INT_COUNT,
 790           ADD(LOADU(in_even + VREG_INT_COUNT), dc_1));
 791
 792     for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
 793
 794         s2_0 = LOADU(in_even + (j + 1) * stride);
 795         s2_1 = LOADU(in_even + (j + 1) * stride + VREG_INT_COUNT);
 796
 797         /* dn = in_odd[j * stride] - ((s1 + s2 + 2) >> 2); */
 798         dn_0 = SUB(LOADU(in_odd + j * stride),
 799                    SAR(ADD3(s1_0, s2_0, two), 2));
 800         dn_1 = SUB(LOADU(in_odd + j * stride + VREG_INT_COUNT),
 801                    SAR(ADD3(s1_1, s2_1, two), 2));
 802
 803         STORE(tmp + PARALLEL_COLS_53 * i, dc_0);
 804         STORE(tmp + PARALLEL_COLS_53 * i + VREG_INT_COUNT, dc_1);
 805
 806         /* tmp[i + 1] = s1 + ((dn + dc) >> 1); */
 807         STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 0,
 808               ADD(s1_0, SAR(ADD(dn_0, dc_0), 1)));
 809         STORE(tmp + PARALLEL_COLS_53 * (i + 1) + VREG_INT_COUNT,
 810               ADD(s1_1, SAR(ADD(dn_1, dc_1), 1)));
 811
 812         dc_0 = dn_0;
 813         s1_0 = s2_0;
 814         dc_1 = dn_1;
 815         s1_1 = s2_1;
 816     }
 817     STORE(tmp + PARALLEL_COLS_53 * i, dc_0);
 818     STORE(tmp + PARALLEL_COLS_53 * i + VREG_INT_COUNT, dc_1);
 819
 820     if (!(len & 1)) {
 821         /*dn = in_odd[(len / 2 - 1) * stride] - ((s1 + 1) >> 1); */
 822         dn_0 = SUB(LOADU(in_odd + (OPJ_SIZE_T)(len / 2 - 1) * stride),
 823                    SAR(ADD3(s1_0, s1_0, two), 2));
 824         dn_1 = SUB(LOADU(in_odd + (OPJ_SIZE_T)(len / 2 - 1) * stride + VREG_INT_COUNT),
 825                    SAR(ADD3(s1_1, s1_1, two), 2));
 826
 827         /* tmp[len - 2] = s1 + ((dn + dc) >> 1); */
 828         STORE(tmp + PARALLEL_COLS_53 * (len - 2) + 0,
 829               ADD(s1_0, SAR(ADD(dn_0, dc_0), 1)));
 830         STORE(tmp + PARALLEL_COLS_53 * (len - 2) + VREG_INT_COUNT,
 831               ADD(s1_1, SAR(ADD(dn_1, dc_1), 1)));
 832
 833         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, dn_0);
 834         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT, dn_1);
 835     } else {
 836         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, ADD(s1_0, dc_0));
 837         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT,
 838               ADD(s1_1, dc_1));
 839     }
 840
 841     opj_idwt53_v_final_memcpy(tiledp_col, tmp, len, stride);
 842 }
 843
 844 #undef VREG
 845 #undef LOAD_CST
 846 #undef LOADU
 847 #undef LOAD
 848 #undef STORE
 849 #undef STOREU
 850 #undef ADD
 851 #undef ADD3
 852 #undef SUB
 853 #undef SAR
 854
 855 #endif /* (defined(__SSE2__) || defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION) */
 856
 857 #if !defined(STANDARD_SLOW_VERSION)
 858 /** Vertical inverse 5x3 wavelet transform for one column, when top-most
 859  * pixel is on even coordinate */
 860 static void opj_idwt3_v_cas0(OPJ_INT32* tmp,
 861                              const OPJ_INT32 sn,
 862                              const OPJ_INT32 len,
 863                              OPJ_INT32* tiledp_col,
 864                              const OPJ_SIZE_T stride)
 865 {
 866     OPJ_INT32 i, j;
 867     OPJ_INT32 d1c, d1n, s1n, s0c, s0n;
 868
 869     assert(len > 1);
 870
 871     /* Performs lifting in one single iteration. Saves memory */
 872     /* accesses and explicit interleaving. */
 873
 874     s1n = tiledp_col[0];
 875     d1n = tiledp_col[(OPJ_SIZE_T)sn * stride];
 876     s0n = s1n - ((d1n + 1) >> 1);
 877
 878     for (i = 0, j = 0; i < (len - 3); i += 2, j++) {
 879         d1c = d1n;
 880         s0c = s0n;
 881
 882         s1n = tiledp_col[(OPJ_SIZE_T)(j + 1) * stride];
 883         d1n = tiledp_col[(OPJ_SIZE_T)(sn + j + 1) * stride];
 884
 885         s0n = s1n - ((d1c + d1n + 2) >> 2);
 886
 887         tmp[i  ] = s0c;
 888         tmp[i + 1] = d1c + ((s0c + s0n) >> 1);
 889     }
 890
 891     tmp[i] = s0n;
 892
 893     if (len & 1) {
 894         tmp[len - 1] =
 895             tiledp_col[(OPJ_SIZE_T)((len - 1) / 2) * stride] -
 896             ((d1n + 1) >> 1);
 897         tmp[len - 2] = d1n + ((s0n + tmp[len - 1]) >> 1);
 898     } else {
 899         tmp[len - 1] = d1n + s0n;
 900     }
 901
 902     for (i = 0; i < len; ++i) {
 903         tiledp_col[(OPJ_SIZE_T)i * stride] = tmp[i];
 904     }
 905 }
 906
 907
 908 /** Vertical inverse 5x3 wavelet transform for one column, when top-most
 909  * pixel is on odd coordinate */
 910 static void opj_idwt3_v_cas1(OPJ_INT32* tmp,
 911                              const OPJ_INT32 sn,
 912                              const OPJ_INT32 len,
 913                              OPJ_INT32* tiledp_col,
 914                              const OPJ_SIZE_T stride)
 915 {
 916     OPJ_INT32 i, j;
 917     OPJ_INT32 s1, s2, dc, dn;
 918     const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride];
 919     const OPJ_INT32* in_odd = &tiledp_col[0];
 920
 921     assert(len > 2);
 922
 923     /* Performs lifting in one single iteration. Saves memory */
 924     /* accesses and explicit interleaving. */
 925
 926     s1 = in_even[stride];
 927     dc = in_odd[0] - ((in_even[0] + s1 + 2) >> 2);
 928     tmp[0] = in_even[0] + dc;
 929     for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
 930
 931         s2 = in_even[(OPJ_SIZE_T)(j + 1) * stride];
 932
 933         dn = in_odd[(OPJ_SIZE_T)j * stride] - ((s1 + s2 + 2) >> 2);
 934         tmp[i  ] = dc;
 935         tmp[i + 1] = s1 + ((dn + dc) >> 1);
 936
 937         dc = dn;
 938         s1 = s2;
 939     }
 940     tmp[i] = dc;
 941     if (!(len & 1)) {
 942         dn = in_odd[(OPJ_SIZE_T)(len / 2 - 1) * stride] - ((s1 + 1) >> 1);
 943         tmp[len - 2] = s1 + ((dn + dc) >> 1);
 944         tmp[len - 1] = dn;
 945     } else {
 946         tmp[len - 1] = s1 + dc;
 947     }
 948
 949     for (i = 0; i < len; ++i) {
 950         tiledp_col[(OPJ_SIZE_T)i * stride] = tmp[i];
 951     }
 952 }
 953 #endif /* !defined(STANDARD_SLOW_VERSION) */
 954
 955 /* <summary>                            */
 956 /* Inverse vertical 5-3 wavelet transform in 1-D for several columns. */
 957 /* </summary>                           */
 958 /* Performs interleave, inverse wavelet transform and copy back to buffer */
 959 static void opj_idwt53_v(const opj_dwt_t *dwt,
 960                          OPJ_INT32* tiledp_col,
 961                          OPJ_SIZE_T stride,
 962                          OPJ_INT32 nb_cols)
 963 {
 964 #ifdef STANDARD_SLOW_VERSION
 965     /* For documentation purpose */
 966     OPJ_INT32 k, c;
 967     for (c = 0; c < nb_cols; c ++) {
 968         opj_dwt_interleave_v(dwt, tiledp_col + c, stride);
 969         opj_dwt_decode_1(dwt);
 970         for (k = 0; k < dwt->sn + dwt->dn; ++k) {
 971             tiledp_col[c + k * stride] = dwt->mem[k];
 972         }
 973     }
 974 #else
 975     const OPJ_INT32 sn = dwt->sn;
 976     const OPJ_INT32 len = sn + dwt->dn;
 977     if (dwt->cas == 0) {
 978         /* If len == 1, unmodified value */
 979
 980 #if (defined(__SSE2__) || defined(__AVX2__))
 981         if (len > 1 && nb_cols == PARALLEL_COLS_53) {
 982             /* Same as below general case, except that thanks to SSE2/AVX2 */
 983             /* we can efficiently process 8/16 columns in parallel */
 984             opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride);
 985             return;
 986         }
 987 #endif
 988         if (len > 1) {
 989             OPJ_INT32 c;
 990             for (c = 0; c < nb_cols; c++, tiledp_col++) {
 991                 opj_idwt3_v_cas0(dwt->mem, sn, len, tiledp_col, stride);
 992             }
 993             return;
 994         }
 995     } else {
 996         if (len == 1) {
 997             OPJ_INT32 c;
 998             for (c = 0; c < nb_cols; c++, tiledp_col++) {
 999                 tiledp_col[0] /= 2;
1000             }
1001             return;
1002         }
1003
1004         if (len == 2) {
1005             OPJ_INT32 c;
1006             OPJ_INT32* out = dwt->mem;
1007             for (c = 0; c < nb_cols; c++, tiledp_col++) {
1008                 OPJ_INT32 i;
1009                 const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride];
1010                 const OPJ_INT32* in_odd = &tiledp_col[0];
1011
1012                 out[1] = in_odd[0] - ((in_even[0] + 1) >> 1);
1013                 out[0] = in_even[0] + out[1];
1014
1015                 for (i = 0; i < len; ++i) {
1016                     tiledp_col[(OPJ_SIZE_T)i * stride] = out[i];
1017                 }
1018             }
1019
1020             return;
1021         }
1022
1023 #if (defined(__SSE2__) || defined(__AVX2__))
1024         if (len > 2 && nb_cols == PARALLEL_COLS_53) {
1025             /* Same as below general case, except that thanks to SSE2/AVX2 */
1026             /* we can efficiently process 8/16 columns in parallel */
1027             opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride);
1028             return;
1029         }
1030 #endif
1031         if (len > 2) {
1032             OPJ_INT32 c;
1033             for (c = 0; c < nb_cols; c++, tiledp_col++) {
1034                 opj_idwt3_v_cas1(dwt->mem, sn, len, tiledp_col, stride);
1035             }
1036             return;
1037         }
1038     }
1039 #endif
1040 }
1041
1042 static void opj_dwt_encode_step1(OPJ_FLOAT32* fw,
1043                                  OPJ_UINT32 start,
1044                                  OPJ_UINT32 end,
1045                                  const OPJ_FLOAT32 c)
1046 {
1047     OPJ_UINT32 i;
1048     for (i = start; i < end; ++i) {
1049         fw[i * 2] *= c;
1050     }
1051 }
1052 static void opj_dwt_encode_step2(OPJ_FLOAT32* fl, OPJ_FLOAT32* fw,
1053                                  OPJ_UINT32 start,
1054                                  OPJ_UINT32 end,
1055                                  OPJ_UINT32 m,
1056                                  OPJ_FLOAT32 c)
1057 {
1058     OPJ_UINT32 i;
1059     OPJ_UINT32 imax = opj_uint_min(end, m);
1060     if (start > 0) {
1061         fw += 2 * start;
1062         fl = fw - 2;
1063     }
1064     for (i = start; i < imax; ++i) {
1065         fw[-1] += (fl[0] + fw[0]) * c;
1066         fl = fw;
1067         fw += 2;
1068     }
1069     if (m < end) {
1070         assert(m + 1 == end);
1071         fw[-1] += (2 * fl[0]) * c;
1072     }
1073 }
1074
1075 static void opj_dwt_encode_1_real(void *aIn, OPJ_INT32 dn, OPJ_INT32 sn,
1076                                   OPJ_INT32 cas)
1077 {
1078     OPJ_FLOAT32* w = (OPJ_FLOAT32*)aIn;
1079     OPJ_INT32 a, b;
1080     if (cas == 0) {
1081         if (!((dn > 0) || (sn > 1))) {
1082             return;
1083         }
1084         a = 0;
1085         b = 1;
1086     } else {
1087         if (!((sn > 0) || (dn > 1))) {
1088             return;
1089         }
1090         a = 1;
1091         b = 0;
1092     }
1093     opj_dwt_encode_step2(w + a, w + b + 1,
1094                          0, (OPJ_UINT32)dn,
1095                          (OPJ_UINT32)opj_int_min(dn, sn - b),
1096                          opj_dwt_alpha);
1097     opj_dwt_encode_step2(w + b, w + a + 1,
1098                          0, (OPJ_UINT32)sn,
1099                          (OPJ_UINT32)opj_int_min(sn, dn - a),
1100                          opj_dwt_beta);
1101     opj_dwt_encode_step2(w + a, w + b + 1,
1102                          0, (OPJ_UINT32)dn,
1103                          (OPJ_UINT32)opj_int_min(dn, sn - b),
1104                          opj_dwt_gamma);
1105     opj_dwt_encode_step2(w + b, w + a + 1,
1106                          0, (OPJ_UINT32)sn,
1107                          (OPJ_UINT32)opj_int_min(sn, dn - a),
1108                          opj_dwt_delta);
1109     opj_dwt_encode_step1(w + b, 0, (OPJ_UINT32)dn,
1110                          opj_K);
1111     opj_dwt_encode_step1(w + a, 0, (OPJ_UINT32)sn,
1112                          opj_invK);
1113 }
1114
1115 static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps,
1116                                     opj_stepsize_t *bandno_stepsize)
1117 {
1118     OPJ_INT32 p, n;
1119     p = opj_int_floorlog2(stepsize) - 13;
1120     n = 11 - opj_int_floorlog2(stepsize);
1121     bandno_stepsize->mant = (n < 0 ? stepsize >> -n : stepsize << n) & 0x7ff;
1122     bandno_stepsize->expn = numbps - p;
1123 }
1124
1125 /*
1126 ==========================================================
1127    DWT interface
1128 ==========================================================
1129 */
1130
1131 /** Process one line for the horizontal pass of the 5x3 forward transform */
1132 static
1133 void opj_dwt_encode_and_deinterleave_h_one_row(void* rowIn,
1134         void* tmpIn,
1135         OPJ_UINT32 width,
1136         OPJ_BOOL even)
1137 {
1138     OPJ_INT32* OPJ_RESTRICT row = (OPJ_INT32*)rowIn;
1139     OPJ_INT32* OPJ_RESTRICT tmp = (OPJ_INT32*)tmpIn;
1140     const OPJ_INT32 sn = (OPJ_INT32)((width + (even ? 1 : 0)) >> 1);
1141     const OPJ_INT32 dn = (OPJ_INT32)(width - (OPJ_UINT32)sn);
1142
1143     if (even) {
1144         if (width > 1) {
1145             OPJ_INT32 i;
1146             for (i = 0; i < sn - 1; i++) {
1147                 tmp[sn + i] = row[2 * i + 1] - ((row[(i) * 2] + row[(i + 1) * 2]) >> 1);
1148             }
1149             if ((width % 2) == 0) {
1150                 tmp[sn + i] = row[2 * i + 1] - row[(i) * 2];
1151             }
1152             row[0] += (tmp[sn] + tmp[sn] + 2) >> 2;
1153             for (i = 1; i < dn; i++) {
1154                 row[i] = row[2 * i] + ((tmp[sn + (i - 1)] + tmp[sn + i] + 2) >> 2);
1155             }
1156             if ((width % 2) == 1) {
1157                 row[i] = row[2 * i] + ((tmp[sn + (i - 1)] + tmp[sn + (i - 1)] + 2) >> 2);
1158             }
1159             memcpy(row + sn, tmp + sn, (OPJ_SIZE_T)dn * sizeof(OPJ_INT32));
1160         }
1161     } else {
1162         if (width == 1) {
1163             row[0] *= 2;
1164         } else {
1165             OPJ_INT32 i;
1166             tmp[sn + 0] = row[0] - row[1];
1167             for (i = 1; i < sn; i++) {
1168                 tmp[sn + i] = row[2 * i] - ((row[2 * i + 1] + row[2 * (i - 1) + 1]) >> 1);
1169             }
1170             if ((width % 2) == 1) {
1171                 tmp[sn + i] = row[2 * i] - row[2 * (i - 1) + 1];
1172             }
1173
1174             for (i = 0; i < dn - 1; i++) {
1175                 row[i] = row[2 * i + 1] + ((tmp[sn + i] + tmp[sn + i + 1] + 2) >> 2);
1176             }
1177             if ((width % 2) == 0) {
1178                 row[i] = row[2 * i + 1] + ((tmp[sn + i] + tmp[sn + i] + 2) >> 2);
1179             }
1180             memcpy(row + sn, tmp + sn, (OPJ_SIZE_T)dn * sizeof(OPJ_INT32));
1181         }
1182     }
1183 }
1184
1185 /** Process one line for the horizontal pass of the 9x7 forward transform */
1186 static
1187 void opj_dwt_encode_and_deinterleave_h_one_row_real(void* rowIn,
1188         void* tmpIn,
1189         OPJ_UINT32 width,
1190         OPJ_BOOL even)
1191 {
1192     OPJ_FLOAT32* OPJ_RESTRICT row = (OPJ_FLOAT32*)rowIn;
1193     OPJ_FLOAT32* OPJ_RESTRICT tmp = (OPJ_FLOAT32*)tmpIn;
1194     const OPJ_INT32 sn = (OPJ_INT32)((width + (even ? 1 : 0)) >> 1);
1195     const OPJ_INT32 dn = (OPJ_INT32)(width - (OPJ_UINT32)sn);
1196     memcpy(tmp, row, width * sizeof(OPJ_FLOAT32));
1197     opj_dwt_encode_1_real(tmp, dn, sn, even ? 0 : 1);
1198     opj_dwt_deinterleave_h((OPJ_INT32 * OPJ_RESTRICT)tmp,
1199                            (OPJ_INT32 * OPJ_RESTRICT)row,
1200                            dn, sn, even ? 0 : 1);
1201 }
1202
1203 typedef struct {
1204     opj_dwt_t h;
1205     OPJ_UINT32 rw; /* Width of the resolution to process */
1206     OPJ_UINT32 w; /* Width of tiledp */
1207     OPJ_INT32 * OPJ_RESTRICT tiledp;
1208     OPJ_UINT32 min_j;
1209     OPJ_UINT32 max_j;
1210     opj_encode_and_deinterleave_h_one_row_fnptr_type p_function;
1211 } opj_dwt_encode_h_job_t;
1212
1213 static void opj_dwt_encode_h_func(void* user_data, opj_tls_t* tls)
1214 {
1215     OPJ_UINT32 j;
1216     opj_dwt_encode_h_job_t* job;
1217     (void)tls;
1218
1219     job = (opj_dwt_encode_h_job_t*)user_data;
1220     for (j = job->min_j; j < job->max_j; j++) {
1221         OPJ_INT32* OPJ_RESTRICT aj = job->tiledp + j * job->w;
1222         (*job->p_function)(aj, job->h.mem, job->rw,
1223                            job->h.cas == 0 ? OPJ_TRUE : OPJ_FALSE);
1224     }
1225
1226     opj_aligned_free(job->h.mem);
1227     opj_free(job);
1228 }
1229
1230 typedef struct {
1231     opj_dwt_t v;
1232     OPJ_UINT32 rh;
1233     OPJ_UINT32 w;
1234     OPJ_INT32 * OPJ_RESTRICT tiledp;
1235     OPJ_UINT32 min_j;
1236     OPJ_UINT32 max_j;
1237     opj_encode_and_deinterleave_v_fnptr_type p_encode_and_deinterleave_v;
1238 } opj_dwt_encode_v_job_t;
1239
1240 static void opj_dwt_encode_v_func(void* user_data, opj_tls_t* tls)
1241 {
1242     OPJ_UINT32 j;
1243     opj_dwt_encode_v_job_t* job;
1244     (void)tls;
1245
1246     job = (opj_dwt_encode_v_job_t*)user_data;
1247     for (j = job->min_j; j + NB_ELTS_V8 - 1 < job->max_j; j += NB_ELTS_V8) {
1248         (*job->p_encode_and_deinterleave_v)(job->tiledp + j,
1249                                             job->v.mem,
1250                                             job->rh,
1251                                             job->v.cas == 0,
1252                                             job->w,
1253                                             NB_ELTS_V8);
1254     }
1255     if (j < job->max_j) {
1256         (*job->p_encode_and_deinterleave_v)(job->tiledp + j,
1257                                             job->v.mem,
1258                                             job->rh,
1259                                             job->v.cas == 0,
1260                                             job->w,
1261                                             job->max_j - j);
1262     }
1263
1264     opj_aligned_free(job->v.mem);
1265     opj_free(job);
1266 }
1267
1268 /* Forward 5-3 transform, for the vertical pass, processing cols columns */
1269 /* where cols <= NB_ELTS_V8 */
1270 static void opj_dwt_encode_and_deinterleave_v(
1271     void *arrayIn,
1272     void *tmpIn,
1273     OPJ_UINT32 height,
1274     OPJ_BOOL even,
1275     OPJ_UINT32 stride_width,
1276     OPJ_UINT32 cols)
1277 {
1278     OPJ_INT32* OPJ_RESTRICT array = (OPJ_INT32 * OPJ_RESTRICT)arrayIn;
1279     OPJ_INT32* OPJ_RESTRICT tmp = (OPJ_INT32 * OPJ_RESTRICT)tmpIn;
1280     OPJ_UINT32 c;
1281     const OPJ_INT32 sn = (OPJ_INT32)((height + (even ? 1 : 0)) >> 1);
1282     const OPJ_INT32 dn = (OPJ_INT32)(height - (OPJ_UINT32)sn);
1283     for (c = 0; c < cols; c++) {
1284         OPJ_UINT32 k;
1285         for (k = 0; k < height; ++k) {
1286             tmp[k] = array[c + k * stride_width];
1287         }
1288
1289         opj_dwt_encode_1(tmp, dn, sn, even ? 0 : 1);
1290
1291         opj_dwt_deinterleave_v(tmp, array + c, dn, sn, stride_width, even ? 0 : 1);
1292     }
1293 }
1294
1295 /* Forward 9-7 transform, for the vertical pass, processing cols columns */
1296 /* where cols <= NB_ELTS_V8 */
1297 static void opj_dwt_encode_and_deinterleave_v_real(
1298     void *arrayIn,
1299     void *tmpIn,
1300     OPJ_UINT32 height,
1301     OPJ_BOOL even,
1302     OPJ_UINT32 stride_width,
1303     OPJ_UINT32 cols)
1304 {
1305     OPJ_FLOAT32* OPJ_RESTRICT array = (OPJ_FLOAT32 * OPJ_RESTRICT)arrayIn;
1306     OPJ_FLOAT32* OPJ_RESTRICT tmp = (OPJ_FLOAT32 * OPJ_RESTRICT)tmpIn;
1307     OPJ_UINT32 c;
1308     const OPJ_INT32 sn = (OPJ_INT32)((height + (even ? 1 : 0)) >> 1);
1309     const OPJ_INT32 dn = (OPJ_INT32)(height - (OPJ_UINT32)sn);
1310     for (c = 0; c < cols; c++) {
1311         OPJ_UINT32 k;
1312         for (k = 0; k < height; ++k) {
1313             tmp[k] = array[c + k * stride_width];
1314         }
1315
1316         opj_dwt_encode_1_real(tmp, dn, sn, even ? 0 : 1);
1317
1318         opj_dwt_deinterleave_v((OPJ_INT32*)tmpIn,
1319                                ((OPJ_INT32*)(arrayIn)) + c,
1320                                dn, sn, stride_width, even ? 0 : 1);
1321     }
1322 }
1323
1324
1325 /* <summary>                            */
1326 /* Forward 5-3 wavelet transform in 2-D. */
1327 /* </summary>                           */
1328 static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp,
1329         opj_tcd_tilecomp_t * tilec,
1330         opj_encode_and_deinterleave_v_fnptr_type p_encode_and_deinterleave_v,
1331         opj_encode_and_deinterleave_h_one_row_fnptr_type
1332         p_encode_and_deinterleave_h_one_row)
1333 {
1334     OPJ_INT32 i;
1335     OPJ_INT32 *bj = 00;
1336     OPJ_UINT32 w;
1337     OPJ_INT32 l;
1338
1339     OPJ_SIZE_T l_data_size;
1340
1341     opj_tcd_resolution_t * l_cur_res = 0;
1342     opj_tcd_resolution_t * l_last_res = 0;
1343     const int num_threads = opj_thread_pool_get_thread_count(tp);
1344     OPJ_INT32 * OPJ_RESTRICT tiledp = tilec->data;
1345
1346     w = (OPJ_UINT32)(tilec->x1 - tilec->x0);
1347     l = (OPJ_INT32)tilec->numresolutions - 1;
1348
1349     l_cur_res = tilec->resolutions + l;
1350     l_last_res = l_cur_res - 1;
1351
1352     l_data_size = opj_dwt_max_resolution(tilec->resolutions, tilec->numresolutions);
1353     /* overflow check */
1354     if (l_data_size > (SIZE_MAX / (NB_ELTS_V8 * sizeof(OPJ_INT32)))) {
1355         /* FIXME event manager error callback */
1356         return OPJ_FALSE;
1357     }
1358     l_data_size *= NB_ELTS_V8 * sizeof(OPJ_INT32);
1359     bj = (OPJ_INT32*)opj_aligned_32_malloc(l_data_size);
1360     /* l_data_size is equal to 0 when numresolutions == 1 but bj is not used */
1361     /* in that case, so do not error out */
1362     if (l_data_size != 0 && ! bj) {
1363         return OPJ_FALSE;
1364     }
1365     i = l;
1366
1367     while (i--) {
1368         OPJ_UINT32 j;
1369         OPJ_UINT32 rw;           /* width of the resolution level computed   */
1370         OPJ_UINT32 rh;           /* height of the resolution level computed  */
1371         OPJ_UINT32
1372         rw1;      /* width of the resolution level once lower than computed one                                       */
1373         OPJ_UINT32
1374         rh1;      /* height of the resolution level once lower than computed one                                      */
1375         OPJ_INT32 cas_col;  /* 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering */
1376         OPJ_INT32 cas_row;  /* 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering   */
1377         OPJ_INT32 dn, sn;
1378
1379         rw  = (OPJ_UINT32)(l_cur_res->x1 - l_cur_res->x0);
1380         rh  = (OPJ_UINT32)(l_cur_res->y1 - l_cur_res->y0);
1381         rw1 = (OPJ_UINT32)(l_last_res->x1 - l_last_res->x0);
1382         rh1 = (OPJ_UINT32)(l_last_res->y1 - l_last_res->y0);
1383
1384         cas_row = l_cur_res->x0 & 1;
1385         cas_col = l_cur_res->y0 & 1;
1386
1387         sn = (OPJ_INT32)rh1;
1388         dn = (OPJ_INT32)(rh - rh1);
1389
1390         /* Perform vertical pass */
1391         if (num_threads <= 1 || rw < 2 * NB_ELTS_V8) {
1392             for (j = 0; j + NB_ELTS_V8 - 1 < rw; j += NB_ELTS_V8) {
1393                 p_encode_and_deinterleave_v(tiledp + j,
1394                                             bj,
1395                                             rh,
1396                                             cas_col == 0,
1397                                             w,
1398                                             NB_ELTS_V8);
1399             }
1400             if (j < rw) {
1401                 p_encode_and_deinterleave_v(tiledp + j,
1402                                             bj,
1403                                             rh,
1404                                             cas_col == 0,
1405                                             w,
1406                                             rw - j);
1407             }
1408         }  else {
1409             OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
1410             OPJ_UINT32 step_j;
1411
1412             if (rw < num_jobs) {
1413                 num_jobs = rw;
1414             }
1415             step_j = ((rw / num_jobs) / NB_ELTS_V8) * NB_ELTS_V8;
1416
1417             for (j = 0; j < num_jobs; j++) {
1418                 opj_dwt_encode_v_job_t* job;
1419
1420                 job = (opj_dwt_encode_v_job_t*) opj_malloc(sizeof(opj_dwt_encode_v_job_t));
1421                 if (!job) {
1422                     opj_thread_pool_wait_completion(tp, 0);
1423                     opj_aligned_free(bj);
1424                     return OPJ_FALSE;
1425                 }
1426                 job->v.mem = (OPJ_INT32*)opj_aligned_32_malloc(l_data_size);
1427                 if (!job->v.mem) {
1428                     opj_thread_pool_wait_completion(tp, 0);
1429                     opj_free(job);
1430                     opj_aligned_free(bj);
1431                     return OPJ_FALSE;
1432                 }
1433                 job->v.dn = dn;
1434                 job->v.sn = sn;
1435                 job->v.cas = cas_col;
1436                 job->rh = rh;
1437                 job->w = w;
1438                 job->tiledp = tiledp;
1439                 job->min_j = j * step_j;
1440                 job->max_j = (j + 1 == num_jobs) ? rw : (j + 1) * step_j;
1441                 job->p_encode_and_deinterleave_v = p_encode_and_deinterleave_v;
1442                 opj_thread_pool_submit_job(tp, opj_dwt_encode_v_func, job);
1443             }
1444             opj_thread_pool_wait_completion(tp, 0);
1445         }
1446
1447         sn = (OPJ_INT32)rw1;
1448         dn = (OPJ_INT32)(rw - rw1);
1449
1450         /* Perform horizontal pass */
1451         if (num_threads <= 1 || rh <= 1) {
1452             for (j = 0; j < rh; j++) {
1453                 OPJ_INT32* OPJ_RESTRICT aj = tiledp + j * w;
1454                 (*p_encode_and_deinterleave_h_one_row)(aj, bj, rw,
1455                                                        cas_row == 0 ? OPJ_TRUE : OPJ_FALSE);
1456             }
1457         }  else {
1458             OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
1459             OPJ_UINT32 step_j;
1460
1461             if (rh < num_jobs) {
1462                 num_jobs = rh;
1463             }
1464             step_j = (rh / num_jobs);
1465
1466             for (j = 0; j < num_jobs; j++) {
1467                 opj_dwt_encode_h_job_t* job;
1468
1469                 job = (opj_dwt_encode_h_job_t*) opj_malloc(sizeof(opj_dwt_encode_h_job_t));
1470                 if (!job) {
1471                     opj_thread_pool_wait_completion(tp, 0);
1472                     opj_aligned_free(bj);
1473                     return OPJ_FALSE;
1474                 }
1475                 job->h.mem = (OPJ_INT32*)opj_aligned_32_malloc(l_data_size);
1476                 if (!job->h.mem) {
1477                     opj_thread_pool_wait_completion(tp, 0);
1478                     opj_free(job);
1479                     opj_aligned_free(bj);
1480                     return OPJ_FALSE;
1481                 }
1482                 job->h.dn = dn;
1483                 job->h.sn = sn;
1484                 job->h.cas = cas_row;
1485                 job->rw = rw;
1486                 job->w = w;
1487                 job->tiledp = tiledp;
1488                 job->min_j = j * step_j;
1489                 job->max_j = (j + 1U) * step_j; /* this can overflow */
1490                 if (j == (num_jobs - 1U)) {  /* this will take care of the overflow */
1491                     job->max_j = rh;
1492                 }
1493                 job->p_function = p_encode_and_deinterleave_h_one_row;
1494                 opj_thread_pool_submit_job(tp, opj_dwt_encode_h_func, job);
1495             }
1496             opj_thread_pool_wait_completion(tp, 0);
1497         }
1498
1499         l_cur_res = l_last_res;
1500
1501         --l_last_res;
1502     }
1503
1504     opj_aligned_free(bj);
1505     return OPJ_TRUE;
1506 }
1507
1508 /* Forward 5-3 wavelet transform in 2-D. */
1509 /* </summary>                           */
1510 OPJ_BOOL opj_dwt_encode(opj_tcd_t *p_tcd,
1511                         opj_tcd_tilecomp_t * tilec)
1512 {
1513     return opj_dwt_encode_procedure(p_tcd->thread_pool, tilec,
1514                                     opj_dwt_encode_and_deinterleave_v,
1515                                     opj_dwt_encode_and_deinterleave_h_one_row);
1516 }
1517
1518 /* <summary>                            */
1519 /* Inverse 5-3 wavelet transform in 2-D. */
1520 /* </summary>                           */
1521 OPJ_BOOL opj_dwt_decode(opj_tcd_t *p_tcd, opj_tcd_tilecomp_t* tilec,
1522                         OPJ_UINT32 numres)
1523 {
1524     if (p_tcd->whole_tile_decoding) {
1525         return opj_dwt_decode_tile(p_tcd->thread_pool, tilec, numres);
1526     } else {
1527         return opj_dwt_decode_partial_tile(tilec, numres);
1528     }
1529 }
1530
1531 /* <summary>                */
1532 /* Get norm of 5-3 wavelet. */
1533 /* </summary>               */
1534 OPJ_FLOAT64 opj_dwt_getnorm(OPJ_UINT32 level, OPJ_UINT32 orient)
1535 {
1536     /* FIXME ! This is just a band-aid to avoid a buffer overflow */
1537     /* but the array should really be extended up to 33 resolution levels */
1538     /* See https://github.com/uclouvain/openjpeg/issues/493 */
1539     if (orient == 0 && level >= 10) {
1540         level = 9;
1541     } else if (orient > 0 && level >= 9) {
1542         level = 8;
1543     }
1544     return opj_dwt_norms[orient][level];
1545 }
1546
1547 /* <summary>                             */
1548 /* Forward 9-7 wavelet transform in 2-D. */
1549 /* </summary>                            */
1550 OPJ_BOOL opj_dwt_encode_real(opj_tcd_t *p_tcd,
1551                              opj_tcd_tilecomp_t * tilec)
1552 {
1553     return opj_dwt_encode_procedure(p_tcd->thread_pool, tilec,
1554                                     opj_dwt_encode_and_deinterleave_v_real,
1555                                     opj_dwt_encode_and_deinterleave_h_one_row_real);
1556 }
1557
1558 /* <summary>                */
1559 /* Get norm of 9-7 wavelet. */
1560 /* </summary>               */
1561 OPJ_FLOAT64 opj_dwt_getnorm_real(OPJ_UINT32 level, OPJ_UINT32 orient)
1562 {
1563     /* FIXME ! This is just a band-aid to avoid a buffer overflow */
1564     /* but the array should really be extended up to 33 resolution levels */
1565     /* See https://github.com/uclouvain/openjpeg/issues/493 */
1566     if (orient == 0 && level >= 10) {
1567         level = 9;
1568     } else if (orient > 0 && level >= 9) {
1569         level = 8;
1570     }
1571     return opj_dwt_norms_real[orient][level];
1572 }
1573
1574 void opj_dwt_calc_explicit_stepsizes(opj_tccp_t * tccp, OPJ_UINT32 prec)
1575 {
1576     OPJ_UINT32 numbands, bandno;
1577     numbands = 3 * tccp->numresolutions - 2;
1578     for (bandno = 0; bandno < numbands; bandno++) {
1579         OPJ_FLOAT64 stepsize;
1580         OPJ_UINT32 resno, level, orient, gain;
1581
1582         resno = (bandno == 0) ? 0 : ((bandno - 1) / 3 + 1);
1583         orient = (bandno == 0) ? 0 : ((bandno - 1) % 3 + 1);
1584         level = tccp->numresolutions - 1 - resno;
1585         gain = (tccp->qmfbid == 0) ? 0 : ((orient == 0) ? 0 : (((orient == 1) ||
1586                                           (orient == 2)) ? 1 : 2));
1587         if (tccp->qntsty == J2K_CCP_QNTSTY_NOQNT) {
1588             stepsize = 1.0;
1589         } else {
1590             OPJ_FLOAT64 norm = opj_dwt_norms_real[orient][level];
1591             stepsize = (1 << (gain)) / norm;
1592         }
1593         opj_dwt_encode_stepsize((OPJ_INT32) floor(stepsize * 8192.0),
1594                                 (OPJ_INT32)(prec + gain), &tccp->stepsizes[bandno]);
1595     }
1596 }
1597
1598 /* <summary>                             */
1599 /* Determine maximum computed resolution level for inverse wavelet transform */
1600 /* </summary>                            */
1601 static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r,
1602         OPJ_UINT32 i)
1603 {
1604     OPJ_UINT32 mr   = 0;
1605     OPJ_UINT32 w;
1606     while (--i) {
1607         ++r;
1608         if (mr < (w = (OPJ_UINT32)(r->x1 - r->x0))) {
1609             mr = w ;
1610         }
1611         if (mr < (w = (OPJ_UINT32)(r->y1 - r->y0))) {
1612             mr = w ;
1613         }
1614     }
1615     return mr ;
1616 }
1617
1618 typedef struct {
1619     opj_dwt_t h;
1620     OPJ_UINT32 rw;
1621     OPJ_UINT32 w;
1622     OPJ_INT32 * OPJ_RESTRICT tiledp;
1623     OPJ_UINT32 min_j;
1624     OPJ_UINT32 max_j;
1625 } opj_dwt_decode_h_job_t;
1626
1627 static void opj_dwt_decode_h_func(void* user_data, opj_tls_t* tls)
1628 {
1629     OPJ_UINT32 j;
1630     opj_dwt_decode_h_job_t* job;
1631     (void)tls;
1632
1633     job = (opj_dwt_decode_h_job_t*)user_data;
1634     for (j = job->min_j; j < job->max_j; j++) {
1635         opj_idwt53_h(&job->h, &job->tiledp[j * job->w]);
1636     }
1637
1638     opj_aligned_free(job->h.mem);
1639     opj_free(job);
1640 }
1641
1642 typedef struct {
1643     opj_dwt_t v;
1644     OPJ_UINT32 rh;
1645     OPJ_UINT32 w;
1646     OPJ_INT32 * OPJ_RESTRICT tiledp;
1647     OPJ_UINT32 min_j;
1648     OPJ_UINT32 max_j;
1649 } opj_dwt_decode_v_job_t;
1650
1651 static void opj_dwt_decode_v_func(void* user_data, opj_tls_t* tls)
1652 {
1653     OPJ_UINT32 j;
1654     opj_dwt_decode_v_job_t* job;
1655     (void)tls;
1656
1657     job = (opj_dwt_decode_v_job_t*)user_data;
1658     for (j = job->min_j; j + PARALLEL_COLS_53 <= job->max_j;
1659             j += PARALLEL_COLS_53) {
1660         opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_SIZE_T)job->w,
1661                      PARALLEL_COLS_53);
1662     }
1663     if (j < job->max_j)
1664         opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_SIZE_T)job->w,
1665                      (OPJ_INT32)(job->max_j - j));
1666
1667     opj_aligned_free(job->v.mem);
1668     opj_free(job);
1669 }
1670
1671
1672 /* <summary>                            */
1673 /* Inverse wavelet transform in 2-D.    */
1674 /* </summary>                           */
1675 static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp,
1676                                     opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres)
1677 {
1678     opj_dwt_t h;
1679     opj_dwt_t v;
1680
1681     opj_tcd_resolution_t* tr = tilec->resolutions;
1682
1683     OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 -
1684                                  tr->x0);  /* width of the resolution level computed */
1685     OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 -
1686                                  tr->y0);  /* height of the resolution level computed */
1687
1688     OPJ_UINT32 w = (OPJ_UINT32)(tilec->resolutions[tilec->minimum_num_resolutions -
1689                                                                1].x1 -
1690                                 tilec->resolutions[tilec->minimum_num_resolutions - 1].x0);
1691     OPJ_SIZE_T h_mem_size;
1692     int num_threads;
1693
1694     if (numres == 1U) {
1695         return OPJ_TRUE;
1696     }
1697     num_threads = opj_thread_pool_get_thread_count(tp);
1698     h_mem_size = opj_dwt_max_resolution(tr, numres);
1699     /* overflow check */
1700     if (h_mem_size > (SIZE_MAX / PARALLEL_COLS_53 / sizeof(OPJ_INT32))) {
1701         /* FIXME event manager error callback */
1702         return OPJ_FALSE;
1703     }
1704     /* We need PARALLEL_COLS_53 times the height of the array, */
1705     /* since for the vertical pass */
1706     /* we process PARALLEL_COLS_53 columns at a time */
1707     h_mem_size *= PARALLEL_COLS_53 * sizeof(OPJ_INT32);
1708     h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
1709     if (! h.mem) {
1710         /* FIXME event manager error callback */
1711         return OPJ_FALSE;
1712     }
1713
1714     v.mem = h.mem;
1715
1716     while (--numres) {
1717         OPJ_INT32 * OPJ_RESTRICT tiledp = tilec->data;
1718         OPJ_UINT32 j;
1719
1720         ++tr;
1721         h.sn = (OPJ_INT32)rw;
1722         v.sn = (OPJ_INT32)rh;
1723
1724         rw = (OPJ_UINT32)(tr->x1 - tr->x0);
1725         rh = (OPJ_UINT32)(tr->y1 - tr->y0);
1726
1727         h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
1728         h.cas = tr->x0 % 2;
1729
1730         if (num_threads <= 1 || rh <= 1) {
1731             for (j = 0; j < rh; ++j) {
1732                 opj_idwt53_h(&h, &tiledp[(OPJ_SIZE_T)j * w]);
1733             }
1734         } else {
1735             OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
1736             OPJ_UINT32 step_j;
1737
1738             if (rh < num_jobs) {
1739                 num_jobs = rh;
1740             }
1741             step_j = (rh / num_jobs);
1742
1743             for (j = 0; j < num_jobs; j++) {
1744                 opj_dwt_decode_h_job_t* job;
1745
1746                 job = (opj_dwt_decode_h_job_t*) opj_malloc(sizeof(opj_dwt_decode_h_job_t));
1747                 if (!job) {
1748                     /* It would be nice to fallback to single thread case, but */
1749                     /* unfortunately some jobs may be launched and have modified */
1750                     /* tiledp, so it is not practical to recover from that error */
1751                     /* FIXME event manager error callback */
1752                     opj_thread_pool_wait_completion(tp, 0);
1753                     opj_aligned_free(h.mem);
1754                     return OPJ_FALSE;
1755                 }
1756                 job->h = h;
1757                 job->rw = rw;
1758                 job->w = w;
1759                 job->tiledp = tiledp;
1760                 job->min_j = j * step_j;
1761                 job->max_j = (j + 1U) * step_j; /* this can overflow */
1762                 if (j == (num_jobs - 1U)) {  /* this will take care of the overflow */
1763                     job->max_j = rh;
1764                 }
1765                 job->h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
1766                 if (!job->h.mem) {
1767                     /* FIXME event manager error callback */
1768                     opj_thread_pool_wait_completion(tp, 0);
1769                     opj_free(job);
1770                     opj_aligned_free(h.mem);
1771                     return OPJ_FALSE;
1772                 }
1773                 opj_thread_pool_submit_job(tp, opj_dwt_decode_h_func, job);
1774             }
1775             opj_thread_pool_wait_completion(tp, 0);
1776         }
1777
1778         v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
1779         v.cas = tr->y0 % 2;
1780
1781         if (num_threads <= 1 || rw <= 1) {
1782             for (j = 0; j + PARALLEL_COLS_53 <= rw;
1783                     j += PARALLEL_COLS_53) {
1784                 opj_idwt53_v(&v, &tiledp[j], (OPJ_SIZE_T)w, PARALLEL_COLS_53);
1785             }
1786             if (j < rw) {
1787                 opj_idwt53_v(&v, &tiledp[j], (OPJ_SIZE_T)w, (OPJ_INT32)(rw - j));
1788             }
1789         } else {
1790             OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
1791             OPJ_UINT32 step_j;
1792
1793             if (rw < num_jobs) {
1794                 num_jobs = rw;
1795             }
1796             step_j = (rw / num_jobs);
1797
1798             for (j = 0; j < num_jobs; j++) {
1799                 opj_dwt_decode_v_job_t* job;
1800
1801                 job = (opj_dwt_decode_v_job_t*) opj_malloc(sizeof(opj_dwt_decode_v_job_t));
1802                 if (!job) {
1803                     /* It would be nice to fallback to single thread case, but */
1804                     /* unfortunately some jobs may be launched and have modified */
1805                     /* tiledp, so it is not practical to recover from that error */
1806                     /* FIXME event manager error callback */
1807                     opj_thread_pool_wait_completion(tp, 0);
1808                     opj_aligned_free(v.mem);
1809                     return OPJ_FALSE;
1810                 }
1811                 job->v = v;
1812                 job->rh = rh;
1813                 job->w = w;
1814                 job->tiledp = tiledp;
1815                 job->min_j = j * step_j;
1816                 job->max_j = (j + 1U) * step_j; /* this can overflow */
1817                 if (j == (num_jobs - 1U)) {  /* this will take care of the overflow */
1818                     job->max_j = rw;
1819                 }
1820                 job->v.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
1821                 if (!job->v.mem) {
1822                     /* FIXME event manager error callback */
1823                     opj_thread_pool_wait_completion(tp, 0);
1824                     opj_free(job);
1825                     opj_aligned_free(v.mem);
1826                     return OPJ_FALSE;
1827                 }
1828                 opj_thread_pool_submit_job(tp, opj_dwt_decode_v_func, job);
1829             }
1830             opj_thread_pool_wait_completion(tp, 0);
1831         }
1832     }
1833     opj_aligned_free(h.mem);
1834     return OPJ_TRUE;
1835 }
1836
1837 static void opj_dwt_interleave_partial_h(OPJ_INT32 *dest,
1838         OPJ_INT32 cas,
1839         opj_sparse_array_int32_t* sa,
1840         OPJ_UINT32 sa_line,
1841         OPJ_UINT32 sn,
1842         OPJ_UINT32 win_l_x0,
1843         OPJ_UINT32 win_l_x1,
1844         OPJ_UINT32 win_h_x0,
1845         OPJ_UINT32 win_h_x1)
1846 {
1847     OPJ_BOOL ret;
1848     ret = opj_sparse_array_int32_read(sa,
1849                                       win_l_x0, sa_line,
1850                                       win_l_x1, sa_line + 1,
1851                                       dest + cas + 2 * win_l_x0,
1852                                       2, 0, OPJ_TRUE);
1853     assert(ret);
1854     ret = opj_sparse_array_int32_read(sa,
1855                                       sn + win_h_x0, sa_line,
1856                                       sn + win_h_x1, sa_line + 1,
1857                                       dest + 1 - cas + 2 * win_h_x0,
1858                                       2, 0, OPJ_TRUE);
1859     assert(ret);
1860     OPJ_UNUSED(ret);
1861 }
1862
1863
1864 static void opj_dwt_interleave_partial_v(OPJ_INT32 *dest,
1865         OPJ_INT32 cas,
1866         opj_sparse_array_int32_t* sa,
1867         OPJ_UINT32 sa_col,
1868         OPJ_UINT32 nb_cols,
1869         OPJ_UINT32 sn,
1870         OPJ_UINT32 win_l_y0,
1871         OPJ_UINT32 win_l_y1,
1872         OPJ_UINT32 win_h_y0,
1873         OPJ_UINT32 win_h_y1)
1874 {
1875     OPJ_BOOL ret;
1876     ret  = opj_sparse_array_int32_read(sa,
1877                                        sa_col, win_l_y0,
1878                                        sa_col + nb_cols, win_l_y1,
1879                                        dest + cas * 4 + 2 * 4 * win_l_y0,
1880                                        1, 2 * 4, OPJ_TRUE);
1881     assert(ret);
1882     ret = opj_sparse_array_int32_read(sa,
1883                                       sa_col, sn + win_h_y0,
1884                                       sa_col + nb_cols, sn + win_h_y1,
1885                                       dest + (1 - cas) * 4 + 2 * 4 * win_h_y0,
1886                                       1, 2 * 4, OPJ_TRUE);
1887     assert(ret);
1888     OPJ_UNUSED(ret);
1889 }
1890
1891 static void opj_dwt_decode_partial_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
1892                                      OPJ_INT32 cas,
1893                                      OPJ_INT32 win_l_x0,
1894                                      OPJ_INT32 win_l_x1,
1895                                      OPJ_INT32 win_h_x0,
1896                                      OPJ_INT32 win_h_x1)
1897 {
1898     OPJ_INT32 i;
1899
1900     if (!cas) {
1901         if ((dn > 0) || (sn > 1)) { /* NEW :  CASE ONE ELEMENT */
1902
1903             /* Naive version is :
1904             for (i = win_l_x0; i < i_max; i++) {
1905                 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
1906             }
1907             for (i = win_h_x0; i < win_h_x1; i++) {
1908                 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
1909             }
1910             but the compiler doesn't manage to unroll it to avoid bound
1911             checking in OPJ_S_ and OPJ_D_ macros
1912             */
1913
1914             i = win_l_x0;
1915             if (i < win_l_x1) {
1916                 OPJ_INT32 i_max;
1917
1918                 /* Left-most case */
1919                 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
1920                 i ++;
1921
1922                 i_max = win_l_x1;
1923                 if (i_max > dn) {
1924                     i_max = dn;
1925                 }
1926                 for (; i < i_max; i++) {
1927                     /* No bound checking */
1928                     OPJ_S(i) -= (OPJ_D(i - 1) + OPJ_D(i) + 2) >> 2;
1929                 }
1930                 for (; i < win_l_x1; i++) {
1931                     /* Right-most case */
1932                     OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
1933                 }
1934             }
1935
1936             i = win_h_x0;
1937             if (i < win_h_x1) {
1938                 OPJ_INT32 i_max = win_h_x1;
1939                 if (i_max >= sn) {
1940                     i_max = sn - 1;
1941                 }
1942                 for (; i < i_max; i++) {
1943                     /* No bound checking */
1944                     OPJ_D(i) += (OPJ_S(i) + OPJ_S(i + 1)) >> 1;
1945                 }
1946                 for (; i < win_h_x1; i++) {
1947                     /* Right-most case */
1948                     OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
1949                 }
1950             }
1951         }
1952     } else {
1953         if (!sn  && dn == 1) {        /* NEW :  CASE ONE ELEMENT */
1954             OPJ_S(0) /= 2;
1955         } else {
1956             for (i = win_l_x0; i < win_l_x1; i++) {
1957                 OPJ_D(i) -= (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2;
1958             }
1959             for (i = win_h_x0; i < win_h_x1; i++) {
1960                 OPJ_S(i) += (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1;
1961             }
1962         }
1963     }
1964 }
1965
1966 #define OPJ_S_off(i,off) a[(OPJ_UINT32)(i)*2*4+off]
1967 #define OPJ_D_off(i,off) a[(1+(OPJ_UINT32)(i)*2)*4+off]
1968 #define OPJ_S__off(i,off) ((i)<0?OPJ_S_off(0,off):((i)>=sn?OPJ_S_off(sn-1,off):OPJ_S_off(i,off)))
1969 #define OPJ_D__off(i,off) ((i)<0?OPJ_D_off(0,off):((i)>=dn?OPJ_D_off(dn-1,off):OPJ_D_off(i,off)))
1970 #define OPJ_SS__off(i,off) ((i)<0?OPJ_S_off(0,off):((i)>=dn?OPJ_S_off(dn-1,off):OPJ_S_off(i,off)))
1971 #define OPJ_DD__off(i,off) ((i)<0?OPJ_D_off(0,off):((i)>=sn?OPJ_D_off(sn-1,off):OPJ_D_off(i,off)))
1972
1973 static void opj_dwt_decode_partial_1_parallel(OPJ_INT32 *a,
1974         OPJ_UINT32 nb_cols,
1975         OPJ_INT32 dn, OPJ_INT32 sn,
1976         OPJ_INT32 cas,
1977         OPJ_INT32 win_l_x0,
1978         OPJ_INT32 win_l_x1,
1979         OPJ_INT32 win_h_x0,
1980         OPJ_INT32 win_h_x1)
1981 {
1982     OPJ_INT32 i;
1983     OPJ_UINT32 off;
1984
1985     (void)nb_cols;
1986
1987     if (!cas) {
1988         if ((dn > 0) || (sn > 1)) { /* NEW :  CASE ONE ELEMENT */
1989
1990             /* Naive version is :
1991             for (i = win_l_x0; i < i_max; i++) {
1992                 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
1993             }
1994             for (i = win_h_x0; i < win_h_x1; i++) {
1995                 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
1996             }
1997             but the compiler doesn't manage to unroll it to avoid bound
1998             checking in OPJ_S_ and OPJ_D_ macros
1999             */
2000
2001             i = win_l_x0;
2002             if (i < win_l_x1) {
2003                 OPJ_INT32 i_max;
2004
2005                 /* Left-most case */
2006                 for (off = 0; off < 4; off++) {
2007                     OPJ_S_off(i, off) -= (OPJ_D__off(i - 1, off) + OPJ_D__off(i, off) + 2) >> 2;
2008                 }
2009                 i ++;
2010
2011                 i_max = win_l_x1;
2012                 if (i_max > dn) {
2013                     i_max = dn;
2014                 }
2015
2016 #ifdef __SSE2__
2017                 if (i + 1 < i_max) {
2018                     const __m128i two = _mm_set1_epi32(2);
2019                     __m128i Dm1 = _mm_load_si128((__m128i * const)(a + 4 + (i - 1) * 8));
2020                     for (; i + 1 < i_max; i += 2) {
2021                         /* No bound checking */
2022                         __m128i S = _mm_load_si128((__m128i * const)(a + i * 8));
2023                         __m128i D = _mm_load_si128((__m128i * const)(a + 4 + i * 8));
2024                         __m128i S1 = _mm_load_si128((__m128i * const)(a + (i + 1) * 8));
2025                         __m128i D1 = _mm_load_si128((__m128i * const)(a + 4 + (i + 1) * 8));
2026                         S = _mm_sub_epi32(S,
2027                                           _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(Dm1, D), two), 2));
2028                         S1 = _mm_sub_epi32(S1,
2029                                            _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(D, D1), two), 2));
2030                         _mm_store_si128((__m128i*)(a + i * 8), S);
2031                         _mm_store_si128((__m128i*)(a + (i + 1) * 8), S1);
2032                         Dm1 = D1;
2033                     }
2034                 }
2035 #endif
2036
2037                 for (; i < i_max; i++) {
2038                     /* No bound checking */
2039                     for (off = 0; off < 4; off++) {
2040                         OPJ_S_off(i, off) -= (OPJ_D_off(i - 1, off) + OPJ_D_off(i, off) + 2) >> 2;
2041                     }
2042                 }
2043                 for (; i < win_l_x1; i++) {
2044                     /* Right-most case */
2045                     for (off = 0; off < 4; off++) {
2046                         OPJ_S_off(i, off) -= (OPJ_D__off(i - 1, off) + OPJ_D__off(i, off) + 2) >> 2;
2047                     }
2048                 }
2049             }
2050
2051             i = win_h_x0;
2052             if (i < win_h_x1) {
2053                 OPJ_INT32 i_max = win_h_x1;
2054                 if (i_max >= sn) {
2055                     i_max = sn - 1;
2056                 }
2057
2058 #ifdef __SSE2__
2059                 if (i + 1 < i_max) {
2060                     __m128i S =  _mm_load_si128((__m128i * const)(a + i * 8));
2061                     for (; i + 1 < i_max; i += 2) {
2062                         /* No bound checking */
2063                         __m128i D = _mm_load_si128((__m128i * const)(a + 4 + i * 8));
2064                         __m128i S1 = _mm_load_si128((__m128i * const)(a + (i + 1) * 8));
2065                         __m128i D1 = _mm_load_si128((__m128i * const)(a + 4 + (i + 1) * 8));
2066                         __m128i S2 = _mm_load_si128((__m128i * const)(a + (i + 2) * 8));
2067                         D = _mm_add_epi32(D, _mm_srai_epi32(_mm_add_epi32(S, S1), 1));
2068                         D1 = _mm_add_epi32(D1, _mm_srai_epi32(_mm_add_epi32(S1, S2), 1));
2069                         _mm_store_si128((__m128i*)(a + 4 + i * 8), D);
2070                         _mm_store_si128((__m128i*)(a + 4 + (i + 1) * 8), D1);
2071                         S = S2;
2072                     }
2073                 }
2074 #endif
2075
2076                 for (; i < i_max; i++) {
2077                     /* No bound checking */
2078                     for (off = 0; off < 4; off++) {
2079                         OPJ_D_off(i, off) += (OPJ_S_off(i, off) + OPJ_S_off(i + 1, off)) >> 1;
2080                     }
2081                 }
2082                 for (; i < win_h_x1; i++) {
2083                     /* Right-most case */
2084                     for (off = 0; off < 4; off++) {
2085                         OPJ_D_off(i, off) += (OPJ_S__off(i, off) + OPJ_S__off(i + 1, off)) >> 1;
2086                     }
2087                 }
2088             }
2089         }
2090     } else {
2091         if (!sn  && dn == 1) {        /* NEW :  CASE ONE ELEMENT */
2092             for (off = 0; off < 4; off++) {
2093                 OPJ_S_off(0, off) /= 2;
2094             }
2095         } else {
2096             for (i = win_l_x0; i < win_l_x1; i++) {
2097                 for (off = 0; off < 4; off++) {
2098                     OPJ_D_off(i, off) -= (OPJ_SS__off(i, off) + OPJ_SS__off(i + 1, off) + 2) >> 2;
2099                 }
2100             }
2101             for (i = win_h_x0; i < win_h_x1; i++) {
2102                 for (off = 0; off < 4; off++) {
2103                     OPJ_S_off(i, off) += (OPJ_DD__off(i, off) + OPJ_DD__off(i - 1, off)) >> 1;
2104                 }
2105             }
2106         }
2107     }
2108 }
2109
2110 static void opj_dwt_get_band_coordinates(opj_tcd_tilecomp_t* tilec,
2111         OPJ_UINT32 resno,
2112         OPJ_UINT32 bandno,
2113         OPJ_UINT32 tcx0,
2114         OPJ_UINT32 tcy0,
2115         OPJ_UINT32 tcx1,
2116         OPJ_UINT32 tcy1,
2117         OPJ_UINT32* tbx0,
2118         OPJ_UINT32* tby0,
2119         OPJ_UINT32* tbx1,
2120         OPJ_UINT32* tby1)
2121 {
2122     /* Compute number of decomposition for this band. See table F-1 */
2123     OPJ_UINT32 nb = (resno == 0) ?
2124                     tilec->numresolutions - 1 :
2125                     tilec->numresolutions - resno;
2126     /* Map above tile-based coordinates to sub-band-based coordinates per */
2127     /* equation B-15 of the standard */
2128     OPJ_UINT32 x0b = bandno & 1;
2129     OPJ_UINT32 y0b = bandno >> 1;
2130     if (tbx0) {
2131         *tbx0 = (nb == 0) ? tcx0 :
2132                 (tcx0 <= (1U << (nb - 1)) * x0b) ? 0 :
2133                 opj_uint_ceildivpow2(tcx0 - (1U << (nb - 1)) * x0b, nb);
2134     }
2135     if (tby0) {
2136         *tby0 = (nb == 0) ? tcy0 :
2137                 (tcy0 <= (1U << (nb - 1)) * y0b) ? 0 :
2138                 opj_uint_ceildivpow2(tcy0 - (1U << (nb - 1)) * y0b, nb);
2139     }
2140     if (tbx1) {
2141         *tbx1 = (nb == 0) ? tcx1 :
2142                 (tcx1 <= (1U << (nb - 1)) * x0b) ? 0 :
2143                 opj_uint_ceildivpow2(tcx1 - (1U << (nb - 1)) * x0b, nb);
2144     }
2145     if (tby1) {
2146         *tby1 = (nb == 0) ? tcy1 :
2147                 (tcy1 <= (1U << (nb - 1)) * y0b) ? 0 :
2148                 opj_uint_ceildivpow2(tcy1 - (1U << (nb - 1)) * y0b, nb);
2149     }
2150 }
2151
2152 static void opj_dwt_segment_grow(OPJ_UINT32 filter_width,
2153                                  OPJ_UINT32 max_size,
2154                                  OPJ_UINT32* start,
2155                                  OPJ_UINT32* end)
2156 {
2157     *start = opj_uint_subs(*start, filter_width);
2158     *end = opj_uint_adds(*end, filter_width);
2159     *end = opj_uint_min(*end, max_size);
2160 }
2161
2162
2163 static opj_sparse_array_int32_t* opj_dwt_init_sparse_array(
2164     opj_tcd_tilecomp_t* tilec,
2165     OPJ_UINT32 numres)
2166 {
2167     opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]);
2168     OPJ_UINT32 w = (OPJ_UINT32)(tr_max->x1 - tr_max->x0);
2169     OPJ_UINT32 h = (OPJ_UINT32)(tr_max->y1 - tr_max->y0);
2170     OPJ_UINT32 resno, bandno, precno, cblkno;
2171     opj_sparse_array_int32_t* sa = opj_sparse_array_int32_create(
2172                                        w, h, opj_uint_min(w, 64), opj_uint_min(h, 64));
2173     if (sa == NULL) {
2174         return NULL;
2175     }
2176
2177     for (resno = 0; resno < numres; ++resno) {
2178         opj_tcd_resolution_t* res = &tilec->resolutions[resno];
2179
2180         for (bandno = 0; bandno < res->numbands; ++bandno) {
2181             opj_tcd_band_t* band = &res->bands[bandno];
2182
2183             for (precno = 0; precno < res->pw * res->ph; ++precno) {
2184                 opj_tcd_precinct_t* precinct = &band->precincts[precno];
2185                 for (cblkno = 0; cblkno < precinct->cw * precinct->ch; ++cblkno) {
2186                     opj_tcd_cblk_dec_t* cblk = &precinct->cblks.dec[cblkno];
2187                     if (cblk->decoded_data != NULL) {
2188                         OPJ_UINT32 x = (OPJ_UINT32)(cblk->x0 - band->x0);
2189                         OPJ_UINT32 y = (OPJ_UINT32)(cblk->y0 - band->y0);
2190                         OPJ_UINT32 cblk_w = (OPJ_UINT32)(cblk->x1 - cblk->x0);
2191                         OPJ_UINT32 cblk_h = (OPJ_UINT32)(cblk->y1 - cblk->y0);
2192
2193                         if (band->bandno & 1) {
2194                             opj_tcd_resolution_t* pres = &tilec->resolutions[resno - 1];
2195                             x += (OPJ_UINT32)(pres->x1 - pres->x0);
2196                         }
2197                         if (band->bandno & 2) {
2198                             opj_tcd_resolution_t* pres = &tilec->resolutions[resno - 1];
2199                             y += (OPJ_UINT32)(pres->y1 - pres->y0);
2200                         }
2201
2202                         if (!opj_sparse_array_int32_write(sa, x, y,
2203                                                           x + cblk_w, y + cblk_h,
2204                                                           cblk->decoded_data,
2205                                                           1, cblk_w, OPJ_TRUE)) {
2206                             opj_sparse_array_int32_free(sa);
2207                             return NULL;
2208                         }
2209                     }
2210                 }
2211             }
2212         }
2213     }
2214
2215     return sa;
2216 }
2217
2218
2219 static OPJ_BOOL opj_dwt_decode_partial_tile(
2220     opj_tcd_tilecomp_t* tilec,
2221     OPJ_UINT32 numres)
2222 {
2223     opj_sparse_array_int32_t* sa;
2224     opj_dwt_t h;
2225     opj_dwt_t v;
2226     OPJ_UINT32 resno;
2227     /* This value matches the maximum left/right extension given in tables */
2228     /* F.2 and F.3 of the standard. */
2229     const OPJ_UINT32 filter_width = 2U;
2230
2231     opj_tcd_resolution_t* tr = tilec->resolutions;
2232     opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]);
2233
2234     OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 -
2235                                  tr->x0);  /* width of the resolution level computed */
2236     OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 -
2237                                  tr->y0);  /* height of the resolution level computed */
2238
2239     OPJ_SIZE_T h_mem_size;
2240
2241     /* Compute the intersection of the area of interest, expressed in tile coordinates */
2242     /* with the tile coordinates */
2243     OPJ_UINT32 win_tcx0 = tilec->win_x0;
2244     OPJ_UINT32 win_tcy0 = tilec->win_y0;
2245     OPJ_UINT32 win_tcx1 = tilec->win_x1;
2246     OPJ_UINT32 win_tcy1 = tilec->win_y1;
2247
2248     if (tr_max->x0 == tr_max->x1 || tr_max->y0 == tr_max->y1) {
2249         return OPJ_TRUE;
2250     }
2251
2252     sa = opj_dwt_init_sparse_array(tilec, numres);
2253     if (sa == NULL) {
2254         return OPJ_FALSE;
2255     }
2256
2257     if (numres == 1U) {
2258         OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
2259                        tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
2260                        tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
2261                        tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
2262                        tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
2263                        tilec->data_win,
2264                        1, tr_max->win_x1 - tr_max->win_x0,
2265                        OPJ_TRUE);
2266         assert(ret);
2267         OPJ_UNUSED(ret);
2268         opj_sparse_array_int32_free(sa);
2269         return OPJ_TRUE;
2270     }
2271     h_mem_size = opj_dwt_max_resolution(tr, numres);
2272     /* overflow check */
2273     /* in vertical pass, we process 4 columns at a time */
2274     if (h_mem_size > (SIZE_MAX / (4 * sizeof(OPJ_INT32)))) {
2275         /* FIXME event manager error callback */
2276         opj_sparse_array_int32_free(sa);
2277         return OPJ_FALSE;
2278     }
2279
2280     h_mem_size *= 4 * sizeof(OPJ_INT32);
2281     h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
2282     if (! h.mem) {
2283         /* FIXME event manager error callback */
2284         opj_sparse_array_int32_free(sa);
2285         return OPJ_FALSE;
2286     }
2287
2288     v.mem = h.mem;
2289
2290     for (resno = 1; resno < numres; resno ++) {
2291         OPJ_UINT32 i, j;
2292         /* Window of interest subband-based coordinates */
2293         OPJ_UINT32 win_ll_x0, win_ll_y0, win_ll_x1, win_ll_y1;
2294         OPJ_UINT32 win_hl_x0, win_hl_x1;
2295         OPJ_UINT32 win_lh_y0, win_lh_y1;
2296         /* Window of interest tile-resolution-based coordinates */
2297         OPJ_UINT32 win_tr_x0, win_tr_x1, win_tr_y0, win_tr_y1;
2298         /* Tile-resolution subband-based coordinates */
2299         OPJ_UINT32 tr_ll_x0, tr_ll_y0, tr_hl_x0, tr_lh_y0;
2300
2301         ++tr;
2302
2303         h.sn = (OPJ_INT32)rw;
2304         v.sn = (OPJ_INT32)rh;
2305
2306         rw = (OPJ_UINT32)(tr->x1 - tr->x0);
2307         rh = (OPJ_UINT32)(tr->y1 - tr->y0);
2308
2309         h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
2310         h.cas = tr->x0 % 2;
2311
2312         v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
2313         v.cas = tr->y0 % 2;
2314
2315         /* Get the subband coordinates for the window of interest */
2316         /* LL band */
2317         opj_dwt_get_band_coordinates(tilec, resno, 0,
2318                                      win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2319                                      &win_ll_x0, &win_ll_y0,
2320                                      &win_ll_x1, &win_ll_y1);
2321
2322         /* HL band */
2323         opj_dwt_get_band_coordinates(tilec, resno, 1,
2324                                      win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2325                                      &win_hl_x0, NULL, &win_hl_x1, NULL);
2326
2327         /* LH band */
2328         opj_dwt_get_band_coordinates(tilec, resno, 2,
2329                                      win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2330                                      NULL, &win_lh_y0, NULL, &win_lh_y1);
2331
2332         /* Beware: band index for non-LL0 resolution are 0=HL, 1=LH and 2=HH */
2333         tr_ll_x0 = (OPJ_UINT32)tr->bands[1].x0;
2334         tr_ll_y0 = (OPJ_UINT32)tr->bands[0].y0;
2335         tr_hl_x0 = (OPJ_UINT32)tr->bands[0].x0;
2336         tr_lh_y0 = (OPJ_UINT32)tr->bands[1].y0;
2337
2338         /* Subtract the origin of the bands for this tile, to the subwindow */
2339         /* of interest band coordinates, so as to get them relative to the */
2340         /* tile */
2341         win_ll_x0 = opj_uint_subs(win_ll_x0, tr_ll_x0);
2342         win_ll_y0 = opj_uint_subs(win_ll_y0, tr_ll_y0);
2343         win_ll_x1 = opj_uint_subs(win_ll_x1, tr_ll_x0);
2344         win_ll_y1 = opj_uint_subs(win_ll_y1, tr_ll_y0);
2345         win_hl_x0 = opj_uint_subs(win_hl_x0, tr_hl_x0);
2346         win_hl_x1 = opj_uint_subs(win_hl_x1, tr_hl_x0);
2347         win_lh_y0 = opj_uint_subs(win_lh_y0, tr_lh_y0);
2348         win_lh_y1 = opj_uint_subs(win_lh_y1, tr_lh_y0);
2349
2350         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.sn, &win_ll_x0, &win_ll_x1);
2351         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.dn, &win_hl_x0, &win_hl_x1);
2352
2353         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.sn, &win_ll_y0, &win_ll_y1);
2354         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.dn, &win_lh_y0, &win_lh_y1);
2355
2356         /* Compute the tile-resolution-based coordinates for the window of interest */
2357         if (h.cas == 0) {
2358             win_tr_x0 = opj_uint_min(2 * win_ll_x0, 2 * win_hl_x0 + 1);
2359             win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_ll_x1, 2 * win_hl_x1 + 1), rw);
2360         } else {
2361             win_tr_x0 = opj_uint_min(2 * win_hl_x0, 2 * win_ll_x0 + 1);
2362             win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_hl_x1, 2 * win_ll_x1 + 1), rw);
2363         }
2364
2365         if (v.cas == 0) {
2366             win_tr_y0 = opj_uint_min(2 * win_ll_y0, 2 * win_lh_y0 + 1);
2367             win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_ll_y1, 2 * win_lh_y1 + 1), rh);
2368         } else {
2369             win_tr_y0 = opj_uint_min(2 * win_lh_y0, 2 * win_ll_y0 + 1);
2370             win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_lh_y1, 2 * win_ll_y1 + 1), rh);
2371         }
2372
2373         for (j = 0; j < rh; ++j) {
2374             if ((j >= win_ll_y0 && j < win_ll_y1) ||
2375                     (j >= win_lh_y0 + (OPJ_UINT32)v.sn && j < win_lh_y1 + (OPJ_UINT32)v.sn)) {
2376
2377                 /* Avoids dwt.c:1584:44 (in opj_dwt_decode_partial_1): runtime error: */
2378                 /* signed integer overflow: -1094795586 + -1094795586 cannot be represented in type 'int' */
2379                 /* on opj_decompress -i  ../../openjpeg/MAPA.jp2 -o out.tif -d 0,0,256,256 */
2380                 /* This is less extreme than memsetting the whole buffer to 0 */
2381                 /* although we could potentially do better with better handling of edge conditions */
2382                 if (win_tr_x1 >= 1 && win_tr_x1 < rw) {
2383                     h.mem[win_tr_x1 - 1] = 0;
2384                 }
2385                 if (win_tr_x1 < rw) {
2386                     h.mem[win_tr_x1] = 0;
2387                 }
2388
2389                 opj_dwt_interleave_partial_h(h.mem,
2390                                              h.cas,
2391                                              sa,
2392                                              j,
2393                                              (OPJ_UINT32)h.sn,
2394                                              win_ll_x0,
2395                                              win_ll_x1,
2396                                              win_hl_x0,
2397                                              win_hl_x1);
2398                 opj_dwt_decode_partial_1(h.mem, h.dn, h.sn, h.cas,
2399                                          (OPJ_INT32)win_ll_x0,
2400                                          (OPJ_INT32)win_ll_x1,
2401                                          (OPJ_INT32)win_hl_x0,
2402                                          (OPJ_INT32)win_hl_x1);
2403                 if (!opj_sparse_array_int32_write(sa,
2404                                                   win_tr_x0, j,
2405                                                   win_tr_x1, j + 1,
2406                                                   h.mem + win_tr_x0,
2407                                                   1, 0, OPJ_TRUE)) {
2408                     /* FIXME event manager error callback */
2409                     opj_sparse_array_int32_free(sa);
2410                     opj_aligned_free(h.mem);
2411                     return OPJ_FALSE;
2412                 }
2413             }
2414         }
2415
2416         for (i = win_tr_x0; i < win_tr_x1;) {
2417             OPJ_UINT32 nb_cols = opj_uint_min(4U, win_tr_x1 - i);
2418             opj_dwt_interleave_partial_v(v.mem,
2419                                          v.cas,
2420                                          sa,
2421                                          i,
2422                                          nb_cols,
2423                                          (OPJ_UINT32)v.sn,
2424                                          win_ll_y0,
2425                                          win_ll_y1,
2426                                          win_lh_y0,
2427                                          win_lh_y1);
2428             opj_dwt_decode_partial_1_parallel(v.mem, nb_cols, v.dn, v.sn, v.cas,
2429                                               (OPJ_INT32)win_ll_y0,
2430                                               (OPJ_INT32)win_ll_y1,
2431                                               (OPJ_INT32)win_lh_y0,
2432                                               (OPJ_INT32)win_lh_y1);
2433             if (!opj_sparse_array_int32_write(sa,
2434                                               i, win_tr_y0,
2435                                               i + nb_cols, win_tr_y1,
2436                                               v.mem + 4 * win_tr_y0,
2437                                               1, 4, OPJ_TRUE)) {
2438                 /* FIXME event manager error callback */
2439                 opj_sparse_array_int32_free(sa);
2440                 opj_aligned_free(h.mem);
2441                 return OPJ_FALSE;
2442             }
2443
2444             i += nb_cols;
2445         }
2446     }
2447     opj_aligned_free(h.mem);
2448
2449     {
2450         OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
2451                        tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
2452                        tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
2453                        tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
2454                        tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
2455                        tilec->data_win,
2456                        1, tr_max->win_x1 - tr_max->win_x0,
2457                        OPJ_TRUE);
2458         assert(ret);
2459         OPJ_UNUSED(ret);
2460     }
2461     opj_sparse_array_int32_free(sa);
2462     return OPJ_TRUE;
2463 }
2464
2465 static void opj_v8dwt_interleave_h(opj_v8dwt_t* OPJ_RESTRICT dwt,
2466                                    OPJ_FLOAT32* OPJ_RESTRICT a,
2467                                    OPJ_UINT32 width,
2468                                    OPJ_UINT32 remaining_height)
2469 {
2470     OPJ_FLOAT32* OPJ_RESTRICT bi = (OPJ_FLOAT32*)(dwt->wavelet + dwt->cas);
2471     OPJ_UINT32 i, k;
2472     OPJ_UINT32 x0 = dwt->win_l_x0;
2473     OPJ_UINT32 x1 = dwt->win_l_x1;
2474
2475     for (k = 0; k < 2; ++k) {
2476         if (remaining_height >= NB_ELTS_V8 && ((OPJ_SIZE_T) a & 0x0f) == 0 &&
2477                 ((OPJ_SIZE_T) bi & 0x0f) == 0) {
2478             /* Fast code path */
2479             for (i = x0; i < x1; ++i) {
2480                 OPJ_UINT32 j = i;
2481                 OPJ_FLOAT32* OPJ_RESTRICT dst = bi + i * 2 * NB_ELTS_V8;
2482                 dst[0] = a[j];
2483                 j += width;
2484                 dst[1] = a[j];
2485                 j += width;
2486                 dst[2] = a[j];
2487                 j += width;
2488                 dst[3] = a[j];
2489                 j += width;
2490                 dst[4] = a[j];
2491                 j += width;
2492                 dst[5] = a[j];
2493                 j += width;
2494                 dst[6] = a[j];
2495                 j += width;
2496                 dst[7] = a[j];
2497             }
2498         } else {
2499             /* Slow code path */
2500             for (i = x0; i < x1; ++i) {
2501                 OPJ_UINT32 j = i;
2502                 OPJ_FLOAT32* OPJ_RESTRICT dst = bi + i * 2 * NB_ELTS_V8;
2503                 dst[0] = a[j];
2504                 j += width;
2505                 if (remaining_height == 1) {
2506                     continue;
2507                 }
2508                 dst[1] = a[j];
2509                 j += width;
2510                 if (remaining_height == 2) {
2511                     continue;
2512                 }
2513                 dst[2] = a[j];
2514                 j += width;
2515                 if (remaining_height == 3) {
2516                     continue;
2517                 }
2518                 dst[3] = a[j];
2519                 j += width;
2520                 if (remaining_height == 4) {
2521                     continue;
2522                 }
2523                 dst[4] = a[j];
2524                 j += width;
2525                 if (remaining_height == 5) {
2526                     continue;
2527                 }
2528                 dst[5] = a[j];
2529                 j += width;
2530                 if (remaining_height == 6) {
2531                     continue;
2532                 }
2533                 dst[6] = a[j];
2534                 j += width;
2535                 if (remaining_height == 7) {
2536                     continue;
2537                 }
2538                 dst[7] = a[j];
2539             }
2540         }
2541
2542         bi = (OPJ_FLOAT32*)(dwt->wavelet + 1 - dwt->cas);
2543         a += dwt->sn;
2544         x0 = dwt->win_h_x0;
2545         x1 = dwt->win_h_x1;
2546     }
2547 }
2548
2549 static void opj_v8dwt_interleave_partial_h(opj_v8dwt_t* dwt,
2550         opj_sparse_array_int32_t* sa,
2551         OPJ_UINT32 sa_line,
2552         OPJ_UINT32 remaining_height)
2553 {
2554     OPJ_UINT32 i;
2555     for (i = 0; i < remaining_height; i++) {
2556         OPJ_BOOL ret;
2557         ret = opj_sparse_array_int32_read(sa,
2558                                           dwt->win_l_x0, sa_line + i,
2559                                           dwt->win_l_x1, sa_line + i + 1,
2560                                           /* Nasty cast from float* to int32* */
2561                                           (OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0) + i,
2562                                           2 * NB_ELTS_V8, 0, OPJ_TRUE);
2563         assert(ret);
2564         ret = opj_sparse_array_int32_read(sa,
2565                                           (OPJ_UINT32)dwt->sn + dwt->win_h_x0, sa_line + i,
2566                                           (OPJ_UINT32)dwt->sn + dwt->win_h_x1, sa_line + i + 1,
2567                                           /* Nasty cast from float* to int32* */
2568                                           (OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0) + i,
2569                                           2 * NB_ELTS_V8, 0, OPJ_TRUE);
2570         assert(ret);
2571         OPJ_UNUSED(ret);
2572     }
2573 }
2574
2575 static INLINE void opj_v8dwt_interleave_v(opj_v8dwt_t* OPJ_RESTRICT dwt,
2576         OPJ_FLOAT32* OPJ_RESTRICT a,
2577         OPJ_UINT32 width,
2578         OPJ_UINT32 nb_elts_read)
2579 {
2580     opj_v8_t* OPJ_RESTRICT bi = dwt->wavelet + dwt->cas;
2581     OPJ_UINT32 i;
2582
2583     for (i = dwt->win_l_x0; i < dwt->win_l_x1; ++i) {
2584         memcpy(&bi[i * 2], &a[i * (OPJ_SIZE_T)width],
2585                (OPJ_SIZE_T)nb_elts_read * sizeof(OPJ_FLOAT32));
2586     }
2587
2588     a += (OPJ_UINT32)dwt->sn * (OPJ_SIZE_T)width;
2589     bi = dwt->wavelet + 1 - dwt->cas;
2590
2591     for (i = dwt->win_h_x0; i < dwt->win_h_x1; ++i) {
2592         memcpy(&bi[i * 2], &a[i * (OPJ_SIZE_T)width],
2593                (OPJ_SIZE_T)nb_elts_read * sizeof(OPJ_FLOAT32));
2594     }
2595 }
2596
2597 static void opj_v8dwt_interleave_partial_v(opj_v8dwt_t* OPJ_RESTRICT dwt,
2598         opj_sparse_array_int32_t* sa,
2599         OPJ_UINT32 sa_col,
2600         OPJ_UINT32 nb_elts_read)
2601 {
2602     OPJ_BOOL ret;
2603     ret = opj_sparse_array_int32_read(sa,
2604                                       sa_col, dwt->win_l_x0,
2605                                       sa_col + nb_elts_read, dwt->win_l_x1,
2606                                       (OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0),
2607                                       1, 2 * NB_ELTS_V8, OPJ_TRUE);
2608     assert(ret);
2609     ret = opj_sparse_array_int32_read(sa,
2610                                       sa_col, (OPJ_UINT32)dwt->sn + dwt->win_h_x0,
2611                                       sa_col + nb_elts_read, (OPJ_UINT32)dwt->sn + dwt->win_h_x1,
2612                                       (OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0),
2613                                       1, 2 * NB_ELTS_V8, OPJ_TRUE);
2614     assert(ret);
2615     OPJ_UNUSED(ret);
2616 }
2617
2618 #ifdef __SSE__
2619
2620 static void opj_v8dwt_decode_step1_sse(opj_v8_t* w,
2621                                        OPJ_UINT32 start,
2622                                        OPJ_UINT32 end,
2623                                        const __m128 c)
2624 {
2625     __m128* OPJ_RESTRICT vw = (__m128*) w;
2626     OPJ_UINT32 i = start;
2627     /* To be adapted if NB_ELTS_V8 changes */
2628     vw += 4 * start;
2629     /* Note: attempt at loop unrolling x2 doesn't help */
2630     for (; i < end; ++i, vw += 4) {
2631         vw[0] = _mm_mul_ps(vw[0], c);
2632         vw[1] = _mm_mul_ps(vw[1], c);
2633     }
2634 }
2635
2636 static void opj_v8dwt_decode_step2_sse(opj_v8_t* l, opj_v8_t* w,
2637                                        OPJ_UINT32 start,
2638                                        OPJ_UINT32 end,
2639                                        OPJ_UINT32 m,
2640                                        __m128 c)
2641 {
2642     __m128* OPJ_RESTRICT vl = (__m128*) l;
2643     __m128* OPJ_RESTRICT vw = (__m128*) w;
2644     /* To be adapted if NB_ELTS_V8 changes */
2645     OPJ_UINT32 i;
2646     OPJ_UINT32 imax = opj_uint_min(end, m);
2647     if (start == 0) {
2648         if (imax >= 1) {
2649             vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vl[0], vw[0]), c));
2650             vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vl[1], vw[1]), c));
2651             vw += 4;
2652             start = 1;
2653         }
2654     } else {
2655         vw += start * 4;
2656     }
2657
2658     i = start;
2659     /* Note: attempt at loop unrolling x2 doesn't help */
2660     for (; i < imax; ++i) {
2661         vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vw[-4], vw[0]), c));
2662         vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vw[-3], vw[1]), c));
2663         vw += 4;
2664     }
2665     if (m < end) {
2666         assert(m + 1 == end);
2667         c = _mm_add_ps(c, c);
2668         vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(c, vw[-4]));
2669         vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(c, vw[-3]));
2670     }
2671 }
2672
2673 #else
2674
2675 static void opj_v8dwt_decode_step1(opj_v8_t* w,
2676                                    OPJ_UINT32 start,
2677                                    OPJ_UINT32 end,
2678                                    const OPJ_FLOAT32 c)
2679 {
2680     OPJ_FLOAT32* OPJ_RESTRICT fw = (OPJ_FLOAT32*) w;
2681     OPJ_UINT32 i;
2682     /* To be adapted if NB_ELTS_V8 changes */
2683     for (i = start; i < end; ++i) {
2684         fw[i * 2 * 8    ] = fw[i * 2 * 8    ] * c;
2685         fw[i * 2 * 8 + 1] = fw[i * 2 * 8 + 1] * c;
2686         fw[i * 2 * 8 + 2] = fw[i * 2 * 8 + 2] * c;
2687         fw[i * 2 * 8 + 3] = fw[i * 2 * 8 + 3] * c;
2688         fw[i * 2 * 8 + 4] = fw[i * 2 * 8 + 4] * c;
2689         fw[i * 2 * 8 + 5] = fw[i * 2 * 8 + 5] * c;
2690         fw[i * 2 * 8 + 6] = fw[i * 2 * 8 + 6] * c;
2691         fw[i * 2 * 8 + 7] = fw[i * 2 * 8 + 7] * c;
2692     }
2693 }
2694
2695 static void opj_v8dwt_decode_step2(opj_v8_t* l, opj_v8_t* w,
2696                                    OPJ_UINT32 start,
2697                                    OPJ_UINT32 end,
2698                                    OPJ_UINT32 m,
2699                                    OPJ_FLOAT32 c)
2700 {
2701     OPJ_FLOAT32* fl = (OPJ_FLOAT32*) l;
2702     OPJ_FLOAT32* fw = (OPJ_FLOAT32*) w;
2703     OPJ_UINT32 i;
2704     OPJ_UINT32 imax = opj_uint_min(end, m);
2705     if (start > 0) {
2706         fw += 2 * NB_ELTS_V8 * start;
2707         fl = fw - 2 * NB_ELTS_V8;
2708     }
2709     /* To be adapted if NB_ELTS_V8 changes */
2710     for (i = start; i < imax; ++i) {
2711         fw[-8] = fw[-8] + ((fl[0] + fw[0]) * c);
2712         fw[-7] = fw[-7] + ((fl[1] + fw[1]) * c);
2713         fw[-6] = fw[-6] + ((fl[2] + fw[2]) * c);
2714         fw[-5] = fw[-5] + ((fl[3] + fw[3]) * c);
2715         fw[-4] = fw[-4] + ((fl[4] + fw[4]) * c);
2716         fw[-3] = fw[-3] + ((fl[5] + fw[5]) * c);
2717         fw[-2] = fw[-2] + ((fl[6] + fw[6]) * c);
2718         fw[-1] = fw[-1] + ((fl[7] + fw[7]) * c);
2719         fl = fw;
2720         fw += 2 * NB_ELTS_V8;
2721     }
2722     if (m < end) {
2723         assert(m + 1 == end);
2724         c += c;
2725         fw[-8] = fw[-8] + fl[0] * c;
2726         fw[-7] = fw[-7] + fl[1] * c;
2727         fw[-6] = fw[-6] + fl[2] * c;
2728         fw[-5] = fw[-5] + fl[3] * c;
2729         fw[-4] = fw[-4] + fl[4] * c;
2730         fw[-3] = fw[-3] + fl[5] * c;
2731         fw[-2] = fw[-2] + fl[6] * c;
2732         fw[-1] = fw[-1] + fl[7] * c;
2733     }
2734 }
2735
2736 #endif
2737
2738 /* <summary>                             */
2739 /* Inverse 9-7 wavelet transform in 1-D. */
2740 /* </summary>                            */
2741 static void opj_v8dwt_decode(opj_v8dwt_t* OPJ_RESTRICT dwt)
2742 {
2743     OPJ_INT32 a, b;
2744     /* BUG_WEIRD_TWO_INVK (look for this identifier in tcd.c) */
2745     /* Historic value for 2 / opj_invK */
2746     /* Normally, we should use invK, but if we do so, we have failures in the */
2747     /* conformance test, due to MSE and peak errors significantly higher than */
2748     /* accepted value */
2749     /* Due to using two_invK instead of invK, we have to compensate in tcd.c */
2750     /* the computation of the stepsize for the non LL subbands */
2751     const float two_invK = 1.625732422f;
2752     if (dwt->cas == 0) {
2753         if (!((dwt->dn > 0) || (dwt->sn > 1))) {
2754             return;
2755         }
2756         a = 0;
2757         b = 1;
2758     } else {
2759         if (!((dwt->sn > 0) || (dwt->dn > 1))) {
2760             return;
2761         }
2762         a = 1;
2763         b = 0;
2764     }
2765 #ifdef __SSE__
2766     opj_v8dwt_decode_step1_sse(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1,
2767                                _mm_set1_ps(opj_K));
2768     opj_v8dwt_decode_step1_sse(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1,
2769                                _mm_set1_ps(two_invK));
2770     opj_v8dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1,
2771                                dwt->win_l_x0, dwt->win_l_x1,
2772                                (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
2773                                _mm_set1_ps(-opj_dwt_delta));
2774     opj_v8dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1,
2775                                dwt->win_h_x0, dwt->win_h_x1,
2776                                (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
2777                                _mm_set1_ps(-opj_dwt_gamma));
2778     opj_v8dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1,
2779                                dwt->win_l_x0, dwt->win_l_x1,
2780                                (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
2781                                _mm_set1_ps(-opj_dwt_beta));
2782     opj_v8dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1,
2783                                dwt->win_h_x0, dwt->win_h_x1,
2784                                (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
2785                                _mm_set1_ps(-opj_dwt_alpha));
2786 #else
2787     opj_v8dwt_decode_step1(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1,
2788                            opj_K);
2789     opj_v8dwt_decode_step1(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1,
2790                            two_invK);
2791     opj_v8dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1,
2792                            dwt->win_l_x0, dwt->win_l_x1,
2793                            (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
2794                            -opj_dwt_delta);
2795     opj_v8dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1,
2796                            dwt->win_h_x0, dwt->win_h_x1,
2797                            (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
2798                            -opj_dwt_gamma);
2799     opj_v8dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1,
2800                            dwt->win_l_x0, dwt->win_l_x1,
2801                            (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
2802                            -opj_dwt_beta);
2803     opj_v8dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1,
2804                            dwt->win_h_x0, dwt->win_h_x1,
2805                            (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
2806                            -opj_dwt_alpha);
2807 #endif
2808 }
2809
2810 typedef struct {
2811     opj_v8dwt_t h;
2812     OPJ_UINT32 rw;
2813     OPJ_UINT32 w;
2814     OPJ_FLOAT32 * OPJ_RESTRICT aj;
2815     OPJ_UINT32 nb_rows;
2816 } opj_dwt97_decode_h_job_t;
2817
2818 static void opj_dwt97_decode_h_func(void* user_data, opj_tls_t* tls)
2819 {
2820     OPJ_UINT32 j;
2821     opj_dwt97_decode_h_job_t* job;
2822     OPJ_FLOAT32 * OPJ_RESTRICT aj;
2823     OPJ_UINT32 w;
2824     (void)tls;
2825
2826     job = (opj_dwt97_decode_h_job_t*)user_data;
2827     w = job->w;
2828
2829     assert((job->nb_rows % NB_ELTS_V8) == 0);
2830
2831     aj = job->aj;
2832     for (j = 0; j + NB_ELTS_V8 <= job->nb_rows; j += NB_ELTS_V8) {
2833         OPJ_UINT32 k;
2834         opj_v8dwt_interleave_h(&job->h, aj, job->w, NB_ELTS_V8);
2835         opj_v8dwt_decode(&job->h);
2836
2837         /* To be adapted if NB_ELTS_V8 changes */
2838         for (k = 0; k < job->rw; k++) {
2839             aj[k      ] = job->h.wavelet[k].f[0];
2840             aj[k + (OPJ_SIZE_T)w  ] = job->h.wavelet[k].f[1];
2841             aj[k + (OPJ_SIZE_T)w * 2] = job->h.wavelet[k].f[2];
2842             aj[k + (OPJ_SIZE_T)w * 3] = job->h.wavelet[k].f[3];
2843         }
2844         for (k = 0; k < job->rw; k++) {
2845             aj[k + (OPJ_SIZE_T)w * 4] = job->h.wavelet[k].f[4];
2846             aj[k + (OPJ_SIZE_T)w * 5] = job->h.wavelet[k].f[5];
2847             aj[k + (OPJ_SIZE_T)w * 6] = job->h.wavelet[k].f[6];
2848             aj[k + (OPJ_SIZE_T)w * 7] = job->h.wavelet[k].f[7];
2849         }
2850
2851         aj += w * NB_ELTS_V8;
2852     }
2853
2854     opj_aligned_free(job->h.wavelet);
2855     opj_free(job);
2856 }
2857
2858
2859 typedef struct {
2860     opj_v8dwt_t v;
2861     OPJ_UINT32 rh;
2862     OPJ_UINT32 w;
2863     OPJ_FLOAT32 * OPJ_RESTRICT aj;
2864     OPJ_UINT32 nb_columns;
2865 } opj_dwt97_decode_v_job_t;
2866
2867 static void opj_dwt97_decode_v_func(void* user_data, opj_tls_t* tls)
2868 {
2869     OPJ_UINT32 j;
2870     opj_dwt97_decode_v_job_t* job;
2871     OPJ_FLOAT32 * OPJ_RESTRICT aj;
2872     (void)tls;
2873
2874     job = (opj_dwt97_decode_v_job_t*)user_data;
2875
2876     assert((job->nb_columns % NB_ELTS_V8) == 0);
2877
2878     aj = job->aj;
2879     for (j = 0; j + NB_ELTS_V8 <= job->nb_columns; j += NB_ELTS_V8) {
2880         OPJ_UINT32 k;
2881
2882         opj_v8dwt_interleave_v(&job->v, aj, job->w, NB_ELTS_V8);
2883         opj_v8dwt_decode(&job->v);
2884
2885         for (k = 0; k < job->rh; ++k) {
2886             memcpy(&aj[k * (OPJ_SIZE_T)job->w], &job->v.wavelet[k],
2887                    NB_ELTS_V8 * sizeof(OPJ_FLOAT32));
2888         }
2889         aj += NB_ELTS_V8;
2890     }
2891
2892     opj_aligned_free(job->v.wavelet);
2893     opj_free(job);
2894 }
2895
2896
2897 /* <summary>                             */
2898 /* Inverse 9-7 wavelet transform in 2-D. */
2899 /* </summary>                            */
2900 static
2901 OPJ_BOOL opj_dwt_decode_tile_97(opj_thread_pool_t* tp,
2902                                 opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
2903                                 OPJ_UINT32 numres)
2904 {
2905     opj_v8dwt_t h;
2906     opj_v8dwt_t v;
2907
2908     opj_tcd_resolution_t* res = tilec->resolutions;
2909
2910     OPJ_UINT32 rw = (OPJ_UINT32)(res->x1 -
2911                                  res->x0);    /* width of the resolution level computed */
2912     OPJ_UINT32 rh = (OPJ_UINT32)(res->y1 -
2913                                  res->y0);    /* height of the resolution level computed */
2914
2915     OPJ_UINT32 w = (OPJ_UINT32)(tilec->resolutions[tilec->minimum_num_resolutions -
2916                                                                1].x1 -
2917                                 tilec->resolutions[tilec->minimum_num_resolutions - 1].x0);
2918
2919     OPJ_SIZE_T l_data_size;
2920     const int num_threads = opj_thread_pool_get_thread_count(tp);
2921
2922     if (numres == 1) {
2923         return OPJ_TRUE;
2924     }
2925
2926     l_data_size = opj_dwt_max_resolution(res, numres);
2927     /* overflow check */
2928     if (l_data_size > (SIZE_MAX / sizeof(opj_v8_t))) {
2929         /* FIXME event manager error callback */
2930         return OPJ_FALSE;
2931     }
2932     h.wavelet = (opj_v8_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v8_t));
2933     if (!h.wavelet) {
2934         /* FIXME event manager error callback */
2935         return OPJ_FALSE;
2936     }
2937     v.wavelet = h.wavelet;
2938
2939     while (--numres) {
2940         OPJ_FLOAT32 * OPJ_RESTRICT aj = (OPJ_FLOAT32*) tilec->data;
2941         OPJ_UINT32 j;
2942
2943         h.sn = (OPJ_INT32)rw;
2944         v.sn = (OPJ_INT32)rh;
2945
2946         ++res;
2947
2948         rw = (OPJ_UINT32)(res->x1 -
2949                           res->x0);   /* width of the resolution level computed */
2950         rh = (OPJ_UINT32)(res->y1 -
2951                           res->y0);   /* height of the resolution level computed */
2952
2953         h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
2954         h.cas = res->x0 % 2;
2955
2956         h.win_l_x0 = 0;
2957         h.win_l_x1 = (OPJ_UINT32)h.sn;
2958         h.win_h_x0 = 0;
2959         h.win_h_x1 = (OPJ_UINT32)h.dn;
2960
2961         if (num_threads <= 1 || rh < 2 * NB_ELTS_V8) {
2962             for (j = 0; j + (NB_ELTS_V8 - 1) < rh; j += NB_ELTS_V8) {
2963                 OPJ_UINT32 k;
2964                 opj_v8dwt_interleave_h(&h, aj, w, NB_ELTS_V8);
2965                 opj_v8dwt_decode(&h);
2966
2967                 /* To be adapted if NB_ELTS_V8 changes */
2968                 for (k = 0; k < rw; k++) {
2969                     aj[k      ] = h.wavelet[k].f[0];
2970                     aj[k + (OPJ_SIZE_T)w  ] = h.wavelet[k].f[1];
2971                     aj[k + (OPJ_SIZE_T)w * 2] = h.wavelet[k].f[2];
2972                     aj[k + (OPJ_SIZE_T)w * 3] = h.wavelet[k].f[3];
2973                 }
2974                 for (k = 0; k < rw; k++) {
2975                     aj[k + (OPJ_SIZE_T)w * 4] = h.wavelet[k].f[4];
2976                     aj[k + (OPJ_SIZE_T)w * 5] = h.wavelet[k].f[5];
2977                     aj[k + (OPJ_SIZE_T)w * 6] = h.wavelet[k].f[6];
2978                     aj[k + (OPJ_SIZE_T)w * 7] = h.wavelet[k].f[7];
2979                 }
2980
2981                 aj += w * NB_ELTS_V8;
2982             }
2983         } else {
2984             OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
2985             OPJ_UINT32 step_j;
2986
2987             if ((rh / NB_ELTS_V8) < num_jobs) {
2988                 num_jobs = rh / NB_ELTS_V8;
2989             }
2990             step_j = ((rh / num_jobs) / NB_ELTS_V8) * NB_ELTS_V8;
2991             for (j = 0; j < num_jobs; j++) {
2992                 opj_dwt97_decode_h_job_t* job;
2993
2994                 job = (opj_dwt97_decode_h_job_t*) opj_malloc(sizeof(opj_dwt97_decode_h_job_t));
2995                 if (!job) {
2996                     opj_thread_pool_wait_completion(tp, 0);
2997                     opj_aligned_free(h.wavelet);
2998                     return OPJ_FALSE;
2999                 }
3000                 job->h.wavelet = (opj_v8_t*)opj_aligned_malloc(l_data_size * sizeof(opj_v8_t));
3001                 if (!job->h.wavelet) {
3002                     opj_thread_pool_wait_completion(tp, 0);
3003                     opj_free(job);
3004                     opj_aligned_free(h.wavelet);
3005                     return OPJ_FALSE;
3006                 }
3007                 job->h.dn = h.dn;
3008                 job->h.sn = h.sn;
3009                 job->h.cas = h.cas;
3010                 job->h.win_l_x0 = h.win_l_x0;
3011                 job->h.win_l_x1 = h.win_l_x1;
3012                 job->h.win_h_x0 = h.win_h_x0;
3013                 job->h.win_h_x1 = h.win_h_x1;
3014                 job->rw = rw;
3015                 job->w = w;
3016                 job->aj = aj;
3017                 job->nb_rows = (j + 1 == num_jobs) ? (rh & (OPJ_UINT32)~
3018                                                       (NB_ELTS_V8 - 1)) - j * step_j : step_j;
3019                 aj += w * job->nb_rows;
3020                 opj_thread_pool_submit_job(tp, opj_dwt97_decode_h_func, job);
3021             }
3022             opj_thread_pool_wait_completion(tp, 0);
3023             j = rh & (OPJ_UINT32)~(NB_ELTS_V8 - 1);
3024         }
3025
3026         if (j < rh) {
3027             OPJ_UINT32 k;
3028             opj_v8dwt_interleave_h(&h, aj, w, rh - j);
3029             opj_v8dwt_decode(&h);
3030             for (k = 0; k < rw; k++) {
3031                 OPJ_UINT32 l;
3032                 for (l = 0; l < rh - j; l++) {
3033                     aj[k + (OPJ_SIZE_T)w  * l ] = h.wavelet[k].f[l];
3034                 }
3035             }
3036         }
3037
3038         v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
3039         v.cas = res->y0 % 2;
3040         v.win_l_x0 = 0;
3041         v.win_l_x1 = (OPJ_UINT32)v.sn;
3042         v.win_h_x0 = 0;
3043         v.win_h_x1 = (OPJ_UINT32)v.dn;
3044
3045         aj = (OPJ_FLOAT32*) tilec->data;
3046         if (num_threads <= 1 || rw < 2 * NB_ELTS_V8) {
3047             for (j = rw; j > (NB_ELTS_V8 - 1); j -= NB_ELTS_V8) {
3048                 OPJ_UINT32 k;
3049
3050                 opj_v8dwt_interleave_v(&v, aj, w, NB_ELTS_V8);
3051                 opj_v8dwt_decode(&v);
3052
3053                 for (k = 0; k < rh; ++k) {
3054                     memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k], NB_ELTS_V8 * sizeof(OPJ_FLOAT32));
3055                 }
3056                 aj += NB_ELTS_V8;
3057             }
3058         } else {
3059             /* "bench_dwt -I" shows that scaling is poor, likely due to RAM
3060                 transfer being the limiting factor. So limit the number of
3061                 threads.
3062              */
3063             OPJ_UINT32 num_jobs = opj_uint_max((OPJ_UINT32)num_threads / 2, 2U);
3064             OPJ_UINT32 step_j;
3065
3066             if ((rw / NB_ELTS_V8) < num_jobs) {
3067                 num_jobs = rw / NB_ELTS_V8;
3068             }
3069             step_j = ((rw / num_jobs) / NB_ELTS_V8) * NB_ELTS_V8;
3070             for (j = 0; j < num_jobs; j++) {
3071                 opj_dwt97_decode_v_job_t* job;
3072
3073                 job = (opj_dwt97_decode_v_job_t*) opj_malloc(sizeof(opj_dwt97_decode_v_job_t));
3074                 if (!job) {
3075                     opj_thread_pool_wait_completion(tp, 0);
3076                     opj_aligned_free(h.wavelet);
3077                     return OPJ_FALSE;
3078                 }
3079                 job->v.wavelet = (opj_v8_t*)opj_aligned_malloc(l_data_size * sizeof(opj_v8_t));
3080                 if (!job->v.wavelet) {
3081                     opj_thread_pool_wait_completion(tp, 0);
3082                     opj_free(job);
3083                     opj_aligned_free(h.wavelet);
3084                     return OPJ_FALSE;
3085                 }
3086                 job->v.dn = v.dn;
3087                 job->v.sn = v.sn;
3088                 job->v.cas = v.cas;
3089                 job->v.win_l_x0 = v.win_l_x0;
3090                 job->v.win_l_x1 = v.win_l_x1;
3091                 job->v.win_h_x0 = v.win_h_x0;
3092                 job->v.win_h_x1 = v.win_h_x1;
3093                 job->rh = rh;
3094                 job->w = w;
3095                 job->aj = aj;
3096                 job->nb_columns = (j + 1 == num_jobs) ? (rw & (OPJ_UINT32)~
3097                                   (NB_ELTS_V8 - 1)) - j * step_j : step_j;
3098                 aj += job->nb_columns;
3099                 opj_thread_pool_submit_job(tp, opj_dwt97_decode_v_func, job);
3100             }
3101             opj_thread_pool_wait_completion(tp, 0);
3102         }
3103
3104         if (rw & (NB_ELTS_V8 - 1)) {
3105             OPJ_UINT32 k;
3106
3107             j = rw & (NB_ELTS_V8 - 1);
3108
3109             opj_v8dwt_interleave_v(&v, aj, w, j);
3110             opj_v8dwt_decode(&v);
3111
3112             for (k = 0; k < rh; ++k) {
3113                 memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k],
3114                        (OPJ_SIZE_T)j * sizeof(OPJ_FLOAT32));
3115             }
3116         }
3117     }
3118
3119     opj_aligned_free(h.wavelet);
3120     return OPJ_TRUE;
3121 }
3122
3123 static
3124 OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
3125                                    OPJ_UINT32 numres)
3126 {
3127     opj_sparse_array_int32_t* sa;
3128     opj_v8dwt_t h;
3129     opj_v8dwt_t v;
3130     OPJ_UINT32 resno;
3131     /* This value matches the maximum left/right extension given in tables */
3132     /* F.2 and F.3 of the standard. Note: in opj_tcd_is_subband_area_of_interest() */
3133     /* we currently use 3. */
3134     const OPJ_UINT32 filter_width = 4U;
3135
3136     opj_tcd_resolution_t* tr = tilec->resolutions;
3137     opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]);
3138
3139     OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 -
3140                                  tr->x0);    /* width of the resolution level computed */
3141     OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 -
3142                                  tr->y0);    /* height of the resolution level computed */
3143
3144     OPJ_SIZE_T l_data_size;
3145
3146     /* Compute the intersection of the area of interest, expressed in tile coordinates */
3147     /* with the tile coordinates */
3148     OPJ_UINT32 win_tcx0 = tilec->win_x0;
3149     OPJ_UINT32 win_tcy0 = tilec->win_y0;
3150     OPJ_UINT32 win_tcx1 = tilec->win_x1;
3151     OPJ_UINT32 win_tcy1 = tilec->win_y1;
3152
3153     if (tr_max->x0 == tr_max->x1 || tr_max->y0 == tr_max->y1) {
3154         return OPJ_TRUE;
3155     }
3156
3157     sa = opj_dwt_init_sparse_array(tilec, numres);
3158     if (sa == NULL) {
3159         return OPJ_FALSE;
3160     }
3161
3162     if (numres == 1U) {
3163         OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
3164                        tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
3165                        tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
3166                        tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
3167                        tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
3168                        tilec->data_win,
3169                        1, tr_max->win_x1 - tr_max->win_x0,
3170                        OPJ_TRUE);
3171         assert(ret);
3172         OPJ_UNUSED(ret);
3173         opj_sparse_array_int32_free(sa);
3174         return OPJ_TRUE;
3175     }
3176
3177     l_data_size = opj_dwt_max_resolution(tr, numres);
3178     /* overflow check */
3179     if (l_data_size > (SIZE_MAX / sizeof(opj_v8_t))) {
3180         /* FIXME event manager error callback */
3181         opj_sparse_array_int32_free(sa);
3182         return OPJ_FALSE;
3183     }
3184     h.wavelet = (opj_v8_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v8_t));
3185     if (!h.wavelet) {
3186         /* FIXME event manager error callback */
3187         opj_sparse_array_int32_free(sa);
3188         return OPJ_FALSE;
3189     }
3190     v.wavelet = h.wavelet;
3191
3192     for (resno = 1; resno < numres; resno ++) {
3193         OPJ_UINT32 j;
3194         /* Window of interest subband-based coordinates */
3195         OPJ_UINT32 win_ll_x0, win_ll_y0, win_ll_x1, win_ll_y1;
3196         OPJ_UINT32 win_hl_x0, win_hl_x1;
3197         OPJ_UINT32 win_lh_y0, win_lh_y1;
3198         /* Window of interest tile-resolution-based coordinates */
3199         OPJ_UINT32 win_tr_x0, win_tr_x1, win_tr_y0, win_tr_y1;
3200         /* Tile-resolution subband-based coordinates */
3201         OPJ_UINT32 tr_ll_x0, tr_ll_y0, tr_hl_x0, tr_lh_y0;
3202
3203         ++tr;
3204
3205         h.sn = (OPJ_INT32)rw;
3206         v.sn = (OPJ_INT32)rh;
3207
3208         rw = (OPJ_UINT32)(tr->x1 - tr->x0);
3209         rh = (OPJ_UINT32)(tr->y1 - tr->y0);
3210
3211         h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
3212         h.cas = tr->x0 % 2;
3213
3214         v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
3215         v.cas = tr->y0 % 2;
3216
3217         /* Get the subband coordinates for the window of interest */
3218         /* LL band */
3219         opj_dwt_get_band_coordinates(tilec, resno, 0,
3220                                      win_tcx0, win_tcy0, win_tcx1, win_tcy1,
3221                                      &win_ll_x0, &win_ll_y0,
3222                                      &win_ll_x1, &win_ll_y1);
3223
3224         /* HL band */
3225         opj_dwt_get_band_coordinates(tilec, resno, 1,
3226                                      win_tcx0, win_tcy0, win_tcx1, win_tcy1,
3227                                      &win_hl_x0, NULL, &win_hl_x1, NULL);
3228
3229         /* LH band */
3230         opj_dwt_get_band_coordinates(tilec, resno, 2,
3231                                      win_tcx0, win_tcy0, win_tcx1, win_tcy1,
3232                                      NULL, &win_lh_y0, NULL, &win_lh_y1);
3233
3234         /* Beware: band index for non-LL0 resolution are 0=HL, 1=LH and 2=HH */
3235         tr_ll_x0 = (OPJ_UINT32)tr->bands[1].x0;
3236         tr_ll_y0 = (OPJ_UINT32)tr->bands[0].y0;
3237         tr_hl_x0 = (OPJ_UINT32)tr->bands[0].x0;
3238         tr_lh_y0 = (OPJ_UINT32)tr->bands[1].y0;
3239
3240         /* Subtract the origin of the bands for this tile, to the subwindow */
3241         /* of interest band coordinates, so as to get them relative to the */
3242         /* tile */
3243         win_ll_x0 = opj_uint_subs(win_ll_x0, tr_ll_x0);
3244         win_ll_y0 = opj_uint_subs(win_ll_y0, tr_ll_y0);
3245         win_ll_x1 = opj_uint_subs(win_ll_x1, tr_ll_x0);
3246         win_ll_y1 = opj_uint_subs(win_ll_y1, tr_ll_y0);
3247         win_hl_x0 = opj_uint_subs(win_hl_x0, tr_hl_x0);
3248         win_hl_x1 = opj_uint_subs(win_hl_x1, tr_hl_x0);
3249         win_lh_y0 = opj_uint_subs(win_lh_y0, tr_lh_y0);
3250         win_lh_y1 = opj_uint_subs(win_lh_y1, tr_lh_y0);
3251
3252         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.sn, &win_ll_x0, &win_ll_x1);
3253         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.dn, &win_hl_x0, &win_hl_x1);
3254
3255         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.sn, &win_ll_y0, &win_ll_y1);
3256         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.dn, &win_lh_y0, &win_lh_y1);
3257
3258         /* Compute the tile-resolution-based coordinates for the window of interest */
3259         if (h.cas == 0) {
3260             win_tr_x0 = opj_uint_min(2 * win_ll_x0, 2 * win_hl_x0 + 1);
3261             win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_ll_x1, 2 * win_hl_x1 + 1), rw);
3262         } else {
3263             win_tr_x0 = opj_uint_min(2 * win_hl_x0, 2 * win_ll_x0 + 1);
3264             win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_hl_x1, 2 * win_ll_x1 + 1), rw);
3265         }
3266
3267         if (v.cas == 0) {
3268             win_tr_y0 = opj_uint_min(2 * win_ll_y0, 2 * win_lh_y0 + 1);
3269             win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_ll_y1, 2 * win_lh_y1 + 1), rh);
3270         } else {
3271             win_tr_y0 = opj_uint_min(2 * win_lh_y0, 2 * win_ll_y0 + 1);
3272             win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_lh_y1, 2 * win_ll_y1 + 1), rh);
3273         }
3274
3275         h.win_l_x0 = win_ll_x0;
3276         h.win_l_x1 = win_ll_x1;
3277         h.win_h_x0 = win_hl_x0;
3278         h.win_h_x1 = win_hl_x1;
3279         for (j = 0; j + (NB_ELTS_V8 - 1) < rh; j += NB_ELTS_V8) {
3280             if ((j + (NB_ELTS_V8 - 1) >= win_ll_y0 && j < win_ll_y1) ||
3281                     (j + (NB_ELTS_V8 - 1) >= win_lh_y0 + (OPJ_UINT32)v.sn &&
3282                      j < win_lh_y1 + (OPJ_UINT32)v.sn)) {
3283                 opj_v8dwt_interleave_partial_h(&h, sa, j, opj_uint_min(NB_ELTS_V8, rh - j));
3284                 opj_v8dwt_decode(&h);
3285                 if (!opj_sparse_array_int32_write(sa,
3286                                                   win_tr_x0, j,
3287                                                   win_tr_x1, j + NB_ELTS_V8,
3288                                                   (OPJ_INT32*)&h.wavelet[win_tr_x0].f[0],
3289                                                   NB_ELTS_V8, 1, OPJ_TRUE)) {
3290                     /* FIXME event manager error callback */
3291                     opj_sparse_array_int32_free(sa);
3292                     opj_aligned_free(h.wavelet);
3293                     return OPJ_FALSE;
3294                 }
3295             }
3296         }
3297
3298         if (j < rh &&
3299                 ((j + (NB_ELTS_V8 - 1) >= win_ll_y0 && j < win_ll_y1) ||
3300                  (j + (NB_ELTS_V8 - 1) >= win_lh_y0 + (OPJ_UINT32)v.sn &&
3301                   j < win_lh_y1 + (OPJ_UINT32)v.sn))) {
3302             opj_v8dwt_interleave_partial_h(&h, sa, j, rh - j);
3303             opj_v8dwt_decode(&h);
3304             if (!opj_sparse_array_int32_write(sa,
3305                                               win_tr_x0, j,
3306                                               win_tr_x1, rh,
3307                                               (OPJ_INT32*)&h.wavelet[win_tr_x0].f[0],
3308                                               NB_ELTS_V8, 1, OPJ_TRUE)) {
3309                 /* FIXME event manager error callback */
3310                 opj_sparse_array_int32_free(sa);
3311                 opj_aligned_free(h.wavelet);
3312                 return OPJ_FALSE;
3313             }
3314         }
3315
3316         v.win_l_x0 = win_ll_y0;
3317         v.win_l_x1 = win_ll_y1;
3318         v.win_h_x0 = win_lh_y0;
3319         v.win_h_x1 = win_lh_y1;
3320         for (j = win_tr_x0; j < win_tr_x1; j += NB_ELTS_V8) {
3321             OPJ_UINT32 nb_elts = opj_uint_min(NB_ELTS_V8, win_tr_x1 - j);
3322
3323             opj_v8dwt_interleave_partial_v(&v, sa, j, nb_elts);
3324             opj_v8dwt_decode(&v);
3325
3326             if (!opj_sparse_array_int32_write(sa,
3327                                               j, win_tr_y0,
3328                                               j + nb_elts, win_tr_y1,
3329                                               (OPJ_INT32*)&h.wavelet[win_tr_y0].f[0],
3330                                               1, NB_ELTS_V8, OPJ_TRUE)) {
3331                 /* FIXME event manager error callback */
3332                 opj_sparse_array_int32_free(sa);
3333                 opj_aligned_free(h.wavelet);
3334                 return OPJ_FALSE;
3335             }
3336         }
3337     }
3338
3339     {
3340         OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
3341                        tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
3342                        tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
3343                        tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
3344                        tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
3345                        tilec->data_win,
3346                        1, tr_max->win_x1 - tr_max->win_x0,
3347                        OPJ_TRUE);
3348         assert(ret);
3349         OPJ_UNUSED(ret);
3350     }
3351     opj_sparse_array_int32_free(sa);
3352
3353     opj_aligned_free(h.wavelet);
3354     return OPJ_TRUE;
3355 }
3356
3357
3358 OPJ_BOOL opj_dwt_decode_real(opj_tcd_t *p_tcd,
3359                              opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
3360                              OPJ_UINT32 numres)
3361 {
3362     if (p_tcd->whole_tile_decoding) {
3363         return opj_dwt_decode_tile_97(p_tcd->thread_pool, tilec, numres);
3364     } else {
3365         return opj_dwt_decode_partial_97(tilec, numres);
3366     }
3367 }