src/lib/openjp2/dwt.c

   1 /*
   2  * The copyright in this software is being made available under the 2-clauses
   3  * BSD License, included below. This software may be subject to other third
   4  * party and contributor rights, including patent rights, and no such rights
   5  * are granted under this license.
   6  *
   7  * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium
   8  * Copyright (c) 2002-2014, Professor Benoit Macq
   9  * Copyright (c) 2001-2003, David Janssens
  10  * Copyright (c) 2002-2003, Yannick Verschueren
  11  * Copyright (c) 2003-2007, Francois-Olivier Devaux
  12  * Copyright (c) 2003-2014, Antonin Descampe
  13  * Copyright (c) 2005, Herve Drolon, FreeImage Team
  14  * Copyright (c) 2007, Jonathan Ballard <dzonatas@dzonux.net>
  15  * Copyright (c) 2007, Callum Lerwick <seg@haxxed.com>
  16  * Copyright (c) 2017, IntoPIX SA <support@intopix.com>
  17  * All rights reserved.
  18  *
  19  * Redistribution and use in source and binary forms, with or without
  20  * modification, are permitted provided that the following conditions
  21  * are met:
  22  * 1. Redistributions of source code must retain the above copyright
  23  *    notice, this list of conditions and the following disclaimer.
  24  * 2. Redistributions in binary form must reproduce the above copyright
  25  *    notice, this list of conditions and the following disclaimer in the
  26  *    documentation and/or other materials provided with the distribution.
  27  *
  28  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS'
  29  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  32  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  33  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  34  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  35  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  36  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  37  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38  * POSSIBILITY OF SUCH DAMAGE.
  39  */
  40
  41 #include <assert.h>
  42
  43 #define OPJ_SKIP_POISON
  44 #include "opj_includes.h"
  45
  46 #ifdef __SSE__
  47 #include <xmmintrin.h>
  48 #endif
  49 #ifdef __SSE2__
  50 #include <emmintrin.h>
  51 #endif
  52 #ifdef __SSSE3__
  53 #include <tmmintrin.h>
  54 #endif
  55 #ifdef __AVX2__
  56 #include <immintrin.h>
  57 #endif
  58
  59 #if defined(__GNUC__)
  60 #pragma GCC poison malloc calloc realloc free
  61 #endif
  62
  63 /** @defgroup DWT DWT - Implementation of a discrete wavelet transform */
  64 /*@{*/
  65
  66 #define OPJ_WS(i) v->mem[(i)*2]
  67 #define OPJ_WD(i) v->mem[(1+(i)*2)]
  68
  69 #ifdef __AVX2__
  70 /** Number of int32 values in a AVX2 register */
  71 #define VREG_INT_COUNT       8
  72 #else
  73 /** Number of int32 values in a SSE2 register */
  74 #define VREG_INT_COUNT       4
  75 #endif
  76
  77 /** Number of columns that we can process in parallel in the vertical pass */
  78 #define PARALLEL_COLS_53     (2*VREG_INT_COUNT)
  79
  80 /** @name Local data structures */
  81 /*@{*/
  82
  83 typedef struct dwt_local {
  84     OPJ_INT32* mem;
  85     OPJ_INT32 dn;   /* number of elements in high pass band */
  86     OPJ_INT32 sn;   /* number of elements in low pass band */
  87     OPJ_INT32 cas;  /* 0 = start on even coord, 1 = start on odd coord */
  88 } opj_dwt_t;
  89
  90 #define NB_ELTS_V8  8
  91
  92 typedef union {
  93     OPJ_FLOAT32 f[NB_ELTS_V8];
  94 } opj_v8_t;
  95
  96 typedef struct v8dwt_local {
  97     opj_v8_t*   wavelet ;
  98     OPJ_INT32       dn ;  /* number of elements in high pass band */
  99     OPJ_INT32       sn ;  /* number of elements in low pass band */
 100     OPJ_INT32       cas ; /* 0 = start on even coord, 1 = start on odd coord */
 101     OPJ_UINT32      win_l_x0; /* start coord in low pass band */
 102     OPJ_UINT32      win_l_x1; /* end coord in low pass band */
 103     OPJ_UINT32      win_h_x0; /* start coord in high pass band */
 104     OPJ_UINT32      win_h_x1; /* end coord in high pass band */
 105 } opj_v8dwt_t ;
 106
 107 /* From table F.4 from the standard */
 108 static const OPJ_FLOAT32 opj_dwt_alpha =  -1.586134342f;
 109 static const OPJ_FLOAT32 opj_dwt_beta  =  -0.052980118f;
 110 static const OPJ_FLOAT32 opj_dwt_gamma = 0.882911075f;
 111 static const OPJ_FLOAT32 opj_dwt_delta = 0.443506852f;
 112
 113 static const OPJ_FLOAT32 opj_K      = 1.230174105f;
 114 static const OPJ_FLOAT32 opj_invK   = (OPJ_FLOAT32)(1.0 / 1.230174105);
 115
 116 /*@}*/
 117
 118 /** @name Local static functions */
 119 /*@{*/
 120
 121 /**
 122 Forward lazy transform (horizontal)
 123 */
 124 static void opj_dwt_deinterleave_h(const OPJ_INT32 * OPJ_RESTRICT a,
 125                                    OPJ_INT32 * OPJ_RESTRICT b,
 126                                    OPJ_INT32 dn,
 127                                    OPJ_INT32 sn, OPJ_INT32 cas);
 128
 129 /**
 130 Forward 9-7 wavelet transform in 1-D
 131 */
 132 static void opj_dwt_encode_1_real(void *a, OPJ_INT32 dn, OPJ_INT32 sn,
 133                                   OPJ_INT32 cas);
 134 /**
 135 Explicit calculation of the Quantization Stepsizes
 136 */
 137 static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps,
 138                                     opj_stepsize_t *bandno_stepsize);
 139 /**
 140 Inverse wavelet transform in 2-D.
 141 */
 142 static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp,
 143                                     opj_tcd_tilecomp_t* tilec, OPJ_UINT32 i);
 144
 145 static OPJ_BOOL opj_dwt_decode_partial_tile(
 146     opj_tcd_tilecomp_t* tilec,
 147     OPJ_UINT32 numres);
 148
 149 /* Forward transform, for the vertical pass, processing cols columns */
 150 /* where cols <= NB_ELTS_V8 */
 151 /* Where void* is a OPJ_INT32* for 5x3 and OPJ_FLOAT32* for 9x7 */
 152 typedef void (*opj_encode_and_deinterleave_v_fnptr_type)(
 153     void *array,
 154     void *tmp,
 155     OPJ_UINT32 height,
 156     OPJ_BOOL even,
 157     OPJ_UINT32 stride_width,
 158     OPJ_UINT32 cols);
 159
 160 /* Where void* is a OPJ_INT32* for 5x3 and OPJ_FLOAT32* for 9x7 */
 161 typedef void (*opj_encode_and_deinterleave_h_one_row_fnptr_type)(
 162     void *row,
 163     void *tmp,
 164     OPJ_UINT32 width,
 165     OPJ_BOOL even);
 166
 167 static OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp,
 168         opj_tcd_tilecomp_t * tilec,
 169         opj_encode_and_deinterleave_v_fnptr_type p_encode_and_deinterleave_v,
 170         opj_encode_and_deinterleave_h_one_row_fnptr_type
 171         p_encode_and_deinterleave_h_one_row);
 172
 173 static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r,
 174         OPJ_UINT32 i);
 175
 176 /* <summary>                             */
 177 /* Inverse 9-7 wavelet transform in 1-D. */
 178 /* </summary>                            */
 179
 180 /*@}*/
 181
 182 /*@}*/
 183
 184 #define OPJ_S(i) a[(i)*2]
 185 #define OPJ_D(i) a[(1+(i)*2)]
 186 #define OPJ_S_(i) ((i)<0?OPJ_S(0):((i)>=sn?OPJ_S(sn-1):OPJ_S(i)))
 187 #define OPJ_D_(i) ((i)<0?OPJ_D(0):((i)>=dn?OPJ_D(dn-1):OPJ_D(i)))
 188 /* new */
 189 #define OPJ_SS_(i) ((i)<0?OPJ_S(0):((i)>=dn?OPJ_S(dn-1):OPJ_S(i)))
 190 #define OPJ_DD_(i) ((i)<0?OPJ_D(0):((i)>=sn?OPJ_D(sn-1):OPJ_D(i)))
 191
 192 /* <summary>                                                              */
 193 /* This table contains the norms of the 5-3 wavelets for different bands. */
 194 /* </summary>                                                             */
 195 /* FIXME! the array should really be extended up to 33 resolution levels */
 196 /* See https://github.com/uclouvain/openjpeg/issues/493 */
 197 static const OPJ_FLOAT64 opj_dwt_norms[4][10] = {
 198     {1.000, 1.500, 2.750, 5.375, 10.68, 21.34, 42.67, 85.33, 170.7, 341.3},
 199     {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9},
 200     {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9},
 201     {.7186, .9218, 1.586, 3.043, 6.019, 12.01, 24.00, 47.97, 95.93}
 202 };
 203
 204 /* <summary>                                                              */
 205 /* This table contains the norms of the 9-7 wavelets for different bands. */
 206 /* </summary>                                                             */
 207 /* FIXME! the array should really be extended up to 33 resolution levels */
 208 /* See https://github.com/uclouvain/openjpeg/issues/493 */
 209 static const OPJ_FLOAT64 opj_dwt_norms_real[4][10] = {
 210     {1.000, 1.965, 4.177, 8.403, 16.90, 33.84, 67.69, 135.3, 270.6, 540.9},
 211     {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0},
 212     {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0},
 213     {2.080, 3.865, 8.307, 17.18, 34.71, 69.59, 139.3, 278.6, 557.2}
 214 };
 215
 216 /*
 217 ==========================================================
 218    local functions
 219 ==========================================================
 220 */
 221
 222 /* <summary>                             */
 223 /* Forward lazy transform (horizontal).  */
 224 /* </summary>                            */
 225 static void opj_dwt_deinterleave_h(const OPJ_INT32 * OPJ_RESTRICT a,
 226                                    OPJ_INT32 * OPJ_RESTRICT b,
 227                                    OPJ_INT32 dn,
 228                                    OPJ_INT32 sn, OPJ_INT32 cas)
 229 {
 230     OPJ_INT32 i;
 231     OPJ_INT32 * OPJ_RESTRICT l_dest = b;
 232     const OPJ_INT32 * OPJ_RESTRICT l_src = a + cas;
 233
 234     for (i = 0; i < sn; ++i) {
 235         *l_dest++ = *l_src;
 236         l_src += 2;
 237     }
 238
 239     l_dest = b + sn;
 240     l_src = a + 1 - cas;
 241
 242     for (i = 0; i < dn; ++i)  {
 243         *l_dest++ = *l_src;
 244         l_src += 2;
 245     }
 246 }
 247
 248 #ifdef STANDARD_SLOW_VERSION
 249 /* <summary>                             */
 250 /* Inverse lazy transform (horizontal).  */
 251 /* </summary>                            */
 252 static void opj_dwt_interleave_h(const opj_dwt_t* h, OPJ_INT32 *a)
 253 {
 254     const OPJ_INT32 *ai = a;
 255     OPJ_INT32 *bi = h->mem + h->cas;
 256     OPJ_INT32  i    = h->sn;
 257     while (i--) {
 258         *bi = *(ai++);
 259         bi += 2;
 260     }
 261     ai  = a + h->sn;
 262     bi  = h->mem + 1 - h->cas;
 263     i   = h->dn ;
 264     while (i--) {
 265         *bi = *(ai++);
 266         bi += 2;
 267     }
 268 }
 269
 270 /* <summary>                             */
 271 /* Inverse lazy transform (vertical).    */
 272 /* </summary>                            */
 273 static void opj_dwt_interleave_v(const opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x)
 274 {
 275     const OPJ_INT32 *ai = a;
 276     OPJ_INT32 *bi = v->mem + v->cas;
 277     OPJ_INT32  i = v->sn;
 278     while (i--) {
 279         *bi = *ai;
 280         bi += 2;
 281         ai += x;
 282     }
 283     ai = a + (v->sn * (OPJ_SIZE_T)x);
 284     bi = v->mem + 1 - v->cas;
 285     i = v->dn ;
 286     while (i--) {
 287         *bi = *ai;
 288         bi += 2;
 289         ai += x;
 290     }
 291 }
 292
 293 #endif /* STANDARD_SLOW_VERSION */
 294
 295 #ifdef STANDARD_SLOW_VERSION
 296 /* <summary>                            */
 297 /* Inverse 5-3 wavelet transform in 1-D. */
 298 /* </summary>                           */
 299 static void opj_dwt_decode_1_(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
 300                               OPJ_INT32 cas)
 301 {
 302     OPJ_INT32 i;
 303
 304     if (!cas) {
 305         if ((dn > 0) || (sn > 1)) { /* NEW :  CASE ONE ELEMENT */
 306             for (i = 0; i < sn; i++) {
 307                 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
 308             }
 309             for (i = 0; i < dn; i++) {
 310                 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
 311             }
 312         }
 313     } else {
 314         if (!sn  && dn == 1) {        /* NEW :  CASE ONE ELEMENT */
 315             OPJ_S(0) /= 2;
 316         } else {
 317             for (i = 0; i < sn; i++) {
 318                 OPJ_D(i) -= (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2;
 319             }
 320             for (i = 0; i < dn; i++) {
 321                 OPJ_S(i) += (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1;
 322             }
 323         }
 324     }
 325 }
 326
 327 static void opj_dwt_decode_1(const opj_dwt_t *v)
 328 {
 329     opj_dwt_decode_1_(v->mem, v->dn, v->sn, v->cas);
 330 }
 331
 332 #endif /* STANDARD_SLOW_VERSION */
 333
 334 #if !defined(STANDARD_SLOW_VERSION)
 335 static void  opj_idwt53_h_cas0(OPJ_INT32* tmp,
 336                                const OPJ_INT32 sn,
 337                                const OPJ_INT32 len,
 338                                OPJ_INT32* tiledp)
 339 {
 340     OPJ_INT32 i, j;
 341     const OPJ_INT32* in_even = &tiledp[0];
 342     const OPJ_INT32* in_odd = &tiledp[sn];
 343
 344 #ifdef TWO_PASS_VERSION
 345     /* For documentation purpose: performs lifting in two iterations, */
 346     /* but without explicit interleaving */
 347
 348     assert(len > 1);
 349
 350     /* Even */
 351     tmp[0] = in_even[0] - ((in_odd[0] + 1) >> 1);
 352     for (i = 2, j = 0; i <= len - 2; i += 2, j++) {
 353         tmp[i] = in_even[j + 1] - ((in_odd[j] + in_odd[j + 1] + 2) >> 2);
 354     }
 355     if (len & 1) { /* if len is odd */
 356         tmp[len - 1] = in_even[(len - 1) / 2] - ((in_odd[(len - 2) / 2] + 1) >> 1);
 357     }
 358
 359     /* Odd */
 360     for (i = 1, j = 0; i < len - 1; i += 2, j++) {
 361         tmp[i] = in_odd[j] + ((tmp[i - 1] + tmp[i + 1]) >> 1);
 362     }
 363     if (!(len & 1)) { /* if len is even */
 364         tmp[len - 1] = in_odd[(len - 1) / 2] + tmp[len - 2];
 365     }
 366 #else
 367     OPJ_INT32 d1c, d1n, s1n, s0c, s0n;
 368
 369     assert(len > 1);
 370
 371     /* Improved version of the TWO_PASS_VERSION: */
 372     /* Performs lifting in one single iteration. Saves memory */
 373     /* accesses and explicit interleaving. */
 374     s1n = in_even[0];
 375     d1n = in_odd[0];
 376     s0n = s1n - ((d1n + 1) >> 1);
 377
 378     for (i = 0, j = 1; i < (len - 3); i += 2, j++) {
 379         d1c = d1n;
 380         s0c = s0n;
 381
 382         s1n = in_even[j];
 383         d1n = in_odd[j];
 384
 385         s0n = s1n - ((d1c + d1n + 2) >> 2);
 386
 387         tmp[i  ] = s0c;
 388         tmp[i + 1] = d1c + ((s0c + s0n) >> 1);
 389     }
 390
 391     tmp[i] = s0n;
 392
 393     if (len & 1) {
 394         tmp[len - 1] = in_even[(len - 1) / 2] - ((d1n + 1) >> 1);
 395         tmp[len - 2] = d1n + ((s0n + tmp[len - 1]) >> 1);
 396     } else {
 397         tmp[len - 1] = d1n + s0n;
 398     }
 399 #endif
 400     memcpy(tiledp, tmp, (OPJ_UINT32)len * sizeof(OPJ_INT32));
 401 }
 402
 403 static void  opj_idwt53_h_cas1(OPJ_INT32* tmp,
 404                                const OPJ_INT32 sn,
 405                                const OPJ_INT32 len,
 406                                OPJ_INT32* tiledp)
 407 {
 408     OPJ_INT32 i, j;
 409     const OPJ_INT32* in_even = &tiledp[sn];
 410     const OPJ_INT32* in_odd = &tiledp[0];
 411
 412 #ifdef TWO_PASS_VERSION
 413     /* For documentation purpose: performs lifting in two iterations, */
 414     /* but without explicit interleaving */
 415
 416     assert(len > 2);
 417
 418     /* Odd */
 419     for (i = 1, j = 0; i < len - 1; i += 2, j++) {
 420         tmp[i] = in_odd[j] - ((in_even[j] + in_even[j + 1] + 2) >> 2);
 421     }
 422     if (!(len & 1)) {
 423         tmp[len - 1] = in_odd[len / 2 - 1] - ((in_even[len / 2 - 1] + 1) >> 1);
 424     }
 425
 426     /* Even */
 427     tmp[0] = in_even[0] + tmp[1];
 428     for (i = 2, j = 1; i < len - 1; i += 2, j++) {
 429         tmp[i] = in_even[j] + ((tmp[i + 1] + tmp[i - 1]) >> 1);
 430     }
 431     if (len & 1) {
 432         tmp[len - 1] = in_even[len / 2] + tmp[len - 2];
 433     }
 434 #else
 435     OPJ_INT32 s1, s2, dc, dn;
 436
 437     assert(len > 2);
 438
 439     /* Improved version of the TWO_PASS_VERSION: */
 440     /* Performs lifting in one single iteration. Saves memory */
 441     /* accesses and explicit interleaving. */
 442
 443     s1 = in_even[1];
 444     dc = in_odd[0] - ((in_even[0] + s1 + 2) >> 2);
 445     tmp[0] = in_even[0] + dc;
 446
 447     for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
 448
 449         s2 = in_even[j + 1];
 450
 451         dn = in_odd[j] - ((s1 + s2 + 2) >> 2);
 452         tmp[i  ] = dc;
 453         tmp[i + 1] = s1 + ((dn + dc) >> 1);
 454
 455         dc = dn;
 456         s1 = s2;
 457     }
 458
 459     tmp[i] = dc;
 460
 461     if (!(len & 1)) {
 462         dn = in_odd[len / 2 - 1] - ((s1 + 1) >> 1);
 463         tmp[len - 2] = s1 + ((dn + dc) >> 1);
 464         tmp[len - 1] = dn;
 465     } else {
 466         tmp[len - 1] = s1 + dc;
 467     }
 468 #endif
 469     memcpy(tiledp, tmp, (OPJ_UINT32)len * sizeof(OPJ_INT32));
 470 }
 471
 472
 473 #endif /* !defined(STANDARD_SLOW_VERSION) */
 474
 475 /* <summary>                            */
 476 /* Inverse 5-3 wavelet transform in 1-D for one row. */
 477 /* </summary>                           */
 478 /* Performs interleave, inverse wavelet transform and copy back to buffer */
 479 static void opj_idwt53_h(const opj_dwt_t *dwt,
 480                          OPJ_INT32* tiledp)
 481 {
 482 #ifdef STANDARD_SLOW_VERSION
 483     /* For documentation purpose */
 484     opj_dwt_interleave_h(dwt, tiledp);
 485     opj_dwt_decode_1(dwt);
 486     memcpy(tiledp, dwt->mem, (OPJ_UINT32)(dwt->sn + dwt->dn) * sizeof(OPJ_INT32));
 487 #else
 488     const OPJ_INT32 sn = dwt->sn;
 489     const OPJ_INT32 len = sn + dwt->dn;
 490     if (dwt->cas == 0) { /* Left-most sample is on even coordinate */
 491         if (len > 1) {
 492             opj_idwt53_h_cas0(dwt->mem, sn, len, tiledp);
 493         } else {
 494             /* Unmodified value */
 495         }
 496     } else { /* Left-most sample is on odd coordinate */
 497         if (len == 1) {
 498             tiledp[0] /= 2;
 499         } else if (len == 2) {
 500             OPJ_INT32* out = dwt->mem;
 501             const OPJ_INT32* in_even = &tiledp[sn];
 502             const OPJ_INT32* in_odd = &tiledp[0];
 503             out[1] = in_odd[0] - ((in_even[0] + 1) >> 1);
 504             out[0] = in_even[0] + out[1];
 505             memcpy(tiledp, dwt->mem, (OPJ_UINT32)len * sizeof(OPJ_INT32));
 506         } else if (len > 2) {
 507             opj_idwt53_h_cas1(dwt->mem, sn, len, tiledp);
 508         }
 509     }
 510 #endif
 511 }
 512
 513 #if (defined(__SSE2__) || defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION)
 514
 515 /* Conveniency macros to improve the readabilty of the formulas */
 516 #if __AVX2__
 517 #define VREG        __m256i
 518 #define LOAD_CST(x) _mm256_set1_epi32(x)
 519 #define LOAD(x)     _mm256_load_si256((const VREG*)(x))
 520 #define LOADU(x)    _mm256_loadu_si256((const VREG*)(x))
 521 #define STORE(x,y)  _mm256_store_si256((VREG*)(x),(y))
 522 #define STOREU(x,y) _mm256_storeu_si256((VREG*)(x),(y))
 523 #define ADD(x,y)    _mm256_add_epi32((x),(y))
 524 #define SUB(x,y)    _mm256_sub_epi32((x),(y))
 525 #define SAR(x,y)    _mm256_srai_epi32((x),(y))
 526 #else
 527 #define VREG        __m128i
 528 #define LOAD_CST(x) _mm_set1_epi32(x)
 529 #define LOAD(x)     _mm_load_si128((const VREG*)(x))
 530 #define LOADU(x)    _mm_loadu_si128((const VREG*)(x))
 531 #define STORE(x,y)  _mm_store_si128((VREG*)(x),(y))
 532 #define STOREU(x,y) _mm_storeu_si128((VREG*)(x),(y))
 533 #define ADD(x,y)    _mm_add_epi32((x),(y))
 534 #define SUB(x,y)    _mm_sub_epi32((x),(y))
 535 #define SAR(x,y)    _mm_srai_epi32((x),(y))
 536 #endif
 537 #define ADD3(x,y,z) ADD(ADD(x,y),z)
 538
 539 static
 540 void opj_idwt53_v_final_memcpy(OPJ_INT32* tiledp_col,
 541                                const OPJ_INT32* tmp,
 542                                OPJ_INT32 len,
 543                                OPJ_SIZE_T stride)
 544 {
 545     OPJ_INT32 i;
 546     for (i = 0; i < len; ++i) {
 547         /* A memcpy(&tiledp_col[i * stride + 0],
 548                     &tmp[PARALLEL_COLS_53 * i + 0],
 549                     PARALLEL_COLS_53 * sizeof(OPJ_INT32))
 550            would do but would be a tiny bit slower.
 551            We can take here advantage of our knowledge of alignment */
 552         STOREU(&tiledp_col[(OPJ_SIZE_T)i * stride + 0],
 553                LOAD(&tmp[PARALLEL_COLS_53 * i + 0]));
 554         STOREU(&tiledp_col[(OPJ_SIZE_T)i * stride + VREG_INT_COUNT],
 555                LOAD(&tmp[PARALLEL_COLS_53 * i + VREG_INT_COUNT]));
 556     }
 557 }
 558
 559 /** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
 560  * 16 in AVX2, when top-most pixel is on even coordinate */
 561 static void opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(
 562     OPJ_INT32* tmp,
 563     const OPJ_INT32 sn,
 564     const OPJ_INT32 len,
 565     OPJ_INT32* tiledp_col,
 566     const OPJ_SIZE_T stride)
 567 {
 568     const OPJ_INT32* in_even = &tiledp_col[0];
 569     const OPJ_INT32* in_odd = &tiledp_col[(OPJ_SIZE_T)sn * stride];
 570
 571     OPJ_INT32 i;
 572     OPJ_SIZE_T j;
 573     VREG d1c_0, d1n_0, s1n_0, s0c_0, s0n_0;
 574     VREG d1c_1, d1n_1, s1n_1, s0c_1, s0n_1;
 575     const VREG two = LOAD_CST(2);
 576
 577     assert(len > 1);
 578 #if __AVX2__
 579     assert(PARALLEL_COLS_53 == 16);
 580     assert(VREG_INT_COUNT == 8);
 581 #else
 582     assert(PARALLEL_COLS_53 == 8);
 583     assert(VREG_INT_COUNT == 4);
 584 #endif
 585
 586     /* Note: loads of input even/odd values must be done in a unaligned */
 587     /* fashion. But stores in tmp can be done with aligned store, since */
 588     /* the temporary buffer is properly aligned */
 589     assert((OPJ_SIZE_T)tmp % (sizeof(OPJ_INT32) * VREG_INT_COUNT) == 0);
 590
 591     s1n_0 = LOADU(in_even + 0);
 592     s1n_1 = LOADU(in_even + VREG_INT_COUNT);
 593     d1n_0 = LOADU(in_odd);
 594     d1n_1 = LOADU(in_odd + VREG_INT_COUNT);
 595
 596     /* s0n = s1n - ((d1n + 1) >> 1); <==> */
 597     /* s0n = s1n - ((d1n + d1n + 2) >> 2); */
 598     s0n_0 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), 2));
 599     s0n_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), 2));
 600
 601     for (i = 0, j = 1; i < (len - 3); i += 2, j++) {
 602         d1c_0 = d1n_0;
 603         s0c_0 = s0n_0;
 604         d1c_1 = d1n_1;
 605         s0c_1 = s0n_1;
 606
 607         s1n_0 = LOADU(in_even + j * stride);
 608         s1n_1 = LOADU(in_even + j * stride + VREG_INT_COUNT);
 609         d1n_0 = LOADU(in_odd + j * stride);
 610         d1n_1 = LOADU(in_odd + j * stride + VREG_INT_COUNT);
 611
 612         /*s0n = s1n - ((d1c + d1n + 2) >> 2);*/
 613         s0n_0 = SUB(s1n_0, SAR(ADD3(d1c_0, d1n_0, two), 2));
 614         s0n_1 = SUB(s1n_1, SAR(ADD3(d1c_1, d1n_1, two), 2));
 615
 616         STORE(tmp + PARALLEL_COLS_53 * (i + 0), s0c_0);
 617         STORE(tmp + PARALLEL_COLS_53 * (i + 0) + VREG_INT_COUNT, s0c_1);
 618
 619         /* d1c + ((s0c + s0n) >> 1) */
 620         STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 0,
 621               ADD(d1c_0, SAR(ADD(s0c_0, s0n_0), 1)));
 622         STORE(tmp + PARALLEL_COLS_53 * (i + 1) + VREG_INT_COUNT,
 623               ADD(d1c_1, SAR(ADD(s0c_1, s0n_1), 1)));
 624     }
 625
 626     STORE(tmp + PARALLEL_COLS_53 * (i + 0) + 0, s0n_0);
 627     STORE(tmp + PARALLEL_COLS_53 * (i + 0) + VREG_INT_COUNT, s0n_1);
 628
 629     if (len & 1) {
 630         VREG tmp_len_minus_1;
 631         s1n_0 = LOADU(in_even + (OPJ_SIZE_T)((len - 1) / 2) * stride);
 632         /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */
 633         tmp_len_minus_1 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), 2));
 634         STORE(tmp + PARALLEL_COLS_53 * (len - 1), tmp_len_minus_1);
 635         /* d1n + ((s0n + tmp_len_minus_1) >> 1) */
 636         STORE(tmp + PARALLEL_COLS_53 * (len - 2),
 637               ADD(d1n_0, SAR(ADD(s0n_0, tmp_len_minus_1), 1)));
 638
 639         s1n_1 = LOADU(in_even + (OPJ_SIZE_T)((len - 1) / 2) * stride + VREG_INT_COUNT);
 640         /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */
 641         tmp_len_minus_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), 2));
 642         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT,
 643               tmp_len_minus_1);
 644         /* d1n + ((s0n + tmp_len_minus_1) >> 1) */
 645         STORE(tmp + PARALLEL_COLS_53 * (len - 2) + VREG_INT_COUNT,
 646               ADD(d1n_1, SAR(ADD(s0n_1, tmp_len_minus_1), 1)));
 647
 648
 649     } else {
 650         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0,
 651               ADD(d1n_0, s0n_0));
 652         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT,
 653               ADD(d1n_1, s0n_1));
 654     }
 655
 656     opj_idwt53_v_final_memcpy(tiledp_col, tmp, len, stride);
 657 }
 658
 659
 660 /** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
 661  * 16 in AVX2, when top-most pixel is on odd coordinate */
 662 static void opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(
 663     OPJ_INT32* tmp,
 664     const OPJ_INT32 sn,
 665     const OPJ_INT32 len,
 666     OPJ_INT32* tiledp_col,
 667     const OPJ_SIZE_T stride)
 668 {
 669     OPJ_INT32 i;
 670     OPJ_SIZE_T j;
 671
 672     VREG s1_0, s2_0, dc_0, dn_0;
 673     VREG s1_1, s2_1, dc_1, dn_1;
 674     const VREG two = LOAD_CST(2);
 675
 676     const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride];
 677     const OPJ_INT32* in_odd = &tiledp_col[0];
 678
 679     assert(len > 2);
 680 #if __AVX2__
 681     assert(PARALLEL_COLS_53 == 16);
 682     assert(VREG_INT_COUNT == 8);
 683 #else
 684     assert(PARALLEL_COLS_53 == 8);
 685     assert(VREG_INT_COUNT == 4);
 686 #endif
 687
 688     /* Note: loads of input even/odd values must be done in a unaligned */
 689     /* fashion. But stores in tmp can be done with aligned store, since */
 690     /* the temporary buffer is properly aligned */
 691     assert((OPJ_SIZE_T)tmp % (sizeof(OPJ_INT32) * VREG_INT_COUNT) == 0);
 692
 693     s1_0 = LOADU(in_even + stride);
 694     /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */
 695     dc_0 = SUB(LOADU(in_odd + 0),
 696                SAR(ADD3(LOADU(in_even + 0), s1_0, two), 2));
 697     STORE(tmp + PARALLEL_COLS_53 * 0, ADD(LOADU(in_even + 0), dc_0));
 698
 699     s1_1 = LOADU(in_even + stride + VREG_INT_COUNT);
 700     /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */
 701     dc_1 = SUB(LOADU(in_odd + VREG_INT_COUNT),
 702                SAR(ADD3(LOADU(in_even + VREG_INT_COUNT), s1_1, two), 2));
 703     STORE(tmp + PARALLEL_COLS_53 * 0 + VREG_INT_COUNT,
 704           ADD(LOADU(in_even + VREG_INT_COUNT), dc_1));
 705
 706     for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
 707
 708         s2_0 = LOADU(in_even + (j + 1) * stride);
 709         s2_1 = LOADU(in_even + (j + 1) * stride + VREG_INT_COUNT);
 710
 711         /* dn = in_odd[j * stride] - ((s1 + s2 + 2) >> 2); */
 712         dn_0 = SUB(LOADU(in_odd + j * stride),
 713                    SAR(ADD3(s1_0, s2_0, two), 2));
 714         dn_1 = SUB(LOADU(in_odd + j * stride + VREG_INT_COUNT),
 715                    SAR(ADD3(s1_1, s2_1, two), 2));
 716
 717         STORE(tmp + PARALLEL_COLS_53 * i, dc_0);
 718         STORE(tmp + PARALLEL_COLS_53 * i + VREG_INT_COUNT, dc_1);
 719
 720         /* tmp[i + 1] = s1 + ((dn + dc) >> 1); */
 721         STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 0,
 722               ADD(s1_0, SAR(ADD(dn_0, dc_0), 1)));
 723         STORE(tmp + PARALLEL_COLS_53 * (i + 1) + VREG_INT_COUNT,
 724               ADD(s1_1, SAR(ADD(dn_1, dc_1), 1)));
 725
 726         dc_0 = dn_0;
 727         s1_0 = s2_0;
 728         dc_1 = dn_1;
 729         s1_1 = s2_1;
 730     }
 731     STORE(tmp + PARALLEL_COLS_53 * i, dc_0);
 732     STORE(tmp + PARALLEL_COLS_53 * i + VREG_INT_COUNT, dc_1);
 733
 734     if (!(len & 1)) {
 735         /*dn = in_odd[(len / 2 - 1) * stride] - ((s1 + 1) >> 1); */
 736         dn_0 = SUB(LOADU(in_odd + (OPJ_SIZE_T)(len / 2 - 1) * stride),
 737                    SAR(ADD3(s1_0, s1_0, two), 2));
 738         dn_1 = SUB(LOADU(in_odd + (OPJ_SIZE_T)(len / 2 - 1) * stride + VREG_INT_COUNT),
 739                    SAR(ADD3(s1_1, s1_1, two), 2));
 740
 741         /* tmp[len - 2] = s1 + ((dn + dc) >> 1); */
 742         STORE(tmp + PARALLEL_COLS_53 * (len - 2) + 0,
 743               ADD(s1_0, SAR(ADD(dn_0, dc_0), 1)));
 744         STORE(tmp + PARALLEL_COLS_53 * (len - 2) + VREG_INT_COUNT,
 745               ADD(s1_1, SAR(ADD(dn_1, dc_1), 1)));
 746
 747         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, dn_0);
 748         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT, dn_1);
 749     } else {
 750         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, ADD(s1_0, dc_0));
 751         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT,
 752               ADD(s1_1, dc_1));
 753     }
 754
 755     opj_idwt53_v_final_memcpy(tiledp_col, tmp, len, stride);
 756 }
 757
 758 #undef VREG
 759 #undef LOAD_CST
 760 #undef LOADU
 761 #undef LOAD
 762 #undef STORE
 763 #undef STOREU
 764 #undef ADD
 765 #undef ADD3
 766 #undef SUB
 767 #undef SAR
 768
 769 #endif /* (defined(__SSE2__) || defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION) */
 770
 771 #if !defined(STANDARD_SLOW_VERSION)
 772 /** Vertical inverse 5x3 wavelet transform for one column, when top-most
 773  * pixel is on even coordinate */
 774 static void opj_idwt3_v_cas0(OPJ_INT32* tmp,
 775                              const OPJ_INT32 sn,
 776                              const OPJ_INT32 len,
 777                              OPJ_INT32* tiledp_col,
 778                              const OPJ_SIZE_T stride)
 779 {
 780     OPJ_INT32 i, j;
 781     OPJ_INT32 d1c, d1n, s1n, s0c, s0n;
 782
 783     assert(len > 1);
 784
 785     /* Performs lifting in one single iteration. Saves memory */
 786     /* accesses and explicit interleaving. */
 787
 788     s1n = tiledp_col[0];
 789     d1n = tiledp_col[(OPJ_SIZE_T)sn * stride];
 790     s0n = s1n - ((d1n + 1) >> 1);
 791
 792     for (i = 0, j = 0; i < (len - 3); i += 2, j++) {
 793         d1c = d1n;
 794         s0c = s0n;
 795
 796         s1n = tiledp_col[(OPJ_SIZE_T)(j + 1) * stride];
 797         d1n = tiledp_col[(OPJ_SIZE_T)(sn + j + 1) * stride];
 798
 799         s0n = s1n - ((d1c + d1n + 2) >> 2);
 800
 801         tmp[i  ] = s0c;
 802         tmp[i + 1] = d1c + ((s0c + s0n) >> 1);
 803     }
 804
 805     tmp[i] = s0n;
 806
 807     if (len & 1) {
 808         tmp[len - 1] =
 809             tiledp_col[(OPJ_SIZE_T)((len - 1) / 2) * stride] -
 810             ((d1n + 1) >> 1);
 811         tmp[len - 2] = d1n + ((s0n + tmp[len - 1]) >> 1);
 812     } else {
 813         tmp[len - 1] = d1n + s0n;
 814     }
 815
 816     for (i = 0; i < len; ++i) {
 817         tiledp_col[(OPJ_SIZE_T)i * stride] = tmp[i];
 818     }
 819 }
 820
 821
 822 /** Vertical inverse 5x3 wavelet transform for one column, when top-most
 823  * pixel is on odd coordinate */
 824 static void opj_idwt3_v_cas1(OPJ_INT32* tmp,
 825                              const OPJ_INT32 sn,
 826                              const OPJ_INT32 len,
 827                              OPJ_INT32* tiledp_col,
 828                              const OPJ_SIZE_T stride)
 829 {
 830     OPJ_INT32 i, j;
 831     OPJ_INT32 s1, s2, dc, dn;
 832     const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride];
 833     const OPJ_INT32* in_odd = &tiledp_col[0];
 834
 835     assert(len > 2);
 836
 837     /* Performs lifting in one single iteration. Saves memory */
 838     /* accesses and explicit interleaving. */
 839
 840     s1 = in_even[stride];
 841     dc = in_odd[0] - ((in_even[0] + s1 + 2) >> 2);
 842     tmp[0] = in_even[0] + dc;
 843     for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
 844
 845         s2 = in_even[(OPJ_SIZE_T)(j + 1) * stride];
 846
 847         dn = in_odd[(OPJ_SIZE_T)j * stride] - ((s1 + s2 + 2) >> 2);
 848         tmp[i  ] = dc;
 849         tmp[i + 1] = s1 + ((dn + dc) >> 1);
 850
 851         dc = dn;
 852         s1 = s2;
 853     }
 854     tmp[i] = dc;
 855     if (!(len & 1)) {
 856         dn = in_odd[(OPJ_SIZE_T)(len / 2 - 1) * stride] - ((s1 + 1) >> 1);
 857         tmp[len - 2] = s1 + ((dn + dc) >> 1);
 858         tmp[len - 1] = dn;
 859     } else {
 860         tmp[len - 1] = s1 + dc;
 861     }
 862
 863     for (i = 0; i < len; ++i) {
 864         tiledp_col[(OPJ_SIZE_T)i * stride] = tmp[i];
 865     }
 866 }
 867 #endif /* !defined(STANDARD_SLOW_VERSION) */
 868
 869 /* <summary>                            */
 870 /* Inverse vertical 5-3 wavelet transform in 1-D for several columns. */
 871 /* </summary>                           */
 872 /* Performs interleave, inverse wavelet transform and copy back to buffer */
 873 static void opj_idwt53_v(const opj_dwt_t *dwt,
 874                          OPJ_INT32* tiledp_col,
 875                          OPJ_SIZE_T stride,
 876                          OPJ_INT32 nb_cols)
 877 {
 878 #ifdef STANDARD_SLOW_VERSION
 879     /* For documentation purpose */
 880     OPJ_INT32 k, c;
 881     for (c = 0; c < nb_cols; c ++) {
 882         opj_dwt_interleave_v(dwt, tiledp_col + c, stride);
 883         opj_dwt_decode_1(dwt);
 884         for (k = 0; k < dwt->sn + dwt->dn; ++k) {
 885             tiledp_col[c + k * stride] = dwt->mem[k];
 886         }
 887     }
 888 #else
 889     const OPJ_INT32 sn = dwt->sn;
 890     const OPJ_INT32 len = sn + dwt->dn;
 891     if (dwt->cas == 0) {
 892         /* If len == 1, unmodified value */
 893
 894 #if (defined(__SSE2__) || defined(__AVX2__))
 895         if (len > 1 && nb_cols == PARALLEL_COLS_53) {
 896             /* Same as below general case, except that thanks to SSE2/AVX2 */
 897             /* we can efficiently process 8/16 columns in parallel */
 898             opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride);
 899             return;
 900         }
 901 #endif
 902         if (len > 1) {
 903             OPJ_INT32 c;
 904             for (c = 0; c < nb_cols; c++, tiledp_col++) {
 905                 opj_idwt3_v_cas0(dwt->mem, sn, len, tiledp_col, stride);
 906             }
 907             return;
 908         }
 909     } else {
 910         if (len == 1) {
 911             OPJ_INT32 c;
 912             for (c = 0; c < nb_cols; c++, tiledp_col++) {
 913                 tiledp_col[0] /= 2;
 914             }
 915             return;
 916         }
 917
 918         if (len == 2) {
 919             OPJ_INT32 c;
 920             OPJ_INT32* out = dwt->mem;
 921             for (c = 0; c < nb_cols; c++, tiledp_col++) {
 922                 OPJ_INT32 i;
 923                 const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride];
 924                 const OPJ_INT32* in_odd = &tiledp_col[0];
 925
 926                 out[1] = in_odd[0] - ((in_even[0] + 1) >> 1);
 927                 out[0] = in_even[0] + out[1];
 928
 929                 for (i = 0; i < len; ++i) {
 930                     tiledp_col[(OPJ_SIZE_T)i * stride] = out[i];
 931                 }
 932             }
 933
 934             return;
 935         }
 936
 937 #if (defined(__SSE2__) || defined(__AVX2__))
 938         if (len > 2 && nb_cols == PARALLEL_COLS_53) {
 939             /* Same as below general case, except that thanks to SSE2/AVX2 */
 940             /* we can efficiently process 8/16 columns in parallel */
 941             opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride);
 942             return;
 943         }
 944 #endif
 945         if (len > 2) {
 946             OPJ_INT32 c;
 947             for (c = 0; c < nb_cols; c++, tiledp_col++) {
 948                 opj_idwt3_v_cas1(dwt->mem, sn, len, tiledp_col, stride);
 949             }
 950             return;
 951         }
 952     }
 953 #endif
 954 }
 955
 956 #if 0
 957 static void opj_dwt_encode_step1(OPJ_FLOAT32* fw,
 958                                  OPJ_UINT32 end,
 959                                  const OPJ_FLOAT32 c)
 960 {
 961     OPJ_UINT32 i = 0;
 962     for (; i < end; ++i) {
 963         fw[0] *= c;
 964         fw += 2;
 965     }
 966 }
 967 #else
 968 static void opj_dwt_encode_step1_combined(OPJ_FLOAT32* fw,
 969         OPJ_UINT32 iters_c1,
 970         OPJ_UINT32 iters_c2,
 971         const OPJ_FLOAT32 c1,
 972         const OPJ_FLOAT32 c2)
 973 {
 974     OPJ_UINT32 i = 0;
 975     const OPJ_UINT32 iters_common =  opj_uint_min(iters_c1, iters_c2);
 976     assert((((OPJ_SIZE_T)fw) & 0xf) == 0);
 977     assert(opj_int_abs((OPJ_INT32)iters_c1 - (OPJ_INT32)iters_c2) <= 1);
 978     for (; i + 3 < iters_common; i += 4) {
 979 #ifdef __SSE__
 980         const __m128 vcst = _mm_set_ps(c2, c1, c2, c1);
 981         *(__m128*)fw = _mm_mul_ps(*(__m128*)fw, vcst);
 982         *(__m128*)(fw + 4) = _mm_mul_ps(*(__m128*)(fw + 4), vcst);
 983 #else
 984         fw[0] *= c1;
 985         fw[1] *= c2;
 986         fw[2] *= c1;
 987         fw[3] *= c2;
 988         fw[4] *= c1;
 989         fw[5] *= c2;
 990         fw[6] *= c1;
 991         fw[7] *= c2;
 992 #endif
 993         fw += 8;
 994     }
 995     for (; i < iters_common; i++) {
 996         fw[0] *= c1;
 997         fw[1] *= c2;
 998         fw += 2;
 999     }
1000     if (i < iters_c1) {
1001         fw[0] *= c1;
1002     } else if (i < iters_c2) {
1003         fw[1] *= c2;
1004     }
1005 }
1006
1007 #endif
1008
1009 static void opj_dwt_encode_step2(OPJ_FLOAT32* fl, OPJ_FLOAT32* fw,
1010                                  OPJ_UINT32 end,
1011                                  OPJ_UINT32 m,
1012                                  OPJ_FLOAT32 c)
1013 {
1014     OPJ_UINT32 i;
1015     OPJ_UINT32 imax = opj_uint_min(end, m);
1016     if (imax > 0) {
1017         fw[-1] += (fl[0] + fw[0]) * c;
1018         fw += 2;
1019         i = 1;
1020         for (; i + 3 < imax; i += 4) {
1021             fw[-1] += (fw[-2] + fw[0]) * c;
1022             fw[1] += (fw[0] + fw[2]) * c;
1023             fw[3] += (fw[2] + fw[4]) * c;
1024             fw[5] += (fw[4] + fw[6]) * c;
1025             fw += 8;
1026         }
1027         for (; i < imax; ++i) {
1028             fw[-1] += (fw[-2] + fw[0]) * c;
1029             fw += 2;
1030         }
1031     }
1032     if (m < end) {
1033         assert(m + 1 == end);
1034         fw[-1] += (2 * fw[-2]) * c;
1035     }
1036 }
1037
1038 static void opj_dwt_encode_1_real(void *aIn, OPJ_INT32 dn, OPJ_INT32 sn,
1039                                   OPJ_INT32 cas)
1040 {
1041     OPJ_FLOAT32* w = (OPJ_FLOAT32*)aIn;
1042     OPJ_INT32 a, b;
1043     assert(dn + sn > 1);
1044     if (cas == 0) {
1045         a = 0;
1046         b = 1;
1047     } else {
1048         a = 1;
1049         b = 0;
1050     }
1051     opj_dwt_encode_step2(w + a, w + b + 1,
1052                          (OPJ_UINT32)dn,
1053                          (OPJ_UINT32)opj_int_min(dn, sn - b),
1054                          opj_dwt_alpha);
1055     opj_dwt_encode_step2(w + b, w + a + 1,
1056                          (OPJ_UINT32)sn,
1057                          (OPJ_UINT32)opj_int_min(sn, dn - a),
1058                          opj_dwt_beta);
1059     opj_dwt_encode_step2(w + a, w + b + 1,
1060                          (OPJ_UINT32)dn,
1061                          (OPJ_UINT32)opj_int_min(dn, sn - b),
1062                          opj_dwt_gamma);
1063     opj_dwt_encode_step2(w + b, w + a + 1,
1064                          (OPJ_UINT32)sn,
1065                          (OPJ_UINT32)opj_int_min(sn, dn - a),
1066                          opj_dwt_delta);
1067 #if 0
1068     opj_dwt_encode_step1(w + b, (OPJ_UINT32)dn,
1069                          opj_K);
1070     opj_dwt_encode_step1(w + a, (OPJ_UINT32)sn,
1071                          opj_invK);
1072 #else
1073     if (a == 0) {
1074         opj_dwt_encode_step1_combined(w,
1075                                       (OPJ_UINT32)sn,
1076                                       (OPJ_UINT32)dn,
1077                                       opj_invK,
1078                                       opj_K);
1079     } else {
1080         opj_dwt_encode_step1_combined(w,
1081                                       (OPJ_UINT32)dn,
1082                                       (OPJ_UINT32)sn,
1083                                       opj_K,
1084                                       opj_invK);
1085     }
1086 #endif
1087 }
1088
1089 static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps,
1090                                     opj_stepsize_t *bandno_stepsize)
1091 {
1092     OPJ_INT32 p, n;
1093     p = opj_int_floorlog2(stepsize) - 13;
1094     n = 11 - opj_int_floorlog2(stepsize);
1095     bandno_stepsize->mant = (n < 0 ? stepsize >> -n : stepsize << n) & 0x7ff;
1096     bandno_stepsize->expn = numbps - p;
1097 }
1098
1099 /*
1100 ==========================================================
1101    DWT interface
1102 ==========================================================
1103 */
1104
1105 /** Process one line for the horizontal pass of the 5x3 forward transform */
1106 static
1107 void opj_dwt_encode_and_deinterleave_h_one_row(void* rowIn,
1108         void* tmpIn,
1109         OPJ_UINT32 width,
1110         OPJ_BOOL even)
1111 {
1112     OPJ_INT32* OPJ_RESTRICT row = (OPJ_INT32*)rowIn;
1113     OPJ_INT32* OPJ_RESTRICT tmp = (OPJ_INT32*)tmpIn;
1114     const OPJ_INT32 sn = (OPJ_INT32)((width + (even ? 1 : 0)) >> 1);
1115     const OPJ_INT32 dn = (OPJ_INT32)(width - (OPJ_UINT32)sn);
1116
1117     if (even) {
1118         if (width > 1) {
1119             OPJ_INT32 i;
1120             for (i = 0; i < sn - 1; i++) {
1121                 tmp[sn + i] = row[2 * i + 1] - ((row[(i) * 2] + row[(i + 1) * 2]) >> 1);
1122             }
1123             if ((width % 2) == 0) {
1124                 tmp[sn + i] = row[2 * i + 1] - row[(i) * 2];
1125             }
1126             row[0] += (tmp[sn] + tmp[sn] + 2) >> 2;
1127             for (i = 1; i < dn; i++) {
1128                 row[i] = row[2 * i] + ((tmp[sn + (i - 1)] + tmp[sn + i] + 2) >> 2);
1129             }
1130             if ((width % 2) == 1) {
1131                 row[i] = row[2 * i] + ((tmp[sn + (i - 1)] + tmp[sn + (i - 1)] + 2) >> 2);
1132             }
1133             memcpy(row + sn, tmp + sn, (OPJ_SIZE_T)dn * sizeof(OPJ_INT32));
1134         }
1135     } else {
1136         if (width == 1) {
1137             row[0] *= 2;
1138         } else {
1139             OPJ_INT32 i;
1140             tmp[sn + 0] = row[0] - row[1];
1141             for (i = 1; i < sn; i++) {
1142                 tmp[sn + i] = row[2 * i] - ((row[2 * i + 1] + row[2 * (i - 1) + 1]) >> 1);
1143             }
1144             if ((width % 2) == 1) {
1145                 tmp[sn + i] = row[2 * i] - row[2 * (i - 1) + 1];
1146             }
1147
1148             for (i = 0; i < dn - 1; i++) {
1149                 row[i] = row[2 * i + 1] + ((tmp[sn + i] + tmp[sn + i + 1] + 2) >> 2);
1150             }
1151             if ((width % 2) == 0) {
1152                 row[i] = row[2 * i + 1] + ((tmp[sn + i] + tmp[sn + i] + 2) >> 2);
1153             }
1154             memcpy(row + sn, tmp + sn, (OPJ_SIZE_T)dn * sizeof(OPJ_INT32));
1155         }
1156     }
1157 }
1158
1159 /** Process one line for the horizontal pass of the 9x7 forward transform */
1160 static
1161 void opj_dwt_encode_and_deinterleave_h_one_row_real(void* rowIn,
1162         void* tmpIn,
1163         OPJ_UINT32 width,
1164         OPJ_BOOL even)
1165 {
1166     OPJ_FLOAT32* OPJ_RESTRICT row = (OPJ_FLOAT32*)rowIn;
1167     OPJ_FLOAT32* OPJ_RESTRICT tmp = (OPJ_FLOAT32*)tmpIn;
1168     const OPJ_INT32 sn = (OPJ_INT32)((width + (even ? 1 : 0)) >> 1);
1169     const OPJ_INT32 dn = (OPJ_INT32)(width - (OPJ_UINT32)sn);
1170     if (width == 1) {
1171         return;
1172     }
1173     memcpy(tmp, row, width * sizeof(OPJ_FLOAT32));
1174     opj_dwt_encode_1_real(tmp, dn, sn, even ? 0 : 1);
1175     opj_dwt_deinterleave_h((OPJ_INT32 * OPJ_RESTRICT)tmp,
1176                            (OPJ_INT32 * OPJ_RESTRICT)row,
1177                            dn, sn, even ? 0 : 1);
1178 }
1179
1180 typedef struct {
1181     opj_dwt_t h;
1182     OPJ_UINT32 rw; /* Width of the resolution to process */
1183     OPJ_UINT32 w; /* Width of tiledp */
1184     OPJ_INT32 * OPJ_RESTRICT tiledp;
1185     OPJ_UINT32 min_j;
1186     OPJ_UINT32 max_j;
1187     opj_encode_and_deinterleave_h_one_row_fnptr_type p_function;
1188 } opj_dwt_encode_h_job_t;
1189
1190 static void opj_dwt_encode_h_func(void* user_data, opj_tls_t* tls)
1191 {
1192     OPJ_UINT32 j;
1193     opj_dwt_encode_h_job_t* job;
1194     (void)tls;
1195
1196     job = (opj_dwt_encode_h_job_t*)user_data;
1197     for (j = job->min_j; j < job->max_j; j++) {
1198         OPJ_INT32* OPJ_RESTRICT aj = job->tiledp + j * job->w;
1199         (*job->p_function)(aj, job->h.mem, job->rw,
1200                            job->h.cas == 0 ? OPJ_TRUE : OPJ_FALSE);
1201     }
1202
1203     opj_aligned_free(job->h.mem);
1204     opj_free(job);
1205 }
1206
1207 typedef struct {
1208     opj_dwt_t v;
1209     OPJ_UINT32 rh;
1210     OPJ_UINT32 w;
1211     OPJ_INT32 * OPJ_RESTRICT tiledp;
1212     OPJ_UINT32 min_j;
1213     OPJ_UINT32 max_j;
1214     opj_encode_and_deinterleave_v_fnptr_type p_encode_and_deinterleave_v;
1215 } opj_dwt_encode_v_job_t;
1216
1217 static void opj_dwt_encode_v_func(void* user_data, opj_tls_t* tls)
1218 {
1219     OPJ_UINT32 j;
1220     opj_dwt_encode_v_job_t* job;
1221     (void)tls;
1222
1223     job = (opj_dwt_encode_v_job_t*)user_data;
1224     for (j = job->min_j; j + NB_ELTS_V8 - 1 < job->max_j; j += NB_ELTS_V8) {
1225         (*job->p_encode_and_deinterleave_v)(job->tiledp + j,
1226                                             job->v.mem,
1227                                             job->rh,
1228                                             job->v.cas == 0,
1229                                             job->w,
1230                                             NB_ELTS_V8);
1231     }
1232     if (j < job->max_j) {
1233         (*job->p_encode_and_deinterleave_v)(job->tiledp + j,
1234                                             job->v.mem,
1235                                             job->rh,
1236                                             job->v.cas == 0,
1237                                             job->w,
1238                                             job->max_j - j);
1239     }
1240
1241     opj_aligned_free(job->v.mem);
1242     opj_free(job);
1243 }
1244
1245 /** Fetch up to cols <= NB_ELTS_V8 for each line, and put them in tmpOut */
1246 /* that has a NB_ELTS_V8 interleave factor. */
1247 static void opj_dwt_fetch_cols_vertical_pass(const void *arrayIn,
1248         void *tmpOut,
1249         OPJ_UINT32 height,
1250         OPJ_UINT32 stride_width,
1251         OPJ_UINT32 cols)
1252 {
1253     const OPJ_INT32* OPJ_RESTRICT array = (const OPJ_INT32 * OPJ_RESTRICT)arrayIn;
1254     OPJ_INT32* OPJ_RESTRICT tmp = (OPJ_INT32 * OPJ_RESTRICT)tmpOut;
1255     if (cols == NB_ELTS_V8) {
1256         OPJ_UINT32 k;
1257         for (k = 0; k < height; ++k) {
1258             memcpy(tmp + NB_ELTS_V8 * k,
1259                    array + k * stride_width,
1260                    NB_ELTS_V8 * sizeof(OPJ_INT32));
1261         }
1262     } else {
1263         OPJ_UINT32 k;
1264         for (k = 0; k < height; ++k) {
1265             OPJ_UINT32 c;
1266             for (c = 0; c < cols; c++) {
1267                 tmp[NB_ELTS_V8 * k + c] = array[c + k * stride_width];
1268             }
1269             for (; c < NB_ELTS_V8; c++) {
1270                 tmp[NB_ELTS_V8 * k + c] = 0;
1271             }
1272         }
1273     }
1274 }
1275
1276 /* Deinterleave result of forward transform, where cols <= NB_ELTS_V8 */
1277 /* and src contains NB_ELTS_V8 consecutive values for up to NB_ELTS_V8 */
1278 /* columns. */
1279 static INLINE void opj_dwt_deinterleave_v_cols(
1280     const OPJ_INT32 * OPJ_RESTRICT src,
1281     OPJ_INT32 * OPJ_RESTRICT dst,
1282     OPJ_INT32 dn,
1283     OPJ_INT32 sn,
1284     OPJ_UINT32 stride_width,
1285     OPJ_INT32 cas,
1286     OPJ_UINT32 cols)
1287 {
1288     OPJ_INT32 k;
1289     OPJ_INT32 i = sn;
1290     OPJ_INT32 * OPJ_RESTRICT l_dest = dst;
1291     const OPJ_INT32 * OPJ_RESTRICT l_src = src + cas * NB_ELTS_V8;
1292     OPJ_UINT32 c;
1293
1294     for (k = 0; k < 2; k++) {
1295         while (i--) {
1296             if (cols == NB_ELTS_V8) {
1297                 memcpy(l_dest, l_src, NB_ELTS_V8 * sizeof(OPJ_INT32));
1298             } else {
1299                 c = 0;
1300                 switch (cols) {
1301                 case 7:
1302                     l_dest[c] = l_src[c];
1303                     c++; /* fallthru */
1304                 case 6:
1305                     l_dest[c] = l_src[c];
1306                     c++; /* fallthru */
1307                 case 5:
1308                     l_dest[c] = l_src[c];
1309                     c++; /* fallthru */
1310                 case 4:
1311                     l_dest[c] = l_src[c];
1312                     c++; /* fallthru */
1313                 case 3:
1314                     l_dest[c] = l_src[c];
1315                     c++; /* fallthru */
1316                 case 2:
1317                     l_dest[c] = l_src[c];
1318                     c++; /* fallthru */
1319                 default:
1320                     l_dest[c] = l_src[c];
1321                     break;
1322                 }
1323             }
1324             l_dest += stride_width;
1325             l_src += 2 * NB_ELTS_V8;
1326         }
1327
1328         l_dest = dst + (OPJ_SIZE_T)sn * (OPJ_SIZE_T)stride_width;
1329         l_src = src + (1 - cas) * NB_ELTS_V8;
1330         i = dn;
1331     }
1332 }
1333
1334
1335 /* Forward 5-3 transform, for the vertical pass, processing cols columns */
1336 /* where cols <= NB_ELTS_V8 */
1337 static void opj_dwt_encode_and_deinterleave_v(
1338     void *arrayIn,
1339     void *tmpIn,
1340     OPJ_UINT32 height,
1341     OPJ_BOOL even,
1342     OPJ_UINT32 stride_width,
1343     OPJ_UINT32 cols)
1344 {
1345     OPJ_INT32* OPJ_RESTRICT array = (OPJ_INT32 * OPJ_RESTRICT)arrayIn;
1346     OPJ_INT32* OPJ_RESTRICT tmp = (OPJ_INT32 * OPJ_RESTRICT)tmpIn;
1347     const OPJ_UINT32 sn = (height + (even ? 1 : 0)) >> 1;
1348     const OPJ_UINT32 dn = height - sn;
1349
1350     opj_dwt_fetch_cols_vertical_pass(arrayIn, tmpIn, height, stride_width, cols);
1351
1352 #define OPJ_Sc(i) tmp[(i)*2* NB_ELTS_V8 + c]
1353 #define OPJ_Dc(i) tmp[((1+(i)*2))* NB_ELTS_V8 + c]
1354
1355 #ifdef __SSE2__
1356     if (height == 1) {
1357         if (!even) {
1358             OPJ_UINT32 c;
1359             for (c = 0; c < NB_ELTS_V8; c++) {
1360                 tmp[c] *= 2;
1361             }
1362         }
1363     } else if (even) {
1364         OPJ_UINT32 c;
1365         OPJ_UINT32 i;
1366         i = 0;
1367         if (i + 1 < sn) {
1368             __m128i xmm_Si_0 = *(const __m128i*)(tmp + 4 * 0);
1369             __m128i xmm_Si_1 = *(const __m128i*)(tmp + 4 * 1);
1370             for (; i + 1 < sn; i++) {
1371                 __m128i xmm_Sip1_0 = *(const __m128i*)(tmp +
1372                                                        (i + 1) * 2 * NB_ELTS_V8 + 4 * 0);
1373                 __m128i xmm_Sip1_1 = *(const __m128i*)(tmp +
1374                                                        (i + 1) * 2 * NB_ELTS_V8 + 4 * 1);
1375                 __m128i xmm_Di_0 = *(const __m128i*)(tmp +
1376                                                      (1 + i * 2) * NB_ELTS_V8 + 4 * 0);
1377                 __m128i xmm_Di_1 = *(const __m128i*)(tmp +
1378                                                      (1 + i * 2) * NB_ELTS_V8 + 4 * 1);
1379                 xmm_Di_0 = _mm_sub_epi32(xmm_Di_0,
1380                                          _mm_srai_epi32(_mm_add_epi32(xmm_Si_0, xmm_Sip1_0), 1));
1381                 xmm_Di_1 = _mm_sub_epi32(xmm_Di_1,
1382                                          _mm_srai_epi32(_mm_add_epi32(xmm_Si_1, xmm_Sip1_1), 1));
1383                 *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 0) =  xmm_Di_0;
1384                 *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 1) =  xmm_Di_1;
1385                 xmm_Si_0 = xmm_Sip1_0;
1386                 xmm_Si_1 = xmm_Sip1_1;
1387             }
1388         }
1389         if (((height) % 2) == 0) {
1390             for (c = 0; c < NB_ELTS_V8; c++) {
1391                 OPJ_Dc(i) -= OPJ_Sc(i);
1392             }
1393         }
1394         for (c = 0; c < NB_ELTS_V8; c++) {
1395             OPJ_Sc(0) += (OPJ_Dc(0) + OPJ_Dc(0) + 2) >> 2;
1396         }
1397         i = 1;
1398         if (i < dn) {
1399             __m128i xmm_Dim1_0 = *(const __m128i*)(tmp + (1 +
1400                                                    (i - 1) * 2) * NB_ELTS_V8 + 4 * 0);
1401             __m128i xmm_Dim1_1 = *(const __m128i*)(tmp + (1 +
1402                                                    (i - 1) * 2) * NB_ELTS_V8 + 4 * 1);
1403             const __m128i xmm_two = _mm_set1_epi32(2);
1404             for (; i < dn; i++) {
1405                 __m128i xmm_Di_0 = *(const __m128i*)(tmp +
1406                                                      (1 + i * 2) * NB_ELTS_V8 + 4 * 0);
1407                 __m128i xmm_Di_1 = *(const __m128i*)(tmp +
1408                                                      (1 + i * 2) * NB_ELTS_V8 + 4 * 1);
1409                 __m128i xmm_Si_0 = *(const __m128i*)(tmp +
1410                                                      (i * 2) * NB_ELTS_V8 + 4 * 0);
1411                 __m128i xmm_Si_1 = *(const __m128i*)(tmp +
1412                                                      (i * 2) * NB_ELTS_V8 + 4 * 1);
1413                 xmm_Si_0 = _mm_add_epi32(xmm_Si_0,
1414                                          _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Dim1_0, xmm_Di_0), xmm_two), 2));
1415                 xmm_Si_1 = _mm_add_epi32(xmm_Si_1,
1416                                          _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Dim1_1, xmm_Di_1), xmm_two), 2));
1417                 *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 0) = xmm_Si_0;
1418                 *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 1) = xmm_Si_1;
1419                 xmm_Dim1_0 = xmm_Di_0;
1420                 xmm_Dim1_1 = xmm_Di_1;
1421             }
1422         }
1423         if (((height) % 2) == 1) {
1424             for (c = 0; c < NB_ELTS_V8; c++) {
1425                 OPJ_Sc(i) += (OPJ_Dc(i - 1) + OPJ_Dc(i - 1) + 2) >> 2;
1426             }
1427         }
1428     } else {
1429         OPJ_UINT32 c;
1430         OPJ_UINT32 i;
1431         for (c = 0; c < NB_ELTS_V8; c++) {
1432             OPJ_Sc(0) -= OPJ_Dc(0);
1433         }
1434         i = 1;
1435         if (i < sn) {
1436             __m128i xmm_Dim1_0 = *(const __m128i*)(tmp + (1 +
1437                                                    (i - 1) * 2) * NB_ELTS_V8 + 4 * 0);
1438             __m128i xmm_Dim1_1 = *(const __m128i*)(tmp + (1 +
1439                                                    (i - 1) * 2) * NB_ELTS_V8 + 4 * 1);
1440             for (; i < sn; i++) {
1441                 __m128i xmm_Di_0 = *(const __m128i*)(tmp +
1442                                                      (1 + i * 2) * NB_ELTS_V8 + 4 * 0);
1443                 __m128i xmm_Di_1 = *(const __m128i*)(tmp +
1444                                                      (1 + i * 2) * NB_ELTS_V8 + 4 * 1);
1445                 __m128i xmm_Si_0 = *(const __m128i*)(tmp +
1446                                                      (i * 2) * NB_ELTS_V8 + 4 * 0);
1447                 __m128i xmm_Si_1 = *(const __m128i*)(tmp +
1448                                                      (i * 2) * NB_ELTS_V8 + 4 * 1);
1449                 xmm_Si_0 = _mm_sub_epi32(xmm_Si_0,
1450                                          _mm_srai_epi32(_mm_add_epi32(xmm_Di_0, xmm_Dim1_0), 1));
1451                 xmm_Si_1 = _mm_sub_epi32(xmm_Si_1,
1452                                          _mm_srai_epi32(_mm_add_epi32(xmm_Di_1, xmm_Dim1_1), 1));
1453                 *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 0) = xmm_Si_0;
1454                 *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 1) = xmm_Si_1;
1455                 xmm_Dim1_0 = xmm_Di_0;
1456                 xmm_Dim1_1 = xmm_Di_1;
1457             }
1458         }
1459         if (((height) % 2) == 1) {
1460             for (c = 0; c < NB_ELTS_V8; c++) {
1461                 OPJ_Sc(i) -= OPJ_Dc(i - 1);
1462             }
1463         }
1464         i = 0;
1465         if (i + 1 < dn) {
1466             __m128i xmm_Si_0 = *((const __m128i*)(tmp + 4 * 0));
1467             __m128i xmm_Si_1 = *((const __m128i*)(tmp + 4 * 1));
1468             const __m128i xmm_two = _mm_set1_epi32(2);
1469             for (; i + 1 < dn; i++) {
1470                 __m128i xmm_Sip1_0 = *(const __m128i*)(tmp +
1471                                                        (i + 1) * 2 * NB_ELTS_V8 + 4 * 0);
1472                 __m128i xmm_Sip1_1 = *(const __m128i*)(tmp +
1473                                                        (i + 1) * 2 * NB_ELTS_V8 + 4 * 1);
1474                 __m128i xmm_Di_0 = *(const __m128i*)(tmp +
1475                                                      (1 + i * 2) * NB_ELTS_V8 + 4 * 0);
1476                 __m128i xmm_Di_1 = *(const __m128i*)(tmp +
1477                                                      (1 + i * 2) * NB_ELTS_V8 + 4 * 1);
1478                 xmm_Di_0 = _mm_add_epi32(xmm_Di_0,
1479                                          _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Si_0, xmm_Sip1_0), xmm_two), 2));
1480                 xmm_Di_1 = _mm_add_epi32(xmm_Di_1,
1481                                          _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Si_1, xmm_Sip1_1), xmm_two), 2));
1482                 *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 0) = xmm_Di_0;
1483                 *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 1) = xmm_Di_1;
1484                 xmm_Si_0 = xmm_Sip1_0;
1485                 xmm_Si_1 = xmm_Sip1_1;
1486             }
1487         }
1488         if (((height) % 2) == 0) {
1489             for (c = 0; c < NB_ELTS_V8; c++) {
1490                 OPJ_Dc(i) += (OPJ_Sc(i) + OPJ_Sc(i) + 2) >> 2;
1491             }
1492         }
1493     }
1494 #else
1495     if (even) {
1496         OPJ_UINT32 c;
1497         if (height > 1) {
1498             OPJ_UINT32 i;
1499             for (i = 0; i + 1 < sn; i++) {
1500                 for (c = 0; c < NB_ELTS_V8; c++) {
1501                     OPJ_Dc(i) -= (OPJ_Sc(i) + OPJ_Sc(i + 1)) >> 1;
1502                 }
1503             }
1504             if (((height) % 2) == 0) {
1505                 for (c = 0; c < NB_ELTS_V8; c++) {
1506                     OPJ_Dc(i) -= OPJ_Sc(i);
1507                 }
1508             }
1509             for (c = 0; c < NB_ELTS_V8; c++) {
1510                 OPJ_Sc(0) += (OPJ_Dc(0) + OPJ_Dc(0) + 2) >> 2;
1511             }
1512             for (i = 1; i < dn; i++) {
1513                 for (c = 0; c < NB_ELTS_V8; c++) {
1514                     OPJ_Sc(i) += (OPJ_Dc(i - 1) + OPJ_Dc(i) + 2) >> 2;
1515                 }
1516             }
1517             if (((height) % 2) == 1) {
1518                 for (c = 0; c < NB_ELTS_V8; c++) {
1519                     OPJ_Sc(i) += (OPJ_Dc(i - 1) + OPJ_Dc(i - 1) + 2) >> 2;
1520                 }
1521             }
1522         }
1523     } else {
1524         OPJ_UINT32 c;
1525         if (height == 1) {
1526             for (c = 0; c < NB_ELTS_V8; c++) {
1527                 OPJ_Sc(0) *= 2;
1528             }
1529         } else {
1530             OPJ_UINT32 i;
1531             for (c = 0; c < NB_ELTS_V8; c++) {
1532                 OPJ_Sc(0) -= OPJ_Dc(0);
1533             }
1534             for (i = 1; i < sn; i++) {
1535                 for (c = 0; c < NB_ELTS_V8; c++) {
1536                     OPJ_Sc(i) -= (OPJ_Dc(i) + OPJ_Dc(i - 1)) >> 1;
1537                 }
1538             }
1539             if (((height) % 2) == 1) {
1540                 for (c = 0; c < NB_ELTS_V8; c++) {
1541                     OPJ_Sc(i) -= OPJ_Dc(i - 1);
1542                 }
1543             }
1544             for (i = 0; i + 1 < dn; i++) {
1545                 for (c = 0; c < NB_ELTS_V8; c++) {
1546                     OPJ_Dc(i) += (OPJ_Sc(i) + OPJ_Sc(i + 1) + 2) >> 2;
1547                 }
1548             }
1549             if (((height) % 2) == 0) {
1550                 for (c = 0; c < NB_ELTS_V8; c++) {
1551                     OPJ_Dc(i) += (OPJ_Sc(i) + OPJ_Sc(i) + 2) >> 2;
1552                 }
1553             }
1554         }
1555     }
1556 #endif
1557
1558     if (cols == NB_ELTS_V8) {
1559         opj_dwt_deinterleave_v_cols(tmp, array, (OPJ_INT32)dn, (OPJ_INT32)sn,
1560                                     stride_width, even ? 0 : 1, NB_ELTS_V8);
1561     } else {
1562         opj_dwt_deinterleave_v_cols(tmp, array, (OPJ_INT32)dn, (OPJ_INT32)sn,
1563                                     stride_width, even ? 0 : 1, cols);
1564     }
1565 }
1566
1567 static void opj_v8dwt_encode_step1(OPJ_FLOAT32* fw,
1568                                    OPJ_UINT32 end,
1569                                    const OPJ_FLOAT32 cst)
1570 {
1571     OPJ_UINT32 i;
1572 #ifdef __SSE__
1573     __m128* vw = (__m128*) fw;
1574     const __m128 vcst = _mm_set1_ps(cst);
1575     for (i = 0; i < end; ++i) {
1576         vw[0] = _mm_mul_ps(vw[0], vcst);
1577         vw[1] = _mm_mul_ps(vw[1], vcst);
1578         vw += 2 * (NB_ELTS_V8 * sizeof(OPJ_FLOAT32) / sizeof(__m128));
1579     }
1580 #else
1581     OPJ_UINT32 c;
1582     for (i = 0; i < end; ++i) {
1583         for (c = 0; c < NB_ELTS_V8; c++) {
1584             fw[i * 2 * NB_ELTS_V8 + c] *= cst;
1585         }
1586     }
1587 #endif
1588 }
1589
1590 static void opj_v8dwt_encode_step2(OPJ_FLOAT32* fl, OPJ_FLOAT32* fw,
1591                                    OPJ_UINT32 end,
1592                                    OPJ_UINT32 m,
1593                                    OPJ_FLOAT32 cst)
1594 {
1595     OPJ_UINT32 i;
1596     OPJ_UINT32 imax = opj_uint_min(end, m);
1597 #ifdef __SSE__
1598     __m128* vw = (__m128*) fw;
1599     __m128 vcst = _mm_set1_ps(cst);
1600     if (imax > 0) {
1601         __m128* vl = (__m128*) fl;
1602         vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vl[0], vw[0]), vcst));
1603         vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vl[1], vw[1]), vcst));
1604         vw += 2 * (NB_ELTS_V8 * sizeof(OPJ_FLOAT32) / sizeof(__m128));
1605         i = 1;
1606
1607         for (; i < imax; ++i) {
1608             vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vw[-4], vw[0]), vcst));
1609             vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vw[-3], vw[1]), vcst));
1610             vw += 2 * (NB_ELTS_V8 * sizeof(OPJ_FLOAT32) / sizeof(__m128));
1611         }
1612     }
1613     if (m < end) {
1614         assert(m + 1 == end);
1615         vcst = _mm_add_ps(vcst, vcst);
1616         vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(vw[-4], vcst));
1617         vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(vw[-3], vcst));
1618     }
1619 #else
1620     OPJ_INT32 c;
1621     if (imax > 0) {
1622         for (c = 0; c < NB_ELTS_V8; c++) {
1623             fw[-1 * NB_ELTS_V8 + c] += (fl[0 * NB_ELTS_V8 + c] + fw[0 * NB_ELTS_V8 + c]) *
1624                                        cst;
1625         }
1626         fw += 2 * NB_ELTS_V8;
1627         i = 1;
1628         for (; i < imax; ++i) {
1629             for (c = 0; c < NB_ELTS_V8; c++) {
1630                 fw[-1 * NB_ELTS_V8 + c] += (fw[-2 * NB_ELTS_V8 + c] + fw[0 * NB_ELTS_V8 + c]) *
1631                                            cst;
1632             }
1633             fw += 2 * NB_ELTS_V8;
1634         }
1635     }
1636     if (m < end) {
1637         assert(m + 1 == end);
1638         for (c = 0; c < NB_ELTS_V8; c++) {
1639             fw[-1 * NB_ELTS_V8 + c] += (2 * fw[-2 * NB_ELTS_V8 + c]) * cst;
1640         }
1641     }
1642 #endif
1643 }
1644
1645 /* Forward 9-7 transform, for the vertical pass, processing cols columns */
1646 /* where cols <= NB_ELTS_V8 */
1647 static void opj_dwt_encode_and_deinterleave_v_real(
1648     void *arrayIn,
1649     void *tmpIn,
1650     OPJ_UINT32 height,
1651     OPJ_BOOL even,
1652     OPJ_UINT32 stride_width,
1653     OPJ_UINT32 cols)
1654 {
1655     OPJ_FLOAT32* OPJ_RESTRICT array = (OPJ_FLOAT32 * OPJ_RESTRICT)arrayIn;
1656     OPJ_FLOAT32* OPJ_RESTRICT tmp = (OPJ_FLOAT32 * OPJ_RESTRICT)tmpIn;
1657     const OPJ_INT32 sn = (OPJ_INT32)((height + (even ? 1 : 0)) >> 1);
1658     const OPJ_INT32 dn = (OPJ_INT32)(height - (OPJ_UINT32)sn);
1659     OPJ_INT32 a, b;
1660
1661     if (height == 1) {
1662         return;
1663     }
1664
1665     opj_dwt_fetch_cols_vertical_pass(arrayIn, tmpIn, height, stride_width, cols);
1666
1667     if (even) {
1668         a = 0;
1669         b = 1;
1670     } else {
1671         a = 1;
1672         b = 0;
1673     }
1674     opj_v8dwt_encode_step2(tmp + a * NB_ELTS_V8,
1675                            tmp + (b + 1) * NB_ELTS_V8,
1676                            (OPJ_UINT32)dn,
1677                            (OPJ_UINT32)opj_int_min(dn, sn - b),
1678                            opj_dwt_alpha);
1679     opj_v8dwt_encode_step2(tmp + b * NB_ELTS_V8,
1680                            tmp + (a + 1) * NB_ELTS_V8,
1681                            (OPJ_UINT32)sn,
1682                            (OPJ_UINT32)opj_int_min(sn, dn - a),
1683                            opj_dwt_beta);
1684     opj_v8dwt_encode_step2(tmp + a * NB_ELTS_V8,
1685                            tmp + (b + 1) * NB_ELTS_V8,
1686                            (OPJ_UINT32)dn,
1687                            (OPJ_UINT32)opj_int_min(dn, sn - b),
1688                            opj_dwt_gamma);
1689     opj_v8dwt_encode_step2(tmp + b * NB_ELTS_V8,
1690                            tmp + (a + 1) * NB_ELTS_V8,
1691                            (OPJ_UINT32)sn,
1692                            (OPJ_UINT32)opj_int_min(sn, dn - a),
1693                            opj_dwt_delta);
1694     opj_v8dwt_encode_step1(tmp + b * NB_ELTS_V8, (OPJ_UINT32)dn,
1695                            opj_K);
1696     opj_v8dwt_encode_step1(tmp + a * NB_ELTS_V8, (OPJ_UINT32)sn,
1697                            opj_invK);
1698
1699
1700     if (cols == NB_ELTS_V8) {
1701         opj_dwt_deinterleave_v_cols((OPJ_INT32*)tmp,
1702                                     (OPJ_INT32*)array,
1703                                     (OPJ_INT32)dn, (OPJ_INT32)sn,
1704                                     stride_width, even ? 0 : 1, NB_ELTS_V8);
1705     } else {
1706         opj_dwt_deinterleave_v_cols((OPJ_INT32*)tmp,
1707                                     (OPJ_INT32*)array,
1708                                     (OPJ_INT32)dn, (OPJ_INT32)sn,
1709                                     stride_width, even ? 0 : 1, cols);
1710     }
1711 }
1712
1713
1714 /* <summary>                            */
1715 /* Forward 5-3 wavelet transform in 2-D. */
1716 /* </summary>                           */
1717 static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp,
1718         opj_tcd_tilecomp_t * tilec,
1719         opj_encode_and_deinterleave_v_fnptr_type p_encode_and_deinterleave_v,
1720         opj_encode_and_deinterleave_h_one_row_fnptr_type
1721         p_encode_and_deinterleave_h_one_row)
1722 {
1723     OPJ_INT32 i;
1724     OPJ_INT32 *bj = 00;
1725     OPJ_UINT32 w;
1726     OPJ_INT32 l;
1727
1728     OPJ_SIZE_T l_data_size;
1729
1730     opj_tcd_resolution_t * l_cur_res = 0;
1731     opj_tcd_resolution_t * l_last_res = 0;
1732     const int num_threads = opj_thread_pool_get_thread_count(tp);
1733     OPJ_INT32 * OPJ_RESTRICT tiledp = tilec->data;
1734
1735     w = (OPJ_UINT32)(tilec->x1 - tilec->x0);
1736     l = (OPJ_INT32)tilec->numresolutions - 1;
1737
1738     l_cur_res = tilec->resolutions + l;
1739     l_last_res = l_cur_res - 1;
1740
1741     l_data_size = opj_dwt_max_resolution(tilec->resolutions, tilec->numresolutions);
1742     /* overflow check */
1743     if (l_data_size > (SIZE_MAX / (NB_ELTS_V8 * sizeof(OPJ_INT32)))) {
1744         /* FIXME event manager error callback */
1745         return OPJ_FALSE;
1746     }
1747     l_data_size *= NB_ELTS_V8 * sizeof(OPJ_INT32);
1748     bj = (OPJ_INT32*)opj_aligned_32_malloc(l_data_size);
1749     /* l_data_size is equal to 0 when numresolutions == 1 but bj is not used */
1750     /* in that case, so do not error out */
1751     if (l_data_size != 0 && ! bj) {
1752         return OPJ_FALSE;
1753     }
1754     i = l;
1755
1756     while (i--) {
1757         OPJ_UINT32 j;
1758         OPJ_UINT32 rw;           /* width of the resolution level computed   */
1759         OPJ_UINT32 rh;           /* height of the resolution level computed  */
1760         OPJ_UINT32
1761         rw1;      /* width of the resolution level once lower than computed one                                       */
1762         OPJ_UINT32
1763         rh1;      /* height of the resolution level once lower than computed one                                      */
1764         OPJ_INT32 cas_col;  /* 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering */
1765         OPJ_INT32 cas_row;  /* 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering   */
1766         OPJ_INT32 dn, sn;
1767
1768         rw  = (OPJ_UINT32)(l_cur_res->x1 - l_cur_res->x0);
1769         rh  = (OPJ_UINT32)(l_cur_res->y1 - l_cur_res->y0);
1770         rw1 = (OPJ_UINT32)(l_last_res->x1 - l_last_res->x0);
1771         rh1 = (OPJ_UINT32)(l_last_res->y1 - l_last_res->y0);
1772
1773         cas_row = l_cur_res->x0 & 1;
1774         cas_col = l_cur_res->y0 & 1;
1775
1776         sn = (OPJ_INT32)rh1;
1777         dn = (OPJ_INT32)(rh - rh1);
1778
1779         /* Perform vertical pass */
1780         if (num_threads <= 1 || rw < 2 * NB_ELTS_V8) {
1781             for (j = 0; j + NB_ELTS_V8 - 1 < rw; j += NB_ELTS_V8) {
1782                 p_encode_and_deinterleave_v(tiledp + j,
1783                                             bj,
1784                                             rh,
1785                                             cas_col == 0,
1786                                             w,
1787                                             NB_ELTS_V8);
1788             }
1789             if (j < rw) {
1790                 p_encode_and_deinterleave_v(tiledp + j,
1791                                             bj,
1792                                             rh,
1793                                             cas_col == 0,
1794                                             w,
1795                                             rw - j);
1796             }
1797         }  else {
1798             OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
1799             OPJ_UINT32 step_j;
1800
1801             if (rw < num_jobs) {
1802                 num_jobs = rw;
1803             }
1804             step_j = ((rw / num_jobs) / NB_ELTS_V8) * NB_ELTS_V8;
1805
1806             for (j = 0; j < num_jobs; j++) {
1807                 opj_dwt_encode_v_job_t* job;
1808
1809                 job = (opj_dwt_encode_v_job_t*) opj_malloc(sizeof(opj_dwt_encode_v_job_t));
1810                 if (!job) {
1811                     opj_thread_pool_wait_completion(tp, 0);
1812                     opj_aligned_free(bj);
1813                     return OPJ_FALSE;
1814                 }
1815                 job->v.mem = (OPJ_INT32*)opj_aligned_32_malloc(l_data_size);
1816                 if (!job->v.mem) {
1817                     opj_thread_pool_wait_completion(tp, 0);
1818                     opj_free(job);
1819                     opj_aligned_free(bj);
1820                     return OPJ_FALSE;
1821                 }
1822                 job->v.dn = dn;
1823                 job->v.sn = sn;
1824                 job->v.cas = cas_col;
1825                 job->rh = rh;
1826                 job->w = w;
1827                 job->tiledp = tiledp;
1828                 job->min_j = j * step_j;
1829                 job->max_j = (j + 1 == num_jobs) ? rw : (j + 1) * step_j;
1830                 job->p_encode_and_deinterleave_v = p_encode_and_deinterleave_v;
1831                 opj_thread_pool_submit_job(tp, opj_dwt_encode_v_func, job);
1832             }
1833             opj_thread_pool_wait_completion(tp, 0);
1834         }
1835
1836         sn = (OPJ_INT32)rw1;
1837         dn = (OPJ_INT32)(rw - rw1);
1838
1839         /* Perform horizontal pass */
1840         if (num_threads <= 1 || rh <= 1) {
1841             for (j = 0; j < rh; j++) {
1842                 OPJ_INT32* OPJ_RESTRICT aj = tiledp + j * w;
1843                 (*p_encode_and_deinterleave_h_one_row)(aj, bj, rw,
1844                                                        cas_row == 0 ? OPJ_TRUE : OPJ_FALSE);
1845             }
1846         }  else {
1847             OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
1848             OPJ_UINT32 step_j;
1849
1850             if (rh < num_jobs) {
1851                 num_jobs = rh;
1852             }
1853             step_j = (rh / num_jobs);
1854
1855             for (j = 0; j < num_jobs; j++) {
1856                 opj_dwt_encode_h_job_t* job;
1857
1858                 job = (opj_dwt_encode_h_job_t*) opj_malloc(sizeof(opj_dwt_encode_h_job_t));
1859                 if (!job) {
1860                     opj_thread_pool_wait_completion(tp, 0);
1861                     opj_aligned_free(bj);
1862                     return OPJ_FALSE;
1863                 }
1864                 job->h.mem = (OPJ_INT32*)opj_aligned_32_malloc(l_data_size);
1865                 if (!job->h.mem) {
1866                     opj_thread_pool_wait_completion(tp, 0);
1867                     opj_free(job);
1868                     opj_aligned_free(bj);
1869                     return OPJ_FALSE;
1870                 }
1871                 job->h.dn = dn;
1872                 job->h.sn = sn;
1873                 job->h.cas = cas_row;
1874                 job->rw = rw;
1875                 job->w = w;
1876                 job->tiledp = tiledp;
1877                 job->min_j = j * step_j;
1878                 job->max_j = (j + 1U) * step_j; /* this can overflow */
1879                 if (j == (num_jobs - 1U)) {  /* this will take care of the overflow */
1880                     job->max_j = rh;
1881                 }
1882                 job->p_function = p_encode_and_deinterleave_h_one_row;
1883                 opj_thread_pool_submit_job(tp, opj_dwt_encode_h_func, job);
1884             }
1885             opj_thread_pool_wait_completion(tp, 0);
1886         }
1887
1888         l_cur_res = l_last_res;
1889
1890         --l_last_res;
1891     }
1892
1893     opj_aligned_free(bj);
1894     return OPJ_TRUE;
1895 }
1896
1897 /* Forward 5-3 wavelet transform in 2-D. */
1898 /* </summary>                           */
1899 OPJ_BOOL opj_dwt_encode(opj_tcd_t *p_tcd,
1900                         opj_tcd_tilecomp_t * tilec)
1901 {
1902     return opj_dwt_encode_procedure(p_tcd->thread_pool, tilec,
1903                                     opj_dwt_encode_and_deinterleave_v,
1904                                     opj_dwt_encode_and_deinterleave_h_one_row);
1905 }
1906
1907 /* <summary>                            */
1908 /* Inverse 5-3 wavelet transform in 2-D. */
1909 /* </summary>                           */
1910 OPJ_BOOL opj_dwt_decode(opj_tcd_t *p_tcd, opj_tcd_tilecomp_t* tilec,
1911                         OPJ_UINT32 numres)
1912 {
1913     if (p_tcd->whole_tile_decoding) {
1914         return opj_dwt_decode_tile(p_tcd->thread_pool, tilec, numres);
1915     } else {
1916         return opj_dwt_decode_partial_tile(tilec, numres);
1917     }
1918 }
1919
1920 /* <summary>                */
1921 /* Get norm of 5-3 wavelet. */
1922 /* </summary>               */
1923 OPJ_FLOAT64 opj_dwt_getnorm(OPJ_UINT32 level, OPJ_UINT32 orient)
1924 {
1925     /* FIXME ! This is just a band-aid to avoid a buffer overflow */
1926     /* but the array should really be extended up to 33 resolution levels */
1927     /* See https://github.com/uclouvain/openjpeg/issues/493 */
1928     if (orient == 0 && level >= 10) {
1929         level = 9;
1930     } else if (orient > 0 && level >= 9) {
1931         level = 8;
1932     }
1933     return opj_dwt_norms[orient][level];
1934 }
1935
1936 /* <summary>                             */
1937 /* Forward 9-7 wavelet transform in 2-D. */
1938 /* </summary>                            */
1939 OPJ_BOOL opj_dwt_encode_real(opj_tcd_t *p_tcd,
1940                              opj_tcd_tilecomp_t * tilec)
1941 {
1942     return opj_dwt_encode_procedure(p_tcd->thread_pool, tilec,
1943                                     opj_dwt_encode_and_deinterleave_v_real,
1944                                     opj_dwt_encode_and_deinterleave_h_one_row_real);
1945 }
1946
1947 /* <summary>                */
1948 /* Get norm of 9-7 wavelet. */
1949 /* </summary>               */
1950 OPJ_FLOAT64 opj_dwt_getnorm_real(OPJ_UINT32 level, OPJ_UINT32 orient)
1951 {
1952     /* FIXME ! This is just a band-aid to avoid a buffer overflow */
1953     /* but the array should really be extended up to 33 resolution levels */
1954     /* See https://github.com/uclouvain/openjpeg/issues/493 */
1955     if (orient == 0 && level >= 10) {
1956         level = 9;
1957     } else if (orient > 0 && level >= 9) {
1958         level = 8;
1959     }
1960     return opj_dwt_norms_real[orient][level];
1961 }
1962
1963 void opj_dwt_calc_explicit_stepsizes(opj_tccp_t * tccp, OPJ_UINT32 prec)
1964 {
1965     OPJ_UINT32 numbands, bandno;
1966     numbands = 3 * tccp->numresolutions - 2;
1967     for (bandno = 0; bandno < numbands; bandno++) {
1968         OPJ_FLOAT64 stepsize;
1969         OPJ_UINT32 resno, level, orient, gain;
1970
1971         resno = (bandno == 0) ? 0 : ((bandno - 1) / 3 + 1);
1972         orient = (bandno == 0) ? 0 : ((bandno - 1) % 3 + 1);
1973         level = tccp->numresolutions - 1 - resno;
1974         gain = (tccp->qmfbid == 0) ? 0 : ((orient == 0) ? 0 : (((orient == 1) ||
1975                                           (orient == 2)) ? 1 : 2));
1976         if (tccp->qntsty == J2K_CCP_QNTSTY_NOQNT) {
1977             stepsize = 1.0;
1978         } else {
1979             OPJ_FLOAT64 norm = opj_dwt_getnorm_real(level, orient);
1980             stepsize = (1 << (gain)) / norm;
1981         }
1982         opj_dwt_encode_stepsize((OPJ_INT32) floor(stepsize * 8192.0),
1983                                 (OPJ_INT32)(prec + gain), &tccp->stepsizes[bandno]);
1984     }
1985 }
1986
1987 /* <summary>                             */
1988 /* Determine maximum computed resolution level for inverse wavelet transform */
1989 /* </summary>                            */
1990 static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r,
1991         OPJ_UINT32 i)
1992 {
1993     OPJ_UINT32 mr   = 0;
1994     OPJ_UINT32 w;
1995     while (--i) {
1996         ++r;
1997         if (mr < (w = (OPJ_UINT32)(r->x1 - r->x0))) {
1998             mr = w ;
1999         }
2000         if (mr < (w = (OPJ_UINT32)(r->y1 - r->y0))) {
2001             mr = w ;
2002         }
2003     }
2004     return mr ;
2005 }
2006
2007 typedef struct {
2008     opj_dwt_t h;
2009     OPJ_UINT32 rw;
2010     OPJ_UINT32 w;
2011     OPJ_INT32 * OPJ_RESTRICT tiledp;
2012     OPJ_UINT32 min_j;
2013     OPJ_UINT32 max_j;
2014 } opj_dwt_decode_h_job_t;
2015
2016 static void opj_dwt_decode_h_func(void* user_data, opj_tls_t* tls)
2017 {
2018     OPJ_UINT32 j;
2019     opj_dwt_decode_h_job_t* job;
2020     (void)tls;
2021
2022     job = (opj_dwt_decode_h_job_t*)user_data;
2023     for (j = job->min_j; j < job->max_j; j++) {
2024         opj_idwt53_h(&job->h, &job->tiledp[j * job->w]);
2025     }
2026
2027     opj_aligned_free(job->h.mem);
2028     opj_free(job);
2029 }
2030
2031 typedef struct {
2032     opj_dwt_t v;
2033     OPJ_UINT32 rh;
2034     OPJ_UINT32 w;
2035     OPJ_INT32 * OPJ_RESTRICT tiledp;
2036     OPJ_UINT32 min_j;
2037     OPJ_UINT32 max_j;
2038 } opj_dwt_decode_v_job_t;
2039
2040 static void opj_dwt_decode_v_func(void* user_data, opj_tls_t* tls)
2041 {
2042     OPJ_UINT32 j;
2043     opj_dwt_decode_v_job_t* job;
2044     (void)tls;
2045
2046     job = (opj_dwt_decode_v_job_t*)user_data;
2047     for (j = job->min_j; j + PARALLEL_COLS_53 <= job->max_j;
2048             j += PARALLEL_COLS_53) {
2049         opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_SIZE_T)job->w,
2050                      PARALLEL_COLS_53);
2051     }
2052     if (j < job->max_j)
2053         opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_SIZE_T)job->w,
2054                      (OPJ_INT32)(job->max_j - j));
2055
2056     opj_aligned_free(job->v.mem);
2057     opj_free(job);
2058 }
2059
2060
2061 /* <summary>                            */
2062 /* Inverse wavelet transform in 2-D.    */
2063 /* </summary>                           */
2064 static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp,
2065                                     opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres)
2066 {
2067     opj_dwt_t h;
2068     opj_dwt_t v;
2069
2070     opj_tcd_resolution_t* tr = tilec->resolutions;
2071
2072     OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 -
2073                                  tr->x0);  /* width of the resolution level computed */
2074     OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 -
2075                                  tr->y0);  /* height of the resolution level computed */
2076
2077     OPJ_UINT32 w = (OPJ_UINT32)(tilec->resolutions[tilec->minimum_num_resolutions -
2078                                                                1].x1 -
2079                                 tilec->resolutions[tilec->minimum_num_resolutions - 1].x0);
2080     OPJ_SIZE_T h_mem_size;
2081     int num_threads;
2082
2083     if (numres == 1U) {
2084         return OPJ_TRUE;
2085     }
2086     num_threads = opj_thread_pool_get_thread_count(tp);
2087     h_mem_size = opj_dwt_max_resolution(tr, numres);
2088     /* overflow check */
2089     if (h_mem_size > (SIZE_MAX / PARALLEL_COLS_53 / sizeof(OPJ_INT32))) {
2090         /* FIXME event manager error callback */
2091         return OPJ_FALSE;
2092     }
2093     /* We need PARALLEL_COLS_53 times the height of the array, */
2094     /* since for the vertical pass */
2095     /* we process PARALLEL_COLS_53 columns at a time */
2096     h_mem_size *= PARALLEL_COLS_53 * sizeof(OPJ_INT32);
2097     h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
2098     if (! h.mem) {
2099         /* FIXME event manager error callback */
2100         return OPJ_FALSE;
2101     }
2102
2103     v.mem = h.mem;
2104
2105     while (--numres) {
2106         OPJ_INT32 * OPJ_RESTRICT tiledp = tilec->data;
2107         OPJ_UINT32 j;
2108
2109         ++tr;
2110         h.sn = (OPJ_INT32)rw;
2111         v.sn = (OPJ_INT32)rh;
2112
2113         rw = (OPJ_UINT32)(tr->x1 - tr->x0);
2114         rh = (OPJ_UINT32)(tr->y1 - tr->y0);
2115
2116         h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
2117         h.cas = tr->x0 % 2;
2118
2119         if (num_threads <= 1 || rh <= 1) {
2120             for (j = 0; j < rh; ++j) {
2121                 opj_idwt53_h(&h, &tiledp[(OPJ_SIZE_T)j * w]);
2122             }
2123         } else {
2124             OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
2125             OPJ_UINT32 step_j;
2126
2127             if (rh < num_jobs) {
2128                 num_jobs = rh;
2129             }
2130             step_j = (rh / num_jobs);
2131
2132             for (j = 0; j < num_jobs; j++) {
2133                 opj_dwt_decode_h_job_t* job;
2134
2135                 job = (opj_dwt_decode_h_job_t*) opj_malloc(sizeof(opj_dwt_decode_h_job_t));
2136                 if (!job) {
2137                     /* It would be nice to fallback to single thread case, but */
2138                     /* unfortunately some jobs may be launched and have modified */
2139                     /* tiledp, so it is not practical to recover from that error */
2140                     /* FIXME event manager error callback */
2141                     opj_thread_pool_wait_completion(tp, 0);
2142                     opj_aligned_free(h.mem);
2143                     return OPJ_FALSE;
2144                 }
2145                 job->h = h;
2146                 job->rw = rw;
2147                 job->w = w;
2148                 job->tiledp = tiledp;
2149                 job->min_j = j * step_j;
2150                 job->max_j = (j + 1U) * step_j; /* this can overflow */
2151                 if (j == (num_jobs - 1U)) {  /* this will take care of the overflow */
2152                     job->max_j = rh;
2153                 }
2154                 job->h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
2155                 if (!job->h.mem) {
2156                     /* FIXME event manager error callback */
2157                     opj_thread_pool_wait_completion(tp, 0);
2158                     opj_free(job);
2159                     opj_aligned_free(h.mem);
2160                     return OPJ_FALSE;
2161                 }
2162                 opj_thread_pool_submit_job(tp, opj_dwt_decode_h_func, job);
2163             }
2164             opj_thread_pool_wait_completion(tp, 0);
2165         }
2166
2167         v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
2168         v.cas = tr->y0 % 2;
2169
2170         if (num_threads <= 1 || rw <= 1) {
2171             for (j = 0; j + PARALLEL_COLS_53 <= rw;
2172                     j += PARALLEL_COLS_53) {
2173                 opj_idwt53_v(&v, &tiledp[j], (OPJ_SIZE_T)w, PARALLEL_COLS_53);
2174             }
2175             if (j < rw) {
2176                 opj_idwt53_v(&v, &tiledp[j], (OPJ_SIZE_T)w, (OPJ_INT32)(rw - j));
2177             }
2178         } else {
2179             OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
2180             OPJ_UINT32 step_j;
2181
2182             if (rw < num_jobs) {
2183                 num_jobs = rw;
2184             }
2185             step_j = (rw / num_jobs);
2186
2187             for (j = 0; j < num_jobs; j++) {
2188                 opj_dwt_decode_v_job_t* job;
2189
2190                 job = (opj_dwt_decode_v_job_t*) opj_malloc(sizeof(opj_dwt_decode_v_job_t));
2191                 if (!job) {
2192                     /* It would be nice to fallback to single thread case, but */
2193                     /* unfortunately some jobs may be launched and have modified */
2194                     /* tiledp, so it is not practical to recover from that error */
2195                     /* FIXME event manager error callback */
2196                     opj_thread_pool_wait_completion(tp, 0);
2197                     opj_aligned_free(v.mem);
2198                     return OPJ_FALSE;
2199                 }
2200                 job->v = v;
2201                 job->rh = rh;
2202                 job->w = w;
2203                 job->tiledp = tiledp;
2204                 job->min_j = j * step_j;
2205                 job->max_j = (j + 1U) * step_j; /* this can overflow */
2206                 if (j == (num_jobs - 1U)) {  /* this will take care of the overflow */
2207                     job->max_j = rw;
2208                 }
2209                 job->v.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
2210                 if (!job->v.mem) {
2211                     /* FIXME event manager error callback */
2212                     opj_thread_pool_wait_completion(tp, 0);
2213                     opj_free(job);
2214                     opj_aligned_free(v.mem);
2215                     return OPJ_FALSE;
2216                 }
2217                 opj_thread_pool_submit_job(tp, opj_dwt_decode_v_func, job);
2218             }
2219             opj_thread_pool_wait_completion(tp, 0);
2220         }
2221     }
2222     opj_aligned_free(h.mem);
2223     return OPJ_TRUE;
2224 }
2225
2226 static void opj_dwt_interleave_partial_h(OPJ_INT32 *dest,
2227         OPJ_INT32 cas,
2228         opj_sparse_array_int32_t* sa,
2229         OPJ_UINT32 sa_line,
2230         OPJ_UINT32 sn,
2231         OPJ_UINT32 win_l_x0,
2232         OPJ_UINT32 win_l_x1,
2233         OPJ_UINT32 win_h_x0,
2234         OPJ_UINT32 win_h_x1)
2235 {
2236     OPJ_BOOL ret;
2237     ret = opj_sparse_array_int32_read(sa,
2238                                       win_l_x0, sa_line,
2239                                       win_l_x1, sa_line + 1,
2240                                       dest + cas + 2 * win_l_x0,
2241                                       2, 0, OPJ_TRUE);
2242     assert(ret);
2243     ret = opj_sparse_array_int32_read(sa,
2244                                       sn + win_h_x0, sa_line,
2245                                       sn + win_h_x1, sa_line + 1,
2246                                       dest + 1 - cas + 2 * win_h_x0,
2247                                       2, 0, OPJ_TRUE);
2248     assert(ret);
2249     OPJ_UNUSED(ret);
2250 }
2251
2252
2253 static void opj_dwt_interleave_partial_v(OPJ_INT32 *dest,
2254         OPJ_INT32 cas,
2255         opj_sparse_array_int32_t* sa,
2256         OPJ_UINT32 sa_col,
2257         OPJ_UINT32 nb_cols,
2258         OPJ_UINT32 sn,
2259         OPJ_UINT32 win_l_y0,
2260         OPJ_UINT32 win_l_y1,
2261         OPJ_UINT32 win_h_y0,
2262         OPJ_UINT32 win_h_y1)
2263 {
2264     OPJ_BOOL ret;
2265     ret  = opj_sparse_array_int32_read(sa,
2266                                        sa_col, win_l_y0,
2267                                        sa_col + nb_cols, win_l_y1,
2268                                        dest + cas * 4 + 2 * 4 * win_l_y0,
2269                                        1, 2 * 4, OPJ_TRUE);
2270     assert(ret);
2271     ret = opj_sparse_array_int32_read(sa,
2272                                       sa_col, sn + win_h_y0,
2273                                       sa_col + nb_cols, sn + win_h_y1,
2274                                       dest + (1 - cas) * 4 + 2 * 4 * win_h_y0,
2275                                       1, 2 * 4, OPJ_TRUE);
2276     assert(ret);
2277     OPJ_UNUSED(ret);
2278 }
2279
2280 static void opj_dwt_decode_partial_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
2281                                      OPJ_INT32 cas,
2282                                      OPJ_INT32 win_l_x0,
2283                                      OPJ_INT32 win_l_x1,
2284                                      OPJ_INT32 win_h_x0,
2285                                      OPJ_INT32 win_h_x1)
2286 {
2287     OPJ_INT32 i;
2288
2289     if (!cas) {
2290         if ((dn > 0) || (sn > 1)) { /* NEW :  CASE ONE ELEMENT */
2291
2292             /* Naive version is :
2293             for (i = win_l_x0; i < i_max; i++) {
2294                 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
2295             }
2296             for (i = win_h_x0; i < win_h_x1; i++) {
2297                 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
2298             }
2299             but the compiler doesn't manage to unroll it to avoid bound
2300             checking in OPJ_S_ and OPJ_D_ macros
2301             */
2302
2303             i = win_l_x0;
2304             if (i < win_l_x1) {
2305                 OPJ_INT32 i_max;
2306
2307                 /* Left-most case */
2308                 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
2309                 i ++;
2310
2311                 i_max = win_l_x1;
2312                 if (i_max > dn) {
2313                     i_max = dn;
2314                 }
2315                 for (; i < i_max; i++) {
2316                     /* No bound checking */
2317                     OPJ_S(i) -= (OPJ_D(i - 1) + OPJ_D(i) + 2) >> 2;
2318                 }
2319                 for (; i < win_l_x1; i++) {
2320                     /* Right-most case */
2321                     OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
2322                 }
2323             }
2324
2325             i = win_h_x0;
2326             if (i < win_h_x1) {
2327                 OPJ_INT32 i_max = win_h_x1;
2328                 if (i_max >= sn) {
2329                     i_max = sn - 1;
2330                 }
2331                 for (; i < i_max; i++) {
2332                     /* No bound checking */
2333                     OPJ_D(i) += (OPJ_S(i) + OPJ_S(i + 1)) >> 1;
2334                 }
2335                 for (; i < win_h_x1; i++) {
2336                     /* Right-most case */
2337                     OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
2338                 }
2339             }
2340         }
2341     } else {
2342         if (!sn  && dn == 1) {        /* NEW :  CASE ONE ELEMENT */
2343             OPJ_S(0) /= 2;
2344         } else {
2345             for (i = win_l_x0; i < win_l_x1; i++) {
2346                 OPJ_D(i) -= (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2;
2347             }
2348             for (i = win_h_x0; i < win_h_x1; i++) {
2349                 OPJ_S(i) += (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1;
2350             }
2351         }
2352     }
2353 }
2354
2355 #define OPJ_S_off(i,off) a[(OPJ_UINT32)(i)*2*4+off]
2356 #define OPJ_D_off(i,off) a[(1+(OPJ_UINT32)(i)*2)*4+off]
2357 #define OPJ_S__off(i,off) ((i)<0?OPJ_S_off(0,off):((i)>=sn?OPJ_S_off(sn-1,off):OPJ_S_off(i,off)))
2358 #define OPJ_D__off(i,off) ((i)<0?OPJ_D_off(0,off):((i)>=dn?OPJ_D_off(dn-1,off):OPJ_D_off(i,off)))
2359 #define OPJ_SS__off(i,off) ((i)<0?OPJ_S_off(0,off):((i)>=dn?OPJ_S_off(dn-1,off):OPJ_S_off(i,off)))
2360 #define OPJ_DD__off(i,off) ((i)<0?OPJ_D_off(0,off):((i)>=sn?OPJ_D_off(sn-1,off):OPJ_D_off(i,off)))
2361
2362 static void opj_dwt_decode_partial_1_parallel(OPJ_INT32 *a,
2363         OPJ_UINT32 nb_cols,
2364         OPJ_INT32 dn, OPJ_INT32 sn,
2365         OPJ_INT32 cas,
2366         OPJ_INT32 win_l_x0,
2367         OPJ_INT32 win_l_x1,
2368         OPJ_INT32 win_h_x0,
2369         OPJ_INT32 win_h_x1)
2370 {
2371     OPJ_INT32 i;
2372     OPJ_UINT32 off;
2373
2374     (void)nb_cols;
2375
2376     if (!cas) {
2377         if ((dn > 0) || (sn > 1)) { /* NEW :  CASE ONE ELEMENT */
2378
2379             /* Naive version is :
2380             for (i = win_l_x0; i < i_max; i++) {
2381                 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
2382             }
2383             for (i = win_h_x0; i < win_h_x1; i++) {
2384                 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
2385             }
2386             but the compiler doesn't manage to unroll it to avoid bound
2387             checking in OPJ_S_ and OPJ_D_ macros
2388             */
2389
2390             i = win_l_x0;
2391             if (i < win_l_x1) {
2392                 OPJ_INT32 i_max;
2393
2394                 /* Left-most case */
2395                 for (off = 0; off < 4; off++) {
2396                     OPJ_S_off(i, off) -= (OPJ_D__off(i - 1, off) + OPJ_D__off(i, off) + 2) >> 2;
2397                 }
2398                 i ++;
2399
2400                 i_max = win_l_x1;
2401                 if (i_max > dn) {
2402                     i_max = dn;
2403                 }
2404
2405 #ifdef __SSE2__
2406                 if (i + 1 < i_max) {
2407                     const __m128i two = _mm_set1_epi32(2);
2408                     __m128i Dm1 = _mm_load_si128((__m128i * const)(a + 4 + (i - 1) * 8));
2409                     for (; i + 1 < i_max; i += 2) {
2410                         /* No bound checking */
2411                         __m128i S = _mm_load_si128((__m128i * const)(a + i * 8));
2412                         __m128i D = _mm_load_si128((__m128i * const)(a + 4 + i * 8));
2413                         __m128i S1 = _mm_load_si128((__m128i * const)(a + (i + 1) * 8));
2414                         __m128i D1 = _mm_load_si128((__m128i * const)(a + 4 + (i + 1) * 8));
2415                         S = _mm_sub_epi32(S,
2416                                           _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(Dm1, D), two), 2));
2417                         S1 = _mm_sub_epi32(S1,
2418                                            _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(D, D1), two), 2));
2419                         _mm_store_si128((__m128i*)(a + i * 8), S);
2420                         _mm_store_si128((__m128i*)(a + (i + 1) * 8), S1);
2421                         Dm1 = D1;
2422                     }
2423                 }
2424 #endif
2425
2426                 for (; i < i_max; i++) {
2427                     /* No bound checking */
2428                     for (off = 0; off < 4; off++) {
2429                         OPJ_S_off(i, off) -= (OPJ_D_off(i - 1, off) + OPJ_D_off(i, off) + 2) >> 2;
2430                     }
2431                 }
2432                 for (; i < win_l_x1; i++) {
2433                     /* Right-most case */
2434                     for (off = 0; off < 4; off++) {
2435                         OPJ_S_off(i, off) -= (OPJ_D__off(i - 1, off) + OPJ_D__off(i, off) + 2) >> 2;
2436                     }
2437                 }
2438             }
2439
2440             i = win_h_x0;
2441             if (i < win_h_x1) {
2442                 OPJ_INT32 i_max = win_h_x1;
2443                 if (i_max >= sn) {
2444                     i_max = sn - 1;
2445                 }
2446
2447 #ifdef __SSE2__
2448                 if (i + 1 < i_max) {
2449                     __m128i S =  _mm_load_si128((__m128i * const)(a + i * 8));
2450                     for (; i + 1 < i_max; i += 2) {
2451                         /* No bound checking */
2452                         __m128i D = _mm_load_si128((__m128i * const)(a + 4 + i * 8));
2453                         __m128i S1 = _mm_load_si128((__m128i * const)(a + (i + 1) * 8));
2454                         __m128i D1 = _mm_load_si128((__m128i * const)(a + 4 + (i + 1) * 8));
2455                         __m128i S2 = _mm_load_si128((__m128i * const)(a + (i + 2) * 8));
2456                         D = _mm_add_epi32(D, _mm_srai_epi32(_mm_add_epi32(S, S1), 1));
2457                         D1 = _mm_add_epi32(D1, _mm_srai_epi32(_mm_add_epi32(S1, S2), 1));
2458                         _mm_store_si128((__m128i*)(a + 4 + i * 8), D);
2459                         _mm_store_si128((__m128i*)(a + 4 + (i + 1) * 8), D1);
2460                         S = S2;
2461                     }
2462                 }
2463 #endif
2464
2465                 for (; i < i_max; i++) {
2466                     /* No bound checking */
2467                     for (off = 0; off < 4; off++) {
2468                         OPJ_D_off(i, off) += (OPJ_S_off(i, off) + OPJ_S_off(i + 1, off)) >> 1;
2469                     }
2470                 }
2471                 for (; i < win_h_x1; i++) {
2472                     /* Right-most case */
2473                     for (off = 0; off < 4; off++) {
2474                         OPJ_D_off(i, off) += (OPJ_S__off(i, off) + OPJ_S__off(i + 1, off)) >> 1;
2475                     }
2476                 }
2477             }
2478         }
2479     } else {
2480         if (!sn  && dn == 1) {        /* NEW :  CASE ONE ELEMENT */
2481             for (off = 0; off < 4; off++) {
2482                 OPJ_S_off(0, off) /= 2;
2483             }
2484         } else {
2485             for (i = win_l_x0; i < win_l_x1; i++) {
2486                 for (off = 0; off < 4; off++) {
2487                     OPJ_D_off(i, off) -= (OPJ_SS__off(i, off) + OPJ_SS__off(i + 1, off) + 2) >> 2;
2488                 }
2489             }
2490             for (i = win_h_x0; i < win_h_x1; i++) {
2491                 for (off = 0; off < 4; off++) {
2492                     OPJ_S_off(i, off) += (OPJ_DD__off(i, off) + OPJ_DD__off(i - 1, off)) >> 1;
2493                 }
2494             }
2495         }
2496     }
2497 }
2498
2499 static void opj_dwt_get_band_coordinates(opj_tcd_tilecomp_t* tilec,
2500         OPJ_UINT32 resno,
2501         OPJ_UINT32 bandno,
2502         OPJ_UINT32 tcx0,
2503         OPJ_UINT32 tcy0,
2504         OPJ_UINT32 tcx1,
2505         OPJ_UINT32 tcy1,
2506         OPJ_UINT32* tbx0,
2507         OPJ_UINT32* tby0,
2508         OPJ_UINT32* tbx1,
2509         OPJ_UINT32* tby1)
2510 {
2511     /* Compute number of decomposition for this band. See table F-1 */
2512     OPJ_UINT32 nb = (resno == 0) ?
2513                     tilec->numresolutions - 1 :
2514                     tilec->numresolutions - resno;
2515     /* Map above tile-based coordinates to sub-band-based coordinates per */
2516     /* equation B-15 of the standard */
2517     OPJ_UINT32 x0b = bandno & 1;
2518     OPJ_UINT32 y0b = bandno >> 1;
2519     if (tbx0) {
2520         *tbx0 = (nb == 0) ? tcx0 :
2521                 (tcx0 <= (1U << (nb - 1)) * x0b) ? 0 :
2522                 opj_uint_ceildivpow2(tcx0 - (1U << (nb - 1)) * x0b, nb);
2523     }
2524     if (tby0) {
2525         *tby0 = (nb == 0) ? tcy0 :
2526                 (tcy0 <= (1U << (nb - 1)) * y0b) ? 0 :
2527                 opj_uint_ceildivpow2(tcy0 - (1U << (nb - 1)) * y0b, nb);
2528     }
2529     if (tbx1) {
2530         *tbx1 = (nb == 0) ? tcx1 :
2531                 (tcx1 <= (1U << (nb - 1)) * x0b) ? 0 :
2532                 opj_uint_ceildivpow2(tcx1 - (1U << (nb - 1)) * x0b, nb);
2533     }
2534     if (tby1) {
2535         *tby1 = (nb == 0) ? tcy1 :
2536                 (tcy1 <= (1U << (nb - 1)) * y0b) ? 0 :
2537                 opj_uint_ceildivpow2(tcy1 - (1U << (nb - 1)) * y0b, nb);
2538     }
2539 }
2540
2541 static void opj_dwt_segment_grow(OPJ_UINT32 filter_width,
2542                                  OPJ_UINT32 max_size,
2543                                  OPJ_UINT32* start,
2544                                  OPJ_UINT32* end)
2545 {
2546     *start = opj_uint_subs(*start, filter_width);
2547     *end = opj_uint_adds(*end, filter_width);
2548     *end = opj_uint_min(*end, max_size);
2549 }
2550
2551
2552 static opj_sparse_array_int32_t* opj_dwt_init_sparse_array(
2553     opj_tcd_tilecomp_t* tilec,
2554     OPJ_UINT32 numres)
2555 {
2556     opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]);
2557     OPJ_UINT32 w = (OPJ_UINT32)(tr_max->x1 - tr_max->x0);
2558     OPJ_UINT32 h = (OPJ_UINT32)(tr_max->y1 - tr_max->y0);
2559     OPJ_UINT32 resno, bandno, precno, cblkno;
2560     opj_sparse_array_int32_t* sa = opj_sparse_array_int32_create(
2561                                        w, h, opj_uint_min(w, 64), opj_uint_min(h, 64));
2562     if (sa == NULL) {
2563         return NULL;
2564     }
2565
2566     for (resno = 0; resno < numres; ++resno) {
2567         opj_tcd_resolution_t* res = &tilec->resolutions[resno];
2568
2569         for (bandno = 0; bandno < res->numbands; ++bandno) {
2570             opj_tcd_band_t* band = &res->bands[bandno];
2571
2572             for (precno = 0; precno < res->pw * res->ph; ++precno) {
2573                 opj_tcd_precinct_t* precinct = &band->precincts[precno];
2574                 for (cblkno = 0; cblkno < precinct->cw * precinct->ch; ++cblkno) {
2575                     opj_tcd_cblk_dec_t* cblk = &precinct->cblks.dec[cblkno];
2576                     if (cblk->decoded_data != NULL) {
2577                         OPJ_UINT32 x = (OPJ_UINT32)(cblk->x0 - band->x0);
2578                         OPJ_UINT32 y = (OPJ_UINT32)(cblk->y0 - band->y0);
2579                         OPJ_UINT32 cblk_w = (OPJ_UINT32)(cblk->x1 - cblk->x0);
2580                         OPJ_UINT32 cblk_h = (OPJ_UINT32)(cblk->y1 - cblk->y0);
2581
2582                         if (band->bandno & 1) {
2583                             opj_tcd_resolution_t* pres = &tilec->resolutions[resno - 1];
2584                             x += (OPJ_UINT32)(pres->x1 - pres->x0);
2585                         }
2586                         if (band->bandno & 2) {
2587                             opj_tcd_resolution_t* pres = &tilec->resolutions[resno - 1];
2588                             y += (OPJ_UINT32)(pres->y1 - pres->y0);
2589                         }
2590
2591                         if (!opj_sparse_array_int32_write(sa, x, y,
2592                                                           x + cblk_w, y + cblk_h,
2593                                                           cblk->decoded_data,
2594                                                           1, cblk_w, OPJ_TRUE)) {
2595                             opj_sparse_array_int32_free(sa);
2596                             return NULL;
2597                         }
2598                     }
2599                 }
2600             }
2601         }
2602     }
2603
2604     return sa;
2605 }
2606
2607
2608 static OPJ_BOOL opj_dwt_decode_partial_tile(
2609     opj_tcd_tilecomp_t* tilec,
2610     OPJ_UINT32 numres)
2611 {
2612     opj_sparse_array_int32_t* sa;
2613     opj_dwt_t h;
2614     opj_dwt_t v;
2615     OPJ_UINT32 resno;
2616     /* This value matches the maximum left/right extension given in tables */
2617     /* F.2 and F.3 of the standard. */
2618     const OPJ_UINT32 filter_width = 2U;
2619
2620     opj_tcd_resolution_t* tr = tilec->resolutions;
2621     opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]);
2622
2623     OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 -
2624                                  tr->x0);  /* width of the resolution level computed */
2625     OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 -
2626                                  tr->y0);  /* height of the resolution level computed */
2627
2628     OPJ_SIZE_T h_mem_size;
2629
2630     /* Compute the intersection of the area of interest, expressed in tile coordinates */
2631     /* with the tile coordinates */
2632     OPJ_UINT32 win_tcx0 = tilec->win_x0;
2633     OPJ_UINT32 win_tcy0 = tilec->win_y0;
2634     OPJ_UINT32 win_tcx1 = tilec->win_x1;
2635     OPJ_UINT32 win_tcy1 = tilec->win_y1;
2636
2637     if (tr_max->x0 == tr_max->x1 || tr_max->y0 == tr_max->y1) {
2638         return OPJ_TRUE;
2639     }
2640
2641     sa = opj_dwt_init_sparse_array(tilec, numres);
2642     if (sa == NULL) {
2643         return OPJ_FALSE;
2644     }
2645
2646     if (numres == 1U) {
2647         OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
2648                        tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
2649                        tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
2650                        tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
2651                        tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
2652                        tilec->data_win,
2653                        1, tr_max->win_x1 - tr_max->win_x0,
2654                        OPJ_TRUE);
2655         assert(ret);
2656         OPJ_UNUSED(ret);
2657         opj_sparse_array_int32_free(sa);
2658         return OPJ_TRUE;
2659     }
2660     h_mem_size = opj_dwt_max_resolution(tr, numres);
2661     /* overflow check */
2662     /* in vertical pass, we process 4 columns at a time */
2663     if (h_mem_size > (SIZE_MAX / (4 * sizeof(OPJ_INT32)))) {
2664         /* FIXME event manager error callback */
2665         opj_sparse_array_int32_free(sa);
2666         return OPJ_FALSE;
2667     }
2668
2669     h_mem_size *= 4 * sizeof(OPJ_INT32);
2670     h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
2671     if (! h.mem) {
2672         /* FIXME event manager error callback */
2673         opj_sparse_array_int32_free(sa);
2674         return OPJ_FALSE;
2675     }
2676
2677     v.mem = h.mem;
2678
2679     for (resno = 1; resno < numres; resno ++) {
2680         OPJ_UINT32 i, j;
2681         /* Window of interest subband-based coordinates */
2682         OPJ_UINT32 win_ll_x0, win_ll_y0, win_ll_x1, win_ll_y1;
2683         OPJ_UINT32 win_hl_x0, win_hl_x1;
2684         OPJ_UINT32 win_lh_y0, win_lh_y1;
2685         /* Window of interest tile-resolution-based coordinates */
2686         OPJ_UINT32 win_tr_x0, win_tr_x1, win_tr_y0, win_tr_y1;
2687         /* Tile-resolution subband-based coordinates */
2688         OPJ_UINT32 tr_ll_x0, tr_ll_y0, tr_hl_x0, tr_lh_y0;
2689
2690         ++tr;
2691
2692         h.sn = (OPJ_INT32)rw;
2693         v.sn = (OPJ_INT32)rh;
2694
2695         rw = (OPJ_UINT32)(tr->x1 - tr->x0);
2696         rh = (OPJ_UINT32)(tr->y1 - tr->y0);
2697
2698         h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
2699         h.cas = tr->x0 % 2;
2700
2701         v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
2702         v.cas = tr->y0 % 2;
2703
2704         /* Get the subband coordinates for the window of interest */
2705         /* LL band */
2706         opj_dwt_get_band_coordinates(tilec, resno, 0,
2707                                      win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2708                                      &win_ll_x0, &win_ll_y0,
2709                                      &win_ll_x1, &win_ll_y1);
2710
2711         /* HL band */
2712         opj_dwt_get_band_coordinates(tilec, resno, 1,
2713                                      win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2714                                      &win_hl_x0, NULL, &win_hl_x1, NULL);
2715
2716         /* LH band */
2717         opj_dwt_get_band_coordinates(tilec, resno, 2,
2718                                      win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2719                                      NULL, &win_lh_y0, NULL, &win_lh_y1);
2720
2721         /* Beware: band index for non-LL0 resolution are 0=HL, 1=LH and 2=HH */
2722         tr_ll_x0 = (OPJ_UINT32)tr->bands[1].x0;
2723         tr_ll_y0 = (OPJ_UINT32)tr->bands[0].y0;
2724         tr_hl_x0 = (OPJ_UINT32)tr->bands[0].x0;
2725         tr_lh_y0 = (OPJ_UINT32)tr->bands[1].y0;
2726
2727         /* Subtract the origin of the bands for this tile, to the subwindow */
2728         /* of interest band coordinates, so as to get them relative to the */
2729         /* tile */
2730         win_ll_x0 = opj_uint_subs(win_ll_x0, tr_ll_x0);
2731         win_ll_y0 = opj_uint_subs(win_ll_y0, tr_ll_y0);
2732         win_ll_x1 = opj_uint_subs(win_ll_x1, tr_ll_x0);
2733         win_ll_y1 = opj_uint_subs(win_ll_y1, tr_ll_y0);
2734         win_hl_x0 = opj_uint_subs(win_hl_x0, tr_hl_x0);
2735         win_hl_x1 = opj_uint_subs(win_hl_x1, tr_hl_x0);
2736         win_lh_y0 = opj_uint_subs(win_lh_y0, tr_lh_y0);
2737         win_lh_y1 = opj_uint_subs(win_lh_y1, tr_lh_y0);
2738
2739         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.sn, &win_ll_x0, &win_ll_x1);
2740         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.dn, &win_hl_x0, &win_hl_x1);
2741
2742         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.sn, &win_ll_y0, &win_ll_y1);
2743         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.dn, &win_lh_y0, &win_lh_y1);
2744
2745         /* Compute the tile-resolution-based coordinates for the window of interest */
2746         if (h.cas == 0) {
2747             win_tr_x0 = opj_uint_min(2 * win_ll_x0, 2 * win_hl_x0 + 1);
2748             win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_ll_x1, 2 * win_hl_x1 + 1), rw);
2749         } else {
2750             win_tr_x0 = opj_uint_min(2 * win_hl_x0, 2 * win_ll_x0 + 1);
2751             win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_hl_x1, 2 * win_ll_x1 + 1), rw);
2752         }
2753
2754         if (v.cas == 0) {
2755             win_tr_y0 = opj_uint_min(2 * win_ll_y0, 2 * win_lh_y0 + 1);
2756             win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_ll_y1, 2 * win_lh_y1 + 1), rh);
2757         } else {
2758             win_tr_y0 = opj_uint_min(2 * win_lh_y0, 2 * win_ll_y0 + 1);
2759             win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_lh_y1, 2 * win_ll_y1 + 1), rh);
2760         }
2761
2762         for (j = 0; j < rh; ++j) {
2763             if ((j >= win_ll_y0 && j < win_ll_y1) ||
2764                     (j >= win_lh_y0 + (OPJ_UINT32)v.sn && j < win_lh_y1 + (OPJ_UINT32)v.sn)) {
2765
2766                 /* Avoids dwt.c:1584:44 (in opj_dwt_decode_partial_1): runtime error: */
2767                 /* signed integer overflow: -1094795586 + -1094795586 cannot be represented in type 'int' */
2768                 /* on opj_decompress -i  ../../openjpeg/MAPA.jp2 -o out.tif -d 0,0,256,256 */
2769                 /* This is less extreme than memsetting the whole buffer to 0 */
2770                 /* although we could potentially do better with better handling of edge conditions */
2771                 if (win_tr_x1 >= 1 && win_tr_x1 < rw) {
2772                     h.mem[win_tr_x1 - 1] = 0;
2773                 }
2774                 if (win_tr_x1 < rw) {
2775                     h.mem[win_tr_x1] = 0;
2776                 }
2777
2778                 opj_dwt_interleave_partial_h(h.mem,
2779                                              h.cas,
2780                                              sa,
2781                                              j,
2782                                              (OPJ_UINT32)h.sn,
2783                                              win_ll_x0,
2784                                              win_ll_x1,
2785                                              win_hl_x0,
2786                                              win_hl_x1);
2787                 opj_dwt_decode_partial_1(h.mem, h.dn, h.sn, h.cas,
2788                                          (OPJ_INT32)win_ll_x0,
2789                                          (OPJ_INT32)win_ll_x1,
2790                                          (OPJ_INT32)win_hl_x0,
2791                                          (OPJ_INT32)win_hl_x1);
2792                 if (!opj_sparse_array_int32_write(sa,
2793                                                   win_tr_x0, j,
2794                                                   win_tr_x1, j + 1,
2795                                                   h.mem + win_tr_x0,
2796                                                   1, 0, OPJ_TRUE)) {
2797                     /* FIXME event manager error callback */
2798                     opj_sparse_array_int32_free(sa);
2799                     opj_aligned_free(h.mem);
2800                     return OPJ_FALSE;
2801                 }
2802             }
2803         }
2804
2805         for (i = win_tr_x0; i < win_tr_x1;) {
2806             OPJ_UINT32 nb_cols = opj_uint_min(4U, win_tr_x1 - i);
2807             opj_dwt_interleave_partial_v(v.mem,
2808                                          v.cas,
2809                                          sa,
2810                                          i,
2811                                          nb_cols,
2812                                          (OPJ_UINT32)v.sn,
2813                                          win_ll_y0,
2814                                          win_ll_y1,
2815                                          win_lh_y0,
2816                                          win_lh_y1);
2817             opj_dwt_decode_partial_1_parallel(v.mem, nb_cols, v.dn, v.sn, v.cas,
2818                                               (OPJ_INT32)win_ll_y0,
2819                                               (OPJ_INT32)win_ll_y1,
2820                                               (OPJ_INT32)win_lh_y0,
2821                                               (OPJ_INT32)win_lh_y1);
2822             if (!opj_sparse_array_int32_write(sa,
2823                                               i, win_tr_y0,
2824                                               i + nb_cols, win_tr_y1,
2825                                               v.mem + 4 * win_tr_y0,
2826                                               1, 4, OPJ_TRUE)) {
2827                 /* FIXME event manager error callback */
2828                 opj_sparse_array_int32_free(sa);
2829                 opj_aligned_free(h.mem);
2830                 return OPJ_FALSE;
2831             }
2832
2833             i += nb_cols;
2834         }
2835     }
2836     opj_aligned_free(h.mem);
2837
2838     {
2839         OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
2840                        tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
2841                        tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
2842                        tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
2843                        tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
2844                        tilec->data_win,
2845                        1, tr_max->win_x1 - tr_max->win_x0,
2846                        OPJ_TRUE);
2847         assert(ret);
2848         OPJ_UNUSED(ret);
2849     }
2850     opj_sparse_array_int32_free(sa);
2851     return OPJ_TRUE;
2852 }
2853
2854 static void opj_v8dwt_interleave_h(opj_v8dwt_t* OPJ_RESTRICT dwt,
2855                                    OPJ_FLOAT32* OPJ_RESTRICT a,
2856                                    OPJ_UINT32 width,
2857                                    OPJ_UINT32 remaining_height)
2858 {
2859     OPJ_FLOAT32* OPJ_RESTRICT bi = (OPJ_FLOAT32*)(dwt->wavelet + dwt->cas);
2860     OPJ_UINT32 i, k;
2861     OPJ_UINT32 x0 = dwt->win_l_x0;
2862     OPJ_UINT32 x1 = dwt->win_l_x1;
2863
2864     for (k = 0; k < 2; ++k) {
2865         if (remaining_height >= NB_ELTS_V8 && ((OPJ_SIZE_T) a & 0x0f) == 0 &&
2866                 ((OPJ_SIZE_T) bi & 0x0f) == 0) {
2867             /* Fast code path */
2868             for (i = x0; i < x1; ++i) {
2869                 OPJ_UINT32 j = i;
2870                 OPJ_FLOAT32* OPJ_RESTRICT dst = bi + i * 2 * NB_ELTS_V8;
2871                 dst[0] = a[j];
2872                 j += width;
2873                 dst[1] = a[j];
2874                 j += width;
2875                 dst[2] = a[j];
2876                 j += width;
2877                 dst[3] = a[j];
2878                 j += width;
2879                 dst[4] = a[j];
2880                 j += width;
2881                 dst[5] = a[j];
2882                 j += width;
2883                 dst[6] = a[j];
2884                 j += width;
2885                 dst[7] = a[j];
2886             }
2887         } else {
2888             /* Slow code path */
2889             for (i = x0; i < x1; ++i) {
2890                 OPJ_UINT32 j = i;
2891                 OPJ_FLOAT32* OPJ_RESTRICT dst = bi + i * 2 * NB_ELTS_V8;
2892                 dst[0] = a[j];
2893                 j += width;
2894                 if (remaining_height == 1) {
2895                     continue;
2896                 }
2897                 dst[1] = a[j];
2898                 j += width;
2899                 if (remaining_height == 2) {
2900                     continue;
2901                 }
2902                 dst[2] = a[j];
2903                 j += width;
2904                 if (remaining_height == 3) {
2905                     continue;
2906                 }
2907                 dst[3] = a[j];
2908                 j += width;
2909                 if (remaining_height == 4) {
2910                     continue;
2911                 }
2912                 dst[4] = a[j];
2913                 j += width;
2914                 if (remaining_height == 5) {
2915                     continue;
2916                 }
2917                 dst[5] = a[j];
2918                 j += width;
2919                 if (remaining_height == 6) {
2920                     continue;
2921                 }
2922                 dst[6] = a[j];
2923                 j += width;
2924                 if (remaining_height == 7) {
2925                     continue;
2926                 }
2927                 dst[7] = a[j];
2928             }
2929         }
2930
2931         bi = (OPJ_FLOAT32*)(dwt->wavelet + 1 - dwt->cas);
2932         a += dwt->sn;
2933         x0 = dwt->win_h_x0;
2934         x1 = dwt->win_h_x1;
2935     }
2936 }
2937
2938 static void opj_v8dwt_interleave_partial_h(opj_v8dwt_t* dwt,
2939         opj_sparse_array_int32_t* sa,
2940         OPJ_UINT32 sa_line,
2941         OPJ_UINT32 remaining_height)
2942 {
2943     OPJ_UINT32 i;
2944     for (i = 0; i < remaining_height; i++) {
2945         OPJ_BOOL ret;
2946         ret = opj_sparse_array_int32_read(sa,
2947                                           dwt->win_l_x0, sa_line + i,
2948                                           dwt->win_l_x1, sa_line + i + 1,
2949                                           /* Nasty cast from float* to int32* */
2950                                           (OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0) + i,
2951                                           2 * NB_ELTS_V8, 0, OPJ_TRUE);
2952         assert(ret);
2953         ret = opj_sparse_array_int32_read(sa,
2954                                           (OPJ_UINT32)dwt->sn + dwt->win_h_x0, sa_line + i,
2955                                           (OPJ_UINT32)dwt->sn + dwt->win_h_x1, sa_line + i + 1,
2956                                           /* Nasty cast from float* to int32* */
2957                                           (OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0) + i,
2958                                           2 * NB_ELTS_V8, 0, OPJ_TRUE);
2959         assert(ret);
2960         OPJ_UNUSED(ret);
2961     }
2962 }
2963
2964 static INLINE void opj_v8dwt_interleave_v(opj_v8dwt_t* OPJ_RESTRICT dwt,
2965         OPJ_FLOAT32* OPJ_RESTRICT a,
2966         OPJ_UINT32 width,
2967         OPJ_UINT32 nb_elts_read)
2968 {
2969     opj_v8_t* OPJ_RESTRICT bi = dwt->wavelet + dwt->cas;
2970     OPJ_UINT32 i;
2971
2972     for (i = dwt->win_l_x0; i < dwt->win_l_x1; ++i) {
2973         memcpy(&bi[i * 2], &a[i * (OPJ_SIZE_T)width],
2974                (OPJ_SIZE_T)nb_elts_read * sizeof(OPJ_FLOAT32));
2975     }
2976
2977     a += (OPJ_UINT32)dwt->sn * (OPJ_SIZE_T)width;
2978     bi = dwt->wavelet + 1 - dwt->cas;
2979
2980     for (i = dwt->win_h_x0; i < dwt->win_h_x1; ++i) {
2981         memcpy(&bi[i * 2], &a[i * (OPJ_SIZE_T)width],
2982                (OPJ_SIZE_T)nb_elts_read * sizeof(OPJ_FLOAT32));
2983     }
2984 }
2985
2986 static void opj_v8dwt_interleave_partial_v(opj_v8dwt_t* OPJ_RESTRICT dwt,
2987         opj_sparse_array_int32_t* sa,
2988         OPJ_UINT32 sa_col,
2989         OPJ_UINT32 nb_elts_read)
2990 {
2991     OPJ_BOOL ret;
2992     ret = opj_sparse_array_int32_read(sa,
2993                                       sa_col, dwt->win_l_x0,
2994                                       sa_col + nb_elts_read, dwt->win_l_x1,
2995                                       (OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0),
2996                                       1, 2 * NB_ELTS_V8, OPJ_TRUE);
2997     assert(ret);
2998     ret = opj_sparse_array_int32_read(sa,
2999                                       sa_col, (OPJ_UINT32)dwt->sn + dwt->win_h_x0,
3000                                       sa_col + nb_elts_read, (OPJ_UINT32)dwt->sn + dwt->win_h_x1,
3001                                       (OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0),
3002                                       1, 2 * NB_ELTS_V8, OPJ_TRUE);
3003     assert(ret);
3004     OPJ_UNUSED(ret);
3005 }
3006
3007 #ifdef __SSE__
3008
3009 static void opj_v8dwt_decode_step1_sse(opj_v8_t* w,
3010                                        OPJ_UINT32 start,
3011                                        OPJ_UINT32 end,
3012                                        const __m128 c)
3013 {
3014     __m128* OPJ_RESTRICT vw = (__m128*) w;
3015     OPJ_UINT32 i = start;
3016     /* To be adapted if NB_ELTS_V8 changes */
3017     vw += 4 * start;
3018     /* Note: attempt at loop unrolling x2 doesn't help */
3019     for (; i < end; ++i, vw += 4) {
3020         vw[0] = _mm_mul_ps(vw[0], c);
3021         vw[1] = _mm_mul_ps(vw[1], c);
3022     }
3023 }
3024
3025 static void opj_v8dwt_decode_step2_sse(opj_v8_t* l, opj_v8_t* w,
3026                                        OPJ_UINT32 start,
3027                                        OPJ_UINT32 end,
3028                                        OPJ_UINT32 m,
3029                                        __m128 c)
3030 {
3031     __m128* OPJ_RESTRICT vl = (__m128*) l;
3032     __m128* OPJ_RESTRICT vw = (__m128*) w;
3033     /* To be adapted if NB_ELTS_V8 changes */
3034     OPJ_UINT32 i;
3035     OPJ_UINT32 imax = opj_uint_min(end, m);
3036     if (start == 0) {
3037         if (imax >= 1) {
3038             vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vl[0], vw[0]), c));
3039             vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vl[1], vw[1]), c));
3040             vw += 4;
3041             start = 1;
3042         }
3043     } else {
3044         vw += start * 4;
3045     }
3046
3047     i = start;
3048     /* Note: attempt at loop unrolling x2 doesn't help */
3049     for (; i < imax; ++i) {
3050         vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vw[-4], vw[0]), c));
3051         vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vw[-3], vw[1]), c));
3052         vw += 4;
3053     }
3054     if (m < end) {
3055         assert(m + 1 == end);
3056         c = _mm_add_ps(c, c);
3057         vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(c, vw[-4]));
3058         vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(c, vw[-3]));
3059     }
3060 }
3061
3062 #else
3063
3064 static void opj_v8dwt_decode_step1(opj_v8_t* w,
3065                                    OPJ_UINT32 start,
3066                                    OPJ_UINT32 end,
3067                                    const OPJ_FLOAT32 c)
3068 {
3069     OPJ_FLOAT32* OPJ_RESTRICT fw = (OPJ_FLOAT32*) w;
3070     OPJ_UINT32 i;
3071     /* To be adapted if NB_ELTS_V8 changes */
3072     for (i = start; i < end; ++i) {
3073         fw[i * 2 * 8    ] = fw[i * 2 * 8    ] * c;
3074         fw[i * 2 * 8 + 1] = fw[i * 2 * 8 + 1] * c;
3075         fw[i * 2 * 8 + 2] = fw[i * 2 * 8 + 2] * c;
3076         fw[i * 2 * 8 + 3] = fw[i * 2 * 8 + 3] * c;
3077         fw[i * 2 * 8 + 4] = fw[i * 2 * 8 + 4] * c;
3078         fw[i * 2 * 8 + 5] = fw[i * 2 * 8 + 5] * c;
3079         fw[i * 2 * 8 + 6] = fw[i * 2 * 8 + 6] * c;
3080         fw[i * 2 * 8 + 7] = fw[i * 2 * 8 + 7] * c;
3081     }
3082 }
3083
3084 static void opj_v8dwt_decode_step2(opj_v8_t* l, opj_v8_t* w,
3085                                    OPJ_UINT32 start,
3086                                    OPJ_UINT32 end,
3087                                    OPJ_UINT32 m,
3088                                    OPJ_FLOAT32 c)
3089 {
3090     OPJ_FLOAT32* fl = (OPJ_FLOAT32*) l;
3091     OPJ_FLOAT32* fw = (OPJ_FLOAT32*) w;
3092     OPJ_UINT32 i;
3093     OPJ_UINT32 imax = opj_uint_min(end, m);
3094     if (start > 0) {
3095         fw += 2 * NB_ELTS_V8 * start;
3096         fl = fw - 2 * NB_ELTS_V8;
3097     }
3098     /* To be adapted if NB_ELTS_V8 changes */
3099     for (i = start; i < imax; ++i) {
3100         fw[-8] = fw[-8] + ((fl[0] + fw[0]) * c);
3101         fw[-7] = fw[-7] + ((fl[1] + fw[1]) * c);
3102         fw[-6] = fw[-6] + ((fl[2] + fw[2]) * c);
3103         fw[-5] = fw[-5] + ((fl[3] + fw[3]) * c);
3104         fw[-4] = fw[-4] + ((fl[4] + fw[4]) * c);
3105         fw[-3] = fw[-3] + ((fl[5] + fw[5]) * c);
3106         fw[-2] = fw[-2] + ((fl[6] + fw[6]) * c);
3107         fw[-1] = fw[-1] + ((fl[7] + fw[7]) * c);
3108         fl = fw;
3109         fw += 2 * NB_ELTS_V8;
3110     }
3111     if (m < end) {
3112         assert(m + 1 == end);
3113         c += c;
3114         fw[-8] = fw[-8] + fl[0] * c;
3115         fw[-7] = fw[-7] + fl[1] * c;
3116         fw[-6] = fw[-6] + fl[2] * c;
3117         fw[-5] = fw[-5] + fl[3] * c;
3118         fw[-4] = fw[-4] + fl[4] * c;
3119         fw[-3] = fw[-3] + fl[5] * c;
3120         fw[-2] = fw[-2] + fl[6] * c;
3121         fw[-1] = fw[-1] + fl[7] * c;
3122     }
3123 }
3124
3125 #endif
3126
3127 /* <summary>                             */
3128 /* Inverse 9-7 wavelet transform in 1-D. */
3129 /* </summary>                            */
3130 static void opj_v8dwt_decode(opj_v8dwt_t* OPJ_RESTRICT dwt)
3131 {
3132     OPJ_INT32 a, b;
3133     /* BUG_WEIRD_TWO_INVK (look for this identifier in tcd.c) */
3134     /* Historic value for 2 / opj_invK */
3135     /* Normally, we should use invK, but if we do so, we have failures in the */
3136     /* conformance test, due to MSE and peak errors significantly higher than */
3137     /* accepted value */
3138     /* Due to using two_invK instead of invK, we have to compensate in tcd.c */
3139     /* the computation of the stepsize for the non LL subbands */
3140     const float two_invK = 1.625732422f;
3141     if (dwt->cas == 0) {
3142         if (!((dwt->dn > 0) || (dwt->sn > 1))) {
3143             return;
3144         }
3145         a = 0;
3146         b = 1;
3147     } else {
3148         if (!((dwt->sn > 0) || (dwt->dn > 1))) {
3149             return;
3150         }
3151         a = 1;
3152         b = 0;
3153     }
3154 #ifdef __SSE__
3155     opj_v8dwt_decode_step1_sse(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1,
3156                                _mm_set1_ps(opj_K));
3157     opj_v8dwt_decode_step1_sse(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1,
3158                                _mm_set1_ps(two_invK));
3159     opj_v8dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1,
3160                                dwt->win_l_x0, dwt->win_l_x1,
3161                                (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
3162                                _mm_set1_ps(-opj_dwt_delta));
3163     opj_v8dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1,
3164                                dwt->win_h_x0, dwt->win_h_x1,
3165                                (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
3166                                _mm_set1_ps(-opj_dwt_gamma));
3167     opj_v8dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1,
3168                                dwt->win_l_x0, dwt->win_l_x1,
3169                                (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
3170                                _mm_set1_ps(-opj_dwt_beta));
3171     opj_v8dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1,
3172                                dwt->win_h_x0, dwt->win_h_x1,
3173                                (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
3174                                _mm_set1_ps(-opj_dwt_alpha));
3175 #else
3176     opj_v8dwt_decode_step1(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1,
3177                            opj_K);
3178     opj_v8dwt_decode_step1(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1,
3179                            two_invK);
3180     opj_v8dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1,
3181                            dwt->win_l_x0, dwt->win_l_x1,
3182                            (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
3183                            -opj_dwt_delta);
3184     opj_v8dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1,
3185                            dwt->win_h_x0, dwt->win_h_x1,
3186                            (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
3187                            -opj_dwt_gamma);
3188     opj_v8dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1,
3189                            dwt->win_l_x0, dwt->win_l_x1,
3190                            (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
3191                            -opj_dwt_beta);
3192     opj_v8dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1,
3193                            dwt->win_h_x0, dwt->win_h_x1,
3194                            (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
3195                            -opj_dwt_alpha);
3196 #endif
3197 }
3198
3199 typedef struct {
3200     opj_v8dwt_t h;
3201     OPJ_UINT32 rw;
3202     OPJ_UINT32 w;
3203     OPJ_FLOAT32 * OPJ_RESTRICT aj;
3204     OPJ_UINT32 nb_rows;
3205 } opj_dwt97_decode_h_job_t;
3206
3207 static void opj_dwt97_decode_h_func(void* user_data, opj_tls_t* tls)
3208 {
3209     OPJ_UINT32 j;
3210     opj_dwt97_decode_h_job_t* job;
3211     OPJ_FLOAT32 * OPJ_RESTRICT aj;
3212     OPJ_UINT32 w;
3213     (void)tls;
3214
3215     job = (opj_dwt97_decode_h_job_t*)user_data;
3216     w = job->w;
3217
3218     assert((job->nb_rows % NB_ELTS_V8) == 0);
3219
3220     aj = job->aj;
3221     for (j = 0; j + NB_ELTS_V8 <= job->nb_rows; j += NB_ELTS_V8) {
3222         OPJ_UINT32 k;
3223         opj_v8dwt_interleave_h(&job->h, aj, job->w, NB_ELTS_V8);
3224         opj_v8dwt_decode(&job->h);
3225
3226         /* To be adapted if NB_ELTS_V8 changes */
3227         for (k = 0; k < job->rw; k++) {
3228             aj[k      ] = job->h.wavelet[k].f[0];
3229             aj[k + (OPJ_SIZE_T)w  ] = job->h.wavelet[k].f[1];
3230             aj[k + (OPJ_SIZE_T)w * 2] = job->h.wavelet[k].f[2];
3231             aj[k + (OPJ_SIZE_T)w * 3] = job->h.wavelet[k].f[3];
3232         }
3233         for (k = 0; k < job->rw; k++) {
3234             aj[k + (OPJ_SIZE_T)w * 4] = job->h.wavelet[k].f[4];
3235             aj[k + (OPJ_SIZE_T)w * 5] = job->h.wavelet[k].f[5];
3236             aj[k + (OPJ_SIZE_T)w * 6] = job->h.wavelet[k].f[6];
3237             aj[k + (OPJ_SIZE_T)w * 7] = job->h.wavelet[k].f[7];
3238         }
3239
3240         aj += w * NB_ELTS_V8;
3241     }
3242
3243     opj_aligned_free(job->h.wavelet);
3244     opj_free(job);
3245 }
3246
3247
3248 typedef struct {
3249     opj_v8dwt_t v;
3250     OPJ_UINT32 rh;
3251     OPJ_UINT32 w;
3252     OPJ_FLOAT32 * OPJ_RESTRICT aj;
3253     OPJ_UINT32 nb_columns;
3254 } opj_dwt97_decode_v_job_t;
3255
3256 static void opj_dwt97_decode_v_func(void* user_data, opj_tls_t* tls)
3257 {
3258     OPJ_UINT32 j;
3259     opj_dwt97_decode_v_job_t* job;
3260     OPJ_FLOAT32 * OPJ_RESTRICT aj;
3261     (void)tls;
3262
3263     job = (opj_dwt97_decode_v_job_t*)user_data;
3264
3265     assert((job->nb_columns % NB_ELTS_V8) == 0);
3266
3267     aj = job->aj;
3268     for (j = 0; j + NB_ELTS_V8 <= job->nb_columns; j += NB_ELTS_V8) {
3269         OPJ_UINT32 k;
3270
3271         opj_v8dwt_interleave_v(&job->v, aj, job->w, NB_ELTS_V8);
3272         opj_v8dwt_decode(&job->v);
3273
3274         for (k = 0; k < job->rh; ++k) {
3275             memcpy(&aj[k * (OPJ_SIZE_T)job->w], &job->v.wavelet[k],
3276                    NB_ELTS_V8 * sizeof(OPJ_FLOAT32));
3277         }
3278         aj += NB_ELTS_V8;
3279     }
3280
3281     opj_aligned_free(job->v.wavelet);
3282     opj_free(job);
3283 }
3284
3285
3286 /* <summary>                             */
3287 /* Inverse 9-7 wavelet transform in 2-D. */
3288 /* </summary>                            */
3289 static
3290 OPJ_BOOL opj_dwt_decode_tile_97(opj_thread_pool_t* tp,
3291                                 opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
3292                                 OPJ_UINT32 numres)
3293 {
3294     opj_v8dwt_t h;
3295     opj_v8dwt_t v;
3296
3297     opj_tcd_resolution_t* res = tilec->resolutions;
3298
3299     OPJ_UINT32 rw = (OPJ_UINT32)(res->x1 -
3300                                  res->x0);    /* width of the resolution level computed */
3301     OPJ_UINT32 rh = (OPJ_UINT32)(res->y1 -
3302                                  res->y0);    /* height of the resolution level computed */
3303
3304     OPJ_UINT32 w = (OPJ_UINT32)(tilec->resolutions[tilec->minimum_num_resolutions -
3305                                                                1].x1 -
3306                                 tilec->resolutions[tilec->minimum_num_resolutions - 1].x0);
3307
3308     OPJ_SIZE_T l_data_size;
3309     const int num_threads = opj_thread_pool_get_thread_count(tp);
3310
3311     if (numres == 1) {
3312         return OPJ_TRUE;
3313     }
3314
3315     l_data_size = opj_dwt_max_resolution(res, numres);
3316     /* overflow check */
3317     if (l_data_size > (SIZE_MAX / sizeof(opj_v8_t))) {
3318         /* FIXME event manager error callback */
3319         return OPJ_FALSE;
3320     }
3321     h.wavelet = (opj_v8_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v8_t));
3322     if (!h.wavelet) {
3323         /* FIXME event manager error callback */
3324         return OPJ_FALSE;
3325     }
3326     v.wavelet = h.wavelet;
3327
3328     while (--numres) {
3329         OPJ_FLOAT32 * OPJ_RESTRICT aj = (OPJ_FLOAT32*) tilec->data;
3330         OPJ_UINT32 j;
3331
3332         h.sn = (OPJ_INT32)rw;
3333         v.sn = (OPJ_INT32)rh;
3334
3335         ++res;
3336
3337         rw = (OPJ_UINT32)(res->x1 -
3338                           res->x0);   /* width of the resolution level computed */
3339         rh = (OPJ_UINT32)(res->y1 -
3340                           res->y0);   /* height of the resolution level computed */
3341
3342         h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
3343         h.cas = res->x0 % 2;
3344
3345         h.win_l_x0 = 0;
3346         h.win_l_x1 = (OPJ_UINT32)h.sn;
3347         h.win_h_x0 = 0;
3348         h.win_h_x1 = (OPJ_UINT32)h.dn;
3349
3350         if (num_threads <= 1 || rh < 2 * NB_ELTS_V8) {
3351             for (j = 0; j + (NB_ELTS_V8 - 1) < rh; j += NB_ELTS_V8) {
3352                 OPJ_UINT32 k;
3353                 opj_v8dwt_interleave_h(&h, aj, w, NB_ELTS_V8);
3354                 opj_v8dwt_decode(&h);
3355
3356                 /* To be adapted if NB_ELTS_V8 changes */
3357                 for (k = 0; k < rw; k++) {
3358                     aj[k      ] = h.wavelet[k].f[0];
3359                     aj[k + (OPJ_SIZE_T)w  ] = h.wavelet[k].f[1];
3360                     aj[k + (OPJ_SIZE_T)w * 2] = h.wavelet[k].f[2];
3361                     aj[k + (OPJ_SIZE_T)w * 3] = h.wavelet[k].f[3];
3362                 }
3363                 for (k = 0; k < rw; k++) {
3364                     aj[k + (OPJ_SIZE_T)w * 4] = h.wavelet[k].f[4];
3365                     aj[k + (OPJ_SIZE_T)w * 5] = h.wavelet[k].f[5];
3366                     aj[k + (OPJ_SIZE_T)w * 6] = h.wavelet[k].f[6];
3367                     aj[k + (OPJ_SIZE_T)w * 7] = h.wavelet[k].f[7];
3368                 }
3369
3370                 aj += w * NB_ELTS_V8;
3371             }
3372         } else {
3373             OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
3374             OPJ_UINT32 step_j;
3375
3376             if ((rh / NB_ELTS_V8) < num_jobs) {
3377                 num_jobs = rh / NB_ELTS_V8;
3378             }
3379             step_j = ((rh / num_jobs) / NB_ELTS_V8) * NB_ELTS_V8;
3380             for (j = 0; j < num_jobs; j++) {
3381                 opj_dwt97_decode_h_job_t* job;
3382
3383                 job = (opj_dwt97_decode_h_job_t*) opj_malloc(sizeof(opj_dwt97_decode_h_job_t));
3384                 if (!job) {
3385                     opj_thread_pool_wait_completion(tp, 0);
3386                     opj_aligned_free(h.wavelet);
3387                     return OPJ_FALSE;
3388                 }
3389                 job->h.wavelet = (opj_v8_t*)opj_aligned_malloc(l_data_size * sizeof(opj_v8_t));
3390                 if (!job->h.wavelet) {
3391                     opj_thread_pool_wait_completion(tp, 0);
3392                     opj_free(job);
3393                     opj_aligned_free(h.wavelet);
3394                     return OPJ_FALSE;
3395                 }
3396                 job->h.dn = h.dn;
3397                 job->h.sn = h.sn;
3398                 job->h.cas = h.cas;
3399                 job->h.win_l_x0 = h.win_l_x0;
3400                 job->h.win_l_x1 = h.win_l_x1;
3401                 job->h.win_h_x0 = h.win_h_x0;
3402                 job->h.win_h_x1 = h.win_h_x1;
3403                 job->rw = rw;
3404                 job->w = w;
3405                 job->aj = aj;
3406                 job->nb_rows = (j + 1 == num_jobs) ? (rh & (OPJ_UINT32)~
3407                                                       (NB_ELTS_V8 - 1)) - j * step_j : step_j;
3408                 aj += w * job->nb_rows;
3409                 opj_thread_pool_submit_job(tp, opj_dwt97_decode_h_func, job);
3410             }
3411             opj_thread_pool_wait_completion(tp, 0);
3412             j = rh & (OPJ_UINT32)~(NB_ELTS_V8 - 1);
3413         }
3414
3415         if (j < rh) {
3416             OPJ_UINT32 k;
3417             opj_v8dwt_interleave_h(&h, aj, w, rh - j);
3418             opj_v8dwt_decode(&h);
3419             for (k = 0; k < rw; k++) {
3420                 OPJ_UINT32 l;
3421                 for (l = 0; l < rh - j; l++) {
3422                     aj[k + (OPJ_SIZE_T)w  * l ] = h.wavelet[k].f[l];
3423                 }
3424             }
3425         }
3426
3427         v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
3428         v.cas = res->y0 % 2;
3429         v.win_l_x0 = 0;
3430         v.win_l_x1 = (OPJ_UINT32)v.sn;
3431         v.win_h_x0 = 0;
3432         v.win_h_x1 = (OPJ_UINT32)v.dn;
3433
3434         aj = (OPJ_FLOAT32*) tilec->data;
3435         if (num_threads <= 1 || rw < 2 * NB_ELTS_V8) {
3436             for (j = rw; j > (NB_ELTS_V8 - 1); j -= NB_ELTS_V8) {
3437                 OPJ_UINT32 k;
3438
3439                 opj_v8dwt_interleave_v(&v, aj, w, NB_ELTS_V8);
3440                 opj_v8dwt_decode(&v);
3441
3442                 for (k = 0; k < rh; ++k) {
3443                     memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k], NB_ELTS_V8 * sizeof(OPJ_FLOAT32));
3444                 }
3445                 aj += NB_ELTS_V8;
3446             }
3447         } else {
3448             /* "bench_dwt -I" shows that scaling is poor, likely due to RAM
3449                 transfer being the limiting factor. So limit the number of
3450                 threads.
3451              */
3452             OPJ_UINT32 num_jobs = opj_uint_max((OPJ_UINT32)num_threads / 2, 2U);
3453             OPJ_UINT32 step_j;
3454
3455             if ((rw / NB_ELTS_V8) < num_jobs) {
3456                 num_jobs = rw / NB_ELTS_V8;
3457             }
3458             step_j = ((rw / num_jobs) / NB_ELTS_V8) * NB_ELTS_V8;
3459             for (j = 0; j < num_jobs; j++) {
3460                 opj_dwt97_decode_v_job_t* job;
3461
3462                 job = (opj_dwt97_decode_v_job_t*) opj_malloc(sizeof(opj_dwt97_decode_v_job_t));
3463                 if (!job) {
3464                     opj_thread_pool_wait_completion(tp, 0);
3465                     opj_aligned_free(h.wavelet);
3466                     return OPJ_FALSE;
3467                 }
3468                 job->v.wavelet = (opj_v8_t*)opj_aligned_malloc(l_data_size * sizeof(opj_v8_t));
3469                 if (!job->v.wavelet) {
3470                     opj_thread_pool_wait_completion(tp, 0);
3471                     opj_free(job);
3472                     opj_aligned_free(h.wavelet);
3473                     return OPJ_FALSE;
3474                 }
3475                 job->v.dn = v.dn;
3476                 job->v.sn = v.sn;
3477                 job->v.cas = v.cas;
3478                 job->v.win_l_x0 = v.win_l_x0;
3479                 job->v.win_l_x1 = v.win_l_x1;
3480                 job->v.win_h_x0 = v.win_h_x0;
3481                 job->v.win_h_x1 = v.win_h_x1;
3482                 job->rh = rh;
3483                 job->w = w;
3484                 job->aj = aj;
3485                 job->nb_columns = (j + 1 == num_jobs) ? (rw & (OPJ_UINT32)~
3486                                   (NB_ELTS_V8 - 1)) - j * step_j : step_j;
3487                 aj += job->nb_columns;
3488                 opj_thread_pool_submit_job(tp, opj_dwt97_decode_v_func, job);
3489             }
3490             opj_thread_pool_wait_completion(tp, 0);
3491         }
3492
3493         if (rw & (NB_ELTS_V8 - 1)) {
3494             OPJ_UINT32 k;
3495
3496             j = rw & (NB_ELTS_V8 - 1);
3497
3498             opj_v8dwt_interleave_v(&v, aj, w, j);
3499             opj_v8dwt_decode(&v);
3500
3501             for (k = 0; k < rh; ++k) {
3502                 memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k],
3503                        (OPJ_SIZE_T)j * sizeof(OPJ_FLOAT32));
3504             }
3505         }
3506     }
3507
3508     opj_aligned_free(h.wavelet);
3509     return OPJ_TRUE;
3510 }
3511
3512 static
3513 OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
3514                                    OPJ_UINT32 numres)
3515 {
3516     opj_sparse_array_int32_t* sa;
3517     opj_v8dwt_t h;
3518     opj_v8dwt_t v;
3519     OPJ_UINT32 resno;
3520     /* This value matches the maximum left/right extension given in tables */
3521     /* F.2 and F.3 of the standard. Note: in opj_tcd_is_subband_area_of_interest() */
3522     /* we currently use 3. */
3523     const OPJ_UINT32 filter_width = 4U;
3524
3525     opj_tcd_resolution_t* tr = tilec->resolutions;
3526     opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]);
3527
3528     OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 -
3529                                  tr->x0);    /* width of the resolution level computed */
3530     OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 -
3531                                  tr->y0);    /* height of the resolution level computed */
3532
3533     OPJ_SIZE_T l_data_size;
3534
3535     /* Compute the intersection of the area of interest, expressed in tile coordinates */
3536     /* with the tile coordinates */
3537     OPJ_UINT32 win_tcx0 = tilec->win_x0;
3538     OPJ_UINT32 win_tcy0 = tilec->win_y0;
3539     OPJ_UINT32 win_tcx1 = tilec->win_x1;
3540     OPJ_UINT32 win_tcy1 = tilec->win_y1;
3541
3542     if (tr_max->x0 == tr_max->x1 || tr_max->y0 == tr_max->y1) {
3543         return OPJ_TRUE;
3544     }
3545
3546     sa = opj_dwt_init_sparse_array(tilec, numres);
3547     if (sa == NULL) {
3548         return OPJ_FALSE;
3549     }
3550
3551     if (numres == 1U) {
3552         OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
3553                        tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
3554                        tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
3555                        tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
3556                        tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
3557                        tilec->data_win,
3558                        1, tr_max->win_x1 - tr_max->win_x0,
3559                        OPJ_TRUE);
3560         assert(ret);
3561         OPJ_UNUSED(ret);
3562         opj_sparse_array_int32_free(sa);
3563         return OPJ_TRUE;
3564     }
3565
3566     l_data_size = opj_dwt_max_resolution(tr, numres);
3567     /* overflow check */
3568     if (l_data_size > (SIZE_MAX / sizeof(opj_v8_t))) {
3569         /* FIXME event manager error callback */
3570         opj_sparse_array_int32_free(sa);
3571         return OPJ_FALSE;
3572     }
3573     h.wavelet = (opj_v8_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v8_t));
3574     if (!h.wavelet) {
3575         /* FIXME event manager error callback */
3576         opj_sparse_array_int32_free(sa);
3577         return OPJ_FALSE;
3578     }
3579     v.wavelet = h.wavelet;
3580
3581     for (resno = 1; resno < numres; resno ++) {
3582         OPJ_UINT32 j;
3583         /* Window of interest subband-based coordinates */
3584         OPJ_UINT32 win_ll_x0, win_ll_y0, win_ll_x1, win_ll_y1;
3585         OPJ_UINT32 win_hl_x0, win_hl_x1;
3586         OPJ_UINT32 win_lh_y0, win_lh_y1;
3587         /* Window of interest tile-resolution-based coordinates */
3588         OPJ_UINT32 win_tr_x0, win_tr_x1, win_tr_y0, win_tr_y1;
3589         /* Tile-resolution subband-based coordinates */
3590         OPJ_UINT32 tr_ll_x0, tr_ll_y0, tr_hl_x0, tr_lh_y0;
3591
3592         ++tr;
3593
3594         h.sn = (OPJ_INT32)rw;
3595         v.sn = (OPJ_INT32)rh;
3596
3597         rw = (OPJ_UINT32)(tr->x1 - tr->x0);
3598         rh = (OPJ_UINT32)(tr->y1 - tr->y0);
3599
3600         h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
3601         h.cas = tr->x0 % 2;
3602
3603         v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
3604         v.cas = tr->y0 % 2;
3605
3606         /* Get the subband coordinates for the window of interest */
3607         /* LL band */
3608         opj_dwt_get_band_coordinates(tilec, resno, 0,
3609                                      win_tcx0, win_tcy0, win_tcx1, win_tcy1,
3610                                      &win_ll_x0, &win_ll_y0,
3611                                      &win_ll_x1, &win_ll_y1);
3612
3613         /* HL band */
3614         opj_dwt_get_band_coordinates(tilec, resno, 1,
3615                                      win_tcx0, win_tcy0, win_tcx1, win_tcy1,
3616                                      &win_hl_x0, NULL, &win_hl_x1, NULL);
3617
3618         /* LH band */
3619         opj_dwt_get_band_coordinates(tilec, resno, 2,
3620                                      win_tcx0, win_tcy0, win_tcx1, win_tcy1,
3621                                      NULL, &win_lh_y0, NULL, &win_lh_y1);
3622
3623         /* Beware: band index for non-LL0 resolution are 0=HL, 1=LH and 2=HH */
3624         tr_ll_x0 = (OPJ_UINT32)tr->bands[1].x0;
3625         tr_ll_y0 = (OPJ_UINT32)tr->bands[0].y0;
3626         tr_hl_x0 = (OPJ_UINT32)tr->bands[0].x0;
3627         tr_lh_y0 = (OPJ_UINT32)tr->bands[1].y0;
3628
3629         /* Subtract the origin of the bands for this tile, to the subwindow */
3630         /* of interest band coordinates, so as to get them relative to the */
3631         /* tile */
3632         win_ll_x0 = opj_uint_subs(win_ll_x0, tr_ll_x0);
3633         win_ll_y0 = opj_uint_subs(win_ll_y0, tr_ll_y0);
3634         win_ll_x1 = opj_uint_subs(win_ll_x1, tr_ll_x0);
3635         win_ll_y1 = opj_uint_subs(win_ll_y1, tr_ll_y0);
3636         win_hl_x0 = opj_uint_subs(win_hl_x0, tr_hl_x0);
3637         win_hl_x1 = opj_uint_subs(win_hl_x1, tr_hl_x0);
3638         win_lh_y0 = opj_uint_subs(win_lh_y0, tr_lh_y0);
3639         win_lh_y1 = opj_uint_subs(win_lh_y1, tr_lh_y0);
3640
3641         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.sn, &win_ll_x0, &win_ll_x1);
3642         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.dn, &win_hl_x0, &win_hl_x1);
3643
3644         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.sn, &win_ll_y0, &win_ll_y1);
3645         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.dn, &win_lh_y0, &win_lh_y1);
3646
3647         /* Compute the tile-resolution-based coordinates for the window of interest */
3648         if (h.cas == 0) {
3649             win_tr_x0 = opj_uint_min(2 * win_ll_x0, 2 * win_hl_x0 + 1);
3650             win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_ll_x1, 2 * win_hl_x1 + 1), rw);
3651         } else {
3652             win_tr_x0 = opj_uint_min(2 * win_hl_x0, 2 * win_ll_x0 + 1);
3653             win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_hl_x1, 2 * win_ll_x1 + 1), rw);
3654         }
3655
3656         if (v.cas == 0) {
3657             win_tr_y0 = opj_uint_min(2 * win_ll_y0, 2 * win_lh_y0 + 1);
3658             win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_ll_y1, 2 * win_lh_y1 + 1), rh);
3659         } else {
3660             win_tr_y0 = opj_uint_min(2 * win_lh_y0, 2 * win_ll_y0 + 1);
3661             win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_lh_y1, 2 * win_ll_y1 + 1), rh);
3662         }
3663
3664         h.win_l_x0 = win_ll_x0;
3665         h.win_l_x1 = win_ll_x1;
3666         h.win_h_x0 = win_hl_x0;
3667         h.win_h_x1 = win_hl_x1;
3668         for (j = 0; j + (NB_ELTS_V8 - 1) < rh; j += NB_ELTS_V8) {
3669             if ((j + (NB_ELTS_V8 - 1) >= win_ll_y0 && j < win_ll_y1) ||
3670                     (j + (NB_ELTS_V8 - 1) >= win_lh_y0 + (OPJ_UINT32)v.sn &&
3671                      j < win_lh_y1 + (OPJ_UINT32)v.sn)) {
3672                 opj_v8dwt_interleave_partial_h(&h, sa, j, opj_uint_min(NB_ELTS_V8, rh - j));
3673                 opj_v8dwt_decode(&h);
3674                 if (!opj_sparse_array_int32_write(sa,
3675                                                   win_tr_x0, j,
3676                                                   win_tr_x1, j + NB_ELTS_V8,
3677                                                   (OPJ_INT32*)&h.wavelet[win_tr_x0].f[0],
3678                                                   NB_ELTS_V8, 1, OPJ_TRUE)) {
3679                     /* FIXME event manager error callback */
3680                     opj_sparse_array_int32_free(sa);
3681                     opj_aligned_free(h.wavelet);
3682                     return OPJ_FALSE;
3683                 }
3684             }
3685         }
3686
3687         if (j < rh &&
3688                 ((j + (NB_ELTS_V8 - 1) >= win_ll_y0 && j < win_ll_y1) ||
3689                  (j + (NB_ELTS_V8 - 1) >= win_lh_y0 + (OPJ_UINT32)v.sn &&
3690                   j < win_lh_y1 + (OPJ_UINT32)v.sn))) {
3691             opj_v8dwt_interleave_partial_h(&h, sa, j, rh - j);
3692             opj_v8dwt_decode(&h);
3693             if (!opj_sparse_array_int32_write(sa,
3694                                               win_tr_x0, j,
3695                                               win_tr_x1, rh,
3696                                               (OPJ_INT32*)&h.wavelet[win_tr_x0].f[0],
3697                                               NB_ELTS_V8, 1, OPJ_TRUE)) {
3698                 /* FIXME event manager error callback */
3699                 opj_sparse_array_int32_free(sa);
3700                 opj_aligned_free(h.wavelet);
3701                 return OPJ_FALSE;
3702             }
3703         }
3704
3705         v.win_l_x0 = win_ll_y0;
3706         v.win_l_x1 = win_ll_y1;
3707         v.win_h_x0 = win_lh_y0;
3708         v.win_h_x1 = win_lh_y1;
3709         for (j = win_tr_x0; j < win_tr_x1; j += NB_ELTS_V8) {
3710             OPJ_UINT32 nb_elts = opj_uint_min(NB_ELTS_V8, win_tr_x1 - j);
3711
3712             opj_v8dwt_interleave_partial_v(&v, sa, j, nb_elts);
3713             opj_v8dwt_decode(&v);
3714
3715             if (!opj_sparse_array_int32_write(sa,
3716                                               j, win_tr_y0,
3717                                               j + nb_elts, win_tr_y1,
3718                                               (OPJ_INT32*)&h.wavelet[win_tr_y0].f[0],
3719                                               1, NB_ELTS_V8, OPJ_TRUE)) {
3720                 /* FIXME event manager error callback */
3721                 opj_sparse_array_int32_free(sa);
3722                 opj_aligned_free(h.wavelet);
3723                 return OPJ_FALSE;
3724             }
3725         }
3726     }
3727
3728     {
3729         OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
3730                        tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
3731                        tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
3732                        tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
3733                        tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
3734                        tilec->data_win,
3735                        1, tr_max->win_x1 - tr_max->win_x0,
3736                        OPJ_TRUE);
3737         assert(ret);
3738         OPJ_UNUSED(ret);
3739     }
3740     opj_sparse_array_int32_free(sa);
3741
3742     opj_aligned_free(h.wavelet);
3743     return OPJ_TRUE;
3744 }
3745
3746
3747 OPJ_BOOL opj_dwt_decode_real(opj_tcd_t *p_tcd,
3748                              opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
3749                              OPJ_UINT32 numres)
3750 {
3751     if (p_tcd->whole_tile_decoding) {
3752         return opj_dwt_decode_tile_97(p_tcd->thread_pool, tilec, numres);
3753     } else {
3754         return opj_dwt_decode_partial_97(tilec, numres);
3755     }
3756 }