2 * The copyright in this software is being made available under the 2-clauses
3 * BSD License, included below. This software may be subject to other third
4 * party and contributor rights, including patent rights, and no such rights
5 * are granted under this license.
7 * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium
8 * Copyright (c) 2002-2014, Professor Benoit Macq
9 * Copyright (c) 2001-2003, David Janssens
10 * Copyright (c) 2002-2003, Yannick Verschueren
11 * Copyright (c) 2003-2007, Francois-Olivier Devaux
12 * Copyright (c) 2003-2014, Antonin Descampe
13 * Copyright (c) 2005, Herve Drolon, FreeImage Team
14 * Copyright (c) 2007, Jonathan Ballard <dzonatas@dzonux.net>
15 * Copyright (c) 2007, Callum Lerwick <seg@haxxed.com>
16 * Copyright (c) 2017, IntoPIX SA <support@intopix.com>
17 * All rights reserved.
19 * Redistribution and use in source and binary forms, with or without
20 * modification, are permitted provided that the following conditions
22 * 1. Redistributions of source code must retain the above copyright
23 * notice, this list of conditions and the following disclaimer.
24 * 2. Redistributions in binary form must reproduce the above copyright
25 * notice, this list of conditions and the following disclaimer in the
26 * documentation and/or other materials provided with the distribution.
28 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS'
29 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
43 #define OPJ_SKIP_POISON
44 #include "opj_includes.h"
47 #include <xmmintrin.h>
50 #include <emmintrin.h>
53 #include <tmmintrin.h>
56 #include <immintrin.h>
60 #pragma GCC poison malloc calloc realloc free
63 /** @defgroup DWT DWT - Implementation of a discrete wavelet transform */
66 #define OPJ_WS(i) v->mem[(i)*2]
67 #define OPJ_WD(i) v->mem[(1+(i)*2)]
70 /** Number of int32 values in a AVX2 register */
71 #define VREG_INT_COUNT 8
73 /** Number of int32 values in a SSE2 register */
74 #define VREG_INT_COUNT 4
77 /** Number of columns that we can process in parallel in the vertical pass */
78 #define PARALLEL_COLS_53 (2*VREG_INT_COUNT)
80 /** @name Local data structures */
83 typedef struct dwt_local {
85 OPJ_INT32 dn; /* number of elements in high pass band */
86 OPJ_INT32 sn; /* number of elements in low pass band */
87 OPJ_INT32 cas; /* 0 = start on even coord, 1 = start on odd coord */
93 OPJ_FLOAT32 f[NB_ELTS_V8];
96 typedef struct v8dwt_local {
98 OPJ_INT32 dn ; /* number of elements in high pass band */
99 OPJ_INT32 sn ; /* number of elements in low pass band */
100 OPJ_INT32 cas ; /* 0 = start on even coord, 1 = start on odd coord */
101 OPJ_UINT32 win_l_x0; /* start coord in low pass band */
102 OPJ_UINT32 win_l_x1; /* end coord in low pass band */
103 OPJ_UINT32 win_h_x0; /* start coord in high pass band */
104 OPJ_UINT32 win_h_x1; /* end coord in high pass band */
107 /* From table F.4 from the standard */
108 static const OPJ_FLOAT32 opj_dwt_alpha = -1.586134342f;
109 static const OPJ_FLOAT32 opj_dwt_beta = -0.052980118f;
110 static const OPJ_FLOAT32 opj_dwt_gamma = 0.882911075f;
111 static const OPJ_FLOAT32 opj_dwt_delta = 0.443506852f;
113 static const OPJ_FLOAT32 opj_K = 1.230174105f;
114 static const OPJ_FLOAT32 opj_invK = (OPJ_FLOAT32)(1.0 / 1.230174105);
118 /** @name Local static functions */
122 Forward lazy transform (horizontal)
124 static void opj_dwt_deinterleave_h(const OPJ_INT32 * OPJ_RESTRICT a,
125 OPJ_INT32 * OPJ_RESTRICT b,
127 OPJ_INT32 sn, OPJ_INT32 cas);
130 Forward 9-7 wavelet transform in 1-D
132 static void opj_dwt_encode_1_real(void *a, OPJ_INT32 dn, OPJ_INT32 sn,
135 Explicit calculation of the Quantization Stepsizes
137 static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps,
138 opj_stepsize_t *bandno_stepsize);
140 Inverse wavelet transform in 2-D.
142 static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp,
143 opj_tcd_tilecomp_t* tilec, OPJ_UINT32 i);
145 static OPJ_BOOL opj_dwt_decode_partial_tile(
146 opj_tcd_tilecomp_t* tilec,
149 /* Forward transform, for the vertical pass, processing cols columns */
150 /* where cols <= NB_ELTS_V8 */
151 /* Where void* is a OPJ_INT32* for 5x3 and OPJ_FLOAT32* for 9x7 */
152 typedef void (*opj_encode_and_deinterleave_v_fnptr_type)(
157 OPJ_UINT32 stride_width,
160 /* Where void* is a OPJ_INT32* for 5x3 and OPJ_FLOAT32* for 9x7 */
161 typedef void (*opj_encode_and_deinterleave_h_one_row_fnptr_type)(
167 static OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp,
168 opj_tcd_tilecomp_t * tilec,
169 opj_encode_and_deinterleave_v_fnptr_type p_encode_and_deinterleave_v,
170 opj_encode_and_deinterleave_h_one_row_fnptr_type
171 p_encode_and_deinterleave_h_one_row);
173 static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r,
177 /* Inverse 9-7 wavelet transform in 1-D. */
184 #define OPJ_S(i) a[(i)*2]
185 #define OPJ_D(i) a[(1+(i)*2)]
186 #define OPJ_S_(i) ((i)<0?OPJ_S(0):((i)>=sn?OPJ_S(sn-1):OPJ_S(i)))
187 #define OPJ_D_(i) ((i)<0?OPJ_D(0):((i)>=dn?OPJ_D(dn-1):OPJ_D(i)))
189 #define OPJ_SS_(i) ((i)<0?OPJ_S(0):((i)>=dn?OPJ_S(dn-1):OPJ_S(i)))
190 #define OPJ_DD_(i) ((i)<0?OPJ_D(0):((i)>=sn?OPJ_D(sn-1):OPJ_D(i)))
193 /* This table contains the norms of the 5-3 wavelets for different bands. */
195 /* FIXME! the array should really be extended up to 33 resolution levels */
196 /* See https://github.com/uclouvain/openjpeg/issues/493 */
197 static const OPJ_FLOAT64 opj_dwt_norms[4][10] = {
198 {1.000, 1.500, 2.750, 5.375, 10.68, 21.34, 42.67, 85.33, 170.7, 341.3},
199 {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9},
200 {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9},
201 {.7186, .9218, 1.586, 3.043, 6.019, 12.01, 24.00, 47.97, 95.93}
205 /* This table contains the norms of the 9-7 wavelets for different bands. */
207 /* FIXME! the array should really be extended up to 33 resolution levels */
208 /* See https://github.com/uclouvain/openjpeg/issues/493 */
209 static const OPJ_FLOAT64 opj_dwt_norms_real[4][10] = {
210 {1.000, 1.965, 4.177, 8.403, 16.90, 33.84, 67.69, 135.3, 270.6, 540.9},
211 {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0},
212 {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0},
213 {2.080, 3.865, 8.307, 17.18, 34.71, 69.59, 139.3, 278.6, 557.2}
217 ==========================================================
219 ==========================================================
223 /* Forward lazy transform (horizontal). */
225 static void opj_dwt_deinterleave_h(const OPJ_INT32 * OPJ_RESTRICT a,
226 OPJ_INT32 * OPJ_RESTRICT b,
228 OPJ_INT32 sn, OPJ_INT32 cas)
231 OPJ_INT32 * OPJ_RESTRICT l_dest = b;
232 const OPJ_INT32 * OPJ_RESTRICT l_src = a + cas;
234 for (i = 0; i < sn; ++i) {
242 for (i = 0; i < dn; ++i) {
248 #ifdef STANDARD_SLOW_VERSION
250 /* Inverse lazy transform (horizontal). */
252 static void opj_dwt_interleave_h(const opj_dwt_t* h, OPJ_INT32 *a)
254 const OPJ_INT32 *ai = a;
255 OPJ_INT32 *bi = h->mem + h->cas;
262 bi = h->mem + 1 - h->cas;
271 /* Inverse lazy transform (vertical). */
273 static void opj_dwt_interleave_v(const opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x)
275 const OPJ_INT32 *ai = a;
276 OPJ_INT32 *bi = v->mem + v->cas;
283 ai = a + (v->sn * (OPJ_SIZE_T)x);
284 bi = v->mem + 1 - v->cas;
293 #endif /* STANDARD_SLOW_VERSION */
295 #ifdef STANDARD_SLOW_VERSION
297 /* Inverse 5-3 wavelet transform in 1-D. */
299 static void opj_dwt_decode_1_(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
305 if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */
306 for (i = 0; i < sn; i++) {
307 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
309 for (i = 0; i < dn; i++) {
310 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
314 if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */
317 for (i = 0; i < sn; i++) {
318 OPJ_D(i) -= (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2;
320 for (i = 0; i < dn; i++) {
321 OPJ_S(i) += (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1;
327 static void opj_dwt_decode_1(const opj_dwt_t *v)
329 opj_dwt_decode_1_(v->mem, v->dn, v->sn, v->cas);
332 #endif /* STANDARD_SLOW_VERSION */
334 #if !defined(STANDARD_SLOW_VERSION)
335 static void opj_idwt53_h_cas0(OPJ_INT32* tmp,
341 const OPJ_INT32* in_even = &tiledp[0];
342 const OPJ_INT32* in_odd = &tiledp[sn];
344 #ifdef TWO_PASS_VERSION
345 /* For documentation purpose: performs lifting in two iterations, */
346 /* but without explicit interleaving */
351 tmp[0] = in_even[0] - ((in_odd[0] + 1) >> 1);
352 for (i = 2, j = 0; i <= len - 2; i += 2, j++) {
353 tmp[i] = in_even[j + 1] - ((in_odd[j] + in_odd[j + 1] + 2) >> 2);
355 if (len & 1) { /* if len is odd */
356 tmp[len - 1] = in_even[(len - 1) / 2] - ((in_odd[(len - 2) / 2] + 1) >> 1);
360 for (i = 1, j = 0; i < len - 1; i += 2, j++) {
361 tmp[i] = in_odd[j] + ((tmp[i - 1] + tmp[i + 1]) >> 1);
363 if (!(len & 1)) { /* if len is even */
364 tmp[len - 1] = in_odd[(len - 1) / 2] + tmp[len - 2];
367 OPJ_INT32 d1c, d1n, s1n, s0c, s0n;
371 /* Improved version of the TWO_PASS_VERSION: */
372 /* Performs lifting in one single iteration. Saves memory */
373 /* accesses and explicit interleaving. */
376 s0n = s1n - ((d1n + 1) >> 1);
378 for (i = 0, j = 1; i < (len - 3); i += 2, j++) {
385 s0n = s1n - ((d1c + d1n + 2) >> 2);
388 tmp[i + 1] = d1c + ((s0c + s0n) >> 1);
394 tmp[len - 1] = in_even[(len - 1) / 2] - ((d1n + 1) >> 1);
395 tmp[len - 2] = d1n + ((s0n + tmp[len - 1]) >> 1);
397 tmp[len - 1] = d1n + s0n;
400 memcpy(tiledp, tmp, (OPJ_UINT32)len * sizeof(OPJ_INT32));
403 static void opj_idwt53_h_cas1(OPJ_INT32* tmp,
409 const OPJ_INT32* in_even = &tiledp[sn];
410 const OPJ_INT32* in_odd = &tiledp[0];
412 #ifdef TWO_PASS_VERSION
413 /* For documentation purpose: performs lifting in two iterations, */
414 /* but without explicit interleaving */
419 for (i = 1, j = 0; i < len - 1; i += 2, j++) {
420 tmp[i] = in_odd[j] - ((in_even[j] + in_even[j + 1] + 2) >> 2);
423 tmp[len - 1] = in_odd[len / 2 - 1] - ((in_even[len / 2 - 1] + 1) >> 1);
427 tmp[0] = in_even[0] + tmp[1];
428 for (i = 2, j = 1; i < len - 1; i += 2, j++) {
429 tmp[i] = in_even[j] + ((tmp[i + 1] + tmp[i - 1]) >> 1);
432 tmp[len - 1] = in_even[len / 2] + tmp[len - 2];
435 OPJ_INT32 s1, s2, dc, dn;
439 /* Improved version of the TWO_PASS_VERSION: */
440 /* Performs lifting in one single iteration. Saves memory */
441 /* accesses and explicit interleaving. */
444 dc = in_odd[0] - ((in_even[0] + s1 + 2) >> 2);
445 tmp[0] = in_even[0] + dc;
447 for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
451 dn = in_odd[j] - ((s1 + s2 + 2) >> 2);
453 tmp[i + 1] = s1 + ((dn + dc) >> 1);
462 dn = in_odd[len / 2 - 1] - ((s1 + 1) >> 1);
463 tmp[len - 2] = s1 + ((dn + dc) >> 1);
466 tmp[len - 1] = s1 + dc;
469 memcpy(tiledp, tmp, (OPJ_UINT32)len * sizeof(OPJ_INT32));
473 #endif /* !defined(STANDARD_SLOW_VERSION) */
476 /* Inverse 5-3 wavelet transform in 1-D for one row. */
478 /* Performs interleave, inverse wavelet transform and copy back to buffer */
479 static void opj_idwt53_h(const opj_dwt_t *dwt,
482 #ifdef STANDARD_SLOW_VERSION
483 /* For documentation purpose */
484 opj_dwt_interleave_h(dwt, tiledp);
485 opj_dwt_decode_1(dwt);
486 memcpy(tiledp, dwt->mem, (OPJ_UINT32)(dwt->sn + dwt->dn) * sizeof(OPJ_INT32));
488 const OPJ_INT32 sn = dwt->sn;
489 const OPJ_INT32 len = sn + dwt->dn;
490 if (dwt->cas == 0) { /* Left-most sample is on even coordinate */
492 opj_idwt53_h_cas0(dwt->mem, sn, len, tiledp);
494 /* Unmodified value */
496 } else { /* Left-most sample is on odd coordinate */
499 } else if (len == 2) {
500 OPJ_INT32* out = dwt->mem;
501 const OPJ_INT32* in_even = &tiledp[sn];
502 const OPJ_INT32* in_odd = &tiledp[0];
503 out[1] = in_odd[0] - ((in_even[0] + 1) >> 1);
504 out[0] = in_even[0] + out[1];
505 memcpy(tiledp, dwt->mem, (OPJ_UINT32)len * sizeof(OPJ_INT32));
506 } else if (len > 2) {
507 opj_idwt53_h_cas1(dwt->mem, sn, len, tiledp);
513 #if (defined(__SSE2__) || defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION)
515 /* Conveniency macros to improve the readabilty of the formulas */
518 #define LOAD_CST(x) _mm256_set1_epi32(x)
519 #define LOAD(x) _mm256_load_si256((const VREG*)(x))
520 #define LOADU(x) _mm256_loadu_si256((const VREG*)(x))
521 #define STORE(x,y) _mm256_store_si256((VREG*)(x),(y))
522 #define STOREU(x,y) _mm256_storeu_si256((VREG*)(x),(y))
523 #define ADD(x,y) _mm256_add_epi32((x),(y))
524 #define SUB(x,y) _mm256_sub_epi32((x),(y))
525 #define SAR(x,y) _mm256_srai_epi32((x),(y))
528 #define LOAD_CST(x) _mm_set1_epi32(x)
529 #define LOAD(x) _mm_load_si128((const VREG*)(x))
530 #define LOADU(x) _mm_loadu_si128((const VREG*)(x))
531 #define STORE(x,y) _mm_store_si128((VREG*)(x),(y))
532 #define STOREU(x,y) _mm_storeu_si128((VREG*)(x),(y))
533 #define ADD(x,y) _mm_add_epi32((x),(y))
534 #define SUB(x,y) _mm_sub_epi32((x),(y))
535 #define SAR(x,y) _mm_srai_epi32((x),(y))
537 #define ADD3(x,y,z) ADD(ADD(x,y),z)
540 void opj_idwt53_v_final_memcpy(OPJ_INT32* tiledp_col,
541 const OPJ_INT32* tmp,
546 for (i = 0; i < len; ++i) {
547 /* A memcpy(&tiledp_col[i * stride + 0],
548 &tmp[PARALLEL_COLS_53 * i + 0],
549 PARALLEL_COLS_53 * sizeof(OPJ_INT32))
550 would do but would be a tiny bit slower.
551 We can take here advantage of our knowledge of alignment */
552 STOREU(&tiledp_col[(OPJ_SIZE_T)i * stride + 0],
553 LOAD(&tmp[PARALLEL_COLS_53 * i + 0]));
554 STOREU(&tiledp_col[(OPJ_SIZE_T)i * stride + VREG_INT_COUNT],
555 LOAD(&tmp[PARALLEL_COLS_53 * i + VREG_INT_COUNT]));
559 /** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
560 * 16 in AVX2, when top-most pixel is on even coordinate */
561 static void opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(
565 OPJ_INT32* tiledp_col,
566 const OPJ_SIZE_T stride)
568 const OPJ_INT32* in_even = &tiledp_col[0];
569 const OPJ_INT32* in_odd = &tiledp_col[(OPJ_SIZE_T)sn * stride];
573 VREG d1c_0, d1n_0, s1n_0, s0c_0, s0n_0;
574 VREG d1c_1, d1n_1, s1n_1, s0c_1, s0n_1;
575 const VREG two = LOAD_CST(2);
579 assert(PARALLEL_COLS_53 == 16);
580 assert(VREG_INT_COUNT == 8);
582 assert(PARALLEL_COLS_53 == 8);
583 assert(VREG_INT_COUNT == 4);
586 /* Note: loads of input even/odd values must be done in a unaligned */
587 /* fashion. But stores in tmp can be done with aligned store, since */
588 /* the temporary buffer is properly aligned */
589 assert((OPJ_SIZE_T)tmp % (sizeof(OPJ_INT32) * VREG_INT_COUNT) == 0);
591 s1n_0 = LOADU(in_even + 0);
592 s1n_1 = LOADU(in_even + VREG_INT_COUNT);
593 d1n_0 = LOADU(in_odd);
594 d1n_1 = LOADU(in_odd + VREG_INT_COUNT);
596 /* s0n = s1n - ((d1n + 1) >> 1); <==> */
597 /* s0n = s1n - ((d1n + d1n + 2) >> 2); */
598 s0n_0 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), 2));
599 s0n_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), 2));
601 for (i = 0, j = 1; i < (len - 3); i += 2, j++) {
607 s1n_0 = LOADU(in_even + j * stride);
608 s1n_1 = LOADU(in_even + j * stride + VREG_INT_COUNT);
609 d1n_0 = LOADU(in_odd + j * stride);
610 d1n_1 = LOADU(in_odd + j * stride + VREG_INT_COUNT);
612 /*s0n = s1n - ((d1c + d1n + 2) >> 2);*/
613 s0n_0 = SUB(s1n_0, SAR(ADD3(d1c_0, d1n_0, two), 2));
614 s0n_1 = SUB(s1n_1, SAR(ADD3(d1c_1, d1n_1, two), 2));
616 STORE(tmp + PARALLEL_COLS_53 * (i + 0), s0c_0);
617 STORE(tmp + PARALLEL_COLS_53 * (i + 0) + VREG_INT_COUNT, s0c_1);
619 /* d1c + ((s0c + s0n) >> 1) */
620 STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 0,
621 ADD(d1c_0, SAR(ADD(s0c_0, s0n_0), 1)));
622 STORE(tmp + PARALLEL_COLS_53 * (i + 1) + VREG_INT_COUNT,
623 ADD(d1c_1, SAR(ADD(s0c_1, s0n_1), 1)));
626 STORE(tmp + PARALLEL_COLS_53 * (i + 0) + 0, s0n_0);
627 STORE(tmp + PARALLEL_COLS_53 * (i + 0) + VREG_INT_COUNT, s0n_1);
630 VREG tmp_len_minus_1;
631 s1n_0 = LOADU(in_even + (OPJ_SIZE_T)((len - 1) / 2) * stride);
632 /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */
633 tmp_len_minus_1 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), 2));
634 STORE(tmp + PARALLEL_COLS_53 * (len - 1), tmp_len_minus_1);
635 /* d1n + ((s0n + tmp_len_minus_1) >> 1) */
636 STORE(tmp + PARALLEL_COLS_53 * (len - 2),
637 ADD(d1n_0, SAR(ADD(s0n_0, tmp_len_minus_1), 1)));
639 s1n_1 = LOADU(in_even + (OPJ_SIZE_T)((len - 1) / 2) * stride + VREG_INT_COUNT);
640 /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */
641 tmp_len_minus_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), 2));
642 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT,
644 /* d1n + ((s0n + tmp_len_minus_1) >> 1) */
645 STORE(tmp + PARALLEL_COLS_53 * (len - 2) + VREG_INT_COUNT,
646 ADD(d1n_1, SAR(ADD(s0n_1, tmp_len_minus_1), 1)));
650 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0,
652 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT,
656 opj_idwt53_v_final_memcpy(tiledp_col, tmp, len, stride);
660 /** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
661 * 16 in AVX2, when top-most pixel is on odd coordinate */
662 static void opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(
666 OPJ_INT32* tiledp_col,
667 const OPJ_SIZE_T stride)
672 VREG s1_0, s2_0, dc_0, dn_0;
673 VREG s1_1, s2_1, dc_1, dn_1;
674 const VREG two = LOAD_CST(2);
676 const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride];
677 const OPJ_INT32* in_odd = &tiledp_col[0];
681 assert(PARALLEL_COLS_53 == 16);
682 assert(VREG_INT_COUNT == 8);
684 assert(PARALLEL_COLS_53 == 8);
685 assert(VREG_INT_COUNT == 4);
688 /* Note: loads of input even/odd values must be done in a unaligned */
689 /* fashion. But stores in tmp can be done with aligned store, since */
690 /* the temporary buffer is properly aligned */
691 assert((OPJ_SIZE_T)tmp % (sizeof(OPJ_INT32) * VREG_INT_COUNT) == 0);
693 s1_0 = LOADU(in_even + stride);
694 /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */
695 dc_0 = SUB(LOADU(in_odd + 0),
696 SAR(ADD3(LOADU(in_even + 0), s1_0, two), 2));
697 STORE(tmp + PARALLEL_COLS_53 * 0, ADD(LOADU(in_even + 0), dc_0));
699 s1_1 = LOADU(in_even + stride + VREG_INT_COUNT);
700 /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */
701 dc_1 = SUB(LOADU(in_odd + VREG_INT_COUNT),
702 SAR(ADD3(LOADU(in_even + VREG_INT_COUNT), s1_1, two), 2));
703 STORE(tmp + PARALLEL_COLS_53 * 0 + VREG_INT_COUNT,
704 ADD(LOADU(in_even + VREG_INT_COUNT), dc_1));
706 for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
708 s2_0 = LOADU(in_even + (j + 1) * stride);
709 s2_1 = LOADU(in_even + (j + 1) * stride + VREG_INT_COUNT);
711 /* dn = in_odd[j * stride] - ((s1 + s2 + 2) >> 2); */
712 dn_0 = SUB(LOADU(in_odd + j * stride),
713 SAR(ADD3(s1_0, s2_0, two), 2));
714 dn_1 = SUB(LOADU(in_odd + j * stride + VREG_INT_COUNT),
715 SAR(ADD3(s1_1, s2_1, two), 2));
717 STORE(tmp + PARALLEL_COLS_53 * i, dc_0);
718 STORE(tmp + PARALLEL_COLS_53 * i + VREG_INT_COUNT, dc_1);
720 /* tmp[i + 1] = s1 + ((dn + dc) >> 1); */
721 STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 0,
722 ADD(s1_0, SAR(ADD(dn_0, dc_0), 1)));
723 STORE(tmp + PARALLEL_COLS_53 * (i + 1) + VREG_INT_COUNT,
724 ADD(s1_1, SAR(ADD(dn_1, dc_1), 1)));
731 STORE(tmp + PARALLEL_COLS_53 * i, dc_0);
732 STORE(tmp + PARALLEL_COLS_53 * i + VREG_INT_COUNT, dc_1);
735 /*dn = in_odd[(len / 2 - 1) * stride] - ((s1 + 1) >> 1); */
736 dn_0 = SUB(LOADU(in_odd + (OPJ_SIZE_T)(len / 2 - 1) * stride),
737 SAR(ADD3(s1_0, s1_0, two), 2));
738 dn_1 = SUB(LOADU(in_odd + (OPJ_SIZE_T)(len / 2 - 1) * stride + VREG_INT_COUNT),
739 SAR(ADD3(s1_1, s1_1, two), 2));
741 /* tmp[len - 2] = s1 + ((dn + dc) >> 1); */
742 STORE(tmp + PARALLEL_COLS_53 * (len - 2) + 0,
743 ADD(s1_0, SAR(ADD(dn_0, dc_0), 1)));
744 STORE(tmp + PARALLEL_COLS_53 * (len - 2) + VREG_INT_COUNT,
745 ADD(s1_1, SAR(ADD(dn_1, dc_1), 1)));
747 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, dn_0);
748 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT, dn_1);
750 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, ADD(s1_0, dc_0));
751 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT,
755 opj_idwt53_v_final_memcpy(tiledp_col, tmp, len, stride);
769 #endif /* (defined(__SSE2__) || defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION) */
771 #if !defined(STANDARD_SLOW_VERSION)
772 /** Vertical inverse 5x3 wavelet transform for one column, when top-most
773 * pixel is on even coordinate */
774 static void opj_idwt3_v_cas0(OPJ_INT32* tmp,
777 OPJ_INT32* tiledp_col,
778 const OPJ_SIZE_T stride)
781 OPJ_INT32 d1c, d1n, s1n, s0c, s0n;
785 /* Performs lifting in one single iteration. Saves memory */
786 /* accesses and explicit interleaving. */
789 d1n = tiledp_col[(OPJ_SIZE_T)sn * stride];
790 s0n = s1n - ((d1n + 1) >> 1);
792 for (i = 0, j = 0; i < (len - 3); i += 2, j++) {
796 s1n = tiledp_col[(OPJ_SIZE_T)(j + 1) * stride];
797 d1n = tiledp_col[(OPJ_SIZE_T)(sn + j + 1) * stride];
799 s0n = s1n - ((d1c + d1n + 2) >> 2);
802 tmp[i + 1] = d1c + ((s0c + s0n) >> 1);
809 tiledp_col[(OPJ_SIZE_T)((len - 1) / 2) * stride] -
811 tmp[len - 2] = d1n + ((s0n + tmp[len - 1]) >> 1);
813 tmp[len - 1] = d1n + s0n;
816 for (i = 0; i < len; ++i) {
817 tiledp_col[(OPJ_SIZE_T)i * stride] = tmp[i];
822 /** Vertical inverse 5x3 wavelet transform for one column, when top-most
823 * pixel is on odd coordinate */
824 static void opj_idwt3_v_cas1(OPJ_INT32* tmp,
827 OPJ_INT32* tiledp_col,
828 const OPJ_SIZE_T stride)
831 OPJ_INT32 s1, s2, dc, dn;
832 const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride];
833 const OPJ_INT32* in_odd = &tiledp_col[0];
837 /* Performs lifting in one single iteration. Saves memory */
838 /* accesses and explicit interleaving. */
840 s1 = in_even[stride];
841 dc = in_odd[0] - ((in_even[0] + s1 + 2) >> 2);
842 tmp[0] = in_even[0] + dc;
843 for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
845 s2 = in_even[(OPJ_SIZE_T)(j + 1) * stride];
847 dn = in_odd[(OPJ_SIZE_T)j * stride] - ((s1 + s2 + 2) >> 2);
849 tmp[i + 1] = s1 + ((dn + dc) >> 1);
856 dn = in_odd[(OPJ_SIZE_T)(len / 2 - 1) * stride] - ((s1 + 1) >> 1);
857 tmp[len - 2] = s1 + ((dn + dc) >> 1);
860 tmp[len - 1] = s1 + dc;
863 for (i = 0; i < len; ++i) {
864 tiledp_col[(OPJ_SIZE_T)i * stride] = tmp[i];
867 #endif /* !defined(STANDARD_SLOW_VERSION) */
870 /* Inverse vertical 5-3 wavelet transform in 1-D for several columns. */
872 /* Performs interleave, inverse wavelet transform and copy back to buffer */
873 static void opj_idwt53_v(const opj_dwt_t *dwt,
874 OPJ_INT32* tiledp_col,
878 #ifdef STANDARD_SLOW_VERSION
879 /* For documentation purpose */
881 for (c = 0; c < nb_cols; c ++) {
882 opj_dwt_interleave_v(dwt, tiledp_col + c, stride);
883 opj_dwt_decode_1(dwt);
884 for (k = 0; k < dwt->sn + dwt->dn; ++k) {
885 tiledp_col[c + k * stride] = dwt->mem[k];
889 const OPJ_INT32 sn = dwt->sn;
890 const OPJ_INT32 len = sn + dwt->dn;
892 /* If len == 1, unmodified value */
894 #if (defined(__SSE2__) || defined(__AVX2__))
895 if (len > 1 && nb_cols == PARALLEL_COLS_53) {
896 /* Same as below general case, except that thanks to SSE2/AVX2 */
897 /* we can efficiently process 8/16 columns in parallel */
898 opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride);
904 for (c = 0; c < nb_cols; c++, tiledp_col++) {
905 opj_idwt3_v_cas0(dwt->mem, sn, len, tiledp_col, stride);
912 for (c = 0; c < nb_cols; c++, tiledp_col++) {
920 OPJ_INT32* out = dwt->mem;
921 for (c = 0; c < nb_cols; c++, tiledp_col++) {
923 const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride];
924 const OPJ_INT32* in_odd = &tiledp_col[0];
926 out[1] = in_odd[0] - ((in_even[0] + 1) >> 1);
927 out[0] = in_even[0] + out[1];
929 for (i = 0; i < len; ++i) {
930 tiledp_col[(OPJ_SIZE_T)i * stride] = out[i];
937 #if (defined(__SSE2__) || defined(__AVX2__))
938 if (len > 2 && nb_cols == PARALLEL_COLS_53) {
939 /* Same as below general case, except that thanks to SSE2/AVX2 */
940 /* we can efficiently process 8/16 columns in parallel */
941 opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride);
947 for (c = 0; c < nb_cols; c++, tiledp_col++) {
948 opj_idwt3_v_cas1(dwt->mem, sn, len, tiledp_col, stride);
957 static void opj_dwt_encode_step1(OPJ_FLOAT32* fw,
962 for (; i < end; ++i) {
968 static void opj_dwt_encode_step1_combined(OPJ_FLOAT32* fw,
971 const OPJ_FLOAT32 c1,
972 const OPJ_FLOAT32 c2)
975 const OPJ_UINT32 iters_common = opj_uint_min(iters_c1, iters_c2);
976 assert((((OPJ_SIZE_T)fw) & 0xf) == 0);
977 assert(opj_int_abs((OPJ_INT32)iters_c1 - (OPJ_INT32)iters_c2) <= 1);
978 for (; i + 3 < iters_common; i += 4) {
980 const __m128 vcst = _mm_set_ps(c2, c1, c2, c1);
981 *(__m128*)fw = _mm_mul_ps(*(__m128*)fw, vcst);
982 *(__m128*)(fw + 4) = _mm_mul_ps(*(__m128*)(fw + 4), vcst);
995 for (; i < iters_common; i++) {
1002 } else if (i < iters_c2) {
1009 static void opj_dwt_encode_step2(OPJ_FLOAT32* fl, OPJ_FLOAT32* fw,
1015 OPJ_UINT32 imax = opj_uint_min(end, m);
1017 fw[-1] += (fl[0] + fw[0]) * c;
1020 for (; i + 3 < imax; i += 4) {
1021 fw[-1] += (fw[-2] + fw[0]) * c;
1022 fw[1] += (fw[0] + fw[2]) * c;
1023 fw[3] += (fw[2] + fw[4]) * c;
1024 fw[5] += (fw[4] + fw[6]) * c;
1027 for (; i < imax; ++i) {
1028 fw[-1] += (fw[-2] + fw[0]) * c;
1033 assert(m + 1 == end);
1034 fw[-1] += (2 * fw[-2]) * c;
1038 static void opj_dwt_encode_1_real(void *aIn, OPJ_INT32 dn, OPJ_INT32 sn,
1041 OPJ_FLOAT32* w = (OPJ_FLOAT32*)aIn;
1043 assert(dn + sn > 1);
1051 opj_dwt_encode_step2(w + a, w + b + 1,
1053 (OPJ_UINT32)opj_int_min(dn, sn - b),
1055 opj_dwt_encode_step2(w + b, w + a + 1,
1057 (OPJ_UINT32)opj_int_min(sn, dn - a),
1059 opj_dwt_encode_step2(w + a, w + b + 1,
1061 (OPJ_UINT32)opj_int_min(dn, sn - b),
1063 opj_dwt_encode_step2(w + b, w + a + 1,
1065 (OPJ_UINT32)opj_int_min(sn, dn - a),
1068 opj_dwt_encode_step1(w + b, (OPJ_UINT32)dn,
1070 opj_dwt_encode_step1(w + a, (OPJ_UINT32)sn,
1074 opj_dwt_encode_step1_combined(w,
1080 opj_dwt_encode_step1_combined(w,
1089 static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps,
1090 opj_stepsize_t *bandno_stepsize)
1093 p = opj_int_floorlog2(stepsize) - 13;
1094 n = 11 - opj_int_floorlog2(stepsize);
1095 bandno_stepsize->mant = (n < 0 ? stepsize >> -n : stepsize << n) & 0x7ff;
1096 bandno_stepsize->expn = numbps - p;
1100 ==========================================================
1102 ==========================================================
1105 /** Process one line for the horizontal pass of the 5x3 forward transform */
1107 void opj_dwt_encode_and_deinterleave_h_one_row(void* rowIn,
1112 OPJ_INT32* OPJ_RESTRICT row = (OPJ_INT32*)rowIn;
1113 OPJ_INT32* OPJ_RESTRICT tmp = (OPJ_INT32*)tmpIn;
1114 const OPJ_INT32 sn = (OPJ_INT32)((width + (even ? 1 : 0)) >> 1);
1115 const OPJ_INT32 dn = (OPJ_INT32)(width - (OPJ_UINT32)sn);
1120 for (i = 0; i < sn - 1; i++) {
1121 tmp[sn + i] = row[2 * i + 1] - ((row[(i) * 2] + row[(i + 1) * 2]) >> 1);
1123 if ((width % 2) == 0) {
1124 tmp[sn + i] = row[2 * i + 1] - row[(i) * 2];
1126 row[0] += (tmp[sn] + tmp[sn] + 2) >> 2;
1127 for (i = 1; i < dn; i++) {
1128 row[i] = row[2 * i] + ((tmp[sn + (i - 1)] + tmp[sn + i] + 2) >> 2);
1130 if ((width % 2) == 1) {
1131 row[i] = row[2 * i] + ((tmp[sn + (i - 1)] + tmp[sn + (i - 1)] + 2) >> 2);
1133 memcpy(row + sn, tmp + sn, (OPJ_SIZE_T)dn * sizeof(OPJ_INT32));
1140 tmp[sn + 0] = row[0] - row[1];
1141 for (i = 1; i < sn; i++) {
1142 tmp[sn + i] = row[2 * i] - ((row[2 * i + 1] + row[2 * (i - 1) + 1]) >> 1);
1144 if ((width % 2) == 1) {
1145 tmp[sn + i] = row[2 * i] - row[2 * (i - 1) + 1];
1148 for (i = 0; i < dn - 1; i++) {
1149 row[i] = row[2 * i + 1] + ((tmp[sn + i] + tmp[sn + i + 1] + 2) >> 2);
1151 if ((width % 2) == 0) {
1152 row[i] = row[2 * i + 1] + ((tmp[sn + i] + tmp[sn + i] + 2) >> 2);
1154 memcpy(row + sn, tmp + sn, (OPJ_SIZE_T)dn * sizeof(OPJ_INT32));
1159 /** Process one line for the horizontal pass of the 9x7 forward transform */
1161 void opj_dwt_encode_and_deinterleave_h_one_row_real(void* rowIn,
1166 OPJ_FLOAT32* OPJ_RESTRICT row = (OPJ_FLOAT32*)rowIn;
1167 OPJ_FLOAT32* OPJ_RESTRICT tmp = (OPJ_FLOAT32*)tmpIn;
1168 const OPJ_INT32 sn = (OPJ_INT32)((width + (even ? 1 : 0)) >> 1);
1169 const OPJ_INT32 dn = (OPJ_INT32)(width - (OPJ_UINT32)sn);
1173 memcpy(tmp, row, width * sizeof(OPJ_FLOAT32));
1174 opj_dwt_encode_1_real(tmp, dn, sn, even ? 0 : 1);
1175 opj_dwt_deinterleave_h((OPJ_INT32 * OPJ_RESTRICT)tmp,
1176 (OPJ_INT32 * OPJ_RESTRICT)row,
1177 dn, sn, even ? 0 : 1);
1182 OPJ_UINT32 rw; /* Width of the resolution to process */
1183 OPJ_UINT32 w; /* Width of tiledp */
1184 OPJ_INT32 * OPJ_RESTRICT tiledp;
1187 opj_encode_and_deinterleave_h_one_row_fnptr_type p_function;
1188 } opj_dwt_encode_h_job_t;
1190 static void opj_dwt_encode_h_func(void* user_data, opj_tls_t* tls)
1193 opj_dwt_encode_h_job_t* job;
1196 job = (opj_dwt_encode_h_job_t*)user_data;
1197 for (j = job->min_j; j < job->max_j; j++) {
1198 OPJ_INT32* OPJ_RESTRICT aj = job->tiledp + j * job->w;
1199 (*job->p_function)(aj, job->h.mem, job->rw,
1200 job->h.cas == 0 ? OPJ_TRUE : OPJ_FALSE);
1203 opj_aligned_free(job->h.mem);
1211 OPJ_INT32 * OPJ_RESTRICT tiledp;
1214 opj_encode_and_deinterleave_v_fnptr_type p_encode_and_deinterleave_v;
1215 } opj_dwt_encode_v_job_t;
1217 static void opj_dwt_encode_v_func(void* user_data, opj_tls_t* tls)
1220 opj_dwt_encode_v_job_t* job;
1223 job = (opj_dwt_encode_v_job_t*)user_data;
1224 for (j = job->min_j; j + NB_ELTS_V8 - 1 < job->max_j; j += NB_ELTS_V8) {
1225 (*job->p_encode_and_deinterleave_v)(job->tiledp + j,
1232 if (j < job->max_j) {
1233 (*job->p_encode_and_deinterleave_v)(job->tiledp + j,
1241 opj_aligned_free(job->v.mem);
1245 /** Fetch up to cols <= NB_ELTS_V8 for each line, and put them in tmpOut */
1246 /* that has a NB_ELTS_V8 interleave factor. */
1247 static void opj_dwt_fetch_cols_vertical_pass(const void *arrayIn,
1250 OPJ_UINT32 stride_width,
1253 const OPJ_INT32* OPJ_RESTRICT array = (const OPJ_INT32 * OPJ_RESTRICT)arrayIn;
1254 OPJ_INT32* OPJ_RESTRICT tmp = (OPJ_INT32 * OPJ_RESTRICT)tmpOut;
1255 if (cols == NB_ELTS_V8) {
1257 for (k = 0; k < height; ++k) {
1258 memcpy(tmp + NB_ELTS_V8 * k,
1259 array + k * stride_width,
1260 NB_ELTS_V8 * sizeof(OPJ_INT32));
1264 for (k = 0; k < height; ++k) {
1266 for (c = 0; c < cols; c++) {
1267 tmp[NB_ELTS_V8 * k + c] = array[c + k * stride_width];
1269 for (; c < NB_ELTS_V8; c++) {
1270 tmp[NB_ELTS_V8 * k + c] = 0;
1276 /* Deinterleave result of forward transform, where cols <= NB_ELTS_V8 */
1277 /* and src contains NB_ELTS_V8 consecutive values for up to NB_ELTS_V8 */
1279 static INLINE void opj_dwt_deinterleave_v_cols(
1280 const OPJ_INT32 * OPJ_RESTRICT src,
1281 OPJ_INT32 * OPJ_RESTRICT dst,
1284 OPJ_UINT32 stride_width,
1290 OPJ_INT32 * OPJ_RESTRICT l_dest = dst;
1291 const OPJ_INT32 * OPJ_RESTRICT l_src = src + cas * NB_ELTS_V8;
1294 for (k = 0; k < 2; k++) {
1296 if (cols == NB_ELTS_V8) {
1297 memcpy(l_dest, l_src, NB_ELTS_V8 * sizeof(OPJ_INT32));
1302 l_dest[c] = l_src[c];
1305 l_dest[c] = l_src[c];
1308 l_dest[c] = l_src[c];
1311 l_dest[c] = l_src[c];
1314 l_dest[c] = l_src[c];
1317 l_dest[c] = l_src[c];
1320 l_dest[c] = l_src[c];
1324 l_dest += stride_width;
1325 l_src += 2 * NB_ELTS_V8;
1328 l_dest = dst + (OPJ_SIZE_T)sn * (OPJ_SIZE_T)stride_width;
1329 l_src = src + (1 - cas) * NB_ELTS_V8;
1335 /* Forward 5-3 transform, for the vertical pass, processing cols columns */
1336 /* where cols <= NB_ELTS_V8 */
1337 static void opj_dwt_encode_and_deinterleave_v(
1342 OPJ_UINT32 stride_width,
1345 OPJ_INT32* OPJ_RESTRICT array = (OPJ_INT32 * OPJ_RESTRICT)arrayIn;
1346 OPJ_INT32* OPJ_RESTRICT tmp = (OPJ_INT32 * OPJ_RESTRICT)tmpIn;
1347 const OPJ_UINT32 sn = (height + (even ? 1 : 0)) >> 1;
1348 const OPJ_UINT32 dn = height - sn;
1350 opj_dwt_fetch_cols_vertical_pass(arrayIn, tmpIn, height, stride_width, cols);
1352 #define OPJ_Sc(i) tmp[(i)*2* NB_ELTS_V8 + c]
1353 #define OPJ_Dc(i) tmp[((1+(i)*2))* NB_ELTS_V8 + c]
1359 for (c = 0; c < NB_ELTS_V8; c++) {
1368 __m128i xmm_Si_0 = *(const __m128i*)(tmp + 4 * 0);
1369 __m128i xmm_Si_1 = *(const __m128i*)(tmp + 4 * 1);
1370 for (; i + 1 < sn; i++) {
1371 __m128i xmm_Sip1_0 = *(const __m128i*)(tmp +
1372 (i + 1) * 2 * NB_ELTS_V8 + 4 * 0);
1373 __m128i xmm_Sip1_1 = *(const __m128i*)(tmp +
1374 (i + 1) * 2 * NB_ELTS_V8 + 4 * 1);
1375 __m128i xmm_Di_0 = *(const __m128i*)(tmp +
1376 (1 + i * 2) * NB_ELTS_V8 + 4 * 0);
1377 __m128i xmm_Di_1 = *(const __m128i*)(tmp +
1378 (1 + i * 2) * NB_ELTS_V8 + 4 * 1);
1379 xmm_Di_0 = _mm_sub_epi32(xmm_Di_0,
1380 _mm_srai_epi32(_mm_add_epi32(xmm_Si_0, xmm_Sip1_0), 1));
1381 xmm_Di_1 = _mm_sub_epi32(xmm_Di_1,
1382 _mm_srai_epi32(_mm_add_epi32(xmm_Si_1, xmm_Sip1_1), 1));
1383 *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 0) = xmm_Di_0;
1384 *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 1) = xmm_Di_1;
1385 xmm_Si_0 = xmm_Sip1_0;
1386 xmm_Si_1 = xmm_Sip1_1;
1389 if (((height) % 2) == 0) {
1390 for (c = 0; c < NB_ELTS_V8; c++) {
1391 OPJ_Dc(i) -= OPJ_Sc(i);
1394 for (c = 0; c < NB_ELTS_V8; c++) {
1395 OPJ_Sc(0) += (OPJ_Dc(0) + OPJ_Dc(0) + 2) >> 2;
1399 __m128i xmm_Dim1_0 = *(const __m128i*)(tmp + (1 +
1400 (i - 1) * 2) * NB_ELTS_V8 + 4 * 0);
1401 __m128i xmm_Dim1_1 = *(const __m128i*)(tmp + (1 +
1402 (i - 1) * 2) * NB_ELTS_V8 + 4 * 1);
1403 const __m128i xmm_two = _mm_set1_epi32(2);
1404 for (; i < dn; i++) {
1405 __m128i xmm_Di_0 = *(const __m128i*)(tmp +
1406 (1 + i * 2) * NB_ELTS_V8 + 4 * 0);
1407 __m128i xmm_Di_1 = *(const __m128i*)(tmp +
1408 (1 + i * 2) * NB_ELTS_V8 + 4 * 1);
1409 __m128i xmm_Si_0 = *(const __m128i*)(tmp +
1410 (i * 2) * NB_ELTS_V8 + 4 * 0);
1411 __m128i xmm_Si_1 = *(const __m128i*)(tmp +
1412 (i * 2) * NB_ELTS_V8 + 4 * 1);
1413 xmm_Si_0 = _mm_add_epi32(xmm_Si_0,
1414 _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Dim1_0, xmm_Di_0), xmm_two), 2));
1415 xmm_Si_1 = _mm_add_epi32(xmm_Si_1,
1416 _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Dim1_1, xmm_Di_1), xmm_two), 2));
1417 *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 0) = xmm_Si_0;
1418 *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 1) = xmm_Si_1;
1419 xmm_Dim1_0 = xmm_Di_0;
1420 xmm_Dim1_1 = xmm_Di_1;
1423 if (((height) % 2) == 1) {
1424 for (c = 0; c < NB_ELTS_V8; c++) {
1425 OPJ_Sc(i) += (OPJ_Dc(i - 1) + OPJ_Dc(i - 1) + 2) >> 2;
1431 for (c = 0; c < NB_ELTS_V8; c++) {
1432 OPJ_Sc(0) -= OPJ_Dc(0);
1436 __m128i xmm_Dim1_0 = *(const __m128i*)(tmp + (1 +
1437 (i - 1) * 2) * NB_ELTS_V8 + 4 * 0);
1438 __m128i xmm_Dim1_1 = *(const __m128i*)(tmp + (1 +
1439 (i - 1) * 2) * NB_ELTS_V8 + 4 * 1);
1440 for (; i < sn; i++) {
1441 __m128i xmm_Di_0 = *(const __m128i*)(tmp +
1442 (1 + i * 2) * NB_ELTS_V8 + 4 * 0);
1443 __m128i xmm_Di_1 = *(const __m128i*)(tmp +
1444 (1 + i * 2) * NB_ELTS_V8 + 4 * 1);
1445 __m128i xmm_Si_0 = *(const __m128i*)(tmp +
1446 (i * 2) * NB_ELTS_V8 + 4 * 0);
1447 __m128i xmm_Si_1 = *(const __m128i*)(tmp +
1448 (i * 2) * NB_ELTS_V8 + 4 * 1);
1449 xmm_Si_0 = _mm_sub_epi32(xmm_Si_0,
1450 _mm_srai_epi32(_mm_add_epi32(xmm_Di_0, xmm_Dim1_0), 1));
1451 xmm_Si_1 = _mm_sub_epi32(xmm_Si_1,
1452 _mm_srai_epi32(_mm_add_epi32(xmm_Di_1, xmm_Dim1_1), 1));
1453 *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 0) = xmm_Si_0;
1454 *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 1) = xmm_Si_1;
1455 xmm_Dim1_0 = xmm_Di_0;
1456 xmm_Dim1_1 = xmm_Di_1;
1459 if (((height) % 2) == 1) {
1460 for (c = 0; c < NB_ELTS_V8; c++) {
1461 OPJ_Sc(i) -= OPJ_Dc(i - 1);
1466 __m128i xmm_Si_0 = *((const __m128i*)(tmp + 4 * 0));
1467 __m128i xmm_Si_1 = *((const __m128i*)(tmp + 4 * 1));
1468 const __m128i xmm_two = _mm_set1_epi32(2);
1469 for (; i + 1 < dn; i++) {
1470 __m128i xmm_Sip1_0 = *(const __m128i*)(tmp +
1471 (i + 1) * 2 * NB_ELTS_V8 + 4 * 0);
1472 __m128i xmm_Sip1_1 = *(const __m128i*)(tmp +
1473 (i + 1) * 2 * NB_ELTS_V8 + 4 * 1);
1474 __m128i xmm_Di_0 = *(const __m128i*)(tmp +
1475 (1 + i * 2) * NB_ELTS_V8 + 4 * 0);
1476 __m128i xmm_Di_1 = *(const __m128i*)(tmp +
1477 (1 + i * 2) * NB_ELTS_V8 + 4 * 1);
1478 xmm_Di_0 = _mm_add_epi32(xmm_Di_0,
1479 _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Si_0, xmm_Sip1_0), xmm_two), 2));
1480 xmm_Di_1 = _mm_add_epi32(xmm_Di_1,
1481 _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Si_1, xmm_Sip1_1), xmm_two), 2));
1482 *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 0) = xmm_Di_0;
1483 *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 1) = xmm_Di_1;
1484 xmm_Si_0 = xmm_Sip1_0;
1485 xmm_Si_1 = xmm_Sip1_1;
1488 if (((height) % 2) == 0) {
1489 for (c = 0; c < NB_ELTS_V8; c++) {
1490 OPJ_Dc(i) += (OPJ_Sc(i) + OPJ_Sc(i) + 2) >> 2;
1499 for (i = 0; i + 1 < sn; i++) {
1500 for (c = 0; c < NB_ELTS_V8; c++) {
1501 OPJ_Dc(i) -= (OPJ_Sc(i) + OPJ_Sc(i + 1)) >> 1;
1504 if (((height) % 2) == 0) {
1505 for (c = 0; c < NB_ELTS_V8; c++) {
1506 OPJ_Dc(i) -= OPJ_Sc(i);
1509 for (c = 0; c < NB_ELTS_V8; c++) {
1510 OPJ_Sc(0) += (OPJ_Dc(0) + OPJ_Dc(0) + 2) >> 2;
1512 for (i = 1; i < dn; i++) {
1513 for (c = 0; c < NB_ELTS_V8; c++) {
1514 OPJ_Sc(i) += (OPJ_Dc(i - 1) + OPJ_Dc(i) + 2) >> 2;
1517 if (((height) % 2) == 1) {
1518 for (c = 0; c < NB_ELTS_V8; c++) {
1519 OPJ_Sc(i) += (OPJ_Dc(i - 1) + OPJ_Dc(i - 1) + 2) >> 2;
1526 for (c = 0; c < NB_ELTS_V8; c++) {
1531 for (c = 0; c < NB_ELTS_V8; c++) {
1532 OPJ_Sc(0) -= OPJ_Dc(0);
1534 for (i = 1; i < sn; i++) {
1535 for (c = 0; c < NB_ELTS_V8; c++) {
1536 OPJ_Sc(i) -= (OPJ_Dc(i) + OPJ_Dc(i - 1)) >> 1;
1539 if (((height) % 2) == 1) {
1540 for (c = 0; c < NB_ELTS_V8; c++) {
1541 OPJ_Sc(i) -= OPJ_Dc(i - 1);
1544 for (i = 0; i + 1 < dn; i++) {
1545 for (c = 0; c < NB_ELTS_V8; c++) {
1546 OPJ_Dc(i) += (OPJ_Sc(i) + OPJ_Sc(i + 1) + 2) >> 2;
1549 if (((height) % 2) == 0) {
1550 for (c = 0; c < NB_ELTS_V8; c++) {
1551 OPJ_Dc(i) += (OPJ_Sc(i) + OPJ_Sc(i) + 2) >> 2;
1558 if (cols == NB_ELTS_V8) {
1559 opj_dwt_deinterleave_v_cols(tmp, array, (OPJ_INT32)dn, (OPJ_INT32)sn,
1560 stride_width, even ? 0 : 1, NB_ELTS_V8);
1562 opj_dwt_deinterleave_v_cols(tmp, array, (OPJ_INT32)dn, (OPJ_INT32)sn,
1563 stride_width, even ? 0 : 1, cols);
1567 static void opj_v8dwt_encode_step1(OPJ_FLOAT32* fw,
1569 const OPJ_FLOAT32 cst)
1573 __m128* vw = (__m128*) fw;
1574 const __m128 vcst = _mm_set1_ps(cst);
1575 for (i = 0; i < end; ++i) {
1576 vw[0] = _mm_mul_ps(vw[0], vcst);
1577 vw[1] = _mm_mul_ps(vw[1], vcst);
1578 vw += 2 * (NB_ELTS_V8 * sizeof(OPJ_FLOAT32) / sizeof(__m128));
1582 for (i = 0; i < end; ++i) {
1583 for (c = 0; c < NB_ELTS_V8; c++) {
1584 fw[i * 2 * NB_ELTS_V8 + c] *= cst;
1590 static void opj_v8dwt_encode_step2(OPJ_FLOAT32* fl, OPJ_FLOAT32* fw,
1596 OPJ_UINT32 imax = opj_uint_min(end, m);
1598 __m128* vw = (__m128*) fw;
1599 __m128 vcst = _mm_set1_ps(cst);
1601 __m128* vl = (__m128*) fl;
1602 vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vl[0], vw[0]), vcst));
1603 vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vl[1], vw[1]), vcst));
1604 vw += 2 * (NB_ELTS_V8 * sizeof(OPJ_FLOAT32) / sizeof(__m128));
1607 for (; i < imax; ++i) {
1608 vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vw[-4], vw[0]), vcst));
1609 vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vw[-3], vw[1]), vcst));
1610 vw += 2 * (NB_ELTS_V8 * sizeof(OPJ_FLOAT32) / sizeof(__m128));
1614 assert(m + 1 == end);
1615 vcst = _mm_add_ps(vcst, vcst);
1616 vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(vw[-4], vcst));
1617 vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(vw[-3], vcst));
1622 for (c = 0; c < NB_ELTS_V8; c++) {
1623 fw[-1 * NB_ELTS_V8 + c] += (fl[0 * NB_ELTS_V8 + c] + fw[0 * NB_ELTS_V8 + c]) *
1626 fw += 2 * NB_ELTS_V8;
1628 for (; i < imax; ++i) {
1629 for (c = 0; c < NB_ELTS_V8; c++) {
1630 fw[-1 * NB_ELTS_V8 + c] += (fw[-2 * NB_ELTS_V8 + c] + fw[0 * NB_ELTS_V8 + c]) *
1633 fw += 2 * NB_ELTS_V8;
1637 assert(m + 1 == end);
1638 for (c = 0; c < NB_ELTS_V8; c++) {
1639 fw[-1 * NB_ELTS_V8 + c] += (2 * fw[-2 * NB_ELTS_V8 + c]) * cst;
1645 /* Forward 9-7 transform, for the vertical pass, processing cols columns */
1646 /* where cols <= NB_ELTS_V8 */
1647 static void opj_dwt_encode_and_deinterleave_v_real(
1652 OPJ_UINT32 stride_width,
1655 OPJ_FLOAT32* OPJ_RESTRICT array = (OPJ_FLOAT32 * OPJ_RESTRICT)arrayIn;
1656 OPJ_FLOAT32* OPJ_RESTRICT tmp = (OPJ_FLOAT32 * OPJ_RESTRICT)tmpIn;
1657 const OPJ_INT32 sn = (OPJ_INT32)((height + (even ? 1 : 0)) >> 1);
1658 const OPJ_INT32 dn = (OPJ_INT32)(height - (OPJ_UINT32)sn);
1665 opj_dwt_fetch_cols_vertical_pass(arrayIn, tmpIn, height, stride_width, cols);
1674 opj_v8dwt_encode_step2(tmp + a * NB_ELTS_V8,
1675 tmp + (b + 1) * NB_ELTS_V8,
1677 (OPJ_UINT32)opj_int_min(dn, sn - b),
1679 opj_v8dwt_encode_step2(tmp + b * NB_ELTS_V8,
1680 tmp + (a + 1) * NB_ELTS_V8,
1682 (OPJ_UINT32)opj_int_min(sn, dn - a),
1684 opj_v8dwt_encode_step2(tmp + a * NB_ELTS_V8,
1685 tmp + (b + 1) * NB_ELTS_V8,
1687 (OPJ_UINT32)opj_int_min(dn, sn - b),
1689 opj_v8dwt_encode_step2(tmp + b * NB_ELTS_V8,
1690 tmp + (a + 1) * NB_ELTS_V8,
1692 (OPJ_UINT32)opj_int_min(sn, dn - a),
1694 opj_v8dwt_encode_step1(tmp + b * NB_ELTS_V8, (OPJ_UINT32)dn,
1696 opj_v8dwt_encode_step1(tmp + a * NB_ELTS_V8, (OPJ_UINT32)sn,
1700 if (cols == NB_ELTS_V8) {
1701 opj_dwt_deinterleave_v_cols((OPJ_INT32*)tmp,
1703 (OPJ_INT32)dn, (OPJ_INT32)sn,
1704 stride_width, even ? 0 : 1, NB_ELTS_V8);
1706 opj_dwt_deinterleave_v_cols((OPJ_INT32*)tmp,
1708 (OPJ_INT32)dn, (OPJ_INT32)sn,
1709 stride_width, even ? 0 : 1, cols);
1715 /* Forward 5-3 wavelet transform in 2-D. */
1717 static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp,
1718 opj_tcd_tilecomp_t * tilec,
1719 opj_encode_and_deinterleave_v_fnptr_type p_encode_and_deinterleave_v,
1720 opj_encode_and_deinterleave_h_one_row_fnptr_type
1721 p_encode_and_deinterleave_h_one_row)
1728 OPJ_SIZE_T l_data_size;
1730 opj_tcd_resolution_t * l_cur_res = 0;
1731 opj_tcd_resolution_t * l_last_res = 0;
1732 const int num_threads = opj_thread_pool_get_thread_count(tp);
1733 OPJ_INT32 * OPJ_RESTRICT tiledp = tilec->data;
1735 w = (OPJ_UINT32)(tilec->x1 - tilec->x0);
1736 l = (OPJ_INT32)tilec->numresolutions - 1;
1738 l_cur_res = tilec->resolutions + l;
1739 l_last_res = l_cur_res - 1;
1741 l_data_size = opj_dwt_max_resolution(tilec->resolutions, tilec->numresolutions);
1742 /* overflow check */
1743 if (l_data_size > (SIZE_MAX / (NB_ELTS_V8 * sizeof(OPJ_INT32)))) {
1744 /* FIXME event manager error callback */
1747 l_data_size *= NB_ELTS_V8 * sizeof(OPJ_INT32);
1748 bj = (OPJ_INT32*)opj_aligned_32_malloc(l_data_size);
1749 /* l_data_size is equal to 0 when numresolutions == 1 but bj is not used */
1750 /* in that case, so do not error out */
1751 if (l_data_size != 0 && ! bj) {
1758 OPJ_UINT32 rw; /* width of the resolution level computed */
1759 OPJ_UINT32 rh; /* height of the resolution level computed */
1761 rw1; /* width of the resolution level once lower than computed one */
1763 rh1; /* height of the resolution level once lower than computed one */
1764 OPJ_INT32 cas_col; /* 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering */
1765 OPJ_INT32 cas_row; /* 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering */
1768 rw = (OPJ_UINT32)(l_cur_res->x1 - l_cur_res->x0);
1769 rh = (OPJ_UINT32)(l_cur_res->y1 - l_cur_res->y0);
1770 rw1 = (OPJ_UINT32)(l_last_res->x1 - l_last_res->x0);
1771 rh1 = (OPJ_UINT32)(l_last_res->y1 - l_last_res->y0);
1773 cas_row = l_cur_res->x0 & 1;
1774 cas_col = l_cur_res->y0 & 1;
1776 sn = (OPJ_INT32)rh1;
1777 dn = (OPJ_INT32)(rh - rh1);
1779 /* Perform vertical pass */
1780 if (num_threads <= 1 || rw < 2 * NB_ELTS_V8) {
1781 for (j = 0; j + NB_ELTS_V8 - 1 < rw; j += NB_ELTS_V8) {
1782 p_encode_and_deinterleave_v(tiledp + j,
1790 p_encode_and_deinterleave_v(tiledp + j,
1798 OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
1801 if (rw < num_jobs) {
1804 step_j = ((rw / num_jobs) / NB_ELTS_V8) * NB_ELTS_V8;
1806 for (j = 0; j < num_jobs; j++) {
1807 opj_dwt_encode_v_job_t* job;
1809 job = (opj_dwt_encode_v_job_t*) opj_malloc(sizeof(opj_dwt_encode_v_job_t));
1811 opj_thread_pool_wait_completion(tp, 0);
1812 opj_aligned_free(bj);
1815 job->v.mem = (OPJ_INT32*)opj_aligned_32_malloc(l_data_size);
1817 opj_thread_pool_wait_completion(tp, 0);
1819 opj_aligned_free(bj);
1824 job->v.cas = cas_col;
1827 job->tiledp = tiledp;
1828 job->min_j = j * step_j;
1829 job->max_j = (j + 1 == num_jobs) ? rw : (j + 1) * step_j;
1830 job->p_encode_and_deinterleave_v = p_encode_and_deinterleave_v;
1831 opj_thread_pool_submit_job(tp, opj_dwt_encode_v_func, job);
1833 opj_thread_pool_wait_completion(tp, 0);
1836 sn = (OPJ_INT32)rw1;
1837 dn = (OPJ_INT32)(rw - rw1);
1839 /* Perform horizontal pass */
1840 if (num_threads <= 1 || rh <= 1) {
1841 for (j = 0; j < rh; j++) {
1842 OPJ_INT32* OPJ_RESTRICT aj = tiledp + j * w;
1843 (*p_encode_and_deinterleave_h_one_row)(aj, bj, rw,
1844 cas_row == 0 ? OPJ_TRUE : OPJ_FALSE);
1847 OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
1850 if (rh < num_jobs) {
1853 step_j = (rh / num_jobs);
1855 for (j = 0; j < num_jobs; j++) {
1856 opj_dwt_encode_h_job_t* job;
1858 job = (opj_dwt_encode_h_job_t*) opj_malloc(sizeof(opj_dwt_encode_h_job_t));
1860 opj_thread_pool_wait_completion(tp, 0);
1861 opj_aligned_free(bj);
1864 job->h.mem = (OPJ_INT32*)opj_aligned_32_malloc(l_data_size);
1866 opj_thread_pool_wait_completion(tp, 0);
1868 opj_aligned_free(bj);
1873 job->h.cas = cas_row;
1876 job->tiledp = tiledp;
1877 job->min_j = j * step_j;
1878 job->max_j = (j + 1U) * step_j; /* this can overflow */
1879 if (j == (num_jobs - 1U)) { /* this will take care of the overflow */
1882 job->p_function = p_encode_and_deinterleave_h_one_row;
1883 opj_thread_pool_submit_job(tp, opj_dwt_encode_h_func, job);
1885 opj_thread_pool_wait_completion(tp, 0);
1888 l_cur_res = l_last_res;
1893 opj_aligned_free(bj);
1897 /* Forward 5-3 wavelet transform in 2-D. */
1899 OPJ_BOOL opj_dwt_encode(opj_tcd_t *p_tcd,
1900 opj_tcd_tilecomp_t * tilec)
1902 return opj_dwt_encode_procedure(p_tcd->thread_pool, tilec,
1903 opj_dwt_encode_and_deinterleave_v,
1904 opj_dwt_encode_and_deinterleave_h_one_row);
1908 /* Inverse 5-3 wavelet transform in 2-D. */
1910 OPJ_BOOL opj_dwt_decode(opj_tcd_t *p_tcd, opj_tcd_tilecomp_t* tilec,
1913 if (p_tcd->whole_tile_decoding) {
1914 return opj_dwt_decode_tile(p_tcd->thread_pool, tilec, numres);
1916 return opj_dwt_decode_partial_tile(tilec, numres);
1921 /* Get norm of 5-3 wavelet. */
1923 OPJ_FLOAT64 opj_dwt_getnorm(OPJ_UINT32 level, OPJ_UINT32 orient)
1925 /* FIXME ! This is just a band-aid to avoid a buffer overflow */
1926 /* but the array should really be extended up to 33 resolution levels */
1927 /* See https://github.com/uclouvain/openjpeg/issues/493 */
1928 if (orient == 0 && level >= 10) {
1930 } else if (orient > 0 && level >= 9) {
1933 return opj_dwt_norms[orient][level];
1937 /* Forward 9-7 wavelet transform in 2-D. */
1939 OPJ_BOOL opj_dwt_encode_real(opj_tcd_t *p_tcd,
1940 opj_tcd_tilecomp_t * tilec)
1942 return opj_dwt_encode_procedure(p_tcd->thread_pool, tilec,
1943 opj_dwt_encode_and_deinterleave_v_real,
1944 opj_dwt_encode_and_deinterleave_h_one_row_real);
1948 /* Get norm of 9-7 wavelet. */
1950 OPJ_FLOAT64 opj_dwt_getnorm_real(OPJ_UINT32 level, OPJ_UINT32 orient)
1952 /* FIXME ! This is just a band-aid to avoid a buffer overflow */
1953 /* but the array should really be extended up to 33 resolution levels */
1954 /* See https://github.com/uclouvain/openjpeg/issues/493 */
1955 if (orient == 0 && level >= 10) {
1957 } else if (orient > 0 && level >= 9) {
1960 return opj_dwt_norms_real[orient][level];
1963 void opj_dwt_calc_explicit_stepsizes(opj_tccp_t * tccp, OPJ_UINT32 prec)
1965 OPJ_UINT32 numbands, bandno;
1966 numbands = 3 * tccp->numresolutions - 2;
1967 for (bandno = 0; bandno < numbands; bandno++) {
1968 OPJ_FLOAT64 stepsize;
1969 OPJ_UINT32 resno, level, orient, gain;
1971 resno = (bandno == 0) ? 0 : ((bandno - 1) / 3 + 1);
1972 orient = (bandno == 0) ? 0 : ((bandno - 1) % 3 + 1);
1973 level = tccp->numresolutions - 1 - resno;
1974 gain = (tccp->qmfbid == 0) ? 0 : ((orient == 0) ? 0 : (((orient == 1) ||
1975 (orient == 2)) ? 1 : 2));
1976 if (tccp->qntsty == J2K_CCP_QNTSTY_NOQNT) {
1979 OPJ_FLOAT64 norm = opj_dwt_getnorm_real(level, orient);
1980 stepsize = (1 << (gain)) / norm;
1982 opj_dwt_encode_stepsize((OPJ_INT32) floor(stepsize * 8192.0),
1983 (OPJ_INT32)(prec + gain), &tccp->stepsizes[bandno]);
1988 /* Determine maximum computed resolution level for inverse wavelet transform */
1990 static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r,
1997 if (mr < (w = (OPJ_UINT32)(r->x1 - r->x0))) {
2000 if (mr < (w = (OPJ_UINT32)(r->y1 - r->y0))) {
2011 OPJ_INT32 * OPJ_RESTRICT tiledp;
2014 } opj_dwt_decode_h_job_t;
2016 static void opj_dwt_decode_h_func(void* user_data, opj_tls_t* tls)
2019 opj_dwt_decode_h_job_t* job;
2022 job = (opj_dwt_decode_h_job_t*)user_data;
2023 for (j = job->min_j; j < job->max_j; j++) {
2024 opj_idwt53_h(&job->h, &job->tiledp[j * job->w]);
2027 opj_aligned_free(job->h.mem);
2035 OPJ_INT32 * OPJ_RESTRICT tiledp;
2038 } opj_dwt_decode_v_job_t;
2040 static void opj_dwt_decode_v_func(void* user_data, opj_tls_t* tls)
2043 opj_dwt_decode_v_job_t* job;
2046 job = (opj_dwt_decode_v_job_t*)user_data;
2047 for (j = job->min_j; j + PARALLEL_COLS_53 <= job->max_j;
2048 j += PARALLEL_COLS_53) {
2049 opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_SIZE_T)job->w,
2053 opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_SIZE_T)job->w,
2054 (OPJ_INT32)(job->max_j - j));
2056 opj_aligned_free(job->v.mem);
2062 /* Inverse wavelet transform in 2-D. */
2064 static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp,
2065 opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres)
2070 opj_tcd_resolution_t* tr = tilec->resolutions;
2072 OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 -
2073 tr->x0); /* width of the resolution level computed */
2074 OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 -
2075 tr->y0); /* height of the resolution level computed */
2077 OPJ_UINT32 w = (OPJ_UINT32)(tilec->resolutions[tilec->minimum_num_resolutions -
2079 tilec->resolutions[tilec->minimum_num_resolutions - 1].x0);
2080 OPJ_SIZE_T h_mem_size;
2086 num_threads = opj_thread_pool_get_thread_count(tp);
2087 h_mem_size = opj_dwt_max_resolution(tr, numres);
2088 /* overflow check */
2089 if (h_mem_size > (SIZE_MAX / PARALLEL_COLS_53 / sizeof(OPJ_INT32))) {
2090 /* FIXME event manager error callback */
2093 /* We need PARALLEL_COLS_53 times the height of the array, */
2094 /* since for the vertical pass */
2095 /* we process PARALLEL_COLS_53 columns at a time */
2096 h_mem_size *= PARALLEL_COLS_53 * sizeof(OPJ_INT32);
2097 h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
2099 /* FIXME event manager error callback */
2106 OPJ_INT32 * OPJ_RESTRICT tiledp = tilec->data;
2110 h.sn = (OPJ_INT32)rw;
2111 v.sn = (OPJ_INT32)rh;
2113 rw = (OPJ_UINT32)(tr->x1 - tr->x0);
2114 rh = (OPJ_UINT32)(tr->y1 - tr->y0);
2116 h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
2119 if (num_threads <= 1 || rh <= 1) {
2120 for (j = 0; j < rh; ++j) {
2121 opj_idwt53_h(&h, &tiledp[(OPJ_SIZE_T)j * w]);
2124 OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
2127 if (rh < num_jobs) {
2130 step_j = (rh / num_jobs);
2132 for (j = 0; j < num_jobs; j++) {
2133 opj_dwt_decode_h_job_t* job;
2135 job = (opj_dwt_decode_h_job_t*) opj_malloc(sizeof(opj_dwt_decode_h_job_t));
2137 /* It would be nice to fallback to single thread case, but */
2138 /* unfortunately some jobs may be launched and have modified */
2139 /* tiledp, so it is not practical to recover from that error */
2140 /* FIXME event manager error callback */
2141 opj_thread_pool_wait_completion(tp, 0);
2142 opj_aligned_free(h.mem);
2148 job->tiledp = tiledp;
2149 job->min_j = j * step_j;
2150 job->max_j = (j + 1U) * step_j; /* this can overflow */
2151 if (j == (num_jobs - 1U)) { /* this will take care of the overflow */
2154 job->h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
2156 /* FIXME event manager error callback */
2157 opj_thread_pool_wait_completion(tp, 0);
2159 opj_aligned_free(h.mem);
2162 opj_thread_pool_submit_job(tp, opj_dwt_decode_h_func, job);
2164 opj_thread_pool_wait_completion(tp, 0);
2167 v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
2170 if (num_threads <= 1 || rw <= 1) {
2171 for (j = 0; j + PARALLEL_COLS_53 <= rw;
2172 j += PARALLEL_COLS_53) {
2173 opj_idwt53_v(&v, &tiledp[j], (OPJ_SIZE_T)w, PARALLEL_COLS_53);
2176 opj_idwt53_v(&v, &tiledp[j], (OPJ_SIZE_T)w, (OPJ_INT32)(rw - j));
2179 OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
2182 if (rw < num_jobs) {
2185 step_j = (rw / num_jobs);
2187 for (j = 0; j < num_jobs; j++) {
2188 opj_dwt_decode_v_job_t* job;
2190 job = (opj_dwt_decode_v_job_t*) opj_malloc(sizeof(opj_dwt_decode_v_job_t));
2192 /* It would be nice to fallback to single thread case, but */
2193 /* unfortunately some jobs may be launched and have modified */
2194 /* tiledp, so it is not practical to recover from that error */
2195 /* FIXME event manager error callback */
2196 opj_thread_pool_wait_completion(tp, 0);
2197 opj_aligned_free(v.mem);
2203 job->tiledp = tiledp;
2204 job->min_j = j * step_j;
2205 job->max_j = (j + 1U) * step_j; /* this can overflow */
2206 if (j == (num_jobs - 1U)) { /* this will take care of the overflow */
2209 job->v.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
2211 /* FIXME event manager error callback */
2212 opj_thread_pool_wait_completion(tp, 0);
2214 opj_aligned_free(v.mem);
2217 opj_thread_pool_submit_job(tp, opj_dwt_decode_v_func, job);
2219 opj_thread_pool_wait_completion(tp, 0);
2222 opj_aligned_free(h.mem);
2226 static void opj_dwt_interleave_partial_h(OPJ_INT32 *dest,
2228 opj_sparse_array_int32_t* sa,
2231 OPJ_UINT32 win_l_x0,
2232 OPJ_UINT32 win_l_x1,
2233 OPJ_UINT32 win_h_x0,
2234 OPJ_UINT32 win_h_x1)
2237 ret = opj_sparse_array_int32_read(sa,
2239 win_l_x1, sa_line + 1,
2240 dest + cas + 2 * win_l_x0,
2243 ret = opj_sparse_array_int32_read(sa,
2244 sn + win_h_x0, sa_line,
2245 sn + win_h_x1, sa_line + 1,
2246 dest + 1 - cas + 2 * win_h_x0,
2253 static void opj_dwt_interleave_partial_v(OPJ_INT32 *dest,
2255 opj_sparse_array_int32_t* sa,
2259 OPJ_UINT32 win_l_y0,
2260 OPJ_UINT32 win_l_y1,
2261 OPJ_UINT32 win_h_y0,
2262 OPJ_UINT32 win_h_y1)
2265 ret = opj_sparse_array_int32_read(sa,
2267 sa_col + nb_cols, win_l_y1,
2268 dest + cas * 4 + 2 * 4 * win_l_y0,
2269 1, 2 * 4, OPJ_TRUE);
2271 ret = opj_sparse_array_int32_read(sa,
2272 sa_col, sn + win_h_y0,
2273 sa_col + nb_cols, sn + win_h_y1,
2274 dest + (1 - cas) * 4 + 2 * 4 * win_h_y0,
2275 1, 2 * 4, OPJ_TRUE);
2280 static void opj_dwt_decode_partial_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
2290 if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */
2292 /* Naive version is :
2293 for (i = win_l_x0; i < i_max; i++) {
2294 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
2296 for (i = win_h_x0; i < win_h_x1; i++) {
2297 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
2299 but the compiler doesn't manage to unroll it to avoid bound
2300 checking in OPJ_S_ and OPJ_D_ macros
2307 /* Left-most case */
2308 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
2315 for (; i < i_max; i++) {
2316 /* No bound checking */
2317 OPJ_S(i) -= (OPJ_D(i - 1) + OPJ_D(i) + 2) >> 2;
2319 for (; i < win_l_x1; i++) {
2320 /* Right-most case */
2321 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
2327 OPJ_INT32 i_max = win_h_x1;
2331 for (; i < i_max; i++) {
2332 /* No bound checking */
2333 OPJ_D(i) += (OPJ_S(i) + OPJ_S(i + 1)) >> 1;
2335 for (; i < win_h_x1; i++) {
2336 /* Right-most case */
2337 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
2342 if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */
2345 for (i = win_l_x0; i < win_l_x1; i++) {
2346 OPJ_D(i) -= (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2;
2348 for (i = win_h_x0; i < win_h_x1; i++) {
2349 OPJ_S(i) += (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1;
2355 #define OPJ_S_off(i,off) a[(OPJ_UINT32)(i)*2*4+off]
2356 #define OPJ_D_off(i,off) a[(1+(OPJ_UINT32)(i)*2)*4+off]
2357 #define OPJ_S__off(i,off) ((i)<0?OPJ_S_off(0,off):((i)>=sn?OPJ_S_off(sn-1,off):OPJ_S_off(i,off)))
2358 #define OPJ_D__off(i,off) ((i)<0?OPJ_D_off(0,off):((i)>=dn?OPJ_D_off(dn-1,off):OPJ_D_off(i,off)))
2359 #define OPJ_SS__off(i,off) ((i)<0?OPJ_S_off(0,off):((i)>=dn?OPJ_S_off(dn-1,off):OPJ_S_off(i,off)))
2360 #define OPJ_DD__off(i,off) ((i)<0?OPJ_D_off(0,off):((i)>=sn?OPJ_D_off(sn-1,off):OPJ_D_off(i,off)))
2362 static void opj_dwt_decode_partial_1_parallel(OPJ_INT32 *a,
2364 OPJ_INT32 dn, OPJ_INT32 sn,
2377 if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */
2379 /* Naive version is :
2380 for (i = win_l_x0; i < i_max; i++) {
2381 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
2383 for (i = win_h_x0; i < win_h_x1; i++) {
2384 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
2386 but the compiler doesn't manage to unroll it to avoid bound
2387 checking in OPJ_S_ and OPJ_D_ macros
2394 /* Left-most case */
2395 for (off = 0; off < 4; off++) {
2396 OPJ_S_off(i, off) -= (OPJ_D__off(i - 1, off) + OPJ_D__off(i, off) + 2) >> 2;
2406 if (i + 1 < i_max) {
2407 const __m128i two = _mm_set1_epi32(2);
2408 __m128i Dm1 = _mm_load_si128((__m128i * const)(a + 4 + (i - 1) * 8));
2409 for (; i + 1 < i_max; i += 2) {
2410 /* No bound checking */
2411 __m128i S = _mm_load_si128((__m128i * const)(a + i * 8));
2412 __m128i D = _mm_load_si128((__m128i * const)(a + 4 + i * 8));
2413 __m128i S1 = _mm_load_si128((__m128i * const)(a + (i + 1) * 8));
2414 __m128i D1 = _mm_load_si128((__m128i * const)(a + 4 + (i + 1) * 8));
2415 S = _mm_sub_epi32(S,
2416 _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(Dm1, D), two), 2));
2417 S1 = _mm_sub_epi32(S1,
2418 _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(D, D1), two), 2));
2419 _mm_store_si128((__m128i*)(a + i * 8), S);
2420 _mm_store_si128((__m128i*)(a + (i + 1) * 8), S1);
2426 for (; i < i_max; i++) {
2427 /* No bound checking */
2428 for (off = 0; off < 4; off++) {
2429 OPJ_S_off(i, off) -= (OPJ_D_off(i - 1, off) + OPJ_D_off(i, off) + 2) >> 2;
2432 for (; i < win_l_x1; i++) {
2433 /* Right-most case */
2434 for (off = 0; off < 4; off++) {
2435 OPJ_S_off(i, off) -= (OPJ_D__off(i - 1, off) + OPJ_D__off(i, off) + 2) >> 2;
2442 OPJ_INT32 i_max = win_h_x1;
2448 if (i + 1 < i_max) {
2449 __m128i S = _mm_load_si128((__m128i * const)(a + i * 8));
2450 for (; i + 1 < i_max; i += 2) {
2451 /* No bound checking */
2452 __m128i D = _mm_load_si128((__m128i * const)(a + 4 + i * 8));
2453 __m128i S1 = _mm_load_si128((__m128i * const)(a + (i + 1) * 8));
2454 __m128i D1 = _mm_load_si128((__m128i * const)(a + 4 + (i + 1) * 8));
2455 __m128i S2 = _mm_load_si128((__m128i * const)(a + (i + 2) * 8));
2456 D = _mm_add_epi32(D, _mm_srai_epi32(_mm_add_epi32(S, S1), 1));
2457 D1 = _mm_add_epi32(D1, _mm_srai_epi32(_mm_add_epi32(S1, S2), 1));
2458 _mm_store_si128((__m128i*)(a + 4 + i * 8), D);
2459 _mm_store_si128((__m128i*)(a + 4 + (i + 1) * 8), D1);
2465 for (; i < i_max; i++) {
2466 /* No bound checking */
2467 for (off = 0; off < 4; off++) {
2468 OPJ_D_off(i, off) += (OPJ_S_off(i, off) + OPJ_S_off(i + 1, off)) >> 1;
2471 for (; i < win_h_x1; i++) {
2472 /* Right-most case */
2473 for (off = 0; off < 4; off++) {
2474 OPJ_D_off(i, off) += (OPJ_S__off(i, off) + OPJ_S__off(i + 1, off)) >> 1;
2480 if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */
2481 for (off = 0; off < 4; off++) {
2482 OPJ_S_off(0, off) /= 2;
2485 for (i = win_l_x0; i < win_l_x1; i++) {
2486 for (off = 0; off < 4; off++) {
2487 OPJ_D_off(i, off) -= (OPJ_SS__off(i, off) + OPJ_SS__off(i + 1, off) + 2) >> 2;
2490 for (i = win_h_x0; i < win_h_x1; i++) {
2491 for (off = 0; off < 4; off++) {
2492 OPJ_S_off(i, off) += (OPJ_DD__off(i, off) + OPJ_DD__off(i - 1, off)) >> 1;
2499 static void opj_dwt_get_band_coordinates(opj_tcd_tilecomp_t* tilec,
2511 /* Compute number of decomposition for this band. See table F-1 */
2512 OPJ_UINT32 nb = (resno == 0) ?
2513 tilec->numresolutions - 1 :
2514 tilec->numresolutions - resno;
2515 /* Map above tile-based coordinates to sub-band-based coordinates per */
2516 /* equation B-15 of the standard */
2517 OPJ_UINT32 x0b = bandno & 1;
2518 OPJ_UINT32 y0b = bandno >> 1;
2520 *tbx0 = (nb == 0) ? tcx0 :
2521 (tcx0 <= (1U << (nb - 1)) * x0b) ? 0 :
2522 opj_uint_ceildivpow2(tcx0 - (1U << (nb - 1)) * x0b, nb);
2525 *tby0 = (nb == 0) ? tcy0 :
2526 (tcy0 <= (1U << (nb - 1)) * y0b) ? 0 :
2527 opj_uint_ceildivpow2(tcy0 - (1U << (nb - 1)) * y0b, nb);
2530 *tbx1 = (nb == 0) ? tcx1 :
2531 (tcx1 <= (1U << (nb - 1)) * x0b) ? 0 :
2532 opj_uint_ceildivpow2(tcx1 - (1U << (nb - 1)) * x0b, nb);
2535 *tby1 = (nb == 0) ? tcy1 :
2536 (tcy1 <= (1U << (nb - 1)) * y0b) ? 0 :
2537 opj_uint_ceildivpow2(tcy1 - (1U << (nb - 1)) * y0b, nb);
2541 static void opj_dwt_segment_grow(OPJ_UINT32 filter_width,
2542 OPJ_UINT32 max_size,
2546 *start = opj_uint_subs(*start, filter_width);
2547 *end = opj_uint_adds(*end, filter_width);
2548 *end = opj_uint_min(*end, max_size);
2552 static opj_sparse_array_int32_t* opj_dwt_init_sparse_array(
2553 opj_tcd_tilecomp_t* tilec,
2556 opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]);
2557 OPJ_UINT32 w = (OPJ_UINT32)(tr_max->x1 - tr_max->x0);
2558 OPJ_UINT32 h = (OPJ_UINT32)(tr_max->y1 - tr_max->y0);
2559 OPJ_UINT32 resno, bandno, precno, cblkno;
2560 opj_sparse_array_int32_t* sa = opj_sparse_array_int32_create(
2561 w, h, opj_uint_min(w, 64), opj_uint_min(h, 64));
2566 for (resno = 0; resno < numres; ++resno) {
2567 opj_tcd_resolution_t* res = &tilec->resolutions[resno];
2569 for (bandno = 0; bandno < res->numbands; ++bandno) {
2570 opj_tcd_band_t* band = &res->bands[bandno];
2572 for (precno = 0; precno < res->pw * res->ph; ++precno) {
2573 opj_tcd_precinct_t* precinct = &band->precincts[precno];
2574 for (cblkno = 0; cblkno < precinct->cw * precinct->ch; ++cblkno) {
2575 opj_tcd_cblk_dec_t* cblk = &precinct->cblks.dec[cblkno];
2576 if (cblk->decoded_data != NULL) {
2577 OPJ_UINT32 x = (OPJ_UINT32)(cblk->x0 - band->x0);
2578 OPJ_UINT32 y = (OPJ_UINT32)(cblk->y0 - band->y0);
2579 OPJ_UINT32 cblk_w = (OPJ_UINT32)(cblk->x1 - cblk->x0);
2580 OPJ_UINT32 cblk_h = (OPJ_UINT32)(cblk->y1 - cblk->y0);
2582 if (band->bandno & 1) {
2583 opj_tcd_resolution_t* pres = &tilec->resolutions[resno - 1];
2584 x += (OPJ_UINT32)(pres->x1 - pres->x0);
2586 if (band->bandno & 2) {
2587 opj_tcd_resolution_t* pres = &tilec->resolutions[resno - 1];
2588 y += (OPJ_UINT32)(pres->y1 - pres->y0);
2591 if (!opj_sparse_array_int32_write(sa, x, y,
2592 x + cblk_w, y + cblk_h,
2594 1, cblk_w, OPJ_TRUE)) {
2595 opj_sparse_array_int32_free(sa);
2608 static OPJ_BOOL opj_dwt_decode_partial_tile(
2609 opj_tcd_tilecomp_t* tilec,
2612 opj_sparse_array_int32_t* sa;
2616 /* This value matches the maximum left/right extension given in tables */
2617 /* F.2 and F.3 of the standard. */
2618 const OPJ_UINT32 filter_width = 2U;
2620 opj_tcd_resolution_t* tr = tilec->resolutions;
2621 opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]);
2623 OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 -
2624 tr->x0); /* width of the resolution level computed */
2625 OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 -
2626 tr->y0); /* height of the resolution level computed */
2628 OPJ_SIZE_T h_mem_size;
2630 /* Compute the intersection of the area of interest, expressed in tile coordinates */
2631 /* with the tile coordinates */
2632 OPJ_UINT32 win_tcx0 = tilec->win_x0;
2633 OPJ_UINT32 win_tcy0 = tilec->win_y0;
2634 OPJ_UINT32 win_tcx1 = tilec->win_x1;
2635 OPJ_UINT32 win_tcy1 = tilec->win_y1;
2637 if (tr_max->x0 == tr_max->x1 || tr_max->y0 == tr_max->y1) {
2641 sa = opj_dwt_init_sparse_array(tilec, numres);
2647 OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
2648 tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
2649 tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
2650 tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
2651 tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
2653 1, tr_max->win_x1 - tr_max->win_x0,
2657 opj_sparse_array_int32_free(sa);
2660 h_mem_size = opj_dwt_max_resolution(tr, numres);
2661 /* overflow check */
2662 /* in vertical pass, we process 4 columns at a time */
2663 if (h_mem_size > (SIZE_MAX / (4 * sizeof(OPJ_INT32)))) {
2664 /* FIXME event manager error callback */
2665 opj_sparse_array_int32_free(sa);
2669 h_mem_size *= 4 * sizeof(OPJ_INT32);
2670 h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
2672 /* FIXME event manager error callback */
2673 opj_sparse_array_int32_free(sa);
2679 for (resno = 1; resno < numres; resno ++) {
2681 /* Window of interest subband-based coordinates */
2682 OPJ_UINT32 win_ll_x0, win_ll_y0, win_ll_x1, win_ll_y1;
2683 OPJ_UINT32 win_hl_x0, win_hl_x1;
2684 OPJ_UINT32 win_lh_y0, win_lh_y1;
2685 /* Window of interest tile-resolution-based coordinates */
2686 OPJ_UINT32 win_tr_x0, win_tr_x1, win_tr_y0, win_tr_y1;
2687 /* Tile-resolution subband-based coordinates */
2688 OPJ_UINT32 tr_ll_x0, tr_ll_y0, tr_hl_x0, tr_lh_y0;
2692 h.sn = (OPJ_INT32)rw;
2693 v.sn = (OPJ_INT32)rh;
2695 rw = (OPJ_UINT32)(tr->x1 - tr->x0);
2696 rh = (OPJ_UINT32)(tr->y1 - tr->y0);
2698 h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
2701 v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
2704 /* Get the subband coordinates for the window of interest */
2706 opj_dwt_get_band_coordinates(tilec, resno, 0,
2707 win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2708 &win_ll_x0, &win_ll_y0,
2709 &win_ll_x1, &win_ll_y1);
2712 opj_dwt_get_band_coordinates(tilec, resno, 1,
2713 win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2714 &win_hl_x0, NULL, &win_hl_x1, NULL);
2717 opj_dwt_get_band_coordinates(tilec, resno, 2,
2718 win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2719 NULL, &win_lh_y0, NULL, &win_lh_y1);
2721 /* Beware: band index for non-LL0 resolution are 0=HL, 1=LH and 2=HH */
2722 tr_ll_x0 = (OPJ_UINT32)tr->bands[1].x0;
2723 tr_ll_y0 = (OPJ_UINT32)tr->bands[0].y0;
2724 tr_hl_x0 = (OPJ_UINT32)tr->bands[0].x0;
2725 tr_lh_y0 = (OPJ_UINT32)tr->bands[1].y0;
2727 /* Subtract the origin of the bands for this tile, to the subwindow */
2728 /* of interest band coordinates, so as to get them relative to the */
2730 win_ll_x0 = opj_uint_subs(win_ll_x0, tr_ll_x0);
2731 win_ll_y0 = opj_uint_subs(win_ll_y0, tr_ll_y0);
2732 win_ll_x1 = opj_uint_subs(win_ll_x1, tr_ll_x0);
2733 win_ll_y1 = opj_uint_subs(win_ll_y1, tr_ll_y0);
2734 win_hl_x0 = opj_uint_subs(win_hl_x0, tr_hl_x0);
2735 win_hl_x1 = opj_uint_subs(win_hl_x1, tr_hl_x0);
2736 win_lh_y0 = opj_uint_subs(win_lh_y0, tr_lh_y0);
2737 win_lh_y1 = opj_uint_subs(win_lh_y1, tr_lh_y0);
2739 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.sn, &win_ll_x0, &win_ll_x1);
2740 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.dn, &win_hl_x0, &win_hl_x1);
2742 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.sn, &win_ll_y0, &win_ll_y1);
2743 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.dn, &win_lh_y0, &win_lh_y1);
2745 /* Compute the tile-resolution-based coordinates for the window of interest */
2747 win_tr_x0 = opj_uint_min(2 * win_ll_x0, 2 * win_hl_x0 + 1);
2748 win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_ll_x1, 2 * win_hl_x1 + 1), rw);
2750 win_tr_x0 = opj_uint_min(2 * win_hl_x0, 2 * win_ll_x0 + 1);
2751 win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_hl_x1, 2 * win_ll_x1 + 1), rw);
2755 win_tr_y0 = opj_uint_min(2 * win_ll_y0, 2 * win_lh_y0 + 1);
2756 win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_ll_y1, 2 * win_lh_y1 + 1), rh);
2758 win_tr_y0 = opj_uint_min(2 * win_lh_y0, 2 * win_ll_y0 + 1);
2759 win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_lh_y1, 2 * win_ll_y1 + 1), rh);
2762 for (j = 0; j < rh; ++j) {
2763 if ((j >= win_ll_y0 && j < win_ll_y1) ||
2764 (j >= win_lh_y0 + (OPJ_UINT32)v.sn && j < win_lh_y1 + (OPJ_UINT32)v.sn)) {
2766 /* Avoids dwt.c:1584:44 (in opj_dwt_decode_partial_1): runtime error: */
2767 /* signed integer overflow: -1094795586 + -1094795586 cannot be represented in type 'int' */
2768 /* on opj_decompress -i ../../openjpeg/MAPA.jp2 -o out.tif -d 0,0,256,256 */
2769 /* This is less extreme than memsetting the whole buffer to 0 */
2770 /* although we could potentially do better with better handling of edge conditions */
2771 if (win_tr_x1 >= 1 && win_tr_x1 < rw) {
2772 h.mem[win_tr_x1 - 1] = 0;
2774 if (win_tr_x1 < rw) {
2775 h.mem[win_tr_x1] = 0;
2778 opj_dwt_interleave_partial_h(h.mem,
2787 opj_dwt_decode_partial_1(h.mem, h.dn, h.sn, h.cas,
2788 (OPJ_INT32)win_ll_x0,
2789 (OPJ_INT32)win_ll_x1,
2790 (OPJ_INT32)win_hl_x0,
2791 (OPJ_INT32)win_hl_x1);
2792 if (!opj_sparse_array_int32_write(sa,
2797 /* FIXME event manager error callback */
2798 opj_sparse_array_int32_free(sa);
2799 opj_aligned_free(h.mem);
2805 for (i = win_tr_x0; i < win_tr_x1;) {
2806 OPJ_UINT32 nb_cols = opj_uint_min(4U, win_tr_x1 - i);
2807 opj_dwt_interleave_partial_v(v.mem,
2817 opj_dwt_decode_partial_1_parallel(v.mem, nb_cols, v.dn, v.sn, v.cas,
2818 (OPJ_INT32)win_ll_y0,
2819 (OPJ_INT32)win_ll_y1,
2820 (OPJ_INT32)win_lh_y0,
2821 (OPJ_INT32)win_lh_y1);
2822 if (!opj_sparse_array_int32_write(sa,
2824 i + nb_cols, win_tr_y1,
2825 v.mem + 4 * win_tr_y0,
2827 /* FIXME event manager error callback */
2828 opj_sparse_array_int32_free(sa);
2829 opj_aligned_free(h.mem);
2836 opj_aligned_free(h.mem);
2839 OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
2840 tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
2841 tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
2842 tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
2843 tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
2845 1, tr_max->win_x1 - tr_max->win_x0,
2850 opj_sparse_array_int32_free(sa);
2854 static void opj_v8dwt_interleave_h(opj_v8dwt_t* OPJ_RESTRICT dwt,
2855 OPJ_FLOAT32* OPJ_RESTRICT a,
2857 OPJ_UINT32 remaining_height)
2859 OPJ_FLOAT32* OPJ_RESTRICT bi = (OPJ_FLOAT32*)(dwt->wavelet + dwt->cas);
2861 OPJ_UINT32 x0 = dwt->win_l_x0;
2862 OPJ_UINT32 x1 = dwt->win_l_x1;
2864 for (k = 0; k < 2; ++k) {
2865 if (remaining_height >= NB_ELTS_V8 && ((OPJ_SIZE_T) a & 0x0f) == 0 &&
2866 ((OPJ_SIZE_T) bi & 0x0f) == 0) {
2867 /* Fast code path */
2868 for (i = x0; i < x1; ++i) {
2870 OPJ_FLOAT32* OPJ_RESTRICT dst = bi + i * 2 * NB_ELTS_V8;
2888 /* Slow code path */
2889 for (i = x0; i < x1; ++i) {
2891 OPJ_FLOAT32* OPJ_RESTRICT dst = bi + i * 2 * NB_ELTS_V8;
2894 if (remaining_height == 1) {
2899 if (remaining_height == 2) {
2904 if (remaining_height == 3) {
2909 if (remaining_height == 4) {
2914 if (remaining_height == 5) {
2919 if (remaining_height == 6) {
2924 if (remaining_height == 7) {
2931 bi = (OPJ_FLOAT32*)(dwt->wavelet + 1 - dwt->cas);
2938 static void opj_v8dwt_interleave_partial_h(opj_v8dwt_t* dwt,
2939 opj_sparse_array_int32_t* sa,
2941 OPJ_UINT32 remaining_height)
2944 for (i = 0; i < remaining_height; i++) {
2946 ret = opj_sparse_array_int32_read(sa,
2947 dwt->win_l_x0, sa_line + i,
2948 dwt->win_l_x1, sa_line + i + 1,
2949 /* Nasty cast from float* to int32* */
2950 (OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0) + i,
2951 2 * NB_ELTS_V8, 0, OPJ_TRUE);
2953 ret = opj_sparse_array_int32_read(sa,
2954 (OPJ_UINT32)dwt->sn + dwt->win_h_x0, sa_line + i,
2955 (OPJ_UINT32)dwt->sn + dwt->win_h_x1, sa_line + i + 1,
2956 /* Nasty cast from float* to int32* */
2957 (OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0) + i,
2958 2 * NB_ELTS_V8, 0, OPJ_TRUE);
2964 static INLINE void opj_v8dwt_interleave_v(opj_v8dwt_t* OPJ_RESTRICT dwt,
2965 OPJ_FLOAT32* OPJ_RESTRICT a,
2967 OPJ_UINT32 nb_elts_read)
2969 opj_v8_t* OPJ_RESTRICT bi = dwt->wavelet + dwt->cas;
2972 for (i = dwt->win_l_x0; i < dwt->win_l_x1; ++i) {
2973 memcpy(&bi[i * 2], &a[i * (OPJ_SIZE_T)width],
2974 (OPJ_SIZE_T)nb_elts_read * sizeof(OPJ_FLOAT32));
2977 a += (OPJ_UINT32)dwt->sn * (OPJ_SIZE_T)width;
2978 bi = dwt->wavelet + 1 - dwt->cas;
2980 for (i = dwt->win_h_x0; i < dwt->win_h_x1; ++i) {
2981 memcpy(&bi[i * 2], &a[i * (OPJ_SIZE_T)width],
2982 (OPJ_SIZE_T)nb_elts_read * sizeof(OPJ_FLOAT32));
2986 static void opj_v8dwt_interleave_partial_v(opj_v8dwt_t* OPJ_RESTRICT dwt,
2987 opj_sparse_array_int32_t* sa,
2989 OPJ_UINT32 nb_elts_read)
2992 ret = opj_sparse_array_int32_read(sa,
2993 sa_col, dwt->win_l_x0,
2994 sa_col + nb_elts_read, dwt->win_l_x1,
2995 (OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0),
2996 1, 2 * NB_ELTS_V8, OPJ_TRUE);
2998 ret = opj_sparse_array_int32_read(sa,
2999 sa_col, (OPJ_UINT32)dwt->sn + dwt->win_h_x0,
3000 sa_col + nb_elts_read, (OPJ_UINT32)dwt->sn + dwt->win_h_x1,
3001 (OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0),
3002 1, 2 * NB_ELTS_V8, OPJ_TRUE);
3009 static void opj_v8dwt_decode_step1_sse(opj_v8_t* w,
3014 __m128* OPJ_RESTRICT vw = (__m128*) w;
3015 OPJ_UINT32 i = start;
3016 /* To be adapted if NB_ELTS_V8 changes */
3018 /* Note: attempt at loop unrolling x2 doesn't help */
3019 for (; i < end; ++i, vw += 4) {
3020 vw[0] = _mm_mul_ps(vw[0], c);
3021 vw[1] = _mm_mul_ps(vw[1], c);
3025 static void opj_v8dwt_decode_step2_sse(opj_v8_t* l, opj_v8_t* w,
3031 __m128* OPJ_RESTRICT vl = (__m128*) l;
3032 __m128* OPJ_RESTRICT vw = (__m128*) w;
3033 /* To be adapted if NB_ELTS_V8 changes */
3035 OPJ_UINT32 imax = opj_uint_min(end, m);
3038 vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vl[0], vw[0]), c));
3039 vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vl[1], vw[1]), c));
3048 /* Note: attempt at loop unrolling x2 doesn't help */
3049 for (; i < imax; ++i) {
3050 vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vw[-4], vw[0]), c));
3051 vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vw[-3], vw[1]), c));
3055 assert(m + 1 == end);
3056 c = _mm_add_ps(c, c);
3057 vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(c, vw[-4]));
3058 vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(c, vw[-3]));
3064 static void opj_v8dwt_decode_step1(opj_v8_t* w,
3067 const OPJ_FLOAT32 c)
3069 OPJ_FLOAT32* OPJ_RESTRICT fw = (OPJ_FLOAT32*) w;
3071 /* To be adapted if NB_ELTS_V8 changes */
3072 for (i = start; i < end; ++i) {
3073 fw[i * 2 * 8 ] = fw[i * 2 * 8 ] * c;
3074 fw[i * 2 * 8 + 1] = fw[i * 2 * 8 + 1] * c;
3075 fw[i * 2 * 8 + 2] = fw[i * 2 * 8 + 2] * c;
3076 fw[i * 2 * 8 + 3] = fw[i * 2 * 8 + 3] * c;
3077 fw[i * 2 * 8 + 4] = fw[i * 2 * 8 + 4] * c;
3078 fw[i * 2 * 8 + 5] = fw[i * 2 * 8 + 5] * c;
3079 fw[i * 2 * 8 + 6] = fw[i * 2 * 8 + 6] * c;
3080 fw[i * 2 * 8 + 7] = fw[i * 2 * 8 + 7] * c;
3084 static void opj_v8dwt_decode_step2(opj_v8_t* l, opj_v8_t* w,
3090 OPJ_FLOAT32* fl = (OPJ_FLOAT32*) l;
3091 OPJ_FLOAT32* fw = (OPJ_FLOAT32*) w;
3093 OPJ_UINT32 imax = opj_uint_min(end, m);
3095 fw += 2 * NB_ELTS_V8 * start;
3096 fl = fw - 2 * NB_ELTS_V8;
3098 /* To be adapted if NB_ELTS_V8 changes */
3099 for (i = start; i < imax; ++i) {
3100 fw[-8] = fw[-8] + ((fl[0] + fw[0]) * c);
3101 fw[-7] = fw[-7] + ((fl[1] + fw[1]) * c);
3102 fw[-6] = fw[-6] + ((fl[2] + fw[2]) * c);
3103 fw[-5] = fw[-5] + ((fl[3] + fw[3]) * c);
3104 fw[-4] = fw[-4] + ((fl[4] + fw[4]) * c);
3105 fw[-3] = fw[-3] + ((fl[5] + fw[5]) * c);
3106 fw[-2] = fw[-2] + ((fl[6] + fw[6]) * c);
3107 fw[-1] = fw[-1] + ((fl[7] + fw[7]) * c);
3109 fw += 2 * NB_ELTS_V8;
3112 assert(m + 1 == end);
3114 fw[-8] = fw[-8] + fl[0] * c;
3115 fw[-7] = fw[-7] + fl[1] * c;
3116 fw[-6] = fw[-6] + fl[2] * c;
3117 fw[-5] = fw[-5] + fl[3] * c;
3118 fw[-4] = fw[-4] + fl[4] * c;
3119 fw[-3] = fw[-3] + fl[5] * c;
3120 fw[-2] = fw[-2] + fl[6] * c;
3121 fw[-1] = fw[-1] + fl[7] * c;
3128 /* Inverse 9-7 wavelet transform in 1-D. */
3130 static void opj_v8dwt_decode(opj_v8dwt_t* OPJ_RESTRICT dwt)
3133 /* BUG_WEIRD_TWO_INVK (look for this identifier in tcd.c) */
3134 /* Historic value for 2 / opj_invK */
3135 /* Normally, we should use invK, but if we do so, we have failures in the */
3136 /* conformance test, due to MSE and peak errors significantly higher than */
3137 /* accepted value */
3138 /* Due to using two_invK instead of invK, we have to compensate in tcd.c */
3139 /* the computation of the stepsize for the non LL subbands */
3140 const float two_invK = 1.625732422f;
3141 if (dwt->cas == 0) {
3142 if (!((dwt->dn > 0) || (dwt->sn > 1))) {
3148 if (!((dwt->sn > 0) || (dwt->dn > 1))) {
3155 opj_v8dwt_decode_step1_sse(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1,
3156 _mm_set1_ps(opj_K));
3157 opj_v8dwt_decode_step1_sse(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1,
3158 _mm_set1_ps(two_invK));
3159 opj_v8dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1,
3160 dwt->win_l_x0, dwt->win_l_x1,
3161 (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
3162 _mm_set1_ps(-opj_dwt_delta));
3163 opj_v8dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1,
3164 dwt->win_h_x0, dwt->win_h_x1,
3165 (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
3166 _mm_set1_ps(-opj_dwt_gamma));
3167 opj_v8dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1,
3168 dwt->win_l_x0, dwt->win_l_x1,
3169 (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
3170 _mm_set1_ps(-opj_dwt_beta));
3171 opj_v8dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1,
3172 dwt->win_h_x0, dwt->win_h_x1,
3173 (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
3174 _mm_set1_ps(-opj_dwt_alpha));
3176 opj_v8dwt_decode_step1(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1,
3178 opj_v8dwt_decode_step1(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1,
3180 opj_v8dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1,
3181 dwt->win_l_x0, dwt->win_l_x1,
3182 (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
3184 opj_v8dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1,
3185 dwt->win_h_x0, dwt->win_h_x1,
3186 (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
3188 opj_v8dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1,
3189 dwt->win_l_x0, dwt->win_l_x1,
3190 (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
3192 opj_v8dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1,
3193 dwt->win_h_x0, dwt->win_h_x1,
3194 (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
3203 OPJ_FLOAT32 * OPJ_RESTRICT aj;
3205 } opj_dwt97_decode_h_job_t;
3207 static void opj_dwt97_decode_h_func(void* user_data, opj_tls_t* tls)
3210 opj_dwt97_decode_h_job_t* job;
3211 OPJ_FLOAT32 * OPJ_RESTRICT aj;
3215 job = (opj_dwt97_decode_h_job_t*)user_data;
3218 assert((job->nb_rows % NB_ELTS_V8) == 0);
3221 for (j = 0; j + NB_ELTS_V8 <= job->nb_rows; j += NB_ELTS_V8) {
3223 opj_v8dwt_interleave_h(&job->h, aj, job->w, NB_ELTS_V8);
3224 opj_v8dwt_decode(&job->h);
3226 /* To be adapted if NB_ELTS_V8 changes */
3227 for (k = 0; k < job->rw; k++) {
3228 aj[k ] = job->h.wavelet[k].f[0];
3229 aj[k + (OPJ_SIZE_T)w ] = job->h.wavelet[k].f[1];
3230 aj[k + (OPJ_SIZE_T)w * 2] = job->h.wavelet[k].f[2];
3231 aj[k + (OPJ_SIZE_T)w * 3] = job->h.wavelet[k].f[3];
3233 for (k = 0; k < job->rw; k++) {
3234 aj[k + (OPJ_SIZE_T)w * 4] = job->h.wavelet[k].f[4];
3235 aj[k + (OPJ_SIZE_T)w * 5] = job->h.wavelet[k].f[5];
3236 aj[k + (OPJ_SIZE_T)w * 6] = job->h.wavelet[k].f[6];
3237 aj[k + (OPJ_SIZE_T)w * 7] = job->h.wavelet[k].f[7];
3240 aj += w * NB_ELTS_V8;
3243 opj_aligned_free(job->h.wavelet);
3252 OPJ_FLOAT32 * OPJ_RESTRICT aj;
3253 OPJ_UINT32 nb_columns;
3254 } opj_dwt97_decode_v_job_t;
3256 static void opj_dwt97_decode_v_func(void* user_data, opj_tls_t* tls)
3259 opj_dwt97_decode_v_job_t* job;
3260 OPJ_FLOAT32 * OPJ_RESTRICT aj;
3263 job = (opj_dwt97_decode_v_job_t*)user_data;
3265 assert((job->nb_columns % NB_ELTS_V8) == 0);
3268 for (j = 0; j + NB_ELTS_V8 <= job->nb_columns; j += NB_ELTS_V8) {
3271 opj_v8dwt_interleave_v(&job->v, aj, job->w, NB_ELTS_V8);
3272 opj_v8dwt_decode(&job->v);
3274 for (k = 0; k < job->rh; ++k) {
3275 memcpy(&aj[k * (OPJ_SIZE_T)job->w], &job->v.wavelet[k],
3276 NB_ELTS_V8 * sizeof(OPJ_FLOAT32));
3281 opj_aligned_free(job->v.wavelet);
3287 /* Inverse 9-7 wavelet transform in 2-D. */
3290 OPJ_BOOL opj_dwt_decode_tile_97(opj_thread_pool_t* tp,
3291 opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
3297 opj_tcd_resolution_t* res = tilec->resolutions;
3299 OPJ_UINT32 rw = (OPJ_UINT32)(res->x1 -
3300 res->x0); /* width of the resolution level computed */
3301 OPJ_UINT32 rh = (OPJ_UINT32)(res->y1 -
3302 res->y0); /* height of the resolution level computed */
3304 OPJ_UINT32 w = (OPJ_UINT32)(tilec->resolutions[tilec->minimum_num_resolutions -
3306 tilec->resolutions[tilec->minimum_num_resolutions - 1].x0);
3308 OPJ_SIZE_T l_data_size;
3309 const int num_threads = opj_thread_pool_get_thread_count(tp);
3315 l_data_size = opj_dwt_max_resolution(res, numres);
3316 /* overflow check */
3317 if (l_data_size > (SIZE_MAX / sizeof(opj_v8_t))) {
3318 /* FIXME event manager error callback */
3321 h.wavelet = (opj_v8_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v8_t));
3323 /* FIXME event manager error callback */
3326 v.wavelet = h.wavelet;
3329 OPJ_FLOAT32 * OPJ_RESTRICT aj = (OPJ_FLOAT32*) tilec->data;
3332 h.sn = (OPJ_INT32)rw;
3333 v.sn = (OPJ_INT32)rh;
3337 rw = (OPJ_UINT32)(res->x1 -
3338 res->x0); /* width of the resolution level computed */
3339 rh = (OPJ_UINT32)(res->y1 -
3340 res->y0); /* height of the resolution level computed */
3342 h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
3343 h.cas = res->x0 % 2;
3346 h.win_l_x1 = (OPJ_UINT32)h.sn;
3348 h.win_h_x1 = (OPJ_UINT32)h.dn;
3350 if (num_threads <= 1 || rh < 2 * NB_ELTS_V8) {
3351 for (j = 0; j + (NB_ELTS_V8 - 1) < rh; j += NB_ELTS_V8) {
3353 opj_v8dwt_interleave_h(&h, aj, w, NB_ELTS_V8);
3354 opj_v8dwt_decode(&h);
3356 /* To be adapted if NB_ELTS_V8 changes */
3357 for (k = 0; k < rw; k++) {
3358 aj[k ] = h.wavelet[k].f[0];
3359 aj[k + (OPJ_SIZE_T)w ] = h.wavelet[k].f[1];
3360 aj[k + (OPJ_SIZE_T)w * 2] = h.wavelet[k].f[2];
3361 aj[k + (OPJ_SIZE_T)w * 3] = h.wavelet[k].f[3];
3363 for (k = 0; k < rw; k++) {
3364 aj[k + (OPJ_SIZE_T)w * 4] = h.wavelet[k].f[4];
3365 aj[k + (OPJ_SIZE_T)w * 5] = h.wavelet[k].f[5];
3366 aj[k + (OPJ_SIZE_T)w * 6] = h.wavelet[k].f[6];
3367 aj[k + (OPJ_SIZE_T)w * 7] = h.wavelet[k].f[7];
3370 aj += w * NB_ELTS_V8;
3373 OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
3376 if ((rh / NB_ELTS_V8) < num_jobs) {
3377 num_jobs = rh / NB_ELTS_V8;
3379 step_j = ((rh / num_jobs) / NB_ELTS_V8) * NB_ELTS_V8;
3380 for (j = 0; j < num_jobs; j++) {
3381 opj_dwt97_decode_h_job_t* job;
3383 job = (opj_dwt97_decode_h_job_t*) opj_malloc(sizeof(opj_dwt97_decode_h_job_t));
3385 opj_thread_pool_wait_completion(tp, 0);
3386 opj_aligned_free(h.wavelet);
3389 job->h.wavelet = (opj_v8_t*)opj_aligned_malloc(l_data_size * sizeof(opj_v8_t));
3390 if (!job->h.wavelet) {
3391 opj_thread_pool_wait_completion(tp, 0);
3393 opj_aligned_free(h.wavelet);
3399 job->h.win_l_x0 = h.win_l_x0;
3400 job->h.win_l_x1 = h.win_l_x1;
3401 job->h.win_h_x0 = h.win_h_x0;
3402 job->h.win_h_x1 = h.win_h_x1;
3406 job->nb_rows = (j + 1 == num_jobs) ? (rh & (OPJ_UINT32)~
3407 (NB_ELTS_V8 - 1)) - j * step_j : step_j;
3408 aj += w * job->nb_rows;
3409 opj_thread_pool_submit_job(tp, opj_dwt97_decode_h_func, job);
3411 opj_thread_pool_wait_completion(tp, 0);
3412 j = rh & (OPJ_UINT32)~(NB_ELTS_V8 - 1);
3417 opj_v8dwt_interleave_h(&h, aj, w, rh - j);
3418 opj_v8dwt_decode(&h);
3419 for (k = 0; k < rw; k++) {
3421 for (l = 0; l < rh - j; l++) {
3422 aj[k + (OPJ_SIZE_T)w * l ] = h.wavelet[k].f[l];
3427 v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
3428 v.cas = res->y0 % 2;
3430 v.win_l_x1 = (OPJ_UINT32)v.sn;
3432 v.win_h_x1 = (OPJ_UINT32)v.dn;
3434 aj = (OPJ_FLOAT32*) tilec->data;
3435 if (num_threads <= 1 || rw < 2 * NB_ELTS_V8) {
3436 for (j = rw; j > (NB_ELTS_V8 - 1); j -= NB_ELTS_V8) {
3439 opj_v8dwt_interleave_v(&v, aj, w, NB_ELTS_V8);
3440 opj_v8dwt_decode(&v);
3442 for (k = 0; k < rh; ++k) {
3443 memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k], NB_ELTS_V8 * sizeof(OPJ_FLOAT32));
3448 /* "bench_dwt -I" shows that scaling is poor, likely due to RAM
3449 transfer being the limiting factor. So limit the number of
3452 OPJ_UINT32 num_jobs = opj_uint_max((OPJ_UINT32)num_threads / 2, 2U);
3455 if ((rw / NB_ELTS_V8) < num_jobs) {
3456 num_jobs = rw / NB_ELTS_V8;
3458 step_j = ((rw / num_jobs) / NB_ELTS_V8) * NB_ELTS_V8;
3459 for (j = 0; j < num_jobs; j++) {
3460 opj_dwt97_decode_v_job_t* job;
3462 job = (opj_dwt97_decode_v_job_t*) opj_malloc(sizeof(opj_dwt97_decode_v_job_t));
3464 opj_thread_pool_wait_completion(tp, 0);
3465 opj_aligned_free(h.wavelet);
3468 job->v.wavelet = (opj_v8_t*)opj_aligned_malloc(l_data_size * sizeof(opj_v8_t));
3469 if (!job->v.wavelet) {
3470 opj_thread_pool_wait_completion(tp, 0);
3472 opj_aligned_free(h.wavelet);
3478 job->v.win_l_x0 = v.win_l_x0;
3479 job->v.win_l_x1 = v.win_l_x1;
3480 job->v.win_h_x0 = v.win_h_x0;
3481 job->v.win_h_x1 = v.win_h_x1;
3485 job->nb_columns = (j + 1 == num_jobs) ? (rw & (OPJ_UINT32)~
3486 (NB_ELTS_V8 - 1)) - j * step_j : step_j;
3487 aj += job->nb_columns;
3488 opj_thread_pool_submit_job(tp, opj_dwt97_decode_v_func, job);
3490 opj_thread_pool_wait_completion(tp, 0);
3493 if (rw & (NB_ELTS_V8 - 1)) {
3496 j = rw & (NB_ELTS_V8 - 1);
3498 opj_v8dwt_interleave_v(&v, aj, w, j);
3499 opj_v8dwt_decode(&v);
3501 for (k = 0; k < rh; ++k) {
3502 memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k],
3503 (OPJ_SIZE_T)j * sizeof(OPJ_FLOAT32));
3508 opj_aligned_free(h.wavelet);
3513 OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
3516 opj_sparse_array_int32_t* sa;
3520 /* This value matches the maximum left/right extension given in tables */
3521 /* F.2 and F.3 of the standard. Note: in opj_tcd_is_subband_area_of_interest() */
3522 /* we currently use 3. */
3523 const OPJ_UINT32 filter_width = 4U;
3525 opj_tcd_resolution_t* tr = tilec->resolutions;
3526 opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]);
3528 OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 -
3529 tr->x0); /* width of the resolution level computed */
3530 OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 -
3531 tr->y0); /* height of the resolution level computed */
3533 OPJ_SIZE_T l_data_size;
3535 /* Compute the intersection of the area of interest, expressed in tile coordinates */
3536 /* with the tile coordinates */
3537 OPJ_UINT32 win_tcx0 = tilec->win_x0;
3538 OPJ_UINT32 win_tcy0 = tilec->win_y0;
3539 OPJ_UINT32 win_tcx1 = tilec->win_x1;
3540 OPJ_UINT32 win_tcy1 = tilec->win_y1;
3542 if (tr_max->x0 == tr_max->x1 || tr_max->y0 == tr_max->y1) {
3546 sa = opj_dwt_init_sparse_array(tilec, numres);
3552 OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
3553 tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
3554 tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
3555 tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
3556 tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
3558 1, tr_max->win_x1 - tr_max->win_x0,
3562 opj_sparse_array_int32_free(sa);
3566 l_data_size = opj_dwt_max_resolution(tr, numres);
3567 /* overflow check */
3568 if (l_data_size > (SIZE_MAX / sizeof(opj_v8_t))) {
3569 /* FIXME event manager error callback */
3570 opj_sparse_array_int32_free(sa);
3573 h.wavelet = (opj_v8_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v8_t));
3575 /* FIXME event manager error callback */
3576 opj_sparse_array_int32_free(sa);
3579 v.wavelet = h.wavelet;
3581 for (resno = 1; resno < numres; resno ++) {
3583 /* Window of interest subband-based coordinates */
3584 OPJ_UINT32 win_ll_x0, win_ll_y0, win_ll_x1, win_ll_y1;
3585 OPJ_UINT32 win_hl_x0, win_hl_x1;
3586 OPJ_UINT32 win_lh_y0, win_lh_y1;
3587 /* Window of interest tile-resolution-based coordinates */
3588 OPJ_UINT32 win_tr_x0, win_tr_x1, win_tr_y0, win_tr_y1;
3589 /* Tile-resolution subband-based coordinates */
3590 OPJ_UINT32 tr_ll_x0, tr_ll_y0, tr_hl_x0, tr_lh_y0;
3594 h.sn = (OPJ_INT32)rw;
3595 v.sn = (OPJ_INT32)rh;
3597 rw = (OPJ_UINT32)(tr->x1 - tr->x0);
3598 rh = (OPJ_UINT32)(tr->y1 - tr->y0);
3600 h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
3603 v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
3606 /* Get the subband coordinates for the window of interest */
3608 opj_dwt_get_band_coordinates(tilec, resno, 0,
3609 win_tcx0, win_tcy0, win_tcx1, win_tcy1,
3610 &win_ll_x0, &win_ll_y0,
3611 &win_ll_x1, &win_ll_y1);
3614 opj_dwt_get_band_coordinates(tilec, resno, 1,
3615 win_tcx0, win_tcy0, win_tcx1, win_tcy1,
3616 &win_hl_x0, NULL, &win_hl_x1, NULL);
3619 opj_dwt_get_band_coordinates(tilec, resno, 2,
3620 win_tcx0, win_tcy0, win_tcx1, win_tcy1,
3621 NULL, &win_lh_y0, NULL, &win_lh_y1);
3623 /* Beware: band index for non-LL0 resolution are 0=HL, 1=LH and 2=HH */
3624 tr_ll_x0 = (OPJ_UINT32)tr->bands[1].x0;
3625 tr_ll_y0 = (OPJ_UINT32)tr->bands[0].y0;
3626 tr_hl_x0 = (OPJ_UINT32)tr->bands[0].x0;
3627 tr_lh_y0 = (OPJ_UINT32)tr->bands[1].y0;
3629 /* Subtract the origin of the bands for this tile, to the subwindow */
3630 /* of interest band coordinates, so as to get them relative to the */
3632 win_ll_x0 = opj_uint_subs(win_ll_x0, tr_ll_x0);
3633 win_ll_y0 = opj_uint_subs(win_ll_y0, tr_ll_y0);
3634 win_ll_x1 = opj_uint_subs(win_ll_x1, tr_ll_x0);
3635 win_ll_y1 = opj_uint_subs(win_ll_y1, tr_ll_y0);
3636 win_hl_x0 = opj_uint_subs(win_hl_x0, tr_hl_x0);
3637 win_hl_x1 = opj_uint_subs(win_hl_x1, tr_hl_x0);
3638 win_lh_y0 = opj_uint_subs(win_lh_y0, tr_lh_y0);
3639 win_lh_y1 = opj_uint_subs(win_lh_y1, tr_lh_y0);
3641 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.sn, &win_ll_x0, &win_ll_x1);
3642 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.dn, &win_hl_x0, &win_hl_x1);
3644 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.sn, &win_ll_y0, &win_ll_y1);
3645 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.dn, &win_lh_y0, &win_lh_y1);
3647 /* Compute the tile-resolution-based coordinates for the window of interest */
3649 win_tr_x0 = opj_uint_min(2 * win_ll_x0, 2 * win_hl_x0 + 1);
3650 win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_ll_x1, 2 * win_hl_x1 + 1), rw);
3652 win_tr_x0 = opj_uint_min(2 * win_hl_x0, 2 * win_ll_x0 + 1);
3653 win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_hl_x1, 2 * win_ll_x1 + 1), rw);
3657 win_tr_y0 = opj_uint_min(2 * win_ll_y0, 2 * win_lh_y0 + 1);
3658 win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_ll_y1, 2 * win_lh_y1 + 1), rh);
3660 win_tr_y0 = opj_uint_min(2 * win_lh_y0, 2 * win_ll_y0 + 1);
3661 win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_lh_y1, 2 * win_ll_y1 + 1), rh);
3664 h.win_l_x0 = win_ll_x0;
3665 h.win_l_x1 = win_ll_x1;
3666 h.win_h_x0 = win_hl_x0;
3667 h.win_h_x1 = win_hl_x1;
3668 for (j = 0; j + (NB_ELTS_V8 - 1) < rh; j += NB_ELTS_V8) {
3669 if ((j + (NB_ELTS_V8 - 1) >= win_ll_y0 && j < win_ll_y1) ||
3670 (j + (NB_ELTS_V8 - 1) >= win_lh_y0 + (OPJ_UINT32)v.sn &&
3671 j < win_lh_y1 + (OPJ_UINT32)v.sn)) {
3672 opj_v8dwt_interleave_partial_h(&h, sa, j, opj_uint_min(NB_ELTS_V8, rh - j));
3673 opj_v8dwt_decode(&h);
3674 if (!opj_sparse_array_int32_write(sa,
3676 win_tr_x1, j + NB_ELTS_V8,
3677 (OPJ_INT32*)&h.wavelet[win_tr_x0].f[0],
3678 NB_ELTS_V8, 1, OPJ_TRUE)) {
3679 /* FIXME event manager error callback */
3680 opj_sparse_array_int32_free(sa);
3681 opj_aligned_free(h.wavelet);
3688 ((j + (NB_ELTS_V8 - 1) >= win_ll_y0 && j < win_ll_y1) ||
3689 (j + (NB_ELTS_V8 - 1) >= win_lh_y0 + (OPJ_UINT32)v.sn &&
3690 j < win_lh_y1 + (OPJ_UINT32)v.sn))) {
3691 opj_v8dwt_interleave_partial_h(&h, sa, j, rh - j);
3692 opj_v8dwt_decode(&h);
3693 if (!opj_sparse_array_int32_write(sa,
3696 (OPJ_INT32*)&h.wavelet[win_tr_x0].f[0],
3697 NB_ELTS_V8, 1, OPJ_TRUE)) {
3698 /* FIXME event manager error callback */
3699 opj_sparse_array_int32_free(sa);
3700 opj_aligned_free(h.wavelet);
3705 v.win_l_x0 = win_ll_y0;
3706 v.win_l_x1 = win_ll_y1;
3707 v.win_h_x0 = win_lh_y0;
3708 v.win_h_x1 = win_lh_y1;
3709 for (j = win_tr_x0; j < win_tr_x1; j += NB_ELTS_V8) {
3710 OPJ_UINT32 nb_elts = opj_uint_min(NB_ELTS_V8, win_tr_x1 - j);
3712 opj_v8dwt_interleave_partial_v(&v, sa, j, nb_elts);
3713 opj_v8dwt_decode(&v);
3715 if (!opj_sparse_array_int32_write(sa,
3717 j + nb_elts, win_tr_y1,
3718 (OPJ_INT32*)&h.wavelet[win_tr_y0].f[0],
3719 1, NB_ELTS_V8, OPJ_TRUE)) {
3720 /* FIXME event manager error callback */
3721 opj_sparse_array_int32_free(sa);
3722 opj_aligned_free(h.wavelet);
3729 OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
3730 tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
3731 tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
3732 tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
3733 tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
3735 1, tr_max->win_x1 - tr_max->win_x0,
3740 opj_sparse_array_int32_free(sa);
3742 opj_aligned_free(h.wavelet);
3747 OPJ_BOOL opj_dwt_decode_real(opj_tcd_t *p_tcd,
3748 opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
3751 if (p_tcd->whole_tile_decoding) {
3752 return opj_dwt_decode_tile_97(p_tcd->thread_pool, tilec, numres);
3754 return opj_dwt_decode_partial_97(tilec, numres);