0c4cde4970861b6e34b723d0a97bc87b70193f3b
[openjpeg.git] / libopenjpeg / dwt.c
1 /*
2  * Copyright (c) 2002-2007, Communications and Remote Sensing Laboratory, Universite catholique de Louvain (UCL), Belgium
3  * Copyright (c) 2002-2007, Professor Benoit Macq
4  * Copyright (c) 2001-2003, David Janssens
5  * Copyright (c) 2002-2003, Yannick Verschueren
6  * Copyright (c) 2003-2007, Francois-Olivier Devaux and Antonin Descampe
7  * Copyright (c) 2005, Herve Drolon, FreeImage Team
8  * Copyright (c) 2007, Jonathan Ballard <dzonatas@dzonux.net>
9  * Copyright (c) 2007, Callum Lerwick <seg@haxxed.com>
10  * Copyright (c) 2008, Jerome Fimes, Communications & Systemes <jerome.fimes@c-s.fr>
11  * All rights reserved.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS'
23  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
26  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
29  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
30  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
31  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32  * POSSIBILITY OF SUCH DAMAGE.
33  */
34
35 #ifdef __SSE__
36 #include <xmmintrin.h>
37 #endif
38
39 #include "dwt.h"
40 #include "j2k.h"
41 #include "tcd.h"
42 #include "fix.h"
43 #include "opj_malloc.h"
44 #include "int.h"
45
46 /** @defgroup DWT DWT - Implementation of a discrete wavelet transform */
47 /*@{*/
48
49 #define WS(i) v->mem[(i)*2]
50 #define WD(i) v->mem[(1+(i)*2)]
51
52 /** @name Local data structures */
53 /*@{*/
54
55 typedef struct dwt_local {
56         OPJ_INT32* mem;
57         OPJ_INT32 dn;
58         OPJ_INT32 sn;
59         OPJ_INT32 cas;
60 } dwt_t;
61
62 typedef union {
63         OPJ_FLOAT32     f[4];
64 } v4;
65
66 typedef struct v4dwt_local {
67         v4*     wavelet ;
68         OPJ_INT32               dn ;
69         OPJ_INT32               sn ;
70         OPJ_INT32               cas ;
71 } v4dwt_t ;
72
73 static const OPJ_FLOAT32 dwt_alpha =  1.586134342f; //  12994
74 static const OPJ_FLOAT32 dwt_beta  =  0.052980118f; //    434
75 static const OPJ_FLOAT32 dwt_gamma = -0.882911075f; //  -7233
76 static const OPJ_FLOAT32 delta = -0.443506852f; //  -3633
77
78 static const OPJ_FLOAT32 K      = 1.230174105f; //  10078
79 /* FIXME: What is this constant? */
80 static const OPJ_FLOAT32 c13318 = 1.625732422f;
81
82 /*@}*/
83
84 /**
85 Virtual function type for wavelet transform in 1-D 
86 */
87 typedef void (*DWT1DFN)(dwt_t* v);
88
89 /** @name Local static functions */
90 /*@{*/
91
92 /**
93 Forward lazy transform (horizontal)
94 */
95 static void dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas);
96 /**
97 Forward lazy transform (vertical)
98 */
99 static void dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas);
100 /**
101 Inverse lazy transform (horizontal)
102 */
103 static void dwt_interleave_h(dwt_t* h, OPJ_INT32 *a);
104 /**
105 Inverse lazy transform (vertical)
106 */
107 static void dwt_interleave_v(dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x);
108 /**
109 Forward 5-3 wavelet transform in 1-D
110 */
111 static void dwt_encode_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas);
112 /**
113 Inverse 5-3 wavelet transform in 1-D
114 */
115 static void dwt_decode_1(dwt_t *v);
116 /**
117 Forward 9-7 wavelet transform in 1-D
118 */
119 static void dwt_encode_1_real(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas);
120 /**
121 Explicit calculation of the Quantization Stepsizes 
122 */
123 static void dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps, opj_stepsize_t *bandno_stepsize);
124 /**
125 Inverse wavelet transform in 2-D.
126 */
127 static bool dwt_decode_tile(opj_tcd_tilecomp_t* tilec, OPJ_UINT32 i, DWT1DFN fn);
128
129 static OPJ_UINT32 dwt_max_resolution(opj_tcd_resolution_t* restrict r, OPJ_UINT32 i);
130
131 static bool dwt_encode_procedure(opj_tcd_tilecomp_t * tilec,void (*p_function)(OPJ_INT32 *, OPJ_INT32,OPJ_INT32,OPJ_INT32) );
132 /*@}*/
133
134 /*@}*/
135
136 #define S(i) a[(i)*2]
137 #define D(i) a[(1+(i)*2)]
138 #define S_(i) ((i)<0?S(0):((i)>=sn?S(sn-1):S(i)))
139 #define D_(i) ((i)<0?D(0):((i)>=dn?D(dn-1):D(i)))
140 /* new */
141 #define SS_(i) ((i)<0?S(0):((i)>=dn?S(dn-1):S(i)))
142 #define DD_(i) ((i)<0?D(0):((i)>=sn?D(sn-1):D(i)))
143
144 /* <summary>                                                              */
145 /* This table contains the norms of the 5-3 wavelets for different bands. */
146 /* </summary>                                                             */
147 static const OPJ_FLOAT64 dwt_norms[4][10] = {
148         {1.000, 1.500, 2.750, 5.375, 10.68, 21.34, 42.67, 85.33, 170.7, 341.3},
149         {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9},
150         {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9},
151         {.7186, .9218, 1.586, 3.043, 6.019, 12.01, 24.00, 47.97, 95.93}
152 };
153
154 /* <summary>                                                              */
155 /* This table contains the norms of the 9-7 wavelets for different bands. */
156 /* </summary>                                                             */
157 static const OPJ_FLOAT64 dwt_norms_real[4][10] = {
158         {1.000, 1.965, 4.177, 8.403, 16.90, 33.84, 67.69, 135.3, 270.6, 540.9},
159         {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0},
160         {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0},
161         {2.080, 3.865, 8.307, 17.18, 34.71, 69.59, 139.3, 278.6, 557.2}
162 };
163
164 /* 
165 ==========================================================
166    local functions
167 ==========================================================
168 */
169
170 /* <summary>                                     */
171 /* Forward lazy transform (horizontal).  */
172 /* </summary>                            */ 
173 static void dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas) {
174         OPJ_INT32 i;
175         
176         OPJ_INT32 * l_dest = b;
177         OPJ_INT32 * l_src = a+cas;
178     for 
179                 (i=0; i<sn; ++i) 
180         {
181                 *l_dest++ = *l_src;
182                 l_src += 2;
183         }
184         l_dest = b + sn;
185         l_src = a + 1 - cas;
186     for 
187                 (i=0; i<dn; ++i) 
188         {
189                 *l_dest++=*l_src;
190                 l_src += 2;
191         }
192 }
193
194 /* <summary>                             */  
195 /* Forward lazy transform (vertical).    */
196 /* </summary>                            */ 
197 static void dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas) {
198     OPJ_INT32 i = sn;
199         OPJ_INT32 * l_dest = b;
200         OPJ_INT32 * l_src = a+cas;
201
202     while 
203                 (i--) 
204         {
205                 *l_dest = *l_src;
206                 l_dest += x;
207                 l_src += 2;
208                 /* b[i*x]=a[2*i+cas]; */
209         }
210         l_dest = b + sn * x;
211         l_src = a + 1 - cas;
212         
213         i = dn;
214     while 
215                 (i--) 
216         {
217                 *l_dest = *l_src;
218                 l_dest += x;
219                 l_src += 2;
220                 /*b[(sn+i)*x]=a[(2*i+1-cas)];*/
221         }
222 }
223
224 /* <summary>                             */
225 /* Inverse lazy transform (horizontal).  */
226 /* </summary>                            */
227 static void dwt_interleave_h(dwt_t* h, OPJ_INT32 *a) {
228     OPJ_INT32 *ai = a;
229     OPJ_INT32 *bi = h->mem + h->cas;
230     OPJ_INT32  i        = h->sn;
231     while
232                 ( i-- ) 
233         {
234                 *bi = *(ai++);
235                 bi += 2;
236     }
237     ai  = a + h->sn;
238     bi  = h->mem + 1 - h->cas;
239     i   = h->dn ;
240     while
241                 ( i-- ) 
242         {
243                 *bi = *(ai++);
244                 bi += 2;
245     }
246 }
247
248 /* <summary>                             */  
249 /* Inverse lazy transform (vertical).    */
250 /* </summary>                            */ 
251 static void dwt_interleave_v(dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x) {
252     OPJ_INT32 *ai = a;
253     OPJ_INT32 *bi = v->mem + v->cas;
254     OPJ_INT32  i = v->sn;
255     while( i-- ) {
256       *bi = *ai;
257           bi += 2;
258           ai += x;
259     }
260     ai = a + (v->sn * x);
261     bi = v->mem + 1 - v->cas;
262     i = v->dn ;
263     while( i-- ) {
264       *bi = *ai;
265           bi += 2;  
266           ai += x;
267     }
268 }
269
270
271 /* <summary>                            */
272 /* Forward 5-3 wavelet transform in 1-D. */
273 /* </summary>                           */
274 static void dwt_encode_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas) {
275         OPJ_INT32 i;
276         
277         if (!cas) {
278                 if ((dn > 0) || (sn > 1)) {     /* NEW :  CASE ONE ELEMENT */
279                         for (i = 0; i < dn; i++) D(i) -= (S_(i) + S_(i + 1)) >> 1;
280                         for (i = 0; i < sn; i++) S(i) += (D_(i - 1) + D_(i) + 2) >> 2;
281                 }
282         } else {
283                 if (!sn && dn == 1)                 /* NEW :  CASE ONE ELEMENT */
284                         S(0) *= 2;
285                 else {
286                         for (i = 0; i < dn; i++) S(i) -= (DD_(i) + DD_(i - 1)) >> 1;
287                         for (i = 0; i < sn; i++) D(i) += (SS_(i) + SS_(i + 1) + 2) >> 2;
288                 }
289         }
290 }
291
292 /* <summary>                            */
293 /* Inverse 5-3 wavelet transform in 1-D. */
294 /* </summary>                           */ 
295 static void dwt_decode_1_(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas) {
296         OPJ_INT32 i;
297         
298         if (!cas) {
299                 if ((dn > 0) || (sn > 1)) { /* NEW :  CASE ONE ELEMENT */
300                         for (i = 0; i < sn; i++) S(i) -= (D_(i - 1) + D_(i) + 2) >> 2;
301                         for (i = 0; i < dn; i++) D(i) += (S_(i) + S_(i + 1)) >> 1;
302                 }
303         } else {
304                 if (!sn  && dn == 1)          /* NEW :  CASE ONE ELEMENT */
305                         S(0) /= 2;
306                 else {
307                         for (i = 0; i < sn; i++) D(i) -= (SS_(i) + SS_(i + 1) + 2) >> 2;
308                         for (i = 0; i < dn; i++) S(i) += (DD_(i) + DD_(i - 1)) >> 1;
309                 }
310         }
311 }
312
313 /* <summary>                            */
314 /* Inverse 5-3 wavelet transform in 1-D. */
315 /* </summary>                           */ 
316 static void dwt_decode_1(dwt_t *v) {
317         dwt_decode_1_(v->mem, v->dn, v->sn, v->cas);
318 }
319
320 /* <summary>                             */
321 /* Forward 9-7 wavelet transform in 1-D. */
322 /* </summary>                            */
323 static void dwt_encode_1_real(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas) {
324         OPJ_INT32 i;
325         if (!cas) {
326                 if ((dn > 0) || (sn > 1)) {     /* NEW :  CASE ONE ELEMENT */
327                         for (i = 0; i < dn; i++)
328                                 D(i) -= fix_mul(S_(i) + S_(i + 1), 12993);
329                         for (i = 0; i < sn; i++)
330                                 S(i) -= fix_mul(D_(i - 1) + D_(i), 434);
331                         for (i = 0; i < dn; i++)
332                                 D(i) += fix_mul(S_(i) + S_(i + 1), 7233);
333                         for (i = 0; i < sn; i++)
334                                 S(i) += fix_mul(D_(i - 1) + D_(i), 3633);
335                         for (i = 0; i < dn; i++)
336                                 D(i) = fix_mul(D(i), 5038);     /*5038 */
337                         for (i = 0; i < sn; i++)
338                                 S(i) = fix_mul(S(i), 6659);     /*6660 */
339                 }
340         } else {
341                 if ((sn > 0) || (dn > 1)) {     /* NEW :  CASE ONE ELEMENT */
342                         for (i = 0; i < dn; i++)
343                                 S(i) -= fix_mul(DD_(i) + DD_(i - 1), 12993);
344                         for (i = 0; i < sn; i++)
345                                 D(i) -= fix_mul(SS_(i) + SS_(i + 1), 434);
346                         for (i = 0; i < dn; i++)
347                                 S(i) += fix_mul(DD_(i) + DD_(i - 1), 7233);
348                         for (i = 0; i < sn; i++)
349                                 D(i) += fix_mul(SS_(i) + SS_(i + 1), 3633);
350                         for (i = 0; i < dn; i++)
351                                 S(i) = fix_mul(S(i), 5038);     /*5038 */
352                         for (i = 0; i < sn; i++)
353                                 D(i) = fix_mul(D(i), 6659);     /*6660 */
354                 }
355         }
356 }
357
358 static void dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps, opj_stepsize_t *bandno_stepsize) {
359         OPJ_INT32 p, n;
360         p = int_floorlog2(stepsize) - 13;
361         n = 11 - int_floorlog2(stepsize);
362         bandno_stepsize->mant = (n < 0 ? stepsize >> -n : stepsize << n) & 0x7ff;
363         bandno_stepsize->expn = numbps - p;
364 }
365
366 /* 
367 ==========================================================
368    DWT interface
369 ==========================================================
370 */
371
372 /* <summary>                            */
373 /* Forward 5-3 wavelet transform in 2-D. */
374 /* </summary>                           */
375 INLINE bool dwt_encode_procedure(opj_tcd_tilecomp_t * tilec,void (*p_function)(OPJ_INT32 *, OPJ_INT32,OPJ_INT32,OPJ_INT32) )
376 {
377         OPJ_INT32 i, j, k;
378         OPJ_INT32 *a = 00;
379         OPJ_INT32 *aj = 00;
380         OPJ_INT32 *bj = 00;
381         OPJ_INT32 w, l;
382         
383         OPJ_INT32 rw;                   /* width of the resolution level computed   */
384         OPJ_INT32 rh;                   /* height of the resolution level computed  */
385         OPJ_INT32 l_data_size;
386         
387         opj_tcd_resolution_t * l_cur_res = 0;
388         opj_tcd_resolution_t * l_last_res = 0;
389
390         w = tilec->x1-tilec->x0;
391         l = tilec->numresolutions-1;
392         a = tilec->data;
393         
394         l_cur_res = tilec->resolutions + l;
395         l_last_res = l_cur_res - 1;
396
397         rw = l_cur_res->x1 - l_cur_res->x0;
398         rh = l_cur_res->y1 - l_cur_res->y0;
399
400         l_data_size = dwt_max_resolution( tilec->resolutions,tilec->numresolutions) * sizeof(OPJ_INT32);
401         bj = opj_malloc(l_data_size);
402         if
403                 (! bj)
404         {
405                 return false;
406         }
407         i = l;
408         
409         while
410                 (i--)
411         {
412                 OPJ_INT32 rw1;          /* width of the resolution level once lower than computed one                                       */
413                 OPJ_INT32 rh1;          /* height of the resolution level once lower than computed one                                      */
414                 OPJ_INT32 cas_col;      /* 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering */
415                 OPJ_INT32 cas_row;      /* 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering   */
416                 OPJ_INT32 dn, sn;
417                 
418                 rw  = l_cur_res->x1 - l_cur_res->x0;
419                 rh  = l_cur_res->y1 - l_cur_res->y0;
420                 rw1 = l_last_res->x1 - l_last_res->x0;
421                 rh1 = l_last_res->y1 - l_last_res->y0;
422                 
423                 cas_row = l_cur_res->x0 & 1;
424                 cas_col = l_cur_res->y0 & 1;
425         
426                 sn = rh1;
427                 dn = rh - rh1;
428                 for 
429                         (j = 0; j < rw; ++j) 
430                 {
431                         aj = a + j;
432                         for 
433                                 (k = 0; k < rh; ++k)  
434                         {
435                                 bj[k] = aj[k*w];
436                         }
437                         (*p_function) (bj, dn, sn, cas_col);
438                         dwt_deinterleave_v(bj, aj, dn, sn, w, cas_col);
439                 }
440                 sn = rw1;
441                 dn = rw - rw1;
442                 for (j = 0; j < rh; j++) 
443                 {
444                         aj = a + j * w;
445                         for (k = 0; k < rw; k++)  bj[k] = aj[k];
446                         (*p_function) (bj, dn, sn, cas_row);
447                         dwt_deinterleave_h(bj, aj, dn, sn, cas_row);
448                 }
449                 l_cur_res = l_last_res;
450                 --l_last_res;
451         }
452         opj_free(bj);
453         return true;
454 }
455 /* Forward 5-3 wavelet transform in 2-D. */
456 /* </summary>                           */
457 bool dwt_encode(opj_tcd_tilecomp_t * tilec)
458 {
459         return dwt_encode_procedure(tilec,dwt_encode_1);
460 }
461
462 /* <summary>                            */
463 /* Inverse 5-3 wavelet transform in 2-D. */
464 /* </summary>                           */
465 bool dwt_decode(opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres) {
466         return dwt_decode_tile(tilec, numres, &dwt_decode_1);
467 }
468
469
470 /* <summary>                          */
471 /* Get gain of 5-3 wavelet transform. */
472 /* </summary>                         */
473 OPJ_UINT32 dwt_getgain(OPJ_UINT32 orient) {
474         if (orient == 0)
475                 return 0;
476         if (orient == 1 || orient == 2)
477                 return 1;
478         return 2;
479 }
480
481 /* <summary>                */
482 /* Get norm of 5-3 wavelet. */
483 /* </summary>               */
484 OPJ_FLOAT64 dwt_getnorm(OPJ_UINT32 level, OPJ_UINT32 orient) {
485         return dwt_norms[orient][level];
486 }
487
488 /* <summary>                             */
489 /* Forward 9-7 wavelet transform in 2-D. */
490 /* </summary>                            */
491 bool dwt_encode_real(opj_tcd_tilecomp_t * tilec)
492 {
493         return dwt_encode_procedure(tilec,dwt_encode_1_real);
494 }
495
496
497
498 /* <summary>                          */
499 /* Get gain of 9-7 wavelet transform. */
500 /* </summary>                         */
501 OPJ_UINT32 dwt_getgain_real(OPJ_UINT32 orient) {
502         (void)orient;
503         return 0;
504 }
505
506 /* <summary>                */
507 /* Get norm of 9-7 wavelet. */
508 /* </summary>               */
509 OPJ_FLOAT64 dwt_getnorm_real(OPJ_UINT32 level, OPJ_UINT32 orient) {
510         return dwt_norms_real[orient][level];
511 }
512
513 void dwt_calc_explicit_stepsizes(opj_tccp_t * tccp, OPJ_UINT32 prec) {
514         OPJ_UINT32 numbands, bandno;
515         numbands = 3 * tccp->numresolutions - 2;
516         for (bandno = 0; bandno < numbands; bandno++) {
517                 OPJ_FLOAT64 stepsize;
518                 OPJ_UINT32 resno, level, orient, gain;
519
520                 resno = (bandno == 0) ? 0 : ((bandno - 1) / 3 + 1);
521                 orient = (bandno == 0) ? 0 : ((bandno - 1) % 3 + 1);
522                 level = tccp->numresolutions - 1 - resno;
523                 gain = (tccp->qmfbid == 0) ? 0 : ((orient == 0) ? 0 : (((orient == 1) || (orient == 2)) ? 1 : 2));
524                 if (tccp->qntsty == J2K_CCP_QNTSTY_NOQNT) {
525                         stepsize = 1.0;
526                 } else {
527                         OPJ_FLOAT64 norm = dwt_norms_real[orient][level];
528                         stepsize = (1 << (gain)) / norm;
529                 }
530                 dwt_encode_stepsize((OPJ_INT32) floor(stepsize * 8192.0), prec + gain, &tccp->stepsizes[bandno]);
531         }
532 }
533
534
535 /* <summary>                             */
536 /* Determine maximum computed resolution level for inverse wavelet transform */
537 /* </summary>                            */
538 static OPJ_UINT32 dwt_max_resolution(opj_tcd_resolution_t* restrict r, OPJ_UINT32 i) {
539         OPJ_UINT32 mr   = 0;
540         OPJ_UINT32 w;
541         while( --i ) {
542                 ++r;
543                 if( mr < ( w = r->x1 - r->x0 ) )
544                         mr = w ;
545                 if( mr < ( w = r->y1 - r->y0 ) )
546                         mr = w ;
547         }
548         return mr ;
549 }
550
551
552 /* <summary>                            */
553 /* Inverse wavelet transform in 2-D.     */
554 /* </summary>                           */
555 static bool dwt_decode_tile(opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres, DWT1DFN dwt_1D) {
556         dwt_t h;
557         dwt_t v;
558
559         opj_tcd_resolution_t* tr = tilec->resolutions;
560
561         OPJ_UINT32 rw = tr->x1 - tr->x0;        /* width of the resolution level computed */
562         OPJ_UINT32 rh = tr->y1 - tr->y0;        /* height of the resolution level computed */
563
564         OPJ_UINT32 w = tilec->x1 - tilec->x0;
565
566         h.mem = opj_aligned_malloc(dwt_max_resolution(tr, numres) * sizeof(OPJ_INT32));
567         if
568                 (! h.mem)
569         {
570                 return false;
571         }
572
573         v.mem = h.mem;
574
575         while( --numres) {
576                 OPJ_INT32 * restrict tiledp = tilec->data;
577                 OPJ_UINT32 j;
578
579                 ++tr;
580                 h.sn = rw;
581                 v.sn = rh;
582
583                 rw = tr->x1 - tr->x0;
584                 rh = tr->y1 - tr->y0;
585
586                 h.dn = rw - h.sn;
587                 h.cas = tr->x0 % 2;
588
589                 for(j = 0; j < rh; ++j) {
590                         dwt_interleave_h(&h, &tiledp[j*w]);
591                         (dwt_1D)(&h);
592                         memcpy(&tiledp[j*w], h.mem, rw * sizeof(OPJ_INT32));
593                 }
594
595                 v.dn = rh - v.sn;
596                 v.cas = tr->y0 % 2;
597
598                 for(j = 0; j < rw; ++j){
599                         OPJ_UINT32 k;
600                         dwt_interleave_v(&v, &tiledp[j], w);
601                         (dwt_1D)(&v);
602                         for(k = 0; k < rh; ++k) {
603                                 tiledp[k * w + j] = v.mem[k];
604                         }
605                 }
606         }
607         opj_aligned_free(h.mem);
608         return true;
609 }
610
611 static void v4dwt_interleave_h(v4dwt_t* restrict w, OPJ_FLOAT32* restrict a, OPJ_INT32 x, OPJ_INT32 size){
612         OPJ_FLOAT32* restrict bi = (OPJ_FLOAT32*) (w->wavelet + w->cas);
613         OPJ_INT32 count = w->sn;
614         OPJ_INT32 i, k;
615         for(k = 0; k < 2; ++k){
616                 for(i = 0; i < count; ++i){
617                         OPJ_INT32 j = i;
618                         bi[i*8    ] = a[j];
619                         j += x;
620                         if(j >= size) continue;
621                         bi[i*8 + 1] = a[j];
622                         j += x;
623                         if(j >= size) continue;
624                         bi[i*8 + 2] = a[j];
625                         j += x;
626                         if(j >= size) continue;
627                         bi[i*8 + 3] = a[j];
628                 }
629                 bi = (OPJ_FLOAT32*) (w->wavelet + 1 - w->cas);
630                 a += w->sn;
631                 size -= w->sn;
632                 count = w->dn;
633         }
634 }
635
636 static void v4dwt_interleave_v(v4dwt_t* restrict v , OPJ_FLOAT32* restrict a , OPJ_INT32 x){
637         v4* restrict bi = v->wavelet + v->cas;
638         OPJ_INT32 i;
639         for(i = 0; i < v->sn; ++i){
640                 memcpy(&bi[i*2], &a[i*x], 4 * sizeof(OPJ_FLOAT32));
641         }
642         a += v->sn * x;
643         bi = v->wavelet + 1 - v->cas;
644         for(i = 0; i < v->dn; ++i){
645                 memcpy(&bi[i*2], &a[i*x], 4 * sizeof(OPJ_FLOAT32));
646         }
647 }
648
649 #ifdef __SSE__
650
651 static void v4dwt_decode_step1_sse(v4* w, OPJ_INT32 count, const __m128 c){
652         __m128* restrict vw = (__m128*) w;
653         OPJ_INT32 i;
654         for(i = 0; i < count; ++i){
655                 __m128 tmp = vw[i*2];
656                 vw[i*2] = tmp * c;
657         }
658 }
659
660 static void v4dwt_decode_step2_sse(v4* l, v4* w, OPJ_INT32 k, OPJ_INT32 m, __m128 c){
661         __m128* restrict vl = (__m128*) l;
662         __m128* restrict vw = (__m128*) w;
663         OPJ_INT32 i;
664         for(i = 0; i < m; ++i){
665                 __m128 tmp1 = vl[ 0];
666                 __m128 tmp2 = vw[-1];
667                 __m128 tmp3 = vw[ 0];
668                 vw[-1] = tmp2 + ((tmp1 + tmp3) * c);
669                 vl = vw;
670                 vw += 2;
671         }
672         if(m >= k){
673                 return;
674         }
675         c += c;
676         c *= vl[0];
677         for(; m < k; ++m){
678                 __m128 tmp = vw[-1];
679                 vw[-1] = tmp + c;
680                 vw += 2;
681         }
682 }
683
684 #else
685
686 static void v4dwt_decode_step1(v4* w, OPJ_INT32 count, const OPJ_FLOAT32 c){
687         OPJ_FLOAT32* restrict fw = (OPJ_FLOAT32*) w;
688         OPJ_INT32 i;
689         for(i = 0; i < count; ++i){
690                 OPJ_FLOAT32 tmp1 = fw[i*8    ];
691                 OPJ_FLOAT32 tmp2 = fw[i*8 + 1];
692                 OPJ_FLOAT32 tmp3 = fw[i*8 + 2];
693                 OPJ_FLOAT32 tmp4 = fw[i*8 + 3];
694                 fw[i*8    ] = tmp1 * c;
695                 fw[i*8 + 1] = tmp2 * c;
696                 fw[i*8 + 2] = tmp3 * c;
697                 fw[i*8 + 3] = tmp4 * c;
698         }
699 }
700
701 static void v4dwt_decode_step2(v4* l, v4* w, OPJ_INT32 k, OPJ_INT32 m, OPJ_FLOAT32 c){
702         OPJ_FLOAT32* restrict fl = (OPJ_FLOAT32*) l;
703         OPJ_FLOAT32* restrict fw = (OPJ_FLOAT32*) w;
704         OPJ_INT32 i;
705         for(i = 0; i < m; ++i){
706                 OPJ_FLOAT32 tmp1_1 = fl[0];
707                 OPJ_FLOAT32 tmp1_2 = fl[1];
708                 OPJ_FLOAT32 tmp1_3 = fl[2];
709                 OPJ_FLOAT32 tmp1_4 = fl[3];
710                 OPJ_FLOAT32 tmp2_1 = fw[-4];
711                 OPJ_FLOAT32 tmp2_2 = fw[-3];
712                 OPJ_FLOAT32 tmp2_3 = fw[-2];
713                 OPJ_FLOAT32 tmp2_4 = fw[-1];
714                 OPJ_FLOAT32 tmp3_1 = fw[0];
715                 OPJ_FLOAT32 tmp3_2 = fw[1];
716                 OPJ_FLOAT32 tmp3_3 = fw[2];
717                 OPJ_FLOAT32 tmp3_4 = fw[3];
718                 fw[-4] = tmp2_1 + ((tmp1_1 + tmp3_1) * c);
719                 fw[-3] = tmp2_2 + ((tmp1_2 + tmp3_2) * c);
720                 fw[-2] = tmp2_3 + ((tmp1_3 + tmp3_3) * c);
721                 fw[-1] = tmp2_4 + ((tmp1_4 + tmp3_4) * c);
722                 fl = fw;
723                 fw += 8;
724         }
725         if(m < k){
726                 OPJ_FLOAT32 c1;
727                 OPJ_FLOAT32 c2;
728                 OPJ_FLOAT32 c3;
729                 OPJ_FLOAT32 c4;
730                 c += c;
731                 c1 = fl[0] * c;
732                 c2 = fl[1] * c;
733                 c3 = fl[2] * c;
734                 c4 = fl[3] * c;
735                 for(; m < k; ++m){
736                         OPJ_FLOAT32 tmp1 = fw[-4];
737                         OPJ_FLOAT32 tmp2 = fw[-3];
738                         OPJ_FLOAT32 tmp3 = fw[-2];
739                         OPJ_FLOAT32 tmp4 = fw[-1];
740                         fw[-4] = tmp1 + c1;
741                         fw[-3] = tmp2 + c2;
742                         fw[-2] = tmp3 + c3;
743                         fw[-1] = tmp4 + c4;
744                         fw += 8;
745                 }
746         }
747 }
748
749 #endif
750
751 /* <summary>                             */
752 /* Inverse 9-7 wavelet transform in 1-D. */
753 /* </summary>                            */
754 static void v4dwt_decode(v4dwt_t* restrict dwt){
755         OPJ_INT32 a, b;
756         if(dwt->cas == 0) {
757                 if(!((dwt->dn > 0) || (dwt->sn > 1))){
758                         return;
759                 }
760                 a = 0;
761                 b = 1;
762         }else{
763                 if(!((dwt->sn > 0) || (dwt->dn > 1))) {
764                         return;
765                 }
766                 a = 1;
767                 b = 0;
768         }
769 #ifdef __SSE__
770         v4dwt_decode_step1_sse(dwt->wavelet+a, dwt->sn, _mm_set1_ps(K));
771         v4dwt_decode_step1_sse(dwt->wavelet+b, dwt->dn, _mm_set1_ps(c13318));
772         v4dwt_decode_step2_sse(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), _mm_set1_ps(delta));
773         v4dwt_decode_step2_sse(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), _mm_set1_ps(dwt_gamma));
774         v4dwt_decode_step2_sse(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), _mm_set1_ps(dwt_beta));
775         v4dwt_decode_step2_sse(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), _mm_set1_ps(dwt_alpha));
776 #else
777         v4dwt_decode_step1(dwt->wavelet+a, dwt->sn, K);
778         v4dwt_decode_step1(dwt->wavelet+b, dwt->dn, c13318);
779         v4dwt_decode_step2(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), delta);
780         v4dwt_decode_step2(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), dwt_gamma);
781         v4dwt_decode_step2(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), dwt_beta);
782         v4dwt_decode_step2(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), dwt_alpha);
783 #endif
784 }
785
786 /* <summary>                             */
787 /* Inverse 9-7 wavelet transform in 2-D. */
788 /* </summary>                            */
789 bool dwt_decode_real(opj_tcd_tilecomp_t* restrict tilec, OPJ_UINT32 numres){
790         v4dwt_t h;
791         v4dwt_t v;
792
793         opj_tcd_resolution_t* res = tilec->resolutions;
794
795         OPJ_UINT32 rw = res->x1 - res->x0;      /* width of the resolution level computed */
796         OPJ_UINT32 rh = res->y1 - res->y0;      /* height of the resolution level computed */
797
798         OPJ_UINT32 w = tilec->x1 - tilec->x0;
799
800         h.wavelet = (v4*) opj_aligned_malloc((dwt_max_resolution(res, numres)+5) * sizeof(v4));
801         v.wavelet = h.wavelet;
802
803         while( --numres) {
804                 OPJ_FLOAT32 * restrict aj = (OPJ_FLOAT32*) tilec->data;
805                 OPJ_UINT32 bufsize = (tilec->x1 - tilec->x0) * (tilec->y1 - tilec->y0);
806                 OPJ_INT32 j;
807
808                 h.sn = rw;
809                 v.sn = rh;
810
811                 ++res;
812
813                 rw = res->x1 - res->x0; /* width of the resolution level computed */
814                 rh = res->y1 - res->y0; /* height of the resolution level computed */
815
816                 h.dn = rw - h.sn;
817                 h.cas = res->x0 & 1;
818
819                 for(j = rh; j > 0; j -= 4){
820                         v4dwt_interleave_h(&h, aj, w, bufsize);
821                         v4dwt_decode(&h);
822                         if(j >= 4){
823                                 OPJ_INT32 k = rw;
824                                 while
825                                         (--k >= 0)
826                                 {
827                                         aj[k    ] = h.wavelet[k].f[0];
828                                         aj[k+w  ] = h.wavelet[k].f[1];
829                                         aj[k+w*2] = h.wavelet[k].f[2];
830                                         aj[k+w*3] = h.wavelet[k].f[3];
831                                 }
832                         }else{
833                                 OPJ_INT32 k = rw;
834                                 while
835                                         (--k >= 0)
836                                 {
837                                         switch(j) {
838                                                 case 3: aj[k+w*2] = h.wavelet[k].f[2];
839                                                 case 2: aj[k+w  ] = h.wavelet[k].f[1];
840                                                 case 1: aj[k    ] = h.wavelet[k].f[0];
841                                         }
842                                 }
843                         }
844                         aj += w*4;
845                         bufsize -= w*4;
846                 }
847
848                 v.dn = rh - v.sn;
849                 v.cas = res->y0 % 2;
850
851                 aj = (OPJ_FLOAT32*) tilec->data;
852                 for(j = rw; j > 0; j -= 4){
853                         v4dwt_interleave_v(&v, aj, w);
854                         v4dwt_decode(&v);
855                         if(j >= 4){
856                                 OPJ_UINT32 k;
857                                 for(k = 0; k < rh; ++k){
858                                         memcpy(&aj[k*w], &v.wavelet[k], 4 * sizeof(OPJ_FLOAT32));
859                                 }
860                         }else{
861                                 OPJ_UINT32 k;
862                                 for(k = 0; k < rh; ++k){
863                                         memcpy(&aj[k*w], &v.wavelet[k], j * sizeof(OPJ_FLOAT32));
864                                 }
865                         }
866                         aj += 4;
867                 }
868         }
869
870         opj_aligned_free(h.wavelet);
871         return true;
872 }
873