[1.5] Man page syntax fixes. Thanks to vskytta for patch.
[openjpeg.git] / libopenjpeg / dwt.c
index f4f069df2f62c1e62d4db22828689a911cff53eb..0fbfc2033fe63f6bf499018800bbf6ab1622b5ae 100644 (file)
@@ -64,12 +64,12 @@ typedef struct v4dwt_local {
        int             cas ;
 } v4dwt_t ;
 
-static const float alpha =  1.586134342f; //  12994
-static const float beta  =  0.052980118f; //    434
-static const float gamma = -0.882911075f; //  -7233
-static const float delta = -0.443506852f; //  -3633
+static const float dwt_alpha =  1.586134342f; /*  12994 */
+static const float dwt_beta  =  0.052980118f; /*    434 */
+static const float dwt_gamma = -0.882911075f; /*  -7233 */
+static const float dwt_delta = -0.443506852f; /*  -3633 */
 
-static const float K      = 1.230174105f; //  10078
+static const float K      = 1.230174105f; /*  10078 */
 /* FIXME: What is this constant? */
 static const float c13318 = 1.625732422f;
 
@@ -527,7 +527,7 @@ static void dwt_decode_tile(opj_tcd_tilecomp_t* tilec, int numres, DWT1DFN dwt_1
 
        int w = tilec->x1 - tilec->x0;
 
-       h.mem = opj_aligned_malloc(dwt_decode_max_resolution(tr, numres) * sizeof(int));
+       h.mem = (int*)opj_aligned_malloc(dwt_decode_max_resolution(tr, numres) * sizeof(int));
        v.mem = h.mem;
 
        while( --numres) {
@@ -570,6 +570,20 @@ static void v4dwt_interleave_h(v4dwt_t* restrict w, float* restrict a, int x, in
        int count = w->sn;
        int i, k;
        for(k = 0; k < 2; ++k){
+               if (count + 3 * x < size && ((size_t) a & 0x0f) == 0 && ((size_t) bi & 0x0f) == 0 && (x & 0x0f) == 0) {
+                       /* Fast code path */
+                       for(i = 0; i < count; ++i){
+                               int j = i;
+                               bi[i*8    ] = a[j];
+                               j += x;
+                               bi[i*8 + 1] = a[j];
+                               j += x;
+                               bi[i*8 + 2] = a[j];
+                               j += x;
+                               bi[i*8 + 3] = a[j];
+                       }
+               } else {
+                       /* Slow code path */
                for(i = 0; i < count; ++i){
                        int j = i;
                        bi[i*8    ] = a[j];
@@ -583,6 +597,7 @@ static void v4dwt_interleave_h(v4dwt_t* restrict w, float* restrict a, int x, in
                        if(j > size) continue;
                        bi[i*8 + 3] = a[j];
                }
+               }
                bi = (float*) (w->wavelet + 1 - w->cas);
                a += w->sn;
                size -= w->sn;
@@ -608,9 +623,21 @@ static void v4dwt_interleave_v(v4dwt_t* restrict v , float* restrict a , int x){
 static void v4dwt_decode_step1_sse(v4* w, int count, const __m128 c){
        __m128* restrict vw = (__m128*) w;
        int i;
+       /* 4x unrolled loop */
+       for(i = 0; i < count >> 2; ++i){
+               *vw = _mm_mul_ps(*vw, c);
+               vw += 2;
+               *vw = _mm_mul_ps(*vw, c);
+               vw += 2;
+               *vw = _mm_mul_ps(*vw, c);
+               vw += 2;
+               *vw = _mm_mul_ps(*vw, c);
+               vw += 2;
+       }
+       count &= 3;
        for(i = 0; i < count; ++i){
-               __m128 tmp = vw[i*2];
-               vw[i*2] = tmp * c;
+               *vw = _mm_mul_ps(*vw, c);
+               vw += 2;
        }
 }
 
@@ -618,22 +645,24 @@ static void v4dwt_decode_step2_sse(v4* l, v4* w, int k, int m, __m128 c){
        __m128* restrict vl = (__m128*) l;
        __m128* restrict vw = (__m128*) w;
        int i;
+       __m128 tmp1, tmp2, tmp3;
+       tmp1 = vl[0];
        for(i = 0; i < m; ++i){
-               __m128 tmp1 = vl[ 0];
-               __m128 tmp2 = vw[-1];
-               __m128 tmp3 = vw[ 0];
-               vw[-1] = tmp2 + ((tmp1 + tmp3) * c);
-               vl = vw;
+               tmp2 = vw[-1];
+               tmp3 = vw[ 0];
+               vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c));
+               tmp1 = tmp3;
                vw += 2;
        }
+       vl = vw - 2;
        if(m >= k){
                return;
        }
-       c += c;
-       c *= vl[0];
+       c = _mm_add_ps(c, c);
+       c = _mm_mul_ps(c, vl[0]);
        for(; m < k; ++m){
                __m128 tmp = vw[-1];
-               vw[-1] = tmp + c;
+               vw[-1] = _mm_add_ps(tmp, c);
                vw += 2;
        }
 }
@@ -726,17 +755,17 @@ static void v4dwt_decode(v4dwt_t* restrict dwt){
 #ifdef __SSE__
        v4dwt_decode_step1_sse(dwt->wavelet+a, dwt->sn, _mm_set1_ps(K));
        v4dwt_decode_step1_sse(dwt->wavelet+b, dwt->dn, _mm_set1_ps(c13318));
-       v4dwt_decode_step2_sse(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), _mm_set1_ps(delta));
-       v4dwt_decode_step2_sse(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), _mm_set1_ps(gamma));
-       v4dwt_decode_step2_sse(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), _mm_set1_ps(beta));
-       v4dwt_decode_step2_sse(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), _mm_set1_ps(alpha));
+       v4dwt_decode_step2_sse(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), _mm_set1_ps(dwt_delta));
+       v4dwt_decode_step2_sse(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), _mm_set1_ps(dwt_gamma));
+       v4dwt_decode_step2_sse(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), _mm_set1_ps(dwt_beta));
+       v4dwt_decode_step2_sse(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), _mm_set1_ps(dwt_alpha));
 #else
        v4dwt_decode_step1(dwt->wavelet+a, dwt->sn, K);
        v4dwt_decode_step1(dwt->wavelet+b, dwt->dn, c13318);
-       v4dwt_decode_step2(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), delta);
-       v4dwt_decode_step2(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), gamma);
-       v4dwt_decode_step2(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), beta);
-       v4dwt_decode_step2(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), alpha);
+       v4dwt_decode_step2(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), dwt_delta);
+       v4dwt_decode_step2(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), dwt_gamma);
+       v4dwt_decode_step2(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), dwt_beta);
+       v4dwt_decode_step2(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), dwt_alpha);
 #endif
 }
 
@@ -773,19 +802,24 @@ void dwt_decode_real(opj_tcd_tilecomp_t* restrict tilec, int numres){
                h.dn = rw - h.sn;
                h.cas = res->x0 % 2;
 
-               for(j = rh; j > 0; j -= 4){
+               for(j = rh; j > 3; j -= 4){
+                       int k;
                        v4dwt_interleave_h(&h, aj, w, bufsize);
                        v4dwt_decode(&h);
-                       if(j >= 4){
-                               int k;
                                for(k = rw; --k >= 0;){
                                        aj[k    ] = h.wavelet[k].f[0];
                                        aj[k+w  ] = h.wavelet[k].f[1];
                                        aj[k+w*2] = h.wavelet[k].f[2];
                                        aj[k+w*3] = h.wavelet[k].f[3];
                                }
-                       }else{
+                       aj += w*4;
+                       bufsize -= w*4;
+               }
+               if (rh & 0x03) {
                                int k;
+                       j = rh & 0x03;
+                       v4dwt_interleave_h(&h, aj, w, bufsize);
+                       v4dwt_decode(&h);
                                for(k = rw; --k >= 0;){
                                        switch(j) {
                                                case 3: aj[k+w*2] = h.wavelet[k].f[2];
@@ -794,30 +828,29 @@ void dwt_decode_real(opj_tcd_tilecomp_t* restrict tilec, int numres){
                                        }
                                }
                        }
-                       aj += w*4;
-                       bufsize -= w*4;
-               }
 
                v.dn = rh - v.sn;
                v.cas = res->y0 % 2;
 
                aj = (float*) tilec->data;
-               for(j = rw; j > 0; j -= 4){
+               for(j = rw; j > 3; j -= 4){
+                       int k;
                        v4dwt_interleave_v(&v, aj, w);
                        v4dwt_decode(&v);
-                       if(j >= 4){
-                               int k;
                                for(k = 0; k < rh; ++k){
                                        memcpy(&aj[k*w], &v.wavelet[k], 4 * sizeof(float));
                                }
-                       }else{
+                       aj += 4;
+               }
+               if (rw & 0x03){
                                int k;
+                       j = rw & 0x03;
+                       v4dwt_interleave_v(&v, aj, w);
+                       v4dwt_decode(&v);
                                for(k = 0; k < rh; ++k){
                                        memcpy(&aj[k*w], &v.wavelet[k], j * sizeof(float));
                                }
                        }
-                       aj += 4;
-               }
        }
 
        opj_aligned_free(h.wavelet);