[1.5] Man page syntax fixes. Thanks to vskytta for patch.

[openjpeg.git] / libopenjpeg / dwt.c
diff --git a/libopenjpeg/dwt.c b/libopenjpeg/dwt.c

index f4f069df2f62c1e62d4db22828689a911cff53eb..0fbfc2033fe63f6bf499018800bbf6ab1622b5ae 100644 (file)
--- a/libopenjpeg/dwt.c
+++ b/libopenjpeg/dwt.c
@@ -64,12 +64,12 @@ typedef struct v4dwt_local {
         int             cas ;
  } v4dwt_t ;
  
-static const float alpha =  1.586134342f; //  12994
-static const float beta  =  0.052980118f; //    434
-static const float gamma = -0.882911075f; //  -7233
-static const float delta = -0.443506852f; //  -3633
+static const float dwt_alpha =  1.586134342f; /*  12994 */
+static const float dwt_beta  =  0.052980118f; /*    434 */
+static const float dwt_gamma = -0.882911075f; /*  -7233 */
+static const float dwt_delta = -0.443506852f; /*  -3633 */
  
-static const float K      = 1.230174105f; //  10078
+static const float K      = 1.230174105f; /*  10078 */
  /* FIXME: What is this constant? */
  static const float c13318 = 1.625732422f;
  
@@ -527,7 +527,7 @@ static void dwt_decode_tile(opj_tcd_tilecomp_t* tilec, int numres, DWT1DFN dwt_1
  
         int w = tilec->x1 - tilec->x0;
  
-       h.mem = opj_aligned_malloc(dwt_decode_max_resolution(tr, numres) * sizeof(int));
+       h.mem = (int*)opj_aligned_malloc(dwt_decode_max_resolution(tr, numres) * sizeof(int));
         v.mem = h.mem;
  
         while( --numres) {
@@ -570,6 +570,20 @@ static void v4dwt_interleave_h(v4dwt_t* restrict w, float* restrict a, int x, in
         int count = w->sn;
         int i, k;
         for(k = 0; k < 2; ++k){
+               if (count + 3 * x < size && ((size_t) a & 0x0f) == 0 && ((size_t) bi & 0x0f) == 0 && (x & 0x0f) == 0) {
+                       /* Fast code path */
+                       for(i = 0; i < count; ++i){
+                               int j = i;
+                               bi[i*8    ] = a[j];
+                               j += x;
+                               bi[i*8 + 1] = a[j];
+                               j += x;
+                               bi[i*8 + 2] = a[j];
+                               j += x;
+                               bi[i*8 + 3] = a[j];
+                       }
+               } else {
+                       /* Slow code path */
                 for(i = 0; i < count; ++i){
                         int j = i;
                         bi[i*8    ] = a[j];
@@ -583,6 +597,7 @@ static void v4dwt_interleave_h(v4dwt_t* restrict w, float* restrict a, int x, in
                         if(j > size) continue;
                         bi[i*8 + 3] = a[j];
                 }
+               }
                 bi = (float*) (w->wavelet + 1 - w->cas);
                 a += w->sn;
                 size -= w->sn;
@@ -608,9 +623,21 @@ static void v4dwt_interleave_v(v4dwt_t* restrict v , float* restrict a , int x){
  static void v4dwt_decode_step1_sse(v4* w, int count, const __m128 c){
         __m128* restrict vw = (__m128*) w;
         int i;
+       /* 4x unrolled loop */
+       for(i = 0; i < count >> 2; ++i){
+               *vw = _mm_mul_ps(*vw, c);
+               vw += 2;
+               *vw = _mm_mul_ps(*vw, c);
+               vw += 2;
+               *vw = _mm_mul_ps(*vw, c);
+               vw += 2;
+               *vw = _mm_mul_ps(*vw, c);
+               vw += 2;
+       }
+       count &= 3;
         for(i = 0; i < count; ++i){
-               __m128 tmp = vw[i*2];
-               vw[i*2] = tmp * c;
+               *vw = _mm_mul_ps(*vw, c);
+               vw += 2;
         }
  }
  
@@ -618,22 +645,24 @@ static void v4dwt_decode_step2_sse(v4* l, v4* w, int k, int m, __m128 c){
         __m128* restrict vl = (__m128*) l;
         __m128* restrict vw = (__m128*) w;
         int i;
+       __m128 tmp1, tmp2, tmp3;
+       tmp1 = vl[0];
         for(i = 0; i < m; ++i){
-               __m128 tmp1 = vl[ 0];
-               __m128 tmp2 = vw[-1];
-               __m128 tmp3 = vw[ 0];
-               vw[-1] = tmp2 + ((tmp1 + tmp3) * c);
-               vl = vw;
+               tmp2 = vw[-1];
+               tmp3 = vw[ 0];
+               vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c));
+               tmp1 = tmp3;
                 vw += 2;
         }
+       vl = vw - 2;
         if(m >= k){
                 return;
         }
-       c += c;
-       c *= vl[0];
+       c = _mm_add_ps(c, c);
+       c = _mm_mul_ps(c, vl[0]);
         for(; m < k; ++m){
                 __m128 tmp = vw[-1];
-               vw[-1] = tmp + c;
+               vw[-1] = _mm_add_ps(tmp, c);
                 vw += 2;
         }
  }
@@ -726,17 +755,17 @@ static void v4dwt_decode(v4dwt_t* restrict dwt){
  #ifdef __SSE__
         v4dwt_decode_step1_sse(dwt->wavelet+a, dwt->sn, _mm_set1_ps(K));
         v4dwt_decode_step1_sse(dwt->wavelet+b, dwt->dn, _mm_set1_ps(c13318));
-       v4dwt_decode_step2_sse(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), _mm_set1_ps(delta));
-       v4dwt_decode_step2_sse(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), _mm_set1_ps(gamma));
-       v4dwt_decode_step2_sse(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), _mm_set1_ps(beta));
-       v4dwt_decode_step2_sse(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), _mm_set1_ps(alpha));
+       v4dwt_decode_step2_sse(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), _mm_set1_ps(dwt_delta));
+       v4dwt_decode_step2_sse(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), _mm_set1_ps(dwt_gamma));
+       v4dwt_decode_step2_sse(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), _mm_set1_ps(dwt_beta));
+       v4dwt_decode_step2_sse(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), _mm_set1_ps(dwt_alpha));
  #else
         v4dwt_decode_step1(dwt->wavelet+a, dwt->sn, K);
         v4dwt_decode_step1(dwt->wavelet+b, dwt->dn, c13318);
-       v4dwt_decode_step2(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), delta);
-       v4dwt_decode_step2(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), gamma);
-       v4dwt_decode_step2(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), beta);
-       v4dwt_decode_step2(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), alpha);
+       v4dwt_decode_step2(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), dwt_delta);
+       v4dwt_decode_step2(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), dwt_gamma);
+       v4dwt_decode_step2(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), dwt_beta);
+       v4dwt_decode_step2(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), dwt_alpha);
  #endif
  }
  
@@ -773,19 +802,24 @@ void dwt_decode_real(opj_tcd_tilecomp_t* restrict tilec, int numres){
                 h.dn = rw - h.sn;
                 h.cas = res->x0 % 2;
  
-               for(j = rh; j > 0; j -= 4){
+               for(j = rh; j > 3; j -= 4){
+                       int k;
                         v4dwt_interleave_h(&h, aj, w, bufsize);
                         v4dwt_decode(&h);
-                       if(j >= 4){
-                               int k;
                                 for(k = rw; --k >= 0;){
                                         aj[k    ] = h.wavelet[k].f[0];
                                         aj[k+w  ] = h.wavelet[k].f[1];
                                         aj[k+w*2] = h.wavelet[k].f[2];
                                         aj[k+w*3] = h.wavelet[k].f[3];
                                 }
-                       }else{
+                       aj += w*4;
+                       bufsize -= w*4;
+               }
+               if (rh & 0x03) {
                                 int k;
+                       j = rh & 0x03;
+                       v4dwt_interleave_h(&h, aj, w, bufsize);
+                       v4dwt_decode(&h);
                                 for(k = rw; --k >= 0;){
                                         switch(j) {
                                                 case 3: aj[k+w*2] = h.wavelet[k].f[2];
@@ -794,30 +828,29 @@ void dwt_decode_real(opj_tcd_tilecomp_t* restrict tilec, int numres){
                                         }
                                 }
                         }
-                       aj += w*4;
-                       bufsize -= w*4;
-               }
  
                 v.dn = rh - v.sn;
                 v.cas = res->y0 % 2;
  
                 aj = (float*) tilec->data;
-               for(j = rw; j > 0; j -= 4){
+               for(j = rw; j > 3; j -= 4){
+                       int k;
                         v4dwt_interleave_v(&v, aj, w);
                         v4dwt_decode(&v);
-                       if(j >= 4){
-                               int k;
                                 for(k = 0; k < rh; ++k){
                                         memcpy(&aj[k*w], &v.wavelet[k], 4 * sizeof(float));
                                 }
-                       }else{
+                       aj += 4;
+               }
+               if (rw & 0x03){
                                 int k;
+                       j = rw & 0x03;
+                       v4dwt_interleave_v(&v, aj, w);
+                       v4dwt_decode(&v);
                                 for(k = 0; k < rh; ++k){
                                         memcpy(&aj[k*w], &v.wavelet[k], j * sizeof(float));
                                 }
                         }
-                       aj += 4;
-               }
         }
  
         opj_aligned_free(h.wavelet);