int cas ;
} v4dwt_t ;
-static const float alpha = 1.586134342f; // 12994
-static const float beta = 0.052980118f; // 434
-static const float gamma = -0.882911075f; // -7233
-static const float delta = -0.443506852f; // -3633
+static const float dwt_alpha = 1.586134342f; /* 12994 */
+static const float dwt_beta = 0.052980118f; /* 434 */
+static const float dwt_gamma = -0.882911075f; /* -7233 */
+static const float dwt_delta = -0.443506852f; /* -3633 */
-static const float K = 1.230174105f; // 10078
+static const float K = 1.230174105f; /* 10078 */
/* FIXME: What is this constant? */
static const float c13318 = 1.625732422f;
int w = tilec->x1 - tilec->x0;
- h.mem = opj_aligned_malloc(dwt_decode_max_resolution(tr, numres) * sizeof(int));
+ h.mem = (int*)opj_aligned_malloc(dwt_decode_max_resolution(tr, numres) * sizeof(int));
v.mem = h.mem;
while( --numres) {
int count = w->sn;
int i, k;
for(k = 0; k < 2; ++k){
+ if (count + 3 * x < size && ((size_t) a & 0x0f) == 0 && ((size_t) bi & 0x0f) == 0 && (x & 0x0f) == 0) {
+ /* Fast code path */
+ for(i = 0; i < count; ++i){
+ int j = i;
+ bi[i*8 ] = a[j];
+ j += x;
+ bi[i*8 + 1] = a[j];
+ j += x;
+ bi[i*8 + 2] = a[j];
+ j += x;
+ bi[i*8 + 3] = a[j];
+ }
+ } else {
+ /* Slow code path */
for(i = 0; i < count; ++i){
int j = i;
bi[i*8 ] = a[j];
if(j > size) continue;
bi[i*8 + 3] = a[j];
}
+ }
bi = (float*) (w->wavelet + 1 - w->cas);
a += w->sn;
size -= w->sn;
static void v4dwt_decode_step1_sse(v4* w, int count, const __m128 c){
__m128* restrict vw = (__m128*) w;
int i;
+ /* 4x unrolled loop */
+ for(i = 0; i < count >> 2; ++i){
+ *vw = _mm_mul_ps(*vw, c);
+ vw += 2;
+ *vw = _mm_mul_ps(*vw, c);
+ vw += 2;
+ *vw = _mm_mul_ps(*vw, c);
+ vw += 2;
+ *vw = _mm_mul_ps(*vw, c);
+ vw += 2;
+ }
+ count &= 3;
for(i = 0; i < count; ++i){
- __m128 tmp = vw[i*2];
- vw[i*2] = tmp * c;
+ *vw = _mm_mul_ps(*vw, c);
+ vw += 2;
}
}
__m128* restrict vl = (__m128*) l;
__m128* restrict vw = (__m128*) w;
int i;
+ __m128 tmp1, tmp2, tmp3;
+ tmp1 = vl[0];
for(i = 0; i < m; ++i){
- __m128 tmp1 = vl[ 0];
- __m128 tmp2 = vw[-1];
- __m128 tmp3 = vw[ 0];
- vw[-1] = tmp2 + ((tmp1 + tmp3) * c);
- vl = vw;
+ tmp2 = vw[-1];
+ tmp3 = vw[ 0];
+ vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c));
+ tmp1 = tmp3;
vw += 2;
}
+ vl = vw - 2;
if(m >= k){
return;
}
- c += c;
- c *= vl[0];
+ c = _mm_add_ps(c, c);
+ c = _mm_mul_ps(c, vl[0]);
for(; m < k; ++m){
__m128 tmp = vw[-1];
- vw[-1] = tmp + c;
+ vw[-1] = _mm_add_ps(tmp, c);
vw += 2;
}
}
#ifdef __SSE__
v4dwt_decode_step1_sse(dwt->wavelet+a, dwt->sn, _mm_set1_ps(K));
v4dwt_decode_step1_sse(dwt->wavelet+b, dwt->dn, _mm_set1_ps(c13318));
- v4dwt_decode_step2_sse(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), _mm_set1_ps(delta));
- v4dwt_decode_step2_sse(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), _mm_set1_ps(gamma));
- v4dwt_decode_step2_sse(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), _mm_set1_ps(beta));
- v4dwt_decode_step2_sse(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), _mm_set1_ps(alpha));
+ v4dwt_decode_step2_sse(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), _mm_set1_ps(dwt_delta));
+ v4dwt_decode_step2_sse(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), _mm_set1_ps(dwt_gamma));
+ v4dwt_decode_step2_sse(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), _mm_set1_ps(dwt_beta));
+ v4dwt_decode_step2_sse(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), _mm_set1_ps(dwt_alpha));
#else
v4dwt_decode_step1(dwt->wavelet+a, dwt->sn, K);
v4dwt_decode_step1(dwt->wavelet+b, dwt->dn, c13318);
- v4dwt_decode_step2(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), delta);
- v4dwt_decode_step2(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), gamma);
- v4dwt_decode_step2(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), beta);
- v4dwt_decode_step2(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), alpha);
+ v4dwt_decode_step2(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), dwt_delta);
+ v4dwt_decode_step2(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), dwt_gamma);
+ v4dwt_decode_step2(dwt->wavelet+b, dwt->wavelet+a+1, dwt->sn, int_min(dwt->sn, dwt->dn-a), dwt_beta);
+ v4dwt_decode_step2(dwt->wavelet+a, dwt->wavelet+b+1, dwt->dn, int_min(dwt->dn, dwt->sn-b), dwt_alpha);
#endif
}
h.dn = rw - h.sn;
h.cas = res->x0 % 2;
- for(j = rh; j > 0; j -= 4){
+ for(j = rh; j > 3; j -= 4){
+ int k;
v4dwt_interleave_h(&h, aj, w, bufsize);
v4dwt_decode(&h);
- if(j >= 4){
- int k;
for(k = rw; --k >= 0;){
aj[k ] = h.wavelet[k].f[0];
aj[k+w ] = h.wavelet[k].f[1];
aj[k+w*2] = h.wavelet[k].f[2];
aj[k+w*3] = h.wavelet[k].f[3];
}
- }else{
+ aj += w*4;
+ bufsize -= w*4;
+ }
+ if (rh & 0x03) {
int k;
+ j = rh & 0x03;
+ v4dwt_interleave_h(&h, aj, w, bufsize);
+ v4dwt_decode(&h);
for(k = rw; --k >= 0;){
switch(j) {
case 3: aj[k+w*2] = h.wavelet[k].f[2];
}
}
}
- aj += w*4;
- bufsize -= w*4;
- }
v.dn = rh - v.sn;
v.cas = res->y0 % 2;
aj = (float*) tilec->data;
- for(j = rw; j > 0; j -= 4){
+ for(j = rw; j > 3; j -= 4){
+ int k;
v4dwt_interleave_v(&v, aj, w);
v4dwt_decode(&v);
- if(j >= 4){
- int k;
for(k = 0; k < rh; ++k){
memcpy(&aj[k*w], &v.wavelet[k], 4 * sizeof(float));
}
- }else{
+ aj += 4;
+ }
+ if (rw & 0x03){
int k;
+ j = rw & 0x03;
+ v4dwt_interleave_v(&v, aj, w);
+ v4dwt_decode(&v);
for(k = 0; k < rh; ++k){
memcpy(&aj[k*w], &v.wavelet[k], j * sizeof(float));
}
}
- aj += 4;
- }
}
opj_aligned_free(h.wavelet);