From c1e0fba0c46cb528a08b89b986e86ff0f4792558 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Fri, 1 Sep 2017 22:09:58 +0200 Subject: [PATCH] opj_v4dwt_decode_step1_sse(): rework a bit to improve code generation --- src/lib/openjp2/dwt.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/lib/openjp2/dwt.c b/src/lib/openjp2/dwt.c index 71597f81..719c7330 100644 --- a/src/lib/openjp2/dwt.c +++ b/src/lib/openjp2/dwt.c @@ -2274,14 +2274,19 @@ static void opj_v4dwt_decode_step1_sse(opj_v4_t* w, __m128* OPJ_RESTRICT vw = (__m128*) w; OPJ_UINT32 i; /* 4x unrolled loop */ - for (i = start; i + 3 < end; i += 4) { - vw[2 * i] = _mm_mul_ps(vw[2 * i], c); - vw[2 * i + 2] = _mm_mul_ps(vw[2 * i + 2], c); - vw[2 * i + 4] = _mm_mul_ps(vw[2 * i + 4], c); - vw[2 * i + 6] = _mm_mul_ps(vw[2 * i + 6], c); - } - for (; i < end; ++i) { - vw[2 * i] = _mm_mul_ps(vw[2 * i], c); + vw += 2 * start; + for (i = start; i + 3 < end; i += 4, vw += 8) { + __m128 xmm0 = _mm_mul_ps(vw[0], c); + __m128 xmm2 = _mm_mul_ps(vw[2], c); + __m128 xmm4 = _mm_mul_ps(vw[4], c); + __m128 xmm6 = _mm_mul_ps(vw[6], c); + vw[0] = xmm0; + vw[2] = xmm2; + vw[4] = xmm4; + vw[6] = xmm6; + } + for (; i < end; ++i, vw += 2) { + vw[0] = _mm_mul_ps(vw[0], c); } } -- 2.30.2