opj_v4dwt_decode_step2_sse(): loop unroll
authorEven Rouault <even.rouault@spatialys.com>
Fri, 1 Sep 2017 14:31:08 +0000 (16:31 +0200)
committerEven Rouault <even.rouault@spatialys.com>
Fri, 1 Sep 2017 14:31:08 +0000 (16:31 +0200)
src/lib/openjp2/dwt.c

index 185248181ebd9e7186874a56cf695d78e754b579..71597f810613a47754ede9a611f280a01ef8006e 100644 (file)
@@ -2302,7 +2302,29 @@ static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w,
         vw += start * 2;
         tmp1 = vw[-3];
     }
-    for (i = start; i < imax; ++i) {
+
+    i = start;
+
+    /* 4x loop unrolling */
+    for (; i + 3 < imax; i += 4) {
+        __m128 tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
+        tmp2 = vw[-1];
+        tmp3 = vw[ 0];
+        tmp4 = vw[ 1];
+        tmp5 = vw[ 2];
+        tmp6 = vw[ 3];
+        tmp7 = vw[ 4];
+        tmp8 = vw[ 5];
+        tmp9 = vw[ 6];
+        vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c));
+        vw[ 1] = _mm_add_ps(tmp4, _mm_mul_ps(_mm_add_ps(tmp3, tmp5), c));
+        vw[ 3] = _mm_add_ps(tmp6, _mm_mul_ps(_mm_add_ps(tmp5, tmp7), c));
+        vw[ 5] = _mm_add_ps(tmp8, _mm_mul_ps(_mm_add_ps(tmp7, tmp9), c));
+        tmp1 = tmp9;
+        vw += 8;
+    }
+
+    for (; i < imax; ++i) {
         tmp2 = vw[-1];
         tmp3 = vw[ 0];
         vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c));