opj_dwt_decode_partial_1_parallel(): add SSE2 optimization
authorEven Rouault <even.rouault@spatialys.com>
Fri, 1 Sep 2017 14:31:02 +0000 (16:31 +0200)
committerEven Rouault <even.rouault@spatialys.com>
Fri, 1 Sep 2017 14:31:02 +0000 (16:31 +0200)
src/lib/openjp2/dwt.c

index ae1cbd50ffcffe07d5766a6e6335c6d9430186b8..cbbe50d0d3eaca0edb2cf1832304788f33451c88 100644 (file)
@@ -1698,6 +1698,28 @@ static void opj_dwt_decode_partial_1_parallel(OPJ_INT32 *a,
                 if (i_max > dn) {
                     i_max = dn;
                 }
+
+#ifdef __SSE2__
+                if (i + 1 < i_max) {
+                    const __m128i two = _mm_set1_epi32(2);
+                    __m128i Dm1 = _mm_load_si128((__m128i * const)(a + 4 + (i - 1) * 8));
+                    for (; i + 1 < i_max; i += 2) {
+                        /* No bound checking */
+                        __m128i S = _mm_load_si128((__m128i * const)(a + i * 8));
+                        __m128i D = _mm_load_si128((__m128i * const)(a + 4 + i * 8));
+                        __m128i S1 = _mm_load_si128((__m128i * const)(a + (i + 1) * 8));
+                        __m128i D1 = _mm_load_si128((__m128i * const)(a + 4 + (i + 1) * 8));
+                        S = _mm_sub_epi32(S,
+                                          _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(Dm1, D), two), 2));
+                        S1 = _mm_sub_epi32(S1,
+                                           _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(D, D1), two), 2));
+                        _mm_store_si128((__m128i*)(a + i * 8), S);
+                        _mm_store_si128((__m128i*)(a + (i + 1) * 8), S1);
+                        Dm1 = D1;
+                    }
+                }
+#endif
+
                 for (; i < i_max; i++) {
                     /* No bound checking */
                     for (off = 0; off < 4; off++) {
@@ -1718,6 +1740,25 @@ static void opj_dwt_decode_partial_1_parallel(OPJ_INT32 *a,
                 if (i_max >= sn) {
                     i_max = sn - 1;
                 }
+
+#ifdef __SSE2__
+                if (i + 1 < i_max) {
+                    __m128i S =  _mm_load_si128((__m128i * const)(a + i * 8));
+                    for (; i + 1 < i_max; i += 2) {
+                        /* No bound checking */
+                        __m128i D = _mm_load_si128((__m128i * const)(a + 4 + i * 8));
+                        __m128i S1 = _mm_load_si128((__m128i * const)(a + (i + 1) * 8));
+                        __m128i D1 = _mm_load_si128((__m128i * const)(a + 4 + (i + 1) * 8));
+                        __m128i S2 = _mm_load_si128((__m128i * const)(a + (i + 2) * 8));
+                        D = _mm_add_epi32(D, _mm_srai_epi32(_mm_add_epi32(S, S1), 1));
+                        D1 = _mm_add_epi32(D1, _mm_srai_epi32(_mm_add_epi32(S1, S2), 1));
+                        _mm_store_si128((__m128i*)(a + 4 + i * 8), D);
+                        _mm_store_si128((__m128i*)(a + 4 + (i + 1) * 8), D1);
+                        S = S2;
+                    }
+                }
+#endif
+
                 for (; i < i_max; i++) {
                     /* No bound checking */
                     for (off = 0; off < 4; off++) {