opj_t1_decode_cblks(): tiny perf increase from loop unrolling
author Even Rouault <even.rouault@spatialys.com>
Mon, 23 May 2016 08:25:55 +0000 (10:25 +0200)
committer Even Rouault <even.rouault@spatialys.com>
Mon, 23 May 2016 09:53:53 +0000 (11:53 +0200)
src/lib/openjp2/t1.c

index 9ad6ffd0e7d44235ee0704977e672d12a0c14995..89a7ff85d1ee0e7f3df3c832783d57e9ba659291 100644 (file)
@@ -1559,7 +1559,18 @@ OPJ_BOOL opj_t1_decode_cblks(   opj_t1_t* t1,
                                        if (tccp->qmfbid == 1) {
                         OPJ_INT32* restrict tiledp = &tilec->data[(OPJ_UINT32)y * tile_w + (OPJ_UINT32)x];
                                                for (j = 0; j < cblk_h; ++j) {
-                                                       for (i = 0; i < cblk_w; ++i) {
+                                                       i = 0;
+                                                       for (; i < (cblk_w & ~3); i += 4) {
+                                                               OPJ_INT32 tmp0 = datap[(j * cblk_w) + i];
+                                                               OPJ_INT32 tmp1 = datap[(j * cblk_w) + i+1];
+                                                               OPJ_INT32 tmp2 = datap[(j * cblk_w) + i+2];
+                                                               OPJ_INT32 tmp3 = datap[(j * cblk_w) + i+3];
+                                                               ((OPJ_INT32*)tiledp)[(j * tile_w) + i] = tmp0/2;
+                                                               ((OPJ_INT32*)tiledp)[(j * tile_w) + i+1] = tmp1/2;
+                                                               ((OPJ_INT32*)tiledp)[(j * tile_w) + i+2] = tmp2/2;
+                                                               ((OPJ_INT32*)tiledp)[(j * tile_w) + i+3] = tmp3/2;
+                                                       }
+                                                       for (; i < cblk_w; ++i) {
                                                                OPJ_INT32 tmp = datap[(j * cblk_w) + i];
                                                                ((OPJ_INT32*)tiledp)[(j * tile_w) + i] = tmp/2;
                                                        }