Sub-tile decoding: speed up vertical pass in IDWT5x3 by processing 4 cols at a time
author Even Rouault <even.rouault@spatialys.com>
Fri, 1 Sep 2017 14:31:00 +0000 (16:31 +0200)
committer Even Rouault <even.rouault@spatialys.com>
Fri, 1 Sep 2017 14:31:00 +0000 (16:31 +0200)
src/lib/openjp2/dwt.c
src/lib/openjp2/sparse_array.c

index 153bfa40b2755d7409be93d89f38c81f40a8660b..ae1cbd50ffcffe07d5766a6e6335c6d9430186b8 100644 (file)
@@ -1551,6 +1551,7 @@ static void opj_dwt_interleave_partial_v(OPJ_INT32 *dest,
         OPJ_INT32 cas,
         opj_sparse_array_int32_t* sa,
         OPJ_UINT32 sa_col,
+        OPJ_UINT32 nb_cols,
         OPJ_UINT32 sn,
         OPJ_UINT32 win_l_y0,
         OPJ_UINT32 win_l_y1,
@@ -1560,15 +1561,15 @@ static void opj_dwt_interleave_partial_v(OPJ_INT32 *dest,
     OPJ_BOOL ret;
     ret  = opj_sparse_array_int32_read(sa,
                                        sa_col, win_l_y0,
-                                       sa_col + 1, win_l_y1,
-                                       dest + cas + 2 * win_l_y0,
-                                       0, 2, OPJ_TRUE);
+                                       sa_col + nb_cols, win_l_y1,
+                                       dest + cas * 4 + 2 * 4 * win_l_y0,
+                                       1, 2 * 4, OPJ_TRUE);
     assert(ret);
     ret = opj_sparse_array_int32_read(sa,
                                       sa_col, sn + win_h_y0,
-                                      sa_col + 1, sn + win_h_y1,
-                                      dest + 1 - cas + 2 * win_h_y0,
-                                      0, 2, OPJ_TRUE);
+                                      sa_col + nb_cols, sn + win_h_y1,
+                                      dest + (1 - cas) * 4 + 2 * 4 * win_h_y0,
+                                      1, 2 * 4, OPJ_TRUE);
     assert(ret);
     OPJ_UNUSED(ret);
 }
@@ -1648,6 +1649,109 @@ static void opj_dwt_decode_partial_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn,
     }
 }
 
+#define OPJ_S_off(i,off) a[(OPJ_UINT32)(i)*2*4+off]
+#define OPJ_D_off(i,off) a[(1+(OPJ_UINT32)(i)*2)*4+off]
+#define OPJ_S__off(i,off) ((i)<0?OPJ_S_off(0,off):((i)>=sn?OPJ_S_off(sn-1,off):OPJ_S_off(i,off)))
+#define OPJ_D__off(i,off) ((i)<0?OPJ_D_off(0,off):((i)>=dn?OPJ_D_off(dn-1,off):OPJ_D_off(i,off)))
+#define OPJ_SS__off(i,off) ((i)<0?OPJ_S_off(0,off):((i)>=dn?OPJ_S_off(dn-1,off):OPJ_S_off(i,off)))
+#define OPJ_DD__off(i,off) ((i)<0?OPJ_D_off(0,off):((i)>=sn?OPJ_D_off(sn-1,off):OPJ_D_off(i,off)))
+/* Windowed lifting on a[], where S(i)/D(i) each hold 4 adjacent values (off = 0..3); macros with a double underscore clamp the index. */
+static void opj_dwt_decode_partial_1_parallel(OPJ_INT32 *a,
+        OPJ_UINT32 nb_cols,
+        OPJ_INT32 dn, OPJ_INT32 sn,
+        OPJ_INT32 cas,
+        OPJ_INT32 win_l_x0,
+        OPJ_INT32 win_l_x1,
+        OPJ_INT32 win_h_x0,
+        OPJ_INT32 win_h_x1)
+{
+    OPJ_INT32 i;
+    OPJ_UINT32 off;
+    /* nb_cols is accepted but unused: every loop below always updates all 4 slots (off = 0..3). */
+    (void)nb_cols;
+
+    if (!cas) {
+        if ((dn > 0) || (sn > 1)) { /* NEW :  CASE ONE ELEMENT */
+
+            /* Naive version is :
+            for (i = win_l_x0; i < win_l_x1; i++) {
+                OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
+            }
+            for (i = win_h_x0; i < win_h_x1; i++) {
+                OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
+            }
+            but the compiler doesn't manage to unroll it to avoid bound
+            checking in OPJ_S_ and OPJ_D_ macros
+            */
+            /* First step: update S(i) for i in [win_l_x0, win_l_x1). */
+            i = win_l_x0;
+            if (i < win_l_x1) {
+                OPJ_INT32 i_max;
+
+                /* Left-most case: i - 1 may be negative, so the clamping macro is used */
+                for (off = 0; off < 4; off++) {
+                    OPJ_S_off(i, off) -= (OPJ_D__off(i - 1, off) + OPJ_D__off(i, off) + 2) >> 2;
+                }
+                i ++;
+
+                i_max = win_l_x1;
+                if (i_max > dn) {
+                    i_max = dn;
+                }
+                for (; i < i_max; i++) {
+                    /* No bound checking: i < dn here (assumes win_l_x0 >= 0 so that i - 1 >= 0) */
+                    for (off = 0; off < 4; off++) {
+                        OPJ_S_off(i, off) -= (OPJ_D_off(i - 1, off) + OPJ_D_off(i, off) + 2) >> 2;
+                    }
+                }
+                for (; i < win_l_x1; i++) {
+                    /* Right-most case: only reached when win_l_x1 > dn; D indices >= dn are clamped to dn - 1 */
+                    for (off = 0; off < 4; off++) {
+                        OPJ_S_off(i, off) -= (OPJ_D__off(i - 1, off) + OPJ_D__off(i, off) + 2) >> 2;
+                    }
+                }
+            }
+            /* Second step: update D(i) for i in [win_h_x0, win_h_x1) from S(i) and S(i + 1). */
+            i = win_h_x0;
+            if (i < win_h_x1) {
+                OPJ_INT32 i_max = win_h_x1;
+                if (i_max >= sn) {
+                    i_max = sn - 1;
+                }
+                for (; i < i_max; i++) {
+                    /* No bound checking: i + 1 <= sn - 1 here */
+                    for (off = 0; off < 4; off++) {
+                        OPJ_D_off(i, off) += (OPJ_S_off(i, off) + OPJ_S_off(i + 1, off)) >> 1;
+                    }
+                }
+                for (; i < win_h_x1; i++) {
+                    /* Right-most case: i + 1 may reach sn; S indices >= sn are clamped to sn - 1 */
+                    for (off = 0; off < 4; off++) {
+                        OPJ_D_off(i, off) += (OPJ_S__off(i, off) + OPJ_S__off(i + 1, off)) >> 1;
+                    }
+                }
+            }
+        }
+    } else {
+        if (!sn  && dn == 1) {        /* NEW :  CASE ONE ELEMENT */
+            for (off = 0; off < 4; off++) {
+                OPJ_S_off(0, off) /= 2;
+            }
+        } else {
+            for (i = win_l_x0; i < win_l_x1; i++) {
+                for (off = 0; off < 4; off++) {
+                    OPJ_D_off(i, off) -= (OPJ_SS__off(i, off) + OPJ_SS__off(i + 1, off) + 2) >> 2;
+                }
+            }
+            for (i = win_h_x0; i < win_h_x1; i++) {
+                for (off = 0; off < 4; off++) {
+                    OPJ_S_off(i, off) += (OPJ_DD__off(i, off) + OPJ_DD__off(i - 1, off)) >> 1;
+                }
+            }
+        }
+    }
+}
+
 static void opj_dwt_get_band_coordinates(opj_tcd_tilecomp_t* tilec,
         OPJ_UINT32 resno,
         OPJ_UINT32 bandno,
@@ -1804,13 +1908,14 @@ static OPJ_BOOL opj_dwt_decode_partial_tile(
     }
     h_mem_size = opj_dwt_max_resolution(tr, numres);
     /* overflow check */
-    if (h_mem_size > (SIZE_MAX / sizeof(OPJ_INT32))) {
+    /* in vertical pass, we process 4 columns at a time */
+    if (h_mem_size > (SIZE_MAX / (4 * sizeof(OPJ_INT32)))) {
         /* FIXME event manager error callback */
         opj_sparse_array_int32_free(sa);
         return OPJ_FALSE;
     }
 
-    h_mem_size *= sizeof(OPJ_INT32);
+    h_mem_size *= 4 * sizeof(OPJ_INT32);
     h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
     if (! h.mem) {
         /* FIXME event manager error callback */
@@ -1946,31 +2051,35 @@ static OPJ_BOOL opj_dwt_decode_partial_tile(
             }
         }
 
-        for (i = win_tr_x0; i < win_tr_x1; ++i) {
+        for (i = win_tr_x0; i < win_tr_x1;) {
+            OPJ_UINT32 nb_cols = opj_uint_min(4U, win_tr_x1 - i);
             opj_dwt_interleave_partial_v(v.mem,
                                          v.cas,
                                          sa,
                                          i,
+                                         nb_cols,
                                          (OPJ_UINT32)v.sn,
                                          win_ll_y0,
                                          win_ll_y1,
                                          win_lh_y0,
                                          win_lh_y1);
-            opj_dwt_decode_partial_1(v.mem, v.dn, v.sn, v.cas,
-                                     (OPJ_INT32)win_ll_y0,
-                                     (OPJ_INT32)win_ll_y1,
-                                     (OPJ_INT32)win_lh_y0,
-                                     (OPJ_INT32)win_lh_y1);
+            opj_dwt_decode_partial_1_parallel(v.mem, nb_cols, v.dn, v.sn, v.cas,
+                                              (OPJ_INT32)win_ll_y0,
+                                              (OPJ_INT32)win_ll_y1,
+                                              (OPJ_INT32)win_lh_y0,
+                                              (OPJ_INT32)win_lh_y1);
             if (!opj_sparse_array_int32_write(sa,
                                               i, win_tr_y0,
-                                              i + 1, win_tr_y1,
-                                              v.mem + win_tr_y0,
-                                              0, 1, OPJ_TRUE)) {
+                                              i + nb_cols, win_tr_y1,
+                                              v.mem + 4 * win_tr_y0,
+                                              1, 4, OPJ_TRUE)) {
                 /* FIXME event manager error callback */
                 opj_sparse_array_int32_free(sa);
                 opj_aligned_free(h.mem);
                 return OPJ_FALSE;
             }
+
+            i += nb_cols;
         }
     }
     opj_aligned_free(h.mem);
index b0634f67e8ecc4b9e653416dbd6db0f5778a51a7..48c4b23b0ca0c0c417bdf4259d231d00cbe98665 100644 (file)
@@ -165,10 +165,20 @@ static OPJ_BOOL opj_sparse_array_int32_read_or_write(
                     if (buf_col_stride == 1) {
                         OPJ_INT32* OPJ_RESTRICT dest_ptr = buf + (y - y0) * (size_t)buf_line_stride +
                                                            (x - x0) * buf_col_stride;
-                        for (j = 0; j < y_incr; j++) {
-                            memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr);
-                            dest_ptr += buf_line_stride;
-                            src_ptr += block_width;
+                        if (x_incr == 4) {
+                            // Same code as general branch, but the compiler
+                            // can have an efficient memcpy()
+                            for (j = 0; j < y_incr; j++) {
+                                memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr);
+                                dest_ptr += buf_line_stride;
+                                src_ptr += block_width;
+                            }
+                        } else {
+                            for (j = 0; j < y_incr; j++) {
+                                memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr);
+                                dest_ptr += buf_line_stride;
+                                src_ptr += block_width;
+                            }
                         }
                     } else {
                         OPJ_INT32* OPJ_RESTRICT dest_ptr = buf + (y - y0) * (size_t)buf_line_stride +
@@ -179,6 +189,17 @@ static OPJ_BOOL opj_sparse_array_int32_read_or_write(
                                 dest_ptr += buf_line_stride;
                                 src_ptr += block_width;
                             }
+                        } else if (y_incr == 1 && buf_col_stride == 2) {
+                            OPJ_UINT32 k;
+                            for (k = 0; k < (x_incr & ~3U); k += 4) {
+                                dest_ptr[k * buf_col_stride] = src_ptr[k];
+                                dest_ptr[(k + 1) * buf_col_stride] = src_ptr[k + 1];
+                                dest_ptr[(k + 2) * buf_col_stride] = src_ptr[k + 2];
+                                dest_ptr[(k + 3) * buf_col_stride] = src_ptr[k + 3];
+                            }
+                            for (; k < x_incr; k++) {
+                                dest_ptr[k * buf_col_stride] = src_ptr[k];
+                            }
                         } else {
                             /* General case */
                             for (j = 0; j < y_incr; j++) {
@@ -207,10 +228,20 @@ static OPJ_BOOL opj_sparse_array_int32_read_or_write(
                                                        (size_t)block_width + block_x_offset;
                     const OPJ_INT32* OPJ_RESTRICT src_ptr = buf + (y - y0) *
                                                             (size_t)buf_line_stride + (x - x0) * buf_col_stride;
-                    for (j = 0; j < y_incr; j++) {
-                        memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr);
-                        dest_ptr += block_width;
-                        src_ptr += buf_line_stride;
+                    if (x_incr == 4) {
+                        // Same code as general branch, but the compiler
+                        // can have an efficient memcpy()
+                        for (j = 0; j < y_incr; j++) {
+                            memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr);
+                            dest_ptr += block_width;
+                            src_ptr += buf_line_stride;
+                        }
+                    } else {
+                        for (j = 0; j < y_incr; j++) {
+                            memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr);
+                            dest_ptr += block_width;
+                            src_ptr += buf_line_stride;
+                        }
                     }
                 } else {
                     OPJ_INT32* OPJ_RESTRICT dest_ptr = src_block + block_y_offset *