sparse_array: optimizations for lossy case
[openjpeg.git] / src / lib / openjp2 / sparse_array.c
1 /*
2  * The copyright in this software is being made available under the 2-clauses
3  * BSD License, included below. This software may be subject to other third
4  * party and contributor rights, including patent rights, and no such rights
5  * are granted under this license.
6  *
7  * Copyright (c) 2017, IntoPix SA <contact@intopix.com>
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS'
20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31
32 #include "opj_includes.h"
33
34
35 struct opj_sparse_array_int32 {
36     OPJ_UINT32 width;
37     OPJ_UINT32 height;
38     OPJ_UINT32 block_width;
39     OPJ_UINT32 block_height;
40     OPJ_UINT32 block_count_hor;
41     OPJ_UINT32 block_count_ver;
42     OPJ_INT32** data_blocks;
43 };
44
45 opj_sparse_array_int32_t* opj_sparse_array_int32_create(OPJ_UINT32 width,
46         OPJ_UINT32 height,
47         OPJ_UINT32 block_width,
48         OPJ_UINT32 block_height)
49 {
50     opj_sparse_array_int32_t* sa;
51
52     if (width == 0 || height == 0 || block_width == 0 || block_height == 0) {
53         return NULL;
54     }
55     if (block_width > ((OPJ_UINT32)~0U) / block_height / sizeof(OPJ_INT32)) {
56         return NULL;
57     }
58
59     sa = opj_calloc(1, sizeof(opj_sparse_array_int32_t));
60     sa->width = width;
61     sa->height = height;
62     sa->block_width = block_width;
63     sa->block_height = block_height;
64     sa->block_count_hor = opj_uint_ceildiv(width, block_width);
65     sa->block_count_ver = opj_uint_ceildiv(height, block_height);
66     if (sa->block_count_hor > ((OPJ_UINT32)~0U) / sa->block_count_ver) {
67         opj_free(sa);
68         return NULL;
69     }
70     sa->data_blocks = opj_calloc(sizeof(OPJ_INT32*),
71                                  sa->block_count_hor * sa->block_count_ver);
72     if (sa->data_blocks == NULL) {
73         opj_free(sa);
74         return NULL;
75     }
76
77     return sa;
78 }
79
80 void opj_sparse_array_int32_free(opj_sparse_array_int32_t* sa)
81 {
82     if (sa) {
83         OPJ_UINT32 i;
84         for (i = 0; i < sa->block_count_hor * sa->block_count_ver; i++) {
85             if (sa->data_blocks[i]) {
86                 opj_free(sa->data_blocks[i]);
87             }
88         }
89         opj_free(sa->data_blocks);
90         opj_free(sa);
91     }
92 }
93
94 OPJ_BOOL opj_sparse_array_is_region_valid(const opj_sparse_array_int32_t* sa,
95         OPJ_UINT32 x0,
96         OPJ_UINT32 y0,
97         OPJ_UINT32 x1,
98         OPJ_UINT32 y1)
99 {
100     return !(x0 >= sa->width || x1 <= x0 || x1 > sa->width ||
101              y0 >= sa->height || y1 <= y0 || y1 > sa->height);
102 }
103
104 static OPJ_BOOL opj_sparse_array_int32_read_or_write(
105     const opj_sparse_array_int32_t* sa,
106     OPJ_UINT32 x0,
107     OPJ_UINT32 y0,
108     OPJ_UINT32 x1,
109     OPJ_UINT32 y1,
110     OPJ_INT32* buf,
111     OPJ_UINT32 buf_col_stride,
112     OPJ_UINT32 buf_line_stride,
113     OPJ_BOOL forgiving,
114     OPJ_BOOL is_read_op)
115 {
116     OPJ_UINT32 y, block_y;
117     OPJ_UINT32 y_incr = 0;
118     const OPJ_UINT32 block_width = sa->block_width;
119
120     if (!opj_sparse_array_is_region_valid(sa, x0, y0, x1, y1)) {
121         return forgiving;
122     }
123
124     block_y = y0 / sa->block_height;
125     for (y = y0; y < y1; block_y ++, y += y_incr) {
126         OPJ_UINT32 x, block_x;
127         OPJ_UINT32 x_incr = 0;
128         OPJ_UINT32 block_y_offset;
129         y_incr = (y == y0) ? sa->block_height - (y0 % sa->block_height) :
130                  sa->block_height;
131         block_y_offset = sa->block_height - y_incr;
132         y_incr = opj_uint_min(y_incr, y1 - y);
133         block_x = x0 / block_width;
134         for (x = x0; x < x1; block_x ++, x += x_incr) {
135             OPJ_UINT32 j;
136             OPJ_UINT32 block_x_offset;
137             OPJ_INT32* src_block;
138             x_incr = (x == x0) ? block_width - (x0 % block_width) : block_width;
139             block_x_offset = block_width - x_incr;
140             x_incr = opj_uint_min(x_incr, x1 - x);
141             src_block = sa->data_blocks[block_y * sa->block_count_hor + block_x];
142             if (is_read_op) {
143                 if (src_block == NULL) {
144                     if (buf_col_stride == 1) {
145                         OPJ_INT32* dest_ptr = buf + (y - y0) * (size_t)buf_line_stride +
146                                               (x - x0) * buf_col_stride;
147                         for (j = 0; j < y_incr; j++) {
148                             memset(dest_ptr, 0, sizeof(OPJ_INT32) * x_incr);
149                             dest_ptr += buf_line_stride;
150                         }
151                     } else {
152                         OPJ_INT32* dest_ptr = buf + (y - y0) * (size_t)buf_line_stride +
153                                               (x - x0) * buf_col_stride;
154                         for (j = 0; j < y_incr; j++) {
155                             OPJ_UINT32 k;
156                             for (k = 0; k < x_incr; k++) {
157                                 dest_ptr[k * buf_col_stride] = 0;
158                             }
159                             dest_ptr += buf_line_stride;
160                         }
161                     }
162                 } else {
163                     const OPJ_INT32* OPJ_RESTRICT src_ptr = src_block + block_y_offset *
164                                                             (size_t)block_width + block_x_offset;
165                     if (buf_col_stride == 1) {
166                         OPJ_INT32* OPJ_RESTRICT dest_ptr = buf + (y - y0) * (size_t)buf_line_stride +
167                                                            (x - x0) * buf_col_stride;
168                         if (x_incr == 4) {
169                             // Same code as general branch, but the compiler
170                             // can have an efficient memcpy()
171                             for (j = 0; j < y_incr; j++) {
172                                 memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr);
173                                 dest_ptr += buf_line_stride;
174                                 src_ptr += block_width;
175                             }
176                         } else {
177                             for (j = 0; j < y_incr; j++) {
178                                 memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr);
179                                 dest_ptr += buf_line_stride;
180                                 src_ptr += block_width;
181                             }
182                         }
183                     } else {
184                         OPJ_INT32* OPJ_RESTRICT dest_ptr = buf + (y - y0) * (size_t)buf_line_stride +
185                                                            (x - x0) * buf_col_stride;
186                         if (x_incr == 1) {
187                             for (j = 0; j < y_incr; j++) {
188                                 *dest_ptr = *src_ptr;
189                                 dest_ptr += buf_line_stride;
190                                 src_ptr += block_width;
191                             }
192                         } else if (y_incr == 1 && buf_col_stride == 2) {
193                             OPJ_UINT32 k;
194                             for (k = 0; k < (x_incr & ~3U); k += 4) {
195                                 dest_ptr[k * buf_col_stride] = src_ptr[k];
196                                 dest_ptr[(k + 1) * buf_col_stride] = src_ptr[k + 1];
197                                 dest_ptr[(k + 2) * buf_col_stride] = src_ptr[k + 2];
198                                 dest_ptr[(k + 3) * buf_col_stride] = src_ptr[k + 3];
199                             }
200                             for (; k < x_incr; k++) {
201                                 dest_ptr[k * buf_col_stride] = src_ptr[k];
202                             }
203                         } else if (x_incr >= 8 && buf_col_stride == 8) {
204                             for (j = 0; j < y_incr; j++) {
205                                 OPJ_UINT32 k;
206                                 for (k = 0; k < (x_incr & ~3U); k += 4) {
207                                     dest_ptr[k * buf_col_stride] = src_ptr[k];
208                                     dest_ptr[(k + 1) * buf_col_stride] = src_ptr[k + 1];
209                                     dest_ptr[(k + 2) * buf_col_stride] = src_ptr[k + 2];
210                                     dest_ptr[(k + 3) * buf_col_stride] = src_ptr[k + 3];
211                                 }
212                                 for (; k < x_incr; k++) {
213                                     dest_ptr[k * buf_col_stride] = src_ptr[k];
214                                 }
215                                 dest_ptr += buf_line_stride;
216                                 src_ptr += block_width;
217                             }
218                         } else {
219                             /* General case */
220                             for (j = 0; j < y_incr; j++) {
221                                 OPJ_UINT32 k;
222                                 for (k = 0; k < x_incr; k++) {
223                                     dest_ptr[k * buf_col_stride] = src_ptr[k];
224                                 }
225                                 dest_ptr += buf_line_stride;
226                                 src_ptr += block_width;
227                             }
228                         }
229                     }
230                 }
231             } else {
232                 if (src_block == NULL) {
233                     src_block = opj_calloc(1,
234                                            sa->block_width * sa->block_height * sizeof(OPJ_INT32));
235                     if (src_block == NULL) {
236                         return OPJ_FALSE;
237                     }
238                     sa->data_blocks[block_y * sa->block_count_hor + block_x] = src_block;
239                 }
240
241                 if (buf_col_stride == 1) {
242                     OPJ_INT32* OPJ_RESTRICT dest_ptr = src_block + block_y_offset *
243                                                        (size_t)block_width + block_x_offset;
244                     const OPJ_INT32* OPJ_RESTRICT src_ptr = buf + (y - y0) *
245                                                             (size_t)buf_line_stride + (x - x0) * buf_col_stride;
246                     if (x_incr == 4) {
247                         // Same code as general branch, but the compiler
248                         // can have an efficient memcpy()
249                         for (j = 0; j < y_incr; j++) {
250                             memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr);
251                             dest_ptr += block_width;
252                             src_ptr += buf_line_stride;
253                         }
254                     } else {
255                         for (j = 0; j < y_incr; j++) {
256                             memcpy(dest_ptr, src_ptr, sizeof(OPJ_INT32) * x_incr);
257                             dest_ptr += block_width;
258                             src_ptr += buf_line_stride;
259                         }
260                     }
261                 } else {
262                     OPJ_INT32* OPJ_RESTRICT dest_ptr = src_block + block_y_offset *
263                                                        (size_t)block_width + block_x_offset;
264                     const OPJ_INT32* OPJ_RESTRICT src_ptr = buf + (y - y0) *
265                                                             (size_t)buf_line_stride + (x - x0) * buf_col_stride;
266                     if (x_incr == 1) {
267                         for (j = 0; j < y_incr; j++) {
268                             *dest_ptr = *src_ptr;
269                             src_ptr += buf_line_stride;
270                             dest_ptr += block_width;
271                         }
272                     } else if (x_incr >= 8 && buf_col_stride == 8) {
273                         for (j = 0; j < y_incr; j++) {
274                             OPJ_UINT32 k;
275                             for (k = 0; k < (x_incr & ~3U); k += 4) {
276                                 dest_ptr[k] = src_ptr[k * buf_col_stride];
277                                 dest_ptr[k + 1] = src_ptr[(k + 1) * buf_col_stride];
278                                 dest_ptr[k + 2] = src_ptr[(k + 2) * buf_col_stride];
279                                 dest_ptr[k + 3] = src_ptr[(k + 3) * buf_col_stride];
280                             }
281                             for (; k < x_incr; k++) {
282                                 dest_ptr[k] = src_ptr[k * buf_col_stride];
283                             }
284                             src_ptr += buf_line_stride;
285                             dest_ptr += block_width;
286                         }
287                     } else {
288                         /* General case */
289                         for (j = 0; j < y_incr; j++) {
290                             OPJ_UINT32 k;
291                             for (k = 0; k < x_incr; k++) {
292                                 dest_ptr[k] = src_ptr[k * buf_col_stride];
293                             }
294                             src_ptr += buf_line_stride;
295                             dest_ptr += block_width;
296                         }
297                     }
298                 }
299             }
300         }
301     }
302
303     return OPJ_TRUE;
304 }
305
306 OPJ_BOOL opj_sparse_array_int32_read(const opj_sparse_array_int32_t* sa,
307                                      OPJ_UINT32 x0,
308                                      OPJ_UINT32 y0,
309                                      OPJ_UINT32 x1,
310                                      OPJ_UINT32 y1,
311                                      OPJ_INT32* dest,
312                                      OPJ_UINT32 dest_col_stride,
313                                      OPJ_UINT32 dest_line_stride,
314                                      OPJ_BOOL forgiving)
315 {
316     return opj_sparse_array_int32_read_or_write(
317                (opj_sparse_array_int32_t*)sa, x0, y0, x1, y1,
318                dest,
319                dest_col_stride,
320                dest_line_stride,
321                forgiving,
322                OPJ_TRUE);
323 }
324
325 OPJ_BOOL opj_sparse_array_int32_write(opj_sparse_array_int32_t* sa,
326                                       OPJ_UINT32 x0,
327                                       OPJ_UINT32 y0,
328                                       OPJ_UINT32 x1,
329                                       OPJ_UINT32 y1,
330                                       const OPJ_INT32* src,
331                                       OPJ_UINT32 src_col_stride,
332                                       OPJ_UINT32 src_line_stride,
333                                       OPJ_BOOL forgiving)
334 {
335     return opj_sparse_array_int32_read_or_write(sa, x0, y0, x1, y1,
336             (OPJ_INT32*)src,
337             src_col_stride,
338             src_line_stride,
339             forgiving,
340             OPJ_FALSE);
341 }