+
+ for (i = 0; i < dn - 1; i++) {
+ row[i] = row[2 * i + 1] + ((tmp[sn + i] + tmp[sn + i + 1] + 2) >> 2);
+ }
+ if ((width % 2) == 0) {
+ row[i] = row[2 * i + 1] + ((tmp[sn + i] + tmp[sn + i] + 2) >> 2);
+ }
+ memcpy(row + sn, tmp + sn, (OPJ_SIZE_T)dn * sizeof(OPJ_INT32));
+ }
+ }
+}
+
+/** Process one line for the horizontal pass of the 9x7 forward transform */
+static
+void opj_dwt_encode_and_deinterleave_h_one_row_real(void* rowIn,
+ void* tmpIn,
+ OPJ_UINT32 width,
+ OPJ_BOOL even)
+{
+ OPJ_FLOAT32* OPJ_RESTRICT row = (OPJ_FLOAT32*)rowIn;
+ OPJ_FLOAT32* OPJ_RESTRICT tmp = (OPJ_FLOAT32*)tmpIn;
+ const OPJ_INT32 sn = (OPJ_INT32)((width + (even ? 1 : 0)) >> 1);
+ const OPJ_INT32 dn = (OPJ_INT32)(width - (OPJ_UINT32)sn);
+ if (width == 1) {
+ return;
+ }
+ memcpy(tmp, row, width * sizeof(OPJ_FLOAT32));
+ opj_dwt_encode_1_real(tmp, dn, sn, even ? 0 : 1);
+ opj_dwt_deinterleave_h((OPJ_INT32 * OPJ_RESTRICT)tmp,
+ (OPJ_INT32 * OPJ_RESTRICT)row,
+ dn, sn, even ? 0 : 1);
+}
+
/* Work unit for one horizontal-pass worker thread. */
typedef struct {
    opj_dwt_t h;   /* per-job transform state (scratch buffer, parity) */
    OPJ_UINT32 rw; /* Width of the resolution to process */
    OPJ_UINT32 w; /* Width of tiledp */
    OPJ_INT32 * OPJ_RESTRICT tiledp; /* tile data (not owned by the job) */
    OPJ_UINT32 min_j; /* first row to process (inclusive) */
    OPJ_UINT32 max_j; /* last row to process (exclusive) */
    opj_encode_and_deinterleave_h_one_row_fnptr_type p_function; /* row processor */
} opj_dwt_encode_h_job_t;
+
+static void opj_dwt_encode_h_func(void* user_data, opj_tls_t* tls)
+{
+ OPJ_UINT32 j;
+ opj_dwt_encode_h_job_t* job;
+ (void)tls;
+
+ job = (opj_dwt_encode_h_job_t*)user_data;
+ for (j = job->min_j; j < job->max_j; j++) {
+ OPJ_INT32* OPJ_RESTRICT aj = job->tiledp + j * job->w;
+ (*job->p_function)(aj, job->h.mem, job->rw,
+ job->h.cas == 0 ? OPJ_TRUE : OPJ_FALSE);
+ }
+
+ opj_aligned_free(job->h.mem);
+ opj_free(job);
+}
+
/* Work unit for one vertical-pass worker thread. */
typedef struct {
    opj_dwt_t v;   /* per-job transform state (scratch buffer, parity) */
    OPJ_UINT32 rh; /* height of the resolution to process */
    OPJ_UINT32 w;  /* width (stride) of tiledp */
    OPJ_INT32 * OPJ_RESTRICT tiledp; /* tile data (not owned by the job) */
    OPJ_UINT32 min_j; /* first column to process (inclusive) */
    OPJ_UINT32 max_j; /* last column to process (exclusive) */
    opj_encode_and_deinterleave_v_fnptr_type p_encode_and_deinterleave_v; /* column-group processor */
} opj_dwt_encode_v_job_t;
+
+static void opj_dwt_encode_v_func(void* user_data, opj_tls_t* tls)
+{
+ OPJ_UINT32 j;
+ opj_dwt_encode_v_job_t* job;
+ (void)tls;
+
+ job = (opj_dwt_encode_v_job_t*)user_data;
+ for (j = job->min_j; j + NB_ELTS_V8 - 1 < job->max_j; j += NB_ELTS_V8) {
+ (*job->p_encode_and_deinterleave_v)(job->tiledp + j,
+ job->v.mem,
+ job->rh,
+ job->v.cas == 0,
+ job->w,
+ NB_ELTS_V8);
+ }
+ if (j < job->max_j) {
+ (*job->p_encode_and_deinterleave_v)(job->tiledp + j,
+ job->v.mem,
+ job->rh,
+ job->v.cas == 0,
+ job->w,
+ job->max_j - j);
+ }
+
+ opj_aligned_free(job->v.mem);
+ opj_free(job);
+}
+
+/** Fetch up to cols <= NB_ELTS_V8 for each line, and put them in tmpOut */
+/* that has a NB_ELTS_V8 interleave factor. */
+static void opj_dwt_fetch_cols_vertical_pass(const void *arrayIn,
+ void *tmpOut,
+ OPJ_UINT32 height,
+ OPJ_UINT32 stride_width,
+ OPJ_UINT32 cols)
+{
+ const OPJ_INT32* OPJ_RESTRICT array = (const OPJ_INT32 * OPJ_RESTRICT)arrayIn;
+ OPJ_INT32* OPJ_RESTRICT tmp = (OPJ_INT32 * OPJ_RESTRICT)tmpOut;
+ if (cols == NB_ELTS_V8) {
+ OPJ_UINT32 k;
+ for (k = 0; k < height; ++k) {
+ memcpy(tmp + NB_ELTS_V8 * k,
+ array + k * stride_width,
+ NB_ELTS_V8 * sizeof(OPJ_INT32));
+ }
+ } else {
+ OPJ_UINT32 k;
+ for (k = 0; k < height; ++k) {
+ OPJ_UINT32 c;
+ for (c = 0; c < cols; c++) {
+ tmp[NB_ELTS_V8 * k + c] = array[c + k * stride_width];
+ }
+ for (; c < NB_ELTS_V8; c++) {
+ tmp[NB_ELTS_V8 * k + c] = 0;
+ }
+ }
+ }
+}
+
/* Deinterleave result of forward transform, where cols <= NB_ELTS_V8 */
/* and src contains NB_ELTS_V8 consecutive values for up to NB_ELTS_V8 */
/* columns. */
static INLINE void opj_dwt_deinterleave_v_cols(
    const OPJ_INT32 * OPJ_RESTRICT src,
    OPJ_INT32 * OPJ_RESTRICT dst,
    OPJ_INT32 dn,
    OPJ_INT32 sn,
    OPJ_UINT32 stride_width,
    OPJ_INT32 cas,
    OPJ_UINT32 cols)
{
    OPJ_INT32 k;
    OPJ_INT32 i = sn;
    OPJ_INT32 * OPJ_RESTRICT l_dest = dst;
    /* cas selects which interleaved phase (even/odd groups) holds the */
    /* low-pass samples */
    const OPJ_INT32 * OPJ_RESTRICT l_src = src + cas * NB_ELTS_V8;
    OPJ_UINT32 c;

    /* Two passes: k == 0 writes the sn low-pass rows starting at dst; */
    /* k == 1 writes the dn high-pass rows starting at dst + sn * stride. */
    for (k = 0; k < 2; k++) {
        while (i--) {
            if (cols == NB_ELTS_V8) {
                /* Full group: fixed-size block copy */
                memcpy(l_dest, l_src, NB_ELTS_V8 * sizeof(OPJ_INT32));
            } else {
                /* Partial group: manually unrolled copy of the first */
                /* cols values; the fallthroughs are intentional, and */
                /* the default case covers cols == 1 */
                c = 0;
                switch (cols) {
                case 7:
                    l_dest[c] = l_src[c];
                    c++; /* fallthru */
                case 6:
                    l_dest[c] = l_src[c];
                    c++; /* fallthru */
                case 5:
                    l_dest[c] = l_src[c];
                    c++; /* fallthru */
                case 4:
                    l_dest[c] = l_src[c];
                    c++; /* fallthru */
                case 3:
                    l_dest[c] = l_src[c];
                    c++; /* fallthru */
                case 2:
                    l_dest[c] = l_src[c];
                    c++; /* fallthru */
                default:
                    l_dest[c] = l_src[c];
                    break;
                }
            }
            l_dest += stride_width;
            /* Source advances by a full even+odd pair of groups */
            l_src += 2 * NB_ELTS_V8;
        }

        /* Reset pointers/counter for the high-pass pass */
        l_dest = dst + (OPJ_SIZE_T)sn * (OPJ_SIZE_T)stride_width;
        l_src = src + (1 - cas) * NB_ELTS_V8;
        i = dn;
    }
}
+
+
+/* Forward 5-3 transform, for the vertical pass, processing cols columns */
+/* where cols <= NB_ELTS_V8 */
+static void opj_dwt_encode_and_deinterleave_v(
+ void *arrayIn,
+ void *tmpIn,
+ OPJ_UINT32 height,
+ OPJ_BOOL even,
+ OPJ_UINT32 stride_width,
+ OPJ_UINT32 cols)
+{
+ OPJ_INT32* OPJ_RESTRICT array = (OPJ_INT32 * OPJ_RESTRICT)arrayIn;
+ OPJ_INT32* OPJ_RESTRICT tmp = (OPJ_INT32 * OPJ_RESTRICT)tmpIn;
+ const OPJ_UINT32 sn = (height + (even ? 1 : 0)) >> 1;
+ const OPJ_UINT32 dn = height - sn;
+
+ opj_dwt_fetch_cols_vertical_pass(arrayIn, tmpIn, height, stride_width, cols);
+
+#define OPJ_Sc(i) tmp[(i)*2* NB_ELTS_V8 + c]
+#define OPJ_Dc(i) tmp[((1+(i)*2))* NB_ELTS_V8 + c]
+
+#ifdef __SSE2__
+ if (height == 1) {
+ if (!even) {
+ OPJ_UINT32 c;
+ for (c = 0; c < NB_ELTS_V8; c++) {
+ tmp[c] *= 2;
+ }
+ }
+ } else if (even) {
+ OPJ_UINT32 c;
+ OPJ_UINT32 i;
+ i = 0;
+ if (i + 1 < sn) {
+ __m128i xmm_Si_0 = *(const __m128i*)(tmp + 4 * 0);
+ __m128i xmm_Si_1 = *(const __m128i*)(tmp + 4 * 1);
+ for (; i + 1 < sn; i++) {
+ __m128i xmm_Sip1_0 = *(const __m128i*)(tmp +
+ (i + 1) * 2 * NB_ELTS_V8 + 4 * 0);
+ __m128i xmm_Sip1_1 = *(const __m128i*)(tmp +
+ (i + 1) * 2 * NB_ELTS_V8 + 4 * 1);
+ __m128i xmm_Di_0 = *(const __m128i*)(tmp +
+ (1 + i * 2) * NB_ELTS_V8 + 4 * 0);
+ __m128i xmm_Di_1 = *(const __m128i*)(tmp +
+ (1 + i * 2) * NB_ELTS_V8 + 4 * 1);
+ xmm_Di_0 = _mm_sub_epi32(xmm_Di_0,
+ _mm_srai_epi32(_mm_add_epi32(xmm_Si_0, xmm_Sip1_0), 1));
+ xmm_Di_1 = _mm_sub_epi32(xmm_Di_1,
+ _mm_srai_epi32(_mm_add_epi32(xmm_Si_1, xmm_Sip1_1), 1));
+ *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 0) = xmm_Di_0;
+ *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 1) = xmm_Di_1;
+ xmm_Si_0 = xmm_Sip1_0;
+ xmm_Si_1 = xmm_Sip1_1;
+ }
+ }
+ if (((height) % 2) == 0) {
+ for (c = 0; c < NB_ELTS_V8; c++) {
+ OPJ_Dc(i) -= OPJ_Sc(i);
+ }
+ }
+ for (c = 0; c < NB_ELTS_V8; c++) {
+ OPJ_Sc(0) += (OPJ_Dc(0) + OPJ_Dc(0) + 2) >> 2;
+ }
+ i = 1;
+ if (i < dn) {
+ __m128i xmm_Dim1_0 = *(const __m128i*)(tmp + (1 +
+ (i - 1) * 2) * NB_ELTS_V8 + 4 * 0);
+ __m128i xmm_Dim1_1 = *(const __m128i*)(tmp + (1 +
+ (i - 1) * 2) * NB_ELTS_V8 + 4 * 1);
+ const __m128i xmm_two = _mm_set1_epi32(2);
+ for (; i < dn; i++) {
+ __m128i xmm_Di_0 = *(const __m128i*)(tmp +
+ (1 + i * 2) * NB_ELTS_V8 + 4 * 0);
+ __m128i xmm_Di_1 = *(const __m128i*)(tmp +
+ (1 + i * 2) * NB_ELTS_V8 + 4 * 1);
+ __m128i xmm_Si_0 = *(const __m128i*)(tmp +
+ (i * 2) * NB_ELTS_V8 + 4 * 0);
+ __m128i xmm_Si_1 = *(const __m128i*)(tmp +
+ (i * 2) * NB_ELTS_V8 + 4 * 1);
+ xmm_Si_0 = _mm_add_epi32(xmm_Si_0,
+ _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Dim1_0, xmm_Di_0), xmm_two), 2));
+ xmm_Si_1 = _mm_add_epi32(xmm_Si_1,
+ _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Dim1_1, xmm_Di_1), xmm_two), 2));
+ *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 0) = xmm_Si_0;
+ *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 1) = xmm_Si_1;
+ xmm_Dim1_0 = xmm_Di_0;
+ xmm_Dim1_1 = xmm_Di_1;
+ }
+ }
+ if (((height) % 2) == 1) {
+ for (c = 0; c < NB_ELTS_V8; c++) {
+ OPJ_Sc(i) += (OPJ_Dc(i - 1) + OPJ_Dc(i - 1) + 2) >> 2;
+ }
+ }
+ } else {
+ OPJ_UINT32 c;
+ OPJ_UINT32 i;
+ for (c = 0; c < NB_ELTS_V8; c++) {
+ OPJ_Sc(0) -= OPJ_Dc(0);
+ }
+ i = 1;
+ if (i < sn) {
+ __m128i xmm_Dim1_0 = *(const __m128i*)(tmp + (1 +
+ (i - 1) * 2) * NB_ELTS_V8 + 4 * 0);
+ __m128i xmm_Dim1_1 = *(const __m128i*)(tmp + (1 +
+ (i - 1) * 2) * NB_ELTS_V8 + 4 * 1);
+ for (; i < sn; i++) {
+ __m128i xmm_Di_0 = *(const __m128i*)(tmp +
+ (1 + i * 2) * NB_ELTS_V8 + 4 * 0);
+ __m128i xmm_Di_1 = *(const __m128i*)(tmp +
+ (1 + i * 2) * NB_ELTS_V8 + 4 * 1);
+ __m128i xmm_Si_0 = *(const __m128i*)(tmp +
+ (i * 2) * NB_ELTS_V8 + 4 * 0);
+ __m128i xmm_Si_1 = *(const __m128i*)(tmp +
+ (i * 2) * NB_ELTS_V8 + 4 * 1);
+ xmm_Si_0 = _mm_sub_epi32(xmm_Si_0,
+ _mm_srai_epi32(_mm_add_epi32(xmm_Di_0, xmm_Dim1_0), 1));
+ xmm_Si_1 = _mm_sub_epi32(xmm_Si_1,
+ _mm_srai_epi32(_mm_add_epi32(xmm_Di_1, xmm_Dim1_1), 1));
+ *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 0) = xmm_Si_0;
+ *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 1) = xmm_Si_1;
+ xmm_Dim1_0 = xmm_Di_0;
+ xmm_Dim1_1 = xmm_Di_1;
+ }
+ }
+ if (((height) % 2) == 1) {
+ for (c = 0; c < NB_ELTS_V8; c++) {
+ OPJ_Sc(i) -= OPJ_Dc(i - 1);
+ }
+ }
+ i = 0;
+ if (i + 1 < dn) {
+ __m128i xmm_Si_0 = *((const __m128i*)(tmp + 4 * 0));
+ __m128i xmm_Si_1 = *((const __m128i*)(tmp + 4 * 1));
+ const __m128i xmm_two = _mm_set1_epi32(2);
+ for (; i + 1 < dn; i++) {
+ __m128i xmm_Sip1_0 = *(const __m128i*)(tmp +
+ (i + 1) * 2 * NB_ELTS_V8 + 4 * 0);
+ __m128i xmm_Sip1_1 = *(const __m128i*)(tmp +
+ (i + 1) * 2 * NB_ELTS_V8 + 4 * 1);
+ __m128i xmm_Di_0 = *(const __m128i*)(tmp +
+ (1 + i * 2) * NB_ELTS_V8 + 4 * 0);
+ __m128i xmm_Di_1 = *(const __m128i*)(tmp +
+ (1 + i * 2) * NB_ELTS_V8 + 4 * 1);
+ xmm_Di_0 = _mm_add_epi32(xmm_Di_0,
+ _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Si_0, xmm_Sip1_0), xmm_two), 2));
+ xmm_Di_1 = _mm_add_epi32(xmm_Di_1,
+ _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Si_1, xmm_Sip1_1), xmm_two), 2));
+ *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 0) = xmm_Di_0;
+ *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 1) = xmm_Di_1;
+ xmm_Si_0 = xmm_Sip1_0;
+ xmm_Si_1 = xmm_Sip1_1;
+ }
+ }
+ if (((height) % 2) == 0) {
+ for (c = 0; c < NB_ELTS_V8; c++) {
+ OPJ_Dc(i) += (OPJ_Sc(i) + OPJ_Sc(i) + 2) >> 2;
+ }
+ }
+ }
+#else
+ if (even) {
+ OPJ_UINT32 c;
+ if (height > 1) {
+ OPJ_UINT32 i;
+ for (i = 0; i + 1 < sn; i++) {
+ for (c = 0; c < NB_ELTS_V8; c++) {
+ OPJ_Dc(i) -= (OPJ_Sc(i) + OPJ_Sc(i + 1)) >> 1;
+ }
+ }
+ if (((height) % 2) == 0) {
+ for (c = 0; c < NB_ELTS_V8; c++) {
+ OPJ_Dc(i) -= OPJ_Sc(i);
+ }
+ }
+ for (c = 0; c < NB_ELTS_V8; c++) {
+ OPJ_Sc(0) += (OPJ_Dc(0) + OPJ_Dc(0) + 2) >> 2;
+ }
+ for (i = 1; i < dn; i++) {
+ for (c = 0; c < NB_ELTS_V8; c++) {
+ OPJ_Sc(i) += (OPJ_Dc(i - 1) + OPJ_Dc(i) + 2) >> 2;
+ }
+ }
+ if (((height) % 2) == 1) {
+ for (c = 0; c < NB_ELTS_V8; c++) {
+ OPJ_Sc(i) += (OPJ_Dc(i - 1) + OPJ_Dc(i - 1) + 2) >> 2;
+ }
+ }
+ }
+ } else {
+ OPJ_UINT32 c;
+ if (height == 1) {
+ for (c = 0; c < NB_ELTS_V8; c++) {
+ OPJ_Sc(0) *= 2;
+ }
+ } else {
+ OPJ_UINT32 i;
+ for (c = 0; c < NB_ELTS_V8; c++) {
+ OPJ_Sc(0) -= OPJ_Dc(0);
+ }
+ for (i = 1; i < sn; i++) {
+ for (c = 0; c < NB_ELTS_V8; c++) {
+ OPJ_Sc(i) -= (OPJ_Dc(i) + OPJ_Dc(i - 1)) >> 1;
+ }
+ }
+ if (((height) % 2) == 1) {
+ for (c = 0; c < NB_ELTS_V8; c++) {
+ OPJ_Sc(i) -= OPJ_Dc(i - 1);
+ }
+ }
+ for (i = 0; i + 1 < dn; i++) {
+ for (c = 0; c < NB_ELTS_V8; c++) {
+ OPJ_Dc(i) += (OPJ_Sc(i) + OPJ_Sc(i + 1) + 2) >> 2;
+ }
+ }
+ if (((height) % 2) == 0) {
+ for (c = 0; c < NB_ELTS_V8; c++) {
+ OPJ_Dc(i) += (OPJ_Sc(i) + OPJ_Sc(i) + 2) >> 2;
+ }
+ }
+ }
+ }
+#endif
+
+ if (cols == NB_ELTS_V8) {
+ opj_dwt_deinterleave_v_cols(tmp, array, (OPJ_INT32)dn, (OPJ_INT32)sn,
+ stride_width, even ? 0 : 1, NB_ELTS_V8);
+ } else {
+ opj_dwt_deinterleave_v_cols(tmp, array, (OPJ_INT32)dn, (OPJ_INT32)sn,
+ stride_width, even ? 0 : 1, cols);
+ }
+}
+
/* 9x7 forward scaling step: multiply every other NB_ELTS_V8-wide group */
/* of interleaved samples (end groups, starting at fw) by the constant cst. */
static void opj_v8dwt_encode_step1(OPJ_FLOAT32* fw,
                                   OPJ_UINT32 end,
                                   const OPJ_FLOAT32 cst)
{
    OPJ_UINT32 i;
#ifdef __SSE__
    /* NOTE(review): assumes one group of NB_ELTS_V8 floats is exactly */
    /* two __m128 (NB_ELTS_V8 == 8) and that fw is 16-byte aligned -- */
    /* confirm against callers/allocator. */
    __m128* vw = (__m128*) fw;
    const __m128 vcst = _mm_set1_ps(cst);
    for (i = 0; i < end; ++i) {
        vw[0] = _mm_mul_ps(vw[0], vcst);
        vw[1] = _mm_mul_ps(vw[1], vcst);
        /* Advance by 2 * NB_ELTS_V8 floats: skip the interleaved group */
        /* belonging to the other band */
        vw += 2 * (NB_ELTS_V8 * sizeof(OPJ_FLOAT32) / sizeof(__m128));
    }
#else
    OPJ_UINT32 c;
    for (i = 0; i < end; ++i) {
        /* Stride of 2 * NB_ELTS_V8: only every other group is scaled */
        for (c = 0; c < NB_ELTS_V8; c++) {
            fw[i * 2 * NB_ELTS_V8 + c] *= cst;
        }
    }
#endif
}
+
+static void opj_v8dwt_encode_step2(OPJ_FLOAT32* fl, OPJ_FLOAT32* fw,
+ OPJ_UINT32 end,
+ OPJ_UINT32 m,
+ OPJ_FLOAT32 cst)
+{
+ OPJ_UINT32 i;
+ OPJ_UINT32 imax = opj_uint_min(end, m);
+#ifdef __SSE__
+ __m128* vw = (__m128*) fw;
+ __m128 vcst = _mm_set1_ps(cst);
+ if (imax > 0) {
+ __m128* vl = (__m128*) fl;
+ vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vl[0], vw[0]), vcst));
+ vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vl[1], vw[1]), vcst));
+ vw += 2 * (NB_ELTS_V8 * sizeof(OPJ_FLOAT32) / sizeof(__m128));
+ i = 1;
+
+ for (; i < imax; ++i) {
+ vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vw[-4], vw[0]), vcst));
+ vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vw[-3], vw[1]), vcst));
+ vw += 2 * (NB_ELTS_V8 * sizeof(OPJ_FLOAT32) / sizeof(__m128));
+ }
+ }
+ if (m < end) {
+ assert(m + 1 == end);
+ vcst = _mm_add_ps(vcst, vcst);
+ vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(vw[-4], vcst));
+ vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(vw[-3], vcst));
+ }
+#else
+ OPJ_INT32 c;
+ if (imax > 0) {
+ for (c = 0; c < NB_ELTS_V8; c++) {
+ fw[-1 * NB_ELTS_V8 + c] += (fl[0 * NB_ELTS_V8 + c] + fw[0 * NB_ELTS_V8 + c]) *
+ cst;