X-Git-Url: https://main.carlh.net/gitweb/?a=blobdiff_plain;f=libopenjpeg%2Fmct.c;h=870993b06d250b01f4a8163b960dd00934190224;hb=6e0162a8a67932e59489b6f6a3bec01d64d2db8b;hp=f41c9ab05baed69375bd5bcebd2f3e360f4ed0a8;hpb=95bc884365deb41c357583874c23d82eac7cad2d;p=openjpeg.git diff --git a/libopenjpeg/mct.c b/libopenjpeg/mct.c index f41c9ab0..870993b0 100644 --- a/libopenjpeg/mct.c +++ b/libopenjpeg/mct.c @@ -1,9 +1,10 @@ /* + * Copyright (c) 2002-2007, Communications and Remote Sensing Laboratory, Universite catholique de Louvain (UCL), Belgium + * Copyright (c) 2002-2007, Professor Benoit Macq * Copyright (c) 2001-2003, David Janssens * Copyright (c) 2002-2003, Yannick Verschueren - * Copyright (c) 2003-2005, Francois Devaux and Antonin Descampe - * Copyright (c) 2005, Hervé Drolon, FreeImage Team - * Copyright (c) 2002-2005, Communications and remote sensing Laboratory, Universite catholique de Louvain, Belgium + * Copyright (c) 2003-2007, Francois-Olivier Devaux and Antonin Descampe + * Copyright (c) 2005, Herve Drolon, FreeImage Team * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -28,6 +29,10 @@ * POSSIBILITY OF SUCH DAMAGE. */ +#ifdef __SSE__ +#include +#endif + #include "opj_includes.h" /* */ @@ -43,16 +48,20 @@ static const double mct_norms_real[3] = { 1.732, 1.805, 1.573 }; /* */ /* Foward reversible MCT. */ /* */ -void mct_encode(int *c0, int *c1, int *c2, int n) { +void mct_encode( + int* restrict c0, + int* restrict c1, + int* restrict c2, + int n) +{ int i; - for (i = 0; i < n; i++) { - int r, g, b, y, u, v; - r = c0[i]; - g = c1[i]; - b = c2[i]; - y = (r + (g << 1) + b) >> 2; - u = b - g; - v = r - g; + for(i = 0; i < n; ++i) { + int r = c0[i]; + int g = c1[i]; + int b = c2[i]; + int y = (r + (g * 2) + b) >> 2; + int u = b - g; + int v = r - g; c0[i] = y; c1[i] = u; c2[i] = v; @@ -62,16 +71,20 @@ void mct_encode(int *c0, int *c1, int *c2, int n) { /* */ /* Inverse reversible MCT. */ /* */ -void mct_decode(int *c0, int *c1, int *c2, int n) { +void mct_decode( + int* restrict c0, + int* restrict c1, + int* restrict c2, + int n) +{ int i; - for (i = 0; i < n; i++) { - int y, u, v, r, g, b; - y = c0[i]; - u = c1[i]; - v = c2[i]; - g = y - ((u + v) >> 2); - r = v + g; - b = u + g; + for (i = 0; i < n; ++i) { + int y = c0[i]; + int u = c1[i]; + int v = c2[i]; + int g = y - ((u + v) >> 2); + int r = v + g; + int b = u + g; c0[i] = r; c1[i] = g; c2[i] = b; @@ -88,16 +101,20 @@ double mct_getnorm(int compno) { /* */ /* Foward irreversible MCT. */ /* */ -void mct_encode_real(int *c0, int *c1, int *c2, int n) { +void mct_encode_real( + int* restrict c0, + int* restrict c1, + int* restrict c2, + int n) +{ int i; - for (i = 0; i < n; i++) { - int r, g, b, y, u, v; - r = c0[i]; - g = c1[i]; - b = c2[i]; - y = fix_mul(r, 2449) + fix_mul(g, 4809) + fix_mul(b, 934); - u = -fix_mul(r, 1382) - fix_mul(g, 2714) + fix_mul(b, 4096); - v = fix_mul(r, 4096) - fix_mul(g, 3430) - fix_mul(b, 666); + for(i = 0; i < n; ++i) { + int r = c0[i]; + int g = c1[i]; + int b = c2[i]; + int y = fix_mul(r, 2449) + fix_mul(g, 4809) + fix_mul(b, 934); + int u = -fix_mul(r, 1382) - fix_mul(g, 2714) + fix_mul(b, 4096); + int v = fix_mul(r, 4096) - fix_mul(g, 3430) - fix_mul(b, 666); c0[i] = y; c1[i] = u; c2[i] = v; @@ -107,16 +124,58 @@ void mct_encode_real(int *c0, int *c1, int *c2, int n) { /* */ /* Inverse irreversible MCT. */ /* */ -void mct_decode_real(int *c0, int *c1, int *c2, int n) { +void mct_decode_real( + float* restrict c0, + float* restrict c1, + float* restrict c2, + int n) +{ int i; - for (i = 0; i < n; i++) { - int y, u, v, r, g, b; - y = c0[i]; - u = c1[i]; - v = c2[i]; - r = y + fix_mul(v, 11485); - g = y - fix_mul(u, 2819) - fix_mul(v, 5850); - b = y + fix_mul(u, 14516); +#ifdef __SSE__ + __m128 vrv, vgu, vgv, vbu; + vrv = _mm_set1_ps(1.402f); + vgu = _mm_set1_ps(0.34413f); + vgv = _mm_set1_ps(0.71414f); + vbu = _mm_set1_ps(1.772f); + for (i = 0; i < (n >> 3); ++i) { + __m128 vy, vu, vv; + __m128 vr, vg, vb; + + vy = _mm_load_ps(c0); + vu = _mm_load_ps(c1); + vv = _mm_load_ps(c2); + vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv)); + vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)), _mm_mul_ps(vv, vgv)); + vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu)); + _mm_store_ps(c0, vr); + _mm_store_ps(c1, vg); + _mm_store_ps(c2, vb); + c0 += 4; + c1 += 4; + c2 += 4; + + vy = _mm_load_ps(c0); + vu = _mm_load_ps(c1); + vv = _mm_load_ps(c2); + vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv)); + vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)), _mm_mul_ps(vv, vgv)); + vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu)); + _mm_store_ps(c0, vr); + _mm_store_ps(c1, vg); + _mm_store_ps(c2, vb); + c0 += 4; + c1 += 4; + c2 += 4; + } + n &= 7; +#endif + for(i = 0; i < n; ++i) { + float y = c0[i]; + float u = c1[i]; + float v = c2[i]; + float r = y + (v * 1.402f); + float g = y - (u * 0.34413f) - (v * (0.71414f)); + float b = y + (u * 1.772f); c0[i] = r; c1[i] = g; c2[i] = b;