diff --git a/src/libImaging/Filter.c b/src/libImaging/Filter.c index f609b2be8c6..b5f971d40c2 100644 --- a/src/libImaging/Filter.c +++ b/src/libImaging/Filter.c @@ -24,30 +24,27 @@ * FIXME: Implement image processing gradient filters */ +#include #include "Imaging.h" #define ROUND_UP(f) ((int)((f) >= 0.0 ? (f) + 0.5F : (f) - 0.5F)) +#define INT32_MAX_F 2147483647.0F static inline UINT8 clip8(float in) { - if (in <= 0.0) { - return 0; - } - if (in >= 255.0) { - return 255; - } + // Branchless clamp to [0, 255]. + in = in < 0.0f ? 0.0f : in; + in = in > 255.0f ? 255.0f : in; return (UINT8)in; } static inline INT32 clip32(float in) { - if (in <= 0.0) { - return 0; - } - if (in >= pow(2, 31) - 1) { - return pow(2, 31) - 1; - } - return (INT32)in; + // Clamp the low bound branchlessly (maxss). + in = in < 0.0f ? 0.0f : in; + // Must explicitly return INT32_MAX for the high bound + // to avoid incorrect overflow to INT_MIN. + return in >= INT32_MAX_F ? INT32_MAX : (INT32)in; } Imaging @@ -116,36 +113,39 @@ kernel_i16(int size, UINT8 *in0, int x, const float *kernel, int bigendian) { int half_size = (size - 1) / 2; for (i = 0; i < size; i++) { int x1 = x + i - half_size; - result += _i2f( - in0[x1 * 2 + (bigendian ? 1 : 0)] + - (in0[x1 * 2 + (bigendian ? 0 : 1)] >> 8) - ) * + result += (float)(in0[x1 * 2 + (bigendian ? 1 : 0)] + + (in0[x1 * 2 + (bigendian ? 0 : 1)] >> 8)) * kernel[i]; } return result; } -void +static void ImagingFilter3x3(Imaging imOut, Imaging im, const float *kernel, float offset) { -#define KERNEL1x3(in0, x, kernel, d) \ - (_i2f(in0[x - d]) * (kernel)[0] + _i2f(in0[x]) * (kernel)[1] + \ - _i2f(in0[x + d]) * (kernel)[2]) +#define KERNEL1x3(in0, x, kernel, d) \ + ((float)(in0[x - d]) * (kernel)[0] + (float)(in0[x]) * (kernel)[1] + \ + (float)(in0[x + d]) * (kernel)[2]) int x = 0, y = 0; + // restrict safety: assert im and imOut to be separate, + // so we can safely `restrict` the copy pointers below + assert(im->image != imOut->image); + // Hoist for loop optimization. + int xsize = im->xsize, ysize = im->ysize; memcpy(imOut->image[0], im->image[0], im->linesize); if (im->bands == 1) { // Add one time for rounding offset += 0.5; if (im->type == IMAGING_TYPE_INT32) { - for (y = 1; y < im->ysize - 1; y++) { - INT32 *in_1 = (INT32 *)im->image[y - 1]; - INT32 *in0 = (INT32 *)im->image[y]; - INT32 *in1 = (INT32 *)im->image[y + 1]; - INT32 *out = (INT32 *)imOut->image[y]; + for (y = 1; y < ysize - 1; y++) { + INT32 *restrict in_1 = (INT32 *)im->image[y - 1]; + INT32 *restrict in0 = (INT32 *)im->image[y]; + INT32 *restrict in1 = (INT32 *)im->image[y + 1]; + INT32 *restrict out = (INT32 *)imOut->image[y]; out[0] = in0[0]; - for (x = 1; x < im->xsize - 1; x++) { + for (x = 1; x < xsize - 1; x++) { float ss = offset; ss += KERNEL1x3(in1, x, &kernel[0], 1); ss += KERNEL1x3(in0, x, &kernel[3], 1); @@ -166,17 +166,17 @@ ImagingFilter3x3(Imaging imOut, Imaging im, const float *kernel, float offset) { bigendian = 1; } } - for (y = 1; y < im->ysize - 1; y++) { - UINT8 *in_1 = (UINT8 *)im->image[y - 1]; - UINT8 *in0 = (UINT8 *)im->image[y]; - UINT8 *in1 = (UINT8 *)im->image[y + 1]; - UINT8 *out = (UINT8 *)imOut->image[y]; + for (y = 1; y < ysize - 1; y++) { + UINT8 *restrict in_1 = (UINT8 *)im->image[y - 1]; + UINT8 *restrict in0 = (UINT8 *)im->image[y]; + UINT8 *restrict in1 = (UINT8 *)im->image[y + 1]; + UINT8 *restrict out = (UINT8 *)imOut->image[y]; out[0] = in0[0]; if (im->type == IMAGING_TYPE_SPECIAL) { out[1] = in0[1]; } - for (x = 1; x < im->xsize - 1; x++) { + for (x = 1; x < xsize - 1; x++) { float ss = offset; if (im->type == IMAGING_TYPE_SPECIAL) { ss += kernel_i16(3, in1, x, &kernel[0], bigendian); @@ -203,15 +203,15 @@ ImagingFilter3x3(Imaging imOut, Imaging im, const float *kernel, float offset) { } else { // Add one time for rounding offset += 0.5; - for (y = 1; y < im->ysize - 1; y++) { - UINT8 *in_1 = (UINT8 *)im->image[y - 1]; - UINT8 *in0 = (UINT8 *)im->image[y]; - UINT8 *in1 = (UINT8 *)im->image[y + 1]; - UINT8 *out = (UINT8 *)imOut->image[y]; + for (y = 1; y < ysize - 1; y++) { + UINT8 *restrict in_1 = (UINT8 *)im->image[y - 1]; + UINT8 *restrict in0 = (UINT8 *)im->image[y]; + UINT8 *restrict in1 = (UINT8 *)im->image[y + 1]; + UINT8 *restrict out = (UINT8 *)imOut->image[y]; memcpy(out, in0, sizeof(UINT32)); if (im->bands == 2) { - for (x = 1; x < im->xsize - 1; x++) { + for (x = 1; x < xsize - 1; x++) { float ss0 = offset; float ss3 = offset; UINT32 v; @@ -225,7 +225,7 @@ ImagingFilter3x3(Imaging imOut, Imaging im, const float *kernel, float offset) { memcpy(out + x * sizeof(v), &v, sizeof(v)); } } else if (im->bands == 3) { - for (x = 1; x < im->xsize - 1; x++) { + for (x = 1; x < xsize - 1; x++) { float ss0 = offset; float ss1 = offset; float ss2 = offset; @@ -243,7 +243,7 @@ ImagingFilter3x3(Imaging imOut, Imaging im, const float *kernel, float offset) { memcpy(out + x * sizeof(v), &v, sizeof(v)); } } else if (im->bands == 4) { - for (x = 1; x < im->xsize - 1; x++) { + for (x = 1; x < xsize - 1; x++) { float ss0 = offset; float ss1 = offset; float ss2 = offset; @@ -271,32 +271,38 @@ ImagingFilter3x3(Imaging imOut, Imaging im, const float *kernel, float offset) { memcpy(imOut->image[y], im->image[y], im->linesize); } -void +static void ImagingFilter5x5(Imaging imOut, Imaging im, const float *kernel, float offset) { -#define KERNEL1x5(in0, x, kernel, d) \ - (_i2f(in0[x - d - d]) * (kernel)[0] + _i2f(in0[x - d]) * (kernel)[1] + \ - _i2f(in0[x]) * (kernel)[2] + _i2f(in0[x + d]) * (kernel)[3] + \ - _i2f(in0[x + d + d]) * (kernel)[4]) +#define KERNEL1x5(in0, x, kernel, d) \ + ((float)(in0[x - d - d]) * (kernel)[0] + (float)(in0[x - d]) * (kernel)[1] + \ + (float)(in0[x]) * (kernel)[2] + (float)(in0[x + d]) * (kernel)[3] + \ + (float)(in0[x + d + d]) * (kernel)[4]) int x = 0, y = 0; + // restrict safety: assert im and imOut to be separate, + // so we can safely `restrict` the copy pointers below + assert(im->image != imOut->image); + // Hoist for loop optimization. + int xsize = im->xsize, ysize = im->ysize; + memcpy(imOut->image[0], im->image[0], im->linesize); memcpy(imOut->image[1], im->image[1], im->linesize); if (im->bands == 1) { // Add one time for rounding offset += 0.5; if (im->type == IMAGING_TYPE_INT32) { - for (y = 2; y < im->ysize - 2; y++) { - INT32 *in_2 = (INT32 *)im->image[y - 2]; - INT32 *in_1 = (INT32 *)im->image[y - 1]; - INT32 *in0 = (INT32 *)im->image[y]; - INT32 *in1 = (INT32 *)im->image[y + 1]; - INT32 *in2 = (INT32 *)im->image[y + 2]; - INT32 *out = (INT32 *)imOut->image[y]; + for (y = 2; y < ysize - 2; y++) { + INT32 *restrict in_2 = (INT32 *)im->image[y - 2]; + INT32 *restrict in_1 = (INT32 *)im->image[y - 1]; + INT32 *restrict in0 = (INT32 *)im->image[y]; + INT32 *restrict in1 = (INT32 *)im->image[y + 1]; + INT32 *restrict in2 = (INT32 *)im->image[y + 2]; + INT32 *restrict out = (INT32 *)imOut->image[y]; out[0] = in0[0]; out[1] = in0[1]; - for (x = 2; x < im->xsize - 2; x++) { + for (x = 2; x < xsize - 2; x++) { float ss = offset; ss += KERNEL1x5(in2, x, &kernel[0], 1); ss += KERNEL1x5(in1, x, &kernel[5], 1); @@ -320,13 +326,13 @@ ImagingFilter5x5(Imaging imOut, Imaging im, const float *kernel, float offset) { bigendian = 1; } } - for (y = 2; y < im->ysize - 2; y++) { - UINT8 *in_2 = (UINT8 *)im->image[y - 2]; - UINT8 *in_1 = (UINT8 *)im->image[y - 1]; - UINT8 *in0 = (UINT8 *)im->image[y]; - UINT8 *in1 = (UINT8 *)im->image[y + 1]; - UINT8 *in2 = (UINT8 *)im->image[y + 2]; - UINT8 *out = (UINT8 *)imOut->image[y]; + for (y = 2; y < ysize - 2; y++) { + UINT8 *restrict in_2 = (UINT8 *)im->image[y - 2]; + UINT8 *restrict in_1 = (UINT8 *)im->image[y - 1]; + UINT8 *restrict in0 = (UINT8 *)im->image[y]; + UINT8 *restrict in1 = (UINT8 *)im->image[y + 1]; + UINT8 *restrict in2 = (UINT8 *)im->image[y + 2]; + UINT8 *restrict out = (UINT8 *)imOut->image[y]; out[0] = in0[0]; out[1] = in0[1]; @@ -334,7 +340,7 @@ ImagingFilter5x5(Imaging imOut, Imaging im, const float *kernel, float offset) { out[2] = in0[2]; out[3] = in0[3]; } - for (x = 2; x < im->xsize - 2; x++) { + for (x = 2; x < xsize - 2; x++) { float ss = offset; if (im->type == IMAGING_TYPE_SPECIAL) { ss += kernel_i16(5, in2, x, &kernel[0], bigendian); @@ -368,17 +374,17 @@ ImagingFilter5x5(Imaging imOut, Imaging im, const float *kernel, float offset) { } else { // Add one time for rounding offset += 0.5; - for (y = 2; y < im->ysize - 2; y++) { - UINT8 *in_2 = (UINT8 *)im->image[y - 2]; - UINT8 *in_1 = (UINT8 *)im->image[y - 1]; - UINT8 *in0 = (UINT8 *)im->image[y]; - UINT8 *in1 = (UINT8 *)im->image[y + 1]; - UINT8 *in2 = (UINT8 *)im->image[y + 2]; - UINT8 *out = (UINT8 *)imOut->image[y]; + for (y = 2; y < ysize - 2; y++) { + UINT8 *restrict in_2 = (UINT8 *)im->image[y - 2]; + UINT8 *restrict in_1 = (UINT8 *)im->image[y - 1]; + UINT8 *restrict in0 = (UINT8 *)im->image[y]; + UINT8 *restrict in1 = (UINT8 *)im->image[y + 1]; + UINT8 *restrict in2 = (UINT8 *)im->image[y + 2]; + UINT8 *restrict out = (UINT8 *)imOut->image[y]; memcpy(out, in0, sizeof(UINT32) * 2); if (im->bands == 2) { - for (x = 2; x < im->xsize - 2; x++) { + for (x = 2; x < xsize - 2; x++) { float ss0 = offset; float ss3 = offset; UINT32 v; @@ -396,7 +402,7 @@ ImagingFilter5x5(Imaging imOut, Imaging im, const float *kernel, float offset) { memcpy(out + x * sizeof(v), &v, sizeof(v)); } } else if (im->bands == 3) { - for (x = 2; x < im->xsize - 2; x++) { + for (x = 2; x < xsize - 2; x++) { float ss0 = offset; float ss1 = offset; float ss2 = offset; @@ -420,7 +426,7 @@ ImagingFilter5x5(Imaging imOut, Imaging im, const float *kernel, float offset) { memcpy(out + x * sizeof(v), &v, sizeof(v)); } } else if (im->bands == 4) { - for (x = 2; x < im->xsize - 2; x++) { + for (x = 2; x < xsize - 2; x++) { float ss0 = offset; float ss1 = offset; float ss2 = offset; diff --git a/src/libImaging/ImagingUtils.h b/src/libImaging/ImagingUtils.h index a362780d0ee..de8245b3b46 100644 --- a/src/libImaging/ImagingUtils.h +++ b/src/libImaging/ImagingUtils.h @@ -28,17 +28,3 @@ #define CLIP8(v) ((v) <= 0 ? 0 : (v) < 256 ? (v) : 255) #define CLIP16(v) ((v) <= 0 ? 0 : (v) < 65536 ? (v) : 65535) - -/* This is to work around a bug in GCC prior 4.9 in 64 bit mode. - GCC generates code with partial dependency which is 3 times slower. - See: https://stackoverflow.com/a/26588074/253146 */ -#if defined(__x86_64__) && defined(__SSE__) && !defined(__NO_INLINE__) && \ - !defined(__clang__) && defined(GCC_VERSION) && (GCC_VERSION < 40900) -static float __attribute__((always_inline)) inline _i2f(int v) { - float x; - __asm__("xorps %0, %0; cvtsi2ss %1, %0" : "=x"(x) : "r"(v)); - return x; -} -#else -static float inline _i2f(int v) { return (float)v; } -#endif