From 8252bdb70f221afd47cb7576803f3e80e45446cf Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 2 Jun 2026 11:31:18 +0300 Subject: [PATCH 1/4] Filter: hoist image bounds and add restrict on row pointers --- src/libImaging/Filter.c | 103 ++++++++++++++++++++++------------------ 1 file changed, 57 insertions(+), 46 deletions(-) diff --git a/src/libImaging/Filter.c b/src/libImaging/Filter.c index f609b2be8c6..df6118ec173 100644 --- a/src/libImaging/Filter.c +++ b/src/libImaging/Filter.c @@ -132,20 +132,25 @@ ImagingFilter3x3(Imaging imOut, Imaging im, const float *kernel, float offset) { _i2f(in0[x + d]) * (kernel)[2]) int x = 0, y = 0; + // restrict safety: assert im and imOut to be separate, + // so we can safely `restrict` the copy pointers below + assert(im->image != imOut->image); + // Hoist for loop optimization. + int xsize = im->xsize, ysize = im->ysize; memcpy(imOut->image[0], im->image[0], im->linesize); if (im->bands == 1) { // Add one time for rounding offset += 0.5; if (im->type == IMAGING_TYPE_INT32) { - for (y = 1; y < im->ysize - 1; y++) { - INT32 *in_1 = (INT32 *)im->image[y - 1]; - INT32 *in0 = (INT32 *)im->image[y]; - INT32 *in1 = (INT32 *)im->image[y + 1]; - INT32 *out = (INT32 *)imOut->image[y]; + for (y = 1; y < ysize - 1; y++) { + INT32 *restrict in_1 = (INT32 *)im->image[y - 1]; + INT32 *restrict in0 = (INT32 *)im->image[y]; + INT32 *restrict in1 = (INT32 *)im->image[y + 1]; + INT32 *restrict out = (INT32 *)imOut->image[y]; out[0] = in0[0]; - for (x = 1; x < im->xsize - 1; x++) { + for (x = 1; x < xsize - 1; x++) { float ss = offset; ss += KERNEL1x3(in1, x, &kernel[0], 1); ss += KERNEL1x3(in0, x, &kernel[3], 1); @@ -166,17 +171,17 @@ ImagingFilter3x3(Imaging imOut, Imaging im, const float *kernel, float offset) { bigendian = 1; } } - for (y = 1; y < im->ysize - 1; y++) { - UINT8 *in_1 = (UINT8 *)im->image[y - 1]; - UINT8 *in0 = (UINT8 *)im->image[y]; - UINT8 *in1 = (UINT8 *)im->image[y + 1]; - UINT8 *out = (UINT8 *)imOut->image[y]; + for (y = 1; y < ysize - 1; y++) { + UINT8 *restrict in_1 = (UINT8 *)im->image[y - 1]; + UINT8 *restrict in0 = (UINT8 *)im->image[y]; + UINT8 *restrict in1 = (UINT8 *)im->image[y + 1]; + UINT8 *restrict out = (UINT8 *)imOut->image[y]; out[0] = in0[0]; if (im->type == IMAGING_TYPE_SPECIAL) { out[1] = in0[1]; } - for (x = 1; x < im->xsize - 1; x++) { + for (x = 1; x < xsize - 1; x++) { float ss = offset; if (im->type == IMAGING_TYPE_SPECIAL) { ss += kernel_i16(3, in1, x, &kernel[0], bigendian); @@ -203,15 +208,15 @@ ImagingFilter3x3(Imaging imOut, Imaging im, const float *kernel, float offset) { } else { // Add one time for rounding offset += 0.5; - for (y = 1; y < im->ysize - 1; y++) { - UINT8 *in_1 = (UINT8 *)im->image[y - 1]; - UINT8 *in0 = (UINT8 *)im->image[y]; - UINT8 *in1 = (UINT8 *)im->image[y + 1]; - UINT8 *out = (UINT8 *)imOut->image[y]; + for (y = 1; y < ysize - 1; y++) { + UINT8 *restrict in_1 = (UINT8 *)im->image[y - 1]; + UINT8 *restrict in0 = (UINT8 *)im->image[y]; + UINT8 *restrict in1 = (UINT8 *)im->image[y + 1]; + UINT8 *restrict out = (UINT8 *)imOut->image[y]; memcpy(out, in0, sizeof(UINT32)); if (im->bands == 2) { - for (x = 1; x < im->xsize - 1; x++) { + for (x = 1; x < xsize - 1; x++) { float ss0 = offset; float ss3 = offset; UINT32 v; @@ -225,7 +230,7 @@ ImagingFilter3x3(Imaging imOut, Imaging im, const float *kernel, float offset) { memcpy(out + x * sizeof(v), &v, sizeof(v)); } } else if (im->bands == 3) { - for (x = 1; x < im->xsize - 1; x++) { + for (x = 1; x < xsize - 1; x++) { float ss0 = offset; float ss1 = offset; float ss2 = offset; @@ -243,7 +248,7 @@ ImagingFilter3x3(Imaging imOut, Imaging im, const float *kernel, float offset) { memcpy(out + x * sizeof(v), &v, sizeof(v)); } } else if (im->bands == 4) { - for (x = 1; x < im->xsize - 1; x++) { + for (x = 1; x < xsize - 1; x++) { float ss0 = offset; float ss1 = offset; float ss2 = offset; @@ -280,23 +285,29 @@ ImagingFilter5x5(Imaging imOut, Imaging im, const float *kernel, float offset) { int x = 0, y = 0; + // restrict safety: assert im and imOut to be separate, + // so we can safely `restrict` the copy pointers below + assert(im->image != imOut->image); + // Hoist for loop optimization. + int xsize = im->xsize, ysize = im->ysize; + memcpy(imOut->image[0], im->image[0], im->linesize); memcpy(imOut->image[1], im->image[1], im->linesize); if (im->bands == 1) { // Add one time for rounding offset += 0.5; if (im->type == IMAGING_TYPE_INT32) { - for (y = 2; y < im->ysize - 2; y++) { - INT32 *in_2 = (INT32 *)im->image[y - 2]; - INT32 *in_1 = (INT32 *)im->image[y - 1]; - INT32 *in0 = (INT32 *)im->image[y]; - INT32 *in1 = (INT32 *)im->image[y + 1]; - INT32 *in2 = (INT32 *)im->image[y + 2]; - INT32 *out = (INT32 *)imOut->image[y]; + for (y = 2; y < ysize - 2; y++) { + INT32 *restrict in_2 = (INT32 *)im->image[y - 2]; + INT32 *restrict in_1 = (INT32 *)im->image[y - 1]; + INT32 *restrict in0 = (INT32 *)im->image[y]; + INT32 *restrict in1 = (INT32 *)im->image[y + 1]; + INT32 *restrict in2 = (INT32 *)im->image[y + 2]; + INT32 *restrict out = (INT32 *)imOut->image[y]; out[0] = in0[0]; out[1] = in0[1]; - for (x = 2; x < im->xsize - 2; x++) { + for (x = 2; x < xsize - 2; x++) { float ss = offset; ss += KERNEL1x5(in2, x, &kernel[0], 1); ss += KERNEL1x5(in1, x, &kernel[5], 1); @@ -320,13 +331,13 @@ ImagingFilter5x5(Imaging imOut, Imaging im, const float *kernel, float offset) { bigendian = 1; } } - for (y = 2; y < im->ysize - 2; y++) { - UINT8 *in_2 = (UINT8 *)im->image[y - 2]; - UINT8 *in_1 = (UINT8 *)im->image[y - 1]; - UINT8 *in0 = (UINT8 *)im->image[y]; - UINT8 *in1 = (UINT8 *)im->image[y + 1]; - UINT8 *in2 = (UINT8 *)im->image[y + 2]; - UINT8 *out = (UINT8 *)imOut->image[y]; + for (y = 2; y < ysize - 2; y++) { + UINT8 *restrict in_2 = (UINT8 *)im->image[y - 2]; + UINT8 *restrict in_1 = (UINT8 *)im->image[y - 1]; + UINT8 *restrict in0 = (UINT8 *)im->image[y]; + UINT8 *restrict in1 = (UINT8 *)im->image[y + 1]; + UINT8 *restrict in2 = (UINT8 *)im->image[y + 2]; + UINT8 *restrict out = (UINT8 *)imOut->image[y]; out[0] = in0[0]; out[1] = in0[1]; @@ -334,7 +345,7 @@ ImagingFilter5x5(Imaging imOut, Imaging im, const float *kernel, float offset) { out[2] = in0[2]; out[3] = in0[3]; } - for (x = 2; x < im->xsize - 2; x++) { + for (x = 2; x < xsize - 2; x++) { float ss = offset; if (im->type == IMAGING_TYPE_SPECIAL) { ss += kernel_i16(5, in2, x, &kernel[0], bigendian); @@ -368,17 +379,17 @@ ImagingFilter5x5(Imaging imOut, Imaging im, const float *kernel, float offset) { } else { // Add one time for rounding offset += 0.5; - for (y = 2; y < im->ysize - 2; y++) { - UINT8 *in_2 = (UINT8 *)im->image[y - 2]; - UINT8 *in_1 = (UINT8 *)im->image[y - 1]; - UINT8 *in0 = (UINT8 *)im->image[y]; - UINT8 *in1 = (UINT8 *)im->image[y + 1]; - UINT8 *in2 = (UINT8 *)im->image[y + 2]; - UINT8 *out = (UINT8 *)imOut->image[y]; + for (y = 2; y < ysize - 2; y++) { + UINT8 *restrict in_2 = (UINT8 *)im->image[y - 2]; + UINT8 *restrict in_1 = (UINT8 *)im->image[y - 1]; + UINT8 *restrict in0 = (UINT8 *)im->image[y]; + UINT8 *restrict in1 = (UINT8 *)im->image[y + 1]; + UINT8 *restrict in2 = (UINT8 *)im->image[y + 2]; + UINT8 *restrict out = (UINT8 *)imOut->image[y]; memcpy(out, in0, sizeof(UINT32) * 2); if (im->bands == 2) { - for (x = 2; x < im->xsize - 2; x++) { + for (x = 2; x < xsize - 2; x++) { float ss0 = offset; float ss3 = offset; UINT32 v; @@ -396,7 +407,7 @@ ImagingFilter5x5(Imaging imOut, Imaging im, const float *kernel, float offset) { memcpy(out + x * sizeof(v), &v, sizeof(v)); } } else if (im->bands == 3) { - for (x = 2; x < im->xsize - 2; x++) { + for (x = 2; x < xsize - 2; x++) { float ss0 = offset; float ss1 = offset; float ss2 = offset; @@ -420,7 +431,7 @@ ImagingFilter5x5(Imaging imOut, Imaging im, const float *kernel, float offset) { memcpy(out + x * sizeof(v), &v, sizeof(v)); } } else if (im->bands == 4) { - for (x = 2; x < im->xsize - 2; x++) { + for (x = 2; x < xsize - 2; x++) { float ss0 = offset; float ss1 = offset; float ss2 = offset; From 75679d74f75f99a67c5bf65d97f54037a63427e4 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 30 Jun 2026 09:00:27 +0300 Subject: [PATCH 2/4] Filter: mark internal filter functions static --- src/libImaging/Filter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/libImaging/Filter.c b/src/libImaging/Filter.c index df6118ec173..a8a63583081 100644 --- a/src/libImaging/Filter.c +++ b/src/libImaging/Filter.c @@ -125,7 +125,7 @@ kernel_i16(int size, UINT8 *in0, int x, const float *kernel, int bigendian) { return result; } -void +static void ImagingFilter3x3(Imaging imOut, Imaging im, const float *kernel, float offset) { #define KERNEL1x3(in0, x, kernel, d) \ (_i2f(in0[x - d]) * (kernel)[0] + _i2f(in0[x]) * (kernel)[1] + \ @@ -276,7 +276,7 @@ ImagingFilter3x3(Imaging imOut, Imaging im, const float *kernel, float offset) { memcpy(imOut->image[y], im->image[y], im->linesize); } -void +static void ImagingFilter5x5(Imaging imOut, Imaging im, const float *kernel, float offset) { #define KERNEL1x5(in0, x, kernel, d) \ (_i2f(in0[x - d - d]) * (kernel)[0] + _i2f(in0[x - d]) * (kernel)[1] + \ From 0975cdd1b1f1aeb59c480669d37dd2867119e4a9 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 30 Jun 2026 09:14:22 +0300 Subject: [PATCH 3/4] Filter: speed up clip8/clip32 --- src/libImaging/Filter.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/src/libImaging/Filter.c b/src/libImaging/Filter.c index a8a63583081..fd5ff112193 100644 --- a/src/libImaging/Filter.c +++ b/src/libImaging/Filter.c @@ -24,30 +24,27 @@ * FIXME: Implement image processing gradient filters */ +#include #include "Imaging.h" #define ROUND_UP(f) ((int)((f) >= 0.0 ? (f) + 0.5F : (f) - 0.5F)) +#define INT32_MAX_F 2147483647.0F static inline UINT8 clip8(float in) { - if (in <= 0.0) { - return 0; - } - if (in >= 255.0) { - return 255; - } + // Branchless clamp to [0, 255]. + in = in < 0.0f ? 0.0f : in; + in = in > 255.0f ? 255.0f : in; return (UINT8)in; } static inline INT32 clip32(float in) { - if (in <= 0.0) { - return 0; - } - if (in >= pow(2, 31) - 1) { - return pow(2, 31) - 1; - } - return (INT32)in; + // Clamp the low bound branchlessly (maxss). + in = in < 0.0f ? 0.0f : in; + // Must explicitly return INT32_MAX for the high bound + // to avoid incorrect overflow to INT_MIN. + return in >= INT32_MAX_F ? INT32_MAX : (INT32)in; } Imaging From 7c81174c5b0c693104afd351daf9a914bb429d1c Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 30 Jun 2026 10:42:48 +0300 Subject: [PATCH 4/4] Filter: use direct float cast instead of i2f helper _i2f is a compatibility shim for `(float)` for pre-2015 versions of GCC (added in 2017 in ef1df617f932edb73466af43a946d890f7f81bd4). Compilers can presumably optimize `(float)` better than an inline function call. The helper is then unused and removed. --- src/libImaging/Filter.c | 20 +++++++++----------- src/libImaging/ImagingUtils.h | 14 -------------- 2 files changed, 9 insertions(+), 25 deletions(-) diff --git a/src/libImaging/Filter.c b/src/libImaging/Filter.c index fd5ff112193..b5f971d40c2 100644 --- a/src/libImaging/Filter.c +++ b/src/libImaging/Filter.c @@ -113,10 +113,8 @@ kernel_i16(int size, UINT8 *in0, int x, const float *kernel, int bigendian) { int half_size = (size - 1) / 2; for (i = 0; i < size; i++) { int x1 = x + i - half_size; - result += _i2f( - in0[x1 * 2 + (bigendian ? 1 : 0)] + - (in0[x1 * 2 + (bigendian ? 0 : 1)] >> 8) - ) * + result += (float)(in0[x1 * 2 + (bigendian ? 1 : 0)] + + (in0[x1 * 2 + (bigendian ? 0 : 1)] >> 8)) * kernel[i]; } return result; @@ -124,9 +122,9 @@ kernel_i16(int size, UINT8 *in0, int x, const float *kernel, int bigendian) { static void ImagingFilter3x3(Imaging imOut, Imaging im, const float *kernel, float offset) { -#define KERNEL1x3(in0, x, kernel, d) \ - (_i2f(in0[x - d]) * (kernel)[0] + _i2f(in0[x]) * (kernel)[1] + \ - _i2f(in0[x + d]) * (kernel)[2]) +#define KERNEL1x3(in0, x, kernel, d) \ + ((float)(in0[x - d]) * (kernel)[0] + (float)(in0[x]) * (kernel)[1] + \ + (float)(in0[x + d]) * (kernel)[2]) int x = 0, y = 0; // restrict safety: assert im and imOut to be separate, @@ -275,10 +273,10 @@ ImagingFilter3x3(Imaging imOut, Imaging im, const float *kernel, float offset) { static void ImagingFilter5x5(Imaging imOut, Imaging im, const float *kernel, float offset) { -#define KERNEL1x5(in0, x, kernel, d) \ - (_i2f(in0[x - d - d]) * (kernel)[0] + _i2f(in0[x - d]) * (kernel)[1] + \ - _i2f(in0[x]) * (kernel)[2] + _i2f(in0[x + d]) * (kernel)[3] + \ - _i2f(in0[x + d + d]) * (kernel)[4]) +#define KERNEL1x5(in0, x, kernel, d) \ + ((float)(in0[x - d - d]) * (kernel)[0] + (float)(in0[x - d]) * (kernel)[1] + \ + (float)(in0[x]) * (kernel)[2] + (float)(in0[x + d]) * (kernel)[3] + \ + (float)(in0[x + d + d]) * (kernel)[4]) int x = 0, y = 0; diff --git a/src/libImaging/ImagingUtils.h b/src/libImaging/ImagingUtils.h index a362780d0ee..de8245b3b46 100644 --- a/src/libImaging/ImagingUtils.h +++ b/src/libImaging/ImagingUtils.h @@ -28,17 +28,3 @@ #define CLIP8(v) ((v) <= 0 ? 0 : (v) < 256 ? (v) : 255) #define CLIP16(v) ((v) <= 0 ? 0 : (v) < 65536 ? (v) : 65535) - -/* This is to work around a bug in GCC prior 4.9 in 64 bit mode. - GCC generates code with partial dependency which is 3 times slower. - See: https://stackoverflow.com/a/26588074/253146 */ -#if defined(__x86_64__) && defined(__SSE__) && !defined(__NO_INLINE__) && \ - !defined(__clang__) && defined(GCC_VERSION) && (GCC_VERSION < 40900) -static float __attribute__((always_inline)) inline _i2f(int v) { - float x; - __asm__("xorps %0, %0; cvtsi2ss %1, %0" : "=x"(x) : "r"(v)); - return x; -} -#else -static float inline _i2f(int v) { return (float)v; } -#endif