From 36823705b7a439bd4af70e319df0dbdc4f008503 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 1 Jun 2026 03:26:53 -0700 Subject: [PATCH 1/5] Optimize FFT for strided input to avoid oversized allocation --- dpnp/fft/dpnp_utils_fft.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/dpnp/fft/dpnp_utils_fft.py b/dpnp/fft/dpnp_utils_fft.py index 733436ab988..5b6dd3c2432 100644 --- a/dpnp/fft/dpnp_utils_fft.py +++ b/dpnp/fft/dpnp_utils_fft.py @@ -408,12 +408,25 @@ def _fft(a, norm, out, forward, in_place, c2c, axes, batch_fft=True): a = dpnp.reshape(a, local_shape) index = 1 + if not a.flags.c_contiguous: # cuFFT requires input arrays to be C-contiguous (row-major) # for correct execution - if ( - dpnp.is_cuda_backend(a) and not a.flags.c_contiguous - ): # pragma: no cover + if dpnp.is_cuda_backend(a): # pragma: no cover a = dpnp.ascontiguousarray(a) + else: + # Check if the memory footprint of the strides exceeds + # the number of elements. + # If so, copy to contiguous to avoid oversized allocation + # for the output array and unnecessary copy to contiguous + # after oneMKL FFT + _strides = dpnp.get_usm_ndarray(a).strides + _shape = a.shape + # max element offset reachable by the strides + max_disp = sum( + st * (sh - 1) for st, sh in zip(_strides, _shape) if st > 0 + ) + if (max_disp + 1) > a.size: + a = dpnp.ascontiguousarray(a) # w/a for cuFFT to avoid "Invalid strides" error when # the last dimension is 1 and there are multiple axes From fbb13bddb3584f93c33f1f724f89f2284560e0f5 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 1 Jun 2026 04:10:18 -0700 Subject: [PATCH 2/5] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6472d90153c..6d997782a67 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ This release is compatible with NumPy 2.4.5. 
* Updated `searchsorted` implementations to align with the 2025.12 array API spec [gh-2902](https://github.com/IntelPython/dpnp/pull/2902) * Updated tests to align with NumPy 2.4.5 compatibility [gh-2920](https://github.com/IntelPython/dpnp/pull/2920) * Replaced `.pxi` includes in `dpnp.tensor` with modular `.pxd`/`.pyx` Cython imports [#2913](https://github.com/IntelPython/dpnp/pull/2913) +* Improved performance of `dpnp.fft` functions for complex strided input by avoiding oversized allocations and extra copies [#2939](https://github.com/IntelPython/dpnp/pull/2939) ### Deprecated From 7d1d7a2174975d50e32f484f8e67a6af96a5f99a Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 4 Jun 2026 04:24:09 -0700 Subject: [PATCH 3/5] Update comment to clarify negative strides handling --- dpnp/fft/dpnp_utils_fft.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dpnp/fft/dpnp_utils_fft.py b/dpnp/fft/dpnp_utils_fft.py index 5b6dd3c2432..dadf8b2fc7f 100644 --- a/dpnp/fft/dpnp_utils_fft.py +++ b/dpnp/fft/dpnp_utils_fft.py @@ -421,7 +421,9 @@ def _fft(a, norm, out, forward, in_place, c2c, axes, batch_fft=True): # after oneMKL FFT _strides = dpnp.get_usm_ndarray(a).strides _shape = a.shape - # max element offset reachable by the strides + # Max element displacement reachable by the strides. 
+ # Negative strides are handled by _copy_array, so only + # positive strides are possible here max_disp = sum( st * (sh - 1) for st, sh in zip(_strides, _shape) if st > 0 ) From a9419ae0dd8936186d917b8508135968fd34eafc Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 18 Jun 2026 06:42:29 -0700 Subject: [PATCH 4/5] Apply remarks --- dpnp/fft/dpnp_utils_fft.py | 11 ++++++----- dpnp/tests/test_fft.py | 20 ++++++++++++++++++++ 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/dpnp/fft/dpnp_utils_fft.py b/dpnp/fft/dpnp_utils_fft.py index dadf8b2fc7f..ee00136c5d1 100644 --- a/dpnp/fft/dpnp_utils_fft.py +++ b/dpnp/fft/dpnp_utils_fft.py @@ -419,13 +419,14 @@ def _fft(a, norm, out, forward, in_place, c2c, axes, batch_fft=True): # If so, copy to contiguous to avoid oversized allocation # for the output array and unnecessary copy to contiguous # after oneMKL FFT - _strides = dpnp.get_usm_ndarray(a).strides + elem_strides = dpnp.get_usm_ndarray(a).strides _shape = a.shape - # Max element displacement reachable by the strides. - # Negative strides are handled by _copy_array, so only - # positive strides are possible here + # Max element displacement reachable by positive strides. 
+ # Negative strides are handled by _copy_array; + # zero strides are safely ignored as they reuse the same + # memory location and don't extend the footprint max_disp = sum( - st * (sh - 1) for st, sh in zip(_strides, _shape) if st > 0 + st * (sh - 1) for st, sh in zip(elem_strides, _shape) if st > 0 ) if (max_disp + 1) > a.size: a = dpnp.ascontiguousarray(a) diff --git a/dpnp/tests/test_fft.py b/dpnp/tests/test_fft.py index f8cc95a7a3c..b8669714724 100644 --- a/dpnp/tests/test_fft.py +++ b/dpnp/tests/test_fft.py @@ -234,6 +234,26 @@ def test_strided_2d(self, stride_x, stride_y): expected = numpy.fft.fft(a) assert_dtype_allclose(result, expected) + def test_non_contiguous_no_copy(self): + a = generate_random_numpy_array((4, 5, 6), dtype=numpy.complex64) + # Non-contiguous input with compact footprint (no copy needed) + ia = dpnp.moveaxis(dpnp.array(a), 0, -1) + a_np = dpnp.asnumpy(ia) + + result = dpnp.fft.fft(ia) + expected = numpy.fft.fft(a_np) + assert_dtype_allclose(result, expected) + + @pytest.mark.parametrize("slc", [numpy.s_[::2, :], numpy.s_[:, ::3]]) + def test_non_contiguous_with_copy(self, slc): + # Strided input with oversized footprint (triggers copy) + a = generate_random_numpy_array((10, 12), dtype=numpy.complex64) + ia = dpnp.array(a)[slc] + + result = dpnp.fft.fft(ia) + expected = numpy.fft.fft(a[slc]) + assert_dtype_allclose(result, expected) + def test_empty_array(self): a = numpy.empty((10, 0, 4), dtype=numpy.complex64) ia = dpnp.array(a) From ead0efff1b500a6cd73d296470e41e0424b15b2f Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 19 Jun 2026 03:31:37 -0700 Subject: [PATCH 5/5] Apply remarks --- dpnp/fft/dpnp_utils_fft.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dpnp/fft/dpnp_utils_fft.py b/dpnp/fft/dpnp_utils_fft.py index ee00136c5d1..3f5af951103 100644 --- a/dpnp/fft/dpnp_utils_fft.py +++ b/dpnp/fft/dpnp_utils_fft.py @@ -408,6 +408,8 @@ def _fft(a, norm, out, forward, in_place, c2c, axes, 
batch_fft=True): a = dpnp.reshape(a, local_shape) index = 1 + elem_strides = dpnp.get_usm_ndarray(a).strides + if not a.flags.c_contiguous: # cuFFT requires input arrays to be C-contiguous (row-major) # for correct execution @@ -419,14 +421,13 @@ def _fft(a, norm, out, forward, in_place, c2c, axes, batch_fft=True): # If so, copy to contiguous to avoid oversized allocation # for the output array and unnecessary copy to contiguous # after oneMKL FFT - elem_strides = dpnp.get_usm_ndarray(a).strides - _shape = a.shape + a_shape = a.shape # Max element displacement reachable by positive strides. # Negative strides are handled by _copy_array; # zero strides are safely ignored as they reuse the same # memory location and don't extend the footprint max_disp = sum( - st * (sh - 1) for st, sh in zip(elem_strides, _shape) if st > 0 + st * (sh - 1) for st, sh in zip(elem_strides, a_shape) if st > 0 ) if (max_disp + 1) > a.size: a = dpnp.ascontiguousarray(a) @@ -440,8 +441,10 @@ def _fft(a, norm, out, forward, in_place, c2c, axes, batch_fft=True): if cufft_wa: # pragma: no cover a = dpnp.moveaxis(a, -1, -2) - strides = dpnp.get_usm_ndarray(a).strides - a_strides = _standardize_strides_to_nonzero(strides, a.shape) + # `a` may have been rebound above (contiguous copy or moveaxis), so the + # strides read before those steps can be stale; re-read them here + elem_strides = dpnp.get_usm_ndarray(a).strides + a_strides = _standardize_strides_to_nonzero(elem_strides, a.shape) dsc, out_strides = _commit_descriptor( a, forward, in_place, c2c, a_strides, index, batch_fft )