From 36823705b7a439bd4af70e319df0dbdc4f008503 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Mon, 1 Jun 2026 03:26:53 -0700
Subject: [PATCH 1/5] Optimize FFT for strided input to avoid oversized
 allocation

---
 dpnp/fft/dpnp_utils_fft.py | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/dpnp/fft/dpnp_utils_fft.py b/dpnp/fft/dpnp_utils_fft.py
index 733436ab988..5b6dd3c2432 100644
--- a/dpnp/fft/dpnp_utils_fft.py
+++ b/dpnp/fft/dpnp_utils_fft.py
@@ -408,12 +408,25 @@ def _fft(a, norm, out, forward, in_place, c2c, axes, batch_fft=True):
         a = dpnp.reshape(a, local_shape)
         index = 1
 
+    if not a.flags.c_contiguous:
         # cuFFT requires input arrays to be C-contiguous (row-major)
         # for correct execution
-        if (
-            dpnp.is_cuda_backend(a) and not a.flags.c_contiguous
-        ):  # pragma: no cover
+        if dpnp.is_cuda_backend(a):  # pragma: no cover
             a = dpnp.ascontiguousarray(a)
+        else:
+            # Check if the memory footprint of the strides exceeds
+            # the number of elements.
+            # If so, copy to contiguous to avoid oversized allocation
+            # for the output array and unnecessary copy to contiguous
+            # after oneMKL FFT
+            _strides = dpnp.get_usm_ndarray(a).strides
+            _shape = a.shape
+            # max element offset reachable by the strides
+            max_disp = sum(
+                st * (sh - 1) for st, sh in zip(_strides, _shape) if st > 0
+            )
+            if (max_disp + 1) > a.size:
+                a = dpnp.ascontiguousarray(a)
 
     # w/a for cuFFT to avoid "Invalid strides" error when
     # the last dimension is 1 and there are multiple axes

From fbb13bddb3584f93c33f1f724f89f2284560e0f5 Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Mon, 1 Jun 2026 04:10:18 -0700
Subject: [PATCH 2/5] Update changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6472d90153c..6d997782a67 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,7 @@ This release is compatible with NumPy 2.4.5.
 * Updated `searchsorted` implementations to align with the 2025.12 array API spec [gh-2902](https://github.com/IntelPython/dpnp/pull/2902)
 * Updated tests to align with NumPy 2.4.5 compatibility [gh-2920](https://github.com/IntelPython/dpnp/pull/2920)
 * Replaced `.pxi` includes in `dpnp.tensor` with modular `.pxd`/`.pyx` Cython imports [#2913](https://github.com/IntelPython/dpnp/pull/2913)
+* Improved performance of `dpnp.fft` functions for complex strided input by avoiding oversized allocations and extra copies [#2939](https://github.com/IntelPython/dpnp/pull/2939)
 
 ### Deprecated
 

From 7d1d7a2174975d50e32f484f8e67a6af96a5f99a Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Thu, 4 Jun 2026 04:24:09 -0700
Subject: [PATCH 3/5] Update comment to clarify negative strides handling

---
 dpnp/fft/dpnp_utils_fft.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/dpnp/fft/dpnp_utils_fft.py b/dpnp/fft/dpnp_utils_fft.py
index 5b6dd3c2432..dadf8b2fc7f 100644
--- a/dpnp/fft/dpnp_utils_fft.py
+++ b/dpnp/fft/dpnp_utils_fft.py
@@ -421,7 +421,9 @@ def _fft(a, norm, out, forward, in_place, c2c, axes, batch_fft=True):
             # after oneMKL FFT
             _strides = dpnp.get_usm_ndarray(a).strides
             _shape = a.shape
-            # max element offset reachable by the strides
+            # Max element displacement reachable by the strides.
+            # Negative strides are handled by _copy_array, so only
+            # positive strides are possible here
             max_disp = sum(
                 st * (sh - 1) for st, sh in zip(_strides, _shape) if st > 0
             )

From a9419ae0dd8936186d917b8508135968fd34eafc Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Thu, 18 Jun 2026 06:42:29 -0700
Subject: [PATCH 4/5] Apply remarks

---
 dpnp/fft/dpnp_utils_fft.py | 11 ++++++-----
 dpnp/tests/test_fft.py     | 20 ++++++++++++++++++++
 2 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/dpnp/fft/dpnp_utils_fft.py b/dpnp/fft/dpnp_utils_fft.py
index dadf8b2fc7f..ee00136c5d1 100644
--- a/dpnp/fft/dpnp_utils_fft.py
+++ b/dpnp/fft/dpnp_utils_fft.py
@@ -419,13 +419,14 @@ def _fft(a, norm, out, forward, in_place, c2c, axes, batch_fft=True):
             # If so, copy to contiguous to avoid oversized allocation
             # for the output array and unnecessary copy to contiguous
             # after oneMKL FFT
-            _strides = dpnp.get_usm_ndarray(a).strides
+            elem_strides = dpnp.get_usm_ndarray(a).strides
             _shape = a.shape
-            # Max element displacement reachable by the strides.
-            # Negative strides are handled by _copy_array, so only
-            # positive strides are possible here
+            # Max element displacement reachable by positive strides.
+            # Negative strides are handled by _copy_array;
+            # zero strides are safely ignored as they reuse the same
+            # memory location and don't extend the footprint
             max_disp = sum(
-                st * (sh - 1) for st, sh in zip(_strides, _shape) if st > 0
+                st * (sh - 1) for st, sh in zip(elem_strides, _shape) if st > 0
             )
             if (max_disp + 1) > a.size:
                 a = dpnp.ascontiguousarray(a)
diff --git a/dpnp/tests/test_fft.py b/dpnp/tests/test_fft.py
index f8cc95a7a3c..b8669714724 100644
--- a/dpnp/tests/test_fft.py
+++ b/dpnp/tests/test_fft.py
@@ -234,6 +234,26 @@ def test_strided_2d(self, stride_x, stride_y):
         expected = numpy.fft.fft(a)
         assert_dtype_allclose(result, expected)
 
+    def test_non_contiguous_no_copy(self):
+        a = generate_random_numpy_array((4, 5, 6), dtype=numpy.complex64)
+        # Non-contiguous input with compact footprint (no copy needed)
+        ia = dpnp.moveaxis(dpnp.array(a), 0, -1)
+        a_np = dpnp.asnumpy(ia)
+
+        result = dpnp.fft.fft(ia)
+        expected = numpy.fft.fft(a_np)
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.parametrize("slc", [numpy.s_[::2, :], numpy.s_[:, ::3]])
+    def test_non_contiguous_with_copy(self, slc):
+        # Strided input with oversized footprint (triggers copy)
+        a = generate_random_numpy_array((10, 12), dtype=numpy.complex64)
+        ia = dpnp.array(a)[slc]
+
+        result = dpnp.fft.fft(ia)
+        expected = numpy.fft.fft(a[slc])
+        assert_dtype_allclose(result, expected)
+
     def test_empty_array(self):
         a = numpy.empty((10, 0, 4), dtype=numpy.complex64)
         ia = dpnp.array(a)

From ead0efff1b500a6cd73d296470e41e0424b15b2f Mon Sep 17 00:00:00 2001
From: Vladislav Perevezentsev <vladislav.perevezentsev@intel.com>
Date: Fri, 19 Jun 2026 03:31:37 -0700
Subject: [PATCH 5/5] Apply remarks

---
 dpnp/fft/dpnp_utils_fft.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/dpnp/fft/dpnp_utils_fft.py b/dpnp/fft/dpnp_utils_fft.py
index ee00136c5d1..3f5af951103 100644
--- a/dpnp/fft/dpnp_utils_fft.py
+++ b/dpnp/fft/dpnp_utils_fft.py
@@ -408,6 +408,8 @@ def _fft(a, norm, out, forward, in_place, c2c, axes, batch_fft=True):
         a = dpnp.reshape(a, local_shape)
         index = 1
 
+    elem_strides = dpnp.get_usm_ndarray(a).strides
+
     if not a.flags.c_contiguous:
         # cuFFT requires input arrays to be C-contiguous (row-major)
         # for correct execution
@@ -419,14 +421,13 @@ def _fft(a, norm, out, forward, in_place, c2c, axes, batch_fft=True):
             # If so, copy to contiguous to avoid oversized allocation
             # for the output array and unnecessary copy to contiguous
             # after oneMKL FFT
-            elem_strides = dpnp.get_usm_ndarray(a).strides
-            _shape = a.shape
+            a_shape = a.shape
             # Max element displacement reachable by positive strides.
             # Negative strides are handled by _copy_array;
             # zero strides are safely ignored as they reuse the same
             # memory location and don't extend the footprint
             max_disp = sum(
-                st * (sh - 1) for st, sh in zip(elem_strides, _shape) if st > 0
+                st * (sh - 1) for st, sh in zip(elem_strides, a_shape) if st > 0
             )
             if (max_disp + 1) > a.size:
                 a = dpnp.ascontiguousarray(a)
@@ -440,8 +441,11 @@ def _fft(a, norm, out, forward, in_place, c2c, axes, batch_fft=True):
     if cufft_wa:  # pragma: no cover
         a = dpnp.moveaxis(a, -1, -2)
 
-    strides = dpnp.get_usm_ndarray(a).strides
-    a_strides = _standardize_strides_to_nonzero(strides, a.shape)
+    # Re-read the strides: `a` may have been replaced by a contiguous copy
+    # or had its axes moved above, so the strides captured before those
+    # steps may no longer describe it
+    elem_strides = dpnp.get_usm_ndarray(a).strides
+    a_strides = _standardize_strides_to_nonzero(elem_strides, a.shape)
     dsc, out_strides = _commit_descriptor(
         a, forward, in_place, c2c, a_strides, index, batch_fft
     )