diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 53a0dfd27ba..a39368c7baa 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -48,9 +48,9 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_as_contig.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 3c6939eff7a..edb2c096bad 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -27,6 +27,13 @@ # ***************************************************************************** +from dpctl_ext.tensor._copy_utils import ( + asnumpy, + astype, + copy, + from_numpy, + to_numpy, +) from dpctl_ext.tensor._ctors import ( full, tril, @@ -36,11 +43,22 @@ put, take, ) +from dpctl_ext.tensor._manipulation_functions import ( + roll, +) +from dpctl_ext.tensor._reshape import reshape __all__ = [ + "asnumpy", + "astype", + "copy", + "from_numpy", "full", "put", + "reshape", + "roll", "take", + "to_numpy", "tril", "triu", ] diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py new file mode 100644 index 00000000000..c62218893a2 --- /dev/null +++ b/dpctl_ext/tensor/_copy_utils.py @@ -0,0 +1,755 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import builtins + +import dpctl +import dpctl.memory as dpm +import dpctl.tensor as dpt +import dpctl.utils +import numpy as np +from dpctl.tensor._data_types import _get_dtype +from dpctl.tensor._device import normalize_queue_device +from dpctl.tensor._type_utils import _dtype_supported_by_device_impl + +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor._tensor_impl as ti + +__doc__ = ( + "Implementation module for copy- and cast- operations on " + ":class:`dpctl.tensor.usm_ndarray`." +) + +int32_t_max = 1 + np.iinfo(np.int32).max + + +def _copy_to_numpy(ary): + if not isinstance(ary, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(ary)}") + if ary.size == 0: + # no data needs to be copied for zero sized array + return np.ndarray(ary.shape, dtype=ary.dtype) + nb = ary.usm_data.nbytes + q = ary.sycl_queue + hh = dpm.MemoryUSMHost(nb, queue=q) + h = np.ndarray(nb, dtype="u1", buffer=hh).view(ary.dtype) + itsz = ary.itemsize + strides_bytes = tuple(si * itsz for si in ary.strides) + offset = ary._element_offset * itsz + # ensure that content of ary.usm_data is final + q.wait() + hh.copy_from_device(ary.usm_data) + return np.ndarray( + ary.shape, + dtype=ary.dtype, + buffer=h, + strides=strides_bytes, + offset=offset, + ) + + +def _copy_from_numpy(np_ary, usm_type="device", sycl_queue=None): + """Copies numpy array `np_ary` into a new usm_ndarray""" + # This may perform a copy to meet stated requirements + Xnp = np.require(np_ary, requirements=["A", "E"]) + alloc_q = normalize_queue_device(sycl_queue=sycl_queue, device=None) + dt = Xnp.dtype + if dt.char in "dD" and alloc_q.sycl_device.has_aspect_fp64 is False: + Xusm_dtype = ( + dpt.dtype("float32") if dt.char == "d" else dpt.dtype("complex64") + ) + else: + Xusm_dtype = dt + Xusm = dpt.empty( + Xnp.shape, dtype=Xusm_dtype, usm_type=usm_type, sycl_queue=sycl_queue + ) + _copy_from_numpy_into(Xusm, Xnp) + return Xusm + + +def _copy_from_numpy_into(dst, np_ary): + """Copies `np_ary` into `dst` of type :class:`dpctl.tensor.usm_ndarray""" + if not isinstance(np_ary, np.ndarray): + raise TypeError(f"Expected numpy.ndarray, got {type(np_ary)}") + if not isinstance(dst, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(dst)}") + if np_ary.flags["OWNDATA"]: + Xnp = np_ary + else: + # Determine base of input array + base = np_ary.base + while isinstance(base, np.ndarray): + base = base.base + if isinstance(base, dpm._memory._Memory): + # we must perform a copy, since subsequent + # _copy_numpy_ndarray_into_usm_ndarray is implemented using + # sycl::buffer, and using USM-pointers with sycl::buffer + # results is undefined behavior + Xnp = np_ary.copy() + else: + Xnp = np_ary + src_ary = np.broadcast_to(Xnp, dst.shape) + copy_q = dst.sycl_queue + if copy_q.sycl_device.has_aspect_fp64 is False: + src_ary_dt_c = src_ary.dtype.char + 
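+        # The destination array's device lacks the fp64 aspect, so 64-bit
+        # floating point host data is downcast (float64 -> float32,
+        # complex128 -> complex64) before the copy, matching the dtype
+        # selection performed in _copy_from_numpy above.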
if src_ary_dt_c == "d": + src_ary = src_ary.astype(np.float32) + elif src_ary_dt_c == "D": + src_ary = src_ary.astype(np.complex64) + _manager = dpctl.utils.SequentialOrderManager[copy_q] + dep_ev = _manager.submitted_events + # synchronizing call + ti._copy_numpy_ndarray_into_usm_ndarray( + src=src_ary, dst=dst, sycl_queue=copy_q, depends=dep_ev + ) + + +def from_numpy(np_ary, /, *, device=None, usm_type="device", sycl_queue=None): + """ + from_numpy(arg, device=None, usm_type="device", sycl_queue=None) + + Creates :class:`dpctl.tensor.usm_ndarray` from instance of + :class:`numpy.ndarray`. + + Args: + arg: + Input convertible to :class:`numpy.ndarray` + device (object): array API specification of device where the + output array is created. Device can be specified by + a filter selector string, an instance of + :class:`dpctl.SyclDevice`, an instance of + :class:`dpctl.SyclQueue`, or an instance of + :class:`dpctl.tensor.Device`. If the value is ``None``, + returned array is created on the default-selected device. + Default: ``None`` + usm_type (str): The requested USM allocation type for the + output array. Recognized values are ``"device"``, + ``"shared"``, or ``"host"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + A SYCL queue that determines output array allocation device + as well as execution placement of data movement operations. + The ``device`` and ``sycl_queue`` arguments + are equivalent. Only one of them should be specified. If both + are provided, they must be consistent and result in using the + same execution queue. Default: ``None`` + + The returned array has the same shape, and the same data type kind. + If the device does not support the data type of input array, a + closest support data type of the same kind may be returned, e.g. + input array of type ``float16`` may be upcast to ``float32`` if the + target device does not support 16-bit floating point type. + """ + q = normalize_queue_device(sycl_queue=sycl_queue, device=device) + return _copy_from_numpy(np_ary, usm_type=usm_type, sycl_queue=q) + + +def to_numpy(usm_ary, /): + """ + to_numpy(usm_ary) + + Copies content of :class:`dpctl.tensor.usm_ndarray` instance ``usm_ary`` + into :class:`numpy.ndarray` instance of the same shape and same data type. + + Args: + usm_ary (usm_ndarray): + Input array + Returns: + :class:`numpy.ndarray`: + An instance of :class:`numpy.ndarray` populated with content of + ``usm_ary`` + """ + return _copy_to_numpy(usm_ary) + + +def asnumpy(usm_ary): + """ + asnumpy(usm_ary) + + Copies content of :class:`dpctl.tensor.usm_ndarray` instance ``usm_ary`` + into :class:`numpy.ndarray` instance of the same shape and same data + type. 
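+
+    A minimal round-trip sketch (illustrative; assumes the default-selected
+    device supports the input data type)::
+
+        import numpy as np
+        import dpctl_ext.tensor as dpt_ext
+
+        x = dpt_ext.from_numpy(np.arange(4))  # host -> USM allocation
+        y = dpt_ext.asnumpy(x)                # USM -> host copy of x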
+ + Args: + usm_ary (usm_ndarray): + Input array + Returns: + :class:`numpy.ndarray`: + An instance of :class:`numpy.ndarray` populated with content + of ``usm_ary`` + """ + return _copy_to_numpy(usm_ary) + + +class Dummy: + """Helper class with specified ``__sycl_usm_array_interface__`` attribute""" + + def __init__(self, iface): + self.__sycl_usm_array_interface__ = iface + + +def _copy_overlapping(dst, src): + """Assumes src and dst have the same shape.""" + q = normalize_queue_device(sycl_queue=dst.sycl_queue) + tmp = dpt.usm_ndarray( + src.shape, + dtype=src.dtype, + buffer="device", + order="C", + buffer_ctor_kwargs={"queue": q}, + ) + _manager = dpctl.utils.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hcp1, cp1 = ti._copy_usm_ndarray_into_usm_ndarray( + src=src, dst=tmp, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hcp1, cp1) + hcp2, cp2 = ti._copy_usm_ndarray_into_usm_ndarray( + src=tmp, dst=dst, sycl_queue=q, depends=[cp1] + ) + _manager.add_event_pair(hcp2, cp2) + + +def _copy_same_shape(dst, src): + """Assumes src and dst have the same shape.""" + # check that memory regions do not overlap + if ti._array_overlap(dst, src): + if src._pointer == dst._pointer and ( + src is dst + or (src.strides == dst.strides and src.dtype == dst.dtype) + ): + return + _copy_overlapping(src=src, dst=dst) + return + + copy_q = dst.sycl_queue + _manager = dpctl.utils.SequentialOrderManager[copy_q] + dep_evs = _manager.submitted_events + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=src, dst=dst, sycl_queue=copy_q, depends=dep_evs + ) + _manager.add_event_pair(hev, cpy_ev) + + +if hasattr(np, "broadcast_shapes"): + + def _broadcast_shapes(sh1, sh2): + return np.broadcast_shapes(sh1, sh2) + +else: + + def _broadcast_shapes(sh1, sh2): + # use arrays with zero strides, whose memory footprint + # is independent of the number of array elements + return np.broadcast( + np.empty(sh1, dtype=[]), + np.empty(sh2, dtype=[]), + ).shape + + +def _broadcast_strides(X_shape, X_strides, res_ndim): + """ + Broadcasts strides to match the given dimensions; + returns tuple type strides. + """ + out_strides = [0] * res_ndim + X_shape_len = len(X_shape) + str_dim = -X_shape_len + for i in range(X_shape_len): + shape_value = X_shape[i] + if not shape_value == 1: + out_strides[str_dim] = X_strides[i] + str_dim += 1 + + return tuple(out_strides) + + +def _copy_from_usm_ndarray_to_usm_ndarray(dst, src): + if any( + not isinstance(arg, dpt.usm_ndarray) + for arg in ( + dst, + src, + ) + ): + raise TypeError( + "Both types are expected to be dpctl.tensor.usm_ndarray, " + f"got {type(dst)} and {type(src)}." 
+ ) + + if dst.ndim == src.ndim and dst.shape == src.shape: + _copy_same_shape(dst, src) + return + + try: + common_shape = _broadcast_shapes(dst.shape, src.shape) + except ValueError as exc: + raise ValueError("Shapes of two arrays are not compatible") from exc + + if dst.size < src.size and dst.size < np.prod(common_shape): + raise ValueError("Destination is smaller ") + + if len(common_shape) > dst.ndim: + ones_count = len(common_shape) - dst.ndim + for k in range(ones_count): + if common_shape[k] != 1: + raise ValueError + common_shape = common_shape[ones_count:] + + if src.ndim < len(common_shape): + new_src_strides = _broadcast_strides( + src.shape, src.strides, len(common_shape) + ) + src_same_shape = dpt.usm_ndarray( + common_shape, + dtype=src.dtype, + buffer=src, + strides=new_src_strides, + offset=src._element_offset, + ) + elif src.ndim == len(common_shape): + new_src_strides = _broadcast_strides( + src.shape, src.strides, len(common_shape) + ) + src_same_shape = dpt.usm_ndarray( + common_shape, + dtype=src.dtype, + buffer=src, + strides=new_src_strides, + offset=src._element_offset, + ) + else: + # since broadcasting succeeded, src.ndim is greater because of + # leading sequence of ones, so we trim it + n = len(common_shape) + new_src_strides = _broadcast_strides( + src.shape[-n:], src.strides[-n:], n + ) + src_same_shape = dpt.usm_ndarray( + common_shape, + dtype=src.dtype, + buffer=src.usm_data, + strides=new_src_strides, + offset=src._element_offset, + ) + + _copy_same_shape(dst, src_same_shape) + + +def _make_empty_like_orderK(x, dt, usm_type, dev): + """ + Returns empty array with shape and strides like `x`, with dtype `dt`, + USM type `usm_type`, on device `dev`. + """ + st = list(x.strides) + perm = sorted( + range(x.ndim), + key=lambda d: builtins.abs(st[d]) if x.shape[d] > 1 else 0, + reverse=True, + ) + inv_perm = sorted(range(x.ndim), key=lambda i: perm[i]) + sh = x.shape + sh_sorted = tuple(sh[i] for i in perm) + R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C") + if min(st) < 0: + st_sorted = [st[i] for i in perm] + sl = tuple( + ( + slice(None, None, -1) + if st_sorted[i] < 0 + else slice(None, None, None) + ) + for i in range(x.ndim) + ) + R = R[sl] + return dpt.permute_dims(R, inv_perm) + + +def _empty_like_orderK(x, dt, usm_type=None, dev=None): + """ + Returns empty array like `x`, using order='K' + + For an array `x` that was obtained by permutation of a contiguous + array the returned array will have the same shape and the same + strides as `x`. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(x)}") + if usm_type is None: + usm_type = x.usm_type + if dev is None: + dev = x.device + fl = x.flags + if fl["C"] or x.size <= 1: + return dpt.empty_like( + x, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) + elif fl["F"]: + return dpt.empty_like( + x, dtype=dt, usm_type=usm_type, device=dev, order="F" + ) + return _make_empty_like_orderK(x, dt, usm_type, dev) + + +def _from_numpy_empty_like_orderK(x, dt, usm_type, dev): + """ + Returns empty usm_ndarray like NumPy array `x`, using order='K' + + For an array `x` that was obtained by permutation of a contiguous + array the returned array will have the same shape and the same + strides as `x`. 
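+
+    For instance (an illustrative case), for
+    ``x = np.ones((2, 3, 4)).transpose((2, 0, 1))`` the allocation is made in
+    the memory order of the underlying C-contiguous buffer and then permuted,
+    so the result reports the same shape ``(4, 2, 3)`` and the same element
+    strides as ``x``.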
+ """ + if not isinstance(x, np.ndarray): + raise TypeError(f"Expected numpy.ndarray, got {type(x)}") + fl = x.flags + if fl["C"] or x.size <= 1: + return dpt.empty( + x.shape, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) + elif fl["F"]: + return dpt.empty( + x.shape, dtype=dt, usm_type=usm_type, device=dev, order="F" + ) + return _make_empty_like_orderK(x, dt, usm_type, dev) + + +def _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev): + if not isinstance(X1, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X1)}") + if not isinstance(X2, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X2)}") + nd1 = X1.ndim + nd2 = X2.ndim + if nd1 > nd2 and X1.shape == res_shape: + return _empty_like_orderK(X1, dt, usm_type, dev) + elif nd1 < nd2 and X2.shape == res_shape: + return _empty_like_orderK(X2, dt, usm_type, dev) + fl1 = X1.flags + fl2 = X2.flags + if fl1["C"] or fl2["C"]: + return dpt.empty( + res_shape, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) + if fl1["F"] and fl2["F"]: + return dpt.empty( + res_shape, dtype=dt, usm_type=usm_type, device=dev, order="F" + ) + st1 = list(X1.strides) + st2 = list(X2.strides) + max_ndim = max(nd1, nd2) + st1 += [0] * (max_ndim - len(st1)) + st2 += [0] * (max_ndim - len(st2)) + sh1 = list(X1.shape) + [0] * (max_ndim - nd1) + sh2 = list(X2.shape) + [0] * (max_ndim - nd2) + perm = sorted( + range(max_ndim), + key=lambda d: ( + builtins.abs(st1[d]) if sh1[d] > 1 else 0, + builtins.abs(st2[d]) if sh2[d] > 1 else 0, + ), + reverse=True, + ) + inv_perm = sorted(range(max_ndim), key=lambda i: perm[i]) + st1_sorted = [st1[i] for i in perm] + st2_sorted = [st2[i] for i in perm] + sh = res_shape + sh_sorted = tuple(sh[i] for i in perm) + R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C") + if max(min(st1_sorted), min(st2_sorted)) < 0: + sl = tuple( + ( + slice(None, None, -1) + if (st1_sorted[i] < 0 and st2_sorted[i] < 0) + else slice(None, None, None) + ) + for i in range(nd1) + ) + R = R[sl] + return dpt.permute_dims(R, inv_perm) + + +def _empty_like_triple_orderK(X1, X2, X3, dt, res_shape, usm_type, dev): + if not isinstance(X1, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X1)}") + if not isinstance(X2, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X2)}") + if not isinstance(X3, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X3)}") + nd1 = X1.ndim + nd2 = X2.ndim + nd3 = X3.ndim + if X1.shape == res_shape and X2.shape == res_shape and len(res_shape) > nd3: + return _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev) + elif ( + X2.shape == res_shape and X3.shape == res_shape and len(res_shape) > nd1 + ): + return _empty_like_pair_orderK(X2, X3, dt, res_shape, usm_type, dev) + elif ( + X1.shape == res_shape and X3.shape == res_shape and len(res_shape) > nd2 + ): + return _empty_like_pair_orderK(X1, X3, dt, res_shape, usm_type, dev) + fl1 = X1.flags + fl2 = X2.flags + fl3 = X3.flags + if fl1["C"] or fl2["C"] or fl3["C"]: + return dpt.empty( + res_shape, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) + if fl1["F"] and fl2["F"] and fl3["F"]: + return dpt.empty( + res_shape, dtype=dt, usm_type=usm_type, device=dev, order="F" + ) + st1 = list(X1.strides) + st2 = list(X2.strides) + st3 = list(X3.strides) + max_ndim = max(nd1, nd2, nd3) + st1 += [0] * (max_ndim - len(st1)) + st2 += [0] * (max_ndim - len(st2)) + st3 += [0] * (max_ndim - len(st3)) + sh1 = list(X1.shape) + [0] * (max_ndim - 
nd1) + sh2 = list(X2.shape) + [0] * (max_ndim - nd2) + sh3 = list(X3.shape) + [0] * (max_ndim - nd3) + perm = sorted( + range(max_ndim), + key=lambda d: ( + builtins.abs(st1[d]) if sh1[d] > 1 else 0, + builtins.abs(st2[d]) if sh2[d] > 1 else 0, + builtins.abs(st3[d]) if sh3[d] > 1 else 0, + ), + reverse=True, + ) + inv_perm = sorted(range(max_ndim), key=lambda i: perm[i]) + st1_sorted = [st1[i] for i in perm] + st2_sorted = [st2[i] for i in perm] + st3_sorted = [st3[i] for i in perm] + sh = res_shape + sh_sorted = tuple(sh[i] for i in perm) + R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C") + if max(min(st1_sorted), min(st2_sorted), min(st3_sorted)) < 0: + sl = tuple( + ( + slice(None, None, -1) + if ( + st1_sorted[i] < 0 + and st2_sorted[i] < 0 + and st3_sorted[i] < 0 + ) + else slice(None, None, None) + ) + for i in range(nd1) + ) + R = R[sl] + return dpt.permute_dims(R, inv_perm) + + +def copy(usm_ary, /, *, order="K"): + """copy(ary, order="K") + + Creates a copy of given instance of :class:`dpctl.tensor.usm_ndarray`. + + Args: + ary (usm_ndarray): + Input array + order (``"C"``, ``"F"``, ``"A"``, ``"K"``, optional): + Controls the memory layout of the output array + Returns: + usm_ndarray: + A copy of the input array. + + Memory layout of the copy is controlled by ``order`` keyword, + following NumPy's conventions. The ``order`` keywords can be + one of the following: + + .. list-table:: + + * - ``"C"`` + - C-contiguous memory layout + * - ``"F"`` + - Fortran-contiguous memory layout + * - ``"A"`` + - Fortran-contiguous if the input array is also Fortran-contiguous, + otherwise C-contiguous + * - ``"K"`` + - match the layout of ``usm_ary`` as closely as possible. + + """ + if len(order) == 0 or order[0] not in "KkAaCcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'K', 'A', 'F', or 'C'." + ) + order = order[0].upper() + if not isinstance(usm_ary, dpt.usm_ndarray): + raise TypeError( + f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}" + ) + copy_order = "C" + if order == "C": + pass + elif order == "F": + copy_order = order + elif order == "A": + if usm_ary.flags.f_contiguous: + copy_order = "F" + elif order == "K": + if usm_ary.flags.f_contiguous: + copy_order = "F" + else: + raise ValueError( + "Unrecognized value of the order keyword. " + "Recognized values are 'A', 'C', 'F', or 'K'" + ) + if order == "K": + R = _empty_like_orderK(usm_ary, usm_ary.dtype) + else: + R = dpt.usm_ndarray( + usm_ary.shape, + dtype=usm_ary.dtype, + buffer=usm_ary.usm_type, + order=copy_order, + buffer_ctor_kwargs={"queue": usm_ary.sycl_queue}, + ) + _copy_same_shape(R, usm_ary) + return R + + +def astype( + usm_ary, newdtype, /, *, order="K", casting="unsafe", copy=True, device=None +): + """astype(array, new_dtype, order="K", casting="unsafe", \ + copy=True, device=None) + + Returns a copy of the :class:`dpctl.tensor.usm_ndarray`, cast to a + specified type. + + Args: + array (usm_ndarray): + An input array. + new_dtype (dtype): + The data type of the resulting array. If `None`, gives default + floating point type supported by device where the resulting array + will be located. + order ({"C", "F", "A", "K"}, optional): + Controls memory layout of the resulting array if a copy + is returned. + casting ({'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional): + Controls what kind of data casting may occur. Please see + :meth:`numpy.ndarray.astype` for description of casting modes. 
+ copy (bool, optional): + By default, `astype` always returns a newly allocated array. + If this keyword is set to `False`, a view of the input array + may be returned when possible. + device (object): array API specification of device where the + output array is created. Device can be specified by + a filter selector string, an instance of + :class:`dpctl.SyclDevice`, an instance of + :class:`dpctl.SyclQueue`, or an instance of + :class:`dpctl.tensor.Device`. If the value is `None`, + returned array is created on the same device as `array`. + Default: `None`. + + Returns: + usm_ndarray: + An array with requested data type. + + A view can be returned, if possible, when `copy=False` is used. + """ + if not isinstance(usm_ary, dpt.usm_ndarray): + return TypeError( + f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}" + ) + if len(order) == 0 or order[0] not in "KkAaCcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'K', 'A', 'F', or 'C'." + ) + order = order[0].upper() + ary_dtype = usm_ary.dtype + if device is not None: + if not isinstance(device, dpctl.SyclQueue): + if isinstance(device, dpt.Device): + device = device.sycl_queue + else: + device = dpt.Device.create_device(device).sycl_queue + d = device.sycl_device + target_dtype = _get_dtype(newdtype, device) + if not _dtype_supported_by_device_impl( + target_dtype, d.has_aspect_fp16, d.has_aspect_fp64 + ): + raise ValueError( + f"Requested dtype '{target_dtype}' is not supported by the " + "target device" + ) + usm_ary = usm_ary.to_device(device) + else: + target_dtype = _get_dtype(newdtype, usm_ary.sycl_queue) + + if not dpt.can_cast(ary_dtype, target_dtype, casting=casting): + raise TypeError( + f"Can not cast from {ary_dtype} to {newdtype} " + f"according to rule {casting}." + ) + c_contig = usm_ary.flags.c_contiguous + f_contig = usm_ary.flags.f_contiguous + needs_copy = copy or not ary_dtype == target_dtype + if not needs_copy and (order != "K"): + # ensure that order="F" for C-contig input triggers copy, + # and order="C" for F-contig input triggers copy too. + # 1D arrays which are both C- and F- contig should not + # force copying for neither order="F", nor order="C", see gh-1926 + needs_copy = ( + c_contig and not f_contig and order not in ["A", "C"] + ) or (not c_contig and f_contig and order not in ["A", "F"]) + if not needs_copy: + return usm_ary + copy_order = "C" + if order == "C": + pass + elif order == "F": + copy_order = order + elif order == "A": + if usm_ary.flags.f_contiguous: + copy_order = "F" + elif order == "K": + if usm_ary.flags.f_contiguous: + copy_order = "F" + else: + raise ValueError( + "Unrecognized value of the order keyword. 
" + "Recognized values are 'A', 'C', 'F', or 'K'" + ) + if order == "K": + R = _empty_like_orderK(usm_ary, target_dtype) + else: + R = dpt.usm_ndarray( + usm_ary.shape, + dtype=target_dtype, + buffer=usm_ary.usm_type, + order=copy_order, + buffer_ctor_kwargs={"queue": usm_ary.sycl_queue}, + ) + _copy_from_usm_ndarray_to_usm_ndarray(R, usm_ary) + return R diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index a0e7b28e66f..5a39e9367e9 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -36,6 +36,9 @@ from dpctl.tensor._data_types import _get_dtype from dpctl.tensor._device import normalize_queue_device +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti @@ -147,7 +150,7 @@ def full( usm_type=usm_type, sycl_queue=sycl_queue, ) - return dpt.copy(dpt.broadcast_to(X, shape), order=order) + return dpt_ext.copy(dpt.broadcast_to(X, shape), order=order) else: _validate_fill_value(fill_value) diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py index 106df09cf97..df4f3e95304 100644 --- a/dpctl_ext/tensor/_indexing_functions.py +++ b/dpctl_ext/tensor/_indexing_functions.py @@ -32,6 +32,9 @@ import dpctl.tensor as dpt import dpctl.utils +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti from ._numpy_helper import normalize_axis_index @@ -185,7 +188,7 @@ def put_vec_duplicates(vec, ind, vals): if vals.dtype == x.dtype: rhs = vals else: - rhs = dpt.astype(vals, x.dtype) + rhs = dpt_ext.astype(vals, x.dtype) rhs = dpt.broadcast_to(rhs, val_shape) _manager = dpctl.utils.SequentialOrderManager[exec_q] diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py new file mode 100644 index 00000000000..fa8fc27876b --- /dev/null +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -0,0 +1,120 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import operator + +import dpctl.tensor as dpt +import dpctl.utils as dputils +import numpy as np + +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor._tensor_impl as ti + +from ._numpy_helper import normalize_axis_tuple + +__doc__ = ( + "Implementation module for array manipulation " + "functions in :module:`dpctl.tensor`" +) + + +def roll(x, /, shift, *, axis=None): + """ + roll(x, shift, axis) + + Rolls array elements along a specified axis. + Array elements that roll beyond the last position are re-introduced + at the first position. Array elements that roll beyond the first position + are re-introduced at the last position. + + Args: + x (usm_ndarray): input array + shift (Union[int, Tuple[int,...]]): number of places by which the + elements are shifted. If `shift` is a tuple, then `axis` must be a + tuple of the same size, and each of the given axes must be shifted + by the corresponding element in `shift`. If `shift` is an `int` + and `axis` a tuple, then the same `shift` must be used for all + specified axes. If a `shift` is positive, then array elements is + shifted positively (toward larger indices) along the dimension of + `axis`. + If a `shift` is negative, then array elements must be shifted + negatively (toward smaller indices) along the dimension of `axis`. + axis (Optional[Union[int, Tuple[int,...]]]): axis (or axes) along which + elements to shift. If `axis` is `None`, the array is + flattened, shifted, and then restored to its original shape. + Default: `None`. + + Returns: + usm_ndarray: + An array having the same `dtype`, `usm_type` and + `device` attributes as `x` and whose elements are shifted relative + to `x`. 
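+
+    For example (illustrative), rolling a one-dimensional array holding
+    ``[0, 1, 2, 3, 4]`` by ``shift=2`` yields ``[3, 4, 0, 1, 2]``.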
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(x)}.") + exec_q = x.sycl_queue + _manager = dputils.SequentialOrderManager[exec_q] + if axis is None: + shift = operator.index(shift) + res = dpt.empty( + x.shape, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + sz = operator.index(x.size) + shift = (shift % sz) if sz > 0 else 0 + dep_evs = _manager.submitted_events + hev, roll_ev = ti._copy_usm_ndarray_for_roll_1d( + src=x, + dst=res, + shift=shift, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(hev, roll_ev) + return res + axis = normalize_axis_tuple(axis, x.ndim, allow_duplicate=True) + broadcasted = np.broadcast(shift, axis) + if broadcasted.ndim > 1: + raise ValueError("'shift' and 'axis' should be scalars or 1D sequences") + shifts = [ + 0, + ] * x.ndim + shape = x.shape + for sh, ax in broadcasted: + n_i = operator.index(shape[ax]) + shifted = shifts[ax] + operator.index(sh) + shifts[ax] = (shifted % n_i) if n_i > 0 else 0 + res = dpt.empty( + x.shape, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + dep_evs = _manager.submitted_events + ht_e, roll_ev = ti._copy_usm_ndarray_for_roll_nd( + src=x, dst=res, shifts=shifts, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_e, roll_ev) + return res diff --git a/dpctl_ext/tensor/_reshape.py b/dpctl_ext/tensor/_reshape.py new file mode 100644 index 00000000000..6afa1dc245c --- /dev/null +++ b/dpctl_ext/tensor/_reshape.py @@ -0,0 +1,206 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import operator + +import dpctl.tensor as dpt +import dpctl.utils +import numpy as np +from dpctl.tensor._tensor_impl import ( + _copy_usm_ndarray_for_reshape, + _ravel_multi_index, + _unravel_index, +) + +__doc__ = "Implementation module for :func:`dpctl.tensor.reshape`." 
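+
+# Note on the approach below: ``reshaped_strides`` checks whether a requested
+# reshape can be expressed as a pure stride change.  Unit steps along each
+# axis of the new shape are mapped to flat indices and then to offsets in the
+# old array, giving candidate strides; the candidates are then verified
+# against unit steps of the old shape.  If the check succeeds, ``reshape``
+# returns a view; otherwise the data is copied with
+# ``_copy_usm_ndarray_for_reshape``.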
+ + +def _make_unit_indexes(shape): + """ + Construct a diagonal matrix with with one on the diagonal + except if the corresponding element of shape is 1. + """ + nd = len(shape) + mi = np.zeros((nd, nd), dtype="u4") + for i, dim in enumerate(shape): + mi[i, i] = 1 if dim > 1 else 0 + return mi + + +def ti_unravel_index(flat_index, shape, order="C"): + return _unravel_index(flat_index, shape, order) + + +def ti_ravel_multi_index(multi_index, shape, order="C"): + return _ravel_multi_index(multi_index, shape, order) + + +def reshaped_strides(old_sh, old_sts, new_sh, order="C"): + """ + When reshaping array with `old_sh` shape and `old_sts` strides + into the new shape `new_sh`, returns the new stride if the reshape + can be a view, otherwise returns `None`. + """ + eye_new_mi = _make_unit_indexes(new_sh) + new_sts = [ + sum( + st_i * ind_i + for st_i, ind_i in zip( + old_sts, ti_unravel_index(flat_index, old_sh, order=order) + ) + ) + for flat_index in [ + ti_ravel_multi_index(unitvec, new_sh, order=order) + for unitvec in eye_new_mi + ] + ] + eye_old_mi = _make_unit_indexes(old_sh) + check_sts = [ + sum( + st_i * ind_i + for st_i, ind_i in zip( + new_sts, ti_unravel_index(flat_index, new_sh, order=order) + ) + ) + for flat_index in [ + ti_ravel_multi_index(unitvec, old_sh, order=order) + for unitvec in eye_old_mi + ] + ] + valid = all( + check_st == old_st or old_dim == 1 + for check_st, old_st, old_dim in zip(check_sts, old_sts, old_sh) + ) + return new_sts if valid else None + + +def reshape(X, /, shape, *, order="C", copy=None): + """reshape(x, shape, order="C") + + Reshapes array ``x`` into new shape. + + Args: + x (usm_ndarray): + input array + shape (Tuple[int]): + the desired shape of the resulting array. + order ("C", "F", optional): + memory layout of the resulting array + if a copy is found to be necessary. Supported + choices are ``"C"`` for C-contiguous, or row-major layout; + and ``"F"`` for F-contiguous, or column-major layout. + + Returns: + usm_ndarray: + Reshaped array is a view, if possible, + and a copy otherwise with memory layout as indicated + by ``order`` keyword. + """ + if not isinstance(X, dpt.usm_ndarray): + raise TypeError + if not isinstance(shape, (list, tuple)): + shape = (shape,) + if order in "cfCF": + order = order.upper() + else: + raise ValueError( + f"Keyword 'order' not recognized. Expecting 'C' or 'F', got {order}" + ) + if copy not in (True, False, None): + raise ValueError( + f"Keyword 'copy' not recognized. 
Expecting True, False, " + f"or None, got {copy}" + ) + shape = [operator.index(d) for d in shape] + negative_ones_count = 0 + for nshi in shape: + if nshi == -1: + negative_ones_count = negative_ones_count + 1 + if (nshi < -1) or negative_ones_count > 1: + raise ValueError( + "Target shape should have at most 1 negative " + "value which can only be -1" + ) + if negative_ones_count: + sz = -np.prod(shape) + if sz == 0: + raise ValueError( + f"Can not reshape array of size {X.size} into " + f"shape {tuple(i for i in shape if i >= 0)}" + ) + v = X.size // sz + shape = [v if d == -1 else d for d in shape] + if X.size != np.prod(shape): + raise ValueError(f"Can not reshape into {shape}") + if X.size: + newsts = reshaped_strides(X.shape, X.strides, shape, order=order) + else: + newsts = (1,) * len(shape) + copy_required = newsts is None + if copy_required and (copy is False): + raise ValueError( + "Reshaping the array requires a copy, but no copying was " + "requested by using copy=False" + ) + copy_q = X.sycl_queue + if copy_required or (copy is True): + # must perform a copy + copy_q = X.sycl_queue + flat_res = dpt.usm_ndarray( + (X.size,), + dtype=X.dtype, + buffer=X.usm_type, + buffer_ctor_kwargs={"queue": copy_q}, + ) + _manager = dpctl.utils.SequentialOrderManager[copy_q] + dep_evs = _manager.submitted_events + if order == "C": + hev, r_e = _copy_usm_ndarray_for_reshape( + src=X, dst=flat_res, sycl_queue=copy_q, depends=dep_evs + ) + else: + X_t = dpt.permute_dims(X, range(X.ndim - 1, -1, -1)) + hev, r_e = _copy_usm_ndarray_for_reshape( + src=X_t, dst=flat_res, sycl_queue=copy_q, depends=dep_evs + ) + _manager.add_event_pair(hev, r_e) + return dpt.usm_ndarray( + tuple(shape), dtype=X.dtype, buffer=flat_res, order=order + ) + # can form a view + if (len(shape) == X.ndim) and all( + s1 == s2 for s1, s2 in zip(shape, X.shape) + ): + return X + return dpt.usm_ndarray( + shape, + dtype=X.dtype, + buffer=X, + strides=tuple(newsts), + offset=X._element_offset, + ) diff --git a/dpctl_ext/tensor/libtensor/source/copy_for_reshape.cpp b/dpctl_ext/tensor/libtensor/source/copy_for_reshape.cpp new file mode 100644 index 00000000000..524bfcfdb98 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_for_reshape.cpp @@ -0,0 +1,184 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "copy_for_reshape.hpp" +#include "kernels/copy_and_cast.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_and_cast::copy_for_reshape_fn_ptr_t; +using dpctl::utils::keep_args_alive; + +// define static vector +static copy_for_reshape_fn_ptr_t + copy_for_reshape_generic_dispatch_vector[td_ns::num_types]; + +/* + * Copies src into dst (same data type) of different shapes by using flat + * iterations. + * + * Equivalent to the following loop: + * + * for i for range(src.size): + * dst[np.multi_index(i, dst.shape)] = src[np.multi_index(i, src.shape)] + */ +std::pair + copy_usm_ndarray_for_reshape(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + py::ssize_t src_nelems = src.get_size(); + py::ssize_t dst_nelems = dst.get_size(); + + // Must have the same number of elements + if (src_nelems != dst_nelems) { + throw py::value_error( + "copy_usm_ndarray_for_reshape requires src and dst to " + "have the same number of elements."); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + // typenames must be the same + if (src_typenum != dst_typenum) { + throw py::value_error( + "copy_usm_ndarray_for_reshape requires src and dst to " + "have the same type."); + } + + if (src_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + // check same contexts + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + if (src_nelems == 1) { + // handle special case of 1-element array + int src_elemsize = src.get_elemsize(); + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + sycl::event copy_ev = + exec_q.copy(src_data, dst_data, src_elemsize, depends); + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), + copy_ev); + } + + // dimensions may be different + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + auto array_types = td_ns::usm_ndarray_types(); + int type_id = array_types.typenum_to_lookup_id(src_typenum); + + auto fn = 
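+    // note: the dispatch vector below is populated per element type by
+    // init_copy_for_reshape_dispatch_vectors(), so this lookup selects the
+    // kernel instantiated for the array's dtype: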
copy_for_reshape_generic_dispatch_vector[type_id]; + + auto src_shape = src.get_shape_vector(); + auto src_strides = src.get_strides_vector(); + + auto dst_shape = dst.get_shape_vector(); + auto dst_strides = dst.get_strides_vector(); + + std::vector host_task_events; + host_task_events.reserve(2); + + // shape_strides = [src_shape, src_strides, dst_shape, dst_strides] + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, src_shape, src_strides, dst_shape, + dst_strides); + auto copy_shape_ev = std::get<2>(ptr_size_event_tuple); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + std::vector all_deps(depends.size() + 1); + all_deps.push_back(copy_shape_ev); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + sycl::event copy_for_reshape_event = + fn(exec_q, src_nelems, src_nd, dst_nd, shape_strides, src_data, + dst_data, all_deps); + + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {copy_for_reshape_event}, shape_strides_owner); + + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + copy_for_reshape_event); +} + +void init_copy_for_reshape_dispatch_vectors(void) +{ + using namespace td_ns; + using dpctl::tensor::kernels::copy_and_cast::CopyForReshapeGenericFactory; + + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(copy_for_reshape_generic_dispatch_vector); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_for_reshape.hpp b/dpctl_ext/tensor/libtensor/source/copy_for_reshape.hpp new file mode 100644 index 00000000000..c5af885ad6c --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_for_reshape.hpp @@ -0,0 +1,54 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + copy_usm_ndarray_for_reshape(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_copy_for_reshape_dispatch_vectors(); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_for_roll.cpp b/dpctl_ext/tensor/libtensor/source/copy_for_roll.cpp new file mode 100644 index 00000000000..a187b224767 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_for_roll.cpp @@ -0,0 +1,400 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "copy_for_roll.hpp" +#include "kernels/copy_and_cast.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "simplify_iteration_space.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_and_cast::copy_for_roll_contig_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast:: + copy_for_roll_ndshift_strided_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast::copy_for_roll_strided_fn_ptr_t; +using dpctl::utils::keep_args_alive; + +// define static vector +static copy_for_roll_strided_fn_ptr_t + copy_for_roll_strided_dispatch_vector[td_ns::num_types]; + +static copy_for_roll_contig_fn_ptr_t + copy_for_roll_contig_dispatch_vector[td_ns::num_types]; + +static copy_for_roll_ndshift_strided_fn_ptr_t + copy_for_roll_ndshift_dispatch_vector[td_ns::num_types]; + +/* + * Copies src into dst (same data type) of different shapes by using flat + * iterations. + * + * Equivalent to the following loop: + * + * for i for range(src.size): + * dst[np.multi_index(i, dst.shape)] = src[np.multi_index(i, src.shape)] + */ +std::pair + copy_usm_ndarray_for_roll_1d(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + py::ssize_t shift, + sycl::queue &exec_q, + const std::vector &depends) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + // Must have the same number of dimensions + if (src_nd != dst_nd) { + throw py::value_error( + "copy_usm_ndarray_for_roll_1d requires src and dst to " + "have the same number of dimensions."); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + if (!std::equal(src_shape_ptr, src_shape_ptr + src_nd, dst_shape_ptr)) { + throw py::value_error( + "copy_usm_ndarray_for_roll_1d requires src and dst to " + "have the same shape."); + } + + py::ssize_t src_nelems = src.get_size(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + // typenames must be the same + if (src_typenum != dst_typenum) { + throw py::value_error( + "copy_usm_ndarray_for_roll_1d requires src and dst to " + "have the same type."); + } + + if (src_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + // check same contexts + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + if (src_nelems == 1) { + // handle special case of 1-element array + int src_elemsize = src.get_elemsize(); + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + sycl::event copy_ev = + exec_q.copy(src_data, dst_data, src_elemsize, depends); + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), + 
copy_ev); + } + + auto array_types = td_ns::usm_ndarray_types(); + int type_id = array_types.typenum_to_lookup_id(src_typenum); + + const bool is_src_c_contig = src.is_c_contiguous(); + const bool is_src_f_contig = src.is_f_contiguous(); + + const bool is_dst_c_contig = dst.is_c_contiguous(); + const bool is_dst_f_contig = dst.is_f_contiguous(); + + const bool both_c_contig = is_src_c_contig && is_dst_c_contig; + const bool both_f_contig = is_src_f_contig && is_dst_f_contig; + + // normalize shift parameter to be 0 <= offset < src_nelems + std::size_t offset = + (shift > 0) ? (shift % src_nelems) : src_nelems + (shift % src_nelems); + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + if (both_c_contig || both_f_contig) { + auto fn = copy_for_roll_contig_dispatch_vector[type_id]; + + if (fn != nullptr) { + static constexpr py::ssize_t zero_offset = 0; + + sycl::event copy_for_roll_ev = + fn(exec_q, offset, src_nelems, src_data, zero_offset, dst_data, + zero_offset, depends); + + sycl::event ht_ev = + keep_args_alive(exec_q, {src, dst}, {copy_for_roll_ev}); + + return std::make_pair(ht_ev, copy_for_roll_ev); + } + } + + auto const &src_strides = src.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + const py::ssize_t *shape = src_shape_ptr; + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, shape, src_strides, dst_strides, + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (nd == 1 && simplified_src_strides[0] == 1 && + simplified_dst_strides[0] == 1) { + auto fn = copy_for_roll_contig_dispatch_vector[type_id]; + + if (fn != nullptr) { + + sycl::event copy_for_roll_ev = + fn(exec_q, offset, src_nelems, src_data, src_offset, dst_data, + dst_offset, depends); + + sycl::event ht_ev = + keep_args_alive(exec_q, {src, dst}, {copy_for_roll_ev}); + + return std::make_pair(ht_ev, copy_for_roll_ev); + } + } + + auto fn = copy_for_roll_strided_dispatch_vector[type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + // shape_strides = [src_shape, src_strides, dst_strides] + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides, + simplified_dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + std::vector all_deps(depends.size() + 1); + all_deps.push_back(copy_shape_ev); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + sycl::event copy_for_roll_event = + fn(exec_q, offset, src_nelems, src_nd, shape_strides, src_data, + src_offset, dst_data, dst_offset, all_deps); + + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {copy_for_roll_event}, shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + copy_for_roll_event); +} + +std::pair + copy_usm_ndarray_for_roll_nd(const dpctl::tensor::usm_ndarray &src, + const 
+                                 const dpctl::tensor::usm_ndarray &dst,
+                                 const std::vector<py::ssize_t> &shifts,
+                                 sycl::queue &exec_q,
+                                 const std::vector<sycl::event> &depends)
+{
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+
+    // Must have the same number of dimensions
+    if (src_nd != dst_nd) {
+        throw py::value_error(
+            "copy_usm_ndarray_for_roll_nd requires src and dst to "
+            "have the same number of dimensions.");
+    }
+
+    if (static_cast<std::size_t>(src_nd) != shifts.size()) {
+        throw py::value_error(
+            "copy_usm_ndarray_for_roll_nd requires shifts to "
+            "contain an integral shift for each array dimension.");
+    }
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
+
+    if (!std::equal(src_shape_ptr, src_shape_ptr + src_nd, dst_shape_ptr)) {
+        throw py::value_error(
+            "copy_usm_ndarray_for_roll_nd requires src and dst to "
+            "have the same shape.");
+    }
+
+    py::ssize_t src_nelems = src.get_size();
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    // typenums must be the same
+    if (src_typenum != dst_typenum) {
+        throw py::value_error(
+            "copy_usm_ndarray_for_roll_nd requires src and dst to "
+            "have the same type.");
+    }
+
+    if (src_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems);
+
+    // check for compatible queues
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    if (src_nelems == 1) {
+        // handle special case of 1-element array
+        int src_elemsize = src.get_elemsize();
+        const char *src_data = src.get_data();
+        char *dst_data = dst.get_data();
+        sycl::event copy_ev =
+            exec_q.copy<char>(src_data, dst_data, src_elemsize, depends);
+        return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}),
+                              copy_ev);
+    }
+
+    auto array_types = td_ns::usm_ndarray_types();
+    int type_id = array_types.typenum_to_lookup_id(src_typenum);
+
+    std::vector<py::ssize_t> normalized_shifts{};
+    normalized_shifts.reserve(src_nd);
+
+    for (int i = 0; i < src_nd; ++i) {
+        // normalize shift parameter to be 0 <= offset < dim
+        py::ssize_t dim = src_shape_ptr[i];
+        std::size_t offset =
+            (shifts[i] >= 0) ? (shifts[i] % dim) : dim + (shifts[i] % dim);
+
+        normalized_shifts.push_back(offset);
+    }
+
+    const char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
+
+    auto const &src_strides = src.get_strides_vector();
+    auto const &dst_strides = dst.get_strides_vector();
+    auto const &common_shape = src.get_shape_vector();
+
+    static constexpr py::ssize_t src_offset = 0;
+    static constexpr py::ssize_t dst_offset = 0;
+
+    auto fn = copy_for_roll_ndshift_dispatch_vector[type_id];
+
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    // shape_strides = [src_shape, src_strides, dst_strides]
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, common_shape, src_strides, dst_strides,
+        normalized_shifts);
+    auto shape_strides_shifts_owner =
+        std::move(std::get<0>(ptr_size_event_tuple));
+    sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *shape_strides_shifts = shape_strides_shifts_owner.get();
+
+    std::vector<sycl::event> all_deps(depends.size() + 1);
+    all_deps.push_back(copy_shape_ev);
+    all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends));
+
+    sycl::event copy_for_roll_event =
+        fn(exec_q, src_nelems, src_nd, shape_strides_shifts, src_data,
+           src_offset, dst_data, dst_offset, all_deps);
+
+    auto temporaries_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {copy_for_roll_event}, shape_strides_shifts_owner);
+    host_task_events.push_back(temporaries_cleanup_ev);
+
+    return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events),
+                          copy_for_roll_event);
+}
+
+void init_copy_for_roll_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    using dpctl::tensor::kernels::copy_and_cast::CopyForRollStridedFactory;
+
+    DispatchVectorBuilder<copy_for_roll_strided_fn_ptr_t,
+                          CopyForRollStridedFactory, num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(copy_for_roll_strided_dispatch_vector);
+
+    using dpctl::tensor::kernels::copy_and_cast::CopyForRollContigFactory;
+    DispatchVectorBuilder<copy_for_roll_contig_fn_ptr_t,
+                          CopyForRollContigFactory, num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(copy_for_roll_contig_dispatch_vector);
+
+    using dpctl::tensor::kernels::copy_and_cast::CopyForRollNDShiftFactory;
+    DispatchVectorBuilder<copy_for_roll_ndshift_strided_fn_ptr_t,
+                          CopyForRollNDShiftFactory, num_types>
+        dvb3;
+    dvb3.populate_dispatch_vector(copy_for_roll_ndshift_dispatch_vector);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/copy_for_roll.hpp b/dpctl_ext/tensor/libtensor/source/copy_for_roll.hpp
new file mode 100644
index 00000000000..cffbf9f6f0d
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/copy_for_roll.hpp
@@ -0,0 +1,65 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + copy_usm_ndarray_for_roll_1d(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + py::ssize_t shift, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern std::pair + copy_usm_ndarray_for_roll_nd(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const std::vector &shifts, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_copy_for_roll_dispatch_vectors(); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp b/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp new file mode 100644 index 00000000000..e97e8aeb1ca --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp @@ -0,0 +1,368 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/copy_and_cast.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "copy_numpy_ndarray_into_usm_ndarray.hpp" +#include "simplify_iteration_space.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl::tensor::py_internal +{ + +using dpctl::tensor::kernels::copy_and_cast:: + copy_and_cast_from_host_blocking_fn_ptr_t; + +static copy_and_cast_from_host_blocking_fn_ptr_t + copy_and_cast_from_host_blocking_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::copy_and_cast:: + copy_and_cast_from_host_contig_blocking_fn_ptr_t; + +static copy_and_cast_from_host_contig_blocking_fn_ptr_t + copy_and_cast_from_host_contig_blocking_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void copy_numpy_ndarray_into_usm_ndarray( + const py::array &npy_src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + int src_ndim = npy_src.ndim(); + int dst_ndim = dst.get_ndim(); + + if (src_ndim != dst_ndim) { + throw py::value_error("Source ndarray and destination usm_ndarray have " + "different array ranks, " + "i.e. 
different number of indices needed to "
+                              "address array elements.");
+    }
+
+    const py::ssize_t *src_shape = npy_src.shape();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+    bool shapes_equal(true);
+    std::size_t src_nelems(1);
+    for (int i = 0; shapes_equal && (i < src_ndim); ++i) {
+        shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]);
+        src_nelems *= static_cast<std::size_t>(src_shape[i]);
+    }
+
+    if (!shapes_equal) {
+        throw py::value_error("Source ndarray and destination usm_ndarray have "
+                              "different shapes.");
+    }
+
+    if (src_nelems == 0) {
+        // nothing to do
+        return;
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems);
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) {
+        throw py::value_error("Execution queue is not compatible with the "
+                              "allocation queue");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    // here we assume that NumPy's type numbers agree with ours for types
+    // supported in both
+    int src_typenum =
+        py::detail::array_descriptor_proxy(npy_src.dtype().ptr())->type_num;
+    int dst_typenum = dst.get_typenum();
+
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_type_id = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum);
+
+    py::buffer_info src_pybuf = npy_src.request();
+    const char *const src_data = static_cast<const char *>(src_pybuf.ptr);
+    char *dst_data = dst.get_data();
+
+    int src_flags = npy_src.flags();
+
+    // check for applicability of special cases:
+    // (same type) && (both C-contiguous || both F-contiguous)
+    const bool both_c_contig =
+        ((src_flags & py::array::c_style) && dst.is_c_contiguous());
+    const bool both_f_contig =
+        ((src_flags & py::array::f_style) && dst.is_f_contiguous());
+
+    const bool same_data_types = (src_type_id == dst_type_id);
+
+    if (both_c_contig || both_f_contig) {
+        if (same_data_types) {
+            int src_elem_size = npy_src.itemsize();
+
+            sycl::event copy_ev =
+                exec_q.memcpy(static_cast<void *>(dst_data),
+                              static_cast<const void *>(src_data),
+                              src_nelems * src_elem_size, depends);
+
+            {
+                // wait for copy_ev to complete
+                // release GIL to allow other threads (host_tasks)
+                // a chance to acquire GIL
+                py::gil_scoped_release lock{};
+                copy_ev.wait();
+            }
+
+            return;
+        }
+        else {
+            py::gil_scoped_release lock{};
+
+            auto copy_and_cast_from_host_contig_blocking_fn =
+                copy_and_cast_from_host_contig_blocking_dispatch_table
+                    [dst_type_id][src_type_id];
+
+            static constexpr py::ssize_t zero_offset(0);
+
+            copy_and_cast_from_host_contig_blocking_fn(
+                exec_q, src_nelems, src_data, zero_offset, dst_data,
+                zero_offset, depends);
+
+            return;
+        }
+    }
+
+    auto const &dst_strides =
+        dst.get_strides_vector(); // N.B.: strides in elements
+
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_shape;
+    shT simplified_src_strides;
+    shT simplified_dst_strides;
+    py::ssize_t src_offset(0);
+    py::ssize_t dst_offset(0);
+
+    int nd = src_ndim;
+    const py::ssize_t *shape = src_shape;
+
+    const py::ssize_t *src_strides_p =
+        npy_src.strides(); // N.B.: strides in bytes
+    py::ssize_t src_itemsize = npy_src.itemsize(); // item size in bytes
+
+    bool is_src_c_contig = ((src_flags & py::array::c_style) != 0);
+    bool is_src_f_contig = ((src_flags & py::array::f_style) != 0);
+
+    shT src_strides_in_elems;
+    if (src_strides_p) {
+        src_strides_in_elems.resize(nd);
+        // copy and convert strides from bytes to elements
+        std::transform(
+            src_strides_p, src_strides_p + nd, std::begin(src_strides_in_elems),
+            [src_itemsize](py::ssize_t el) {
+ py::ssize_t q = el / src_itemsize; + if (q * src_itemsize != el) { + throw std::runtime_error( + "NumPy array strides are not multiple of itemsize"); + } + return q; + }); + } + else { + if (is_src_c_contig) { + src_strides_in_elems = + dpctl::tensor::c_contiguous_strides(nd, src_shape); + } + else if (is_src_f_contig) { + src_strides_in_elems = + dpctl::tensor::f_contiguous_strides(nd, src_shape); + } + else { + throw py::value_error("NumPy source array has null strides but is " + "neither C- nor F-contiguous."); + } + } + + // nd, simplified_* vectors and offsets are modified by reference + simplify_iteration_space(nd, shape, src_strides_in_elems, dst_strides, + // outputs + simplified_shape, simplified_src_strides, + simplified_dst_strides, src_offset, dst_offset); + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_src_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + + // handle nd == 0 + if (nd == 0) { + nd = 1; + simplified_shape.reserve(nd); + simplified_shape.push_back(1); + + simplified_src_strides.reserve(nd); + simplified_src_strides.push_back(1); + + simplified_dst_strides.reserve(nd); + simplified_dst_strides.push_back(1); + } + + const bool is_contig_vector = + ((nd == 1) && (simplified_src_strides.front() == 1) && + (simplified_dst_strides.front() == 1)); + + const bool can_use_memcpy = (same_data_types && is_contig_vector && + (src_offset == 0) && (dst_offset == 0)); + + if (can_use_memcpy) { + int src_elem_size = npy_src.itemsize(); + + sycl::event copy_ev = exec_q.memcpy( + static_cast(dst_data), static_cast(src_data), + src_nelems * src_elem_size, depends); + + { + // wait for copy_ev to complete + // release GIL to allow other threads (host_tasks) + // a chance to acquire GIL + py::gil_scoped_release lock{}; + + copy_ev.wait(); + } + + return; + } + + // Minimum and maximum element offsets for source np.ndarray + py::ssize_t npy_src_min_nelem_offset(src_offset); + py::ssize_t npy_src_max_nelem_offset(src_offset); + for (int i = 0; i < nd; ++i) { + if (simplified_src_strides[i] < 0) { + npy_src_min_nelem_offset += + simplified_src_strides[i] * (simplified_shape[i] - 1); + } + else { + npy_src_max_nelem_offset += + simplified_src_strides[i] * (simplified_shape[i] - 1); + } + } + + if (is_contig_vector) { + // release GIL for the blocking call + py::gil_scoped_release lock{}; + + auto copy_and_cast_from_host_contig_blocking_fn = + copy_and_cast_from_host_contig_blocking_dispatch_table[dst_type_id] + [src_type_id]; + + copy_and_cast_from_host_contig_blocking_fn(exec_q, src_nelems, src_data, + src_offset, dst_data, + dst_offset, depends); + + return; + } + + std::vector host_task_events; + host_task_events.reserve(1); + + // Copy shape strides into device memory + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides, + simplified_dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + { + // release GIL for the blocking call + py::gil_scoped_release lock{}; + + // Get implementation function pointer + auto copy_and_cast_from_host_blocking_fn = + copy_and_cast_from_host_blocking_dispatch_table[dst_type_id] + [src_type_id]; + + copy_and_cast_from_host_blocking_fn( + exec_q, src_nelems, nd, shape_strides, 
src_data, src_offset, + npy_src_min_nelem_offset, npy_src_max_nelem_offset, dst_data, + dst_offset, depends, {copy_shape_ev}); + + // invoke USM deleter in smart pointer while GIL is held + shape_strides_owner.reset(nullptr); + } + + return; +} + +void init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(void) +{ + using namespace td_ns; + using dpctl::tensor::kernels::copy_and_cast::CopyAndCastFromHostFactory; + + DispatchTableBuilder + dtb_copy_from_numpy; + + dtb_copy_from_numpy.populate_dispatch_table( + copy_and_cast_from_host_blocking_dispatch_table); + + using dpctl::tensor::kernels::copy_and_cast:: + CopyAndCastFromHostContigFactory; + + DispatchTableBuilder + dtb_copy_from_numpy_contig; + + dtb_copy_from_numpy_contig.populate_dispatch_table( + copy_and_cast_from_host_contig_blocking_dispatch_table); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp b/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp new file mode 100644 index 00000000000..f2de95f97cc --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp @@ -0,0 +1,57 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void copy_numpy_ndarray_into_usm_ndarray( + const py::array &npy_src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(void); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 182124ac4ae..c1372c1c240 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -51,9 +51,9 @@ // #include "clip.hpp" #include "copy_and_cast_usm_to_usm.hpp" #include "copy_as_contig.hpp" -// #include "copy_for_reshape.hpp" -// #include "copy_for_roll.hpp" -// #include "copy_numpy_ndarray_into_usm_ndarray.hpp" +#include "copy_for_reshape.hpp" +#include "copy_for_roll.hpp" +#include "copy_numpy_ndarray_into_usm_ndarray.hpp" #include "device_support_queries.hpp" // #include "eye_ctor.hpp" #include "full_ctor.hpp" @@ -87,16 +87,16 @@ using dpctl::tensor::py_internal::py_as_f_contig; /* =========================== Copy for reshape ============================= */ -// using dpctl::tensor::py_internal::copy_usm_ndarray_for_reshape; +using dpctl::tensor::py_internal::copy_usm_ndarray_for_reshape; /* =========================== Copy for roll ============================= */ -// using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_1d; -// using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_nd; +using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_1d; +using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_nd; /* ============= Copy from numpy.ndarray to usm_ndarray ==================== */ -// using dpctl::tensor::py_internal::copy_numpy_ndarray_into_usm_ndarray; +using dpctl::tensor::py_internal::copy_numpy_ndarray_into_usm_ndarray; /* ============= linear-sequence ==================== */ @@ -146,7 +146,7 @@ void init_dispatch_tables(void) using namespace dpctl::tensor::py_internal; init_copy_and_cast_usm_to_usm_dispatch_tables(); - // init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(); + init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(); init_advanced_indexing_dispatch_tables(); // init_where_dispatch_tables(); return; @@ -158,8 +158,8 @@ void init_dispatch_vectors(void) using namespace dpctl::tensor::py_internal; init_copy_as_contig_dispatch_vectors(); - // init_copy_for_reshape_dispatch_vectors(); - // init_copy_for_roll_dispatch_vectors(); + init_copy_for_reshape_dispatch_vectors(); + init_copy_for_roll_dispatch_vectors(); // init_linear_sequences_dispatch_vectors(); init_full_ctor_dispatch_vectors(); init_zeros_ctor_dispatch_vectors(); @@ -279,28 +279,29 @@ PYBIND11_MODULE(_tensor_impl, m) }, ""); - // m.def("_copy_usm_ndarray_for_reshape", ©_usm_ndarray_for_reshape, - // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same - // " "number of elements using underlying 'C'-contiguous order for - // flat " "traversal. 
" "Returns a tuple of events: (ht_event, - // comp_event)", py::arg("src"), py::arg("dst"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_copy_usm_ndarray_for_reshape", ©_usm_ndarray_for_reshape, + "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same " + "number of elements using underlying 'C'-contiguous order for flat " + "traversal. " + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); - // m.def("_copy_usm_ndarray_for_roll_1d", ©_usm_ndarray_for_roll_1d, - // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same - // " "shapes using underlying 'C'-contiguous order for flat " - // "traversal with shift. " - // "Returns a tuple of events: (ht_event, comp_event)", - // py::arg("src"), py::arg("dst"), py::arg("shift"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_copy_usm_ndarray_for_roll_1d", ©_usm_ndarray_for_roll_1d, + "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same " + "shapes using underlying 'C'-contiguous order for flat " + "traversal with shift. " + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("src"), py::arg("dst"), py::arg("shift"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_copy_usm_ndarray_for_roll_nd", ©_usm_ndarray_for_roll_nd, - // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same - // " "shapes using underlying 'C'-contiguous order for " "traversal - // with shifts along each axis. " "Returns a tuple of events: - // (ht_event, comp_event)", py::arg("src"), py::arg("dst"), - // py::arg("shifts"), py::arg("sycl_queue"), py::arg("depends") = - // py::list()); + m.def("_copy_usm_ndarray_for_roll_nd", ©_usm_ndarray_for_roll_nd, + "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same " + "shapes using underlying 'C'-contiguous order for " + "traversal with shifts along each axis. 
" + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("src"), py::arg("dst"), py::arg("shifts"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); // m.def("_linspace_step", &usm_ndarray_linear_sequence_step, // "Fills input 1D contiguous usm_ndarray `dst` with linear @@ -317,11 +318,11 @@ PYBIND11_MODULE(_tensor_impl, m) // py::arg("include_endpoint"), py::arg("sycl_queue"), // py::arg("depends") = py::list()); - // m.def("_copy_numpy_ndarray_into_usm_ndarray", - // ©_numpy_ndarray_into_usm_ndarray, - // "Copy from numpy array `src` into usm_ndarray `dst` - // synchronously.", py::arg("src"), py::arg("dst"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_copy_numpy_ndarray_into_usm_ndarray", + ©_numpy_ndarray_into_usm_ndarray, + "Copy from numpy array `src` into usm_ndarray `dst` synchronously.", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); m.def("_zeros_usm_ndarray", &usm_ndarray_zeros, "Populate usm_ndarray `dst` with zeros.", py::arg("dst"), diff --git a/dpnp/dpnp_algo/dpnp_arraycreation.py b/dpnp/dpnp_algo/dpnp_arraycreation.py index d94a031801f..47edf63a68b 100644 --- a/dpnp/dpnp_algo/dpnp_arraycreation.py +++ b/dpnp/dpnp_algo/dpnp_arraycreation.py @@ -33,6 +33,9 @@ import dpctl.utils as dpu import numpy +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from dpnp.dpnp_array import dpnp_array from dpnp.dpnp_utils import get_usm_allocations, map_dtype_to_device @@ -230,7 +233,9 @@ def dpnp_linspace( usm_type=_usm_type, sycl_queue=sycl_queue_normalized, ) - usm_res = dpt.reshape(usm_res, (-1,) + (1,) * delta.ndim, copy=False) + usm_res = dpt_ext.reshape( + usm_res, (-1,) + (1,) * delta.ndim, copy=False + ) if step_num > 0: step = delta / step_num @@ -256,7 +261,7 @@ def dpnp_linspace( if dpnp.issubdtype(dtype, dpnp.integer): dpt.floor(usm_res, out=usm_res) - res = dpt.astype(usm_res, dtype, copy=False) + res = dpt_ext.astype(usm_res, dtype, copy=False) res = dpnp_array._create_from_usm_ndarray(res) if retstep is True: diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py index 88abcee5035..55d74e8c180 100644 --- a/dpnp/dpnp_algo/dpnp_elementwise_common.py +++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py @@ -47,6 +47,7 @@ # pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as dti import dpnp import dpnp.backend.extensions.vm._vm_impl as vmi @@ -212,7 +213,7 @@ def __call__( x_usm = dpnp.get_usm_ndarray(x) if dtype is not None: - x_usm = dpt.astype(x_usm, dtype, copy=False) + x_usm = dpt_ext.astype(x_usm, dtype, copy=False) out = self._unpack_out_kw(out) out_usm = None if out is None else dpnp.get_usm_ndarray(out) @@ -718,9 +719,9 @@ def __call__( sycl_queue=x2.sycl_queue, usm_type=x2.usm_type, ) - x2_usm = dpt.astype(x2_usm, dtype, copy=False) + x2_usm = dpt_ext.astype(x2_usm, dtype, copy=False) elif dpnp.isscalar(x2): - x1_usm = dpt.astype(x1_usm, dtype, copy=False) + x1_usm = dpt_ext.astype(x1_usm, dtype, copy=False) x2_usm = dpt.asarray( x2, dtype=dtype, @@ -728,8 +729,8 @@ def __call__( usm_type=x1.usm_type, ) else: - x1_usm = dpt.astype(x1_usm, dtype, copy=False) - x2_usm = dpt.astype(x2_usm, dtype, copy=False) + x1_usm = dpt_ext.astype(x1_usm, dtype, copy=False) + x2_usm = dpt_ext.astype(x2_usm, dtype, copy=False) res_usm = 
super().__call__(x1_usm, x2_usm, out=out_usm, order=order) @@ -1325,7 +1326,7 @@ def __call__(self, x, /, decimals=0, out=None, *, dtype=None): res_usm = dpt.divide(x_usm, 10**decimals, out=out_usm) if dtype is not None: - res_usm = dpt.astype(res_usm, dtype, copy=False) + res_usm = dpt_ext.astype(res_usm, dtype, copy=False) if out is not None and isinstance(out, dpnp_array): return out diff --git a/dpnp/dpnp_algo/dpnp_fill.py b/dpnp/dpnp_algo/dpnp_fill.py index 4137a279474..ddba9f634cb 100644 --- a/dpnp/dpnp_algo/dpnp_fill.py +++ b/dpnp/dpnp_algo/dpnp_fill.py @@ -32,10 +32,10 @@ import dpctl.utils as dpu from dpctl.tensor._ctors import _cast_fill_val -import dpnp - # TODO: revert to `from dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext +import dpnp from dpctl_ext.tensor._tensor_impl import ( _copy_usm_ndarray_into_usm_ndarray, _full_usm_ndarray, @@ -56,7 +56,7 @@ def dpnp_fill(arr, val): raise dpu.ExecutionPlacementError( "Input arrays have incompatible queues." ) - a_val = dpt.astype(val, arr.dtype) + a_val = dpt_ext.astype(val, arr.dtype) a_val = dpt.broadcast_to(a_val, arr.shape) _manager = dpu.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events diff --git a/dpnp/dpnp_array.py b/dpnp/dpnp_array.py index 6a2b2fd1977..d122aff2c13 100644 --- a/dpnp/dpnp_array.py +++ b/dpnp/dpnp_array.py @@ -41,6 +41,9 @@ import dpctl.tensor._type_utils as dtu from dpctl.tensor._numpy_helper import AxisError +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from . import memory as dpm @@ -764,7 +767,7 @@ def asnumpy(self): """ - return dpt.asnumpy(self._array_obj) + return dpt_ext.asnumpy(self._array_obj) def astype( self, diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py index c8e28529cd5..acda579a5f5 100644 --- a/dpnp/dpnp_container.py +++ b/dpnp/dpnp_container.py @@ -38,6 +38,8 @@ import dpctl.tensor as dpt import dpctl.utils as dpu +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor as dpt_ext import dpnp from dpnp.dpnp_array import dpnp_array @@ -141,7 +143,7 @@ def copy(x1, /, *, order="K"): if order is None: order = "K" - array_obj = dpt.copy(dpnp.get_usm_ndarray(x1), order=order) + array_obj = dpt_ext.copy(dpnp.get_usm_ndarray(x1), order=order) return dpnp_array._create_from_usm_ndarray(array_obj) diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py index 533bdc36c61..6c050a20898 100644 --- a/dpnp/dpnp_iface.py +++ b/dpnp/dpnp_iface.py @@ -53,6 +53,7 @@ # pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti import dpnp @@ -136,7 +137,7 @@ def asnumpy(a, order="C"): return a.asnumpy() if isinstance(a, dpt.usm_ndarray): - return dpt.asnumpy(a) + return dpt_ext.asnumpy(a) return numpy.asarray(a, order=order) diff --git a/dpnp/dpnp_iface_arraycreation.py b/dpnp/dpnp_iface_arraycreation.py index 8d4ebdd1a6c..c2dd5793f82 100644 --- a/dpnp/dpnp_iface_arraycreation.py +++ b/dpnp/dpnp_iface_arraycreation.py @@ -46,6 +46,9 @@ import dpctl.tensor as dpt import numpy +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from dpnp import dpnp_container @@ -934,7 +937,7 @@ def astype(x, dtype, /, *, order="K", casting="unsafe", copy=True, device=None): order = "K" usm_x = 
dpnp.get_usm_ndarray(x) - usm_res = dpt.astype( + usm_res = dpt_ext.astype( usm_x, dtype, order=order, casting=casting, copy=copy, device=device ) @@ -3114,7 +3117,7 @@ def meshgrid(*xi, copy=True, sparse=False, indexing="xy"): s0 = (1,) * ndim output = [ - dpt.reshape(dpnp.get_usm_ndarray(x), s0[:i] + (-1,) + s0[i + 1 :]) + dpt_ext.reshape(dpnp.get_usm_ndarray(x), s0[:i] + (-1,) + s0[i + 1 :]) for i, x in enumerate(xi) ] @@ -3122,14 +3125,14 @@ def meshgrid(*xi, copy=True, sparse=False, indexing="xy"): _, _ = get_usm_allocations(output) if indexing == "xy" and ndim > 1: - output[0] = dpt.reshape(output[0], (1, -1) + s0[2:]) - output[1] = dpt.reshape(output[1], (-1, 1) + s0[2:]) + output[0] = dpt_ext.reshape(output[0], (1, -1) + s0[2:]) + output[1] = dpt_ext.reshape(output[1], (-1, 1) + s0[2:]) if not sparse: output = dpt.broadcast_arrays(*output) if copy: - output = [dpt.copy(x) for x in output] + output = [dpt_ext.copy(x) for x in output] return [dpnp_array._create_from_usm_ndarray(x) for x in output] @@ -3929,7 +3932,7 @@ def vander( tmp = m[:, ::-1] if not increasing else m dpnp.power( - dpt.reshape(usm_x, (-1, 1)), + dpt_ext.reshape(usm_x, (-1, 1)), dpt.arange( N, dtype=_dtype, usm_type=x_usm_type, sycl_queue=x_sycl_queue ), diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index a01a036e16c..439ec288ebe 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -51,11 +51,10 @@ from dpctl.tensor._indexing_functions import _get_indexing_mode from dpctl.tensor._numpy_helper import normalize_axis_index -import dpctl_ext.tensor as dpt_ext - # pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti import dpnp @@ -243,7 +242,7 @@ def choose(a, choices, out=None, mode="wrap"): # NumPy will cast up to int64 in general but # int32 is more than safe for bool if ind_dt == dpnp.bool: - inds = dpt.astype(inds, dpt.int32) + inds = dpt_ext.astype(inds, dpt.int32) else: raise TypeError("input index array must be of integer data type") @@ -256,7 +255,7 @@ def choose(a, choices, out=None, mode="wrap"): choices = tuple( map( lambda chc: ( - chc if chc.dtype == res_dt else dpt.astype(chc, res_dt) + chc if chc.dtype == res_dt else dpt_ext.astype(chc, res_dt) ), choices, ) @@ -815,14 +814,14 @@ def extract(condition, a): ) if usm_cond.size != usm_a.size: - usm_a = dpt.reshape(usm_a, -1) - usm_cond = dpt.reshape(usm_cond, -1) + usm_a = dpt_ext.reshape(usm_a, -1) + usm_cond = dpt_ext.reshape(usm_cond, -1) usm_res = dpt_ext.take(usm_a, dpt.nonzero(usm_cond)[0]) else: if usm_cond.shape != usm_a.shape: - usm_a = dpt.reshape(usm_a, -1) - usm_cond = dpt.reshape(usm_cond, -1) + usm_a = dpt_ext.reshape(usm_a, -1) + usm_cond = dpt_ext.reshape(usm_cond, -1) usm_res = dpt.extract(usm_cond, usm_a) @@ -959,18 +958,18 @@ def fill_diagonal(a, val, wrap=False): # a.flat[:end:step] = val # but need to consider use case when `a` is usm_ndarray also a_sh = a.shape - tmp_a = dpt.reshape(usm_a, -1) + tmp_a = dpt_ext.reshape(usm_a, -1) if dpnp.isscalar(usm_val): tmp_a[:end:step] = usm_val else: - usm_val = dpt.reshape(usm_val, -1) + usm_val = dpt_ext.reshape(usm_val, -1) # Setitem can work only if index size equal val size. # Using loop for general case without dependencies of val size. 
for i in range(0, usm_val.size): tmp_a[step * i : end : step * (i + 1)] = usm_val[i] - tmp_a = dpt.reshape(tmp_a, a_sh) + tmp_a = dpt_ext.reshape(tmp_a, a_sh) usm_a[:] = tmp_a @@ -1611,12 +1610,14 @@ def place(a, mask, vals): if usm_vals.ndim != 1: # dpt.place supports only 1-D array of values - usm_vals = dpt.reshape(usm_vals, -1) + usm_vals = dpt_ext.reshape(usm_vals, -1) if usm_vals.dtype != usm_a.dtype: # dpt.place casts values to a.dtype with "unsafe" rule, # while numpy.place does that with "safe" casting rule - usm_vals = dpt.astype(usm_vals, usm_a.dtype, casting="safe", copy=False) + usm_vals = dpt_ext.astype( + usm_vals, usm_a.dtype, casting="safe", copy=False + ) dpt.place(usm_a, usm_mask, usm_vals) @@ -1708,19 +1709,19 @@ def put(a, ind, v, /, *, axis=None, mode="wrap"): if usm_ind.ndim != 1: # dpt.put supports only 1-D array of indices - usm_ind = dpt.reshape(usm_ind, -1, copy=False) + usm_ind = dpt_ext.reshape(usm_ind, -1, copy=False) if not dpnp.issubdtype(usm_ind.dtype, dpnp.integer): # dpt.put supports only integer dtype for array of indices - usm_ind = dpt.astype(usm_ind, dpnp.intp, casting="safe") + usm_ind = dpt_ext.astype(usm_ind, dpnp.intp, casting="safe") in_usm_a = usm_a if axis is None and usm_a.ndim > 1: - usm_a = dpt.reshape(usm_a, -1) + usm_a = dpt_ext.reshape(usm_a, -1) dpt_ext.put(usm_a, usm_ind, usm_v, axis=axis, mode=mode) if in_usm_a._pointer != usm_a._pointer: # pylint: disable=protected-access - in_usm_a[:] = dpt.reshape(usm_a, in_usm_a.shape, copy=False) + in_usm_a[:] = dpt_ext.reshape(usm_a, in_usm_a.shape, copy=False) def put_along_axis(a, ind, values, axis, mode="wrap"): @@ -2162,7 +2163,7 @@ def take(a, indices, /, *, axis=None, out=None, mode="wrap"): if axis is None: if a_ndim > 1: # flatten input array - usm_a = dpt.reshape(usm_a, -1) + usm_a = dpt_ext.reshape(usm_a, -1) axis = 0 elif a_ndim == 0: axis = normalize_axis_index(operator.index(axis), 1) @@ -2171,7 +2172,7 @@ def take(a, indices, /, *, axis=None, out=None, mode="wrap"): if not dpnp.issubdtype(usm_ind.dtype, dpnp.integer): # dpt.take supports only integer dtype for array of indices - usm_ind = dpt.astype(usm_ind, dpnp.intp, copy=False, casting="safe") + usm_ind = dpt_ext.astype(usm_ind, dpnp.intp, copy=False, casting="safe") usm_res = _take_index( usm_a, usm_ind, axis, exec_q, res_usm_type, out=out, mode=mode diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index 9df5278bd16..edd98348afe 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -53,6 +53,9 @@ normalize_axis_tuple, ) +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from .dpnp_array import dpnp_array @@ -415,7 +418,7 @@ def _get_first_nan_index(usm_a): dpt.place( usm_res.inverse_indices, usm_res.inverse_indices > first_nan, - dpt.reshape(first_nan, 1), + dpt_ext.reshape(first_nan, 1), ) result += (usm_res.inverse_indices,) @@ -3057,7 +3060,7 @@ def reshape(a, /, shape, order="C", *, copy=None): ) usm_a = dpnp.get_usm_ndarray(a) - usm_res = dpt.reshape(usm_a, shape=shape, order=order, copy=copy) + usm_res = dpt_ext.reshape(usm_a, shape=shape, order=order, copy=copy) return dpnp_array._create_from_usm_ndarray(usm_res) @@ -3259,9 +3262,9 @@ def roll(x, shift, axis=None): shift = dpnp.asnumpy(shift) if axis is None: - return roll(dpt.reshape(usm_x, -1), shift, 0).reshape(x.shape) + return roll(dpt_ext.reshape(usm_x, -1), shift, 0).reshape(x.shape) - usm_res = dpt.roll(usm_x, shift=shift, 
axis=axis) + usm_res = dpt_ext.roll(usm_x, shift=shift, axis=axis) return dpnp_array._create_from_usm_ndarray(usm_res) diff --git a/dpnp/dpnp_iface_sorting.py b/dpnp/dpnp_iface_sorting.py index db33a88c748..3e5cdd4da6a 100644 --- a/dpnp/dpnp_iface_sorting.py +++ b/dpnp/dpnp_iface_sorting.py @@ -43,6 +43,9 @@ import numpy from dpctl.tensor._numpy_helper import normalize_axis_index +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp # pylint: disable=no-name-in-module @@ -84,7 +87,7 @@ def _wrap_sort_argsort( usm_a = dpnp.get_usm_ndarray(a) if axis is None: - usm_a = dpt.reshape(usm_a, -1) + usm_a = dpt_ext.reshape(usm_a, -1) axis = -1 axis = normalize_axis_index(axis, ndim=usm_a.ndim) diff --git a/dpnp/dpnp_iface_statistics.py b/dpnp/dpnp_iface_statistics.py index 7e092184366..daff981d5cc 100644 --- a/dpnp/dpnp_iface_statistics.py +++ b/dpnp/dpnp_iface_statistics.py @@ -47,6 +47,9 @@ import numpy from dpctl.tensor._numpy_helper import normalize_axis_index +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp # pylint: disable=no-name-in-module @@ -1204,7 +1207,7 @@ def mean(a, /, axis=None, dtype=None, out=None, keepdims=False, *, where=True): usm_a = dpnp.get_usm_ndarray(a) usm_res = dpt.mean(usm_a, axis=axis, keepdims=keepdims) if dtype is not None: - usm_res = dpt.astype(usm_res, dtype) + usm_res = dpt_ext.astype(usm_res, dtype) return dpnp.get_result_array(usm_res, out, casting="unsafe") diff --git a/dpnp/tests/test_arraycreation.py b/dpnp/tests/test_arraycreation.py index eb20f9b3ffe..423004470ba 100644 --- a/dpnp/tests/test_arraycreation.py +++ b/dpnp/tests/test_arraycreation.py @@ -13,6 +13,9 @@ assert_raises_regex, ) +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from .helper import ( @@ -969,7 +972,7 @@ def test_ones_like(array, dtype, order): ], ) def test_dpctl_tensor_input(func, args): - x0 = dpt.reshape(dpt.arange(9), (3, 3)) + x0 = dpt_ext.reshape(dpt.arange(9), (3, 3)) new_args = [eval(val, {"x0": x0}) for val in args] X = getattr(dpt, func)(*new_args) Y = getattr(dpnp, func)(*new_args)
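
Reviewer note: the snippet below is an illustrative usage sketch, not part of the patch. It strings together the dpctl_ext.tensor entry points that the hunks above switch dpnp over to (reshape, roll, astype, asnumpy), and assumes dpctl and dpctl_ext are importable and a default SYCL device is available.

# Illustrative only: exercises the re-enabled copy paths end to end.
import dpctl.tensor as dpt

import dpctl_ext.tensor as dpt_ext

x = dpt.arange(12, dtype=dpt.int64)   # device allocation via dpctl
m = dpt_ext.reshape(x, (3, 4))        # _copy_usm_ndarray_for_reshape is used when a copy is required
r = dpt_ext.roll(m, shift=1, axis=1)  # served by the _copy_usm_ndarray_for_roll_* kernels
r = dpt_ext.astype(r, dpt.float32)    # cast performed on the device
print(dpt_ext.asnumpy(r))             # device-to-host copy back to a numpy.ndarray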