diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 53a0dfd27ba..a39368c7baa 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -48,9 +48,9 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_as_contig.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 3c6939eff7a..edb2c096bad 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -27,6 +27,13 @@ # ***************************************************************************** +from dpctl_ext.tensor._copy_utils import ( + asnumpy, + astype, + copy, + from_numpy, + to_numpy, +) from dpctl_ext.tensor._ctors import ( full, tril, @@ -36,11 +43,22 @@ put, take, ) +from dpctl_ext.tensor._manipulation_functions import ( + roll, +) +from dpctl_ext.tensor._reshape import reshape __all__ = [ + "asnumpy", + "astype", + "copy", + "from_numpy", "full", "put", + "reshape", + "roll", "take", + "to_numpy", "tril", "triu", ] diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py new file mode 100644 index 00000000000..c62218893a2 --- /dev/null +++ b/dpctl_ext/tensor/_copy_utils.py @@ -0,0 +1,755 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import builtins + +import dpctl +import dpctl.memory as dpm +import dpctl.tensor as dpt +import dpctl.utils +import numpy as np +from dpctl.tensor._data_types import _get_dtype +from dpctl.tensor._device import normalize_queue_device +from dpctl.tensor._type_utils import _dtype_supported_by_device_impl + +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor._tensor_impl as ti + +__doc__ = ( + "Implementation module for copy- and cast- operations on " + ":class:`dpctl.tensor.usm_ndarray`." +) + +int32_t_max = 1 + np.iinfo(np.int32).max + + +def _copy_to_numpy(ary): + if not isinstance(ary, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(ary)}") + if ary.size == 0: + # no data needs to be copied for zero sized array + return np.ndarray(ary.shape, dtype=ary.dtype) + nb = ary.usm_data.nbytes + q = ary.sycl_queue + hh = dpm.MemoryUSMHost(nb, queue=q) + h = np.ndarray(nb, dtype="u1", buffer=hh).view(ary.dtype) + itsz = ary.itemsize + strides_bytes = tuple(si * itsz for si in ary.strides) + offset = ary._element_offset * itsz + # ensure that content of ary.usm_data is final + q.wait() + hh.copy_from_device(ary.usm_data) + return np.ndarray( + ary.shape, + dtype=ary.dtype, + buffer=h, + strides=strides_bytes, + offset=offset, + ) + + +def _copy_from_numpy(np_ary, usm_type="device", sycl_queue=None): + """Copies numpy array `np_ary` into a new usm_ndarray""" + # This may perform a copy to meet stated requirements + Xnp = np.require(np_ary, requirements=["A", "E"]) + alloc_q = normalize_queue_device(sycl_queue=sycl_queue, device=None) + dt = Xnp.dtype + if dt.char in "dD" and alloc_q.sycl_device.has_aspect_fp64 is False: + Xusm_dtype = ( + dpt.dtype("float32") if dt.char == "d" else dpt.dtype("complex64") + ) + else: + Xusm_dtype = dt + Xusm = dpt.empty( + Xnp.shape, dtype=Xusm_dtype, usm_type=usm_type, sycl_queue=sycl_queue + ) + _copy_from_numpy_into(Xusm, Xnp) + return Xusm + + +def _copy_from_numpy_into(dst, np_ary): + """Copies `np_ary` into `dst` of type :class:`dpctl.tensor.usm_ndarray""" + if not isinstance(np_ary, np.ndarray): + raise TypeError(f"Expected numpy.ndarray, got {type(np_ary)}") + if not isinstance(dst, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(dst)}") + if np_ary.flags["OWNDATA"]: + Xnp = np_ary + else: + # Determine base of input array + base = np_ary.base + while isinstance(base, np.ndarray): + base = base.base + if isinstance(base, dpm._memory._Memory): + # we must perform a copy, since subsequent + # _copy_numpy_ndarray_into_usm_ndarray is implemented using + # sycl::buffer, and using USM-pointers with sycl::buffer + # results is undefined behavior + Xnp = np_ary.copy() + else: + Xnp = np_ary + src_ary = np.broadcast_to(Xnp, dst.shape) + copy_q = dst.sycl_queue + if copy_q.sycl_device.has_aspect_fp64 is False: + src_ary_dt_c = src_ary.dtype.char + 
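+        # The destination array's device lacks the fp64 aspect, so 64-bit
+        # floating point host data is downcast (float64 -> float32,
+        # complex128 -> complex64) before the copy, matching the dtype
+        # selection performed in _copy_from_numpy above.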
if src_ary_dt_c == "d": + src_ary = src_ary.astype(np.float32) + elif src_ary_dt_c == "D": + src_ary = src_ary.astype(np.complex64) + _manager = dpctl.utils.SequentialOrderManager[copy_q] + dep_ev = _manager.submitted_events + # synchronizing call + ti._copy_numpy_ndarray_into_usm_ndarray( + src=src_ary, dst=dst, sycl_queue=copy_q, depends=dep_ev + ) + + +def from_numpy(np_ary, /, *, device=None, usm_type="device", sycl_queue=None): + """ + from_numpy(arg, device=None, usm_type="device", sycl_queue=None) + + Creates :class:`dpctl.tensor.usm_ndarray` from instance of + :class:`numpy.ndarray`. + + Args: + arg: + Input convertible to :class:`numpy.ndarray` + device (object): array API specification of device where the + output array is created. Device can be specified by + a filter selector string, an instance of + :class:`dpctl.SyclDevice`, an instance of + :class:`dpctl.SyclQueue`, or an instance of + :class:`dpctl.tensor.Device`. If the value is ``None``, + returned array is created on the default-selected device. + Default: ``None`` + usm_type (str): The requested USM allocation type for the + output array. Recognized values are ``"device"``, + ``"shared"``, or ``"host"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + A SYCL queue that determines output array allocation device + as well as execution placement of data movement operations. + The ``device`` and ``sycl_queue`` arguments + are equivalent. Only one of them should be specified. If both + are provided, they must be consistent and result in using the + same execution queue. Default: ``None`` + + The returned array has the same shape, and the same data type kind. + If the device does not support the data type of input array, a + closest support data type of the same kind may be returned, e.g. + input array of type ``float16`` may be upcast to ``float32`` if the + target device does not support 16-bit floating point type. + """ + q = normalize_queue_device(sycl_queue=sycl_queue, device=device) + return _copy_from_numpy(np_ary, usm_type=usm_type, sycl_queue=q) + + +def to_numpy(usm_ary, /): + """ + to_numpy(usm_ary) + + Copies content of :class:`dpctl.tensor.usm_ndarray` instance ``usm_ary`` + into :class:`numpy.ndarray` instance of the same shape and same data type. + + Args: + usm_ary (usm_ndarray): + Input array + Returns: + :class:`numpy.ndarray`: + An instance of :class:`numpy.ndarray` populated with content of + ``usm_ary`` + """ + return _copy_to_numpy(usm_ary) + + +def asnumpy(usm_ary): + """ + asnumpy(usm_ary) + + Copies content of :class:`dpctl.tensor.usm_ndarray` instance ``usm_ary`` + into :class:`numpy.ndarray` instance of the same shape and same data + type. 
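+
+    A minimal round-trip sketch (illustrative; assumes the default-selected
+    device supports the input data type)::
+
+        import numpy as np
+        import dpctl_ext.tensor as dpt_ext
+
+        x = dpt_ext.from_numpy(np.arange(4))  # host -> USM allocation
+        y = dpt_ext.asnumpy(x)                # USM -> host copy of x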
+ + Args: + usm_ary (usm_ndarray): + Input array + Returns: + :class:`numpy.ndarray`: + An instance of :class:`numpy.ndarray` populated with content + of ``usm_ary`` + """ + return _copy_to_numpy(usm_ary) + + +class Dummy: + """Helper class with specified ``__sycl_usm_array_interface__`` attribute""" + + def __init__(self, iface): + self.__sycl_usm_array_interface__ = iface + + +def _copy_overlapping(dst, src): + """Assumes src and dst have the same shape.""" + q = normalize_queue_device(sycl_queue=dst.sycl_queue) + tmp = dpt.usm_ndarray( + src.shape, + dtype=src.dtype, + buffer="device", + order="C", + buffer_ctor_kwargs={"queue": q}, + ) + _manager = dpctl.utils.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hcp1, cp1 = ti._copy_usm_ndarray_into_usm_ndarray( + src=src, dst=tmp, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hcp1, cp1) + hcp2, cp2 = ti._copy_usm_ndarray_into_usm_ndarray( + src=tmp, dst=dst, sycl_queue=q, depends=[cp1] + ) + _manager.add_event_pair(hcp2, cp2) + + +def _copy_same_shape(dst, src): + """Assumes src and dst have the same shape.""" + # check that memory regions do not overlap + if ti._array_overlap(dst, src): + if src._pointer == dst._pointer and ( + src is dst + or (src.strides == dst.strides and src.dtype == dst.dtype) + ): + return + _copy_overlapping(src=src, dst=dst) + return + + copy_q = dst.sycl_queue + _manager = dpctl.utils.SequentialOrderManager[copy_q] + dep_evs = _manager.submitted_events + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=src, dst=dst, sycl_queue=copy_q, depends=dep_evs + ) + _manager.add_event_pair(hev, cpy_ev) + + +if hasattr(np, "broadcast_shapes"): + + def _broadcast_shapes(sh1, sh2): + return np.broadcast_shapes(sh1, sh2) + +else: + + def _broadcast_shapes(sh1, sh2): + # use arrays with zero strides, whose memory footprint + # is independent of the number of array elements + return np.broadcast( + np.empty(sh1, dtype=[]), + np.empty(sh2, dtype=[]), + ).shape + + +def _broadcast_strides(X_shape, X_strides, res_ndim): + """ + Broadcasts strides to match the given dimensions; + returns tuple type strides. + """ + out_strides = [0] * res_ndim + X_shape_len = len(X_shape) + str_dim = -X_shape_len + for i in range(X_shape_len): + shape_value = X_shape[i] + if not shape_value == 1: + out_strides[str_dim] = X_strides[i] + str_dim += 1 + + return tuple(out_strides) + + +def _copy_from_usm_ndarray_to_usm_ndarray(dst, src): + if any( + not isinstance(arg, dpt.usm_ndarray) + for arg in ( + dst, + src, + ) + ): + raise TypeError( + "Both types are expected to be dpctl.tensor.usm_ndarray, " + f"got {type(dst)} and {type(src)}." 
+ ) + + if dst.ndim == src.ndim and dst.shape == src.shape: + _copy_same_shape(dst, src) + return + + try: + common_shape = _broadcast_shapes(dst.shape, src.shape) + except ValueError as exc: + raise ValueError("Shapes of two arrays are not compatible") from exc + + if dst.size < src.size and dst.size < np.prod(common_shape): + raise ValueError("Destination is smaller ") + + if len(common_shape) > dst.ndim: + ones_count = len(common_shape) - dst.ndim + for k in range(ones_count): + if common_shape[k] != 1: + raise ValueError + common_shape = common_shape[ones_count:] + + if src.ndim < len(common_shape): + new_src_strides = _broadcast_strides( + src.shape, src.strides, len(common_shape) + ) + src_same_shape = dpt.usm_ndarray( + common_shape, + dtype=src.dtype, + buffer=src, + strides=new_src_strides, + offset=src._element_offset, + ) + elif src.ndim == len(common_shape): + new_src_strides = _broadcast_strides( + src.shape, src.strides, len(common_shape) + ) + src_same_shape = dpt.usm_ndarray( + common_shape, + dtype=src.dtype, + buffer=src, + strides=new_src_strides, + offset=src._element_offset, + ) + else: + # since broadcasting succeeded, src.ndim is greater because of + # leading sequence of ones, so we trim it + n = len(common_shape) + new_src_strides = _broadcast_strides( + src.shape[-n:], src.strides[-n:], n + ) + src_same_shape = dpt.usm_ndarray( + common_shape, + dtype=src.dtype, + buffer=src.usm_data, + strides=new_src_strides, + offset=src._element_offset, + ) + + _copy_same_shape(dst, src_same_shape) + + +def _make_empty_like_orderK(x, dt, usm_type, dev): + """ + Returns empty array with shape and strides like `x`, with dtype `dt`, + USM type `usm_type`, on device `dev`. + """ + st = list(x.strides) + perm = sorted( + range(x.ndim), + key=lambda d: builtins.abs(st[d]) if x.shape[d] > 1 else 0, + reverse=True, + ) + inv_perm = sorted(range(x.ndim), key=lambda i: perm[i]) + sh = x.shape + sh_sorted = tuple(sh[i] for i in perm) + R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C") + if min(st) < 0: + st_sorted = [st[i] for i in perm] + sl = tuple( + ( + slice(None, None, -1) + if st_sorted[i] < 0 + else slice(None, None, None) + ) + for i in range(x.ndim) + ) + R = R[sl] + return dpt.permute_dims(R, inv_perm) + + +def _empty_like_orderK(x, dt, usm_type=None, dev=None): + """ + Returns empty array like `x`, using order='K' + + For an array `x` that was obtained by permutation of a contiguous + array the returned array will have the same shape and the same + strides as `x`. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(x)}") + if usm_type is None: + usm_type = x.usm_type + if dev is None: + dev = x.device + fl = x.flags + if fl["C"] or x.size <= 1: + return dpt.empty_like( + x, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) + elif fl["F"]: + return dpt.empty_like( + x, dtype=dt, usm_type=usm_type, device=dev, order="F" + ) + return _make_empty_like_orderK(x, dt, usm_type, dev) + + +def _from_numpy_empty_like_orderK(x, dt, usm_type, dev): + """ + Returns empty usm_ndarray like NumPy array `x`, using order='K' + + For an array `x` that was obtained by permutation of a contiguous + array the returned array will have the same shape and the same + strides as `x`. 
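+
+    For instance (an illustrative case), for
+    ``x = np.ones((2, 3, 4)).transpose((2, 0, 1))`` the allocation is made in
+    the memory order of the underlying C-contiguous buffer and then permuted,
+    so the result reports the same shape ``(4, 2, 3)`` and the same element
+    strides as ``x``.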
+ """ + if not isinstance(x, np.ndarray): + raise TypeError(f"Expected numpy.ndarray, got {type(x)}") + fl = x.flags + if fl["C"] or x.size <= 1: + return dpt.empty( + x.shape, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) + elif fl["F"]: + return dpt.empty( + x.shape, dtype=dt, usm_type=usm_type, device=dev, order="F" + ) + return _make_empty_like_orderK(x, dt, usm_type, dev) + + +def _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev): + if not isinstance(X1, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X1)}") + if not isinstance(X2, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X2)}") + nd1 = X1.ndim + nd2 = X2.ndim + if nd1 > nd2 and X1.shape == res_shape: + return _empty_like_orderK(X1, dt, usm_type, dev) + elif nd1 < nd2 and X2.shape == res_shape: + return _empty_like_orderK(X2, dt, usm_type, dev) + fl1 = X1.flags + fl2 = X2.flags + if fl1["C"] or fl2["C"]: + return dpt.empty( + res_shape, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) + if fl1["F"] and fl2["F"]: + return dpt.empty( + res_shape, dtype=dt, usm_type=usm_type, device=dev, order="F" + ) + st1 = list(X1.strides) + st2 = list(X2.strides) + max_ndim = max(nd1, nd2) + st1 += [0] * (max_ndim - len(st1)) + st2 += [0] * (max_ndim - len(st2)) + sh1 = list(X1.shape) + [0] * (max_ndim - nd1) + sh2 = list(X2.shape) + [0] * (max_ndim - nd2) + perm = sorted( + range(max_ndim), + key=lambda d: ( + builtins.abs(st1[d]) if sh1[d] > 1 else 0, + builtins.abs(st2[d]) if sh2[d] > 1 else 0, + ), + reverse=True, + ) + inv_perm = sorted(range(max_ndim), key=lambda i: perm[i]) + st1_sorted = [st1[i] for i in perm] + st2_sorted = [st2[i] for i in perm] + sh = res_shape + sh_sorted = tuple(sh[i] for i in perm) + R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C") + if max(min(st1_sorted), min(st2_sorted)) < 0: + sl = tuple( + ( + slice(None, None, -1) + if (st1_sorted[i] < 0 and st2_sorted[i] < 0) + else slice(None, None, None) + ) + for i in range(nd1) + ) + R = R[sl] + return dpt.permute_dims(R, inv_perm) + + +def _empty_like_triple_orderK(X1, X2, X3, dt, res_shape, usm_type, dev): + if not isinstance(X1, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X1)}") + if not isinstance(X2, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X2)}") + if not isinstance(X3, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X3)}") + nd1 = X1.ndim + nd2 = X2.ndim + nd3 = X3.ndim + if X1.shape == res_shape and X2.shape == res_shape and len(res_shape) > nd3: + return _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev) + elif ( + X2.shape == res_shape and X3.shape == res_shape and len(res_shape) > nd1 + ): + return _empty_like_pair_orderK(X2, X3, dt, res_shape, usm_type, dev) + elif ( + X1.shape == res_shape and X3.shape == res_shape and len(res_shape) > nd2 + ): + return _empty_like_pair_orderK(X1, X3, dt, res_shape, usm_type, dev) + fl1 = X1.flags + fl2 = X2.flags + fl3 = X3.flags + if fl1["C"] or fl2["C"] or fl3["C"]: + return dpt.empty( + res_shape, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) + if fl1["F"] and fl2["F"] and fl3["F"]: + return dpt.empty( + res_shape, dtype=dt, usm_type=usm_type, device=dev, order="F" + ) + st1 = list(X1.strides) + st2 = list(X2.strides) + st3 = list(X3.strides) + max_ndim = max(nd1, nd2, nd3) + st1 += [0] * (max_ndim - len(st1)) + st2 += [0] * (max_ndim - len(st2)) + st3 += [0] * (max_ndim - len(st3)) + sh1 = list(X1.shape) + [0] * (max_ndim - 
nd1) + sh2 = list(X2.shape) + [0] * (max_ndim - nd2) + sh3 = list(X3.shape) + [0] * (max_ndim - nd3) + perm = sorted( + range(max_ndim), + key=lambda d: ( + builtins.abs(st1[d]) if sh1[d] > 1 else 0, + builtins.abs(st2[d]) if sh2[d] > 1 else 0, + builtins.abs(st3[d]) if sh3[d] > 1 else 0, + ), + reverse=True, + ) + inv_perm = sorted(range(max_ndim), key=lambda i: perm[i]) + st1_sorted = [st1[i] for i in perm] + st2_sorted = [st2[i] for i in perm] + st3_sorted = [st3[i] for i in perm] + sh = res_shape + sh_sorted = tuple(sh[i] for i in perm) + R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C") + if max(min(st1_sorted), min(st2_sorted), min(st3_sorted)) < 0: + sl = tuple( + ( + slice(None, None, -1) + if ( + st1_sorted[i] < 0 + and st2_sorted[i] < 0 + and st3_sorted[i] < 0 + ) + else slice(None, None, None) + ) + for i in range(nd1) + ) + R = R[sl] + return dpt.permute_dims(R, inv_perm) + + +def copy(usm_ary, /, *, order="K"): + """copy(ary, order="K") + + Creates a copy of given instance of :class:`dpctl.tensor.usm_ndarray`. + + Args: + ary (usm_ndarray): + Input array + order (``"C"``, ``"F"``, ``"A"``, ``"K"``, optional): + Controls the memory layout of the output array + Returns: + usm_ndarray: + A copy of the input array. + + Memory layout of the copy is controlled by ``order`` keyword, + following NumPy's conventions. The ``order`` keywords can be + one of the following: + + .. list-table:: + + * - ``"C"`` + - C-contiguous memory layout + * - ``"F"`` + - Fortran-contiguous memory layout + * - ``"A"`` + - Fortran-contiguous if the input array is also Fortran-contiguous, + otherwise C-contiguous + * - ``"K"`` + - match the layout of ``usm_ary`` as closely as possible. + + """ + if len(order) == 0 or order[0] not in "KkAaCcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'K', 'A', 'F', or 'C'." + ) + order = order[0].upper() + if not isinstance(usm_ary, dpt.usm_ndarray): + raise TypeError( + f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}" + ) + copy_order = "C" + if order == "C": + pass + elif order == "F": + copy_order = order + elif order == "A": + if usm_ary.flags.f_contiguous: + copy_order = "F" + elif order == "K": + if usm_ary.flags.f_contiguous: + copy_order = "F" + else: + raise ValueError( + "Unrecognized value of the order keyword. " + "Recognized values are 'A', 'C', 'F', or 'K'" + ) + if order == "K": + R = _empty_like_orderK(usm_ary, usm_ary.dtype) + else: + R = dpt.usm_ndarray( + usm_ary.shape, + dtype=usm_ary.dtype, + buffer=usm_ary.usm_type, + order=copy_order, + buffer_ctor_kwargs={"queue": usm_ary.sycl_queue}, + ) + _copy_same_shape(R, usm_ary) + return R + + +def astype( + usm_ary, newdtype, /, *, order="K", casting="unsafe", copy=True, device=None +): + """astype(array, new_dtype, order="K", casting="unsafe", \ + copy=True, device=None) + + Returns a copy of the :class:`dpctl.tensor.usm_ndarray`, cast to a + specified type. + + Args: + array (usm_ndarray): + An input array. + new_dtype (dtype): + The data type of the resulting array. If `None`, gives default + floating point type supported by device where the resulting array + will be located. + order ({"C", "F", "A", "K"}, optional): + Controls memory layout of the resulting array if a copy + is returned. + casting ({'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional): + Controls what kind of data casting may occur. Please see + :meth:`numpy.ndarray.astype` for description of casting modes. 
+ copy (bool, optional): + By default, `astype` always returns a newly allocated array. + If this keyword is set to `False`, a view of the input array + may be returned when possible. + device (object): array API specification of device where the + output array is created. Device can be specified by + a filter selector string, an instance of + :class:`dpctl.SyclDevice`, an instance of + :class:`dpctl.SyclQueue`, or an instance of + :class:`dpctl.tensor.Device`. If the value is `None`, + returned array is created on the same device as `array`. + Default: `None`. + + Returns: + usm_ndarray: + An array with requested data type. + + A view can be returned, if possible, when `copy=False` is used. + """ + if not isinstance(usm_ary, dpt.usm_ndarray): + return TypeError( + f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}" + ) + if len(order) == 0 or order[0] not in "KkAaCcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'K', 'A', 'F', or 'C'." + ) + order = order[0].upper() + ary_dtype = usm_ary.dtype + if device is not None: + if not isinstance(device, dpctl.SyclQueue): + if isinstance(device, dpt.Device): + device = device.sycl_queue + else: + device = dpt.Device.create_device(device).sycl_queue + d = device.sycl_device + target_dtype = _get_dtype(newdtype, device) + if not _dtype_supported_by_device_impl( + target_dtype, d.has_aspect_fp16, d.has_aspect_fp64 + ): + raise ValueError( + f"Requested dtype '{target_dtype}' is not supported by the " + "target device" + ) + usm_ary = usm_ary.to_device(device) + else: + target_dtype = _get_dtype(newdtype, usm_ary.sycl_queue) + + if not dpt.can_cast(ary_dtype, target_dtype, casting=casting): + raise TypeError( + f"Can not cast from {ary_dtype} to {newdtype} " + f"according to rule {casting}." + ) + c_contig = usm_ary.flags.c_contiguous + f_contig = usm_ary.flags.f_contiguous + needs_copy = copy or not ary_dtype == target_dtype + if not needs_copy and (order != "K"): + # ensure that order="F" for C-contig input triggers copy, + # and order="C" for F-contig input triggers copy too. + # 1D arrays which are both C- and F- contig should not + # force copying for neither order="F", nor order="C", see gh-1926 + needs_copy = ( + c_contig and not f_contig and order not in ["A", "C"] + ) or (not c_contig and f_contig and order not in ["A", "F"]) + if not needs_copy: + return usm_ary + copy_order = "C" + if order == "C": + pass + elif order == "F": + copy_order = order + elif order == "A": + if usm_ary.flags.f_contiguous: + copy_order = "F" + elif order == "K": + if usm_ary.flags.f_contiguous: + copy_order = "F" + else: + raise ValueError( + "Unrecognized value of the order keyword. 
" + "Recognized values are 'A', 'C', 'F', or 'K'" + ) + if order == "K": + R = _empty_like_orderK(usm_ary, target_dtype) + else: + R = dpt.usm_ndarray( + usm_ary.shape, + dtype=target_dtype, + buffer=usm_ary.usm_type, + order=copy_order, + buffer_ctor_kwargs={"queue": usm_ary.sycl_queue}, + ) + _copy_from_usm_ndarray_to_usm_ndarray(R, usm_ary) + return R diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index a0e7b28e66f..5a39e9367e9 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -36,6 +36,9 @@ from dpctl.tensor._data_types import _get_dtype from dpctl.tensor._device import normalize_queue_device +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti @@ -147,7 +150,7 @@ def full( usm_type=usm_type, sycl_queue=sycl_queue, ) - return dpt.copy(dpt.broadcast_to(X, shape), order=order) + return dpt_ext.copy(dpt.broadcast_to(X, shape), order=order) else: _validate_fill_value(fill_value) diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py index 106df09cf97..df4f3e95304 100644 --- a/dpctl_ext/tensor/_indexing_functions.py +++ b/dpctl_ext/tensor/_indexing_functions.py @@ -32,6 +32,9 @@ import dpctl.tensor as dpt import dpctl.utils +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti from ._numpy_helper import normalize_axis_index @@ -185,7 +188,7 @@ def put_vec_duplicates(vec, ind, vals): if vals.dtype == x.dtype: rhs = vals else: - rhs = dpt.astype(vals, x.dtype) + rhs = dpt_ext.astype(vals, x.dtype) rhs = dpt.broadcast_to(rhs, val_shape) _manager = dpctl.utils.SequentialOrderManager[exec_q] diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py new file mode 100644 index 00000000000..fa8fc27876b --- /dev/null +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -0,0 +1,120 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import operator + +import dpctl.tensor as dpt +import dpctl.utils as dputils +import numpy as np + +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor._tensor_impl as ti + +from ._numpy_helper import normalize_axis_tuple + +__doc__ = ( + "Implementation module for array manipulation " + "functions in :module:`dpctl.tensor`" +) + + +def roll(x, /, shift, *, axis=None): + """ + roll(x, shift, axis) + + Rolls array elements along a specified axis. + Array elements that roll beyond the last position are re-introduced + at the first position. Array elements that roll beyond the first position + are re-introduced at the last position. + + Args: + x (usm_ndarray): input array + shift (Union[int, Tuple[int,...]]): number of places by which the + elements are shifted. If `shift` is a tuple, then `axis` must be a + tuple of the same size, and each of the given axes must be shifted + by the corresponding element in `shift`. If `shift` is an `int` + and `axis` a tuple, then the same `shift` must be used for all + specified axes. If a `shift` is positive, then array elements is + shifted positively (toward larger indices) along the dimension of + `axis`. + If a `shift` is negative, then array elements must be shifted + negatively (toward smaller indices) along the dimension of `axis`. + axis (Optional[Union[int, Tuple[int,...]]]): axis (or axes) along which + elements to shift. If `axis` is `None`, the array is + flattened, shifted, and then restored to its original shape. + Default: `None`. + + Returns: + usm_ndarray: + An array having the same `dtype`, `usm_type` and + `device` attributes as `x` and whose elements are shifted relative + to `x`. 
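+
+    For example (illustrative), rolling a one-dimensional array holding
+    ``[0, 1, 2, 3, 4]`` by ``shift=2`` yields ``[3, 4, 0, 1, 2]``.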
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(x)}.") + exec_q = x.sycl_queue + _manager = dputils.SequentialOrderManager[exec_q] + if axis is None: + shift = operator.index(shift) + res = dpt.empty( + x.shape, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + sz = operator.index(x.size) + shift = (shift % sz) if sz > 0 else 0 + dep_evs = _manager.submitted_events + hev, roll_ev = ti._copy_usm_ndarray_for_roll_1d( + src=x, + dst=res, + shift=shift, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(hev, roll_ev) + return res + axis = normalize_axis_tuple(axis, x.ndim, allow_duplicate=True) + broadcasted = np.broadcast(shift, axis) + if broadcasted.ndim > 1: + raise ValueError("'shift' and 'axis' should be scalars or 1D sequences") + shifts = [ + 0, + ] * x.ndim + shape = x.shape + for sh, ax in broadcasted: + n_i = operator.index(shape[ax]) + shifted = shifts[ax] + operator.index(sh) + shifts[ax] = (shifted % n_i) if n_i > 0 else 0 + res = dpt.empty( + x.shape, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + dep_evs = _manager.submitted_events + ht_e, roll_ev = ti._copy_usm_ndarray_for_roll_nd( + src=x, dst=res, shifts=shifts, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_e, roll_ev) + return res diff --git a/dpctl_ext/tensor/_reshape.py b/dpctl_ext/tensor/_reshape.py new file mode 100644 index 00000000000..6afa1dc245c --- /dev/null +++ b/dpctl_ext/tensor/_reshape.py @@ -0,0 +1,206 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import operator + +import dpctl.tensor as dpt +import dpctl.utils +import numpy as np +from dpctl.tensor._tensor_impl import ( + _copy_usm_ndarray_for_reshape, + _ravel_multi_index, + _unravel_index, +) + +__doc__ = "Implementation module for :func:`dpctl.tensor.reshape`." 
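+
+# Note on the approach below: ``reshaped_strides`` checks whether a requested
+# reshape can be expressed as a pure stride change.  Unit steps along each
+# axis of the new shape are mapped to flat indices and then to offsets in the
+# old array, giving candidate strides; the candidates are then verified
+# against unit steps of the old shape.  If the check succeeds, ``reshape``
+# returns a view; otherwise the data is copied with
+# ``_copy_usm_ndarray_for_reshape``.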
+ + +def _make_unit_indexes(shape): + """ + Construct a diagonal matrix with with one on the diagonal + except if the corresponding element of shape is 1. + """ + nd = len(shape) + mi = np.zeros((nd, nd), dtype="u4") + for i, dim in enumerate(shape): + mi[i, i] = 1 if dim > 1 else 0 + return mi + + +def ti_unravel_index(flat_index, shape, order="C"): + return _unravel_index(flat_index, shape, order) + + +def ti_ravel_multi_index(multi_index, shape, order="C"): + return _ravel_multi_index(multi_index, shape, order) + + +def reshaped_strides(old_sh, old_sts, new_sh, order="C"): + """ + When reshaping array with `old_sh` shape and `old_sts` strides + into the new shape `new_sh`, returns the new stride if the reshape + can be a view, otherwise returns `None`. + """ + eye_new_mi = _make_unit_indexes(new_sh) + new_sts = [ + sum( + st_i * ind_i + for st_i, ind_i in zip( + old_sts, ti_unravel_index(flat_index, old_sh, order=order) + ) + ) + for flat_index in [ + ti_ravel_multi_index(unitvec, new_sh, order=order) + for unitvec in eye_new_mi + ] + ] + eye_old_mi = _make_unit_indexes(old_sh) + check_sts = [ + sum( + st_i * ind_i + for st_i, ind_i in zip( + new_sts, ti_unravel_index(flat_index, new_sh, order=order) + ) + ) + for flat_index in [ + ti_ravel_multi_index(unitvec, old_sh, order=order) + for unitvec in eye_old_mi + ] + ] + valid = all( + check_st == old_st or old_dim == 1 + for check_st, old_st, old_dim in zip(check_sts, old_sts, old_sh) + ) + return new_sts if valid else None + + +def reshape(X, /, shape, *, order="C", copy=None): + """reshape(x, shape, order="C") + + Reshapes array ``x`` into new shape. + + Args: + x (usm_ndarray): + input array + shape (Tuple[int]): + the desired shape of the resulting array. + order ("C", "F", optional): + memory layout of the resulting array + if a copy is found to be necessary. Supported + choices are ``"C"`` for C-contiguous, or row-major layout; + and ``"F"`` for F-contiguous, or column-major layout. + + Returns: + usm_ndarray: + Reshaped array is a view, if possible, + and a copy otherwise with memory layout as indicated + by ``order`` keyword. + """ + if not isinstance(X, dpt.usm_ndarray): + raise TypeError + if not isinstance(shape, (list, tuple)): + shape = (shape,) + if order in "cfCF": + order = order.upper() + else: + raise ValueError( + f"Keyword 'order' not recognized. Expecting 'C' or 'F', got {order}" + ) + if copy not in (True, False, None): + raise ValueError( + f"Keyword 'copy' not recognized. 
Expecting True, False, " + f"or None, got {copy}" + ) + shape = [operator.index(d) for d in shape] + negative_ones_count = 0 + for nshi in shape: + if nshi == -1: + negative_ones_count = negative_ones_count + 1 + if (nshi < -1) or negative_ones_count > 1: + raise ValueError( + "Target shape should have at most 1 negative " + "value which can only be -1" + ) + if negative_ones_count: + sz = -np.prod(shape) + if sz == 0: + raise ValueError( + f"Can not reshape array of size {X.size} into " + f"shape {tuple(i for i in shape if i >= 0)}" + ) + v = X.size // sz + shape = [v if d == -1 else d for d in shape] + if X.size != np.prod(shape): + raise ValueError(f"Can not reshape into {shape}") + if X.size: + newsts = reshaped_strides(X.shape, X.strides, shape, order=order) + else: + newsts = (1,) * len(shape) + copy_required = newsts is None + if copy_required and (copy is False): + raise ValueError( + "Reshaping the array requires a copy, but no copying was " + "requested by using copy=False" + ) + copy_q = X.sycl_queue + if copy_required or (copy is True): + # must perform a copy + copy_q = X.sycl_queue + flat_res = dpt.usm_ndarray( + (X.size,), + dtype=X.dtype, + buffer=X.usm_type, + buffer_ctor_kwargs={"queue": copy_q}, + ) + _manager = dpctl.utils.SequentialOrderManager[copy_q] + dep_evs = _manager.submitted_events + if order == "C": + hev, r_e = _copy_usm_ndarray_for_reshape( + src=X, dst=flat_res, sycl_queue=copy_q, depends=dep_evs + ) + else: + X_t = dpt.permute_dims(X, range(X.ndim - 1, -1, -1)) + hev, r_e = _copy_usm_ndarray_for_reshape( + src=X_t, dst=flat_res, sycl_queue=copy_q, depends=dep_evs + ) + _manager.add_event_pair(hev, r_e) + return dpt.usm_ndarray( + tuple(shape), dtype=X.dtype, buffer=flat_res, order=order + ) + # can form a view + if (len(shape) == X.ndim) and all( + s1 == s2 for s1, s2 in zip(shape, X.shape) + ): + return X + return dpt.usm_ndarray( + shape, + dtype=X.dtype, + buffer=X, + strides=tuple(newsts), + offset=X._element_offset, + ) diff --git a/dpctl_ext/tensor/libtensor/source/copy_for_reshape.cpp b/dpctl_ext/tensor/libtensor/source/copy_for_reshape.cpp new file mode 100644 index 00000000000..524bfcfdb98 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_for_reshape.cpp @@ -0,0 +1,184 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "copy_for_reshape.hpp" +#include "kernels/copy_and_cast.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_and_cast::copy_for_reshape_fn_ptr_t; +using dpctl::utils::keep_args_alive; + +// define static vector +static copy_for_reshape_fn_ptr_t + copy_for_reshape_generic_dispatch_vector[td_ns::num_types]; + +/* + * Copies src into dst (same data type) of different shapes by using flat + * iterations. + * + * Equivalent to the following loop: + * + * for i for range(src.size): + * dst[np.multi_index(i, dst.shape)] = src[np.multi_index(i, src.shape)] + */ +std::pair + copy_usm_ndarray_for_reshape(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + py::ssize_t src_nelems = src.get_size(); + py::ssize_t dst_nelems = dst.get_size(); + + // Must have the same number of elements + if (src_nelems != dst_nelems) { + throw py::value_error( + "copy_usm_ndarray_for_reshape requires src and dst to " + "have the same number of elements."); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + // typenames must be the same + if (src_typenum != dst_typenum) { + throw py::value_error( + "copy_usm_ndarray_for_reshape requires src and dst to " + "have the same type."); + } + + if (src_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + // check same contexts + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + if (src_nelems == 1) { + // handle special case of 1-element array + int src_elemsize = src.get_elemsize(); + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + sycl::event copy_ev = + exec_q.copy(src_data, dst_data, src_elemsize, depends); + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), + copy_ev); + } + + // dimensions may be different + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + auto array_types = td_ns::usm_ndarray_types(); + int type_id = array_types.typenum_to_lookup_id(src_typenum); + + auto fn = 
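+    // note: the dispatch vector below is populated per element type by
+    // init_copy_for_reshape_dispatch_vectors(), so this lookup selects the
+    // kernel instantiated for the array's dtype: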
copy_for_reshape_generic_dispatch_vector[type_id]; + + auto src_shape = src.get_shape_vector(); + auto src_strides = src.get_strides_vector(); + + auto dst_shape = dst.get_shape_vector(); + auto dst_strides = dst.get_strides_vector(); + + std::vector host_task_events; + host_task_events.reserve(2); + + // shape_strides = [src_shape, src_strides, dst_shape, dst_strides] + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, src_shape, src_strides, dst_shape, + dst_strides); + auto copy_shape_ev = std::get<2>(ptr_size_event_tuple); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + std::vector all_deps(depends.size() + 1); + all_deps.push_back(copy_shape_ev); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + sycl::event copy_for_reshape_event = + fn(exec_q, src_nelems, src_nd, dst_nd, shape_strides, src_data, + dst_data, all_deps); + + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {copy_for_reshape_event}, shape_strides_owner); + + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + copy_for_reshape_event); +} + +void init_copy_for_reshape_dispatch_vectors(void) +{ + using namespace td_ns; + using dpctl::tensor::kernels::copy_and_cast::CopyForReshapeGenericFactory; + + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(copy_for_reshape_generic_dispatch_vector); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_for_reshape.hpp b/dpctl_ext/tensor/libtensor/source/copy_for_reshape.hpp new file mode 100644 index 00000000000..c5af885ad6c --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_for_reshape.hpp @@ -0,0 +1,54 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + copy_usm_ndarray_for_reshape(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_copy_for_reshape_dispatch_vectors(); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_for_roll.cpp b/dpctl_ext/tensor/libtensor/source/copy_for_roll.cpp new file mode 100644 index 00000000000..a187b224767 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_for_roll.cpp @@ -0,0 +1,400 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "copy_for_roll.hpp" +#include "kernels/copy_and_cast.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "simplify_iteration_space.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_and_cast::copy_for_roll_contig_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast:: + copy_for_roll_ndshift_strided_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast::copy_for_roll_strided_fn_ptr_t; +using dpctl::utils::keep_args_alive; + +// define static vector +static copy_for_roll_strided_fn_ptr_t + copy_for_roll_strided_dispatch_vector[td_ns::num_types]; + +static copy_for_roll_contig_fn_ptr_t + copy_for_roll_contig_dispatch_vector[td_ns::num_types]; + +static copy_for_roll_ndshift_strided_fn_ptr_t + copy_for_roll_ndshift_dispatch_vector[td_ns::num_types]; + +/* + * Copies src into dst (same data type) of different shapes by using flat + * iterations. + * + * Equivalent to the following loop: + * + * for i for range(src.size): + * dst[np.multi_index(i, dst.shape)] = src[np.multi_index(i, src.shape)] + */ +std::pair + copy_usm_ndarray_for_roll_1d(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + py::ssize_t shift, + sycl::queue &exec_q, + const std::vector &depends) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + // Must have the same number of dimensions + if (src_nd != dst_nd) { + throw py::value_error( + "copy_usm_ndarray_for_roll_1d requires src and dst to " + "have the same number of dimensions."); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + if (!std::equal(src_shape_ptr, src_shape_ptr + src_nd, dst_shape_ptr)) { + throw py::value_error( + "copy_usm_ndarray_for_roll_1d requires src and dst to " + "have the same shape."); + } + + py::ssize_t src_nelems = src.get_size(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + // typenames must be the same + if (src_typenum != dst_typenum) { + throw py::value_error( + "copy_usm_ndarray_for_roll_1d requires src and dst to " + "have the same type."); + } + + if (src_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + // check same contexts + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + if (src_nelems == 1) { + // handle special case of 1-element array + int src_elemsize = src.get_elemsize(); + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + sycl::event copy_ev = + exec_q.copy(src_data, dst_data, src_elemsize, depends); + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), + 
copy_ev); + } + + auto array_types = td_ns::usm_ndarray_types(); + int type_id = array_types.typenum_to_lookup_id(src_typenum); + + const bool is_src_c_contig = src.is_c_contiguous(); + const bool is_src_f_contig = src.is_f_contiguous(); + + const bool is_dst_c_contig = dst.is_c_contiguous(); + const bool is_dst_f_contig = dst.is_f_contiguous(); + + const bool both_c_contig = is_src_c_contig && is_dst_c_contig; + const bool both_f_contig = is_src_f_contig && is_dst_f_contig; + + // normalize shift parameter to be 0 <= offset < src_nelems + std::size_t offset = + (shift > 0) ? (shift % src_nelems) : src_nelems + (shift % src_nelems); + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + if (both_c_contig || both_f_contig) { + auto fn = copy_for_roll_contig_dispatch_vector[type_id]; + + if (fn != nullptr) { + static constexpr py::ssize_t zero_offset = 0; + + sycl::event copy_for_roll_ev = + fn(exec_q, offset, src_nelems, src_data, zero_offset, dst_data, + zero_offset, depends); + + sycl::event ht_ev = + keep_args_alive(exec_q, {src, dst}, {copy_for_roll_ev}); + + return std::make_pair(ht_ev, copy_for_roll_ev); + } + } + + auto const &src_strides = src.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + const py::ssize_t *shape = src_shape_ptr; + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, shape, src_strides, dst_strides, + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (nd == 1 && simplified_src_strides[0] == 1 && + simplified_dst_strides[0] == 1) { + auto fn = copy_for_roll_contig_dispatch_vector[type_id]; + + if (fn != nullptr) { + + sycl::event copy_for_roll_ev = + fn(exec_q, offset, src_nelems, src_data, src_offset, dst_data, + dst_offset, depends); + + sycl::event ht_ev = + keep_args_alive(exec_q, {src, dst}, {copy_for_roll_ev}); + + return std::make_pair(ht_ev, copy_for_roll_ev); + } + } + + auto fn = copy_for_roll_strided_dispatch_vector[type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + // shape_strides = [src_shape, src_strides, dst_strides] + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides, + simplified_dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + std::vector all_deps(depends.size() + 1); + all_deps.push_back(copy_shape_ev); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + sycl::event copy_for_roll_event = + fn(exec_q, offset, src_nelems, src_nd, shape_strides, src_data, + src_offset, dst_data, dst_offset, all_deps); + + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {copy_for_roll_event}, shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + copy_for_roll_event); +} + +std::pair + copy_usm_ndarray_for_roll_nd(const dpctl::tensor::usm_ndarray &src, + const 
+                                 const dpctl::tensor::usm_ndarray &dst,
+                                 const std::vector<py::ssize_t> &shifts,
+                                 sycl::queue &exec_q,
+                                 const std::vector<sycl::event> &depends)
+{
+    int src_nd = src.get_ndim();
+    int dst_nd = dst.get_ndim();
+
+    // Must have the same number of dimensions
+    if (src_nd != dst_nd) {
+        throw py::value_error(
+            "copy_usm_ndarray_for_roll_nd requires src and dst to "
+            "have the same number of dimensions.");
+    }
+
+    if (static_cast<std::size_t>(src_nd) != shifts.size()) {
+        throw py::value_error(
+            "copy_usm_ndarray_for_roll_nd requires shifts to "
+            "contain an integral shift for each array dimension.");
+    }
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
+
+    if (!std::equal(src_shape_ptr, src_shape_ptr + src_nd, dst_shape_ptr)) {
+        throw py::value_error(
+            "copy_usm_ndarray_for_roll_nd requires src and dst to "
+            "have the same shape.");
+    }
+
+    py::ssize_t src_nelems = src.get_size();
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    // typenums must be the same
+    if (src_typenum != dst_typenum) {
+        throw py::value_error(
+            "copy_usm_ndarray_for_roll_nd requires src and dst to "
+            "have the same type.");
+    }
+
+    if (src_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems);
+
+    // check for compatible queues
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    if (src_nelems == 1) {
+        // handle special case of 1-element array
+        int src_elemsize = src.get_elemsize();
+        const char *src_data = src.get_data();
+        char *dst_data = dst.get_data();
+        sycl::event copy_ev =
+            exec_q.copy<char>(src_data, dst_data, src_elemsize, depends);
+        return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}),
+                              copy_ev);
+    }
+
+    auto array_types = td_ns::usm_ndarray_types();
+    int type_id = array_types.typenum_to_lookup_id(src_typenum);
+
+    std::vector<py::ssize_t> normalized_shifts{};
+    normalized_shifts.reserve(src_nd);
+
+    for (int i = 0; i < src_nd; ++i) {
+        // normalize shift parameter to be 0 <= offset < dim
+        py::ssize_t dim = src_shape_ptr[i];
+        std::size_t offset =
+            (shifts[i] >= 0) ? (shifts[i] % dim) : dim + (shifts[i] % dim);
+
+        normalized_shifts.push_back(offset);
+    }
+
+    const char *src_data = src.get_data();
+    char *dst_data = dst.get_data();
+
+    auto const &src_strides = src.get_strides_vector();
+    auto const &dst_strides = dst.get_strides_vector();
+    auto const &common_shape = src.get_shape_vector();
+
+    static constexpr py::ssize_t src_offset = 0;
+    static constexpr py::ssize_t dst_offset = 0;
+
+    auto fn = copy_for_roll_ndshift_dispatch_vector[type_id];
+
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    // shape_strides = [src_shape, src_strides, dst_strides]
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events, common_shape, src_strides, dst_strides,
+        normalized_shifts);
+    auto shape_strides_shifts_owner =
+        std::move(std::get<0>(ptr_size_event_tuple));
+    sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple);
+    const py::ssize_t *shape_strides_shifts = shape_strides_shifts_owner.get();
+
+    std::vector<sycl::event> all_deps(depends.size() + 1);
+    all_deps.push_back(copy_shape_ev);
+    all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends));
+
+    sycl::event copy_for_roll_event =
+        fn(exec_q, src_nelems, src_nd, shape_strides_shifts, src_data,
+           src_offset, dst_data, dst_offset, all_deps);
+
+    auto temporaries_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
+        exec_q, {copy_for_roll_event}, shape_strides_shifts_owner);
+    host_task_events.push_back(temporaries_cleanup_ev);
+
+    return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events),
+                          copy_for_roll_event);
+}
+
+void init_copy_for_roll_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    using dpctl::tensor::kernels::copy_and_cast::CopyForRollStridedFactory;
+
+    DispatchVectorBuilder<copy_for_roll_strided_fn_ptr_t,
+                          CopyForRollStridedFactory, num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(copy_for_roll_strided_dispatch_vector);
+
+    using dpctl::tensor::kernels::copy_and_cast::CopyForRollContigFactory;
+    DispatchVectorBuilder<copy_for_roll_contig_fn_ptr_t,
+                          CopyForRollContigFactory, num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(copy_for_roll_contig_dispatch_vector);
+
+    using dpctl::tensor::kernels::copy_and_cast::CopyForRollNDShiftFactory;
+    DispatchVectorBuilder<copy_for_roll_ndshift_strided_fn_ptr_t,
+                          CopyForRollNDShiftFactory, num_types>
+        dvb3;
+    dvb3.populate_dispatch_vector(copy_for_roll_ndshift_dispatch_vector);
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/copy_for_roll.hpp b/dpctl_ext/tensor/libtensor/source/copy_for_roll.hpp
new file mode 100644
index 00000000000..cffbf9f6f0d
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/copy_for_roll.hpp
@@ -0,0 +1,65 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + copy_usm_ndarray_for_roll_1d(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + py::ssize_t shift, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern std::pair + copy_usm_ndarray_for_roll_nd(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const std::vector &shifts, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_copy_for_roll_dispatch_vectors(); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp b/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp new file mode 100644 index 00000000000..e97e8aeb1ca --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp @@ -0,0 +1,368 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/copy_and_cast.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "copy_numpy_ndarray_into_usm_ndarray.hpp" +#include "simplify_iteration_space.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl::tensor::py_internal +{ + +using dpctl::tensor::kernels::copy_and_cast:: + copy_and_cast_from_host_blocking_fn_ptr_t; + +static copy_and_cast_from_host_blocking_fn_ptr_t + copy_and_cast_from_host_blocking_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::copy_and_cast:: + copy_and_cast_from_host_contig_blocking_fn_ptr_t; + +static copy_and_cast_from_host_contig_blocking_fn_ptr_t + copy_and_cast_from_host_contig_blocking_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void copy_numpy_ndarray_into_usm_ndarray( + const py::array &npy_src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + int src_ndim = npy_src.ndim(); + int dst_ndim = dst.get_ndim(); + + if (src_ndim != dst_ndim) { + throw py::value_error("Source ndarray and destination usm_ndarray have " + "different array ranks, " + "i.e. 
different number of indices needed to "
+                              "address array elements.");
+    }
+
+    const py::ssize_t *src_shape = npy_src.shape();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+    bool shapes_equal(true);
+    std::size_t src_nelems(1);
+    for (int i = 0; shapes_equal && (i < src_ndim); ++i) {
+        shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]);
+        src_nelems *= static_cast<std::size_t>(src_shape[i]);
+    }
+
+    if (!shapes_equal) {
+        throw py::value_error("Source ndarray and destination usm_ndarray have "
+                              "different shapes.");
+    }
+
+    if (src_nelems == 0) {
+        // nothing to do
+        return;
+    }
+
+    dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems);
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) {
+        throw py::value_error("Execution queue is not compatible with the "
+                              "allocation queue");
+    }
+
+    dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst);
+
+    // here we assume that NumPy's type numbers agree with ours for types
+    // supported in both
+    int src_typenum =
+        py::detail::array_descriptor_proxy(npy_src.dtype().ptr())->type_num;
+    int dst_typenum = dst.get_typenum();
+
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_type_id = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum);
+
+    py::buffer_info src_pybuf = npy_src.request();
+    const char *const src_data = static_cast<const char *>(src_pybuf.ptr);
+    char *dst_data = dst.get_data();
+
+    int src_flags = npy_src.flags();
+
+    // check for applicability of special cases:
+    // (same type) && (both C-contiguous || both F-contiguous)
+    const bool both_c_contig =
+        ((src_flags & py::array::c_style) && dst.is_c_contiguous());
+    const bool both_f_contig =
+        ((src_flags & py::array::f_style) && dst.is_f_contiguous());
+
+    const bool same_data_types = (src_type_id == dst_type_id);
+
+    if (both_c_contig || both_f_contig) {
+        if (same_data_types) {
+            int src_elem_size = npy_src.itemsize();
+
+            sycl::event copy_ev =
+                exec_q.memcpy(static_cast<void *>(dst_data),
+                              static_cast<const void *>(src_data),
+                              src_nelems * src_elem_size, depends);
+
+            {
+                // wait for copy_ev to complete
+                // release GIL to allow other threads (host_tasks)
+                // a chance to acquire GIL
+                py::gil_scoped_release lock{};
+                copy_ev.wait();
+            }
+
+            return;
+        }
+        else {
+            py::gil_scoped_release lock{};
+
+            auto copy_and_cast_from_host_contig_blocking_fn =
+                copy_and_cast_from_host_contig_blocking_dispatch_table
+                    [dst_type_id][src_type_id];
+
+            static constexpr py::ssize_t zero_offset(0);
+
+            copy_and_cast_from_host_contig_blocking_fn(
+                exec_q, src_nelems, src_data, zero_offset, dst_data,
+                zero_offset, depends);
+
+            return;
+        }
+    }
+
+    auto const &dst_strides =
+        dst.get_strides_vector(); // N.B.: strides in elements
+
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_shape;
+    shT simplified_src_strides;
+    shT simplified_dst_strides;
+    py::ssize_t src_offset(0);
+    py::ssize_t dst_offset(0);
+
+    int nd = src_ndim;
+    const py::ssize_t *shape = src_shape;
+
+    const py::ssize_t *src_strides_p =
+        npy_src.strides(); // N.B.: strides in bytes
+    py::ssize_t src_itemsize = npy_src.itemsize(); // item size in bytes
+
+    bool is_src_c_contig = ((src_flags & py::array::c_style) != 0);
+    bool is_src_f_contig = ((src_flags & py::array::f_style) != 0);
+
+    shT src_strides_in_elems;
+    if (src_strides_p) {
+        src_strides_in_elems.resize(nd);
+        // copy and convert strides from bytes to elements
+        std::transform(
+            src_strides_p, src_strides_p + nd, std::begin(src_strides_in_elems),
+            [src_itemsize](py::ssize_t el) {
+ py::ssize_t q = el / src_itemsize; + if (q * src_itemsize != el) { + throw std::runtime_error( + "NumPy array strides are not multiple of itemsize"); + } + return q; + }); + } + else { + if (is_src_c_contig) { + src_strides_in_elems = + dpctl::tensor::c_contiguous_strides(nd, src_shape); + } + else if (is_src_f_contig) { + src_strides_in_elems = + dpctl::tensor::f_contiguous_strides(nd, src_shape); + } + else { + throw py::value_error("NumPy source array has null strides but is " + "neither C- nor F-contiguous."); + } + } + + // nd, simplified_* vectors and offsets are modified by reference + simplify_iteration_space(nd, shape, src_strides_in_elems, dst_strides, + // outputs + simplified_shape, simplified_src_strides, + simplified_dst_strides, src_offset, dst_offset); + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_src_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + + // handle nd == 0 + if (nd == 0) { + nd = 1; + simplified_shape.reserve(nd); + simplified_shape.push_back(1); + + simplified_src_strides.reserve(nd); + simplified_src_strides.push_back(1); + + simplified_dst_strides.reserve(nd); + simplified_dst_strides.push_back(1); + } + + const bool is_contig_vector = + ((nd == 1) && (simplified_src_strides.front() == 1) && + (simplified_dst_strides.front() == 1)); + + const bool can_use_memcpy = (same_data_types && is_contig_vector && + (src_offset == 0) && (dst_offset == 0)); + + if (can_use_memcpy) { + int src_elem_size = npy_src.itemsize(); + + sycl::event copy_ev = exec_q.memcpy( + static_cast(dst_data), static_cast(src_data), + src_nelems * src_elem_size, depends); + + { + // wait for copy_ev to complete + // release GIL to allow other threads (host_tasks) + // a chance to acquire GIL + py::gil_scoped_release lock{}; + + copy_ev.wait(); + } + + return; + } + + // Minimum and maximum element offsets for source np.ndarray + py::ssize_t npy_src_min_nelem_offset(src_offset); + py::ssize_t npy_src_max_nelem_offset(src_offset); + for (int i = 0; i < nd; ++i) { + if (simplified_src_strides[i] < 0) { + npy_src_min_nelem_offset += + simplified_src_strides[i] * (simplified_shape[i] - 1); + } + else { + npy_src_max_nelem_offset += + simplified_src_strides[i] * (simplified_shape[i] - 1); + } + } + + if (is_contig_vector) { + // release GIL for the blocking call + py::gil_scoped_release lock{}; + + auto copy_and_cast_from_host_contig_blocking_fn = + copy_and_cast_from_host_contig_blocking_dispatch_table[dst_type_id] + [src_type_id]; + + copy_and_cast_from_host_contig_blocking_fn(exec_q, src_nelems, src_data, + src_offset, dst_data, + dst_offset, depends); + + return; + } + + std::vector host_task_events; + host_task_events.reserve(1); + + // Copy shape strides into device memory + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides, + simplified_dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + { + // release GIL for the blocking call + py::gil_scoped_release lock{}; + + // Get implementation function pointer + auto copy_and_cast_from_host_blocking_fn = + copy_and_cast_from_host_blocking_dispatch_table[dst_type_id] + [src_type_id]; + + copy_and_cast_from_host_blocking_fn( + exec_q, src_nelems, nd, shape_strides, 
src_data, src_offset, + npy_src_min_nelem_offset, npy_src_max_nelem_offset, dst_data, + dst_offset, depends, {copy_shape_ev}); + + // invoke USM deleter in smart pointer while GIL is held + shape_strides_owner.reset(nullptr); + } + + return; +} + +void init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(void) +{ + using namespace td_ns; + using dpctl::tensor::kernels::copy_and_cast::CopyAndCastFromHostFactory; + + DispatchTableBuilder + dtb_copy_from_numpy; + + dtb_copy_from_numpy.populate_dispatch_table( + copy_and_cast_from_host_blocking_dispatch_table); + + using dpctl::tensor::kernels::copy_and_cast:: + CopyAndCastFromHostContigFactory; + + DispatchTableBuilder + dtb_copy_from_numpy_contig; + + dtb_copy_from_numpy_contig.populate_dispatch_table( + copy_and_cast_from_host_contig_blocking_dispatch_table); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp b/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp new file mode 100644 index 00000000000..f2de95f97cc --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp @@ -0,0 +1,57 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void copy_numpy_ndarray_into_usm_ndarray( + const py::array &npy_src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(void); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 182124ac4ae..c1372c1c240 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -51,9 +51,9 @@ // #include "clip.hpp" #include "copy_and_cast_usm_to_usm.hpp" #include "copy_as_contig.hpp" -// #include "copy_for_reshape.hpp" -// #include "copy_for_roll.hpp" -// #include "copy_numpy_ndarray_into_usm_ndarray.hpp" +#include "copy_for_reshape.hpp" +#include "copy_for_roll.hpp" +#include "copy_numpy_ndarray_into_usm_ndarray.hpp" #include "device_support_queries.hpp" // #include "eye_ctor.hpp" #include "full_ctor.hpp" @@ -87,16 +87,16 @@ using dpctl::tensor::py_internal::py_as_f_contig; /* =========================== Copy for reshape ============================= */ -// using dpctl::tensor::py_internal::copy_usm_ndarray_for_reshape; +using dpctl::tensor::py_internal::copy_usm_ndarray_for_reshape; /* =========================== Copy for roll ============================= */ -// using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_1d; -// using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_nd; +using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_1d; +using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_nd; /* ============= Copy from numpy.ndarray to usm_ndarray ==================== */ -// using dpctl::tensor::py_internal::copy_numpy_ndarray_into_usm_ndarray; +using dpctl::tensor::py_internal::copy_numpy_ndarray_into_usm_ndarray; /* ============= linear-sequence ==================== */ @@ -146,7 +146,7 @@ void init_dispatch_tables(void) using namespace dpctl::tensor::py_internal; init_copy_and_cast_usm_to_usm_dispatch_tables(); - // init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(); + init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(); init_advanced_indexing_dispatch_tables(); // init_where_dispatch_tables(); return; @@ -158,8 +158,8 @@ void init_dispatch_vectors(void) using namespace dpctl::tensor::py_internal; init_copy_as_contig_dispatch_vectors(); - // init_copy_for_reshape_dispatch_vectors(); - // init_copy_for_roll_dispatch_vectors(); + init_copy_for_reshape_dispatch_vectors(); + init_copy_for_roll_dispatch_vectors(); // init_linear_sequences_dispatch_vectors(); init_full_ctor_dispatch_vectors(); init_zeros_ctor_dispatch_vectors(); @@ -279,28 +279,29 @@ PYBIND11_MODULE(_tensor_impl, m) }, ""); - // m.def("_copy_usm_ndarray_for_reshape", ©_usm_ndarray_for_reshape, - // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same - // " "number of elements using underlying 'C'-contiguous order for - // flat " "traversal. 
" "Returns a tuple of events: (ht_event, - // comp_event)", py::arg("src"), py::arg("dst"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_copy_usm_ndarray_for_reshape", ©_usm_ndarray_for_reshape, + "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same " + "number of elements using underlying 'C'-contiguous order for flat " + "traversal. " + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); - // m.def("_copy_usm_ndarray_for_roll_1d", ©_usm_ndarray_for_roll_1d, - // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same - // " "shapes using underlying 'C'-contiguous order for flat " - // "traversal with shift. " - // "Returns a tuple of events: (ht_event, comp_event)", - // py::arg("src"), py::arg("dst"), py::arg("shift"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_copy_usm_ndarray_for_roll_1d", ©_usm_ndarray_for_roll_1d, + "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same " + "shapes using underlying 'C'-contiguous order for flat " + "traversal with shift. " + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("src"), py::arg("dst"), py::arg("shift"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_copy_usm_ndarray_for_roll_nd", ©_usm_ndarray_for_roll_nd, - // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same - // " "shapes using underlying 'C'-contiguous order for " "traversal - // with shifts along each axis. " "Returns a tuple of events: - // (ht_event, comp_event)", py::arg("src"), py::arg("dst"), - // py::arg("shifts"), py::arg("sycl_queue"), py::arg("depends") = - // py::list()); + m.def("_copy_usm_ndarray_for_roll_nd", ©_usm_ndarray_for_roll_nd, + "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same " + "shapes using underlying 'C'-contiguous order for " + "traversal with shifts along each axis. 
" + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("src"), py::arg("dst"), py::arg("shifts"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); // m.def("_linspace_step", &usm_ndarray_linear_sequence_step, // "Fills input 1D contiguous usm_ndarray `dst` with linear @@ -317,11 +318,11 @@ PYBIND11_MODULE(_tensor_impl, m) // py::arg("include_endpoint"), py::arg("sycl_queue"), // py::arg("depends") = py::list()); - // m.def("_copy_numpy_ndarray_into_usm_ndarray", - // ©_numpy_ndarray_into_usm_ndarray, - // "Copy from numpy array `src` into usm_ndarray `dst` - // synchronously.", py::arg("src"), py::arg("dst"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_copy_numpy_ndarray_into_usm_ndarray", + ©_numpy_ndarray_into_usm_ndarray, + "Copy from numpy array `src` into usm_ndarray `dst` synchronously.", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); m.def("_zeros_usm_ndarray", &usm_ndarray_zeros, "Populate usm_ndarray `dst` with zeros.", py::arg("dst"), diff --git a/dpnp/dpnp_algo/dpnp_arraycreation.py b/dpnp/dpnp_algo/dpnp_arraycreation.py index d94a031801f..47edf63a68b 100644 --- a/dpnp/dpnp_algo/dpnp_arraycreation.py +++ b/dpnp/dpnp_algo/dpnp_arraycreation.py @@ -33,6 +33,9 @@ import dpctl.utils as dpu import numpy +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from dpnp.dpnp_array import dpnp_array from dpnp.dpnp_utils import get_usm_allocations, map_dtype_to_device @@ -230,7 +233,9 @@ def dpnp_linspace( usm_type=_usm_type, sycl_queue=sycl_queue_normalized, ) - usm_res = dpt.reshape(usm_res, (-1,) + (1,) * delta.ndim, copy=False) + usm_res = dpt_ext.reshape( + usm_res, (-1,) + (1,) * delta.ndim, copy=False + ) if step_num > 0: step = delta / step_num @@ -256,7 +261,7 @@ def dpnp_linspace( if dpnp.issubdtype(dtype, dpnp.integer): dpt.floor(usm_res, out=usm_res) - res = dpt.astype(usm_res, dtype, copy=False) + res = dpt_ext.astype(usm_res, dtype, copy=False) res = dpnp_array._create_from_usm_ndarray(res) if retstep is True: diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py index 88abcee5035..55d74e8c180 100644 --- a/dpnp/dpnp_algo/dpnp_elementwise_common.py +++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py @@ -47,6 +47,7 @@ # pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as dti import dpnp import dpnp.backend.extensions.vm._vm_impl as vmi @@ -212,7 +213,7 @@ def __call__( x_usm = dpnp.get_usm_ndarray(x) if dtype is not None: - x_usm = dpt.astype(x_usm, dtype, copy=False) + x_usm = dpt_ext.astype(x_usm, dtype, copy=False) out = self._unpack_out_kw(out) out_usm = None if out is None else dpnp.get_usm_ndarray(out) @@ -718,9 +719,9 @@ def __call__( sycl_queue=x2.sycl_queue, usm_type=x2.usm_type, ) - x2_usm = dpt.astype(x2_usm, dtype, copy=False) + x2_usm = dpt_ext.astype(x2_usm, dtype, copy=False) elif dpnp.isscalar(x2): - x1_usm = dpt.astype(x1_usm, dtype, copy=False) + x1_usm = dpt_ext.astype(x1_usm, dtype, copy=False) x2_usm = dpt.asarray( x2, dtype=dtype, @@ -728,8 +729,8 @@ def __call__( usm_type=x1.usm_type, ) else: - x1_usm = dpt.astype(x1_usm, dtype, copy=False) - x2_usm = dpt.astype(x2_usm, dtype, copy=False) + x1_usm = dpt_ext.astype(x1_usm, dtype, copy=False) + x2_usm = dpt_ext.astype(x2_usm, dtype, copy=False) res_usm = 
super().__call__(x1_usm, x2_usm, out=out_usm, order=order) @@ -1325,7 +1326,7 @@ def __call__(self, x, /, decimals=0, out=None, *, dtype=None): res_usm = dpt.divide(x_usm, 10**decimals, out=out_usm) if dtype is not None: - res_usm = dpt.astype(res_usm, dtype, copy=False) + res_usm = dpt_ext.astype(res_usm, dtype, copy=False) if out is not None and isinstance(out, dpnp_array): return out diff --git a/dpnp/dpnp_algo/dpnp_fill.py b/dpnp/dpnp_algo/dpnp_fill.py index 4137a279474..ddba9f634cb 100644 --- a/dpnp/dpnp_algo/dpnp_fill.py +++ b/dpnp/dpnp_algo/dpnp_fill.py @@ -32,10 +32,10 @@ import dpctl.utils as dpu from dpctl.tensor._ctors import _cast_fill_val -import dpnp - # TODO: revert to `from dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext +import dpnp from dpctl_ext.tensor._tensor_impl import ( _copy_usm_ndarray_into_usm_ndarray, _full_usm_ndarray, @@ -56,7 +56,7 @@ def dpnp_fill(arr, val): raise dpu.ExecutionPlacementError( "Input arrays have incompatible queues." ) - a_val = dpt.astype(val, arr.dtype) + a_val = dpt_ext.astype(val, arr.dtype) a_val = dpt.broadcast_to(a_val, arr.shape) _manager = dpu.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events diff --git a/dpnp/dpnp_array.py b/dpnp/dpnp_array.py index 6a2b2fd1977..d122aff2c13 100644 --- a/dpnp/dpnp_array.py +++ b/dpnp/dpnp_array.py @@ -41,6 +41,9 @@ import dpctl.tensor._type_utils as dtu from dpctl.tensor._numpy_helper import AxisError +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from . import memory as dpm @@ -764,7 +767,7 @@ def asnumpy(self): """ - return dpt.asnumpy(self._array_obj) + return dpt_ext.asnumpy(self._array_obj) def astype( self, diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py index c8e28529cd5..acda579a5f5 100644 --- a/dpnp/dpnp_container.py +++ b/dpnp/dpnp_container.py @@ -38,6 +38,8 @@ import dpctl.tensor as dpt import dpctl.utils as dpu +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor as dpt_ext import dpnp from dpnp.dpnp_array import dpnp_array @@ -141,7 +143,7 @@ def copy(x1, /, *, order="K"): if order is None: order = "K" - array_obj = dpt.copy(dpnp.get_usm_ndarray(x1), order=order) + array_obj = dpt_ext.copy(dpnp.get_usm_ndarray(x1), order=order) return dpnp_array._create_from_usm_ndarray(array_obj) diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py index 533bdc36c61..6c050a20898 100644 --- a/dpnp/dpnp_iface.py +++ b/dpnp/dpnp_iface.py @@ -53,6 +53,7 @@ # pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti import dpnp @@ -136,7 +137,7 @@ def asnumpy(a, order="C"): return a.asnumpy() if isinstance(a, dpt.usm_ndarray): - return dpt.asnumpy(a) + return dpt_ext.asnumpy(a) return numpy.asarray(a, order=order) diff --git a/dpnp/dpnp_iface_arraycreation.py b/dpnp/dpnp_iface_arraycreation.py index 8d4ebdd1a6c..c2dd5793f82 100644 --- a/dpnp/dpnp_iface_arraycreation.py +++ b/dpnp/dpnp_iface_arraycreation.py @@ -46,6 +46,9 @@ import dpctl.tensor as dpt import numpy +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from dpnp import dpnp_container @@ -934,7 +937,7 @@ def astype(x, dtype, /, *, order="K", casting="unsafe", copy=True, device=None): order = "K" usm_x = 
dpnp.get_usm_ndarray(x) - usm_res = dpt.astype( + usm_res = dpt_ext.astype( usm_x, dtype, order=order, casting=casting, copy=copy, device=device ) @@ -3114,7 +3117,7 @@ def meshgrid(*xi, copy=True, sparse=False, indexing="xy"): s0 = (1,) * ndim output = [ - dpt.reshape(dpnp.get_usm_ndarray(x), s0[:i] + (-1,) + s0[i + 1 :]) + dpt_ext.reshape(dpnp.get_usm_ndarray(x), s0[:i] + (-1,) + s0[i + 1 :]) for i, x in enumerate(xi) ] @@ -3122,14 +3125,14 @@ def meshgrid(*xi, copy=True, sparse=False, indexing="xy"): _, _ = get_usm_allocations(output) if indexing == "xy" and ndim > 1: - output[0] = dpt.reshape(output[0], (1, -1) + s0[2:]) - output[1] = dpt.reshape(output[1], (-1, 1) + s0[2:]) + output[0] = dpt_ext.reshape(output[0], (1, -1) + s0[2:]) + output[1] = dpt_ext.reshape(output[1], (-1, 1) + s0[2:]) if not sparse: output = dpt.broadcast_arrays(*output) if copy: - output = [dpt.copy(x) for x in output] + output = [dpt_ext.copy(x) for x in output] return [dpnp_array._create_from_usm_ndarray(x) for x in output] @@ -3929,7 +3932,7 @@ def vander( tmp = m[:, ::-1] if not increasing else m dpnp.power( - dpt.reshape(usm_x, (-1, 1)), + dpt_ext.reshape(usm_x, (-1, 1)), dpt.arange( N, dtype=_dtype, usm_type=x_usm_type, sycl_queue=x_sycl_queue ), diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index a01a036e16c..439ec288ebe 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -51,11 +51,10 @@ from dpctl.tensor._indexing_functions import _get_indexing_mode from dpctl.tensor._numpy_helper import normalize_axis_index -import dpctl_ext.tensor as dpt_ext - # pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti import dpnp @@ -243,7 +242,7 @@ def choose(a, choices, out=None, mode="wrap"): # NumPy will cast up to int64 in general but # int32 is more than safe for bool if ind_dt == dpnp.bool: - inds = dpt.astype(inds, dpt.int32) + inds = dpt_ext.astype(inds, dpt.int32) else: raise TypeError("input index array must be of integer data type") @@ -256,7 +255,7 @@ def choose(a, choices, out=None, mode="wrap"): choices = tuple( map( lambda chc: ( - chc if chc.dtype == res_dt else dpt.astype(chc, res_dt) + chc if chc.dtype == res_dt else dpt_ext.astype(chc, res_dt) ), choices, ) @@ -815,14 +814,14 @@ def extract(condition, a): ) if usm_cond.size != usm_a.size: - usm_a = dpt.reshape(usm_a, -1) - usm_cond = dpt.reshape(usm_cond, -1) + usm_a = dpt_ext.reshape(usm_a, -1) + usm_cond = dpt_ext.reshape(usm_cond, -1) usm_res = dpt_ext.take(usm_a, dpt.nonzero(usm_cond)[0]) else: if usm_cond.shape != usm_a.shape: - usm_a = dpt.reshape(usm_a, -1) - usm_cond = dpt.reshape(usm_cond, -1) + usm_a = dpt_ext.reshape(usm_a, -1) + usm_cond = dpt_ext.reshape(usm_cond, -1) usm_res = dpt.extract(usm_cond, usm_a) @@ -959,18 +958,18 @@ def fill_diagonal(a, val, wrap=False): # a.flat[:end:step] = val # but need to consider use case when `a` is usm_ndarray also a_sh = a.shape - tmp_a = dpt.reshape(usm_a, -1) + tmp_a = dpt_ext.reshape(usm_a, -1) if dpnp.isscalar(usm_val): tmp_a[:end:step] = usm_val else: - usm_val = dpt.reshape(usm_val, -1) + usm_val = dpt_ext.reshape(usm_val, -1) # Setitem can work only if index size equal val size. # Using loop for general case without dependencies of val size. 
for i in range(0, usm_val.size): tmp_a[step * i : end : step * (i + 1)] = usm_val[i] - tmp_a = dpt.reshape(tmp_a, a_sh) + tmp_a = dpt_ext.reshape(tmp_a, a_sh) usm_a[:] = tmp_a @@ -1611,12 +1610,14 @@ def place(a, mask, vals): if usm_vals.ndim != 1: # dpt.place supports only 1-D array of values - usm_vals = dpt.reshape(usm_vals, -1) + usm_vals = dpt_ext.reshape(usm_vals, -1) if usm_vals.dtype != usm_a.dtype: # dpt.place casts values to a.dtype with "unsafe" rule, # while numpy.place does that with "safe" casting rule - usm_vals = dpt.astype(usm_vals, usm_a.dtype, casting="safe", copy=False) + usm_vals = dpt_ext.astype( + usm_vals, usm_a.dtype, casting="safe", copy=False + ) dpt.place(usm_a, usm_mask, usm_vals) @@ -1708,19 +1709,19 @@ def put(a, ind, v, /, *, axis=None, mode="wrap"): if usm_ind.ndim != 1: # dpt.put supports only 1-D array of indices - usm_ind = dpt.reshape(usm_ind, -1, copy=False) + usm_ind = dpt_ext.reshape(usm_ind, -1, copy=False) if not dpnp.issubdtype(usm_ind.dtype, dpnp.integer): # dpt.put supports only integer dtype for array of indices - usm_ind = dpt.astype(usm_ind, dpnp.intp, casting="safe") + usm_ind = dpt_ext.astype(usm_ind, dpnp.intp, casting="safe") in_usm_a = usm_a if axis is None and usm_a.ndim > 1: - usm_a = dpt.reshape(usm_a, -1) + usm_a = dpt_ext.reshape(usm_a, -1) dpt_ext.put(usm_a, usm_ind, usm_v, axis=axis, mode=mode) if in_usm_a._pointer != usm_a._pointer: # pylint: disable=protected-access - in_usm_a[:] = dpt.reshape(usm_a, in_usm_a.shape, copy=False) + in_usm_a[:] = dpt_ext.reshape(usm_a, in_usm_a.shape, copy=False) def put_along_axis(a, ind, values, axis, mode="wrap"): @@ -2162,7 +2163,7 @@ def take(a, indices, /, *, axis=None, out=None, mode="wrap"): if axis is None: if a_ndim > 1: # flatten input array - usm_a = dpt.reshape(usm_a, -1) + usm_a = dpt_ext.reshape(usm_a, -1) axis = 0 elif a_ndim == 0: axis = normalize_axis_index(operator.index(axis), 1) @@ -2171,7 +2172,7 @@ def take(a, indices, /, *, axis=None, out=None, mode="wrap"): if not dpnp.issubdtype(usm_ind.dtype, dpnp.integer): # dpt.take supports only integer dtype for array of indices - usm_ind = dpt.astype(usm_ind, dpnp.intp, copy=False, casting="safe") + usm_ind = dpt_ext.astype(usm_ind, dpnp.intp, copy=False, casting="safe") usm_res = _take_index( usm_a, usm_ind, axis, exec_q, res_usm_type, out=out, mode=mode diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index 9df5278bd16..edd98348afe 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -53,6 +53,9 @@ normalize_axis_tuple, ) +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from .dpnp_array import dpnp_array @@ -415,7 +418,7 @@ def _get_first_nan_index(usm_a): dpt.place( usm_res.inverse_indices, usm_res.inverse_indices > first_nan, - dpt.reshape(first_nan, 1), + dpt_ext.reshape(first_nan, 1), ) result += (usm_res.inverse_indices,) @@ -3057,7 +3060,7 @@ def reshape(a, /, shape, order="C", *, copy=None): ) usm_a = dpnp.get_usm_ndarray(a) - usm_res = dpt.reshape(usm_a, shape=shape, order=order, copy=copy) + usm_res = dpt_ext.reshape(usm_a, shape=shape, order=order, copy=copy) return dpnp_array._create_from_usm_ndarray(usm_res) @@ -3259,9 +3262,9 @@ def roll(x, shift, axis=None): shift = dpnp.asnumpy(shift) if axis is None: - return roll(dpt.reshape(usm_x, -1), shift, 0).reshape(x.shape) + return roll(dpt_ext.reshape(usm_x, -1), shift, 0).reshape(x.shape) - usm_res = dpt.roll(usm_x, shift=shift, 
axis=axis) + usm_res = dpt_ext.roll(usm_x, shift=shift, axis=axis) return dpnp_array._create_from_usm_ndarray(usm_res) diff --git a/dpnp/dpnp_iface_sorting.py b/dpnp/dpnp_iface_sorting.py index db33a88c748..3e5cdd4da6a 100644 --- a/dpnp/dpnp_iface_sorting.py +++ b/dpnp/dpnp_iface_sorting.py @@ -43,6 +43,9 @@ import numpy from dpctl.tensor._numpy_helper import normalize_axis_index +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp # pylint: disable=no-name-in-module @@ -84,7 +87,7 @@ def _wrap_sort_argsort( usm_a = dpnp.get_usm_ndarray(a) if axis is None: - usm_a = dpt.reshape(usm_a, -1) + usm_a = dpt_ext.reshape(usm_a, -1) axis = -1 axis = normalize_axis_index(axis, ndim=usm_a.ndim) diff --git a/dpnp/dpnp_iface_statistics.py b/dpnp/dpnp_iface_statistics.py index 7e092184366..daff981d5cc 100644 --- a/dpnp/dpnp_iface_statistics.py +++ b/dpnp/dpnp_iface_statistics.py @@ -47,6 +47,9 @@ import numpy from dpctl.tensor._numpy_helper import normalize_axis_index +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp # pylint: disable=no-name-in-module @@ -1204,7 +1207,7 @@ def mean(a, /, axis=None, dtype=None, out=None, keepdims=False, *, where=True): usm_a = dpnp.get_usm_ndarray(a) usm_res = dpt.mean(usm_a, axis=axis, keepdims=keepdims) if dtype is not None: - usm_res = dpt.astype(usm_res, dtype) + usm_res = dpt_ext.astype(usm_res, dtype) return dpnp.get_result_array(usm_res, out, casting="unsafe") diff --git a/dpnp/tests/test_arraycreation.py b/dpnp/tests/test_arraycreation.py index eb20f9b3ffe..423004470ba 100644 --- a/dpnp/tests/test_arraycreation.py +++ b/dpnp/tests/test_arraycreation.py @@ -13,6 +13,9 @@ assert_raises_regex, ) +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from .helper import ( @@ -969,7 +972,7 @@ def test_ones_like(array, dtype, order): ], ) def test_dpctl_tensor_input(func, args): - x0 = dpt.reshape(dpt.arange(9), (3, 3)) + x0 = dpt_ext.reshape(dpt.arange(9), (3, 3)) new_args = [eval(val, {"x0": x0}) for val in args] X = getattr(dpt, func)(*new_args) Y = getattr(dpnp, func)(*new_args)
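
Reviewer note: the snippet below is an illustrative usage sketch, not part of the patch. It strings together the dpctl_ext.tensor entry points that the hunks above switch dpnp over to (reshape, roll, astype, asnumpy), and assumes dpctl and dpctl_ext are importable and a default SYCL device is available.

# Illustrative only: exercises the re-enabled copy paths end to end.
import dpctl.tensor as dpt

import dpctl_ext.tensor as dpt_ext

x = dpt.arange(12, dtype=dpt.int64)   # device allocation via dpctl
m = dpt_ext.reshape(x, (3, 4))        # _copy_usm_ndarray_for_reshape is used when a copy is required
r = dpt_ext.roll(m, shift=1, axis=1)  # served by the _copy_usm_ndarray_for_roll_* kernels
r = dpt_ext.astype(r, dpt.float32)    # cast performed on the device
print(dpt_ext.asnumpy(r))             # device-to-host copy back to a numpy.ndarray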