diff --git a/include/tvm/relax/exec_builder.h b/include/tvm/relax/exec_builder.h index 4fd0fd66bb90..222dea3fb1f2 100644 --- a/include/tvm/relax/exec_builder.h +++ b/include/tvm/relax/exec_builder.h @@ -122,6 +122,15 @@ class ExecBuilderNode : public Object { rv = value; return ConvertConstant_(rv); } + /*! + * \brief update memory scopes. + * + * This function builds the memory scopes for constants. + * + * \param idx Index of the constant + * \param scope The memory scope. + */ + void SaveMemoryScope(vm::Instruction::Arg idx, ffi::String scope); /*! * \brief Raw access to underlying executable build in progress. */ diff --git a/include/tvm/runtime/vm/executable.h b/include/tvm/runtime/vm/executable.h index 37488ff31f52..7e2380ddf691 100644 --- a/include/tvm/runtime/vm/executable.h +++ b/include/tvm/runtime/vm/executable.h @@ -155,6 +155,8 @@ class VMExecutable : public ffi::ModuleObj { std::unordered_map func_map; /*! \brief The global constant pool. */ std::vector constants; + /*! \brief The VDevice memory scopes */ + std::unordered_map memory_scopes; /*! \brief The offset of instruction. */ std::vector instr_offset; /*! \brief The byte data of instruction. */ @@ -177,6 +179,11 @@ class VMExecutable : public ffi::ModuleObj { * \param strm The input stream. */ void SaveGlobalSection(dmlc::Stream* strm) const; + /*! + * \brief Save the memory scopes. + * \param strm The output stream. + */ + void SaveMemoryScopeSection(dmlc::Stream* strm) const; /*! * \brief Save the constant pool. * \param strm The input stream. @@ -197,6 +204,11 @@ class VMExecutable : public ffi::ModuleObj { * \param strm The input stream. */ void LoadGlobalSection(dmlc::Stream* strm); + /*! + * \brief Load the memory scopes. + * \param strm The input stream. + */ + void LoadMemoryScopeSection(dmlc::Stream* strm); /*! * \brief Load the constant pool. * \param strm The input stream. diff --git a/include/tvm/tir/backend/adreno/transform.h b/include/tvm/tir/backend/adreno/transform.h new file mode 100644 index 000000000000..d2ca90a8cede --- /dev/null +++ b/include/tvm/tir/backend/adreno/transform.h @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file tvm/tir/backend/adreno/transform.h + * \brief TIR specific Adreno GPU transformation passes. 
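+ *
+ * Declares the TextureFlatten and InjectTextureAlloc passes used by the
+ * Adreno-specific TIR lowering pipeline.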
+ */ +#ifndef TVM_TIR_BACKEND_ADRENO_TRANSFORM_H_ +#define TVM_TIR_BACKEND_ADRENO_TRANSFORM_H_ + +#include +#include +#include +#include +#include + +#include +#include + +namespace tvm { +namespace tir { +namespace backend { +namespace adreno { +namespace transform { + +using tvm::tir::transform::CreatePrimFuncPass; +using tvm::transform::Pass; +using tvm::transform::PassContext; +using tvm::transform::PassContextNode; +using tvm::transform::PassInfo; +using tvm::transform::PassInfoNode; +using tvm::transform::PassNode; +using tvm::transform::Sequential; + +/*! + * \brief Texture flattening pass. + * \return The pass. + */ +TVM_DLL Pass TextureFlatten(); + +/*! + * \brief Inject Texture Allocation intrinsic. + * \return The pass. + */ +TVM_DLL Pass InjectTextureAlloc(); + +} // namespace transform +} // namespace adreno +} // namespace backend +} // namespace tir +} // namespace tvm + +#endif // TVM_TIR_BACKEND_ADRENO_TRANSFORM_H_ diff --git a/python/tvm/dlight/adreno/__init__.py b/python/tvm/dlight/adreno/__init__.py index ea2781455989..87afa08818b2 100644 --- a/python/tvm/dlight/adreno/__init__.py +++ b/python/tvm/dlight/adreno/__init__.py @@ -18,3 +18,6 @@ Adreno schedule rules. """ from .convolution import Conv2d +from .layout_transform import LayoutTransform +from .fallback import Fallback +from .pool import Pool2D diff --git a/python/tvm/dlight/adreno/convolution.py b/python/tvm/dlight/adreno/convolution.py index fc2cc449a1c6..462fb2550bd4 100644 --- a/python/tvm/dlight/adreno/convolution.py +++ b/python/tvm/dlight/adreno/convolution.py @@ -16,215 +16,85 @@ # under the License. # pylint: disable=missing-docstring, invalid-name """A Conv2d schedule rule for Adreno GPU operators.""" -from dataclasses import dataclass -from typing import List, Optional +from typing import Optional, Union from tvm import tir from tvm.target import Target -from tvm.tir import IterVar -from tvm.tir.schedule.schedule import BlockRV -from ..analysis import BlockInfo, IterInfo +from .utils import schedule_inline_blocks, schedule_default +from .. 
import analysis from .base import AdrenoScheduleRule -def is_spatial_block(sch: tir.Schedule, block: BlockRV) -> bool: - block_stmt = sch.get(block) - iter_types = {iter_var.iter_type for iter_var in block_stmt.iter_vars} - return iter_types == {IterVar.DataPar} - - -def is_reduction_block(sch: tir.Schedule, block: BlockRV) -> bool: - block_stmt = sch.get(block) - iter_types = {iter_var.iter_type for iter_var in block_stmt.iter_vars} - return iter_types == {IterVar.CommReduce, IterVar.DataPar} - - -def _collect_producers(sch: tir.Schedule, block: tir.schedule.BlockRV): - result = [] - for producer in sch.get_producers(block): - result.append(producer) - result.extend(_collect_producers(sch, producer)) - return result - - -def _collect_consumers(sch: tir.Schedule, block: tir.schedule.BlockRV): - result = [] - for consumer in sch.get_consumers(block): - result.append(consumer) - result.extend(_collect_consumers(sch, consumer)) - return result - - -def get_block_info(sch: tir.Schedule, block: tir.schedule.BlockRV) -> BlockInfo: - def _iter_kind(loop: tir.IterVar) -> str: - return {tir.IterVar.DataPar: "S", tir.IterVar.CommReduce: "R"}.get(loop.iter_type, "O") - - def _is_reduction_block(block: tir.schedule.BlockRV): - for iter_var in sch.get(block).iter_vars: - if _iter_kind(iter_var) == "R": - return True - return False - - return BlockInfo( - name=sch.get(block).name_hint, - iters=[ - IterInfo( - kind=_iter_kind(iter_var), - var=iter_var.var, - dom=iter_var.dom.extent, - loop_rv=loop_rv, - ) - for loop_rv, iter_var in zip(sch.get_loops(block), sch.get(block).iter_vars) - ], - block_rv=block, - reduction_block=_is_reduction_block(block), - ) - - -def get_reduction_blocks(sch: tir.Schedule, blocks: List[tir.schedule.BlockRV]) -> bool: - # NOTE: We assume there is only one reduction block in the function - # all blocks are required to be spatial or reduction - if not all( - [is_reduction_block(sch, block) or is_spatial_block(sch, block) for block in blocks] - ): - return None - - # There is only one reduction block - reduction_blocks = [block for block in blocks if is_reduction_block(sch, block)] - if len(reduction_blocks) != 1: - return None - - return reduction_blocks[0] - - -def is_convolution(sch: tir.Schedule, block: tir.schedule.BlockRV): - # TODO: Use buffer access patterns to discover convolution type kernels instead of using name. 
- return ( - sch.get(block).name_hint.count("conv2d_NCHWc_OIHWo") - and "".join([iter_type.kind for iter_type in get_block_info(sch, block).iters]) - == "SSSSSRRR" - ) - - class Conv2d(AdrenoScheduleRule): """The schedule rule for convolution computation""" - @dataclass - class Config: - block_size_x: int = 8 - block_size_y: int = 8 - vector_size: int = 1 - unroll: int = 256 # 0 means no unroll - use_shared: bool = True - storage_align: bool = False - inner_x: bool = False - - def get_configs(self, target: Target) -> Config: - """Get the schedule config for the target""" - if target.kind.name == "cuda" or target.kind.name == "rocm": - return Conv2d.Config( - block_size_x=8, - block_size_y=16, - vector_size=2, - unroll=256, - use_shared=True, - storage_align=True, - inner_x=False, - ) - elif target.kind.name == "opencl" and ( - ("android" in str(target.host)) or ("adreno" in str(target.attrs)) - ): - return Conv2d.Config( - block_size_x=32, - block_size_y=4, - vector_size=8, - unroll=16, - use_shared=False, - storage_align=False, - inner_x=True, - ) - else: - return Conv2d.Config() + @staticmethod + def schedule_conv2d(sch: tir.Schedule, blk: tir.schedule.BlockRV): + n, oc, oh, ow, ob, ic, kh, kw = sch.get_loops(blk) + + bz, vz, tz = sch.split(oc, [None, 8, 1], preserve_unit_iters=True) + by, vy, ty = sch.split(oh, [None, 1, 16], preserve_unit_iters=True) + bx, vx, tx = sch.split(ow, [None, 1, 16], preserve_unit_iters=True) + + bz = sch.fuse(n, bz, preserve_unit_iters=True) + sch.reorder(bz, by, bx, vz, vy, vx, tz, ty, tx, ob) + sch.bind(bz, "blockIdx.z") + sch.bind(by, "blockIdx.y") + sch.bind(bx, "blockIdx.x") + sch.bind(vz, "vthread.z") + sch.bind(vy, "vthread.y") + sch.bind(vx, "vthread.x") + sch.bind(tz, "threadIdx.z") + sch.bind(ty, "threadIdx.y") + sch.bind(tx, "threadIdx.x") + + rblk = sch.cache_read(blk, 0, "local") + ico, icb = sch.split(ic, [None, 4], preserve_unit_iters=True) + sch.reorder(ico, kh, kw, icb, ob) + + sch.compute_at(rblk, kw, preserve_unit_loops=True) + sch.vectorize(sch.get_loops(rblk)[-1]) + wblk = sch.cache_write(blk, 0, "local") + sch.reverse_compute_at(wblk, tx, preserve_unit_loops=True) + sch.vectorize(sch.get_loops(wblk)[-1]) + init_blk = sch.decompose_reduction(blk, tx) + sch.vectorize(sch.get_loops(init_blk)[-1]) def apply( # pylint: disable=too-many-locals,missing-docstring self, - func: tir.PrimFunc, + func: Union[tir.PrimFunc, tir.Schedule], target: Target, _: bool, ) -> Optional[tir.Schedule]: - if not isinstance(func, tir.PrimFunc) or not self.is_target_available(target): + if not (isinstance(func, (tir.PrimFunc, tir.Schedule))) or not self.is_target_available( + target + ): return None if isinstance(func, tir.PrimFunc): sch = tir.Schedule(func) + sch.work_on("main") + elif isinstance(func, tir.Schedule): + sch = func - # config = self.get_configs(target) - root_block = analysis.get_root_block(sch) + root_block = analysis.get_root_block(sch, sch.func_working_on) blocks = sch.get_child_blocks(root_block) - reduction_block = get_reduction_blocks(sch, blocks) + reduction_blocks = list( + filter(lambda block: analysis.get_block_info(sch, block).is_reduction(), blocks) + ) + remaining_blocks = [blk for blk in blocks if blk not in reduction_blocks] - if reduction_block is None: - return None - if not is_convolution(sch, reduction_block): + def is_convolution(blk): + block_info = analysis.get_block_info(sch, blk) + return "conv2d_NCHWc" in block_info.name + + if len(reduction_blocks) != 1 or not is_convolution(reduction_blocks[0]): return None - def 
schedule_data_pad(blk): - axes = sch.get_loops(blk) - axes, vec = axes[:-1], axes[-1] - axis = sch.fuse(*axes) - bx, ty, tx = sch.split(axis, [None, 16, 16]) - sch.bind(bx, "blockIdx.x") - sch.bind(ty, "threadIdx.y") - sch.bind(tx, "threadIdx.x") - sch.vectorize(vec) - - def schedule_conv2d(blk): - # TODO: Loop Pattern mayn't be reliable, need to perform better analysis. - n, oc, oh, ow, ob, ic, kh, kw = sch.get_loops(blk) - sch.reorder(n, oc, oh, ow, ic, kh, kw, ob) - main_lp = sch.fuse(n, oc, oh, ow) - bx, ty, tx = sch.split(main_lp, [None, 16, 16]) - sch.bind(tx, "threadIdx.x") - sch.bind(ty, "threadIdx.y") - sch.bind(bx, "blockIdx.x") - - ico, icv = sch.split(ic, [None, 4]) - sch.reorder(ico, kh, kw, icv, ob) - rblk = sch.cache_read(blk, 0, "local") - sch.compute_at(rblk, kw) - sch.vectorize(sch.get_loops(rblk)[-1]) - wblk = sch.cache_write(blk, 0, "local") - sch.reverse_compute_at(wblk, tx) - sch.vectorize(sch.get_loops(wblk)[-1]) - sch.vectorize(ob) - init_blk = sch.decompose_reduction(blk, ico) - sch.vectorize(sch.get_loops(init_blk)[-1]) - - def is_data_pad(block: tir.stmt.Block): - return is_spatial_block(sch, block) and tir.analysis.has_if_then_else(sch.get(block)) - - def schedule_conv2d_blocks(): - - # Do analysis to find block type - blocks = sch.get_child_blocks(root_block) - passed_reduction = False - for blk in blocks: - if is_reduction_block(sch, blk): - schedule_conv2d(blk) - passed_reduction = True - elif is_data_pad(blk): - schedule_data_pad(blk) - elif is_spatial_block(sch, blk): - try: - if not passed_reduction: - sch.compute_inline(blk) - else: - sch.reverse_compute_inline(blk) - except: # pylint: disable=W0702 - pass - else: - raise TypeError("Can't Schedule this Block", sch.get(blk)) - - schedule_conv2d_blocks() + conv_blk = reduction_blocks[0] + Conv2d.schedule_conv2d(sch, conv_blk) + remaining_blocks = schedule_inline_blocks(sch, remaining_blocks) + schedule_default(sch, remaining_blocks) + return sch diff --git a/python/tvm/dlight/adreno/fallback.py b/python/tvm/dlight/adreno/fallback.py new file mode 100644 index 000000000000..050781a21f32 --- /dev/null +++ b/python/tvm/dlight/adreno/fallback.py @@ -0,0 +1,177 @@ +# licensed to the apache software foundation (asf) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Dlight Adreno Fallback Schedules""" + + +from typing import List, Union + +from tvm import tir +from tvm.target import Target +from .. 
import analysis +from .base import AdrenoScheduleRule + + +def _assert_gpu_target(target: Target): + if "gpu" not in target.keys: + raise ValueError(f"Expect a GPU target, but got {target}") + + +def get_max_threads_per_block(target: Target) -> int: + _assert_gpu_target(target) + max_threads_per_block = None + for name in ["max_threads_per_block", "max_num_threads"]: + if max_threads_per_block is None: + max_threads_per_block = target.attrs.get(name, None) + if max_threads_per_block is None: + max_threads_per_block = 64 + return int(max_threads_per_block) + + +# pylint: disable=invalid-name,missing-function-docstring,unused-variable,unused-import +class Fallback(AdrenoScheduleRule): + """Texture Based Fallback Schedule(s) for Adreno""" + + @staticmethod + def schedule_inline_blocks( + sch: tir.Schedule, blocks: List[tir.schedule.BlockRV] + ) -> List[tir.schedule.BlockRV]: + """ + Auto Inlines Injective and Element-wise Operations while trying to omit data pad blocks... + """ + + if blocks is None: + root_blk = analysis.get_root_block(sch) + blocks = sch.get_child_blocks(root_blk) + + remaining_blocks = [] + for blk in blocks: + block_info = analysis.get_block_info(sch, blk) + if block_info.is_injective() and not block_info.is_data_pad(sch): + if len(sch.get_consumers(blk)) == 1: + try: + sch.compute_inline(blk) + except Exception: # pylint: disable=broad-exception-caught + remaining_blocks.append(blk) + elif len(sch.get_producers(blk)) == 1: + inlined_once = False + try: + # Would cause an issue inlining to producer with multiple consumers + while ( + len(sch.get_producers(blk)) == 1 + and len(sch.get_consumers(sch.get_producers(blk)[0])) == 1 + ): + sch.reverse_compute_inline(blk) + inlined_once = True + except Exception: # pylint: disable=broad-exception-caught + break + if not inlined_once: + remaining_blocks.append(blk) + else: + remaining_blocks.append(blk) + else: + remaining_blocks.append(blk) + return remaining_blocks + + @staticmethod + def schedule_default(sch: tir.Schedule, blk: tir.schedule.BlockRV): + block_info = analysis.get_block_info(sch, blk) + + s_loops, r_loops, o_loops = [], [], [] + v_loop = block_info.write_bufs(sch)[0].assoc_lps[-1] + + for iter_info in block_info.iters: + if sch.get(iter_info.loop_rv) == sch.get(v_loop): + continue + {"S": s_loops, "R": r_loops, "O": o_loops}.get(iter_info.kind).append(iter_info.loop_rv) + + iter_vars = analysis.collect_block_iter_vars_used_in_access_region( + sch.get(blk), block_info.write_bufs(sch)[0].buf_region.region + ) + o_outer = [lp for lp in o_loops if sch.get(lp).var in iter_vars] + o_inner = [lp for lp in o_loops if sch.get(lp).var not in iter_vars] + + # Can't change loop order for opaque loops + if o_loops != o_outer + o_inner: + return + + o_outer.append(v_loop) + sch.reorder(*s_loops, *o_outer, *r_loops, *o_inner) + + assert s_loops + tgt = Target.current(allow_none=True) + + b = sch.fuse(*s_loops) + tx_extent = get_max_threads_per_block(tgt) if tgt is not None else 256 + bx, tx = sch.split(b, [None, tx_extent]) + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") + + if len(r_loops) > 1: + lp = [*s_loops, *o_outer][-1] + init_block = sch.decompose_reduction(blk, lp) + wblk = sch.cache_write(blk, 0, "local") + sch.compute_at(wblk, lp) + if v_loop: + sch.vectorize(sch.get_loops(init_block)[-1]) + sch.vectorize(sch.get_loops(wblk)[-1]) + elif v_loop is not None: + sch.vectorize(v_loop) + + @staticmethod + def schedule_fallback(sch): + root_block = analysis.get_root_block(sch) + blocks = 
sch.get_child_blocks(root_block) + + schedule_blocks = [ + blk + for blk in blocks + if analysis.get_block_info(sch, blk).is_reduction() + or analysis.get_block_info(sch, blk).is_data_pad(sch) + ] + remaining_blocks = [blk for blk in blocks if blk not in schedule_blocks] + + for blk in schedule_blocks: + Fallback.schedule_default(sch, blk) + remaining_blocks = Fallback.schedule_inline_blocks(sch, remaining_blocks) + # TODO: Analyze unscheduled blocks to schedule instead of relying on remaining + for blk in remaining_blocks: + Fallback.schedule_default(sch, blk) + + def apply( # pylint: disable=too-many-locals + self, + func: tir.PrimFunc, + target: Target, + _: bool, + ) -> Union[None, tir.Schedule, List[tir.Schedule]]: + # pylint: disable=invalid-name + + if not isinstance(func, tir.PrimFunc) or not self.is_target_available(target): + return None + + sch = tir.Schedule(func) + root_block = analysis.get_root_block(sch) + blocks = sch.get_child_blocks(root_block) + + if any(len(sch.get_child_blocks(block)) != 0 for block in blocks): + return None + + block_infos = [analysis.get_block_info(sch, block) for block in blocks] + if not any("texture" in block.write_bufs(sch)[0].get_scope() for block in block_infos): + return None + + Fallback.schedule_fallback(sch) + return sch diff --git a/python/tvm/dlight/adreno/layout_transform.py b/python/tvm/dlight/adreno/layout_transform.py new file mode 100644 index 000000000000..bf3c446d59ff --- /dev/null +++ b/python/tvm/dlight/adreno/layout_transform.py @@ -0,0 +1,129 @@ +# Licensed to the apache software foundation (asf) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=invalid-name, unused-variable + +"Schedules for Texture Based Layout Transforms" +from typing import List, Union + +from tvm import tir +from tvm.target import Target +from .. import analysis + +from .base import AdrenoScheduleRule + + +class LayoutTransform(AdrenoScheduleRule): + """Texture based Layout Transform Dlight Schedule for Adreno""" + + def __init__(self, use_op_name=True): + self.use_op_name = use_op_name + + # TODO: Try using Coalesced Writes... 
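+    # Schedule strategy implemented by apply() below (a sketch; the NCHW <-> NCHW4c
+    # transform produced by ConvertLayout is the assumed typical input):
+    #   - find the innermost loops indexing the last dimension of the read and
+    #     write buffers (lpv_read / lpv_write) and their power-of-two vector lengths,
+    #   - if the two differ, stage the read through a "local" cache so that both the
+    #     cached read and the final write can be vectorized,
+    #   - otherwise split the shared innermost loop and vectorize it directly,
+    #   - finally fuse the remaining loops and bind them to blockIdx.x / threadIdx.x.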
+ def apply( # pylint: disable=too-many-locals + self, + func: Union[tir.PrimFunc, tir.Schedule], + target: Target, + _: bool, + ) -> Union[None, tir.Schedule, List[tir.Schedule]]: + # pylint: disable=invalid-name + if not (isinstance(func, (tir.PrimFunc, tir.Schedule))) or not self.is_target_available( + target + ): + return None + + if isinstance(func, tir.PrimFunc): + sch = tir.Schedule(func) + sch.work_on("main") + elif isinstance(func, tir.Schedule): + sch = func + + root_block = analysis.get_root_block(sch, sch.func_working_on) + + if len(sch.get_child_blocks(root_block)) != 1: + return None + + blk = sch.get_child_blocks(root_block)[0] + block_info = analysis.get_block_info(sch, blk) + if not ( + (self.use_op_name and block_info.name == "te_layout_transform") + or (not self.use_op_name and block_info.is_layout_transform(sch)) + ): + return None + + read_buf, write_buf = (block_info.read_bufs(sch)[0], block_info.write_bufs(sch)[0]) + lps = block_info.get_loops() + lpv_read, lpv_write = ( + read_buf.assoc_lps[-1], + write_buf.assoc_lps[-1], + ) + + if lpv_read is None or lpv_write is None: + return None + + vlen_read, vlen_write = read_buf.get_vecsize(), write_buf.get_vecsize() + local_cache = sch.get(lpv_read) != sch.get(lpv_write) or vlen_read != vlen_write + block_loops = [ + lp + for lp in lps + if sch.get(lp) != sch.get(lpv_read) and sch.get(lp) != sch.get(lpv_write) + ] + vec_loops = ( + [lpv_read, lpv_write] if sch.get(lpv_read) != sch.get(lpv_write) else (lpv_read,) + ) + sch.reorder(*block_loops, *vec_loops) + if local_cache: + if sch.get(lpv_read) != sch.get(lpv_write): + blp_read, vlp_read = sch.split( + lpv_read, [None, vlen_read], preserve_unit_iters=True + ) + blp_write, vlp_write = sch.split( + lpv_write, [None, vlen_write], preserve_unit_iters=True + ) + sch.reorder(blp_read, blp_write, vlp_read, vlp_write) + block_loops += [blp_read, blp_write] + rblk = sch.cache_read(blk, 0, "local") + sch.compute_at(rblk, block_loops[-1], preserve_unit_loops=True) + sch.vectorize(sch.get_loops(rblk)[-1]) + sch.vectorize(vlp_write) + else: + if vlen_read > vlen_write: + read_lp, vec_lp = sch.split(blk, [None, vlen_write], preserve_unit_iters=True) + rblk = sch.cache_read(blk, 0, "local") + sch.compute_at(rblk, read_lp, preserve_unit_loops=True) + sch.vectorize(sch.get_loops(rblk)[-1]) + sch.vectorize(vec_lp) + else: + rblk = sch.cache_read(blk, 0, "local") + sch.compute_at(rblk, block_loops[-1], preserve_unit_loops=True) + _, vread_lp = sch.split( + sch.get_loops(rblk)[-1], vlen_read, preserve_unit_iters=True + ) + sch.vectorize(vread_lp) + sch.vectorize(vlp_write) + else: + blp, vlp = sch.split(lpv_read, [None, vlen_read], preserve_unit_iters=True) + block_loops += [blp] + sch.vectorize(vlp) + + b = sch.fuse(*block_loops) + tx_extent = min(sch.get(b).extent, 256) + candidates = [1, 2, 4, 8, 16, 32] + bx, tx = sch.split(b, [None, 256], preserve_unit_iters=True) + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") + return sch diff --git a/python/tvm/dlight/adreno/pool.py b/python/tvm/dlight/adreno/pool.py new file mode 100644 index 000000000000..e3709caad96a --- /dev/null +++ b/python/tvm/dlight/adreno/pool.py @@ -0,0 +1,92 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=missing-docstring +""" Pool schedule rule for Adreno operators.""" + +from tvm import tir +from tvm.target import Target + +from .base import AdrenoScheduleRule +from .. import analysis + + +# pylint: disable=invalid-name, unused-variable +class Pool2D(AdrenoScheduleRule): + def apply( # pylint: disable=too-many-locals,missing-docstring + self, + func: tir.PrimFunc, + target: Target, + _: bool, + ) -> tir.Schedule: + sch = tir.Schedule(func) + root = sch.get_block(name="root", func_name="main") + + blocks = sch.get_child_blocks(root) + blocks_names = [sch.get(blk).name_hint for blk in blocks] + + if not "adaptive_pool_sum" in blocks_names and not "pool_max" in blocks_names: + return None + + def schedule_pad(blk: tir.schedule.BlockRV): + lps, veclp = sch.get_loops(blk)[:-1], sch.get_loops(blk)[-1] + sch.vectorize(veclp) + b = sch.fuse(*lps) + tx_extent = min(int(sch.get(b).extent) & ~int(sch.get(b).extent - 1), 256) + bx, tx = sch.split(b, [None, tx_extent]) + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") + + def schedule_max_pool(blk: tir.schedule.BlockRV): + block_info = analysis.get_block_info(sch, blk) + iters_kind = "".join([_iter.kind for _iter in block_info.iters]) + if iters_kind != "SSSSSRR": + return None + + lps = sch.get_loops(blk) + block_lps, vec_lp, red_lps = lps[:4], lps[4], lps[5:] + write_blk = sch.cache_write(blk, 0, "local") + sch.reverse_compute_at(write_blk, vec_lp) + b = sch.fuse(*block_lps) + tx_extent = min(int(sch.get(b).extent) & ~int(sch.get(b).extent - 1), 256) + bx, tx = sch.split(b, [None, tx_extent]) + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") + sch.vectorize(vec_lp) + + return True + + passed_reduction = False + for blk in blocks: + if sch.get(blk).name_hint == "pad_temp": + schedule_pad(blk) + elif ( + sch.get(blk).name_hint == "adaptive_pool_sum" + or sch.get(blk).name_hint == "pool_max" + ): + ok = schedule_max_pool(blk) + if not ok: + return None + passed_reduction = True + else: + try: + if passed_reduction: + sch.reverse_compute_inline(blk) + else: + sch.compute_inline(blk) + except Exception: # pylint: disable=broad-except + pass + return sch diff --git a/python/tvm/dlight/adreno/utils.py b/python/tvm/dlight/adreno/utils.py new file mode 100644 index 000000000000..73597fe8578c --- /dev/null +++ b/python/tvm/dlight/adreno/utils.py @@ -0,0 +1,89 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Utilis for Adreno operators.""" + +# pylint: disable=import-outside-toplevel, unused-argument, invalid-name, missing-function-docstring +from typing import List + +from tvm.target import Target +from tvm import tir + +from ..analysis import BlockInfo + + +def get_texture_storage(block_info: BlockInfo): + """ + Returns the texture layout acceptable for the shape + + Parameters + ---------- + shape: array + Shape of the tensor to be packed to texture + """ + # certain limitation of the Qualcomm devices. Subject to be determined for certain device + # individually, but until we have access to remote device during compilation, we have to + # define it uniformly for all target devices + # spatial_limit = 16384, depth_limit = 2048 + # TODO: Check Write Bufs. + shape = block_info.write_bufs[0].buf_region.buffer.shape + + spatial_limit = Target.current().attrs["texture_spatial_limit"] + depth_limit = Target.current().attrs["texture_depth_limit"] + + if len(shape) > 4: + if shape[0] < spatial_limit and shape[1] * shape[2] * shape[3] < spatial_limit: + return "global.texture-weight" + elif shape[0] < depth_limit and shape[2] * shape[3] < spatial_limit: + return "global.texture-nhwc" + elif ( + shape[0] * shape[1] < depth_limit + and shape[2] < spatial_limit + and shape[3] < spatial_limit + ): + return "global.texture" + elif len(shape) > 3: + if shape[0] < spatial_limit and shape[1] * shape[2] < spatial_limit: + return "global.texture-weight" + elif shape[0] < depth_limit and shape[1] < spatial_limit and shape[2] < spatial_limit: + return "global.texture" + elif len(shape) == 3: + if shape[0] < spatial_limit and shape[1] < spatial_limit: + return "global.texture-weight" + + return "global" + + +def schedule_inline_blocks(sch: tir.Schedule, blocks: List[tir.schedule.BlockRV] = None): + from .fallback import Fallback + + return Fallback.schedule_inline_blocks(sch, blocks) + + +def schedule_default(sch, blocks: List[tir.schedule.BlockRV] = None): + from .fallback import Fallback + + ret = [] + for blk in blocks: + ret.append(Fallback.schedule_default(sch, blk)) + + return ret + + +def schedule_fallback(sch, blk): + from .fallback import Fallback + + return Fallback.schedule_fallback(sch) diff --git a/python/tvm/dlight/analysis/__init__.py b/python/tvm/dlight/analysis/__init__.py index bf68d0855015..0df8abb2bf5c 100644 --- a/python/tvm/dlight/analysis/__init__.py +++ b/python/tvm/dlight/analysis/__init__.py @@ -24,6 +24,7 @@ is_broadcast_epilogue, normalize_prim_func, get_root_block, + get_block_info, ) from .gemv import ( is_gemv, diff --git a/python/tvm/dlight/analysis/common_analysis.py b/python/tvm/dlight/analysis/common_analysis.py index e3357c6e78db..161deaf53772 100644 --- a/python/tvm/dlight/analysis/common_analysis.py +++ b/python/tvm/dlight/analysis/common_analysis.py @@ -14,9 +14,12 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-"""Analysis on TIR blocks, loops and functions.""" -from typing import List, Optional, Set, Union +# pylint: disable=missing-function-docstring, missing-class-docstring +# pylint: disable=unused-argument, unused-variable +"""Analysis on TIR blocks, loops and functions.""" +from collections import namedtuple +from typing import List, Optional, Set, Union, Tuple from typing_extensions import Literal from tvm_ffi import get_global_func @@ -24,6 +27,7 @@ from tvm.target.target import Target from tvm.tir import Schedule from tvm.tir.schedule import BlockRV +from tvm.runtime import DataType class IterInfo: @@ -59,6 +63,106 @@ def __repr__(self) -> str: return str(self) +get_blockrealize = get_global_func("tir.schedule.GetBlockRealize") +# BufferIndex Types +Index = namedtuple("Index", ["sub"]) # c +RemIndex = namedtuple("RemIndex", ["sub", "div"]) # c%len +DivIndex = namedtuple("DivIndex", ["sub", "div"]) # c//len +MergeIndex = namedtuple("MulIndex", ["dom", "mul", "sub"]) # co*len + cb +BufIndex = List[Union[Index, RemIndex, DivIndex, MergeIndex, None]] + + +class BufferInfo: + "Information about Buffer. Provides useful analysis" + buf_region: tir.BufferRegion + shape: Tuple[int] + assoc_lps: List[Union[tir.schedule.LoopRV, None]] + assoc_lps_info: List[Union[tir.For, None]] + + def __init__( + self, + sch: tir.Schedule, + block_rv: tir.schedule.BlockRV, + buf_region: tir.BufferRegion, + lps: Union[List[tir.schedule.LoopRV], None], + ): + block = sch.get(block_rv) + if lps is None: + lps = sch.get_loops(block_rv) + loops = [sch.get(lp) for lp in lps] + iter_vars = [Var.var for Var in block.iter_vars] + iter_values = get_blockrealize(sch, block_rv).iter_values + lpvar_lp = dict([loop.loop_var, lp] for loop, lp in zip(loops, lps)) + var_lp = dict(zip(iter_vars, [lpvar_lp.get(val, None) for val in iter_values])) + + def extract_index_types(buf: tir.BufferRegion) -> BufIndex: + buf_index = [] + for expr in buf.region: + expr = expr.min + dim = None + if isinstance(expr, tir.expr.Add) and isinstance(expr.b, tir.expr.Var): + var_add = expr.b + if ( + isinstance(expr, tir.expr.Mul) + and isinstance(expr.a, tir.expr.Var) + and isinstance(expr.b, tir.expr.IntImm) + ): + mul = expr.b + var_mul = expr.a + dim = MergeIndex(var_mul, mul, var_add) + elif ( + isinstance(expr, tir.expr.FloorMod) + and isinstance(expr.a, tir.expr.Var) + and isinstance(expr.b, tir.expr.IntImm) + ): + dim = RemIndex(expr.a, expr.b) + elif ( + isinstance(expr, tir.expr.FloorDiv) + and isinstance(expr.a, tir.expr.Var) + and isinstance(expr.b, tir.expr.IntImm) + ): + dim = DivIndex(expr.a, expr.b) + elif isinstance(expr, tir.expr.Var): + dim = Index(expr) + buf_index.append(dim) + return buf_index + + indexes = extract_index_types(buf_region) + assoc_lps = [ + ( + var_lp.get(getattr(idx, "sub"), None) + if not isinstance(idx, DivIndex) and not idx is None + else None + ) + for idx in indexes + ] + + self.buf_region = buf_region + self.assoc_lps = assoc_lps + self.assoc_lps_info = [(sch.get(lp) if lp is not None else None) for lp in assoc_lps] + self.shape = buf_region.buffer.shape + + def get_scope(self) -> str: + return self.buf_region.buffer.scope() + + def get_vecsize(self, buf_index: int = 0, vbits: int = 128): + if self.assoc_lps_info[-1] is None: + return None + + vlp_extent = int(self.assoc_lps_info[-1].extent) & ~( + int(self.assoc_lps_info[-1].extent) - 1 + ) + vbuf_extent = int(self.shape[-1]) & ~(int(self.shape[-1]) - 1) + + return min(vlp_extent, vbuf_extent, vbits // DataType(self.buf_region.buffer.dtype).bits) + + def 
__str__(self) -> str: + return f"BufferInfo({self.buf_region})" + + def __repr__(self) -> str: + return str(self) + + class BlockInfo: """Information about a TIR block.""" @@ -84,6 +188,16 @@ def dom(self) -> List[Union[int, tir.PrimExpr]]: """The iteration domain of the block.""" return [i.dom for i in self.iters] + def read_bufs(self, sch: tir.Schedule) -> List[BufferInfo]: + block_stmt = sch.get(self.block_rv) + lps = sch.get_loops(self.block_rv) + return [BufferInfo(sch, self.block_rv, buf, lps) for buf in block_stmt.reads] + + def write_bufs(self, sch: tir.Schedule) -> List[BufferInfo]: + block_stmt = sch.get(self.block_rv) + lps = sch.get_loops(self.block_rv) + return [BufferInfo(sch, self.block_rv, buf, lps) for buf in block_stmt.writes] + def dom_kind(self) -> str: """The iteration domain kind of the block, for example, SSSS, SSSR.""" return "".join(i.kind for i in self.iters) @@ -108,15 +222,48 @@ def _check_unit_var_range(dom: ir.Range, var: tir.Var) -> bool: if len(r_region) != len(w_region): return False for var, r_dom, w_dom in zip(block.iter_vars, r_region, w_region): - if not _check_unit_var_range(var, r_dom) or not _check_unit_var_range(var, w_dom): + if not _check_unit_var_range(r_dom, var) or not _check_unit_var_range(w_dom, var): return False return True + def get_loops(self) -> List[tir.schedule.LoopRV]: + return [iter_info.loop_rv for iter_info in self.iters] + def is_reduction(self) -> bool: """Whether the block is a reduction workload.""" # TODO(@junrushao): distinguish GEMV and reduction return self._reduction_block + def is_layout_transform(self, sch: tir.Schedule) -> bool: + """Whether the Block can be considered having a Layout Transform Pattern""" + return ( + all(k == "S" for k in self.dom_kind()) + and len(self.write_bufs(sch)) == 1 + and len(self.read_bufs(sch)) == 1 + and not self.is_elementwise(sch) + and not get_global_func("tir.schedule.HasIfThenElse")(sch.get(self.block_rv)) + ) + + def is_data_pad(self, sch: tir.Schedule) -> bool: + """Whether the Block can be considered having a data pad pattern""" + return ( + all(k == "S" for k in self.dom_kind()) + and len(self.write_bufs(sch)) == 1 + and len(self.read_bufs(sch)) == 1 + and not self.is_elementwise(sch) + and len(self.write_bufs(sch)[0].buf_region.region) + == len(self.read_bufs(sch)[0].buf_region.region) + and get_global_func("tir.schedule.HasIfThenElse")(sch.get(self.block_rv)) + ) + + def is_convolution(self) -> bool: + """Whether a Block can be considered having Convolution Pattern""" + raise NotImplementedError + + def is_pool(self) -> bool: + """Whether a Block can be considered having Pooling Pattern""" + raise NotImplementedError + def is_gemv(self) -> bool: """Whether the block is a GEMV workload.""" raise NotImplementedError @@ -171,6 +318,32 @@ def _iter_kind(i: tir.IterVar) -> str: return blocks +def get_block_info(sch: tir.Schedule, block: tir.schedule.BlockRV) -> BlockInfo: + def _iter_kind(loop: tir.IterVar) -> str: + return {tir.IterVar.DataPar: "S", tir.IterVar.CommReduce: "R"}.get(loop.iter_type, "O") + + def _is_reduction_block(block: tir.schedule.BlockRV): + for iter_var in sch.get(block).iter_vars: + if _iter_kind(iter_var) == "R": + return True + return False + + return BlockInfo( + name=sch.get(block).name_hint, + iters=[ + IterInfo( + kind=_iter_kind(iter_var), + var=iter_var.var, + dom=iter_var.dom.extent, + loop_rv=loop_rv, + ) + for loop_rv, iter_var in zip(sch.get_loops(block), sch.get(block).iter_vars) + ], + block_rv=block, + reduction_block=_is_reduction_block(block), + 
) + + def _assert_gpu_target(target: Target): if "gpu" not in target.keys: raise ValueError(f"Expect a GPU target, but got {target}") diff --git a/python/tvm/ir/global_info.py b/python/tvm/ir/global_info.py index 185e10b88cce..597ff68dc1d8 100644 --- a/python/tvm/ir/global_info.py +++ b/python/tvm/ir/global_info.py @@ -18,6 +18,7 @@ import tvm import tvm_ffi from tvm.runtime.object import Object +from tvm.runtime import Device from . import _ffi_api @@ -39,6 +40,8 @@ def same_as(self, other): @tvm_ffi.register_object("ir.DummyGlobalInfo") class DummyGlobalInfo(GlobalInfo): + """DummyGlobalInfo""" + def __init__(self) -> None: self.__init_handle_by_constructor__( _ffi_api.DummyGlobalInfo, @@ -47,6 +50,8 @@ def __init__(self) -> None: @tvm_ffi.register_object("ir.VDevice") class VDevice(GlobalInfo): + """VDevice""" + def __init__( self, target=None, @@ -55,4 +60,6 @@ def __init__( ) -> None: if isinstance(target, (dict, str)): target = tvm.target.Target(tvm.runtime.convert(target)) + if isinstance(target, Device): + target = tvm.target.Target.from_device(target) self.__init_handle_by_constructor__(_ffi_api.VDevice, target, vdevice_id, memory_scope) diff --git a/python/tvm/relax/backend/adreno/pipeline.py b/python/tvm/relax/backend/adreno/pipeline.py index 612b8ce7011d..cbbb17a6180a 100644 --- a/python/tvm/relax/backend/adreno/pipeline.py +++ b/python/tvm/relax/backend/adreno/pipeline.py @@ -18,6 +18,7 @@ import tvm from tvm import dlight as dl from tvm import relax +from tvm.relax.transform.legalize_ops import adreno as legalize_adreno def library_dispatch_passes(target: tvm.target.Target): # pylint: disable=unused-argument @@ -30,21 +31,78 @@ def library_dispatch_passes(target: tvm.target.Target): # pylint: disable=unuse def legalize_passes(target: tvm.target.Target): # pylint: disable=unused-argument """The default legalization passes for Adreno GPU backend.""" - return [ - relax.transform.DecomposeOpsForInference(), - relax.transform.FoldConstant(), - relax.transform.LegalizeOps(), - relax.transform.AnnotateTIROpPattern(), - relax.transform.FoldConstant(), - relax.transform.FuseOps(), - relax.transform.FuseTIR(), - relax.transform.DeadCodeElimination(), - dl.ApplyDefaultSchedule( - dl.gpu.Reduction(), - dl.gpu.GeneralReduction(), - dl.gpu.Fallback(), - ), + desired_layouts = {"relax.nn.conv2d": ["NCHW4c", "OIHW4o", "NCHW4c"]} + skip_ops = [ + "relax.nn.conv2d", + "relax.nn.max_pool2d", + "relax.nn.adaptive_avg_pool2d", ] + pass_list = [] + + pass_list.extend( + [ + tvm.tir.transform.BindTarget(tvm.target.Target.current(allow_none=False)), + relax.transform.DecomposeOpsForInference(), + ] + ) + if "texture" in target.keys: + pass_list.extend( + [ + relax.transform.ConvertLayout(desired_layouts), + relax.transform.Normalize(), + relax.transform.FoldConstant(), + relax.transform.LegalizeOps(skip_ops=skip_ops), + relax.transform.AnnotateTIROpPattern(), + relax.backend.adreno.transform.AnnotateCustomMemoryScope(target), + ] + ) + pass_list.extend([tvm.relax.transform.LegalizeOps()]) + if "texture" in target.keys: + pass_list.extend( + [ + relax.transform.LegalizeOps( + {"relax.nn.conv2d": legalize_adreno.conv2d_NCHWc_OIHWo}, + ) + ] + ) + + pass_list.extend( + [ + relax.transform.AnnotateTIROpPattern(), + relax.transform.FoldConstant(), + relax.transform.FuseOps(), + relax.transform.FuseTIR(), + relax.transform.DeadCodeElimination(), + ] + ) + if "texture" in target.keys: + pass_list.extend( + [ + relax.backend.adreno.transform.FoldVDeviceScopeChange(), + relax.transform.DeadCodeElimination(), 
+ relax.transform.SpecializePrimFuncBasedOnCallSite(), + ] + ) + pass_list.extend([relax.transform.Normalize()]) + pass_list.extend( + [ + dl.ApplyDefaultSchedule( + dl.adreno.Conv2d(), + dl.adreno.LayoutTransform(), + dl.adreno.Pool2D(), + ) + ] + ) + pass_list.extend( + [ + dl.ApplyDefaultSchedule( + dl.gpu.Reduction(), + dl.gpu.GeneralReduction(), + dl.gpu.Fallback(), + ) + ] + ) + return pass_list def dataflow_lower_passes(target: tvm.target.Target): # pylint: disable=unused-argument diff --git a/python/tvm/relax/op/memory/memory.py b/python/tvm/relax/op/memory/memory.py index 7b84ffc48bb6..8e17194f67d9 100644 --- a/python/tvm/relax/op/memory/memory.py +++ b/python/tvm/relax/op/memory/memory.py @@ -62,7 +62,11 @@ def alloc_storage( @args_converter.auto def alloc_tensor( - storage: Expr, offset: Union[int, Expr], shape: Expr, dtype: Union[str, Expr] + storage: Expr, + offset: Union[int, Expr], + shape: Expr, + dtype: Union[str, Expr], + runtime_device_ind: Union[int, Expr] = PrimValue(0), ) -> Call: """Construct a Call to allocate a tensor on a certain storage starting from the given offset. @@ -80,6 +84,10 @@ def alloc_tensor( dtype : Union[str, Expr] The datatype of the tensor to be allocated. + runtime_device_ind: Union[int, Expr] + The device index indicating on which device the tensor is to be + allocated at runtime. Index -1 is reserved for the host device. + Returns ------- result : Call @@ -89,7 +97,7 @@ def alloc_tensor( offset = PrimValue(offset) if isinstance(dtype, str): dtype = DataTypeImm(dtype) - return _ffi_api.alloc_tensor(storage, offset, shape, dtype) # type: ignore + return _ffi_api.alloc_tensor(storage, offset, shape, dtype, runtime_device_ind) # type: ignore @args_converter.auto diff --git a/python/tvm/relax/op/vm/vm.py b/python/tvm/relax/op/vm/vm.py index 3ed6b29648b4..dcc9e5a33a53 100644 --- a/python/tvm/relax/op/vm/vm.py +++ b/python/tvm/relax/op/vm/vm.py @@ -62,7 +62,11 @@ def alloc_storage( @args_converter.auto def alloc_tensor( - storage: Expr, offset: Union[int, Expr], shape: Expr, dtype: Union[str, Expr] + storage: Expr, + offset: Union[int, Expr], + shape: Expr, + dtype: Union[str, Expr], + runtime_device_ind: Union[int, Expr] = PrimValue(0), ) -> Call: """Construct a Call to allocate a tensor on a certain storage starting from the given offset. @@ -80,6 +84,10 @@ def alloc_tensor( dtype : Union[str, Expr] The datatype of the tensor to be allocated. + runtime_device_ind: Union[int, Expr] + The device index indicating on which device the tensor is to be + allocated at runtime. Index -1 is reserved for the host device. 
+ Returns ------- result : Call @@ -89,7 +97,7 @@ def alloc_tensor( offset = PrimValue(offset) if isinstance(dtype, str): dtype = DataTypeImm(dtype) - return _ffi_api.alloc_tensor(storage, offset, shape, dtype) # type: ignore + return _ffi_api.alloc_tensor(storage, offset, shape, dtype, runtime_device_ind) # type: ignore def kill_object(obj: Expr) -> Call: diff --git a/python/tvm/relax/transform/lower_gpu_ipc_alloc_storage.py b/python/tvm/relax/transform/lower_gpu_ipc_alloc_storage.py index 00081f92b197..b73b40d6fc9c 100644 --- a/python/tvm/relax/transform/lower_gpu_ipc_alloc_storage.py +++ b/python/tvm/relax/transform/lower_gpu_ipc_alloc_storage.py @@ -78,6 +78,6 @@ def rewrite_alloc_tensor(self, call: relax.Call) -> relax.Call: ) return relax.Call( self.memory_alloc_tensor_op, - args=[ipc_alloc_storage, call.args[2], shape, dtype], + args=[ipc_alloc_storage, call.args[2], shape, dtype, relax.PrimValue(0)], sinfo_args=call.sinfo_args, ) diff --git a/python/tvm/script/ir_builder/relax/ir.py b/python/tvm/script/ir_builder/relax/ir.py index f221a1308965..094ec3379fe9 100644 --- a/python/tvm/script/ir_builder/relax/ir.py +++ b/python/tvm/script/ir_builder/relax/ir.py @@ -619,6 +619,30 @@ def emit_var_binding(value: VarBinding) -> Var: return _ffi_api.EmitVarBinding(value) # type: ignore +def emit_with_sinfo( + op: str, + args: Expr, + sinfo_args: Optional[Union[StructInfo, List[StructInfo]]] = None, +) -> Call: + """Create a relax Call with sinfo_args. + Parameters + ---------- + op: Expr + The relax op for which sinfo_args to be appended + args : Expr + The arguments. + sinfo_args: Optional[Union[StructInfo, List[StructInfo]]] + The list of structure info arguments. + + Returns + ------- + call: Call + The created Relax Call + """ + builtin_call = tvm.ir.Op.get(op) + return Call(builtin_call, args, attrs=None, sinfo_args=sinfo_args) + + ############################### SeqExpr ############################### @@ -822,6 +846,7 @@ def dtype(value: Union[py_str, DataType]) -> Expr: "emit_te", "emit_var_binding", "emit_match_cast", + "emit_with_sinfo", "equal", "ewise_fma", "exp", diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index eb6e25f0450c..31788d5f062a 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -838,7 +838,7 @@ def stm32(series="unknown", options=None): return Target(" ".join(["c"] + opts)) -def adreno(model="unknown", options=None, clml=False): +def adreno(model="unknown", options=None, cfg=None, backend="opencl"): """Returns a Qualcomm GPU target. Parameters ---------- @@ -846,13 +846,22 @@ def adreno(model="unknown", options=None, clml=False): The model of this device options : str or list of str Additional options + cfg : str + Additional hints for target pipeline behavior + backend : str + Backend API, can be "opencl" or "vulkan" """ - if clml: - opts = ["-device=adreno", "--keys=adreno,opencl,gpu,clml", "-model=%s" % model] - else: - opts = ["-device=adreno", "--keys=adreno,opencl,gpu", "-model=%s" % model] + + if backend not in ["opencl", "vulkan"]: + raise ValueError(f"Unsupported API: {backend}. 
Must be 'opencl' or 'vulkan'.") + + keys = f"adreno,{backend},gpu" + if cfg: + keys += f",{cfg}" + + opts = ["-device=adreno", f"--keys={keys}", f"-model={model}"] opts = _merge_opts(opts, options) - return Target(" ".join(["opencl"] + opts)) + return Target(" ".join([backend] + opts)) def create(target): diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py index da22cf77466f..fc9a2296e786 100644 --- a/python/tvm/testing/utils.py +++ b/python/tvm/testing/utils.py @@ -918,6 +918,17 @@ def _multi_gpu_exists(): parent_features="cuda", ) +# Mark a test as requiring the OpenCL runtime on remote RPC +requires_adreno_opencl = Feature( + "opencl", + long_name="Remote Adreno OpenCL", + cmake_flag="USE_OPENCL", + target_kind_enabled="opencl", + target_kind_hardware=None, + parent_features="gpu", + run_time_check=lambda: os.getenv("RPC_TARGET") is not None, +) + # Mark a test as requiring the OpenCL runtime requires_opencl = Feature( "opencl", diff --git a/python/tvm/tir/__init__.py b/python/tvm/tir/__init__.py index 0a598e5e9bb9..74f2f6b2f757 100644 --- a/python/tvm/tir/__init__.py +++ b/python/tvm/tir/__init__.py @@ -126,3 +126,4 @@ from .build import build from .pipeline import get_tir_pipeline, get_default_tir_pipeline from .functor import PyStmtExprVisitor, PyStmtExprMutator +from . import backend diff --git a/python/tvm/tir/backend/__init__.py b/python/tvm/tir/backend/__init__.py new file mode 100644 index 000000000000..bd04016a2bb3 --- /dev/null +++ b/python/tvm/tir/backend/__init__.py @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""TIR backends""" + +from . import adreno diff --git a/python/tvm/tir/backend/adreno/__init__.py b/python/tvm/tir/backend/adreno/__init__.py new file mode 100644 index 000000000000..cb59e73b57e2 --- /dev/null +++ b/python/tvm/tir/backend/adreno/__init__.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""The TIR Adreno backend passes """ + +from . 
import transform +from .pipeline import get_tir_pipeline diff --git a/python/tvm/tir/backend/adreno/pipeline.py b/python/tvm/tir/backend/adreno/pipeline.py new file mode 100644 index 000000000000..1efc8e83963c --- /dev/null +++ b/python/tvm/tir/backend/adreno/pipeline.py @@ -0,0 +1,160 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=invalid-name +"""The TIR backend compilation pipeline for Adreno""" + +import tvm +from tvm import tir + + +def default_tir_pipeline(): + """The default tir pipeline used in tvm.tir.build""" + + @tvm.transform.module_pass(opt_level=0) + def _pipeline(mod: tvm.ir.IRModule, _ctx: tvm.transform.PassContext) -> tvm.ir.IRModule: + """The default lowering passes for TIR backend.""" + pass_ctx = tvm.transform.PassContext.current() + config = pass_ctx.config + passes = [ + tir.backend.adreno.transform.TextureFlatten(), + tir.transform.CanonicalizeLoop(), + tir.transform.LowerCrossThreadReduction(), + tir.transform.LowerInitBlock(), + tir.transform.PlanAndUpdateBufferAllocationLocation(), + tir.transform.ConvertBlocksToOpaque(), + tir.transform.LiftThreadBinding(), + tir.transform.ManifestSharedMemoryLocalStage(), + tir.transform.CompactBufferAllocation(), + tir.transform.LowerAutoCopy(), + tir.transform.UnifyThreadBinding(), + tir.transform.LowerMatchBuffer(), + tir.transform.Simplify(), + tir.transform.InjectPermutedLayout(), + tir.transform.AnnotateIrregularLoop(), + tir.transform.InjectSoftwarePipeline(), + tir.transform.TransformMmaBufferLayout(), + tir.transform.LowerOpaqueBlock(), + tir.backend.adreno.transform.InjectTextureAlloc(), + tir.transform.FlattenBuffer(), + tir.transform.BF16ComputeLegalize(), + tir.transform.NarrowDataType(32), + tir.transform.LoopPartition(), + tir.transform.VectorizeLoop(not bool(config.get("tir.disable_vectorize", False))), + tir.transform.InjectVirtualThread(), + tir.transform.InjectDoubleBuffer(), + ] + if not bool(config.get("tir.disable_storage_rewrite", False)): + passes.append(tir.transform.StorageRewrite()) + if config.get("tir.use_async_copy", False): + passes.append(tir.transform.LowerAsyncDMA()) + passes.extend( + [ + tir.transform.HoistIfThenElse(), + tir.transform.UnrollLoop(), + tir.transform.RenormalizeSplitPattern(), + tir.transform.Simplify(), + tir.transform.RemoveNoOp(), + tir.transform.RewriteUnsafeSelect(), + ] + ) + # Additional passes based on configuration. 
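+        # The conditional passes below mirror the stock TIR pipeline and are driven by
+        # PassContext config flags: "tir.instrument_bound_checkers", "tir.ptx_ldg32",
+        # "tir.disable_cse_tir", "tir.enable_equiv_terms_in_cse_tir",
+        # "tir.instrument_lwp", "tir.detect_global_barrier" and "tir.use_async_copy".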
+ if bool(config.get("tir.instrument_bound_checkers", False)): + passes.append(tir.transform.InstrumentBoundCheckers()) + if bool(config.get("tir.ptx_ldg32", False)): + passes.append(tir.transform.InjectPTXLDG32(True)) + passes.append( + tir.transform.CommonSubexprElimTIR( + not bool(config.get("tir.disable_cse_tir", False)), + bool(config.get("tir.enable_equiv_terms_in_cse_tir", False)), + ) + ) + if bool(config.get("tir.instrument_lwp", False)): + passes.append(tir.transform.InstrumentProfileIntrinsics()) + passes.extend( + [ + # Bind the target first so that target-specific attributes are available. + tir.transform.FP8ComputeLegalize(), + # VerifyVTCMLimit must occur before LowerVtcmAlloc. + tir.transform.VerifyVTCMLimit(), + tir.transform.LowerVtcmAlloc(), + tir.transform.VerifyMemory(), + tir.transform.AnnotateEntryFunc(), + ] + ) + if bool(config.get("tir.detect_global_barrier", False)): + passes.append(tir.transform.ThreadSync("global")) + passes.extend( + [ + tir.transform.ThreadSync("shared"), + tir.transform.ThreadSync("shared.dyn"), + tir.transform.ThreadSync("warp"), + tir.transform.InferFragment(), + tir.transform.LowerThreadAllreduce(), + ] + ) + if bool(config.get("tir.use_async_copy", False)): + passes.append(tir.transform.InjectPTXAsyncCopy()) + if bool(config.get("tir.ptx_ldg32", False)): + passes.append(tir.transform.InjectPTXLDG32()) + passes.extend( + [ + tir.transform.AnnotateDeviceRegions(), + tir.transform.SplitHostDevice(), + # MergeSharedMemoryAllocations must follow SplitHostDevice. + tir.transform.MergeSharedMemoryAllocations(), + tir.transform.MakePackedAPI(), + tir.transform.FP8StorageLegalize(), + tir.transform.BF16StorageLegalize(), + tir.transform.LowerDeviceKernelLaunch(), + ] + ) + mod = tvm.ir.transform.Sequential(passes)(mod) + return mod + + return _pipeline + + +def finalize_host_passes(): # pylint: disable=unused-argument + """The default finalization passes for TIR backend.""" + host_pass_list = [ + tir.transform.LowerTVMBuiltin(), + tir.transform.LowerCustomDatatypes(), + tir.transform.LowerIntrin(), + tir.transform.LowerDeviceStorageAccessInfo(), + tir.transform.CombineContextCall(), + ] + return tvm.ir.transform.Sequential(host_pass_list) + + +def finalize_device_passes(): # pylint: disable=unused-argument + """The default finalization passes for TIR backend.""" + device_pass_list = [ + tir.transform.LowerWarpMemory(), + tir.transform.Simplify(), + tir.transform.LowerCustomDatatypes(), + tir.transform.LowerDeviceStorageAccessInfo(), + tir.transform.LowerIntrin(), + ] + return tvm.ir.transform.Sequential(device_pass_list) + + +def get_tir_pipeline( + target: tvm.target.Target, # pylint: disable=unused-argument +) -> tvm.transform.Pass: + """Get the TIR pipeline for Adreno GPU.""" + return default_tir_pipeline() diff --git a/python/tvm/tir/backend/adreno/transform/__init__.py b/python/tvm/tir/backend/adreno/transform/__init__.py new file mode 100644 index 000000000000..25c83f604916 --- /dev/null +++ b/python/tvm/tir/backend/adreno/transform/__init__.py @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Adreno TIR transformations. """ + +from .transform import ( + InjectTextureAlloc, + TextureFlatten, +) diff --git a/python/tvm/tir/backend/adreno/transform/_ffi_api.py b/python/tvm/tir/backend/adreno/transform/_ffi_api.py new file mode 100644 index 000000000000..7df0ae3eae47 --- /dev/null +++ b/python/tvm/tir/backend/adreno/transform/_ffi_api.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""FFI APIs for tvm.tir.backend.adreno.transform""" +import tvm_ffi + + +tvm_ffi.init_ffi_api("tir.backend.adreno.transform", __name__) diff --git a/python/tvm/tir/backend/adreno/transform/transform.py b/python/tvm/tir/backend/adreno/transform/transform.py new file mode 100644 index 000000000000..03cc162a4bad --- /dev/null +++ b/python/tvm/tir/backend/adreno/transform/transform.py @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Wrapping existing transformations.""" +# pylint: disable=invalid-name, unsupported-binary-operation + +from . import _ffi_api + + +def InjectTextureAlloc(): + """Inject Texture Allocation Intrinsic to make sure appropriate lowering + via alloc_nd/alloc_free calls + Returns + ------- + fpass : tvm.transform.Pass + The result pass + """ + return _ffi_api.InjectTextureAlloc() # type: ignore + + +def TextureFlatten(): + """Flatten the multi-dimensional read/write to 2D. 
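+
+    For the Adreno backend this lowers reads and writes on texture-scoped
+    buffers into image (texture) coordinates.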
+ + + Parameters + ---------- + + Returns + ------- + fpass : tvm.transform.Pass + The result pass + """ + return _ffi_api.TextureFlatten() # type: ignore diff --git a/python/tvm/tir/pipeline.py b/python/tvm/tir/pipeline.py index 96ed9dfdbc96..1ee4a5b1d315 100644 --- a/python/tvm/tir/pipeline.py +++ b/python/tvm/tir/pipeline.py @@ -21,6 +21,8 @@ import tvm from tvm import tir +from . import backend + def default_tir_pipeline(): """The default tir pipeline used in tvm.tir.build""" @@ -176,4 +178,7 @@ def get_default_tir_pipeline( target: tvm.target.Target, # pylint: disable=unused-argument ) -> tvm.transform.Pass: """Get the default TIR pipeline for the given target.""" - return default_tir_pipeline() + if target.kind.name == "opencl" and "adreno" in target.keys: + return backend.adreno.get_tir_pipeline(target) + else: + return default_tir_pipeline() diff --git a/src/relax/backend/adreno/annotate_custom_storage.cc b/src/relax/backend/adreno/annotate_custom_storage.cc index 887b81872940..f42c5c456124 100644 --- a/src/relax/backend/adreno/annotate_custom_storage.cc +++ b/src/relax/backend/adreno/annotate_custom_storage.cc @@ -280,30 +280,58 @@ class CollectConsumerScopeInfo : public ExprVisitor { mod_ = mod; target_ = target; VisitExpr(func->body); - // Extend the scope for tuple items - for (const auto& val : arg_to_binding) { + + for (const auto& val : tuple_item_to_binding) { if (scope_info.find(val.first) != scope_info.end()) { - if (scope_info.find(val.second) == scope_info.end()) { - scope_info.Set(val.second, scope_info[val.first]); - } else { - auto ent = scope_info[val.second]; - for (auto ent_val : scope_info[val.first]) { - ent.Set(ent_val.first, ent_val.second); + for (const auto& item_val : val.second) { + // TODO(Siva): How about ops that generate tuples like split ? + if (tuples_to_binding.find(item_val.first) == tuples_to_binding.end()) { + continue; + } + auto producer_var = tuples_to_binding[item_val.first][item_val.second]; + if (scope_info.find(producer_var) == scope_info.end()) { + scope_info.Set(producer_var, scope_info[val.first]); + } else { + auto ent = scope_info[producer_var]; + for (auto ent_val : scope_info[val.first]) { + ent.Set(ent_val.first, ent_val.second); + } + scope_info.Set(producer_var, ent); } - scope_info.Set(val.second, ent); } } } - return std::make_pair(call_scope_info, scope_info); } void VisitBinding_(const VarBindingNode* binding, const TupleGetItemNode* tuple_get_item_node) final { - if (arg_to_binding.find(ffi::GetRef(binding->var.get())) == arg_to_binding.end()) { - arg_to_binding.Set(ffi::GetRef(binding->var.get()), - ffi::GetRef(tuple_get_item_node->tuple.get())); + /* + * lv9 = R.call_tir(add, (lv7, m + * lv: R.Tuple( + * R.Tensor((1, 3, 224, 224), dtype="float32"), + * R.Tensor((3,), dtype="float32"), + * R.Tensor((3,), dtype="float32") + * ) = lv9, metadata["relax.expr.Constant"][4], metadata["relax.expr.Constant"][5] + * lv1_1: R.Tensor((1, 3, 224, 224), dtype="float32") = lv[0] + * lv4: R.Tensor((1, 64, 112, 112), dtype="float32") = R.nn.conv2d(lv1_1, ..... 
+ * + * lv1_1 scope is requested by conv2d now we need to populate the same to lv9 + * Capture essestial information here as + * + * lv => {(0, lv1_1), (1, ) ...} + */ + ffi::Map field_map; + if (tuple_item_to_binding.find(ffi::GetRef(binding->var.get())) != + tuple_item_to_binding.end()) { + field_map = tuple_item_to_binding[ffi::GetRef(binding->var.get())]; } + field_map.Set(ffi::GetRef(tuple_get_item_node->tuple.get()), tuple_get_item_node->index); + tuple_item_to_binding.Set(ffi::GetRef(binding->var.get()), field_map); + } + + void VisitBinding_(const VarBindingNode* binding, const TupleNode* tuple) final { + tuples_to_binding.Set(ffi::GetRef(binding->var.get()), tuple->fields); } void VisitExpr_(const CallNode* call) final { @@ -325,17 +353,18 @@ class CollectConsumerScopeInfo : public ExprVisitor { func_args = Tuple(call->args); } - bool is_texture_supported = SupportsTexture(op_attrs, op_pattern.value()); + auto is_texture_supported = SupportsTexture(op_attrs, op_pattern.value()); ffi::Array arg_scope; - for (auto arg : func_args->fields) { - auto sinfo = GetStructInfo(arg); + for (uint32_t i = 0; i < func_args->fields.size(); ++i) { + auto sinfo = GetStructInfo(func_args->fields[i]); if (auto tensor_sinfo = sinfo.as()) { - auto scope = is_texture_supported - ? Scope(GetShapeFromTensorStructInfo(tensor_sinfo.value())) - : "global"; + bool is_texture = + i < is_texture_supported.size() ? is_texture_supported[i] : is_texture_supported[0]; + auto scope = + is_texture ? Scope(GetShapeFromTensorStructInfo(tensor_sinfo.value())) : "global"; ffi::Map> ent_call; - const VarNode* arg_var = arg.as(); + const VarNode* arg_var = func_args->fields[i].as(); if (scope_info.find(ffi::GetRef(arg_var)) != scope_info.end()) { ent_call = scope_info[ffi::GetRef(arg_var)]; } @@ -368,28 +397,29 @@ class CollectConsumerScopeInfo : public ExprVisitor { return op_pat; } - bool SupportsTexture(const ffi::Array& op_attrs, Integer op_pattern) { - if (op_pattern.IntValue() < OpPatternKind::kCommReduce) return true; + std::vector SupportsTexture(const ffi::Array& op_attrs, Integer op_pattern) { + if (op_pattern.IntValue() < OpPatternKind::kCommReduce) return {true}; for (auto attr : op_attrs) { if (auto conv_attr = attr.as()) { if (conv_attr->data_layout == "NCHW4c" && conv_attr->kernel_layout == "OIHW4o") { - return true; + // No Texture for weights + return {true, false}; } } else if (auto pool_attrs = attr.as()) { if (pool_attrs->layout == "NCHW4c") { - return true; + return {true}; } } else if (auto avg_attrs = attr.as()) { if (avg_attrs->layout == "NCHW4c") { - return true; + return {true}; } } else if (attr.as()) { - return true; + return {true}; } } - return false; + return {false}; } std::string Scope(ffi::Array shape) { @@ -432,7 +462,8 @@ class CollectConsumerScopeInfo : public ExprVisitor { ffi::Map>> scope_info; /* A map of call node and scope info for each argument it consunes */ ffi::Map> call_scope_info; - ffi::Map arg_to_binding; + ffi::Map> tuple_item_to_binding; + ffi::Map> tuples_to_binding; IRModule mod_; Target target_; }; @@ -600,12 +631,9 @@ class DefineVDevice : ExprMutator { if (call->op == call_tir_op) { gv = Downcast(call->args[0]); - // tir::PrimFunc pfunc = Downcast(mod_->Lookup(gv)); - // out_sinfo = call->sinfo_args[0]; func_args = Downcast(call->args[1]); } else { func_args = Tuple(call->args); - // return call; } ffi::Array new_args; diff --git a/src/relax/backend/vm/codegen_vm.cc b/src/relax/backend/vm/codegen_vm.cc index e2d9b5b068b7..04e9880b262a 100644 --- 
a/src/relax/backend/vm/codegen_vm.cc +++ b/src/relax/backend/vm/codegen_vm.cc @@ -35,6 +35,7 @@ #include "../../../runtime/const_loader_module.h" #include "../../../target/source/codegen_source_base.h" +#include "../../transform/utils.h" namespace tvm { namespace relax { @@ -214,7 +215,15 @@ class CodeGenVM : public ExprFunctor { } Instruction::Arg VisitExpr_(const ConstantNode* op) final { - return builder_->ConvertConstant(op->data); + auto arg = builder_->ConvertConstant(op->data); + + if (auto tsinfo = op->struct_info_.as()) { + if (tsinfo->vdevice.defined()) { + VDevice vdev = tsinfo->vdevice.value(); + builder_->SaveMemoryScope(arg, vdev->memory_scope); + } + } + return arg; } Instruction::Arg VisitExpr_(const ShapeExprNode* op) final { @@ -333,11 +342,19 @@ class CodeGenVM : public ExprFunctor { } void EmitAllocTensor(const Call& call_node, RegName dst_reg) { - ICHECK_EQ(call_node->args.size(), 4); + ICHECK_EQ(call_node->args.size(), 5); std::vector args; - args.reserve(4); - for (Expr arg : call_node->args) { - args.push_back(this->VisitExpr(arg)); + for (int i = 0; i < 4; ++i) { + args.push_back(this->VisitExpr(call_node->args[i])); + } + int64_t vdevice_index = -1; + if (auto* prim_value_node = call_node->args[4].as()) { + vdevice_index = prim_value_node->value.as()->value; + } + auto vdevice = GetGlobalVDevice(ctx_mod_, vdevice_index); + + if (vdevice.defined()) { + args.push_back(this->VisitExpr(StringImm(vdevice.value()->memory_scope))); } builder_->EmitCall("vm.builtin.alloc_tensor", args, dst_reg); } diff --git a/src/relax/backend/vm/codegen_vm_tir.cc b/src/relax/backend/vm/codegen_vm_tir.cc index a5bb83d406a5..c783a9da3d2a 100644 --- a/src/relax/backend/vm/codegen_vm_tir.cc +++ b/src/relax/backend/vm/codegen_vm_tir.cc @@ -38,6 +38,8 @@ #include #include +#include "../../transform/utils.h" + namespace tvm { namespace relax { namespace codegen_vm { @@ -401,12 +403,21 @@ class CodeGenVMTIR : public ExprFunctor(const Expr&)> { } void EmitAllocTensor(const Call& call_node, int64_t dst_reg) { - ICHECK_EQ(call_node->args.size(), 4); + ICHECK_EQ(call_node->args.size(), 5); ffi::Array args; - args.reserve(4); - for (Expr arg : call_node->args) { - args.push_back(this->VisitExpr(arg).value()); + for (int i = 0; i < 4; ++i) { + args.push_back(this->VisitExpr(call_node->args[i]).value()); + } + int64_t vdevice_index = -1; + if (auto* prim_value_node = call_node->args[4].as()) { + vdevice_index = prim_value_node->value.as()->value; } + auto vdevice = GetGlobalVDevice(ctx_mod_, vdevice_index); + + if (vdevice.defined()) { + args.push_back(tir::StringImm(vdevice.value()->memory_scope)); + } + this->EmitCallPacked("vm.builtin.alloc_tensor", args, dst_reg); } diff --git a/src/relax/backend/vm/exec_builder.cc b/src/relax/backend/vm/exec_builder.cc index b893b48830ce..a2fef22609be 100644 --- a/src/relax/backend/vm/exec_builder.cc +++ b/src/relax/backend/vm/exec_builder.cc @@ -46,6 +46,10 @@ ObjectPtr ExecBuilderNode::Get() { return exec_; } +void ExecBuilderNode::SaveMemoryScope(vm::Instruction::Arg idx, ffi::String scope) { + exec_->memory_scopes[idx.value()] = scope; +} + vm::Instruction::Arg ExecBuilderNode::ConvertConstant_(Any cvalue) { // emit constant immediate as immediate. 
if (auto opt_int = cvalue.as()) { diff --git a/src/relax/backend/vm/lower_runtime_builtin.cc b/src/relax/backend/vm/lower_runtime_builtin.cc index 71b8413e9889..a1ffa4618423 100644 --- a/src/relax/backend/vm/lower_runtime_builtin.cc +++ b/src/relax/backend/vm/lower_runtime_builtin.cc @@ -92,7 +92,13 @@ class LowerRuntimeBuiltinMutator : public ExprMutator { Expr MakeMemAllocTensor(const Call& call) { PrimValue offset = Downcast(call->args[1]); DataTypeImm dtype = Downcast(call->args[3]); - return Call(vm_alloc_tensor_op_, {call->args[0], offset, call->args[2], dtype}, Attrs()); + + ffi::Array call_args = {call->args[0], offset, call->args[2], dtype}; + if (5 == call->args.size()) { + call_args.push_back(call->args[4]); + } + + return Call(vm_alloc_tensor_op_, call_args, Attrs()); } Expr MakeMemKillObject(const Call& call) { @@ -168,8 +174,10 @@ class LowerRuntimeBuiltinMutator : public ExprMutator { VDevice vdev = attrs->dst_vdevice; int dev_type = vdev->target->GetTargetDeviceType(); int dev_id = vdev->vdevice_id; + StringImm storage_scope = StringImm(vdev->memory_scope); args.push_back(PrimValue::Int64(dev_type)); args.push_back(PrimValue::Int64(dev_id)); + args.push_back(storage_scope); return Call(builtin_to_device_, args, call_node->attrs, {GetStructInfo(call_node)}); } diff --git a/src/relax/op/op.cc b/src/relax/op/op.cc index 54f9da4c786f..3acfb53b2784 100644 --- a/src/relax/op/op.cc +++ b/src/relax/op/op.cc @@ -23,6 +23,7 @@ #include #include +#include "../transform/utils.h" #include "op_common.h" namespace tvm { @@ -1200,6 +1201,15 @@ StructInfo InferStructInfoAllocateTensor(const Call& call, const BlockBuilder& c const DataTypeImm dtype_imm = ffi::GetRef(dtype_node); out_dtype = dtype_imm->value; } + int64_t vdevice_index = -1; + if (auto* prim_value_node = call->args[2].as()) { + vdevice_index = prim_value_node->value.as()->value; + } + auto vdevice = GetGlobalVDevice(ctx->GetContextIRModule(), vdevice_index); + + if (vdevice.defined()) { + return TensorStructInfo(call->args[0], out_dtype, vdevice.value()); + } return TensorStructInfo(call->args[0], out_dtype); } @@ -1266,28 +1276,55 @@ StructInfo InferStructInfoMemAllocTensor(const Call& call, const BlockBuilder& c const DataTypeImm dtype_imm = ffi::GetRef(dtype_node); out_dtype = dtype_imm->value; } + + if (call->args.size() == 5) { + int64_t vdevice_index = -1; + if (auto* prim_value_node = call->args[4].as()) { + vdevice_index = prim_value_node->value.as()->value; + } + auto vdevice = GetGlobalVDevice(ctx->GetContextIRModule(), vdevice_index); + if (vdevice.defined()) { + return TensorStructInfo(call->args[2], out_dtype, vdevice.value()); + } + } + return TensorStructInfo(call->args[2], out_dtype); } TVM_REGISTER_OP("relax.memory.alloc_tensor") - .set_num_inputs(4) + .set_num_inputs(5) .add_argument("storage", "Expr", "The storage to allocate the tensor to.") .add_argument("offset", "PrimValue", "Storage offset to allocate the tensor.") .add_argument("shape", "Expr", "The shape of the tensor to allocate.") .add_argument("dtype", "DataTypeImm", "The dtype of the tensor to allocate.") + .add_argument("runtime_device_index", "PrimValue", + "The device index indicating on which device the tensor is to be " + "allocated at runtime. 
Index -1 is reserved for the host device.") .set_attr("FInferStructInfo", InferStructInfoMemAllocTensor) // memory allocation isn't considered a "visible effect" as far as purity is concerned .set_attr("FPurity", Bool(true)) .set_attr("TAllocator", Bool(true)); -Expr MakeMemAllocTensor(Expr storage, PrimValue offset, Expr shape, DataTypeImm dtype) { +Expr MakeMemAllocTensor(Expr storage, PrimValue offset, Expr shape, DataTypeImm dtype, + PrimValue virtual_device_index) { static const Op& op = Op::Get("relax.memory.alloc_tensor"); - return Call(op, {storage, offset, shape, dtype}, Attrs(), {}); + return Call(op, {storage, offset, shape, dtype, virtual_device_index}, Attrs(), {}); } TVM_FFI_STATIC_INIT_BLOCK() { namespace refl = tvm::ffi::reflection; - refl::GlobalDef().def("relax.op.memory.alloc_tensor", MakeMemAllocTensor); + refl::GlobalDef().def_packed( + "relax.op.memory.alloc_tensor", [](ffi::PackedArgs args, ffi::Any* ret) { + if (args.size() == 5) { + *ret = MakeMemAllocTensor(args[0].cast(), args[1].cast(), + args[2].cast(), args[3].cast(), + args[4].cast()); + } else { + *ret = MakeMemAllocTensor(args[0].cast(), args[1].cast(), + args[2].cast(), args[3].cast(), + PrimValue::Int64(0)); + } + }); } // memory planning kill_storage @@ -1363,41 +1400,60 @@ StructInfo InferStructInfoVMAllocTensor(const Call& call, const BlockBuilder& ct const DataTypeImm dtype_imm = ffi::GetRef(dtype_node); out_dtype = dtype_imm->value; } + int64_t vdevice_index = -1; + if (auto* prim_value_node = call->args[4].as()) { + vdevice_index = prim_value_node->value.as()->value; + } + auto vdevice = GetGlobalVDevice(ctx->GetContextIRModule(), vdevice_index); + if (const auto* output_shape = call->args[2].as()) { - return TensorStructInfo(ffi::GetRef(output_shape), out_dtype); + return TensorStructInfo(ffi::GetRef(output_shape), out_dtype, vdevice); } else if (const auto* shape_sinfo = GetStructInfoAs(call->args[2])) { if (shape_sinfo->values.defined()) { - return TensorStructInfo(ShapeExpr(shape_sinfo->values.value()), out_dtype); + return TensorStructInfo(ShapeExpr(shape_sinfo->values.value()), out_dtype, vdevice); } else { - return TensorStructInfo(out_dtype, shape_sinfo->ndim); + return TensorStructInfo(out_dtype, shape_sinfo->ndim, vdevice); } } - return TensorStructInfo(out_dtype, kUnknownNDim); + return TensorStructInfo(out_dtype, kUnknownNDim, vdevice); } TVM_REGISTER_OP("relax.vm.alloc_tensor") - .set_num_inputs(4) + .set_num_inputs(5) .add_argument("storage", "Expr", "The storage to allocate the tensor to.") .add_argument("offset", "PrimValue", "Storage offset to allocate the tensor.") .add_argument("shape", "Expr", "The shape of the tensor to allocate.") .add_argument("dtype", "DataTypeImm", "The dtype of the tensor to allocate.") + .add_argument("runtime_device_index", "PrimValue", + "The device index indicating on which device the tensor is " + "to be allocated at runtime.") .set_attr("FInferStructInfo", InferStructInfoVMAllocTensor) // memory allocation isn't considered a "visible effect" as far as purity is concerned .set_attr("FPurity", Bool(true)) .set_attr("TAllocator", Bool(true)); -Expr MakeVMAllocTensor(Expr storage, PrimValue offset, Expr shape, DataTypeImm dtype) { +Expr MakeVMAllocTensor(Expr storage, PrimValue offset, Expr shape, DataTypeImm dtype, + PrimValue runtime_device_index) { static const Op& op = Op::Get("relax.vm.alloc_tensor"); - return Call(op, {storage, offset, shape, dtype}, Attrs(), {}); + return Call(op, {storage, offset, shape, dtype, runtime_device_index}, Attrs(), {}); 
} TVM_FFI_STATIC_INIT_BLOCK() { namespace refl = tvm::ffi::reflection; - refl::GlobalDef().def("relax.op.vm.alloc_tensor", MakeVMAllocTensor); + refl::GlobalDef().def_packed("relax.op.vm.alloc_tensor", [](ffi::PackedArgs args, ffi::Any* ret) { + if (args.size() == 5) { + *ret = + MakeVMAllocTensor(args[0].cast(), args[1].cast(), args[2].cast(), + args[3].cast(), args[4].cast()); + } else { + *ret = + MakeVMAllocTensor(args[0].cast(), args[1].cast(), args[2].cast(), + args[3].cast(), PrimValue::Int64(0)); + } + }); } // vm kill_object - TVM_REGISTER_OP("relax.vm.kill_object") .set_num_inputs(1) .add_argument("obj", "Expr", "The object to be killed.") diff --git a/src/relax/op/op_common.h b/src/relax/op/op_common.h index 5a556cbd7413..5c4f563bebee 100644 --- a/src/relax/op/op_common.h +++ b/src/relax/op/op_common.h @@ -211,7 +211,17 @@ inline StructInfo InferStructInfoUnary(const Call& call, const BlockBuilder& ctx } auto output_sinfo = ffi::make_object(*input_sinfo.get()); output_sinfo->dtype = f_compute_out_dtype(input_sinfo); - return TensorStructInfo(output_sinfo); + if (call->sinfo_args.size() > 0) { + auto defined_sinfo = call->sinfo_args[0].as(); + ICHECK(defined_sinfo); + auto shape = output_sinfo->GetShape(); + ICHECK(shape.defined()); + ICHECK(defined_sinfo->vdevice.has_value()); + return TensorStructInfo(ShapeExpr(shape.value()), output_sinfo->dtype, + defined_sinfo->vdevice.value()); + } else { + return TensorStructInfo(output_sinfo); + } } /*! @@ -568,7 +578,8 @@ inline ffi::Optional CheckNdimPerLayoutAndGetShape(const Call& call, Expr MakeVMAllocStorage(Expr size, PrimValue runtime_device_index, DataTypeImm dtype, StringImm storage_scope = StringImm("global")); -Expr MakeVMAllocTensor(Expr storage, PrimValue offset, Expr shape, DataTypeImm dtype); +Expr MakeVMAllocTensor(Expr storage, PrimValue offset, Expr shape, DataTypeImm dtype, + PrimValue runtime_device_index); Expr MakeAllocTensor(Expr shape, DataTypeImm dtype, PrimValue runtime_device_index, StringImm storage_scope = StringImm("global")); diff --git a/src/relax/transform/call_tir_rewrite.cc b/src/relax/transform/call_tir_rewrite.cc index d4763b44b713..2c0e515be7ec 100644 --- a/src/relax/transform/call_tir_rewrite.cc +++ b/src/relax/transform/call_tir_rewrite.cc @@ -81,15 +81,20 @@ class CallTIRMutator : public ExprMutator { ICHECK(tensor_sinfo->shape.defined()) << "the TensorStructInfo shape of call_tir has not populated"; int dev_index = 0; + ffi::String scope = "global"; if (tensor_sinfo->vdevice.defined()) { dev_index = GetDeviceIndex(mod_, tensor_sinfo->vdevice.value()); + scope = tensor_sinfo->vdevice.value()->memory_scope; + } else { + dev_index = GetDeviceIndexByScope(mod_, scope); } + if (!is_inplace) { outs.push_back(builder_->Emit(Call(alloc_tensor_op, {Downcast(tensor_sinfo->shape.value()), DataTypeImm(tensor_sinfo->dtype), - PrimValue::Int64(dev_index), StringImm("global")}, - Attrs()), + PrimValue::Int64(dev_index), StringImm(scope)}, + Attrs(), {tensor_sinfo}), "alloc")); } else { // if there is only one output, it must be an in-place argument, but check anyway @@ -112,13 +117,21 @@ class CallTIRMutator : public ExprMutator { ICHECK(field_tensor->shape.defined()) << "call_tir expects all TensorStructInfo has shape, but got " << field_tensor << " as an element of TupleStructInfo"; + + int dev_index = 0; + ffi::String scope = "global"; + if (field_tensor->vdevice.defined()) { + dev_index = GetDeviceIndex(mod_, field_tensor->vdevice.value()); + scope = field_tensor->vdevice.value()->memory_scope; + } + if 
(!is_inplace || inplace_attrs->inplace_indices[i].IntValue() == -1) { - outs.push_back(builder_->Emit( - Call(alloc_tensor_op, - {Downcast(field_tensor->shape.value()), - DataTypeImm(field_tensor->dtype), PrimValue::Int64(0), StringImm("global")}, - Attrs()), - "alloc")); + outs.push_back(builder_->Emit(Call(alloc_tensor_op, + {Downcast(field_tensor->shape.value()), + DataTypeImm(field_tensor->dtype), + PrimValue::Int64(dev_index), StringImm(scope)}, + Attrs(), {field_tensor}), + "alloc")); } else { outs.push_back(Downcast(call->args[1]) ->fields[inplace_attrs->inplace_indices[i].IntValue()]); diff --git a/src/relax/transform/lower_alloc_tensor.cc b/src/relax/transform/lower_alloc_tensor.cc index d1e61b1c5748..d63cd15744bd 100644 --- a/src/relax/transform/lower_alloc_tensor.cc +++ b/src/relax/transform/lower_alloc_tensor.cc @@ -24,11 +24,16 @@ #include #include +#include "utils.h" + namespace tvm { namespace relax { namespace { class Mutator : public ExprMutator { + public: + explicit Mutator(IRModule mod) : ctx_mod_(mod) {} + using ExprMutator::VisitExpr_; Expr VisitExpr_(const CallNode* op) override { static const Op& alloc_tensor_op = Op::Get("relax.builtin.alloc_tensor"); @@ -71,23 +76,56 @@ class Mutator : public ExprMutator { return nbytes; }(); + ShapeExpr size({nbytes}); + + int64_t vdevice_index = -1; + if (auto* prim_value_node = op->args[2].as()) { + vdevice_index = prim_value_node->value.as()->value; + } + ffi::Optional vdevice = GetGlobalVDevice(ctx_mod_, vdevice_index); + + if (vdevice.defined()) { + std::string dev_kind = vdevice.value()->target->kind->name; + PrimExpr dev_size = tir::make_const(DataType::Int(64), 1); + if (vdevice.value()->memory_scope != "global") { + auto device_size_handler = + tvm::ffi::Function::GetGlobal(std::string("DeviceGetMemSize.") + dev_kind); + if (device_size_handler.has_value()) { + dev_size *= + (*device_size_handler)(shape, dtype->value, vdevice.value()).cast(); + size = ShapeExpr({dev_size}); + } + auto device_scope_handler = + tvm::ffi::Function::GetGlobal(std::string("DeviceScopeCompatibility.") + dev_kind); + if (device_scope_handler.has_value()) { + ffi::String dev_scope = + (*device_scope_handler)(vdevice.value()->target, vdevice.value()->memory_scope) + .cast(); + storage_scope = StringImm(dev_scope); + } + } + } + auto offset = PrimValue::Int64(0); - Expr storage = - relax::Call(mem_alloc_storage_op, {ShapeExpr({nbytes}), runtime_device_index, - storage_scope, DataTypeImm(DataType::UInt(8))}); + Expr storage = relax::Call(mem_alloc_storage_op, {size, runtime_device_index, storage_scope, + DataTypeImm(DataType::UInt(8))}); storage = builder_->Emit(storage, "storage"); - Expr tensor = relax::Call(mem_alloc_tensor_op, {storage, offset, shape_arg, dtype}); + Expr tensor = + relax::Call(mem_alloc_tensor_op, {storage, offset, shape_arg, dtype, op->args[2]}); return tensor; } else { return ExprMutator::VisitExpr_(op); } } + + private: + IRModule ctx_mod_; }; } // namespace -Expr LowerAllocTensor(Expr expr) { - Mutator mutator; +Expr LowerAllocTensor(IRModule m, Expr expr) { + Mutator mutator(m); return mutator(expr); } @@ -95,7 +133,7 @@ namespace transform { Pass LowerAllocTensor() { auto pass_func = [=](Function func, IRModule m, PassContext pc) { - return Downcast(relax::LowerAllocTensor(std::move(func))); + return Downcast(relax::LowerAllocTensor(m, std::move(func))); }; return CreateFunctionPass(pass_func, /*opt_level=*/0, "LowerAllocTensor", {}); } diff --git a/src/relax/transform/specialize_primfunc_based_on_callsite.cc 
b/src/relax/transform/specialize_primfunc_based_on_callsite.cc index 6258e14b666d..bb2855b4b0bb 100644 --- a/src/relax/transform/specialize_primfunc_based_on_callsite.cc +++ b/src/relax/transform/specialize_primfunc_based_on_callsite.cc @@ -132,8 +132,9 @@ class SpecializeTIRCallArgs : ExprMutator { if (sinfo->vdevice.defined()) { scope = sinfo->vdevice.value()->memory_scope; } - const Buffer& buffer = - tir::decl_buffer(GetShapeFromTensorStructInfo(sinfo), sinfo->dtype, "ret_val", scope); + + const Buffer& buffer = tir::decl_buffer(GetShapeFromTensorStructInfo(sinfo), sinfo->dtype, + "ret_val_" + std::to_string(index), scope); param_map.Set(pfunc->params[args.size() + index], buffer); index++; } diff --git a/src/relax/transform/static_plan_block_memory.cc b/src/relax/transform/static_plan_block_memory.cc index fc3c2259ff9a..cebbaa4ce5ac 100644 --- a/src/relax/transform/static_plan_block_memory.cc +++ b/src/relax/transform/static_plan_block_memory.cc @@ -77,6 +77,9 @@ #include #include +#include "../../runtime/texture.h" +#include "utils.h" + namespace tvm { namespace relax { @@ -105,6 +108,8 @@ class StorageTokenNode : public Object { DataType dtype; /*! \brief The memory scope of the token. */ std::string storage_scope; + /*! \brief The VDevice information. */ + ffi::Optional vdevice; /*! \brief The storage id, reserved for debug and demo use. */ int storage_id{-1}; @@ -129,23 +134,51 @@ class StorageTokenNode : public Object { */ class StorageToken : public ObjectRef { public: - explicit StorageToken(ffi::Array shape, DataType dtype, std::string storage_scope) { + explicit StorageToken(ffi::Array shape, DataType dtype, std::string storage_scope, + ffi::Optional vdevice = std::nullopt) { // Compute the tensor size from the shape. int64_t const_coeff = dtype.bytes() * dtype.lanes(); PrimExpr size = tir::make_const(DataType::Int(64), 1); - for (const PrimExpr& dim_len : shape) { - if (const IntImmNode* const_dim_len = dim_len.as()) { - const_coeff *= const_dim_len->value; - } else { - size *= dim_len; + bool size_computed = false; + + if (vdevice.defined()) { + VDevice vdev = vdevice.value(); + std::string dev_kind = vdev->target->kind->name; + + if (vdev->memory_scope != "global") { + auto device_size_handler = + tvm::ffi::Function::GetGlobal(std::string("DeviceGetMemSize." + dev_kind)); + if (device_size_handler.has_value()) { + size *= (*device_size_handler)(shape, dtype, vdevice.value()).cast(); + size_computed = true; + } + auto device_scope_handler = + tvm::ffi::Function::GetGlobal(std::string("DeviceScopeCompatibility." + dev_kind)); + if (device_scope_handler.has_value()) { + ffi::String dev_scope = + (*device_scope_handler)(vdevice.value()->target, vdevice.value()->memory_scope) + .cast(); + storage_scope = dev_scope; + } + } + } + if (!size_computed) { + for (const PrimExpr& dim_len : shape) { + if (const IntImmNode* const_dim_len = dim_len.as()) { + const_coeff *= const_dim_len->value; + } else { + size *= dim_len; + } } } + size = tir::make_const(DataType::Int(64), const_coeff) * size; ObjectPtr n = ffi::make_object(); n->bytes = size; n->dtype = dtype; n->storage_scope = std::move(storage_scope); + n->vdevice = std::move(vdevice); data_ = std::move(n); } TVM_FFI_DEFINE_OBJECT_REF_METHODS_NOTNULLABLE(StorageToken, ObjectRef, StorageTokenNode); @@ -159,9 +192,9 @@ using Tokens = NestedMsg; * \note We can generalize this implementation to multi-dimensional memory * following the same flow in the future. 
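+ * \note Renamed to TokenAllocatorMixed below: tokens may now carry VDevice
+ *       information, in which case texture-backed storage sizes are computed
+ *       through the per-device "DeviceGetMemSize.<kind>" registry hook.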
*/ -class TokenAllocator1D { +class TokenAllocatorMixed { public: - explicit TokenAllocator1D(arith::Analyzer* analyzer) : analyzer_(analyzer) {} + explicit TokenAllocatorMixed(arith::Analyzer* analyzer) : analyzer_(analyzer) {} /*! * \brief Request a storage token from the available token pool for a @@ -606,7 +639,14 @@ class StorageAllocatorInit : public StorageAllocatorBaseVisitor { // Create and set token. StringImm storage_scope = Downcast(call->args[3]); - StorageToken token(upper_bounded_shape, sinfo->dtype, storage_scope->value); + + int64_t vdevice_index = -1; + if (auto* prim_value_node = call->args[2].as()) { + vdevice_index = prim_value_node->value.as()->value; + } + ffi::Optional vdevice = GetGlobalVDevice(ctx_mod_, vdevice_index); + + StorageToken token(upper_bounded_shape, sinfo->dtype, storage_scope->value, vdevice); Tokens tokens(token); SetTokens(call, tokens); @@ -823,7 +863,7 @@ class StorageAllocator : public StorageAllocatorBaseVisitor { /*! \brief Number of allocated storages. */ int n_storage_{0}; /*! \brief The 1D memory allocator. */ - TokenAllocator1D allocator_; + TokenAllocatorMixed allocator_; /*! \brief The mapping from each token to the tensors that are currently using it. */ std::unordered_map> token2cur_tensor_; }; @@ -905,7 +945,8 @@ class StorageAllocationRewriter : public ExprMutator { // And always create a `memory.alloc_tensor` for the old `builtin.alloc_tensor`. PrimValue offset = PrimValue::Int64(0); DataType dtype = sinfo->dtype; - return Call(mem_alloc_tensor, {storage_var, offset, sinfo->shape.value(), DataTypeImm(dtype)}, + return Call(mem_alloc_tensor, + {storage_var, offset, sinfo->shape.value(), DataTypeImm(dtype), call->args[2]}, Attrs()); } else if (plan_dynamic_output_ && call->op == alloc_tensor_op) { // Case 2. 
For a `alloc_tensor` that is not planned for memory reuse, @@ -936,7 +977,8 @@ class StorageAllocationRewriter : public ExprMutator { return Call(mem_alloc_tensor, {storage, // /*offset=*/PrimValue::Int64(0), /*shape=*/ffi::GetRef(shape), // - /*dtype=*/DataTypeImm(sinfo->dtype)}); + /*dtype=*/DataTypeImm(sinfo->dtype), + /*vdevice_index=*/call->args[2]}); } } @@ -990,6 +1032,32 @@ TVM_FFI_STATIC_INIT_BLOCK() { refl::GlobalDef().def("relax.transform.StaticPlanBlockMemory", StaticPlanBlockMemory); } +PrimExpr GetTextureMemorySizeFromVDevice(ffi::Array pshape, DataType dtype, + VDevice vdevice) { + int image_row_align = vdevice->target->GetAttr("image_base_address_alignment") + .value_or(Integer(64)) + ->value; + + struct Shape { + const ffi::Array& shape; + int64_t operator[](size_t i) const { + ICHECK(tir::as_const_int(shape[i])) << "Dymamic shapes not suported over texture now"; + return *tir::as_const_int(shape[i]); + } + int size() { return this->shape.size(); } + }; + auto shape = Shape{pshape}; + + size_t size = runtime::GetTextureMemorySize(shape, dtype.bytes() * 8, dtype.lanes(), + vdevice->memory_scope, image_row_align); + return tir::make_const(DataType::Int(64), size); +} + +TVM_FFI_STATIC_INIT_BLOCK() { + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("DeviceGetMemSize.opencl", GetTextureMemorySizeFromVDevice); +} + } // namespace transform } // namespace relax } // namespace tvm diff --git a/src/relax/transform/utils.h b/src/relax/transform/utils.h index 91d75079f73d..2cce918f4ef7 100644 --- a/src/relax/transform/utils.h +++ b/src/relax/transform/utils.h @@ -33,6 +33,7 @@ #include #include #include +#include #include #include "../../support/array.h" @@ -383,6 +384,19 @@ inline ffi::String GetCodegenName(const std::string& composite_name) { return composite_name.substr(0, delim_pos); } +inline int GetDeviceIndexByScope(const IRModule& mod, const ffi::String& scope) { + if (mod->global_infos.find("vdevice") == mod->global_infos.end()) { + return 0; + } + ffi::Array vdevices = mod->global_infos["vdevice"]; + for (int i = 0; i < static_cast(vdevices.size()); ++i) { + if (scope == vdevices[i].as().value()->memory_scope) { + return i; + } + } + return 0; +} + inline int GetDeviceIndex(const IRModule& mod, const VDevice& vdevice) { ffi::Array vdevices = mod->global_infos["vdevice"]; for (int i = 0; i < static_cast(vdevices.size()); ++i) { @@ -394,6 +408,17 @@ inline int GetDeviceIndex(const IRModule& mod, const VDevice& vdevice) { return -1; } +inline ffi::Optional GetGlobalVDevice(const IRModule& mod, const int index) { + ffi::Optional ret; + if (mod->global_infos.find("vdevice") != mod->global_infos.end()) { + ffi::Array vdevices = mod->global_infos["vdevice"]; + if (index < static_cast(vdevices.size())) { + ret = vdevices[index].as(); + } + } + return ret; +} + /* \brief Eliminate common subexpressions * * Utility for simplifying relax expressions by removing common diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index 933cd0b7a7cf..99d5f2be2a9d 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -187,6 +187,8 @@ inline const char* CLGetErrorString(cl_int error) { inline cl_channel_type DTypeToOpenCLChannelType(DLDataType data_type) { DataType dtype(data_type); + dtype = dtype.with_lanes(1); + if (dtype == DataType::Float(32)) { return CL_FLOAT; } else if (dtype == DataType::Float(16)) { @@ -343,7 +345,6 @@ class OpenCLWorkspace : public DeviceAPI { void* AllocDataSpaceView(Device dev, 
void* data, ffi::Shape shape, DLDataType dtype, ffi::Optional mem_scope = std::nullopt); void FreeDataSpaceView(Device dev, void* ptr); - cl_device_id GetCLDeviceID(int device_id); // override device API void SetDevice(Device dev) final; @@ -351,7 +352,7 @@ class OpenCLWorkspace : public DeviceAPI { void* AllocDataSpace(Device dev, size_t size, size_t alignment, DLDataType type_hint) final; void* AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, ffi::Optional mem_scope = std::nullopt) final; - void* AllocDataSpace(Device dev, size_t width, size_t height, DLDataType type_hint, + void* AllocDataSpace(Device dev, size_t width, size_t height, size_t depth, DLDataType type_hint, ffi::Optional mem_scope = std::nullopt); void* GetNativePtr(const tvm::runtime::Tensor& narr); void SetNativePtr(const tvm::runtime::Tensor& narr, void* host_ptr, size_t buf_size); @@ -365,8 +366,8 @@ class OpenCLWorkspace : public DeviceAPI { // cl_mem alloc utils void* AllocCLBuffer(Device dev, size_t size, size_t alignment, DLDataType type_hint); - void* AllocCLImage(Device dev, void* back_buffer, size_t width, size_t height, size_t row_pitch, - DLDataType type_hint, ffi::Optional mem_scope); + void* AllocCLImage(Device dev, void* back_buffer, size_t width, size_t height, size_t depth, + size_t row_pitch, DLDataType type_hint, ffi::Optional mem_scope); /*! * \brief Get the thread local ThreadEntry diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index 8b6fba24988e..4a022b07b30c 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -71,7 +71,7 @@ ImageInfo GetImageInfo(const cl::BufferDescriptor* desc, const DLTensor* tensor) auto texture_shape = ApplyTexture2DFlattening(tensor->shape, tensor->ndim, axis); info.region[0] = texture_shape.width; info.region[1] = texture_shape.height; - info.region[2] = 1; + info.region[2] = texture_shape.depth; return info; } @@ -260,13 +260,13 @@ void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t size, size_t alignment, return AllocCLBuffer(dev, size, alignment, type_hint); } -void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t width, size_t height, DLDataType type_hint, - ffi::Optional mem_scope) { +void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t width, size_t height, size_t depth, + DLDataType type_hint, ffi::Optional mem_scope) { // Texture allocation given width and height cl_uint row_align = GetImageAlignment(dev.device_id); size_t pixel_size = (type_hint.bits * type_hint.lanes + 7) / 8; size_t row_pitch = ALIGN_UP(width * pixel_size * 4, row_align); // CL_RGBA = 4 - size_t mem_size = row_pitch * height; + size_t mem_size = row_pitch * height * depth; // Alloc back buffer from pool cl::BufferDescriptor* back_buffer = nullptr; @@ -280,7 +280,7 @@ void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t width, size_t height, D if (!mem_scope.has_value()) { mem_scope = ffi::String("global.texture"); } - return AllocCLImage(dev, back_buffer, width, height, row_pitch, type_hint, mem_scope); + return AllocCLImage(dev, back_buffer, width, height, depth, row_pitch, type_hint, mem_scope); } void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, @@ -298,7 +298,7 @@ void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* shape size_t axis = DefaultTextureLayoutSeparator(ndim, mem_scope.value()); auto texture = ApplyTexture2DFlattening(shape, ndim, axis); - return AllocDataSpace(dev, 
texture.width, texture.height, dtype, mem_scope); + return AllocDataSpace(dev, texture.width, texture.height, texture.depth, dtype, mem_scope); } void* OpenCLWorkspace::AllocCLBuffer(Device dev, size_t size, size_t alignment, @@ -320,7 +320,7 @@ void* OpenCLWorkspace::AllocCLBuffer(Device dev, size_t size, size_t alignment, } void* OpenCLWorkspace::AllocCLImage(Device dev, void* back_buffer, size_t width, size_t height, - size_t row_pitch, DLDataType type_hint, + size_t depth, size_t row_pitch, DLDataType type_hint, ffi::Optional mem_scope) { this->Init(); ICHECK(std::string(mem_scope.value()).find("texture") != std::string::npos) @@ -331,7 +331,7 @@ void* OpenCLWorkspace::AllocCLImage(Device dev, void* back_buffer, size_t width, cl_int err_code; cl_channel_type cl_type = DTypeToOpenCLChannelType(type_hint); cl_image_format format = {CL_RGBA, cl_type}; - cl_image_desc descriptor = {CL_MEM_OBJECT_IMAGE2D, width, height, 0, 0, 0, 0, 0, 0}; + cl_image_desc descriptor = {CL_MEM_OBJECT_IMAGE2D_ARRAY, width, height, 0, depth, 0, 0, 0, 0}; if (IsBufferToImageSupported(dev.device_id)) { descriptor.image_row_pitch = row_pitch; @@ -344,7 +344,6 @@ void* OpenCLWorkspace::AllocCLImage(Device dev, void* back_buffer, size_t width, cl::BufferDescriptor* desc = new cl::BufferDescriptor(mem_scope); desc->buffer = mptr; desc->back_buffer = back_desc; - return desc; } @@ -383,7 +382,7 @@ void* OpenCLWorkspace::AllocDataSpaceView(Device dev, void* data, ffi::Shape sha size_t row_pitch = ALIGN_UP(texture.width * pixel_size * 4, row_align); // CL_RGBA = 4 ret_desc = static_cast(OpenCLWorkspace::Global()->AllocCLImage( - dev, nullptr, texture.width, texture.height, row_pitch, dtype, mem_scope)); + dev, nullptr, texture.width, texture.height, texture.depth, row_pitch, dtype, mem_scope)); ret_desc->is_compat_view = true; } return ret_desc; @@ -414,7 +413,7 @@ void* OpenCLWorkspace::AllocDataSpaceView(Device dev, void* data, ffi::Shape sha } return (cl::BufferDescriptor*)AllocCLImage(dev, back_buffer, texture.width, texture.height, - row_pitch, dtype, mem_scope); + texture.depth, row_pitch, dtype, mem_scope); } void OpenCLWorkspace::FreeDataSpaceView(Device dev, void* ptr) { @@ -768,28 +767,27 @@ TVM_FFI_STATIC_INIT_BLOCK() { [](ffi::PackedArgs args, ffi::Any* rv) { int32_t device_type = args[0].cast(); int32_t device_id = args[1].cast(); - int32_t dtype_code_hint = args[2].cast(); - int32_t dtype_bits_hint = args[3].cast(); auto scope = args[4].cast(); CHECK(scope.find("texture") != std::string::npos); int64_t ndim = args[5].cast(); - CHECK_EQ(ndim, 2); + CHECK_EQ(ndim, 3); int64_t* shape = static_cast(args[6].cast()); int64_t width = shape[0]; int64_t height = shape[1]; - + int64_t depth = shape[2]; + int64_t channel_size = args[7].cast(); + DataType channel_type = GetChannelType(channel_size); Device dev; dev.device_type = static_cast(device_type); dev.device_id = device_id; - DLDataType type_hint; - type_hint.code = static_cast(dtype_code_hint); - type_hint.bits = static_cast(dtype_bits_hint); - type_hint.lanes = 1; + type_hint.code = channel_type.code(); + type_hint.bits = channel_type.bits(); + type_hint.lanes = channel_type.lanes(); *rv = OpenCLWorkspace::Global()->AllocDataSpace( - dev, static_cast(width), static_cast(height), type_hint, - ffi::String("global.texture")); + dev, static_cast(width), static_cast(height), + static_cast(depth), type_hint, ffi::String("global.texture")); }) .def_packed("device_api.opencl.free_nd", [](ffi::PackedArgs args, ffi::Any* rv) { @@ -857,7 +855,7 @@ class 
OpenCLPooledAllocator final : public memory::PooledAllocator { Buffer Alloc(Device dev, ffi::Shape shape, DLDataType type_hint, const std::string& mem_scope) override { if (AllowMemoryScope(mem_scope)) { - size_t size = ffi::GetDataSize(shape.Product(), type_hint); + size_t size = GetMemObjectSize(dev, shape.size(), shape.data(), type_hint); Buffer buf; buf.device = dev; buf.size = size; diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc index 3f9dadbb3af1..980c6bb3994c 100644 --- a/src/runtime/opencl/opencl_module.cc +++ b/src/runtime/opencl/opencl_module.cc @@ -50,6 +50,7 @@ class OpenCLWrappedFunc { arg_size_ = arg_size; launch_param_config_.Init(arg_size.size(), launch_param_tags); } + // invoke the function with void arguments void operator()(ffi::PackedArgs args, ffi::Any* rv, void** void_args) const { ICHECK(w_->devices.size() > 0) << "No OpenCL device"; @@ -63,6 +64,8 @@ class OpenCLWrappedFunc { if (kernel == nullptr || e.version != entry_.version) { kernel = m_->InstallKernel(w_, t, func_name_, entry_); } + ThreadWorkLoad wl = launch_param_config_.Extract(args); + cl_uint work_dim = static_cast(launch_param_config_.work_dim()); // setup arguments. for (cl_uint i = 0; i < arg_size_.size(); ++i) { void* arg = nullptr; @@ -74,13 +77,10 @@ class OpenCLWrappedFunc { OPENCL_CALL(clSetKernelArg(kernel, i, arg_size_[i], arg)); } cl_command_queue queue = w_->GetQueue(t->device); - ThreadWorkLoad wl = launch_param_config_.Extract(args); - cl_uint work_dim = static_cast(launch_param_config_.work_dim()); for (cl_uint i = 0; i < work_dim; ++i) { wl.work_size[i] *= wl.work_size[i + 3]; } // launch kernel - if (w_->IsProfiling(t->device)) { w_->GetEventQueue(t->device).resize(w_->GetEventQueue(t->device).size() + 1); OPENCL_CALL(clEnqueueNDRangeKernel(queue, kernel, work_dim, nullptr, wl.work_size, diff --git a/src/runtime/texture.h b/src/runtime/texture.h index f3a827aa8792..e2b6d603ed50 100644 --- a/src/runtime/texture.h +++ b/src/runtime/texture.h @@ -40,6 +40,7 @@ template struct Texture2DShape { T width; T height; + T depth; T channel; }; @@ -60,11 +61,7 @@ inline size_t DefaultTextureLayoutSeparator(size_t shape_rank, } else if (convention == "global.texture-weight") { separator = 1; } else if (convention == "global.texture-nhwc") { - if (shape_rank == 3) { - separator = 1; - } else { - separator = 2; - } + separator = 2; } else { LOG(FATAL) << "Encountered unknown texture lowering convention: " << convention; } @@ -81,9 +78,11 @@ template Texture2DShape ApplyTexture2DFlattening(const S& shape, size_t rank, size_t axis) { ICHECK(axis < rank) << "Number of axes to flatten into rows must be less than shape rank for 2d flattening"; - Texture2DShape texture{1, 1, shape[rank - 1]}; + Texture2DShape texture{1, 1, 1, shape[rank - 1]}; for (size_t i = 0; i < rank - 1; i++) { - if (i < axis) { + if (i < (axis - 1)) { + texture.depth *= shape[i]; + } else if (i < axis) { texture.height *= shape[i]; } else { texture.width *= shape[i]; @@ -114,7 +113,23 @@ size_t GetTextureMemorySize(T shape, int bits, int lanes, std::string mem_scope, auto pack_size = shape[shape.size() - 1]; auto pixel_size = (bits * lanes + 7) / 8; size_t row_pitch = ALIGN_UP(tshape.width * pixel_size * pack_size, image_row_align); - return row_pitch * tshape.height; + return row_pitch * tshape.height * tshape.depth; +} + +/*! + * \brief Returns the standard channel datatype for any given type. 
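+ * \note Only 128-bit (float32x4) and 64-bit (float16x4) RGBA channels are
+ *       handled; any other channel size is a fatal error.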
+ * \param channel_size The Number of bits in a Channel + * \return DataType to be used in the codegen. + */ +inline DataType GetChannelType(size_t channel_size) { + DataType channel_type; + + if (channel_size == 128) + return DataType::Float(32, 4); + else if (channel_size == 64) + return DataType::Float(16, 4); + + LOG(FATAL) << "Unsupported Channel Size: " << channel_size; } } // namespace runtime diff --git a/src/runtime/vm/builtin.cc b/src/runtime/vm/builtin.cc index 1bd3084c210b..69a4c74169a4 100644 --- a/src/runtime/vm/builtin.cc +++ b/src/runtime/vm/builtin.cc @@ -390,7 +390,18 @@ TVM_FFI_STATIC_INIT_BLOCK() { namespace refl = tvm::ffi::reflection; refl::GlobalDef() .def("vm.builtin.alloc_storage", VMAllocStorage) - .def_method("vm.builtin.alloc_tensor", &StorageObj::AllocTensor); + .def_packed("vm.builtin.alloc_tensor", [](ffi::PackedArgs args, ffi::Any* rv) { + Storage sobj = args[0].cast(); + int64_t offset = args[1].cast(); + ffi::Shape shape = args[2].cast(); + DataType dtype = args[3].cast(); + if (args.size() == 5) { + ffi::String scope = args[4].cast(); + *rv = sobj->AllocTensorScoped(offset, shape, dtype, scope); + } else { + *rv = sobj->AllocTensor(offset, shape, dtype); + } + }); } //------------------------------------------------- @@ -516,9 +527,16 @@ TVM_FFI_STATIC_INIT_BLOCK() { "vm.builtin.reshape", [](Tensor data, ffi::Shape new_shape) { return data.CreateView(new_shape, data->dtype); }) .def("vm.builtin.null_value", []() -> std::nullptr_t { return nullptr; }) - .def("vm.builtin.to_device", [](Tensor data, int dev_type, int dev_id) { + .def_packed("vm.builtin.to_device", [](ffi::PackedArgs args, ffi::Any* rv) { + Tensor data = args[0].cast(); + int dev_type = args[1].cast(); + int dev_id = args[2].cast(); Device dst_device = {(DLDeviceType)dev_type, dev_id}; - return data.CopyTo(dst_device); + ffi::String mem_scope = "global"; + if (args.size() == 4) { + mem_scope = args[3].cast(); + } + *rv = data.CopyTo(dst_device, mem_scope); }); } diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc index 40edbc14c433..efa05fd38376 100644 --- a/src/runtime/vm/executable.cc +++ b/src/runtime/vm/executable.cc @@ -38,6 +38,7 @@ namespace vm { /*! \brief The magic number for the serialized VM bytecode file */ constexpr uint64_t kTVMVMBytecodeMagic = 0xD225DE2F4214151D; +constexpr uint64_t kTVMVMBytecodeMagicV2 = 0xD225DE2F4214151E; #define STREAM_CHECK(val, section) \ ICHECK(val) << "Invalid VM file format in the " << section << " section." \ @@ -143,22 +144,24 @@ Instruction VMExecutable::GetInstruction(Index i) const { } void SaveHeader(dmlc::Stream* strm) { - uint64_t header = kTVMVMBytecodeMagic; + uint64_t header = kTVMVMBytecodeMagicV2; strm->Write(header); std::string version = VM_VERSION; strm->Write(version); } -void LoadHeader(dmlc::Stream* strm) { +uint64_t LoadHeader(dmlc::Stream* strm) { // Check header. uint64_t header; STREAM_CHECK(strm->Read(&header), "header"); - STREAM_CHECK(header == kTVMVMBytecodeMagic, "header"); + STREAM_CHECK((header == kTVMVMBytecodeMagic) || (header == kTVMVMBytecodeMagicV2), "header"); // Check version. std::string version; STREAM_CHECK(strm->Read(&version), "version"); STREAM_CHECK(version == VM_VERSION, "version"); + + return header; } ffi::Bytes VMExecutable::SaveToBytes() const { @@ -172,6 +175,9 @@ ffi::Bytes VMExecutable::SaveToBytes() const { // Global section. SaveGlobalSection(&strm); + // Memory Scopes + SaveMemoryScopeSection(&strm); + // Constant section. 
SaveConstantSection(&strm); @@ -192,11 +198,16 @@ ffi::Module VMExecutable::LoadFromBytes(const ffi::Bytes& bytes) { ObjectPtr exec = ffi::make_object(); // Load header. - LoadHeader(&strm); + uint64_t header_magic = LoadHeader(&strm); // Global section. exec->LoadGlobalSection(&strm); + if (kTVMVMBytecodeMagicV2 == header_magic) { + // Memory Scopes + exec->LoadMemoryScopeSection(&strm); + } + // Constant section. exec->LoadConstantSection(&strm); @@ -245,6 +256,15 @@ bool VMFuncInfo::Load(dmlc::Stream* strm) { void VMExecutable::SaveGlobalSection(dmlc::Stream* strm) const { strm->Write(func_table); } +void VMExecutable::SaveMemoryScopeSection(dmlc::Stream* strm) const { + strm->Write(static_cast(this->memory_scopes.size())); + for (auto it = this->memory_scopes.begin(); it != this->memory_scopes.end(); it++) { + LOG(WARNING) << "Scope Saving:" << it->second; + strm->Write(it->first); + strm->Write(it->second); + } +} + void VMExecutable::SaveConstantSection(dmlc::Stream* strm) const { strm->Write(static_cast(this->constants.size())); for (const auto& it : this->constants) { @@ -293,6 +313,23 @@ void VMExecutable::LoadGlobalSection(dmlc::Stream* strm) { } } +void VMExecutable::LoadMemoryScopeSection(dmlc::Stream* strm) { + uint64_t sz; + // Load the number of memory scope entries. + STREAM_CHECK(strm->Read(&sz, sizeof(sz)), "memory scopes"); + + size_t size = static_cast(sz); + // Load each of the scopes. + for (size_t i = 0; i < size; i++) { + Index const_idx; + std::string scope; + STREAM_CHECK(strm->Read(&const_idx), "memory scopes"); + STREAM_CHECK(strm->Read(&scope), "memory scopes"); + LOG(WARNING) << "Scope Loaded:" << scope; + this->memory_scopes[const_idx] = scope; + } +} + void VMExecutable::LoadConstantSection(dmlc::Stream* strm) { uint64_t sz; // Load the number of constants. diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc index be981b205cbb..cd5475bf1ad5 100644 --- a/src/runtime/vm/vm.cc +++ b/src/runtime/vm/vm.cc @@ -84,19 +84,21 @@ ffi::Any IndexIntoNestedObject(ffi::Any obj, ffi::PackedArgs args, int starting_ return obj; } -Tensor ConvertTensorToDevice(Tensor src, const DLDevice& dev, Allocator* alloc) { +Tensor ConvertTensorToDevice(Tensor src, const DLDevice& dev, Allocator* alloc, + ffi::String scope = "global") { if (src->device.device_type == dev.device_type && src->device.device_id == dev.device_id) { return src; } else { - auto res = alloc->Empty(src.Shape(), src->dtype, dev); + auto res = alloc->Empty(src.Shape(), src->dtype, dev, scope); res.CopyFrom(src); return res; } } -Any ConvertObjectToDevice(Any src, const Device& dev, Allocator* alloc) { +Any ConvertObjectToDevice(Any src, const Device& dev, Allocator* alloc, + ffi::String scope = "global") { if (src.as()) { - return ConvertTensorToDevice(src.cast(), dev, alloc); + return ConvertTensorToDevice(src.cast(), dev, alloc, scope); } else if (src.as()) { std::vector ret; auto arr = src.cast>(); @@ -129,10 +131,11 @@ ffi::Any ConvertArgToDevice(ffi::AnyView input, Device dev, Allocator* alloc) { return ret; } -ffi::Any ConvertRegToDevice(ffi::Any input, Device dev, Allocator* alloc) { +ffi::Any ConvertRegToDevice(ffi::Any input, Device dev, Allocator* alloc, + ffi::String scope = "global") { ffi::Any ret; if (auto opt_obj = input.as()) { - ret = ConvertObjectToDevice(opt_obj.value(), dev, alloc); + ret = ConvertObjectToDevice(opt_obj.value(), dev, alloc, scope); } else { ret = input; } @@ -469,11 +472,12 @@ void VirtualMachineImpl::Init(const std::vector& devices, } // Setup constant sections. 
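+  // Honor any per-constant memory scope recorded by the exec builder when
+  // copying constants to the device.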
this->const_pool_.reserve(exec_->constants.size()); - for (const auto& constant : exec_->constants) { - if (auto opt_nd = constant.as()) { - this->const_pool_.push_back(ConvertRegToDevice(opt_nd.value(), devices[0], allocators[0])); + for (size_t i = 0; i < exec_->constants.size(); ++i) { + if (auto opt_nd = exec_->constants[i].as()) { + this->const_pool_.push_back( + ConvertRegToDevice(opt_nd.value(), devices[0], allocators[0], exec_->memory_scopes[i])); } else { - this->const_pool_.push_back(constant); + this->const_pool_.push_back(exec_->constants[i]); } } // Setup function sections. diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 8ea55b8ff5d8..b3926e106f6f 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -240,11 +240,6 @@ void CodeGenOpenCL::PrintType(DataType t, std::ostream& os) { // NOLINT(*) if (t.is_uint()) { os << 'u'; } - if (t.bits() == 8 && t.lanes() == 4) { - // directly 4 8 bit int in integer. - os << "int"; - return; - } switch (t.bits()) { case 8: os << "char"; @@ -279,7 +274,7 @@ void CodeGenOpenCL::PrintType(const Type& type, std::ostream& os) { // NOLINT(* return PrintType(ptr->dtype, os); } else if (auto* ptr = type.as()) { if (runtime::IsTextureStorage(std::string(ptr->storage_scope))) { - os << "image2d_t"; + os << "image2d_array_t"; } else { PrintType(ptr->element_type, os); os << '*'; @@ -428,47 +423,85 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { ICHECK(ptr_type != nullptr) << "Texture Var's must be of PointerType"; ICHECK(runtime::IsTextureStorage(std::string(ptr_type->storage_scope))) << "builtin::texture2d_store() only supports storing to texture buffers"; + const int channel_size = Downcast(op->args[4])->value; + ICHECK(channel_size == 64 || channel_size == 128) + << "Unsupported Channel Size: " << channel_size; + DataType channel_type = runtime::GetChannelType(channel_size); + DataType buffer_type = ptr_type->element_type.as()->dtype; - if (buffer_type.is_float16()) { + std::stringstream ss; + this->PrintExpr(op->args[5], ss); + std::string value; + value = this->SSAGetID(ss.str(), buffer_type.with_lanes(channel_size / buffer_type.bits())); + if (channel_size == 64) { os << "write_imageh("; - } else if (buffer_type.is_float()) { + } else if (channel_size == 128) { os << "write_imagef("; } else { - LOG(FATAL) << "Unsupported type: " << buffer_type - << ", currently only float and half are supported for image2d OpenCL codegen."; + LOG(FATAL) << "Unsupported Channel Size: " << channel_size; } this->PrintExpr(op->args[0], os); os << ", "; - os << "(int2)("; + os << "(int4)("; this->PrintExpr(op->args[1], os); os << ", "; this->PrintExpr(op->args[2], os); - os << "), "; + os << ", "; this->PrintExpr(op->args[3], os); + os << ", "; + this->PrintExpr(make_const(DataType::Int(32), 0), os); + os << "), "; + os << "as_"; + this->PrintType(channel_type, os); + os << "(" << value << ")"; os << ")"; } else if (op->op.same_as(builtin::texture2d_load())) { enable_compliant_texture_reads_ = true; std::stringstream ss; - if (op->dtype.is_float16()) { + const int channel_size = Downcast(op->args[4])->value; + const int data_lanes = channel_size / op->dtype.bits(); + ICHECK(channel_size == 64 || channel_size == 128) + << "Unsupported Channel Size: " << channel_size; + ss << "as_"; + this->PrintType(op->dtype.with_lanes(data_lanes), ss); + ss << "("; + if (channel_size == 64) { ss << "READ_IMAGEH("; - } else if (op->dtype.is_float()) { + } else if (channel_size 
== 128) { ss << "READ_IMAGEF("; } else { - LOG(FATAL) << "Unsupported type: " << op->dtype - << ", currently only float and half are supported for image2d OpenCL codegen."; + LOG(FATAL) << "Unsupported Channel Size: " << channel_size; } this->PrintExpr(op->args[0], ss); ss << ", "; ss << "image_sampler, "; - ss << "((int2)("; + ss << "((int4)("; this->PrintExpr(op->args[1], ss); ss << ", "; this->PrintExpr(op->args[2], ss); - ss << ")))"; - - std::string rhs = SSAGetID(ss.str(), op->dtype.with_lanes(4)); - if (op->args.back().as()) { - os << rhs; + ss << ", "; + this->PrintExpr(op->args[3], ss); + ss << ", "; + this->PrintExpr(make_const(DataType::Int(32), 0), ss); + ss << "))))"; + + std::string rhs = SSAGetID(ss.str(), op->dtype.with_lanes(data_lanes)); + if (auto ramp = op->args.back().as()) { + if (ramp->base.as() && *tir::as_const_int(ramp->base) == 0 && + *tir::as_const_int(ramp->lanes) == data_lanes && *tir::as_const_int(ramp->stride) == 1) { + os << rhs; + } else if (*tir::as_const_int(ramp->stride) == 1) { + os << "(*("; + this->PrintType(op->dtype.with_lanes(*tir::as_const_int(ramp->lanes)), os); + os << "*)"; + os << "(("; + this->PrintType(op->dtype.with_lanes(1), os); + os << "*)&" << rhs << " + "; + this->PrintExpr(ramp->base, os); + os << "))"; + } else { + LOG(FATAL) << "Unsupported Texture Load Args"; + } } else { os << "(("; this->PrintType(op->dtype.with_lanes(1), os); diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index d44173a2ae3c..96e90f17ac79 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -362,6 +362,7 @@ TVM_REGISTER_TARGET_KIND("opencl", kDLOpenCL) .add_attr_option("max_num_threads", 256) .add_attr_option("thread_warp_size", 1) .add_attr_option("texture_spatial_limit", 16384) + .add_attr_option("texture_depth_limit", 2048) // Faced that Qualcomm OpenCL runtime crashed without any error message in // the case when the number of kernel arguments was pretty big. OpenCL doesn't // specify any limitations on the number of kernel arguments. max_function_args diff --git a/src/tir/backend/adreno/inject_texture_alloc.cc b/src/tir/backend/adreno/inject_texture_alloc.cc new file mode 100644 index 000000000000..4799b7ce2643 --- /dev/null +++ b/src/tir/backend/adreno/inject_texture_alloc.cc @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file inject_texture_alloc.cc + */ + +#include +#include +#include +#include + +#include "../../../arith/ir_mutator_with_analyzer.h" +#include "../../../runtime/texture.h" +#include "../../transforms/ir_utils.h" + +namespace tvm { +namespace tir { +namespace backend { +namespace adreno { +using runtime::ApplyTexture2DFlattening; +using runtime::DefaultTextureLayoutSeparator; +using runtime::IsTextureStorage; + +/*! + * \brief Inject the Texture Alloc intrinsic right after an AllocateNode is realized. + */ +class TextureAllocInjector : public arith::IRMutatorWithAnalyzer { + public: + static PrimFunc Inject(PrimFunc func) { + arith::Analyzer ana; + auto pass = TextureAllocInjector(&ana); + auto writer = func.CopyOnWrite(); + pass.MarkBufferMapShapes(func); + writer->body = pass.VisitStmt(func->body); + return func; + } + + private: + using IRMutatorWithAnalyzer::VisitExpr; + using IRMutatorWithAnalyzer::VisitExpr_; + using IRMutatorWithAnalyzer::VisitStmt; + using IRMutatorWithAnalyzer::VisitStmt_; + + explicit TextureAllocInjector(arith::Analyzer* ana) : IRMutatorWithAnalyzer(ana) {} + + Stmt VisitStmt_(const AllocateNode* op) final { + Stmt stmt = StmtExprMutator::VisitStmt_(op); + std::string storage_scope = GetStorageScope(op->buffer_var); + if (IsTextureStorage(storage_scope)) { + op = stmt.as(); + ICHECK(op->extents.size() >= 3) << "Only 2D Array RGBA texture is currently supported"; + const int data_bits = op->dtype.bits(), + vec_length = static_cast(op->extents.back().as()->value); + const int channel_size = data_bits * vec_length; + ICHECK(channel_size == 128 || channel_size == 64) + << "Invalid Channel Size: " << channel_size << " bits"; + + size_t axis = DefaultTextureLayoutSeparator(op->extents.size(), storage_scope); + auto texture = ApplyTexture2DFlattening(op->extents, op->extents.size(), axis); + ffi::Array args; + args.push_back(StringImm(storage_scope)); + args.push_back(IntImm(DataType::Int(64), 3)); // 2d Array + args.push_back(Call(DataType::Handle(), builtin::tvm_stack_make_shape(), + {texture.width, texture.height, texture.depth})); + args.push_back(IntImm(DataType::Int(64), channel_size)); + stmt = + LetStmt(op->buffer_var, + Call(op->buffer_var.dtype(), builtin::nd_mem_alloc_with_scope(), args), op->body); + } + return stmt; + } + + protected: + std::string GetStorageScope(const Var& buffer_var) { + auto* ptr = buffer_var->type_annotation.as(); + ICHECK(ptr) << "Buffer Var's type annotation must be of PointerType"; + return ptr->storage_scope; + } +}; + +namespace transform { + +Pass InjectTextureAlloc() { + auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) { + return TextureAllocInjector::Inject(std::move(f)); + }; + return CreatePrimFuncPass(pass_func, 0, "tir.backend.adreno.InjectTextureAlloc", {}); +} + +TVM_FFI_STATIC_INIT_BLOCK() { + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tir.backend.adreno.transform.InjectTextureAlloc", InjectTextureAlloc); +} + +} // namespace transform + +} // namespace adreno +} // namespace backend +} // namespace tir +} // namespace tvm diff --git a/src/tir/backend/adreno/texture_flatten.cc b/src/tir/backend/adreno/texture_flatten.cc new file mode 100644 index 000000000000..5e8a922c30ca --- /dev/null +++ b/src/tir/backend/adreno/texture_flatten.cc @@ -0,0 +1,232 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file texture_flatten.cc + * \brief Flattens texture storage from multi-dimensional array + * to 2D (width, height, depth) array access + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "../../../arith/ir_visitor_with_analyzer.h" +#include "../../../runtime/texture.h" +#include "../../../runtime/thread_storage_scope.h" + +namespace tvm { +namespace tir { +namespace backend { +namespace adreno { +using arith::IRVisitorWithAnalyzer; +using runtime::ApplyTexture2DFlattening; +using runtime::DefaultTextureLayoutSeparator; +using runtime::IsTextureStorage; + +class TextureLoweringBase : public StmtExprMutator { + public: + explicit TextureLoweringBase(const ffi::Map& extern_buffer_map, + IRVisitorWithAnalyzer* bound_analyzer) + : bound_analyzer_{bound_analyzer} { + for (auto kv : extern_buffer_map) { + extern_buf_.insert(kv.second); + } + } + + inline PrimExpr SimplifyOffset(const ffi::Array& shape, + const ffi::Array& index) const { + PrimExpr base = make_const(DataType::Int(32), 0); + ICHECK_EQ(shape.size(), index.size()); + if (index.size() > 0) { + PrimExpr offset = index[0]; + for (size_t i = 1; i < index.size(); ++i) { + offset = bound_analyzer_->Simplify(offset * shape[i] + index[i]); + } + base = base + offset; + } + return base; + } + + protected: + std::string GetStorageScope(const Buffer& buffer) { + auto* ptr = buffer->data->type_annotation.as(); + ICHECK(ptr) << "Buffer Var's type annotation must be of PointerType"; + return ptr->storage_scope; + } + + // Set of all external input and output buffers + std::unordered_set extern_buf_; + // Bound analzer + IRVisitorWithAnalyzer* bound_analyzer_; +}; + +// Lower Nd storage access to 2d texture access using lowering convention +// specified by the buffers storage scope. 
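As an aside on the convention the class below implements: every buffer dimension except the last is folded into one of three texture coordinates, and the last dimension becomes the RGBA channel vector (its lane count times the element bit width must yield a 64- or 128-bit channel, per the checks in this patch). The following Python sketch only illustrates that grouping; the shape, indices, and axis value are hypothetical, and in the pass the axis comes from DefaultTextureLayoutSeparator.

# Illustration only: mirrors how GetTextureAccessArgs (below) splits an N-d
# access into flattened (row, col, depth) coordinates plus a channel index.
def flatten_offset(dims, indices):
    # Row-major flattening, like SimplifyOffset in TextureLoweringBase above.
    if not indices:
        return 0
    offset = indices[0]
    for extent, idx in zip(dims[1:], indices[1:]):
        offset = offset * extent + idx
    return offset

def texture_coords(shape, indices, axis):
    # dims before (axis - 1) -> depth, dim (axis - 1) -> col,
    # dims from axis up to the second-to-last -> row, last dim -> channel.
    depth = flatten_offset(shape[: axis - 1], indices[: axis - 1])
    col = flatten_offset(shape[axis - 1 : axis], indices[axis - 1 : axis])
    row = flatten_offset(shape[axis:-1], indices[axis:-1])
    return row, col, depth, indices[-1]

# Hypothetical 5-d NCHW4c-style access with axis = 2:
# N -> depth, C -> col, H and W flatten into row, the trailing 4 lanes -> channel.
print(texture_coords((1, 16, 28, 28, 4), (0, 5, 7, 9, 2), axis=2))  # (205, 5, 0, 2)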
+class TextureFlattener : public TextureLoweringBase { + public: + using StmtExprMutator::VisitStmt_; + explicit TextureFlattener(const ffi::Map& extern_buffer_map, + IRVisitorWithAnalyzer* bound_analyzer) + : TextureLoweringBase(extern_buffer_map, bound_analyzer) {} + + Stmt VisitStmt_(const BufferRealizeNode* op) final { + if (extern_buf_.count(op->buffer)) { + return this->VisitStmt(op->body); + } + + std::string storage_scope = GetStorageScope(op->buffer); + Var buffer_var(op->buffer->data->name_hint, + PointerType(PrimType(op->buffer->dtype), ffi::String(storage_scope))); + let_binding_.insert({op->buffer->data, buffer_var}); + + Stmt stmt = StmtExprMutator::VisitStmt_(op); + op = stmt.as(); + + // Rewrite any buffer realizations with storage scope to 2d texture allocations + if (IsTextureStorage(storage_scope)) { + Stmt body = this->VisitStmt(op->body); + ICHECK(op->bounds.size() >= 3) << "Only 2d RGBA texture is currently supported"; + const int bits = op->buffer->dtype.bits(), + lanes = static_cast(op->bounds.back()->extent.as()->value); + const int channel_size = bits * lanes; + ICHECK(channel_size == 128 || channel_size == 64) + << "Invalid Channel Size: " << channel_size << " bits"; + + struct ShapeFromRange { + const ffi::Array& bounds; + PrimExpr operator[](size_t i) const { return bounds[i]->extent; } + }; + size_t axis = DefaultTextureLayoutSeparator(op->bounds.size(), storage_scope); + auto texture = + ApplyTexture2DFlattening(ShapeFromRange{op->bounds}, op->bounds.size(), axis); + ffi::Array args; + args.push_back(StringImm(storage_scope)); + args.push_back(IntImm(DataType::Int(64), 3)); // 2D-Array + args.push_back(Call(DataType::Handle(), builtin::tvm_stack_make_shape(), + {texture.width, texture.height, texture.depth})); + args.push_back(IntImm(DataType::Int(64), channel_size)); + + stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::nd_mem_alloc_with_scope(), args), + body); + } + + return stmt; + } + + Stmt VisitStmt_(const BufferStoreNode* op) final { + Stmt stmt = StmtExprMutator::VisitStmt_(op); + op = stmt.as(); + std::string storage_scope = GetStorageScope(op->buffer); + // Lower to two dimensional access + if (IsTextureStorage(storage_scope)) { + ffi::Array args = GetTextureAccessArgs(op, op->buffer); + args.push_back(op->value); + stmt = Evaluate(Call(args[0]->dtype, builtin::texture2d_store(), args)); + } + + return stmt; + } + + PrimExpr VisitExpr_(const BufferLoadNode* op) final { + PrimExpr expr = StmtExprMutator::VisitExpr_(op); + op = expr.as(); + // Lower to two dimensional access + std::string storage_scope = GetStorageScope(op->buffer); + if (IsTextureStorage(storage_scope)) { + ffi::Array args = GetTextureAccessArgs(op, op->buffer); + args.push_back(op->indices.back()); + expr = Call(op->buffer->dtype, builtin::texture2d_load(), args); + } + + return expr; + } + + protected: + template + ffi::Array GetTextureAccessArgs(const T* op, const Buffer& buffer) { + ffi::Array args; + if (let_binding_.count(op->buffer->data)) { + args.push_back(let_binding_[op->buffer->data]); + } else { + args.push_back(buffer->data); + } + ffi::Array row_dims, row_indices, col_dims, col_indices, depth_dims, depth_indices; + size_t axis = DefaultTextureLayoutSeparator(op->buffer->shape.size(), GetStorageScope(buffer)); + for (size_t i = 0; i < op->buffer->shape.size() - 1; i++) { + if (i < (axis - 1)) { + depth_dims.push_back(op->buffer->shape[i]); + depth_indices.push_back(op->indices[i]); + } else if (i < axis) { + col_dims.push_back(op->buffer->shape[i]); + 
col_indices.push_back(op->indices[i]); + } else { + row_dims.push_back(op->buffer->shape[i]); + row_indices.push_back(op->indices[i]); + } + } + PrimExpr row_offset = SimplifyOffset(row_dims, row_indices); + PrimExpr col_offset = SimplifyOffset(col_dims, col_indices); + PrimExpr depth_offset = SimplifyOffset(depth_dims, depth_indices); + PrimExpr channel_size = IntImm(DataType::Int(32, 1), + *tir::as_const_int(buffer->shape.back()) * buffer->dtype.bits()); + args.push_back(row_offset); + args.push_back(col_offset); + args.push_back(depth_offset); + args.push_back(channel_size); + return args; + } + + // Bindings to new texture vars with texture pointer scope + std::unordered_map let_binding_; +}; + +PrimFunc TextureFlattenHandler(PrimFunc func) { + auto fptr = func.CopyOnWrite(); + IRVisitorWithAnalyzer bound_analyzer; + bound_analyzer(fptr->body); + fptr->body = TextureFlattener(fptr->buffer_map, &bound_analyzer)(std::move(fptr->body)); + return func; +} + +namespace transform { + +Pass TextureFlatten() { + auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) { + return TextureFlattenHandler(std::move(f)); + }; + return CreatePrimFuncPass(pass_func, 0, "tir.backend.adreno.TextureFlatten", {}); +} + +TVM_FFI_STATIC_INIT_BLOCK() { + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tir.backend.adreno.transform.TextureFlatten", TextureFlatten); +} + +} // namespace transform + +} // namespace adreno +} // namespace backend +} // namespace tir +} // namespace tvm diff --git a/src/tir/transforms/vectorize_loop.cc b/src/tir/transforms/vectorize_loop.cc index 068903baa814..331f556a9442 100644 --- a/src/tir/transforms/vectorize_loop.cc +++ b/src/tir/transforms/vectorize_loop.cc @@ -37,6 +37,8 @@ #include "../../src/arith/scalable_expression.h" #include "../../tir/analysis/check_contains.h" +#include "tvm/runtime/data_type.h" +#include "tvm/tir/buffer.h" namespace tvm { namespace tir { @@ -519,16 +521,32 @@ class Vectorizer : public StmtMutator, public ExprFunctorop.same_as(builtin::texture2d_load())) { int lane = 0; ffi::Array fcd = MutateArray({op->args.back()}, &lane); + auto dtype = op->args[0] + .as() + ->type_annotation.as() + ->element_type.as() + ->dtype; + ICHECK(lane * dtype.bits() <= op->args[4].as()->value) + << "Expected data to be read must be less than or equal to the texture load length"; + auto new_args = op->args; new_args.pop_back(); new_args.push_back(fcd[0]); - return Call(op->dtype.with_lanes(4), op->op, new_args); + return Call(op->dtype.with_lanes(lane), op->op, new_args); } else if (op->op.same_as(builtin::texture2d_store())) { int lane = 0; // Vectorize the value to store ffi::Array value{op->args.back()}; ffi::Array mutated_value = MutateArray(value, &lane); - ffi::Array new_args{op->args[0], op->args[1], op->args[2], mutated_value[0]}; + auto dtype = op->args[0] + .as() + ->type_annotation.as() + ->element_type.as() + ->dtype; + ICHECK(lane * dtype.bits() == op->args[4].as()->value) + << "Expected data to be written must equal the texture store length"; + ffi::Array new_args{op->args[0], op->args[1], op->args[2], + op->args[3], op->args[4], mutated_value[0]}; + return Call(op->dtype.with_lanes(lane), op->op, new_args); } else if (op->op.same_as(builtin::reinterpret())) { return MutateReinterpretExpr_(op); diff --git a/tests/python/relax/adreno/test_transform_annotate_custom_scope.py b/tests/python/relax/adreno/test_transform_annotate_custom_scope.py index 24b4cf66b888..57abfd081350 --- a/tests/python/relax/adreno/test_transform_annotate_custom_scope.py 
+++ b/tests/python/relax/adreno/test_transform_annotate_custom_scope.py @@ -123,8 +123,8 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-nhwc"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), - "conv2d_NCHWc_OIHWo_opencl": (["global.texture-nhwc", "global.texture-weight"], ["global"]), + "te_layout_transform1": (["global"], ["global"]), + "conv2d_NCHWc_OIHWo_opencl": (["global.texture-nhwc", "global"], ["global"]), "te_layout_transform2": (["global"], ["global"]), } @@ -151,9 +151,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -182,9 +182,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -213,9 +213,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -239,9 +239,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "fused_conv2d_NCHWc_OIHWo_opencl_relu": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -267,9 +267,9 @@ def main( Expected = { "relu": (["global"], ["global"]), "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "fused_conv2d_NCHWc_OIHWo_opencl_relu1": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -294,9 +294,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "fused_conv2d_NCHWc_OIHWo_opencl_relu_tir_tanh": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -321,10 +321,10 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "te_layout_transform2": (["global"], ["global.texture-weight"]), "fused_conv2d_NCHWc_OIHWo_opencl_add": ( - ["global.texture-weight", "global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global", 
"global.texture-weight"], ["global"], ), "te_layout_transform3": (["global"], ["global"]), @@ -352,16 +352,16 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), "relu": (["global"], ["global"]), "te_layout_transform3": (["global"], ["global.texture-weight"]), "conv2d_NCHWc_OIHWo1_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform4": (["global"], ["global"]), @@ -384,9 +384,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "sum": (["global"], ["global"]), @@ -410,9 +410,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "sum": (["global"], ["global"]), @@ -436,9 +436,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "sum": (["global"], ["global"]), @@ -462,9 +462,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -488,9 +488,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -514,9 +514,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -542,9 +542,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + 
["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -569,9 +569,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global.texture-weight"], ), "fused_relu_concatenate": (["global.texture-weight"], ["global"]), @@ -595,9 +595,9 @@ def main(x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "fl Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global.texture-weight"], ), "fused_relu_concatenate_split": (["global.texture-weight"], ["global", "global"]), @@ -625,9 +625,9 @@ def main(x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "fl Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global.texture-weight"], ), "fused_relu_concatenate_split": (["global.texture-weight"], ["global", "global"]), @@ -660,9 +660,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global.texture-weight"], ), "max_pool2d_opencl": (["global.texture-weight"], ["global"]), @@ -686,9 +686,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global.texture-weight"], ), "adaptive_avg_pool2d_opencl": (["global.texture-weight"], ["global"]), @@ -712,9 +712,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -743,9 +743,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "layer_norm": (["global", "global", "global"], ["global"]), @@ -771,9 +771,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": 
(["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -797,9 +797,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "fused_conv2d_NCHWc_OIHWo_opencl_add": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -852,20 +852,20 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "te_layout_transform2": (["global"], ["global.texture-weight"]), "fused_conv2d_NCHWc_OIHWo_opencl_add_relu": ( - ["global.texture-weight", "global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global", "global.texture-weight"], ["global.texture-weight"], ), - "te_layout_transform3": (["global"], ["global.texture-weight"]), + "te_layout_transform3": (["global"], ["global"]), "multiply": (["global"], ["global"]), "fused_conv2d_NCHWc_OIHWo1_opencl_add_relu": ( - ["global.texture-weight", "global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global", "global.texture-weight"], ["global.texture-weight"], ), "fused_conv2d_NCHWc_OIHWo2_opencl_relu1": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform4": (["global"], ["global"]), @@ -911,15 +911,15 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "te_layout_transform2": (["global"], ["global.texture-weight"]), "fused_conv2d_NCHWc_OIHWo_opencl_add_relu": ( - ["global.texture-weight", "global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global", "global.texture-weight"], ["global"], ), - "te_layout_transform3": (["global"], ["global.texture-weight"]), + "te_layout_transform3": (["global"], ["global"]), "conv2d_NCHWc_OIHWo1_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform4": (["global"], ["global"]), @@ -968,20 +968,20 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "te_layout_transform2": (["global"], ["global.texture-weight"]), "fused_conv2d_NCHWc_OIHWo_opencl_add_relu": ( - ["global.texture-weight", "global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global", "global.texture-weight"], ["global.texture-weight"], ), - "te_layout_transform3": (["global"], ["global.texture-weight"]), + "te_layout_transform3": (["global"], ["global"]), "conv2d_NCHWc_OIHWo1_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global.texture-weight"], ), - "te_layout_transform4": (["global"], ["global.texture-weight"]), + "te_layout_transform4": (["global"], ["global"]), "conv2d_NCHWc_OIHWo2_opencl": ( - ["global.texture-weight", "global.texture-weight"], + 
["global.texture-weight", "global"], ["global.texture-weight"], ), "concatenate": (["global.texture-weight", "global.texture-weight"], ["global"]), @@ -1041,23 +1041,23 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global.texture-weight"], ), "max_pool2d_opencl": (["global.texture-weight"], ["global.texture-weight"]), - "te_layout_transform2": (["global"], ["global.texture-weight"]), + "te_layout_transform2": (["global"], ["global"]), "conv2d_NCHWc_OIHWo2_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global.texture-weight"], ), "fused_conv2d_NCHWc_OIHWo1_opencl_add": ( - ["global.texture-weight", "global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global", "global.texture-weight"], ["global.texture-weight"], ), "fused_conv2d_NCHWc_OIHWo3_opencl_add": ( - ["global.texture-weight", "global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global", "global.texture-weight"], ["global"], ), "te_layout_transform3": (["global"], ["global"]), @@ -1111,9 +1111,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -1121,7 +1121,7 @@ def main( "fused_conv2d_NCHWc_OIHWo_opencl_add_multiply_add": ( [ "global.texture-weight", - "global.texture-weight", + "global", "global.texture-weight", "global.texture-weight", "global.texture-weight", @@ -1180,9 +1180,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -1190,7 +1190,7 @@ def main( "fused_conv2d_NCHWc_OIHWo_opencl_add_multiply_add": ( [ "global.texture-weight", - "global.texture-weight", + "global", "global.texture-weight", "global.texture-weight", ], diff --git a/tests/python/relax/backend/clml/test_op_exec_clml_codegen.py b/tests/python/relax/backend/clml/test_op_exec_clml_codegen.py index 4e5b4b652b45..d0a4a525adf7 100644 --- a/tests/python/relax/backend/clml/test_op_exec_clml_codegen.py +++ b/tests/python/relax/backend/clml/test_op_exec_clml_codegen.py @@ -20,6 +20,7 @@ import tvm import tvm.testing import json +import os from tvm import relax, rpc from tvm.script import relax as R @@ -73,7 +74,6 @@ def test_conv2d_offload( has_pad, is_depthwise, dtype, - rpc, ): low, high = 0, 1 data_shape = (1, *shape) @@ -117,7 +117,7 @@ def test_conv2d_offload( has_pad=has_pad, is_depthwise=is_depthwise, ) - run_compare(mod, inputs, params_np, rpc) + run_compare(mod, inputs, params_np) @tvm.testing.requires_openclml @@ -131,7 +131,7 @@ def test_conv2d_offload( [(1, 256, 1, 1), 1, 3e-4], ], ) -def test_batchnorm(dtype, trials, rpc): +def test_batchnorm(dtype, trials): low, high = 0, 
1 if clml.clml_sdk_version() < 3: print("Skip due to unsupported CLML version:", clml.clml_sdk_version()) @@ -159,7 +159,7 @@ def _get_axis_tuple(axis): inputs = [data] params_np = {"gamma": gamma, "beta": beta, "moving_mean": mean, "moving_var": variance} mod = get_batchnorm_mod(input_shape, channels, axis, epsilon, dtype) - run_compare(mod, inputs, params_np, rpc) + run_compare(mod, inputs, params_np) @tvm.testing.requires_openclml @@ -182,9 +182,9 @@ def _get_axis_tuple(axis): ], ) @tvm.testing.requires_openclml -def test_binary_ops(a_shape, b_shape, op, rpc, dtype): +def test_binary_ops(a_shape, b_shape, op, dtype): (mod, inputs) = get_binary_op_mod(a_shape, b_shape, op, dtype) - run_compare(mod, inputs, {}, rpc) + run_compare(mod, inputs, {}) @tvm.testing.requires_openclml @@ -204,9 +204,9 @@ def test_binary_ops(a_shape, b_shape, op, rpc, dtype): ], ) @tvm.testing.requires_openclml -def test_unary_ops(a_shape, op, rpc, dtype): +def test_unary_ops(a_shape, op, dtype): (mod, inputs) = get_unary_op_mod(a_shape, op, dtype) - run_compare(mod, inputs, {}, rpc) + run_compare(mod, inputs, {}) @tvm.testing.requires_openclml @@ -223,14 +223,14 @@ def test_unary_ops(a_shape, op, rpc, dtype): [(1, 32, 256, 256), (2, 2), (2, 2), (1, 1), (1, 0, 1, 0), True], ], ) -def test_max_pool(dtype, trials, rpc): +def test_max_pool(dtype, trials): low, high = -1, 1 (input_shape, pool_size, stride, dilation, padding, has_pad) = trials data = np.random.uniform(low, high, size=input_shape).astype(dtype) inputs = [data] mod = get_relax_maxpool_mod(input_shape, dtype, pool_size, stride, dilation, padding, has_pad) params_np = {} - run_compare(mod, inputs, params_np, rpc) + run_compare(mod, inputs, params_np) @tvm.testing.requires_openclml @@ -247,14 +247,14 @@ def test_max_pool(dtype, trials, rpc): [(1, 32, 256, 256), (2, 2), (2, 2), (1, 1), (1, 0, 1, 0), True], ], ) -def test_avg_pool(dtype, trials, rpc): +def test_avg_pool(dtype, trials): low, high = -1, 1 (input_shape, pool_size, stride, dilation, padding, has_pad) = trials data = np.random.uniform(low, high, size=input_shape).astype(dtype) inputs = [data] mod = get_relax_avgpool_mod(input_shape, dtype, pool_size, stride, dilation, padding, has_pad) params_np = {} - run_compare(mod, inputs, params_np, rpc) + run_compare(mod, inputs, params_np) @tvm.testing.requires_openclml @@ -267,14 +267,14 @@ def test_avg_pool(dtype, trials, rpc): [(1, 64, 3, 3), (1, 32, 3, -1)], ], ) -def test_reshape(dtype, trials, rpc): +def test_reshape(dtype, trials): low, high = -1, 1 (input_shape, output_shape) = trials data = np.random.uniform(low, high, size=input_shape).astype(dtype) inputs = [data] mod = get_relax_reshape_mod(input_shape, output_shape, dtype) params_np = {} - run_compare(mod, inputs, params_np, rpc) + run_compare(mod, inputs, params_np) @tvm.testing.requires_openclml @@ -288,7 +288,7 @@ def test_reshape(dtype, trials, rpc): [(1, 32, 256, 256), False], ], ) -def test_global_avg_pool(dtype, trials, rpc): +def test_global_avg_pool(dtype, trials): """Test function for global average pooling.""" low, high = -1, 1 (input_shape, keep_dims) = trials @@ -296,7 +296,7 @@ def test_global_avg_pool(dtype, trials, rpc): inputs = [data] mod = get_relax_global_avgpool_mod(input_shape, keep_dims, dtype) params_np = {} - run_compare(mod, inputs, params_np, rpc) + run_compare(mod, inputs, params_np) @tvm.testing.requires_openclml @@ -310,7 +310,7 @@ def test_global_avg_pool(dtype, trials, rpc): [(1, 32, 256, 256), False], ], ) -def test_global_max_pool(dtype, trials, rpc): +def 
test_global_max_pool(dtype, trials): """Test function for global average pooling.""" low, high = -1, 1 (input_shape, keep_dims) = trials @@ -322,7 +322,7 @@ def test_global_max_pool(dtype, trials, rpc): inputs = [data] mod = get_relax_global_maxpool_mod(input_shape, keep_dims, dtype) params_np = {} - run_compare(mod, inputs, params_np, rpc) + run_compare(mod, inputs, params_np) if __name__ == "__main__": diff --git a/tests/python/relax/backend/clml/utils.py b/tests/python/relax/backend/clml/utils.py index d32a2df38ffd..0f3516f96122 100644 --- a/tests/python/relax/backend/clml/utils.py +++ b/tests/python/relax/backend/clml/utils.py @@ -28,6 +28,25 @@ from tvm.contrib import utils, ndk from tvm.relax.backend.adreno.clml import OpenCLMLOffLoad +import os +from tvm import rpc as _rpc + + +def get_rpc(): + rpc_target = os.getenv("RPC_TARGET", None) + if rpc_target: + connection_type = "tracker" + host = os.getenv("TVM_TRACKER_HOST", "localhost") + port = int(os.getenv("TVM_TRACKER_PORT", 9090)) + target = "opencl" + target_host = "llvm -mtriple=aarch64-linux-gnu" + device_key = os.getenv("RPC_DEVICE_KEY", "android") + cross_compile = os.getenv("TVM_NDK_CC", "aarch64-linux-android-g++") + tracker = _rpc.connect_tracker(host, port) + return tracker.request(device_key, priority=1, session_timeout=1000) + else: + return None + def build_and_run( mod, @@ -35,40 +54,39 @@ def build_and_run( target, rpc=None, load_path="vm_library.so", - clml_enable=False, ): tgt = tvm.target.Target(target, host="llvm -mtriple=aarch64-linux-gnu") - pipeline = relax.pipeline.get_default_pipeline(tgt) - mod = pipeline(mod) - if rpc: - ex = tvm.compile(mod, tgt) - temp = utils.tempdir() - path = temp.relpath(load_path) - path = "./" + load_path - ex.export_library(path, fcompile=ndk.create_shared, options=["-shared", "-fPIC", "-lm"]) - rpc.upload(path) - rexec = rpc.load_module(load_path) - dev = rpc.cl(0) - vm = relax.VirtualMachine(rexec, dev) - else: - ex = tvm.compile(mod, target) - dev = tvm.device(target, 0) - vm = relax.VirtualMachine(ex, dev) + relax_pipeline = relax.pipeline.get_default_pipeline(tgt) + tir_pipeline = tvm.tir.get_default_tir_pipeline(tgt) + + ex = tvm.compile(mod, tgt, relax_pipeline=relax_pipeline, tir_pipeline=tir_pipeline) + temp = utils.tempdir() + path = temp.relpath(load_path) + path = "./" + load_path + ex.export_library(path, fcompile=ndk.create_shared, options=["-shared", "-fPIC", "-lm"]) + + rpc.upload(path) + rexec = rpc.load_module(load_path) + dev = rpc.cl(0) + vm = relax.VirtualMachine(rexec, dev) f = vm["main"] inputs = [tvm.runtime.tensor(inp, dev) for inp in inputs_np] vm.set_input("main", *inputs) vm.invoke_stateful("main") - tvm_output = vm.get_outputs("main") - return tvm_output.numpy() + tvm_output = vm.get_outputs("main").numpy() + if rpc: + rpc.get_function("CloseRPCConnection")() + return tvm_output -def run_compare(mod, inputs, params_np, rpc=None): +def run_compare(mod, inputs, params_np): clml_mod = copy.deepcopy(mod) mod = tvm.relax.transform.BindParams("main", params_np)(mod) clml_mod = tvm.relax.transform.BindParams("main", params_np)(clml_mod) - if not rpc: + rpc = get_rpc() + if rpc is None: return ref = build_and_run( @@ -78,12 +96,13 @@ def run_compare(mod, inputs, params_np, rpc=None): rpc=rpc, load_path="vm_library_opencl.so", ) + + rpc = get_rpc() out = build_and_run( clml_mod, inputs, - tvm.target.adreno(clml=True), + tvm.target.adreno(cfg="clml"), rpc=rpc, load_path="vm_library_clml.so", - clml_enable=True, ) np.testing.assert_allclose(out, ref, rtol=1e-5, 
atol=1e-5) diff --git a/tests/python/relax/test_transform.py b/tests/python/relax/test_transform.py index b0bec5e858af..78a52c66723e 100644 --- a/tests/python/relax/test_transform.py +++ b/tests/python/relax/test_transform.py @@ -441,8 +441,16 @@ def foo( R.Tensor((2, 3), "int32"), R.Tensor((2, 3), "int32"), R.Tensor((2, 3), dtype="int32") ): R.func_attr({"relax.force_pure": True}) - gv0 = R.builtin.alloc_tensor(R.shape([2, 3]), "int32", R.prim_value(0)) - gv1 = R.builtin.alloc_tensor(R.shape([2, 3]), "int32", R.prim_value(0)) + gv0: R.Tensor((2, 3), dtype="int32") = R.emit_with_sinfo( + "relax.builtin.alloc_tensor", + (R.shape([2, 3]), R.dtype("int32"), R.prim_value(0), R.str("global")), + (R.Tensor((2, 3), dtype="int32"),), + ) + gv1: R.Tensor((2, 3), dtype="int32") = R.emit_with_sinfo( + "relax.builtin.alloc_tensor", + (R.shape([2, 3]), R.dtype("int32"), R.prim_value(0), R.str("global")), + (R.Tensor((2, 3), dtype="int32"),), + ) _ = Expected.copy(x, y, z, gv0, gv1) gv2 = (x, gv0, gv1) return gv2 diff --git a/tests/python/relax/test_transform_legalize_ops_manipulate.py b/tests/python/relax/test_transform_legalize_ops_manipulate.py index ae865f1fb16b..17e0160c6183 100644 --- a/tests/python/relax/test_transform_legalize_ops_manipulate.py +++ b/tests/python/relax/test_transform_legalize_ops_manipulate.py @@ -1745,8 +1745,10 @@ def main( ): R.func_attr({"relax.force_pure": True}) cls = Expected - alloc: R.Tensor((4, 4), dtype="float32") = R.builtin.alloc_tensor( - R.shape([4, 4]), R.dtype("float32"), R.prim_value(0), R.str("global") + alloc: R.Tensor((4, 4), dtype="float32") = R.emit_with_sinfo( + "relax.builtin.alloc_tensor", + (R.shape([4, 4]), R.dtype("float32"), R.prim_value(0), R.str("global")), + (R.Tensor((4, 4), dtype="float32"),), ) cls.te_layout_transform(x, alloc) lv = alloc diff --git a/tests/python/relax/texture/adreno_utils.py b/tests/python/relax/texture/adreno_utils.py new file mode 100644 index 000000000000..b6ffd837145f --- /dev/null +++ b/tests/python/relax/texture/adreno_utils.py @@ -0,0 +1,195 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import tvm +import numpy as np +from tvm import relax +from tvm.contrib import utils, ndk +from tvm.script.parser import ir as I, relax as R, tir as T +from tvm.relax.transform.legalize_ops import adreno as legalize_adreno +from tvm.contrib import dlpack as dl +import tvm.testing +from tvm.rpc import connect_tracker + + +def get_target(backend, is_adreno=False): + """ + Get the target for the Adreno GPU. + + Returns + ------- + tvm.target.Target + The target for the Adreno GPU. 
+ """ + target = tvm.target.adreno(backend=backend) + if is_adreno: + target = tvm.target.adreno(cfg="texture", backend=backend) + return target + + +def get_rpc(): + """ + Establish an RPC connection to the remote device. + + Returns + ------- + tvm.rpc.RPCSession or None + The RPC session object if RPC_TARGET is set; otherwise, None. + """ + rpc_target = os.getenv("RPC_TARGET", None) + if rpc_target: + host = os.getenv("TVM_TRACKER_HOST", "localhost") + port = int(os.getenv("TVM_TRACKER_PORT", 9090)) + device_key = os.getenv("RPC_DEVICE_KEY", "android") + tracker = connect_tracker(host, port) + return tracker.request(device_key, priority=1, session_timeout=1000) + else: + return None + + +def get_unique_dso_lib(): + """ + Generate a unique shared library filename based on environment variables. + + Returns + ------- + str + The unique shared library filename. + """ + rpc_tracker_port = os.getenv("TVM_TRACKER_PORT", "") + device_port = os.getenv("DEVICE_LISTEN_PORT", "") + return f"dev_lib_cl-{rpc_tracker_port}-{device_port}.so" + + +def run_cpu(mod, inputs, save_lib=False): + """ + Run the Relax module on the local CPU for verification. + + Parameters + ---------- + mod : tvm.IRModule + The Relax IRModule to execute. + inputs : list of numpy.ndarray + The input data for the module. + save_lib : bool, optional + Whether to save the compiled library. Default is False. + + Returns + ------- + tvm.runtime.NDArray or tuple of tvm.runtime.NDArray + The output from the module execution. + """ + print("Running on local CPU for verification") + target = tvm.target.Target("llvm") + ex = relax.build(mod, target) + if save_lib: + ex.export_library("mod.so") + dev = tvm.cpu() + vm = relax.VirtualMachine(ex, dev) + inputs = [tvm.nd.array(inp, dev) for inp in inputs] + vm.set_input("main", *inputs) + vm.invoke_stateful("main") + tvm_output = vm.get_outputs("main") + return tvm_output + + +def build_run(mod, inputs, backend, is_adreno=False): + remote = get_rpc() + target = get_target(backend, is_adreno) + if remote is None: + tgt = tvm.target.Target(target, host="llvm") + else: + tgt = tvm.target.Target(target, host="llvm -mtriple=aarch64-linux-gnu") + relax_pipeline = relax.pipeline.get_default_pipeline(tgt) + tir_pipeline = tvm.tir.get_default_tir_pipeline(tgt) + mod = relax_pipeline(mod) + ex = tvm.compile(mod, tgt, tir_pipeline=tir_pipeline) + + if remote is None: + # local execution + if "opencl" in backend: + dev = tvm.opencl(0) + elif "vulkan" in backend: + dev = tvm.vulkan(0) + else: + raise RuntimeError("Unsupported backend") + + if "vdevice" in mod.global_infos: + device_arr = [dev for ii in range(len(mod.global_infos["vdevice"]))] + else: + device_arr = [dev] + vm = relax.VirtualMachine(ex, device_arr) + else: + # remote execution + temp = utils.tempdir() + filename = get_unique_dso_lib() + file_path = temp.relpath(filename) + ex.export_library( + file_path, fcompile=ndk.create_shared, options=["-shared", "-fPIC", "-lm"] + ) + + remote.upload(file_path) + rexec = remote.load_module(filename) + + if "opencl" in backend: + dev = remote.cl(0) + elif "vulkan" in backend: + dev = remote.vulkan(0) + else: + raise RuntimeError("Unsupported backend") + + if "vdevice" in mod.global_infos: + device_arr = [dev for ii in range(len(mod.global_infos["vdevice"]))] + else: + device_arr = [dev] + + vm = relax.VirtualMachine(rexec, device_arr) + + inputs = [tvm.runtime.tensor(inp, dev) for inp in inputs] + vm.set_input("main", *inputs) + vm.invoke_stateful("main") + tvm_output = vm.get_outputs("main") + + if 
isinstance(tvm_output, tuple): + tvm_output = tuple(out.numpy() for out in tvm_output) + else: + tvm_output = tvm_output.numpy() + + if remote: + remote.get_function("CloseRPCConnection")() + return tvm_output + + +def verify(mod, backend): + + if backend not in ["opencl", "vulkan"]: + raise ValueError(f"Unsupported API: {backend}. Must be 'opencl' or 'vulkan'.") + + inputs = [] + for arg in mod["main"].params: + shape = tuple(shape_val.value for shape_val in arg.struct_info.shape.values) + inputs.append(np.random.uniform(0, 1, size=shape).astype(arg.struct_info.dtype)) + + ret1 = build_run(mod, inputs, backend, True) + ret2 = build_run(mod, inputs, backend) + + if isinstance(ret1, tuple): + for val1, val2 in zip(ret1, ret2): + tvm.testing.assert_allclose(val1, val2, rtol=1e-3, atol=1e-3) + else: + tvm.testing.assert_allclose(ret1, ret2, rtol=1e-3, atol=1e-3) diff --git a/tests/python/relax/texture/test_network.py b/tests/python/relax/texture/test_network.py new file mode 100644 index 000000000000..aeb4f1248c10 --- /dev/null +++ b/tests/python/relax/texture/test_network.py @@ -0,0 +1,805 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
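Before the network-level test that follows, here is a minimal usage sketch of the adreno_utils helpers above: verify() builds the module twice, once with the texture-enabled Adreno target and once without, and compares the outputs. The single-conv2d module below is hypothetical and only illustrates the entry point; actually running it needs a local OpenCL device or the RPC_TARGET / TVM_TRACKER_HOST / TVM_TRACKER_PORT / RPC_DEVICE_KEY environment consumed by get_rpc().

# Hypothetical module used only to illustrate verify() from adreno_utils.py;
# it is not part of the test suite in this patch.
import tvm
from tvm.script import ir as I, relax as R

@I.ir_module
class SingleConv:
    @R.function
    def main(
        x: R.Tensor((1, 16, 28, 28), "float32"),
        w: R.Tensor((32, 16, 3, 3), "float32"),
    ) -> R.Tensor((1, 32, 26, 26), "float32"):
        with R.dataflow():
            gv = R.nn.conv2d(x, w, out_dtype="float32")
            R.output(gv)
        return gv

# Compiles the module with and without the Adreno texture config and asserts
# the two results agree (requires an OpenCL device or the RPC setup above):
# verify(SingleConv, "opencl")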
+ +import pytest +import numpy as np +import tvm +import tvm.testing +import onnx +import copy + +from tvm import relax +from tvm.script import relax as R +from tvm.script import ir as I +from tvm.script import tir as T +from adreno_utils import verify +from tvm.script.ir_builder import IRBuilder +from tvm.script.ir_builder import relax as relax_builder +from tvm.relax.frontend.onnx import from_onnx +from tvm.relax.transform.legalize_ops import adreno as legalize_adreno + +import pytest +import json +import copy + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_network_resnet(target): + @I.ir_module + class Resnet: + @R.function + def main( + data: R.Tensor((1, 3, 224, 224), dtype="float32"), + resnetv22_batchnorm0_gamma: R.Tensor((3,), dtype="float32"), + resnetv22_batchnorm0_beta: R.Tensor((3,), dtype="float32"), + resnetv22_batchnorm0_running_mea: R.Tensor((3,), dtype="float32"), + resnetv22_batchnorm0_running_var: R.Tensor((3,), dtype="float32"), + resnetv22_conv0_weight: R.Tensor((64, 3, 7, 7), dtype="float32"), + resnetv22_batchnorm1_gamma: R.Tensor((64,), dtype="float32"), + resnetv22_batchnorm1_beta: R.Tensor((64,), dtype="float32"), + resnetv22_batchnorm1_running_mea: R.Tensor((64,), dtype="float32"), + resnetv22_batchnorm1_running_var: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm0_gamma: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm0_beta: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm0_running_mea: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm0_running_var: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_conv0_weight: R.Tensor((64, 64, 3, 3), dtype="float32"), + resnetv22_stage1_batchnorm1_gamma: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm1_beta: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm1_running_mea: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm1_running_var: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_conv1_weight: R.Tensor((64, 64, 3, 3), dtype="float32"), + resnetv22_stage1_batchnorm2_gamma: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm2_beta: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm2_running_mea: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm2_running_var: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_conv2_weight: R.Tensor((64, 64, 3, 3), dtype="float32"), + resnetv22_stage1_batchnorm3_gamma: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm3_beta: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm3_running_mea: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm3_running_var: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_conv3_weight: R.Tensor((64, 64, 3, 3), dtype="float32"), + resnetv22_stage2_batchnorm0_gamma: R.Tensor((64,), dtype="float32"), + resnetv22_stage2_batchnorm0_beta: R.Tensor((64,), dtype="float32"), + resnetv22_stage2_batchnorm0_running_mea: R.Tensor((64,), dtype="float32"), + resnetv22_stage2_batchnorm0_running_var: R.Tensor((64,), dtype="float32"), + resnetv22_stage2_conv0_weight: R.Tensor((128, 64, 3, 3), dtype="float32"), + resnetv22_stage2_batchnorm1_gamma: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_batchnorm1_beta: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_batchnorm1_running_mea: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_batchnorm1_running_var: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_conv1_weight: R.Tensor((128, 128, 3, 3), 
dtype="float32"), + resnetv22_stage2_conv2_weight: R.Tensor((128, 64, 1, 1), dtype="float32"), + resnetv22_stage2_batchnorm2_gamma: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_batchnorm2_beta: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_batchnorm2_running_mea: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_batchnorm2_running_var: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_conv3_weight: R.Tensor((128, 128, 3, 3), dtype="float32"), + resnetv22_stage2_batchnorm3_gamma: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_batchnorm3_beta: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_batchnorm3_running_mea: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_batchnorm3_running_var: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_conv4_weight: R.Tensor((128, 128, 3, 3), dtype="float32"), + resnetv22_stage3_batchnorm0_gamma: R.Tensor((128,), dtype="float32"), + resnetv22_stage3_batchnorm0_beta: R.Tensor((128,), dtype="float32"), + resnetv22_stage3_batchnorm0_running_mea: R.Tensor((128,), dtype="float32"), + resnetv22_stage3_batchnorm0_running_var: R.Tensor((128,), dtype="float32"), + resnetv22_stage3_conv0_weight: R.Tensor((256, 128, 3, 3), dtype="float32"), + resnetv22_stage3_batchnorm1_gamma: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_batchnorm1_beta: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_batchnorm1_running_mea: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_batchnorm1_running_var: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_conv1_weight: R.Tensor((256, 256, 3, 3), dtype="float32"), + resnetv22_stage3_conv2_weight: R.Tensor((256, 128, 1, 1), dtype="float32"), + resnetv22_stage3_batchnorm2_gamma: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_batchnorm2_beta: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_batchnorm2_running_mea: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_batchnorm2_running_var: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_conv3_weight: R.Tensor((256, 256, 3, 3), dtype="float32"), + resnetv22_stage3_batchnorm3_gamma: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_batchnorm3_beta: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_batchnorm3_running_mea: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_batchnorm3_running_var: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_conv4_weight: R.Tensor((256, 256, 3, 3), dtype="float32"), + resnetv22_stage4_batchnorm0_gamma: R.Tensor((256,), dtype="float32"), + resnetv22_stage4_batchnorm0_beta: R.Tensor((256,), dtype="float32"), + resnetv22_stage4_batchnorm0_running_mea: R.Tensor((256,), dtype="float32"), + resnetv22_stage4_batchnorm0_running_var: R.Tensor((256,), dtype="float32"), + resnetv22_stage4_conv0_weight: R.Tensor((512, 256, 3, 3), dtype="float32"), + resnetv22_stage4_batchnorm1_gamma: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_batchnorm1_beta: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_batchnorm1_running_mea: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_batchnorm1_running_var: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_conv1_weight: R.Tensor((512, 512, 3, 3), dtype="float32"), + resnetv22_stage4_conv2_weight: R.Tensor((512, 256, 1, 1), dtype="float32"), + resnetv22_stage4_batchnorm2_gamma: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_batchnorm2_beta: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_batchnorm2_running_mea: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_batchnorm2_running_var: R.Tensor((512,), 
dtype="float32"), + resnetv22_stage4_conv3_weight: R.Tensor((512, 512, 3, 3), dtype="float32"), + resnetv22_stage4_batchnorm3_gamma: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_batchnorm3_beta: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_batchnorm3_running_mea: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_batchnorm3_running_var: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_conv4_weight: R.Tensor((512, 512, 3, 3), dtype="float32"), + resnetv22_batchnorm2_gamma: R.Tensor((512,), dtype="float32"), + resnetv22_batchnorm2_beta: R.Tensor((512,), dtype="float32"), + resnetv22_batchnorm2_running_mea: R.Tensor((512,), dtype="float32"), + resnetv22_batchnorm2_running_var: R.Tensor((512,), dtype="float32"), + reshape_attr_tensor164: R.Tensor((2,), dtype="int64"), + resnetv22_dense0_weight: R.Tensor((1000, 512), dtype="float32"), + resnetv22_dense0_bias: R.Tensor((1000,), dtype="float32"), + ) -> R.Tensor((1, 1000), dtype="float32"): + with R.dataflow(): + lv: R.Tuple( + R.Tensor((1, 3, 224, 224), dtype="float32"), + R.Tensor((3,), dtype="float32"), + R.Tensor((3,), dtype="float32"), + ) = R.nn.batch_norm( + data, + resnetv22_batchnorm0_gamma, + resnetv22_batchnorm0_beta, + resnetv22_batchnorm0_running_mea, + resnetv22_batchnorm0_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv1: R.Tensor((1, 3, 224, 224), dtype="float32") = lv[0] + lv2: R.Tensor((3,), dtype="float32") = lv[1] + lv3: R.Tensor((3,), dtype="float32") = lv[2] + lv4: R.Tensor((1, 64, 112, 112), dtype="float32") = R.nn.conv2d( + lv1, + resnetv22_conv0_weight, + strides=[2, 2], + padding=[3, 3, 3, 3], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv5: R.Tuple( + R.Tensor((1, 64, 112, 112), dtype="float32"), + R.Tensor((64,), dtype="float32"), + R.Tensor((64,), dtype="float32"), + ) = R.nn.batch_norm( + lv4, + resnetv22_batchnorm1_gamma, + resnetv22_batchnorm1_beta, + resnetv22_batchnorm1_running_mea, + resnetv22_batchnorm1_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv6: R.Tensor((1, 64, 112, 112), dtype="float32") = lv5[0] + lv7: R.Tensor((64,), dtype="float32") = lv5[1] + lv8: R.Tensor((64,), dtype="float32") = lv5[2] + lv9: R.Tensor((1, 64, 112, 112), dtype="float32") = R.nn.relu(lv6) + lv10: R.Tensor((1, 64, 56, 56), dtype="float32") = R.nn.max_pool2d( + lv9, + pool_size=[3, 3], + strides=[2, 2], + dilation=[1, 1], + padding=[1, 1, 1, 1], + ceil_mode=False, + count_include_pad=False, + layout="NCHW", + out_layout="NCHW", + ) + lv11: R.Tuple( + R.Tensor((1, 64, 56, 56), dtype="float32"), + R.Tensor((64,), dtype="float32"), + R.Tensor((64,), dtype="float32"), + ) = R.nn.batch_norm( + lv10, + resnetv22_stage1_batchnorm0_gamma, + resnetv22_stage1_batchnorm0_beta, + resnetv22_stage1_batchnorm0_running_mea, + resnetv22_stage1_batchnorm0_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv12: R.Tensor((1, 64, 56, 56), dtype="float32") = lv11[0] + lv13: R.Tensor((64,), dtype="float32") = lv11[1] + lv14: R.Tensor((64,), dtype="float32") = lv11[2] + lv15: R.Tensor((1, 64, 56, 56), dtype="float32") = R.nn.relu(lv12) + lv16: R.Tensor((1, 64, 56, 56), dtype="float32") = R.nn.conv2d( + lv15, + resnetv22_stage1_conv0_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + 
data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv17: R.Tuple( + R.Tensor((1, 64, 56, 56), dtype="float32"), + R.Tensor((64,), dtype="float32"), + R.Tensor((64,), dtype="float32"), + ) = R.nn.batch_norm( + lv16, + resnetv22_stage1_batchnorm1_gamma, + resnetv22_stage1_batchnorm1_beta, + resnetv22_stage1_batchnorm1_running_mea, + resnetv22_stage1_batchnorm1_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv18: R.Tensor((1, 64, 56, 56), dtype="float32") = lv17[0] + lv19: R.Tensor((64,), dtype="float32") = lv17[1] + lv20: R.Tensor((64,), dtype="float32") = lv17[2] + lv21: R.Tensor((1, 64, 56, 56), dtype="float32") = R.nn.relu(lv18) + lv22: R.Tensor((1, 64, 56, 56), dtype="float32") = R.nn.conv2d( + lv21, + resnetv22_stage1_conv1_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv23: R.Tensor((1, 64, 56, 56), dtype="float32") = R.add(lv22, lv10) + lv24: R.Tuple( + R.Tensor((1, 64, 56, 56), dtype="float32"), + R.Tensor((64,), dtype="float32"), + R.Tensor((64,), dtype="float32"), + ) = R.nn.batch_norm( + lv23, + resnetv22_stage1_batchnorm2_gamma, + resnetv22_stage1_batchnorm2_beta, + resnetv22_stage1_batchnorm2_running_mea, + resnetv22_stage1_batchnorm2_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv25: R.Tensor((1, 64, 56, 56), dtype="float32") = lv24[0] + lv26: R.Tensor((64,), dtype="float32") = lv24[1] + lv27: R.Tensor((64,), dtype="float32") = lv24[2] + lv28: R.Tensor((1, 64, 56, 56), dtype="float32") = R.nn.relu(lv25) + lv29: R.Tensor((1, 64, 56, 56), dtype="float32") = R.nn.conv2d( + lv28, + resnetv22_stage1_conv2_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv30: R.Tuple( + R.Tensor((1, 64, 56, 56), dtype="float32"), + R.Tensor((64,), dtype="float32"), + R.Tensor((64,), dtype="float32"), + ) = R.nn.batch_norm( + lv29, + resnetv22_stage1_batchnorm3_gamma, + resnetv22_stage1_batchnorm3_beta, + resnetv22_stage1_batchnorm3_running_mea, + resnetv22_stage1_batchnorm3_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv31: R.Tensor((1, 64, 56, 56), dtype="float32") = lv30[0] + lv32: R.Tensor((64,), dtype="float32") = lv30[1] + lv33: R.Tensor((64,), dtype="float32") = lv30[2] + lv34: R.Tensor((1, 64, 56, 56), dtype="float32") = R.nn.relu(lv31) + lv35: R.Tensor((1, 64, 56, 56), dtype="float32") = R.nn.conv2d( + lv34, + resnetv22_stage1_conv3_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv36: R.Tensor((1, 64, 56, 56), dtype="float32") = R.add(lv35, lv23) + lv37: R.Tuple( + R.Tensor((1, 64, 56, 56), dtype="float32"), + R.Tensor((64,), dtype="float32"), + R.Tensor((64,), dtype="float32"), + ) = R.nn.batch_norm( + lv36, + resnetv22_stage2_batchnorm0_gamma, + resnetv22_stage2_batchnorm0_beta, + resnetv22_stage2_batchnorm0_running_mea, + resnetv22_stage2_batchnorm0_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv38: R.Tensor((1, 64, 56, 56), dtype="float32") = lv37[0] + lv39: 
R.Tensor((64,), dtype="float32") = lv37[1] + lv40: R.Tensor((64,), dtype="float32") = lv37[2] + lv41: R.Tensor((1, 64, 56, 56), dtype="float32") = R.nn.relu(lv38) + lv42: R.Tensor((1, 128, 28, 28), dtype="float32") = R.nn.conv2d( + lv41, + resnetv22_stage2_conv0_weight, + strides=[2, 2], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv43: R.Tuple( + R.Tensor((1, 128, 28, 28), dtype="float32"), + R.Tensor((128,), dtype="float32"), + R.Tensor((128,), dtype="float32"), + ) = R.nn.batch_norm( + lv42, + resnetv22_stage2_batchnorm1_gamma, + resnetv22_stage2_batchnorm1_beta, + resnetv22_stage2_batchnorm1_running_mea, + resnetv22_stage2_batchnorm1_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv44: R.Tensor((1, 128, 28, 28), dtype="float32") = lv43[0] + lv45: R.Tensor((128,), dtype="float32") = lv43[1] + lv46: R.Tensor((128,), dtype="float32") = lv43[2] + lv47: R.Tensor((1, 128, 28, 28), dtype="float32") = R.nn.relu(lv44) + lv48: R.Tensor((1, 128, 28, 28), dtype="float32") = R.nn.conv2d( + lv47, + resnetv22_stage2_conv1_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv49: R.Tensor((1, 128, 28, 28), dtype="float32") = R.nn.conv2d( + lv41, + resnetv22_stage2_conv2_weight, + strides=[2, 2], + padding=[0, 0, 0, 0], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv50: R.Tensor((1, 128, 28, 28), dtype="float32") = R.add(lv48, lv49) + lv51: R.Tuple( + R.Tensor((1, 128, 28, 28), dtype="float32"), + R.Tensor((128,), dtype="float32"), + R.Tensor((128,), dtype="float32"), + ) = R.nn.batch_norm( + lv50, + resnetv22_stage2_batchnorm2_gamma, + resnetv22_stage2_batchnorm2_beta, + resnetv22_stage2_batchnorm2_running_mea, + resnetv22_stage2_batchnorm2_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv52: R.Tensor((1, 128, 28, 28), dtype="float32") = lv51[0] + lv53: R.Tensor((128,), dtype="float32") = lv51[1] + lv54: R.Tensor((128,), dtype="float32") = lv51[2] + lv55: R.Tensor((1, 128, 28, 28), dtype="float32") = R.nn.relu(lv52) + lv56: R.Tensor((1, 128, 28, 28), dtype="float32") = R.nn.conv2d( + lv55, + resnetv22_stage2_conv3_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv57: R.Tuple( + R.Tensor((1, 128, 28, 28), dtype="float32"), + R.Tensor((128,), dtype="float32"), + R.Tensor((128,), dtype="float32"), + ) = R.nn.batch_norm( + lv56, + resnetv22_stage2_batchnorm3_gamma, + resnetv22_stage2_batchnorm3_beta, + resnetv22_stage2_batchnorm3_running_mea, + resnetv22_stage2_batchnorm3_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv58: R.Tensor((1, 128, 28, 28), dtype="float32") = lv57[0] + lv59: R.Tensor((128,), dtype="float32") = lv57[1] + lv60: R.Tensor((128,), dtype="float32") = lv57[2] + lv61: R.Tensor((1, 128, 28, 28), dtype="float32") = R.nn.relu(lv58) + lv62: R.Tensor((1, 128, 28, 28), dtype="float32") = R.nn.conv2d( + lv61, + resnetv22_stage2_conv4_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + 
kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv63: R.Tensor((1, 128, 28, 28), dtype="float32") = R.add(lv62, lv50) + lv64: R.Tuple( + R.Tensor((1, 128, 28, 28), dtype="float32"), + R.Tensor((128,), dtype="float32"), + R.Tensor((128,), dtype="float32"), + ) = R.nn.batch_norm( + lv63, + resnetv22_stage3_batchnorm0_gamma, + resnetv22_stage3_batchnorm0_beta, + resnetv22_stage3_batchnorm0_running_mea, + resnetv22_stage3_batchnorm0_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv65: R.Tensor((1, 128, 28, 28), dtype="float32") = lv64[0] + lv66: R.Tensor((128,), dtype="float32") = lv64[1] + lv67: R.Tensor((128,), dtype="float32") = lv64[2] + lv68: R.Tensor((1, 128, 28, 28), dtype="float32") = R.nn.relu(lv65) + lv69: R.Tensor((1, 256, 14, 14), dtype="float32") = R.nn.conv2d( + lv68, + resnetv22_stage3_conv0_weight, + strides=[2, 2], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv70: R.Tuple( + R.Tensor((1, 256, 14, 14), dtype="float32"), + R.Tensor((256,), dtype="float32"), + R.Tensor((256,), dtype="float32"), + ) = R.nn.batch_norm( + lv69, + resnetv22_stage3_batchnorm1_gamma, + resnetv22_stage3_batchnorm1_beta, + resnetv22_stage3_batchnorm1_running_mea, + resnetv22_stage3_batchnorm1_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv71: R.Tensor((1, 256, 14, 14), dtype="float32") = lv70[0] + lv72: R.Tensor((256,), dtype="float32") = lv70[1] + lv73: R.Tensor((256,), dtype="float32") = lv70[2] + lv74: R.Tensor((1, 256, 14, 14), dtype="float32") = R.nn.relu(lv71) + lv75: R.Tensor((1, 256, 14, 14), dtype="float32") = R.nn.conv2d( + lv74, + resnetv22_stage3_conv1_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv76: R.Tensor((1, 256, 14, 14), dtype="float32") = R.nn.conv2d( + lv68, + resnetv22_stage3_conv2_weight, + strides=[2, 2], + padding=[0, 0, 0, 0], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv77: R.Tensor((1, 256, 14, 14), dtype="float32") = R.add(lv75, lv76) + lv78: R.Tuple( + R.Tensor((1, 256, 14, 14), dtype="float32"), + R.Tensor((256,), dtype="float32"), + R.Tensor((256,), dtype="float32"), + ) = R.nn.batch_norm( + lv77, + resnetv22_stage3_batchnorm2_gamma, + resnetv22_stage3_batchnorm2_beta, + resnetv22_stage3_batchnorm2_running_mea, + resnetv22_stage3_batchnorm2_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv79: R.Tensor((1, 256, 14, 14), dtype="float32") = lv78[0] + lv80: R.Tensor((256,), dtype="float32") = lv78[1] + lv81: R.Tensor((256,), dtype="float32") = lv78[2] + lv82: R.Tensor((1, 256, 14, 14), dtype="float32") = R.nn.relu(lv79) + lv83: R.Tensor((1, 256, 14, 14), dtype="float32") = R.nn.conv2d( + lv82, + resnetv22_stage3_conv3_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv84: R.Tuple( + R.Tensor((1, 256, 14, 14), dtype="float32"), + R.Tensor((256,), dtype="float32"), + R.Tensor((256,), dtype="float32"), + ) = R.nn.batch_norm( + lv83, + resnetv22_stage3_batchnorm3_gamma, + 
resnetv22_stage3_batchnorm3_beta, + resnetv22_stage3_batchnorm3_running_mea, + resnetv22_stage3_batchnorm3_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv85: R.Tensor((1, 256, 14, 14), dtype="float32") = lv84[0] + lv86: R.Tensor((256,), dtype="float32") = lv84[1] + lv87: R.Tensor((256,), dtype="float32") = lv84[2] + lv88: R.Tensor((1, 256, 14, 14), dtype="float32") = R.nn.relu(lv85) + lv89: R.Tensor((1, 256, 14, 14), dtype="float32") = R.nn.conv2d( + lv88, + resnetv22_stage3_conv4_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv90: R.Tensor((1, 256, 14, 14), dtype="float32") = R.add(lv89, lv77) + lv91: R.Tuple( + R.Tensor((1, 256, 14, 14), dtype="float32"), + R.Tensor((256,), dtype="float32"), + R.Tensor((256,), dtype="float32"), + ) = R.nn.batch_norm( + lv90, + resnetv22_stage4_batchnorm0_gamma, + resnetv22_stage4_batchnorm0_beta, + resnetv22_stage4_batchnorm0_running_mea, + resnetv22_stage4_batchnorm0_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv92: R.Tensor((1, 256, 14, 14), dtype="float32") = lv91[0] + lv93: R.Tensor((256,), dtype="float32") = lv91[1] + lv94: R.Tensor((256,), dtype="float32") = lv91[2] + lv95: R.Tensor((1, 256, 14, 14), dtype="float32") = R.nn.relu(lv92) + lv96: R.Tensor((1, 512, 7, 7), dtype="float32") = R.nn.conv2d( + lv95, + resnetv22_stage4_conv0_weight, + strides=[2, 2], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv97: R.Tuple( + R.Tensor((1, 512, 7, 7), dtype="float32"), + R.Tensor((512,), dtype="float32"), + R.Tensor((512,), dtype="float32"), + ) = R.nn.batch_norm( + lv96, + resnetv22_stage4_batchnorm1_gamma, + resnetv22_stage4_batchnorm1_beta, + resnetv22_stage4_batchnorm1_running_mea, + resnetv22_stage4_batchnorm1_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv98: R.Tensor((1, 512, 7, 7), dtype="float32") = lv97[0] + lv99: R.Tensor((512,), dtype="float32") = lv97[1] + lv100: R.Tensor((512,), dtype="float32") = lv97[2] + lv101: R.Tensor((1, 512, 7, 7), dtype="float32") = R.nn.relu(lv98) + lv102: R.Tensor((1, 512, 7, 7), dtype="float32") = R.nn.conv2d( + lv101, + resnetv22_stage4_conv1_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv103: R.Tensor((1, 512, 7, 7), dtype="float32") = R.nn.conv2d( + lv95, + resnetv22_stage4_conv2_weight, + strides=[2, 2], + padding=[0, 0, 0, 0], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv104: R.Tensor((1, 512, 7, 7), dtype="float32") = R.add(lv102, lv103) + lv105: R.Tuple( + R.Tensor((1, 512, 7, 7), dtype="float32"), + R.Tensor((512,), dtype="float32"), + R.Tensor((512,), dtype="float32"), + ) = R.nn.batch_norm( + lv104, + resnetv22_stage4_batchnorm2_gamma, + resnetv22_stage4_batchnorm2_beta, + resnetv22_stage4_batchnorm2_running_mea, + resnetv22_stage4_batchnorm2_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv106: R.Tensor((1, 512, 7, 7), dtype="float32") = lv105[0] + lv107: 
R.Tensor((512,), dtype="float32") = lv105[1] + lv108: R.Tensor((512,), dtype="float32") = lv105[2] + lv109: R.Tensor((1, 512, 7, 7), dtype="float32") = R.nn.relu(lv106) + lv110: R.Tensor((1, 512, 7, 7), dtype="float32") = R.nn.conv2d( + lv109, + resnetv22_stage4_conv3_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv111: R.Tuple( + R.Tensor((1, 512, 7, 7), dtype="float32"), + R.Tensor((512,), dtype="float32"), + R.Tensor((512,), dtype="float32"), + ) = R.nn.batch_norm( + lv110, + resnetv22_stage4_batchnorm3_gamma, + resnetv22_stage4_batchnorm3_beta, + resnetv22_stage4_batchnorm3_running_mea, + resnetv22_stage4_batchnorm3_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv112: R.Tensor((1, 512, 7, 7), dtype="float32") = lv111[0] + lv113: R.Tensor((512,), dtype="float32") = lv111[1] + lv114: R.Tensor((512,), dtype="float32") = lv111[2] + lv115: R.Tensor((1, 512, 7, 7), dtype="float32") = R.nn.relu(lv112) + lv116: R.Tensor((1, 512, 7, 7), dtype="float32") = R.nn.conv2d( + lv115, + resnetv22_stage4_conv4_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv117: R.Tensor((1, 512, 7, 7), dtype="float32") = R.add(lv116, lv104) + lv118: R.Tuple( + R.Tensor((1, 512, 7, 7), dtype="float32"), + R.Tensor((512,), dtype="float32"), + R.Tensor((512,), dtype="float32"), + ) = R.nn.batch_norm( + lv117, + resnetv22_batchnorm2_gamma, + resnetv22_batchnorm2_beta, + resnetv22_batchnorm2_running_mea, + resnetv22_batchnorm2_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv119: R.Tensor((1, 512, 7, 7), dtype="float32") = lv118[0] + lv120: R.Tensor((512,), dtype="float32") = lv118[1] + lv121: R.Tensor((512,), dtype="float32") = lv118[2] + lv122: R.Tensor((1, 512, 7, 7), dtype="float32") = R.nn.relu(lv119) + lv123: R.Tensor((1, 512, 1, 1), dtype="float32") = R.mean( + lv122, axis=[2, 3], keepdims=True + ) + lv124: R.Tensor((1, 512), dtype="float32") = R.reshape(lv123, R.shape([1, 512])) + lv125: R.Tensor((512, 1000), dtype="float32") = R.permute_dims( + resnetv22_dense0_weight, axes=[1, 0] + ) + lv126: R.Tensor((1, 1000), dtype="float32") = R.matmul( + lv124, lv125, out_dtype="void" + ) + gv: R.Tensor((1, 1000), dtype="float32") = R.add(lv126, resnetv22_dense0_bias) + R.output(gv) + return gv + + verify(Resnet, target) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/relax/texture/test_ops.py b/tests/python/relax/texture/test_ops.py new file mode 100644 index 000000000000..7011852aa3ab --- /dev/null +++ b/tests/python/relax/texture/test_ops.py @@ -0,0 +1,714 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import tvm +import tvm.testing +from tvm.relax.transform import ConvertLayout, Normalize +from tvm.script.parser import ir as I, relax as R, tir as T +from tvm.relax.transform.legalize_ops import adreno as legalize_adreno +from adreno_utils import verify + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 64, 56, 56), "float32"), w: R.Tensor((32, 64, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 32, 54, 54), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + R.output(gv) + return gv + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_relu(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 26, 26), "float32") = R.nn.relu(gv) + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_relu_conv2d_relu(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + x0: R.Tensor((2, 16, 28, 28), "float32") = R.nn.relu(x) + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x0, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 26, 26), "float32") = R.nn.relu(gv) + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_relu_tanh(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 26, 26), "float32") = R.nn.relu(gv) + gv3: R.Tensor((2, 4, 26, 26), "float32") = R.tanh(gv2) + R.output(gv3) + return gv3 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_add(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), + w: R.Tensor((4, 16, 3, 3), "float32"), + bias: R.Tensor((2, 4, 26, 26), "float32"), + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 26, 26), "float32") = R.add(gv, bias) + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_sum(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), 
"float32") + ) -> R.Tensor(None, "float32", ndim=2): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4), "float32") = R.sum(gv, axis=[2, 3]) + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_sum_keepdims(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=2): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 1, 1), "float32") = R.sum(gv, axis=[2, 3], keepdims=True) + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_sum_reduce(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=2): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 26), "float32") = R.sum(gv, axis=[1, 2]) + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_transpose(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((26, 26, 4, 2), "float32") = R.permute_dims(gv, axes=[3, 2, 1, 0]) + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_expand_dims(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=6): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 1, 4, 1, 26, 26), "float32") = R.expand_dims(gv, axis=(-3, 1)) + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_squeeze(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((1, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=3): + with R.dataflow(): + gv: R.Tensor((1, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((4, 26, 26), "float32") = R.squeeze(gv, axis=[0]) + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_strided_slice(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 2, 9, 7), dtype="float32") = R.strided_slice( + gv, begin=[0, 0, 0], end=[4, 26, 26], strides=[2, 3, 4], axes=[1, 2, 3] + ) + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl 
+@tvm.testing.parametrize_targets("opencl") +def test_conv2d_relu_concat(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 26, 26), "float32") = R.nn.relu(gv) + gv3: R.Tensor((2, 8, 26, 26), "float32") = R.concat((gv, gv2), axis=1) + R.output(gv3) + return gv3 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_relu_concat_split(target): + @I.ir_module + class Input: + @R.function + def main(x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32")): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 26, 26), "float32") = R.nn.relu(gv) + gv3: R.Tensor((2, 8, 26, 26), "float32") = R.concat((gv, gv2), axis=1) + gv4 = R.split(gv3, indices_or_sections=2, axis=1) + # TODO @Siva: Multi value return have an issue at runtime. + gv5 = gv4[0] + R.output(gv5) + return gv5 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_relu_concat_split_transpose_concat(target): + @I.ir_module + class Input: + @R.function + def main(x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32")): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 26, 26), "float32") = R.nn.relu(gv) + gv3: R.Tensor((2, 8, 26, 26), "float32") = R.concat((gv, gv2), axis=1) + gv4 = R.split(gv3, indices_or_sections=2, axis=1) + gv5: R.Tensor((26, 26, 4, 2), "float32") = R.permute_dims(gv4[0], axes=[3, 2, 1, 0]) + gv6: R.Tensor((26, 26, 4, 2), "float32") = R.permute_dims(gv4[1], axes=[3, 2, 1, 0]) + gv7: R.Tensor((26, 26, 8, 2), "float32") = R.concat((gv5, gv6), axis=2) + R.output(gv7) + return gv7 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_maxpool2d(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2 = R.nn.max_pool2d( + gv, + pool_size=[2, 2], + strides=[2, 2], + padding=[0, 0], + layout="NCHW", + out_layout="NCHW", + ) + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_avgpool2d(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2 = R.nn.adaptive_avg_pool2d(gv, output_size=[13, 13], layout="NCHW") + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_softmax(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), 
"float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2 = R.nn.softmax(gv, axis=1) + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_layernorm(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), + w: R.Tensor((4, 16, 3, 3), "float32"), + gamma: R.Tensor((26, 26), dtype="float32"), + beta: R.Tensor((26, 26), dtype="float32"), + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 26, 26), dtype="float32") = R.nn.layer_norm( + gv, gamma, beta, axes=[-2, -1] + ) + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_binary_broadcast(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), + w: R.Tensor((4, 16, 3, 3), "float32"), + bias: R.Tensor((26, 26), "float32"), + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 26, 26), "float32") = R.add(gv, bias) + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_binary_ewise_scalar(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 26, 26), "float32") = R.add(gv, R.const(1, "float32")) + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_residual_block(target): + """ + - some kind of residual block followed by convolution to have texture after residual block + - scalar data type verification which should be mapped to global memory scope + layout_transform (NCHW->NCHW4c) + | <- buffer + conv2d (1) <- to get textures as output + / \ + conv2d (2) | + \ / + add <- add should be fused into conv2d (2) + multiply to scalar <- buffer to the input of multiply scalar value + relu + | <- texture in intermediate tensor + conv2d (3) + relu + | <- buffer + layout_transform (NCHW4c->NCHW) + """ + + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 32, 40, 40), "float32"), + w1: R.Tensor((32, 32, 2, 2), "float32"), + w2: R.Tensor((32, 32, 1, 1), "float32"), + w3: R.Tensor((32, 32, 2, 2), "float32"), + bias: R.Tensor((1, 32, 1, 1), "float32"), + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv = R.nn.conv2d(x, w1, strides=[2, 2], out_dtype="float32") + gv1 = R.add(gv, bias) + gv2 = R.nn.relu(gv1) + gv3 = R.nn.conv2d(gv2, w2, strides=[1, 1], out_dtype="float32") + bias_1 = R.multiply(bias, R.const(0.15, "float32")) + gv4 = R.add(gv3, bias_1) + gv5 = R.nn.relu(gv4) + gv6 = R.nn.conv2d(gv5, w3, strides=[2, 2], out_dtype="float32") + gv7 = R.nn.relu(gv6) + R.output(gv7) + return gv7 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_conv2d_fallback_to_buffer_conv2d(target): + """ + layout_transform (NCHW->NCHW4c) + | <- texture + conv2d (1) <- textures as output + / \ + conv2d (2) conv2d (3) <- conv2d (2) 
emits texture, conv2d (3) emits buffer + \ / <- concat shouldn't support textures here + concatenation + | <- buffer + layout_transform (NCHW4c->NCHW) + """ + + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 32, 40, 40), "float32"), + w1: R.Tensor((96, 32, 2, 2), "float32"), + w2: R.Tensor((32, 96, 2, 2), "float32"), + w3: R.Tensor((5, 96, 2, 2), "float32"), + bias1: R.Tensor((1, 96, 1, 1), "float32"), + bias2: R.Tensor((1, 32, 1, 1), "float32"), + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv = R.nn.conv2d(x, w1, strides=[2, 2], out_dtype="float32") + gv1 = R.add(gv, bias1) + gv2 = R.nn.relu(gv1) + gv3 = R.nn.conv2d(gv2, w2, strides=[2, 2], out_dtype="float32") + gv4 = R.add(gv3, bias2) + gv5 = R.nn.relu(gv4) + gv6 = R.nn.conv2d(gv2, w3, strides=[2, 2], out_dtype="float32") + gv7 = R.concat((gv3, gv6), axis=1) + R.output(gv7) + return gv7 + + verify(Input, "opencl") + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_conv2d_conv2d_concat(target): + """ + layout_transform (NCHW->NCHW4c) + | <- texture + conv2d (1) <- textures as output + / \ + conv2d (2) conv2d (3) + \ / <- concat does support textures here + concatenation + | <- buffer + layout_transform (NCHW4c->NCHW) + """ + + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 32, 40, 40), "float32"), + w1: R.Tensor((96, 32, 2, 2), "float32"), + w2: R.Tensor((32, 96, 2, 2), "float32"), + w3: R.Tensor((8, 96, 2, 2), "float32"), + bias1: R.Tensor((1, 96, 1, 1), "float32"), + bias2: R.Tensor((1, 32, 1, 1), "float32"), + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv = R.nn.conv2d(x, w1, strides=[2, 2], out_dtype="float32") + gv1 = R.add(gv, bias1) + gv2 = R.nn.relu(gv1) + gv3 = R.nn.conv2d(gv2, w2, strides=[2, 2], out_dtype="float32") + gv4 = R.add(gv3, bias2) + gv5 = R.nn.relu(gv4) + gv6 = R.nn.conv2d(gv2, w3, strides=[2, 2], out_dtype="float32") + gv7 = R.concat((gv3, gv6), axis=1) + R.output(gv7) + return gv7 + + verify(Input, "opencl") + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_pooling_branching_texture_params(target): + """ + Verification of the pooling and many branches having textures + layout_transform (NCHW->NCHW4c) + | <- texture + conv2d (0) <- to get textures + | <- textures + pooling + / \ \ <- textures + conv2d (1) conv2d (2) conv2d (3) + \ / | + add | <- to have the only one output, will be fused + \ / + add <- to have the only one output, will be fused + | <- buffer + layout_transform (NCHW4c->NCHW) + """ + + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 32, 40, 40), "float32"), + w1: R.Tensor((32, 32, 1, 1), "float32"), + w2: R.Tensor((32, 32, 2, 2), "float32"), + w3: R.Tensor((32, 32, 1, 1), "float32"), + w4: R.Tensor((32, 32, 2, 2), "float32"), + bias1: R.Tensor((1, 32, 1, 1), "float32"), + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv = R.nn.conv2d(x, w1, strides=[1, 1], out_dtype="float32") + gv1 = R.nn.max_pool2d(gv, pool_size=[2, 2], strides=[2, 2]) + gv2 = R.nn.conv2d( + gv1, w2, padding=[0, 0, 1, 1], strides=[1, 1], out_dtype="float32" + ) + gv3 = R.add(gv2, bias1) + gv4 = R.nn.relu(gv3) + gv5 = R.nn.conv2d( + gv1, w3, padding=[0, 0, 0, 0], strides=[1, 1], out_dtype="float32" + ) + gv6 = R.nn.conv2d( + gv1, w4, padding=[0, 1, 1, 0], strides=[1, 1], out_dtype="float32" + ) + gv7 = R.nn.relu(gv6) + gv8 = R.add(gv2, gv5) + gv9 = R.add(gv8, gv6) + R.output(gv9) + return gv9 + + verify(Input, target) + + 
+@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_injective_inputs1(target): + """ + Input + / \ + / | + | / + conv2d (1) / + | / + conv2d (2) mean + / \ / + | | \ / + | | (3) add + | | | + | \ / + \ mul + \ / + add + + """ + + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((1, 4, 40, 40), "float32"), + w1: R.Tensor((4, 4, 3, 3), "float32"), + w2: R.Tensor((4, 4, 3, 3), "float32"), + w3: R.Tensor((4, 4, 3, 3), "float32"), + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + mean = R.mean(x, axis=1, keepdims=True) + conv1 = R.nn.conv2d( + x, w1, padding=[1, 1, 1, 1], strides=[1, 1], out_dtype="float32" + ) + conv2 = R.nn.conv2d( + conv1, w2, padding=[1, 1, 1, 1], strides=[1, 1], out_dtype="float32" + ) + ad3 = R.add(conv1, conv2) + ad1 = R.add(mean, conv1) + ad2 = R.multiply(ad1, conv2) + gv = R.add(ad3, ad2) + R.output(gv) + return gv + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_injective_nwo_inputs2(target): + """ + Input + / \ + | \ + conv2d \ + | / + conv2d mean / + / \ / + add | \ | + | | \ | + | | \ / + | | (3) add + | | | + | \ / + | \ / + \ mul + \ / + add + + """ + + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((1, 4, 40, 40), "float32"), + w1: R.Tensor((4, 4, 3, 3), "float32"), + w2: R.Tensor((4, 4, 3, 3), "float32"), + w3: R.Tensor((4, 4, 3, 3), "float32"), + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + mean = R.mean(x, axis=1, keepdims=True) + conv1 = R.nn.conv2d( + x, w1, padding=[1, 1, 1, 1], strides=[1, 1], out_dtype="float32" + ) + conv2 = R.nn.conv2d( + conv1, w2, padding=[1, 1, 1, 1], strides=[1, 1], out_dtype="float32" + ) + ad3 = R.add(conv1, conv2) + ad1 = R.add(mean, conv1) + ad2 = R.multiply(ad1, conv2) + gv = R.add(ad2, ad3) + R.output(gv) + return gv + + verify(Input, target) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/relax/texture/test_texture_nd.py b/tests/python/relax/texture/test_texture_nd.py new file mode 100644 index 000000000000..520f309c6e41 --- /dev/null +++ b/tests/python/relax/texture/test_texture_nd.py @@ -0,0 +1,202 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import tvm +import tvm.testing +import pytest +import tempfile +import numpy as np + +from tvm import ( + relax, + IRModule, +) +from tvm.relax.transform.legalize_ops import adreno as legalize_adreno +from tvm.script import ir as I, tir as T +from tvm.target import Target +from tvm.contrib import ndk +from tvm import tir, DataType +from tvm.rpc import connect_tracker + + +def get_rpc(): + """ + Establish an RPC connection to the remote device. 
+ + Returns + ------- + tvm.rpc.RPCSession or None + The RPC session object if RPC_TARGET is set; otherwise, None. + """ + rpc_target = os.getenv("RPC_TARGET", None) + if rpc_target: + host = os.getenv("TVM_TRACKER_HOST", "localhost") + port = int(os.getenv("TVM_TRACKER_PORT", 9090)) + device_key = os.getenv("RPC_DEVICE_KEY", "android") + tracker = connect_tracker(host, port) + return tracker.request(device_key, priority=1, session_timeout=1000) + else: + return None + + +def preprocess_pipeline(mod: IRModule) -> IRModule: + desired_layouts = {"relax.nn.conv2d": ["NCHW16c", "OIHW16o", "NCHW16c"]} + seq = tvm.transform.Sequential( + [ + tvm.tir.transform.BindTarget(Target.current(allow_none=False)), + tvm.relax.transform.FoldConstant(), + tvm.relax.transform.DecomposeOpsForInference(), + tvm.relax.transform.FoldConstant(), + tvm.tir.transform.BindTarget(tvm.target.Target.current(allow_none=False)), + tvm.relax.transform.ConvertLayout(desired_layouts), + tvm.relax.transform.Normalize(), + tvm.relax.transform.FoldConstant(), + tvm.relax.transform.LegalizeOps(), + tvm.relax.transform.LegalizeOps( + {"relax.nn.conv2d": legalize_adreno.conv2d_NCHWc_OIHWo} + ), + tvm.relax.transform.FoldConstant(), + tvm.relax.transform.AnnotateTIROpPattern(), + tvm.relax.transform.FuseOps(), + tvm.relax.transform.FuseTIR(), + tvm.relax.transform.DeadCodeElimination(), + tvm.relax.transform.Normalize(), + ] + ) + mod = seq(mod) + return mod + + +def postprocess_pipeline(mod: IRModule) -> IRModule: + seq = tvm.transform.Sequential( + [ + tvm.relax.transform.ToNonDataflow(), + tvm.relax.transform.RemovePurityChecking(), + tvm.relax.transform.CallTIRRewrite(), + tvm.relax.transform.Normalize(), + tvm.relax.transform.StaticPlanBlockMemory(), + tvm.relax.transform.LowerAllocTensor(), + tvm.relax.transform.KillAfterLastUse(), + tvm.relax.transform.LowerRuntimeBuiltin(), + tvm.relax.transform.VMShapeLower(), + tvm.relax.transform.AttachGlobalSymbol(), + ] + ) + mod = seq(mod) + return mod + + +@tvm.testing.requires_rpc +@tvm.testing.requires_adreno_opencl +@pytest.mark.parametrize("backend", ["opencl"]) +@pytest.mark.parametrize("dtype", ["int8", "float16", "int16", "float32", "int32"]) +@pytest.mark.parametrize("channel_size", [64, 128]) +@pytest.mark.parametrize("read_width", [1, 2, 4, 8, 16]) +def test_texture_copy(backend, dtype, channel_size, read_width): + remote = get_rpc() + M, N, K = (256, 1024, 128) + lanes = channel_size // DataType(dtype).bits + if read_width > lanes: + return + + @I.ir_module + class TextureCopy: + @T.prim_func + def main(A: T.Buffer((M, N), dtype), B: T.Buffer((M, N), dtype)): + T.func_attr({"global_symbol": "main"}) + for li, lj in T.grid(M, N): + with T.block("Copy"): + i, j = T.axis.remap("SS", [li, lj]) + B[i, j] = A[i, j] + + def schedule_texture_read(sch: tir.Schedule): + B_blk = sch.get_block("Copy") + Ai_block = sch.cache_read(B_blk, 0, "global.texture") + sch.transform_layout(Ai_block, ("write", 0), lambda i, j: (i, j // lanes, j % lanes)) + + def schedule_default(blk, lanes): + i, j = sch.get_loops(blk) + jo, jv = sch.split(j, [None, lanes]) + + b = sch.fuse(i, jo) + bx, tx = sch.split(b, [None, 256]) + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") + + sch.vectorize(jv) + + schedule_default(Ai_block, lanes) + schedule_default(B_blk, read_width) + + mod = TextureCopy + + if remote is None: + target = Target(backend + " -device=adreno") + else: + target = Target(backend + " -device=adreno", "llvm -mtriple=aarch64-linux-android") + + with target: + mod = 
preprocess_pipeline(mod) + sch = tir.Schedule(mod) + schedule_texture_read(sch) + mod = postprocess_pipeline(sch.mod) + + ex = relax.build(mod, target) + load_path = "vm_library.so" + inputs = [np.random.randint(0, 128, (M, N)).astype(dtype), np.zeros((M, N), dtype)] + with tempfile.TemporaryDirectory() as temp_dir: + if remote is not None: + path = temp_dir + "/" + load_path + ex.export_library(path, fcompile=ndk.create_shared, options=["-shared", "-fPIC", "-lm"]) + remote.upload(path) + rexec = remote.load_module(load_path) + dev = remote.cl() + if "vdevice" in mod.global_infos: + device_arr = [dev for ii in range(len(mod.global_infos["vdevice"]))] + else: + device_arr = [dev] + vm = relax.VirtualMachine(rexec, device_arr) + else: + # local execution + if "opencl" in backend: + dev = tvm.opencl(0) + elif "vulkan" in backend: + dev = tvm.vulkan(0) + else: + raise RuntimeError("Unsupported backend") + + if "vdevice" in mod.global_infos: + device_arr = [dev for ii in range(len(mod.global_infos["vdevice"]))] + else: + device_arr = [dev] + vm = relax.VirtualMachine(ex, device_arr) + + inps = [tvm.runtime.tensor(inp, dev) for inp in inputs] + vm["main"](*inps) + + out1 = inps[-1].numpy() + out2 = inps[0].numpy() + + if remote: + remote.get_function("CloseRPCConnection")() + + np.testing.assert_equal(out1, out2) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/scripts/setup-adreno-env.sh b/tests/scripts/setup-adreno-env.sh index a35a633e1dfd..a401fb808acb 100755 --- a/tests/scripts/setup-adreno-env.sh +++ b/tests/scripts/setup-adreno-env.sh @@ -103,6 +103,7 @@ case ${ENVIRONMENT} in adb shell "mkdir -p ${TARGET_FOLDER}" adb push build-adreno-target/tvm_rpc ${TARGET_FOLDER}/tvm_rpc-${USER} adb push build-adreno-target/libtvm_runtime.so ${TARGET_FOLDER} + adb push build-adreno-target/lib/libtvm_ffi.so ${TARGET_FOLDER} if [ -f ${CPP_LIB} ] ; then adb push ${CPP_LIB} ${TARGET_FOLDER} fi diff --git a/tests/scripts/task_build.py b/tests/scripts/task_build.py index 96fc781744f9..2f9f91df8a0e 100755 --- a/tests/scripts/task_build.py +++ b/tests/scripts/task_build.py @@ -58,6 +58,8 @@ logging.info(f"Using sccache region: {env['SCCACHE_REGION']}") else: logging.info(f"No sccache bucket set, using local cache") + if "SCCACHE_SERVER_PORT" in os.environ: + env["SCCACHE_SERVER_PORT"] = os.getenv("SCCACHE_SERVER_PORT") env["CXX"] = "/opt/sccache/c++" env["CC"] = "/opt/sccache/cc" diff --git a/tests/scripts/task_build_adreno_bins.sh b/tests/scripts/task_build_adreno_bins.sh index 8b85a27277e0..bc5a075b2325 100755 --- a/tests/scripts/task_build_adreno_bins.sh +++ b/tests/scripts/task_build_adreno_bins.sh @@ -39,7 +39,6 @@ echo set\(USE_OPENCL ON\) >> config.cmake fi echo set\(USE_RPC ON\) >> config.cmake echo set\(USE_CPP_RPC ON\) >> config.cmake -#echo set\(USE_CPP_RTVM ON\) >> config.cmake echo set\(USE_LIBBACKTRACE AUTO\) >> config.cmake echo set\(USE_KALLOC_ALIGNMENT 32\) >> config.cmake @@ -52,6 +51,9 @@ echo set\(USE_OPENCL_GTEST ON\) >> config.cmake echo set\(USE_OPENCL_EXTN_QCOM ON\) >> config.cmake cmake -DANDROID_ABI=arm64-v8a \ + -DCMAKE_SYSTEM_NAME=Android \ + -DCMAKE_ANDROID_ARCH_ABI="arm64-v8a" \ + -DCMAKE_SYSROOT="${ANDROID_NDK_HOME}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/" \ -DANDROID_PLATFORM=android-28 \ -DCMAKE_SYSTEM_VERSION=1 \ -DCMAKE_FIND_ROOT_PATH="${ADRENO_OPENCL}" \ diff --git a/tests/scripts/task_python_adreno.sh b/tests/scripts/task_python_adreno.sh index 1714a3c06358..b381fddc2427 100755 --- a/tests/scripts/task_python_adreno.sh +++ 
b/tests/scripts/task_python_adreno.sh
@@ -84,5 +84,14 @@ for node_id in $RELAX_TESTS; do
   i=$((i+1))
 done
 
+# Texture tests
+RELAX_TESTS=$(./ci/scripts/jenkins/pytest_ids.py --folder tests/python/relax/texture 2> /dev/null | grep -v dlerror)
+i=0
+for node_id in $RELAX_TESTS; do
+  echo "$node_id"
+  CXX=${TVM_NDK_CC} run_pytest ctypes "$TVM_INTEGRATION_TESTSUITE_NAME-texture-relax-$i" "$node_id" --reruns=0
+  i=$((i+1))
+done
+
 kill ${TRACKER_PID}
 kill ${DEVICE_PID}
diff --git a/tests/scripts/task_python_unittest_gpuonly.sh b/tests/scripts/task_python_unittest_gpuonly.sh
index 6d28d8f30f8b..776d29fda07f 100755
--- a/tests/scripts/task_python_unittest_gpuonly.sh
+++ b/tests/scripts/task_python_unittest_gpuonly.sh
@@ -34,3 +34,13 @@ export TVM_UNITTEST_TESTSUITE_NAME=python-codegen-vulkan
 source tests/scripts/setup-pytest-env.sh
 
 run_pytest ${TVM_UNITTEST_TESTSUITE_NAME}-1 tests/python/codegen/test_target_codegen_vulkan.py
+
+
+# Adreno: run the comprehensive texture tests (via OpenCL on the Nvidia GPU CI node) and the CLML codegen tests.
+export PYTEST_ADDOPTS=""
+export TVM_TEST_TARGETS="opencl"
+export TVM_UNITTEST_TESTSUITE_NAME=python-codegen-clml-texture
+
+source tests/scripts/setup-pytest-env.sh
+
+run_pytest ${TVM_UNITTEST_TESTSUITE_NAME}-1 tests/python/relax/texture/
diff --git a/tests/scripts/unity/task_python_relax_gpuonly.sh b/tests/scripts/unity/task_python_relax_gpuonly.sh
index acbcce44f279..19ab942a1193 100755
--- a/tests/scripts/unity/task_python_relax_gpuonly.sh
+++ b/tests/scripts/unity/task_python_relax_gpuonly.sh
@@ -18,7 +18,7 @@ export TVM_TEST_TARGETS="llvm;cuda"
 export PYTEST_ADDOPTS="-m gpu $PYTEST_ADDOPTS"
 
-export TVM_RELAY_TEST_TARGETS="cuda"
+export TVM_RELAY_TEST_TARGETS="cuda;opencl"
 export TVM_INTEGRATION_TESTSUITE_NAME=python-integration-gpu
 export TVM_INTEGRATION_GPU_ONLY=1
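
For anyone reproducing these suites outside the Jenkins scripts above, the commands below are a minimal sketch rather than part of the patch: they assume an OpenCL-capable local device (or a tracker-connected Adreno board), that the adreno_utils helper imported by the tests is importable from the test folder, and they reuse the environment variables read by get_rpc() in test_texture_nd.py; the tracker host, port, and device key values are placeholders.

  # Run the new texture op tests on a local OpenCL device.
  export TVM_TEST_TARGETS="opencl"
  python -m pytest tests/python/relax/texture/test_ops.py -v

  # Or drive a remote Adreno board through the RPC tracker (placeholder values,
  # matching the defaults in get_rpc()).
  export RPC_TARGET=adreno
  export TVM_TRACKER_HOST=localhost
  export TVM_TRACKER_PORT=9090
  export RPC_DEVICE_KEY=android
  python -m pytest tests/python/relax/texture/test_texture_nd.py -v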