diff --git a/include/tvm/relax/exec_builder.h b/include/tvm/relax/exec_builder.h index 4fd0fd66bb90..222dea3fb1f2 100644 --- a/include/tvm/relax/exec_builder.h +++ b/include/tvm/relax/exec_builder.h @@ -122,6 +122,15 @@ class ExecBuilderNode : public Object { rv = value; return ConvertConstant_(rv); } + /*! + * \brief update memory scopes. + * + * This function builds the memory scopes for constants. + * + * \param idx Index of the constant + * \param scope The memory scope. + */ + void SaveMemoryScope(vm::Instruction::Arg idx, ffi::String scope); /*! * \brief Raw access to underlying executable build in progress. */ diff --git a/include/tvm/runtime/vm/executable.h b/include/tvm/runtime/vm/executable.h index 37488ff31f52..7e2380ddf691 100644 --- a/include/tvm/runtime/vm/executable.h +++ b/include/tvm/runtime/vm/executable.h @@ -155,6 +155,8 @@ class VMExecutable : public ffi::ModuleObj { std::unordered_map func_map; /*! \brief The global constant pool. */ std::vector constants; + /*! \brief The VDevice memory scopes */ + std::unordered_map memory_scopes; /*! \brief The offset of instruction. */ std::vector instr_offset; /*! \brief The byte data of instruction. */ @@ -177,6 +179,11 @@ class VMExecutable : public ffi::ModuleObj { * \param strm The input stream. */ void SaveGlobalSection(dmlc::Stream* strm) const; + /*! + * \brief Save the memory scopes. + * \param strm The output stream. + */ + void SaveMemoryScopeSection(dmlc::Stream* strm) const; /*! * \brief Save the constant pool. * \param strm The input stream. @@ -197,6 +204,11 @@ class VMExecutable : public ffi::ModuleObj { * \param strm The input stream. */ void LoadGlobalSection(dmlc::Stream* strm); + /*! + * \brief Load the memory scopes. + * \param strm The input stream. + */ + void LoadMemoryScopeSection(dmlc::Stream* strm); /*! * \brief Load the constant pool. * \param strm The input stream. diff --git a/include/tvm/tir/backend/adreno/transform.h b/include/tvm/tir/backend/adreno/transform.h new file mode 100644 index 000000000000..d2ca90a8cede --- /dev/null +++ b/include/tvm/tir/backend/adreno/transform.h @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file tvm/tir/backend/adreno/transform.h + * \brief TIR specific Adreno GPU transformation passes. 
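+ *
+ * Declares the TextureFlatten and InjectTextureAlloc passes used by the
+ * Adreno-specific TIR lowering pipeline.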
+ */ +#ifndef TVM_TIR_BACKEND_ADRENO_TRANSFORM_H_ +#define TVM_TIR_BACKEND_ADRENO_TRANSFORM_H_ + +#include +#include +#include +#include +#include + +#include +#include + +namespace tvm { +namespace tir { +namespace backend { +namespace adreno { +namespace transform { + +using tvm::tir::transform::CreatePrimFuncPass; +using tvm::transform::Pass; +using tvm::transform::PassContext; +using tvm::transform::PassContextNode; +using tvm::transform::PassInfo; +using tvm::transform::PassInfoNode; +using tvm::transform::PassNode; +using tvm::transform::Sequential; + +/*! + * \brief Texture flattening pass. + * \return The pass. + */ +TVM_DLL Pass TextureFlatten(); + +/*! + * \brief Inject Texture Allocation intrinsic. + * \return The pass. + */ +TVM_DLL Pass InjectTextureAlloc(); + +} // namespace transform +} // namespace adreno +} // namespace backend +} // namespace tir +} // namespace tvm + +#endif // TVM_TIR_BACKEND_ADRENO_TRANSFORM_H_ diff --git a/python/tvm/dlight/adreno/__init__.py b/python/tvm/dlight/adreno/__init__.py index ea2781455989..87afa08818b2 100644 --- a/python/tvm/dlight/adreno/__init__.py +++ b/python/tvm/dlight/adreno/__init__.py @@ -18,3 +18,6 @@ Adreno schedule rules. """ from .convolution import Conv2d +from .layout_transform import LayoutTransform +from .fallback import Fallback +from .pool import Pool2D diff --git a/python/tvm/dlight/adreno/convolution.py b/python/tvm/dlight/adreno/convolution.py index fc2cc449a1c6..462fb2550bd4 100644 --- a/python/tvm/dlight/adreno/convolution.py +++ b/python/tvm/dlight/adreno/convolution.py @@ -16,215 +16,85 @@ # under the License. # pylint: disable=missing-docstring, invalid-name """A Conv2d schedule rule for Adreno GPU operators.""" -from dataclasses import dataclass -from typing import List, Optional +from typing import Optional, Union from tvm import tir from tvm.target import Target -from tvm.tir import IterVar -from tvm.tir.schedule.schedule import BlockRV -from ..analysis import BlockInfo, IterInfo +from .utils import schedule_inline_blocks, schedule_default +from .. 
import analysis from .base import AdrenoScheduleRule -def is_spatial_block(sch: tir.Schedule, block: BlockRV) -> bool: - block_stmt = sch.get(block) - iter_types = {iter_var.iter_type for iter_var in block_stmt.iter_vars} - return iter_types == {IterVar.DataPar} - - -def is_reduction_block(sch: tir.Schedule, block: BlockRV) -> bool: - block_stmt = sch.get(block) - iter_types = {iter_var.iter_type for iter_var in block_stmt.iter_vars} - return iter_types == {IterVar.CommReduce, IterVar.DataPar} - - -def _collect_producers(sch: tir.Schedule, block: tir.schedule.BlockRV): - result = [] - for producer in sch.get_producers(block): - result.append(producer) - result.extend(_collect_producers(sch, producer)) - return result - - -def _collect_consumers(sch: tir.Schedule, block: tir.schedule.BlockRV): - result = [] - for consumer in sch.get_consumers(block): - result.append(consumer) - result.extend(_collect_consumers(sch, consumer)) - return result - - -def get_block_info(sch: tir.Schedule, block: tir.schedule.BlockRV) -> BlockInfo: - def _iter_kind(loop: tir.IterVar) -> str: - return {tir.IterVar.DataPar: "S", tir.IterVar.CommReduce: "R"}.get(loop.iter_type, "O") - - def _is_reduction_block(block: tir.schedule.BlockRV): - for iter_var in sch.get(block).iter_vars: - if _iter_kind(iter_var) == "R": - return True - return False - - return BlockInfo( - name=sch.get(block).name_hint, - iters=[ - IterInfo( - kind=_iter_kind(iter_var), - var=iter_var.var, - dom=iter_var.dom.extent, - loop_rv=loop_rv, - ) - for loop_rv, iter_var in zip(sch.get_loops(block), sch.get(block).iter_vars) - ], - block_rv=block, - reduction_block=_is_reduction_block(block), - ) - - -def get_reduction_blocks(sch: tir.Schedule, blocks: List[tir.schedule.BlockRV]) -> bool: - # NOTE: We assume there is only one reduction block in the function - # all blocks are required to be spatial or reduction - if not all( - [is_reduction_block(sch, block) or is_spatial_block(sch, block) for block in blocks] - ): - return None - - # There is only one reduction block - reduction_blocks = [block for block in blocks if is_reduction_block(sch, block)] - if len(reduction_blocks) != 1: - return None - - return reduction_blocks[0] - - -def is_convolution(sch: tir.Schedule, block: tir.schedule.BlockRV): - # TODO: Use buffer access patterns to discover convolution type kernels instead of using name. 
- return ( - sch.get(block).name_hint.count("conv2d_NCHWc_OIHWo") - and "".join([iter_type.kind for iter_type in get_block_info(sch, block).iters]) - == "SSSSSRRR" - ) - - class Conv2d(AdrenoScheduleRule): """The schedule rule for convolution computation""" - @dataclass - class Config: - block_size_x: int = 8 - block_size_y: int = 8 - vector_size: int = 1 - unroll: int = 256 # 0 means no unroll - use_shared: bool = True - storage_align: bool = False - inner_x: bool = False - - def get_configs(self, target: Target) -> Config: - """Get the schedule config for the target""" - if target.kind.name == "cuda" or target.kind.name == "rocm": - return Conv2d.Config( - block_size_x=8, - block_size_y=16, - vector_size=2, - unroll=256, - use_shared=True, - storage_align=True, - inner_x=False, - ) - elif target.kind.name == "opencl" and ( - ("android" in str(target.host)) or ("adreno" in str(target.attrs)) - ): - return Conv2d.Config( - block_size_x=32, - block_size_y=4, - vector_size=8, - unroll=16, - use_shared=False, - storage_align=False, - inner_x=True, - ) - else: - return Conv2d.Config() + @staticmethod + def schedule_conv2d(sch: tir.Schedule, blk: tir.schedule.BlockRV): + n, oc, oh, ow, ob, ic, kh, kw = sch.get_loops(blk) + + bz, vz, tz = sch.split(oc, [None, 8, 1], preserve_unit_iters=True) + by, vy, ty = sch.split(oh, [None, 1, 16], preserve_unit_iters=True) + bx, vx, tx = sch.split(ow, [None, 1, 16], preserve_unit_iters=True) + + bz = sch.fuse(n, bz, preserve_unit_iters=True) + sch.reorder(bz, by, bx, vz, vy, vx, tz, ty, tx, ob) + sch.bind(bz, "blockIdx.z") + sch.bind(by, "blockIdx.y") + sch.bind(bx, "blockIdx.x") + sch.bind(vz, "vthread.z") + sch.bind(vy, "vthread.y") + sch.bind(vx, "vthread.x") + sch.bind(tz, "threadIdx.z") + sch.bind(ty, "threadIdx.y") + sch.bind(tx, "threadIdx.x") + + rblk = sch.cache_read(blk, 0, "local") + ico, icb = sch.split(ic, [None, 4], preserve_unit_iters=True) + sch.reorder(ico, kh, kw, icb, ob) + + sch.compute_at(rblk, kw, preserve_unit_loops=True) + sch.vectorize(sch.get_loops(rblk)[-1]) + wblk = sch.cache_write(blk, 0, "local") + sch.reverse_compute_at(wblk, tx, preserve_unit_loops=True) + sch.vectorize(sch.get_loops(wblk)[-1]) + init_blk = sch.decompose_reduction(blk, tx) + sch.vectorize(sch.get_loops(init_blk)[-1]) def apply( # pylint: disable=too-many-locals,missing-docstring self, - func: tir.PrimFunc, + func: Union[tir.PrimFunc, tir.Schedule], target: Target, _: bool, ) -> Optional[tir.Schedule]: - if not isinstance(func, tir.PrimFunc) or not self.is_target_available(target): + if not (isinstance(func, (tir.PrimFunc, tir.Schedule))) or not self.is_target_available( + target + ): return None if isinstance(func, tir.PrimFunc): sch = tir.Schedule(func) + sch.work_on("main") + elif isinstance(func, tir.Schedule): + sch = func - # config = self.get_configs(target) - root_block = analysis.get_root_block(sch) + root_block = analysis.get_root_block(sch, sch.func_working_on) blocks = sch.get_child_blocks(root_block) - reduction_block = get_reduction_blocks(sch, blocks) + reduction_blocks = list( + filter(lambda block: analysis.get_block_info(sch, block).is_reduction(), blocks) + ) + remaining_blocks = [blk for blk in blocks if blk not in reduction_blocks] - if reduction_block is None: - return None - if not is_convolution(sch, reduction_block): + def is_convolution(blk): + block_info = analysis.get_block_info(sch, blk) + return "conv2d_NCHWc" in block_info.name + + if len(reduction_blocks) != 1 or not is_convolution(reduction_blocks[0]): return None - def 
schedule_data_pad(blk): - axes = sch.get_loops(blk) - axes, vec = axes[:-1], axes[-1] - axis = sch.fuse(*axes) - bx, ty, tx = sch.split(axis, [None, 16, 16]) - sch.bind(bx, "blockIdx.x") - sch.bind(ty, "threadIdx.y") - sch.bind(tx, "threadIdx.x") - sch.vectorize(vec) - - def schedule_conv2d(blk): - # TODO: Loop Pattern mayn't be reliable, need to perform better analysis. - n, oc, oh, ow, ob, ic, kh, kw = sch.get_loops(blk) - sch.reorder(n, oc, oh, ow, ic, kh, kw, ob) - main_lp = sch.fuse(n, oc, oh, ow) - bx, ty, tx = sch.split(main_lp, [None, 16, 16]) - sch.bind(tx, "threadIdx.x") - sch.bind(ty, "threadIdx.y") - sch.bind(bx, "blockIdx.x") - - ico, icv = sch.split(ic, [None, 4]) - sch.reorder(ico, kh, kw, icv, ob) - rblk = sch.cache_read(blk, 0, "local") - sch.compute_at(rblk, kw) - sch.vectorize(sch.get_loops(rblk)[-1]) - wblk = sch.cache_write(blk, 0, "local") - sch.reverse_compute_at(wblk, tx) - sch.vectorize(sch.get_loops(wblk)[-1]) - sch.vectorize(ob) - init_blk = sch.decompose_reduction(blk, ico) - sch.vectorize(sch.get_loops(init_blk)[-1]) - - def is_data_pad(block: tir.stmt.Block): - return is_spatial_block(sch, block) and tir.analysis.has_if_then_else(sch.get(block)) - - def schedule_conv2d_blocks(): - - # Do analysis to find block type - blocks = sch.get_child_blocks(root_block) - passed_reduction = False - for blk in blocks: - if is_reduction_block(sch, blk): - schedule_conv2d(blk) - passed_reduction = True - elif is_data_pad(blk): - schedule_data_pad(blk) - elif is_spatial_block(sch, blk): - try: - if not passed_reduction: - sch.compute_inline(blk) - else: - sch.reverse_compute_inline(blk) - except: # pylint: disable=W0702 - pass - else: - raise TypeError("Can't Schedule this Block", sch.get(blk)) - - schedule_conv2d_blocks() + conv_blk = reduction_blocks[0] + Conv2d.schedule_conv2d(sch, conv_blk) + remaining_blocks = schedule_inline_blocks(sch, remaining_blocks) + schedule_default(sch, remaining_blocks) + return sch diff --git a/python/tvm/dlight/adreno/fallback.py b/python/tvm/dlight/adreno/fallback.py new file mode 100644 index 000000000000..050781a21f32 --- /dev/null +++ b/python/tvm/dlight/adreno/fallback.py @@ -0,0 +1,177 @@ +# licensed to the apache software foundation (asf) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Dlight Adreno Fallback Schedules""" + + +from typing import List, Union + +from tvm import tir +from tvm.target import Target +from .. 
import analysis +from .base import AdrenoScheduleRule + + +def _assert_gpu_target(target: Target): + if "gpu" not in target.keys: + raise ValueError(f"Expect a GPU target, but got {target}") + + +def get_max_threads_per_block(target: Target) -> int: + _assert_gpu_target(target) + max_threads_per_block = None + for name in ["max_threads_per_block", "max_num_threads"]: + if max_threads_per_block is None: + max_threads_per_block = target.attrs.get(name, None) + if max_threads_per_block is None: + max_threads_per_block = 64 + return int(max_threads_per_block) + + +# pylint: disable=invalid-name,missing-function-docstring,unused-variable,unused-import +class Fallback(AdrenoScheduleRule): + """Texture Based Fallback Schedule(s) for Adreno""" + + @staticmethod + def schedule_inline_blocks( + sch: tir.Schedule, blocks: List[tir.schedule.BlockRV] + ) -> List[tir.schedule.BlockRV]: + """ + Auto Inlines Injective and Element-wise Operations while trying to omit data pad blocks... + """ + + if blocks is None: + root_blk = analysis.get_root_block(sch) + blocks = sch.get_child_blocks(root_blk) + + remaining_blocks = [] + for blk in blocks: + block_info = analysis.get_block_info(sch, blk) + if block_info.is_injective() and not block_info.is_data_pad(sch): + if len(sch.get_consumers(blk)) == 1: + try: + sch.compute_inline(blk) + except Exception: # pylint: disable=broad-exception-caught + remaining_blocks.append(blk) + elif len(sch.get_producers(blk)) == 1: + inlined_once = False + try: + # Would cause an issue inlining to producer with multiple consumers + while ( + len(sch.get_producers(blk)) == 1 + and len(sch.get_consumers(sch.get_producers(blk)[0])) == 1 + ): + sch.reverse_compute_inline(blk) + inlined_once = True + except Exception: # pylint: disable=broad-exception-caught + break + if not inlined_once: + remaining_blocks.append(blk) + else: + remaining_blocks.append(blk) + else: + remaining_blocks.append(blk) + return remaining_blocks + + @staticmethod + def schedule_default(sch: tir.Schedule, blk: tir.schedule.BlockRV): + block_info = analysis.get_block_info(sch, blk) + + s_loops, r_loops, o_loops = [], [], [] + v_loop = block_info.write_bufs(sch)[0].assoc_lps[-1] + + for iter_info in block_info.iters: + if sch.get(iter_info.loop_rv) == sch.get(v_loop): + continue + {"S": s_loops, "R": r_loops, "O": o_loops}.get(iter_info.kind).append(iter_info.loop_rv) + + iter_vars = analysis.collect_block_iter_vars_used_in_access_region( + sch.get(blk), block_info.write_bufs(sch)[0].buf_region.region + ) + o_outer = [lp for lp in o_loops if sch.get(lp).var in iter_vars] + o_inner = [lp for lp in o_loops if sch.get(lp).var not in iter_vars] + + # Can't change loop order for opaque loops + if o_loops != o_outer + o_inner: + return + + o_outer.append(v_loop) + sch.reorder(*s_loops, *o_outer, *r_loops, *o_inner) + + assert s_loops + tgt = Target.current(allow_none=True) + + b = sch.fuse(*s_loops) + tx_extent = get_max_threads_per_block(tgt) if tgt is not None else 256 + bx, tx = sch.split(b, [None, tx_extent]) + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") + + if len(r_loops) > 1: + lp = [*s_loops, *o_outer][-1] + init_block = sch.decompose_reduction(blk, lp) + wblk = sch.cache_write(blk, 0, "local") + sch.compute_at(wblk, lp) + if v_loop: + sch.vectorize(sch.get_loops(init_block)[-1]) + sch.vectorize(sch.get_loops(wblk)[-1]) + elif v_loop is not None: + sch.vectorize(v_loop) + + @staticmethod + def schedule_fallback(sch): + root_block = analysis.get_root_block(sch) + blocks = 
sch.get_child_blocks(root_block) + + schedule_blocks = [ + blk + for blk in blocks + if analysis.get_block_info(sch, blk).is_reduction() + or analysis.get_block_info(sch, blk).is_data_pad(sch) + ] + remaining_blocks = [blk for blk in blocks if blk not in schedule_blocks] + + for blk in schedule_blocks: + Fallback.schedule_default(sch, blk) + remaining_blocks = Fallback.schedule_inline_blocks(sch, remaining_blocks) + # TODO: Analyze unscheduled blocks to schedule instead of relying on remaining + for blk in remaining_blocks: + Fallback.schedule_default(sch, blk) + + def apply( # pylint: disable=too-many-locals + self, + func: tir.PrimFunc, + target: Target, + _: bool, + ) -> Union[None, tir.Schedule, List[tir.Schedule]]: + # pylint: disable=invalid-name + + if not isinstance(func, tir.PrimFunc) or not self.is_target_available(target): + return None + + sch = tir.Schedule(func) + root_block = analysis.get_root_block(sch) + blocks = sch.get_child_blocks(root_block) + + if any(len(sch.get_child_blocks(block)) != 0 for block in blocks): + return None + + block_infos = [analysis.get_block_info(sch, block) for block in blocks] + if not any("texture" in block.write_bufs(sch)[0].get_scope() for block in block_infos): + return None + + Fallback.schedule_fallback(sch) + return sch diff --git a/python/tvm/dlight/adreno/layout_transform.py b/python/tvm/dlight/adreno/layout_transform.py new file mode 100644 index 000000000000..bf3c446d59ff --- /dev/null +++ b/python/tvm/dlight/adreno/layout_transform.py @@ -0,0 +1,129 @@ +# Licensed to the apache software foundation (asf) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=invalid-name, unused-variable + +"Schedules for Texture Based Layout Transforms" +from typing import List, Union + +from tvm import tir +from tvm.target import Target +from .. import analysis + +from .base import AdrenoScheduleRule + + +class LayoutTransform(AdrenoScheduleRule): + """Texture based Layout Transform Dlight Schedule for Adreno""" + + def __init__(self, use_op_name=True): + self.use_op_name = use_op_name + + # TODO: Try using Coalesced Writes... 
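+    # Schedule strategy implemented by apply() below (a sketch; the NCHW <-> NCHW4c
+    # transform produced by ConvertLayout is the assumed typical input):
+    #   - find the innermost loops indexing the last dimension of the read and
+    #     write buffers (lpv_read / lpv_write) and their power-of-two vector lengths,
+    #   - if the two differ, stage the read through a "local" cache so that both the
+    #     cached read and the final write can be vectorized,
+    #   - otherwise split the shared innermost loop and vectorize it directly,
+    #   - finally fuse the remaining loops and bind them to blockIdx.x / threadIdx.x.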
+ def apply( # pylint: disable=too-many-locals + self, + func: Union[tir.PrimFunc, tir.Schedule], + target: Target, + _: bool, + ) -> Union[None, tir.Schedule, List[tir.Schedule]]: + # pylint: disable=invalid-name + if not (isinstance(func, (tir.PrimFunc, tir.Schedule))) or not self.is_target_available( + target + ): + return None + + if isinstance(func, tir.PrimFunc): + sch = tir.Schedule(func) + sch.work_on("main") + elif isinstance(func, tir.Schedule): + sch = func + + root_block = analysis.get_root_block(sch, sch.func_working_on) + + if len(sch.get_child_blocks(root_block)) != 1: + return None + + blk = sch.get_child_blocks(root_block)[0] + block_info = analysis.get_block_info(sch, blk) + if not ( + (self.use_op_name and block_info.name == "te_layout_transform") + or (not self.use_op_name and block_info.is_layout_transform(sch)) + ): + return None + + read_buf, write_buf = (block_info.read_bufs(sch)[0], block_info.write_bufs(sch)[0]) + lps = block_info.get_loops() + lpv_read, lpv_write = ( + read_buf.assoc_lps[-1], + write_buf.assoc_lps[-1], + ) + + if lpv_read is None or lpv_write is None: + return None + + vlen_read, vlen_write = read_buf.get_vecsize(), write_buf.get_vecsize() + local_cache = sch.get(lpv_read) != sch.get(lpv_write) or vlen_read != vlen_write + block_loops = [ + lp + for lp in lps + if sch.get(lp) != sch.get(lpv_read) and sch.get(lp) != sch.get(lpv_write) + ] + vec_loops = ( + [lpv_read, lpv_write] if sch.get(lpv_read) != sch.get(lpv_write) else (lpv_read,) + ) + sch.reorder(*block_loops, *vec_loops) + if local_cache: + if sch.get(lpv_read) != sch.get(lpv_write): + blp_read, vlp_read = sch.split( + lpv_read, [None, vlen_read], preserve_unit_iters=True + ) + blp_write, vlp_write = sch.split( + lpv_write, [None, vlen_write], preserve_unit_iters=True + ) + sch.reorder(blp_read, blp_write, vlp_read, vlp_write) + block_loops += [blp_read, blp_write] + rblk = sch.cache_read(blk, 0, "local") + sch.compute_at(rblk, block_loops[-1], preserve_unit_loops=True) + sch.vectorize(sch.get_loops(rblk)[-1]) + sch.vectorize(vlp_write) + else: + if vlen_read > vlen_write: + read_lp, vec_lp = sch.split(blk, [None, vlen_write], preserve_unit_iters=True) + rblk = sch.cache_read(blk, 0, "local") + sch.compute_at(rblk, read_lp, preserve_unit_loops=True) + sch.vectorize(sch.get_loops(rblk)[-1]) + sch.vectorize(vec_lp) + else: + rblk = sch.cache_read(blk, 0, "local") + sch.compute_at(rblk, block_loops[-1], preserve_unit_loops=True) + _, vread_lp = sch.split( + sch.get_loops(rblk)[-1], vlen_read, preserve_unit_iters=True + ) + sch.vectorize(vread_lp) + sch.vectorize(vlp_write) + else: + blp, vlp = sch.split(lpv_read, [None, vlen_read], preserve_unit_iters=True) + block_loops += [blp] + sch.vectorize(vlp) + + b = sch.fuse(*block_loops) + tx_extent = min(sch.get(b).extent, 256) + candidates = [1, 2, 4, 8, 16, 32] + bx, tx = sch.split(b, [None, 256], preserve_unit_iters=True) + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") + return sch diff --git a/python/tvm/dlight/adreno/pool.py b/python/tvm/dlight/adreno/pool.py new file mode 100644 index 000000000000..e3709caad96a --- /dev/null +++ b/python/tvm/dlight/adreno/pool.py @@ -0,0 +1,92 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=missing-docstring +""" Pool schedule rule for Adreno operators.""" + +from tvm import tir +from tvm.target import Target + +from .base import AdrenoScheduleRule +from .. import analysis + + +# pylint: disable=invalid-name, unused-variable +class Pool2D(AdrenoScheduleRule): + def apply( # pylint: disable=too-many-locals,missing-docstring + self, + func: tir.PrimFunc, + target: Target, + _: bool, + ) -> tir.Schedule: + sch = tir.Schedule(func) + root = sch.get_block(name="root", func_name="main") + + blocks = sch.get_child_blocks(root) + blocks_names = [sch.get(blk).name_hint for blk in blocks] + + if not "adaptive_pool_sum" in blocks_names and not "pool_max" in blocks_names: + return None + + def schedule_pad(blk: tir.schedule.BlockRV): + lps, veclp = sch.get_loops(blk)[:-1], sch.get_loops(blk)[-1] + sch.vectorize(veclp) + b = sch.fuse(*lps) + tx_extent = min(int(sch.get(b).extent) & ~int(sch.get(b).extent - 1), 256) + bx, tx = sch.split(b, [None, tx_extent]) + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") + + def schedule_max_pool(blk: tir.schedule.BlockRV): + block_info = analysis.get_block_info(sch, blk) + iters_kind = "".join([_iter.kind for _iter in block_info.iters]) + if iters_kind != "SSSSSRR": + return None + + lps = sch.get_loops(blk) + block_lps, vec_lp, red_lps = lps[:4], lps[4], lps[5:] + write_blk = sch.cache_write(blk, 0, "local") + sch.reverse_compute_at(write_blk, vec_lp) + b = sch.fuse(*block_lps) + tx_extent = min(int(sch.get(b).extent) & ~int(sch.get(b).extent - 1), 256) + bx, tx = sch.split(b, [None, tx_extent]) + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") + sch.vectorize(vec_lp) + + return True + + passed_reduction = False + for blk in blocks: + if sch.get(blk).name_hint == "pad_temp": + schedule_pad(blk) + elif ( + sch.get(blk).name_hint == "adaptive_pool_sum" + or sch.get(blk).name_hint == "pool_max" + ): + ok = schedule_max_pool(blk) + if not ok: + return None + passed_reduction = True + else: + try: + if passed_reduction: + sch.reverse_compute_inline(blk) + else: + sch.compute_inline(blk) + except Exception: # pylint: disable=broad-except + pass + return sch diff --git a/python/tvm/dlight/adreno/utils.py b/python/tvm/dlight/adreno/utils.py new file mode 100644 index 000000000000..73597fe8578c --- /dev/null +++ b/python/tvm/dlight/adreno/utils.py @@ -0,0 +1,89 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Utilis for Adreno operators.""" + +# pylint: disable=import-outside-toplevel, unused-argument, invalid-name, missing-function-docstring +from typing import List + +from tvm.target import Target +from tvm import tir + +from ..analysis import BlockInfo + + +def get_texture_storage(block_info: BlockInfo): + """ + Returns the texture layout acceptable for the shape + + Parameters + ---------- + shape: array + Shape of the tensor to be packed to texture + """ + # certain limitation of the Qualcomm devices. Subject to be determined for certain device + # individually, but until we have access to remote device during compilation, we have to + # define it uniformly for all target devices + # spatial_limit = 16384, depth_limit = 2048 + # TODO: Check Write Bufs. + shape = block_info.write_bufs[0].buf_region.buffer.shape + + spatial_limit = Target.current().attrs["texture_spatial_limit"] + depth_limit = Target.current().attrs["texture_depth_limit"] + + if len(shape) > 4: + if shape[0] < spatial_limit and shape[1] * shape[2] * shape[3] < spatial_limit: + return "global.texture-weight" + elif shape[0] < depth_limit and shape[2] * shape[3] < spatial_limit: + return "global.texture-nhwc" + elif ( + shape[0] * shape[1] < depth_limit + and shape[2] < spatial_limit + and shape[3] < spatial_limit + ): + return "global.texture" + elif len(shape) > 3: + if shape[0] < spatial_limit and shape[1] * shape[2] < spatial_limit: + return "global.texture-weight" + elif shape[0] < depth_limit and shape[1] < spatial_limit and shape[2] < spatial_limit: + return "global.texture" + elif len(shape) == 3: + if shape[0] < spatial_limit and shape[1] < spatial_limit: + return "global.texture-weight" + + return "global" + + +def schedule_inline_blocks(sch: tir.Schedule, blocks: List[tir.schedule.BlockRV] = None): + from .fallback import Fallback + + return Fallback.schedule_inline_blocks(sch, blocks) + + +def schedule_default(sch, blocks: List[tir.schedule.BlockRV] = None): + from .fallback import Fallback + + ret = [] + for blk in blocks: + ret.append(Fallback.schedule_default(sch, blk)) + + return ret + + +def schedule_fallback(sch, blk): + from .fallback import Fallback + + return Fallback.schedule_fallback(sch) diff --git a/python/tvm/dlight/analysis/__init__.py b/python/tvm/dlight/analysis/__init__.py index bf68d0855015..0df8abb2bf5c 100644 --- a/python/tvm/dlight/analysis/__init__.py +++ b/python/tvm/dlight/analysis/__init__.py @@ -24,6 +24,7 @@ is_broadcast_epilogue, normalize_prim_func, get_root_block, + get_block_info, ) from .gemv import ( is_gemv, diff --git a/python/tvm/dlight/analysis/common_analysis.py b/python/tvm/dlight/analysis/common_analysis.py index e3357c6e78db..161deaf53772 100644 --- a/python/tvm/dlight/analysis/common_analysis.py +++ b/python/tvm/dlight/analysis/common_analysis.py @@ -14,9 +14,12 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-"""Analysis on TIR blocks, loops and functions.""" -from typing import List, Optional, Set, Union +# pylint: disable=missing-function-docstring, missing-class-docstring +# pylint: disable=unused-argument, unused-variable +"""Analysis on TIR blocks, loops and functions.""" +from collections import namedtuple +from typing import List, Optional, Set, Union, Tuple from typing_extensions import Literal from tvm_ffi import get_global_func @@ -24,6 +27,7 @@ from tvm.target.target import Target from tvm.tir import Schedule from tvm.tir.schedule import BlockRV +from tvm.runtime import DataType class IterInfo: @@ -59,6 +63,106 @@ def __repr__(self) -> str: return str(self) +get_blockrealize = get_global_func("tir.schedule.GetBlockRealize") +# BufferIndex Types +Index = namedtuple("Index", ["sub"]) # c +RemIndex = namedtuple("RemIndex", ["sub", "div"]) # c%len +DivIndex = namedtuple("DivIndex", ["sub", "div"]) # c//len +MergeIndex = namedtuple("MulIndex", ["dom", "mul", "sub"]) # co*len + cb +BufIndex = List[Union[Index, RemIndex, DivIndex, MergeIndex, None]] + + +class BufferInfo: + "Information about Buffer. Provides useful analysis" + buf_region: tir.BufferRegion + shape: Tuple[int] + assoc_lps: List[Union[tir.schedule.LoopRV, None]] + assoc_lps_info: List[Union[tir.For, None]] + + def __init__( + self, + sch: tir.Schedule, + block_rv: tir.schedule.BlockRV, + buf_region: tir.BufferRegion, + lps: Union[List[tir.schedule.LoopRV], None], + ): + block = sch.get(block_rv) + if lps is None: + lps = sch.get_loops(block_rv) + loops = [sch.get(lp) for lp in lps] + iter_vars = [Var.var for Var in block.iter_vars] + iter_values = get_blockrealize(sch, block_rv).iter_values + lpvar_lp = dict([loop.loop_var, lp] for loop, lp in zip(loops, lps)) + var_lp = dict(zip(iter_vars, [lpvar_lp.get(val, None) for val in iter_values])) + + def extract_index_types(buf: tir.BufferRegion) -> BufIndex: + buf_index = [] + for expr in buf.region: + expr = expr.min + dim = None + if isinstance(expr, tir.expr.Add) and isinstance(expr.b, tir.expr.Var): + var_add = expr.b + if ( + isinstance(expr, tir.expr.Mul) + and isinstance(expr.a, tir.expr.Var) + and isinstance(expr.b, tir.expr.IntImm) + ): + mul = expr.b + var_mul = expr.a + dim = MergeIndex(var_mul, mul, var_add) + elif ( + isinstance(expr, tir.expr.FloorMod) + and isinstance(expr.a, tir.expr.Var) + and isinstance(expr.b, tir.expr.IntImm) + ): + dim = RemIndex(expr.a, expr.b) + elif ( + isinstance(expr, tir.expr.FloorDiv) + and isinstance(expr.a, tir.expr.Var) + and isinstance(expr.b, tir.expr.IntImm) + ): + dim = DivIndex(expr.a, expr.b) + elif isinstance(expr, tir.expr.Var): + dim = Index(expr) + buf_index.append(dim) + return buf_index + + indexes = extract_index_types(buf_region) + assoc_lps = [ + ( + var_lp.get(getattr(idx, "sub"), None) + if not isinstance(idx, DivIndex) and not idx is None + else None + ) + for idx in indexes + ] + + self.buf_region = buf_region + self.assoc_lps = assoc_lps + self.assoc_lps_info = [(sch.get(lp) if lp is not None else None) for lp in assoc_lps] + self.shape = buf_region.buffer.shape + + def get_scope(self) -> str: + return self.buf_region.buffer.scope() + + def get_vecsize(self, buf_index: int = 0, vbits: int = 128): + if self.assoc_lps_info[-1] is None: + return None + + vlp_extent = int(self.assoc_lps_info[-1].extent) & ~( + int(self.assoc_lps_info[-1].extent) - 1 + ) + vbuf_extent = int(self.shape[-1]) & ~(int(self.shape[-1]) - 1) + + return min(vlp_extent, vbuf_extent, vbits // DataType(self.buf_region.buffer.dtype).bits) + + def 
__str__(self) -> str: + return f"BufferInfo({self.buf_region})" + + def __repr__(self) -> str: + return str(self) + + class BlockInfo: """Information about a TIR block.""" @@ -84,6 +188,16 @@ def dom(self) -> List[Union[int, tir.PrimExpr]]: """The iteration domain of the block.""" return [i.dom for i in self.iters] + def read_bufs(self, sch: tir.Schedule) -> List[BufferInfo]: + block_stmt = sch.get(self.block_rv) + lps = sch.get_loops(self.block_rv) + return [BufferInfo(sch, self.block_rv, buf, lps) for buf in block_stmt.reads] + + def write_bufs(self, sch: tir.Schedule) -> List[BufferInfo]: + block_stmt = sch.get(self.block_rv) + lps = sch.get_loops(self.block_rv) + return [BufferInfo(sch, self.block_rv, buf, lps) for buf in block_stmt.writes] + def dom_kind(self) -> str: """The iteration domain kind of the block, for example, SSSS, SSSR.""" return "".join(i.kind for i in self.iters) @@ -108,15 +222,48 @@ def _check_unit_var_range(dom: ir.Range, var: tir.Var) -> bool: if len(r_region) != len(w_region): return False for var, r_dom, w_dom in zip(block.iter_vars, r_region, w_region): - if not _check_unit_var_range(var, r_dom) or not _check_unit_var_range(var, w_dom): + if not _check_unit_var_range(r_dom, var) or not _check_unit_var_range(w_dom, var): return False return True + def get_loops(self) -> List[tir.schedule.LoopRV]: + return [iter_info.loop_rv for iter_info in self.iters] + def is_reduction(self) -> bool: """Whether the block is a reduction workload.""" # TODO(@junrushao): distinguish GEMV and reduction return self._reduction_block + def is_layout_transform(self, sch: tir.Schedule) -> bool: + """Whether the Block can be considered having a Layout Transform Pattern""" + return ( + all(k == "S" for k in self.dom_kind()) + and len(self.write_bufs(sch)) == 1 + and len(self.read_bufs(sch)) == 1 + and not self.is_elementwise(sch) + and not get_global_func("tir.schedule.HasIfThenElse")(sch.get(self.block_rv)) + ) + + def is_data_pad(self, sch: tir.Schedule) -> bool: + """Whether the Block can be considered having a data pad pattern""" + return ( + all(k == "S" for k in self.dom_kind()) + and len(self.write_bufs(sch)) == 1 + and len(self.read_bufs(sch)) == 1 + and not self.is_elementwise(sch) + and len(self.write_bufs(sch)[0].buf_region.region) + == len(self.read_bufs(sch)[0].buf_region.region) + and get_global_func("tir.schedule.HasIfThenElse")(sch.get(self.block_rv)) + ) + + def is_convolution(self) -> bool: + """Whether a Block can be considered having Convolution Pattern""" + raise NotImplementedError + + def is_pool(self) -> bool: + """Whether a Block can be considered having Pooling Pattern""" + raise NotImplementedError + def is_gemv(self) -> bool: """Whether the block is a GEMV workload.""" raise NotImplementedError @@ -171,6 +318,32 @@ def _iter_kind(i: tir.IterVar) -> str: return blocks +def get_block_info(sch: tir.Schedule, block: tir.schedule.BlockRV) -> BlockInfo: + def _iter_kind(loop: tir.IterVar) -> str: + return {tir.IterVar.DataPar: "S", tir.IterVar.CommReduce: "R"}.get(loop.iter_type, "O") + + def _is_reduction_block(block: tir.schedule.BlockRV): + for iter_var in sch.get(block).iter_vars: + if _iter_kind(iter_var) == "R": + return True + return False + + return BlockInfo( + name=sch.get(block).name_hint, + iters=[ + IterInfo( + kind=_iter_kind(iter_var), + var=iter_var.var, + dom=iter_var.dom.extent, + loop_rv=loop_rv, + ) + for loop_rv, iter_var in zip(sch.get_loops(block), sch.get(block).iter_vars) + ], + block_rv=block, + reduction_block=_is_reduction_block(block), + 
) + + def _assert_gpu_target(target: Target): if "gpu" not in target.keys: raise ValueError(f"Expect a GPU target, but got {target}") diff --git a/python/tvm/ir/global_info.py b/python/tvm/ir/global_info.py index 185e10b88cce..597ff68dc1d8 100644 --- a/python/tvm/ir/global_info.py +++ b/python/tvm/ir/global_info.py @@ -18,6 +18,7 @@ import tvm import tvm_ffi from tvm.runtime.object import Object +from tvm.runtime import Device from . import _ffi_api @@ -39,6 +40,8 @@ def same_as(self, other): @tvm_ffi.register_object("ir.DummyGlobalInfo") class DummyGlobalInfo(GlobalInfo): + """DummyGlobalInfo""" + def __init__(self) -> None: self.__init_handle_by_constructor__( _ffi_api.DummyGlobalInfo, @@ -47,6 +50,8 @@ def __init__(self) -> None: @tvm_ffi.register_object("ir.VDevice") class VDevice(GlobalInfo): + """VDevice""" + def __init__( self, target=None, @@ -55,4 +60,6 @@ def __init__( ) -> None: if isinstance(target, (dict, str)): target = tvm.target.Target(tvm.runtime.convert(target)) + if isinstance(target, Device): + target = tvm.target.Target.from_device(target) self.__init_handle_by_constructor__(_ffi_api.VDevice, target, vdevice_id, memory_scope) diff --git a/python/tvm/relax/backend/adreno/pipeline.py b/python/tvm/relax/backend/adreno/pipeline.py index 612b8ce7011d..cbbb17a6180a 100644 --- a/python/tvm/relax/backend/adreno/pipeline.py +++ b/python/tvm/relax/backend/adreno/pipeline.py @@ -18,6 +18,7 @@ import tvm from tvm import dlight as dl from tvm import relax +from tvm.relax.transform.legalize_ops import adreno as legalize_adreno def library_dispatch_passes(target: tvm.target.Target): # pylint: disable=unused-argument @@ -30,21 +31,78 @@ def library_dispatch_passes(target: tvm.target.Target): # pylint: disable=unuse def legalize_passes(target: tvm.target.Target): # pylint: disable=unused-argument """The default legalization passes for Adreno GPU backend.""" - return [ - relax.transform.DecomposeOpsForInference(), - relax.transform.FoldConstant(), - relax.transform.LegalizeOps(), - relax.transform.AnnotateTIROpPattern(), - relax.transform.FoldConstant(), - relax.transform.FuseOps(), - relax.transform.FuseTIR(), - relax.transform.DeadCodeElimination(), - dl.ApplyDefaultSchedule( - dl.gpu.Reduction(), - dl.gpu.GeneralReduction(), - dl.gpu.Fallback(), - ), + desired_layouts = {"relax.nn.conv2d": ["NCHW4c", "OIHW4o", "NCHW4c"]} + skip_ops = [ + "relax.nn.conv2d", + "relax.nn.max_pool2d", + "relax.nn.adaptive_avg_pool2d", ] + pass_list = [] + + pass_list.extend( + [ + tvm.tir.transform.BindTarget(tvm.target.Target.current(allow_none=False)), + relax.transform.DecomposeOpsForInference(), + ] + ) + if "texture" in target.keys: + pass_list.extend( + [ + relax.transform.ConvertLayout(desired_layouts), + relax.transform.Normalize(), + relax.transform.FoldConstant(), + relax.transform.LegalizeOps(skip_ops=skip_ops), + relax.transform.AnnotateTIROpPattern(), + relax.backend.adreno.transform.AnnotateCustomMemoryScope(target), + ] + ) + pass_list.extend([tvm.relax.transform.LegalizeOps()]) + if "texture" in target.keys: + pass_list.extend( + [ + relax.transform.LegalizeOps( + {"relax.nn.conv2d": legalize_adreno.conv2d_NCHWc_OIHWo}, + ) + ] + ) + + pass_list.extend( + [ + relax.transform.AnnotateTIROpPattern(), + relax.transform.FoldConstant(), + relax.transform.FuseOps(), + relax.transform.FuseTIR(), + relax.transform.DeadCodeElimination(), + ] + ) + if "texture" in target.keys: + pass_list.extend( + [ + relax.backend.adreno.transform.FoldVDeviceScopeChange(), + relax.transform.DeadCodeElimination(), 
+ relax.transform.SpecializePrimFuncBasedOnCallSite(), + ] + ) + pass_list.extend([relax.transform.Normalize()]) + pass_list.extend( + [ + dl.ApplyDefaultSchedule( + dl.adreno.Conv2d(), + dl.adreno.LayoutTransform(), + dl.adreno.Pool2D(), + ) + ] + ) + pass_list.extend( + [ + dl.ApplyDefaultSchedule( + dl.gpu.Reduction(), + dl.gpu.GeneralReduction(), + dl.gpu.Fallback(), + ) + ] + ) + return pass_list def dataflow_lower_passes(target: tvm.target.Target): # pylint: disable=unused-argument diff --git a/python/tvm/relax/op/memory/memory.py b/python/tvm/relax/op/memory/memory.py index 7b84ffc48bb6..8e17194f67d9 100644 --- a/python/tvm/relax/op/memory/memory.py +++ b/python/tvm/relax/op/memory/memory.py @@ -62,7 +62,11 @@ def alloc_storage( @args_converter.auto def alloc_tensor( - storage: Expr, offset: Union[int, Expr], shape: Expr, dtype: Union[str, Expr] + storage: Expr, + offset: Union[int, Expr], + shape: Expr, + dtype: Union[str, Expr], + runtime_device_ind: Union[int, Expr] = PrimValue(0), ) -> Call: """Construct a Call to allocate a tensor on a certain storage starting from the given offset. @@ -80,6 +84,10 @@ def alloc_tensor( dtype : Union[str, Expr] The datatype of the tensor to be allocated. + runtime_device_ind: Union[int, Expr] + The device index indicating on which device the tensor is to be + allocated at runtime. Index -1 is reserved for the host device. + Returns ------- result : Call @@ -89,7 +97,7 @@ def alloc_tensor( offset = PrimValue(offset) if isinstance(dtype, str): dtype = DataTypeImm(dtype) - return _ffi_api.alloc_tensor(storage, offset, shape, dtype) # type: ignore + return _ffi_api.alloc_tensor(storage, offset, shape, dtype, runtime_device_ind) # type: ignore @args_converter.auto diff --git a/python/tvm/relax/op/vm/vm.py b/python/tvm/relax/op/vm/vm.py index 3ed6b29648b4..dcc9e5a33a53 100644 --- a/python/tvm/relax/op/vm/vm.py +++ b/python/tvm/relax/op/vm/vm.py @@ -62,7 +62,11 @@ def alloc_storage( @args_converter.auto def alloc_tensor( - storage: Expr, offset: Union[int, Expr], shape: Expr, dtype: Union[str, Expr] + storage: Expr, + offset: Union[int, Expr], + shape: Expr, + dtype: Union[str, Expr], + runtime_device_ind: Union[int, Expr] = PrimValue(0), ) -> Call: """Construct a Call to allocate a tensor on a certain storage starting from the given offset. @@ -80,6 +84,10 @@ def alloc_tensor( dtype : Union[str, Expr] The datatype of the tensor to be allocated. + runtime_device_ind: Union[int, Expr] + The device index indicating on which device the tensor is to be + allocated at runtime. Index -1 is reserved for the host device. 
+ Returns ------- result : Call @@ -89,7 +97,7 @@ def alloc_tensor( offset = PrimValue(offset) if isinstance(dtype, str): dtype = DataTypeImm(dtype) - return _ffi_api.alloc_tensor(storage, offset, shape, dtype) # type: ignore + return _ffi_api.alloc_tensor(storage, offset, shape, dtype, runtime_device_ind) # type: ignore def kill_object(obj: Expr) -> Call: diff --git a/python/tvm/relax/transform/lower_gpu_ipc_alloc_storage.py b/python/tvm/relax/transform/lower_gpu_ipc_alloc_storage.py index 00081f92b197..b73b40d6fc9c 100644 --- a/python/tvm/relax/transform/lower_gpu_ipc_alloc_storage.py +++ b/python/tvm/relax/transform/lower_gpu_ipc_alloc_storage.py @@ -78,6 +78,6 @@ def rewrite_alloc_tensor(self, call: relax.Call) -> relax.Call: ) return relax.Call( self.memory_alloc_tensor_op, - args=[ipc_alloc_storage, call.args[2], shape, dtype], + args=[ipc_alloc_storage, call.args[2], shape, dtype, relax.PrimValue(0)], sinfo_args=call.sinfo_args, ) diff --git a/python/tvm/script/ir_builder/relax/ir.py b/python/tvm/script/ir_builder/relax/ir.py index f221a1308965..094ec3379fe9 100644 --- a/python/tvm/script/ir_builder/relax/ir.py +++ b/python/tvm/script/ir_builder/relax/ir.py @@ -619,6 +619,30 @@ def emit_var_binding(value: VarBinding) -> Var: return _ffi_api.EmitVarBinding(value) # type: ignore +def emit_with_sinfo( + op: str, + args: Expr, + sinfo_args: Optional[Union[StructInfo, List[StructInfo]]] = None, +) -> Call: + """Create a relax Call with sinfo_args. + Parameters + ---------- + op: Expr + The relax op for which sinfo_args to be appended + args : Expr + The arguments. + sinfo_args: Optional[Union[StructInfo, List[StructInfo]]] + The list of structure info arguments. + + Returns + ------- + call: Call + The created Relax Call + """ + builtin_call = tvm.ir.Op.get(op) + return Call(builtin_call, args, attrs=None, sinfo_args=sinfo_args) + + ############################### SeqExpr ############################### @@ -822,6 +846,7 @@ def dtype(value: Union[py_str, DataType]) -> Expr: "emit_te", "emit_var_binding", "emit_match_cast", + "emit_with_sinfo", "equal", "ewise_fma", "exp", diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index eb6e25f0450c..31788d5f062a 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -838,7 +838,7 @@ def stm32(series="unknown", options=None): return Target(" ".join(["c"] + opts)) -def adreno(model="unknown", options=None, clml=False): +def adreno(model="unknown", options=None, cfg=None, backend="opencl"): """Returns a Qualcomm GPU target. Parameters ---------- @@ -846,13 +846,22 @@ def adreno(model="unknown", options=None, clml=False): The model of this device options : str or list of str Additional options + cfg : str + Additional hints for target pipeline behavior + backend : str + Backend API, can be "opencl" or "vulkan" """ - if clml: - opts = ["-device=adreno", "--keys=adreno,opencl,gpu,clml", "-model=%s" % model] - else: - opts = ["-device=adreno", "--keys=adreno,opencl,gpu", "-model=%s" % model] + + if backend not in ["opencl", "vulkan"]: + raise ValueError(f"Unsupported API: {backend}. 
Must be 'opencl' or 'vulkan'.") + + keys = f"adreno,{backend},gpu" + if cfg: + keys += f",{cfg}" + + opts = ["-device=adreno", f"--keys={keys}", f"-model={model}"] opts = _merge_opts(opts, options) - return Target(" ".join(["opencl"] + opts)) + return Target(" ".join([backend] + opts)) def create(target): diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py index da22cf77466f..fc9a2296e786 100644 --- a/python/tvm/testing/utils.py +++ b/python/tvm/testing/utils.py @@ -918,6 +918,17 @@ def _multi_gpu_exists(): parent_features="cuda", ) +# Mark a test as requiring the OpenCL runtime on remote RPC +requires_adreno_opencl = Feature( + "opencl", + long_name="Remote Adreno OpenCL", + cmake_flag="USE_OPENCL", + target_kind_enabled="opencl", + target_kind_hardware=None, + parent_features="gpu", + run_time_check=lambda: os.getenv("RPC_TARGET") is not None, +) + # Mark a test as requiring the OpenCL runtime requires_opencl = Feature( "opencl", diff --git a/python/tvm/tir/__init__.py b/python/tvm/tir/__init__.py index 0a598e5e9bb9..74f2f6b2f757 100644 --- a/python/tvm/tir/__init__.py +++ b/python/tvm/tir/__init__.py @@ -126,3 +126,4 @@ from .build import build from .pipeline import get_tir_pipeline, get_default_tir_pipeline from .functor import PyStmtExprVisitor, PyStmtExprMutator +from . import backend diff --git a/python/tvm/tir/backend/__init__.py b/python/tvm/tir/backend/__init__.py new file mode 100644 index 000000000000..bd04016a2bb3 --- /dev/null +++ b/python/tvm/tir/backend/__init__.py @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""TIR backends""" + +from . import adreno diff --git a/python/tvm/tir/backend/adreno/__init__.py b/python/tvm/tir/backend/adreno/__init__.py new file mode 100644 index 000000000000..cb59e73b57e2 --- /dev/null +++ b/python/tvm/tir/backend/adreno/__init__.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""The TIR Adreno backend passes """ + +from . 
import transform +from .pipeline import get_tir_pipeline diff --git a/python/tvm/tir/backend/adreno/pipeline.py b/python/tvm/tir/backend/adreno/pipeline.py new file mode 100644 index 000000000000..1efc8e83963c --- /dev/null +++ b/python/tvm/tir/backend/adreno/pipeline.py @@ -0,0 +1,160 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=invalid-name +"""The TIR backend compilation pipeline for Adreno""" + +import tvm +from tvm import tir + + +def default_tir_pipeline(): + """The default tir pipeline used in tvm.tir.build""" + + @tvm.transform.module_pass(opt_level=0) + def _pipeline(mod: tvm.ir.IRModule, _ctx: tvm.transform.PassContext) -> tvm.ir.IRModule: + """The default lowering passes for TIR backend.""" + pass_ctx = tvm.transform.PassContext.current() + config = pass_ctx.config + passes = [ + tir.backend.adreno.transform.TextureFlatten(), + tir.transform.CanonicalizeLoop(), + tir.transform.LowerCrossThreadReduction(), + tir.transform.LowerInitBlock(), + tir.transform.PlanAndUpdateBufferAllocationLocation(), + tir.transform.ConvertBlocksToOpaque(), + tir.transform.LiftThreadBinding(), + tir.transform.ManifestSharedMemoryLocalStage(), + tir.transform.CompactBufferAllocation(), + tir.transform.LowerAutoCopy(), + tir.transform.UnifyThreadBinding(), + tir.transform.LowerMatchBuffer(), + tir.transform.Simplify(), + tir.transform.InjectPermutedLayout(), + tir.transform.AnnotateIrregularLoop(), + tir.transform.InjectSoftwarePipeline(), + tir.transform.TransformMmaBufferLayout(), + tir.transform.LowerOpaqueBlock(), + tir.backend.adreno.transform.InjectTextureAlloc(), + tir.transform.FlattenBuffer(), + tir.transform.BF16ComputeLegalize(), + tir.transform.NarrowDataType(32), + tir.transform.LoopPartition(), + tir.transform.VectorizeLoop(not bool(config.get("tir.disable_vectorize", False))), + tir.transform.InjectVirtualThread(), + tir.transform.InjectDoubleBuffer(), + ] + if not bool(config.get("tir.disable_storage_rewrite", False)): + passes.append(tir.transform.StorageRewrite()) + if config.get("tir.use_async_copy", False): + passes.append(tir.transform.LowerAsyncDMA()) + passes.extend( + [ + tir.transform.HoistIfThenElse(), + tir.transform.UnrollLoop(), + tir.transform.RenormalizeSplitPattern(), + tir.transform.Simplify(), + tir.transform.RemoveNoOp(), + tir.transform.RewriteUnsafeSelect(), + ] + ) + # Additional passes based on configuration. 
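+        # The conditional passes below mirror the stock TIR pipeline and are driven by
+        # PassContext config flags: "tir.instrument_bound_checkers", "tir.ptx_ldg32",
+        # "tir.disable_cse_tir", "tir.enable_equiv_terms_in_cse_tir",
+        # "tir.instrument_lwp", "tir.detect_global_barrier" and "tir.use_async_copy".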
+ if bool(config.get("tir.instrument_bound_checkers", False)): + passes.append(tir.transform.InstrumentBoundCheckers()) + if bool(config.get("tir.ptx_ldg32", False)): + passes.append(tir.transform.InjectPTXLDG32(True)) + passes.append( + tir.transform.CommonSubexprElimTIR( + not bool(config.get("tir.disable_cse_tir", False)), + bool(config.get("tir.enable_equiv_terms_in_cse_tir", False)), + ) + ) + if bool(config.get("tir.instrument_lwp", False)): + passes.append(tir.transform.InstrumentProfileIntrinsics()) + passes.extend( + [ + # Bind the target first so that target-specific attributes are available. + tir.transform.FP8ComputeLegalize(), + # VerifyVTCMLimit must occur before LowerVtcmAlloc. + tir.transform.VerifyVTCMLimit(), + tir.transform.LowerVtcmAlloc(), + tir.transform.VerifyMemory(), + tir.transform.AnnotateEntryFunc(), + ] + ) + if bool(config.get("tir.detect_global_barrier", False)): + passes.append(tir.transform.ThreadSync("global")) + passes.extend( + [ + tir.transform.ThreadSync("shared"), + tir.transform.ThreadSync("shared.dyn"), + tir.transform.ThreadSync("warp"), + tir.transform.InferFragment(), + tir.transform.LowerThreadAllreduce(), + ] + ) + if bool(config.get("tir.use_async_copy", False)): + passes.append(tir.transform.InjectPTXAsyncCopy()) + if bool(config.get("tir.ptx_ldg32", False)): + passes.append(tir.transform.InjectPTXLDG32()) + passes.extend( + [ + tir.transform.AnnotateDeviceRegions(), + tir.transform.SplitHostDevice(), + # MergeSharedMemoryAllocations must follow SplitHostDevice. + tir.transform.MergeSharedMemoryAllocations(), + tir.transform.MakePackedAPI(), + tir.transform.FP8StorageLegalize(), + tir.transform.BF16StorageLegalize(), + tir.transform.LowerDeviceKernelLaunch(), + ] + ) + mod = tvm.ir.transform.Sequential(passes)(mod) + return mod + + return _pipeline + + +def finalize_host_passes(): # pylint: disable=unused-argument + """The default finalization passes for TIR backend.""" + host_pass_list = [ + tir.transform.LowerTVMBuiltin(), + tir.transform.LowerCustomDatatypes(), + tir.transform.LowerIntrin(), + tir.transform.LowerDeviceStorageAccessInfo(), + tir.transform.CombineContextCall(), + ] + return tvm.ir.transform.Sequential(host_pass_list) + + +def finalize_device_passes(): # pylint: disable=unused-argument + """The default finalization passes for TIR backend.""" + device_pass_list = [ + tir.transform.LowerWarpMemory(), + tir.transform.Simplify(), + tir.transform.LowerCustomDatatypes(), + tir.transform.LowerDeviceStorageAccessInfo(), + tir.transform.LowerIntrin(), + ] + return tvm.ir.transform.Sequential(device_pass_list) + + +def get_tir_pipeline( + target: tvm.target.Target, # pylint: disable=unused-argument +) -> tvm.transform.Pass: + """Get the TIR pipeline for Adreno GPU.""" + return default_tir_pipeline() diff --git a/python/tvm/tir/backend/adreno/transform/__init__.py b/python/tvm/tir/backend/adreno/transform/__init__.py new file mode 100644 index 000000000000..25c83f604916 --- /dev/null +++ b/python/tvm/tir/backend/adreno/transform/__init__.py @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Adreno TIR transformations. """ + +from .transform import ( + InjectTextureAlloc, + TextureFlatten, +) diff --git a/python/tvm/tir/backend/adreno/transform/_ffi_api.py b/python/tvm/tir/backend/adreno/transform/_ffi_api.py new file mode 100644 index 000000000000..7df0ae3eae47 --- /dev/null +++ b/python/tvm/tir/backend/adreno/transform/_ffi_api.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""FFI APIs for tvm.tir.backend.adreno.transform""" +import tvm_ffi + + +tvm_ffi.init_ffi_api("tir.backend.adreno.transform", __name__) diff --git a/python/tvm/tir/backend/adreno/transform/transform.py b/python/tvm/tir/backend/adreno/transform/transform.py new file mode 100644 index 000000000000..03cc162a4bad --- /dev/null +++ b/python/tvm/tir/backend/adreno/transform/transform.py @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Wrapping existing transformations.""" +# pylint: disable=invalid-name, unsupported-binary-operation + +from . import _ffi_api + + +def InjectTextureAlloc(): + """Inject Texture Allocation Intrinsic to make sure appropriate lowering + via alloc_nd/alloc_free calls + Returns + ------- + fpass : tvm.transform.Pass + The result pass + """ + return _ffi_api.InjectTextureAlloc() # type: ignore + + +def TextureFlatten(): + """Flatten the multi-dimensional read/write to 2D. 
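+
+    For the Adreno backend this lowers reads and writes on texture-scoped
+    buffers into image (texture) coordinates.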
+ + + Parameters + ---------- + + Returns + ------- + fpass : tvm.transform.Pass + The result pass + """ + return _ffi_api.TextureFlatten() # type: ignore diff --git a/python/tvm/tir/pipeline.py b/python/tvm/tir/pipeline.py index 96ed9dfdbc96..1ee4a5b1d315 100644 --- a/python/tvm/tir/pipeline.py +++ b/python/tvm/tir/pipeline.py @@ -21,6 +21,8 @@ import tvm from tvm import tir +from . import backend + def default_tir_pipeline(): """The default tir pipeline used in tvm.tir.build""" @@ -176,4 +178,7 @@ def get_default_tir_pipeline( target: tvm.target.Target, # pylint: disable=unused-argument ) -> tvm.transform.Pass: """Get the default TIR pipeline for the given target.""" - return default_tir_pipeline() + if target.kind.name == "opencl" and "adreno" in target.keys: + return backend.adreno.get_tir_pipeline(target) + else: + return default_tir_pipeline() diff --git a/src/relax/backend/adreno/annotate_custom_storage.cc b/src/relax/backend/adreno/annotate_custom_storage.cc index 887b81872940..f42c5c456124 100644 --- a/src/relax/backend/adreno/annotate_custom_storage.cc +++ b/src/relax/backend/adreno/annotate_custom_storage.cc @@ -280,30 +280,58 @@ class CollectConsumerScopeInfo : public ExprVisitor { mod_ = mod; target_ = target; VisitExpr(func->body); - // Extend the scope for tuple items - for (const auto& val : arg_to_binding) { + + for (const auto& val : tuple_item_to_binding) { if (scope_info.find(val.first) != scope_info.end()) { - if (scope_info.find(val.second) == scope_info.end()) { - scope_info.Set(val.second, scope_info[val.first]); - } else { - auto ent = scope_info[val.second]; - for (auto ent_val : scope_info[val.first]) { - ent.Set(ent_val.first, ent_val.second); + for (const auto& item_val : val.second) { + // TODO(Siva): How about ops that generate tuples like split ? + if (tuples_to_binding.find(item_val.first) == tuples_to_binding.end()) { + continue; + } + auto producer_var = tuples_to_binding[item_val.first][item_val.second]; + if (scope_info.find(producer_var) == scope_info.end()) { + scope_info.Set(producer_var, scope_info[val.first]); + } else { + auto ent = scope_info[producer_var]; + for (auto ent_val : scope_info[val.first]) { + ent.Set(ent_val.first, ent_val.second); + } + scope_info.Set(producer_var, ent); } - scope_info.Set(val.second, ent); } } } - return std::make_pair(call_scope_info, scope_info); } void VisitBinding_(const VarBindingNode* binding, const TupleGetItemNode* tuple_get_item_node) final { - if (arg_to_binding.find(ffi::GetRef(binding->var.get())) == arg_to_binding.end()) { - arg_to_binding.Set(ffi::GetRef(binding->var.get()), - ffi::GetRef(tuple_get_item_node->tuple.get())); + /* + * lv9 = R.call_tir(add, (lv7, m + * lv: R.Tuple( + * R.Tensor((1, 3, 224, 224), dtype="float32"), + * R.Tensor((3,), dtype="float32"), + * R.Tensor((3,), dtype="float32") + * ) = lv9, metadata["relax.expr.Constant"][4], metadata["relax.expr.Constant"][5] + * lv1_1: R.Tensor((1, 3, 224, 224), dtype="float32") = lv[0] + * lv4: R.Tensor((1, 64, 112, 112), dtype="float32") = R.nn.conv2d(lv1_1, ..... 
+ * + * lv1_1 scope is requested by conv2d now we need to populate the same to lv9 + * Capture essestial information here as + * + * lv => {(0, lv1_1), (1, ) ...} + */ + ffi::Map field_map; + if (tuple_item_to_binding.find(ffi::GetRef(binding->var.get())) != + tuple_item_to_binding.end()) { + field_map = tuple_item_to_binding[ffi::GetRef(binding->var.get())]; } + field_map.Set(ffi::GetRef(tuple_get_item_node->tuple.get()), tuple_get_item_node->index); + tuple_item_to_binding.Set(ffi::GetRef(binding->var.get()), field_map); + } + + void VisitBinding_(const VarBindingNode* binding, const TupleNode* tuple) final { + tuples_to_binding.Set(ffi::GetRef(binding->var.get()), tuple->fields); } void VisitExpr_(const CallNode* call) final { @@ -325,17 +353,18 @@ class CollectConsumerScopeInfo : public ExprVisitor { func_args = Tuple(call->args); } - bool is_texture_supported = SupportsTexture(op_attrs, op_pattern.value()); + auto is_texture_supported = SupportsTexture(op_attrs, op_pattern.value()); ffi::Array arg_scope; - for (auto arg : func_args->fields) { - auto sinfo = GetStructInfo(arg); + for (uint32_t i = 0; i < func_args->fields.size(); ++i) { + auto sinfo = GetStructInfo(func_args->fields[i]); if (auto tensor_sinfo = sinfo.as()) { - auto scope = is_texture_supported - ? Scope(GetShapeFromTensorStructInfo(tensor_sinfo.value())) - : "global"; + bool is_texture = + i < is_texture_supported.size() ? is_texture_supported[i] : is_texture_supported[0]; + auto scope = + is_texture ? Scope(GetShapeFromTensorStructInfo(tensor_sinfo.value())) : "global"; ffi::Map> ent_call; - const VarNode* arg_var = arg.as(); + const VarNode* arg_var = func_args->fields[i].as(); if (scope_info.find(ffi::GetRef(arg_var)) != scope_info.end()) { ent_call = scope_info[ffi::GetRef(arg_var)]; } @@ -368,28 +397,29 @@ class CollectConsumerScopeInfo : public ExprVisitor { return op_pat; } - bool SupportsTexture(const ffi::Array& op_attrs, Integer op_pattern) { - if (op_pattern.IntValue() < OpPatternKind::kCommReduce) return true; + std::vector SupportsTexture(const ffi::Array& op_attrs, Integer op_pattern) { + if (op_pattern.IntValue() < OpPatternKind::kCommReduce) return {true}; for (auto attr : op_attrs) { if (auto conv_attr = attr.as()) { if (conv_attr->data_layout == "NCHW4c" && conv_attr->kernel_layout == "OIHW4o") { - return true; + // No Texture for weights + return {true, false}; } } else if (auto pool_attrs = attr.as()) { if (pool_attrs->layout == "NCHW4c") { - return true; + return {true}; } } else if (auto avg_attrs = attr.as()) { if (avg_attrs->layout == "NCHW4c") { - return true; + return {true}; } } else if (attr.as()) { - return true; + return {true}; } } - return false; + return {false}; } std::string Scope(ffi::Array shape) { @@ -432,7 +462,8 @@ class CollectConsumerScopeInfo : public ExprVisitor { ffi::Map>> scope_info; /* A map of call node and scope info for each argument it consunes */ ffi::Map> call_scope_info; - ffi::Map arg_to_binding; + ffi::Map> tuple_item_to_binding; + ffi::Map> tuples_to_binding; IRModule mod_; Target target_; }; @@ -600,12 +631,9 @@ class DefineVDevice : ExprMutator { if (call->op == call_tir_op) { gv = Downcast(call->args[0]); - // tir::PrimFunc pfunc = Downcast(mod_->Lookup(gv)); - // out_sinfo = call->sinfo_args[0]; func_args = Downcast(call->args[1]); } else { func_args = Tuple(call->args); - // return call; } ffi::Array new_args; diff --git a/src/relax/backend/vm/codegen_vm.cc b/src/relax/backend/vm/codegen_vm.cc index e2d9b5b068b7..04e9880b262a 100644 --- 
a/src/relax/backend/vm/codegen_vm.cc +++ b/src/relax/backend/vm/codegen_vm.cc @@ -35,6 +35,7 @@ #include "../../../runtime/const_loader_module.h" #include "../../../target/source/codegen_source_base.h" +#include "../../transform/utils.h" namespace tvm { namespace relax { @@ -214,7 +215,15 @@ class CodeGenVM : public ExprFunctor { } Instruction::Arg VisitExpr_(const ConstantNode* op) final { - return builder_->ConvertConstant(op->data); + auto arg = builder_->ConvertConstant(op->data); + + if (auto tsinfo = op->struct_info_.as()) { + if (tsinfo->vdevice.defined()) { + VDevice vdev = tsinfo->vdevice.value(); + builder_->SaveMemoryScope(arg, vdev->memory_scope); + } + } + return arg; } Instruction::Arg VisitExpr_(const ShapeExprNode* op) final { @@ -333,11 +342,19 @@ class CodeGenVM : public ExprFunctor { } void EmitAllocTensor(const Call& call_node, RegName dst_reg) { - ICHECK_EQ(call_node->args.size(), 4); + ICHECK_EQ(call_node->args.size(), 5); std::vector args; - args.reserve(4); - for (Expr arg : call_node->args) { - args.push_back(this->VisitExpr(arg)); + for (int i = 0; i < 4; ++i) { + args.push_back(this->VisitExpr(call_node->args[i])); + } + int64_t vdevice_index = -1; + if (auto* prim_value_node = call_node->args[4].as()) { + vdevice_index = prim_value_node->value.as()->value; + } + auto vdevice = GetGlobalVDevice(ctx_mod_, vdevice_index); + + if (vdevice.defined()) { + args.push_back(this->VisitExpr(StringImm(vdevice.value()->memory_scope))); } builder_->EmitCall("vm.builtin.alloc_tensor", args, dst_reg); } diff --git a/src/relax/backend/vm/codegen_vm_tir.cc b/src/relax/backend/vm/codegen_vm_tir.cc index a5bb83d406a5..c783a9da3d2a 100644 --- a/src/relax/backend/vm/codegen_vm_tir.cc +++ b/src/relax/backend/vm/codegen_vm_tir.cc @@ -38,6 +38,8 @@ #include #include +#include "../../transform/utils.h" + namespace tvm { namespace relax { namespace codegen_vm { @@ -401,12 +403,21 @@ class CodeGenVMTIR : public ExprFunctor(const Expr&)> { } void EmitAllocTensor(const Call& call_node, int64_t dst_reg) { - ICHECK_EQ(call_node->args.size(), 4); + ICHECK_EQ(call_node->args.size(), 5); ffi::Array args; - args.reserve(4); - for (Expr arg : call_node->args) { - args.push_back(this->VisitExpr(arg).value()); + for (int i = 0; i < 4; ++i) { + args.push_back(this->VisitExpr(call_node->args[i]).value()); + } + int64_t vdevice_index = -1; + if (auto* prim_value_node = call_node->args[4].as()) { + vdevice_index = prim_value_node->value.as()->value; } + auto vdevice = GetGlobalVDevice(ctx_mod_, vdevice_index); + + if (vdevice.defined()) { + args.push_back(tir::StringImm(vdevice.value()->memory_scope)); + } + this->EmitCallPacked("vm.builtin.alloc_tensor", args, dst_reg); } diff --git a/src/relax/backend/vm/exec_builder.cc b/src/relax/backend/vm/exec_builder.cc index b893b48830ce..a2fef22609be 100644 --- a/src/relax/backend/vm/exec_builder.cc +++ b/src/relax/backend/vm/exec_builder.cc @@ -46,6 +46,10 @@ ObjectPtr ExecBuilderNode::Get() { return exec_; } +void ExecBuilderNode::SaveMemoryScope(vm::Instruction::Arg idx, ffi::String scope) { + exec_->memory_scopes[idx.value()] = scope; +} + vm::Instruction::Arg ExecBuilderNode::ConvertConstant_(Any cvalue) { // emit constant immediate as immediate. 
if (auto opt_int = cvalue.as()) { diff --git a/src/relax/backend/vm/lower_runtime_builtin.cc b/src/relax/backend/vm/lower_runtime_builtin.cc index 71b8413e9889..a1ffa4618423 100644 --- a/src/relax/backend/vm/lower_runtime_builtin.cc +++ b/src/relax/backend/vm/lower_runtime_builtin.cc @@ -92,7 +92,13 @@ class LowerRuntimeBuiltinMutator : public ExprMutator { Expr MakeMemAllocTensor(const Call& call) { PrimValue offset = Downcast(call->args[1]); DataTypeImm dtype = Downcast(call->args[3]); - return Call(vm_alloc_tensor_op_, {call->args[0], offset, call->args[2], dtype}, Attrs()); + + ffi::Array call_args = {call->args[0], offset, call->args[2], dtype}; + if (5 == call->args.size()) { + call_args.push_back(call->args[4]); + } + + return Call(vm_alloc_tensor_op_, call_args, Attrs()); } Expr MakeMemKillObject(const Call& call) { @@ -168,8 +174,10 @@ class LowerRuntimeBuiltinMutator : public ExprMutator { VDevice vdev = attrs->dst_vdevice; int dev_type = vdev->target->GetTargetDeviceType(); int dev_id = vdev->vdevice_id; + StringImm storage_scope = StringImm(vdev->memory_scope); args.push_back(PrimValue::Int64(dev_type)); args.push_back(PrimValue::Int64(dev_id)); + args.push_back(storage_scope); return Call(builtin_to_device_, args, call_node->attrs, {GetStructInfo(call_node)}); } diff --git a/src/relax/op/op.cc b/src/relax/op/op.cc index 54f9da4c786f..3acfb53b2784 100644 --- a/src/relax/op/op.cc +++ b/src/relax/op/op.cc @@ -23,6 +23,7 @@ #include #include +#include "../transform/utils.h" #include "op_common.h" namespace tvm { @@ -1200,6 +1201,15 @@ StructInfo InferStructInfoAllocateTensor(const Call& call, const BlockBuilder& c const DataTypeImm dtype_imm = ffi::GetRef(dtype_node); out_dtype = dtype_imm->value; } + int64_t vdevice_index = -1; + if (auto* prim_value_node = call->args[2].as()) { + vdevice_index = prim_value_node->value.as()->value; + } + auto vdevice = GetGlobalVDevice(ctx->GetContextIRModule(), vdevice_index); + + if (vdevice.defined()) { + return TensorStructInfo(call->args[0], out_dtype, vdevice.value()); + } return TensorStructInfo(call->args[0], out_dtype); } @@ -1266,28 +1276,55 @@ StructInfo InferStructInfoMemAllocTensor(const Call& call, const BlockBuilder& c const DataTypeImm dtype_imm = ffi::GetRef(dtype_node); out_dtype = dtype_imm->value; } + + if (call->args.size() == 5) { + int64_t vdevice_index = -1; + if (auto* prim_value_node = call->args[4].as()) { + vdevice_index = prim_value_node->value.as()->value; + } + auto vdevice = GetGlobalVDevice(ctx->GetContextIRModule(), vdevice_index); + if (vdevice.defined()) { + return TensorStructInfo(call->args[2], out_dtype, vdevice.value()); + } + } + return TensorStructInfo(call->args[2], out_dtype); } TVM_REGISTER_OP("relax.memory.alloc_tensor") - .set_num_inputs(4) + .set_num_inputs(5) .add_argument("storage", "Expr", "The storage to allocate the tensor to.") .add_argument("offset", "PrimValue", "Storage offset to allocate the tensor.") .add_argument("shape", "Expr", "The shape of the tensor to allocate.") .add_argument("dtype", "DataTypeImm", "The dtype of the tensor to allocate.") + .add_argument("runtime_device_index", "PrimValue", + "The device index indicating on which device the tensor is to be " + "allocated at runtime. 
Index -1 is reserved for the host device.") .set_attr("FInferStructInfo", InferStructInfoMemAllocTensor) // memory allocation isn't considered a "visible effect" as far as purity is concerned .set_attr("FPurity", Bool(true)) .set_attr("TAllocator", Bool(true)); -Expr MakeMemAllocTensor(Expr storage, PrimValue offset, Expr shape, DataTypeImm dtype) { +Expr MakeMemAllocTensor(Expr storage, PrimValue offset, Expr shape, DataTypeImm dtype, + PrimValue virtual_device_index) { static const Op& op = Op::Get("relax.memory.alloc_tensor"); - return Call(op, {storage, offset, shape, dtype}, Attrs(), {}); + return Call(op, {storage, offset, shape, dtype, virtual_device_index}, Attrs(), {}); } TVM_FFI_STATIC_INIT_BLOCK() { namespace refl = tvm::ffi::reflection; - refl::GlobalDef().def("relax.op.memory.alloc_tensor", MakeMemAllocTensor); + refl::GlobalDef().def_packed( + "relax.op.memory.alloc_tensor", [](ffi::PackedArgs args, ffi::Any* ret) { + if (args.size() == 5) { + *ret = MakeMemAllocTensor(args[0].cast(), args[1].cast(), + args[2].cast(), args[3].cast(), + args[4].cast()); + } else { + *ret = MakeMemAllocTensor(args[0].cast(), args[1].cast(), + args[2].cast(), args[3].cast(), + PrimValue::Int64(0)); + } + }); } // memory planning kill_storage @@ -1363,41 +1400,60 @@ StructInfo InferStructInfoVMAllocTensor(const Call& call, const BlockBuilder& ct const DataTypeImm dtype_imm = ffi::GetRef(dtype_node); out_dtype = dtype_imm->value; } + int64_t vdevice_index = -1; + if (auto* prim_value_node = call->args[4].as()) { + vdevice_index = prim_value_node->value.as()->value; + } + auto vdevice = GetGlobalVDevice(ctx->GetContextIRModule(), vdevice_index); + if (const auto* output_shape = call->args[2].as()) { - return TensorStructInfo(ffi::GetRef(output_shape), out_dtype); + return TensorStructInfo(ffi::GetRef(output_shape), out_dtype, vdevice); } else if (const auto* shape_sinfo = GetStructInfoAs(call->args[2])) { if (shape_sinfo->values.defined()) { - return TensorStructInfo(ShapeExpr(shape_sinfo->values.value()), out_dtype); + return TensorStructInfo(ShapeExpr(shape_sinfo->values.value()), out_dtype, vdevice); } else { - return TensorStructInfo(out_dtype, shape_sinfo->ndim); + return TensorStructInfo(out_dtype, shape_sinfo->ndim, vdevice); } } - return TensorStructInfo(out_dtype, kUnknownNDim); + return TensorStructInfo(out_dtype, kUnknownNDim, vdevice); } TVM_REGISTER_OP("relax.vm.alloc_tensor") - .set_num_inputs(4) + .set_num_inputs(5) .add_argument("storage", "Expr", "The storage to allocate the tensor to.") .add_argument("offset", "PrimValue", "Storage offset to allocate the tensor.") .add_argument("shape", "Expr", "The shape of the tensor to allocate.") .add_argument("dtype", "DataTypeImm", "The dtype of the tensor to allocate.") + .add_argument("runtime_device_index", "PrimValue", + "The device index indicating on which device the tensor is " + "to be allocated at runtime.") .set_attr("FInferStructInfo", InferStructInfoVMAllocTensor) // memory allocation isn't considered a "visible effect" as far as purity is concerned .set_attr("FPurity", Bool(true)) .set_attr("TAllocator", Bool(true)); -Expr MakeVMAllocTensor(Expr storage, PrimValue offset, Expr shape, DataTypeImm dtype) { +Expr MakeVMAllocTensor(Expr storage, PrimValue offset, Expr shape, DataTypeImm dtype, + PrimValue runtime_device_index) { static const Op& op = Op::Get("relax.vm.alloc_tensor"); - return Call(op, {storage, offset, shape, dtype}, Attrs(), {}); + return Call(op, {storage, offset, shape, dtype, runtime_device_index}, Attrs(), {}); 
} TVM_FFI_STATIC_INIT_BLOCK() { namespace refl = tvm::ffi::reflection; - refl::GlobalDef().def("relax.op.vm.alloc_tensor", MakeVMAllocTensor); + refl::GlobalDef().def_packed("relax.op.vm.alloc_tensor", [](ffi::PackedArgs args, ffi::Any* ret) { + if (args.size() == 5) { + *ret = + MakeVMAllocTensor(args[0].cast(), args[1].cast(), args[2].cast(), + args[3].cast(), args[4].cast()); + } else { + *ret = + MakeVMAllocTensor(args[0].cast(), args[1].cast(), args[2].cast(), + args[3].cast(), PrimValue::Int64(0)); + } + }); } // vm kill_object - TVM_REGISTER_OP("relax.vm.kill_object") .set_num_inputs(1) .add_argument("obj", "Expr", "The object to be killed.") diff --git a/src/relax/op/op_common.h b/src/relax/op/op_common.h index 5a556cbd7413..5c4f563bebee 100644 --- a/src/relax/op/op_common.h +++ b/src/relax/op/op_common.h @@ -211,7 +211,17 @@ inline StructInfo InferStructInfoUnary(const Call& call, const BlockBuilder& ctx } auto output_sinfo = ffi::make_object(*input_sinfo.get()); output_sinfo->dtype = f_compute_out_dtype(input_sinfo); - return TensorStructInfo(output_sinfo); + if (call->sinfo_args.size() > 0) { + auto defined_sinfo = call->sinfo_args[0].as(); + ICHECK(defined_sinfo); + auto shape = output_sinfo->GetShape(); + ICHECK(shape.defined()); + ICHECK(defined_sinfo->vdevice.has_value()); + return TensorStructInfo(ShapeExpr(shape.value()), output_sinfo->dtype, + defined_sinfo->vdevice.value()); + } else { + return TensorStructInfo(output_sinfo); + } } /*! @@ -568,7 +578,8 @@ inline ffi::Optional CheckNdimPerLayoutAndGetShape(const Call& call, Expr MakeVMAllocStorage(Expr size, PrimValue runtime_device_index, DataTypeImm dtype, StringImm storage_scope = StringImm("global")); -Expr MakeVMAllocTensor(Expr storage, PrimValue offset, Expr shape, DataTypeImm dtype); +Expr MakeVMAllocTensor(Expr storage, PrimValue offset, Expr shape, DataTypeImm dtype, + PrimValue runtime_device_index); Expr MakeAllocTensor(Expr shape, DataTypeImm dtype, PrimValue runtime_device_index, StringImm storage_scope = StringImm("global")); diff --git a/src/relax/transform/call_tir_rewrite.cc b/src/relax/transform/call_tir_rewrite.cc index d4763b44b713..2c0e515be7ec 100644 --- a/src/relax/transform/call_tir_rewrite.cc +++ b/src/relax/transform/call_tir_rewrite.cc @@ -81,15 +81,20 @@ class CallTIRMutator : public ExprMutator { ICHECK(tensor_sinfo->shape.defined()) << "the TensorStructInfo shape of call_tir has not populated"; int dev_index = 0; + ffi::String scope = "global"; if (tensor_sinfo->vdevice.defined()) { dev_index = GetDeviceIndex(mod_, tensor_sinfo->vdevice.value()); + scope = tensor_sinfo->vdevice.value()->memory_scope; + } else { + dev_index = GetDeviceIndexByScope(mod_, scope); } + if (!is_inplace) { outs.push_back(builder_->Emit(Call(alloc_tensor_op, {Downcast(tensor_sinfo->shape.value()), DataTypeImm(tensor_sinfo->dtype), - PrimValue::Int64(dev_index), StringImm("global")}, - Attrs()), + PrimValue::Int64(dev_index), StringImm(scope)}, + Attrs(), {tensor_sinfo}), "alloc")); } else { // if there is only one output, it must be an in-place argument, but check anyway @@ -112,13 +117,21 @@ class CallTIRMutator : public ExprMutator { ICHECK(field_tensor->shape.defined()) << "call_tir expects all TensorStructInfo has shape, but got " << field_tensor << " as an element of TupleStructInfo"; + + int dev_index = 0; + ffi::String scope = "global"; + if (field_tensor->vdevice.defined()) { + dev_index = GetDeviceIndex(mod_, field_tensor->vdevice.value()); + scope = field_tensor->vdevice.value()->memory_scope; + } + if 
(!is_inplace || inplace_attrs->inplace_indices[i].IntValue() == -1) { - outs.push_back(builder_->Emit( - Call(alloc_tensor_op, - {Downcast(field_tensor->shape.value()), - DataTypeImm(field_tensor->dtype), PrimValue::Int64(0), StringImm("global")}, - Attrs()), - "alloc")); + outs.push_back(builder_->Emit(Call(alloc_tensor_op, + {Downcast(field_tensor->shape.value()), + DataTypeImm(field_tensor->dtype), + PrimValue::Int64(dev_index), StringImm(scope)}, + Attrs(), {field_tensor}), + "alloc")); } else { outs.push_back(Downcast(call->args[1]) ->fields[inplace_attrs->inplace_indices[i].IntValue()]); diff --git a/src/relax/transform/lower_alloc_tensor.cc b/src/relax/transform/lower_alloc_tensor.cc index d1e61b1c5748..d63cd15744bd 100644 --- a/src/relax/transform/lower_alloc_tensor.cc +++ b/src/relax/transform/lower_alloc_tensor.cc @@ -24,11 +24,16 @@ #include #include +#include "utils.h" + namespace tvm { namespace relax { namespace { class Mutator : public ExprMutator { + public: + explicit Mutator(IRModule mod) : ctx_mod_(mod) {} + using ExprMutator::VisitExpr_; Expr VisitExpr_(const CallNode* op) override { static const Op& alloc_tensor_op = Op::Get("relax.builtin.alloc_tensor"); @@ -71,23 +76,56 @@ class Mutator : public ExprMutator { return nbytes; }(); + ShapeExpr size({nbytes}); + + int64_t vdevice_index = -1; + if (auto* prim_value_node = op->args[2].as()) { + vdevice_index = prim_value_node->value.as()->value; + } + ffi::Optional vdevice = GetGlobalVDevice(ctx_mod_, vdevice_index); + + if (vdevice.defined()) { + std::string dev_kind = vdevice.value()->target->kind->name; + PrimExpr dev_size = tir::make_const(DataType::Int(64), 1); + if (vdevice.value()->memory_scope != "global") { + auto device_size_handler = + tvm::ffi::Function::GetGlobal(std::string("DeviceGetMemSize.") + dev_kind); + if (device_size_handler.has_value()) { + dev_size *= + (*device_size_handler)(shape, dtype->value, vdevice.value()).cast(); + size = ShapeExpr({dev_size}); + } + auto device_scope_handler = + tvm::ffi::Function::GetGlobal(std::string("DeviceScopeCompatibility.") + dev_kind); + if (device_scope_handler.has_value()) { + ffi::String dev_scope = + (*device_scope_handler)(vdevice.value()->target, vdevice.value()->memory_scope) + .cast(); + storage_scope = StringImm(dev_scope); + } + } + } + auto offset = PrimValue::Int64(0); - Expr storage = - relax::Call(mem_alloc_storage_op, {ShapeExpr({nbytes}), runtime_device_index, - storage_scope, DataTypeImm(DataType::UInt(8))}); + Expr storage = relax::Call(mem_alloc_storage_op, {size, runtime_device_index, storage_scope, + DataTypeImm(DataType::UInt(8))}); storage = builder_->Emit(storage, "storage"); - Expr tensor = relax::Call(mem_alloc_tensor_op, {storage, offset, shape_arg, dtype}); + Expr tensor = + relax::Call(mem_alloc_tensor_op, {storage, offset, shape_arg, dtype, op->args[2]}); return tensor; } else { return ExprMutator::VisitExpr_(op); } } + + private: + IRModule ctx_mod_; }; } // namespace -Expr LowerAllocTensor(Expr expr) { - Mutator mutator; +Expr LowerAllocTensor(IRModule m, Expr expr) { + Mutator mutator(m); return mutator(expr); } @@ -95,7 +133,7 @@ namespace transform { Pass LowerAllocTensor() { auto pass_func = [=](Function func, IRModule m, PassContext pc) { - return Downcast(relax::LowerAllocTensor(std::move(func))); + return Downcast(relax::LowerAllocTensor(m, std::move(func))); }; return CreateFunctionPass(pass_func, /*opt_level=*/0, "LowerAllocTensor", {}); } diff --git a/src/relax/transform/specialize_primfunc_based_on_callsite.cc 
b/src/relax/transform/specialize_primfunc_based_on_callsite.cc index 6258e14b666d..bb2855b4b0bb 100644 --- a/src/relax/transform/specialize_primfunc_based_on_callsite.cc +++ b/src/relax/transform/specialize_primfunc_based_on_callsite.cc @@ -132,8 +132,9 @@ class SpecializeTIRCallArgs : ExprMutator { if (sinfo->vdevice.defined()) { scope = sinfo->vdevice.value()->memory_scope; } - const Buffer& buffer = - tir::decl_buffer(GetShapeFromTensorStructInfo(sinfo), sinfo->dtype, "ret_val", scope); + + const Buffer& buffer = tir::decl_buffer(GetShapeFromTensorStructInfo(sinfo), sinfo->dtype, + "ret_val_" + std::to_string(index), scope); param_map.Set(pfunc->params[args.size() + index], buffer); index++; } diff --git a/src/relax/transform/static_plan_block_memory.cc b/src/relax/transform/static_plan_block_memory.cc index fc3c2259ff9a..cebbaa4ce5ac 100644 --- a/src/relax/transform/static_plan_block_memory.cc +++ b/src/relax/transform/static_plan_block_memory.cc @@ -77,6 +77,9 @@ #include #include +#include "../../runtime/texture.h" +#include "utils.h" + namespace tvm { namespace relax { @@ -105,6 +108,8 @@ class StorageTokenNode : public Object { DataType dtype; /*! \brief The memory scope of the token. */ std::string storage_scope; + /*! \brief The VDevice information. */ + ffi::Optional vdevice; /*! \brief The storage id, reserved for debug and demo use. */ int storage_id{-1}; @@ -129,23 +134,51 @@ class StorageTokenNode : public Object { */ class StorageToken : public ObjectRef { public: - explicit StorageToken(ffi::Array shape, DataType dtype, std::string storage_scope) { + explicit StorageToken(ffi::Array shape, DataType dtype, std::string storage_scope, + ffi::Optional vdevice = std::nullopt) { // Compute the tensor size from the shape. int64_t const_coeff = dtype.bytes() * dtype.lanes(); PrimExpr size = tir::make_const(DataType::Int(64), 1); - for (const PrimExpr& dim_len : shape) { - if (const IntImmNode* const_dim_len = dim_len.as()) { - const_coeff *= const_dim_len->value; - } else { - size *= dim_len; + bool size_computed = false; + + if (vdevice.defined()) { + VDevice vdev = vdevice.value(); + std::string dev_kind = vdev->target->kind->name; + + if (vdev->memory_scope != "global") { + auto device_size_handler = + tvm::ffi::Function::GetGlobal(std::string("DeviceGetMemSize." + dev_kind)); + if (device_size_handler.has_value()) { + size *= (*device_size_handler)(shape, dtype, vdevice.value()).cast(); + size_computed = true; + } + auto device_scope_handler = + tvm::ffi::Function::GetGlobal(std::string("DeviceScopeCompatibility." + dev_kind)); + if (device_scope_handler.has_value()) { + ffi::String dev_scope = + (*device_scope_handler)(vdevice.value()->target, vdevice.value()->memory_scope) + .cast(); + storage_scope = dev_scope; + } + } + } + if (!size_computed) { + for (const PrimExpr& dim_len : shape) { + if (const IntImmNode* const_dim_len = dim_len.as()) { + const_coeff *= const_dim_len->value; + } else { + size *= dim_len; + } } } + size = tir::make_const(DataType::Int(64), const_coeff) * size; ObjectPtr n = ffi::make_object(); n->bytes = size; n->dtype = dtype; n->storage_scope = std::move(storage_scope); + n->vdevice = std::move(vdevice); data_ = std::move(n); } TVM_FFI_DEFINE_OBJECT_REF_METHODS_NOTNULLABLE(StorageToken, ObjectRef, StorageTokenNode); @@ -159,9 +192,9 @@ using Tokens = NestedMsg; * \note We can generalize this implementation to multi-dimensional memory * following the same flow in the future. 
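+ * \note Renamed to TokenAllocatorMixed below: tokens may now carry VDevice
+ *       information, in which case texture-backed storage sizes are computed
+ *       through the per-device "DeviceGetMemSize.<kind>" registry hook.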
*/ -class TokenAllocator1D { +class TokenAllocatorMixed { public: - explicit TokenAllocator1D(arith::Analyzer* analyzer) : analyzer_(analyzer) {} + explicit TokenAllocatorMixed(arith::Analyzer* analyzer) : analyzer_(analyzer) {} /*! * \brief Request a storage token from the available token pool for a @@ -606,7 +639,14 @@ class StorageAllocatorInit : public StorageAllocatorBaseVisitor { // Create and set token. StringImm storage_scope = Downcast(call->args[3]); - StorageToken token(upper_bounded_shape, sinfo->dtype, storage_scope->value); + + int64_t vdevice_index = -1; + if (auto* prim_value_node = call->args[2].as()) { + vdevice_index = prim_value_node->value.as()->value; + } + ffi::Optional vdevice = GetGlobalVDevice(ctx_mod_, vdevice_index); + + StorageToken token(upper_bounded_shape, sinfo->dtype, storage_scope->value, vdevice); Tokens tokens(token); SetTokens(call, tokens); @@ -823,7 +863,7 @@ class StorageAllocator : public StorageAllocatorBaseVisitor { /*! \brief Number of allocated storages. */ int n_storage_{0}; /*! \brief The 1D memory allocator. */ - TokenAllocator1D allocator_; + TokenAllocatorMixed allocator_; /*! \brief The mapping from each token to the tensors that are currently using it. */ std::unordered_map> token2cur_tensor_; }; @@ -905,7 +945,8 @@ class StorageAllocationRewriter : public ExprMutator { // And always create a `memory.alloc_tensor` for the old `builtin.alloc_tensor`. PrimValue offset = PrimValue::Int64(0); DataType dtype = sinfo->dtype; - return Call(mem_alloc_tensor, {storage_var, offset, sinfo->shape.value(), DataTypeImm(dtype)}, + return Call(mem_alloc_tensor, + {storage_var, offset, sinfo->shape.value(), DataTypeImm(dtype), call->args[2]}, Attrs()); } else if (plan_dynamic_output_ && call->op == alloc_tensor_op) { // Case 2. 
For a `alloc_tensor` that is not planned for memory reuse, @@ -936,7 +977,8 @@ class StorageAllocationRewriter : public ExprMutator { return Call(mem_alloc_tensor, {storage, // /*offset=*/PrimValue::Int64(0), /*shape=*/ffi::GetRef(shape), // - /*dtype=*/DataTypeImm(sinfo->dtype)}); + /*dtype=*/DataTypeImm(sinfo->dtype), + /*vdevice_index=*/call->args[2]}); } } @@ -990,6 +1032,32 @@ TVM_FFI_STATIC_INIT_BLOCK() { refl::GlobalDef().def("relax.transform.StaticPlanBlockMemory", StaticPlanBlockMemory); } +PrimExpr GetTextureMemorySizeFromVDevice(ffi::Array pshape, DataType dtype, + VDevice vdevice) { + int image_row_align = vdevice->target->GetAttr("image_base_address_alignment") + .value_or(Integer(64)) + ->value; + + struct Shape { + const ffi::Array& shape; + int64_t operator[](size_t i) const { + ICHECK(tir::as_const_int(shape[i])) << "Dymamic shapes not suported over texture now"; + return *tir::as_const_int(shape[i]); + } + int size() { return this->shape.size(); } + }; + auto shape = Shape{pshape}; + + size_t size = runtime::GetTextureMemorySize(shape, dtype.bytes() * 8, dtype.lanes(), + vdevice->memory_scope, image_row_align); + return tir::make_const(DataType::Int(64), size); +} + +TVM_FFI_STATIC_INIT_BLOCK() { + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("DeviceGetMemSize.opencl", GetTextureMemorySizeFromVDevice); +} + } // namespace transform } // namespace relax } // namespace tvm diff --git a/src/relax/transform/utils.h b/src/relax/transform/utils.h index 91d75079f73d..2cce918f4ef7 100644 --- a/src/relax/transform/utils.h +++ b/src/relax/transform/utils.h @@ -33,6 +33,7 @@ #include #include #include +#include #include #include "../../support/array.h" @@ -383,6 +384,19 @@ inline ffi::String GetCodegenName(const std::string& composite_name) { return composite_name.substr(0, delim_pos); } +inline int GetDeviceIndexByScope(const IRModule& mod, const ffi::String& scope) { + if (mod->global_infos.find("vdevice") == mod->global_infos.end()) { + return 0; + } + ffi::Array vdevices = mod->global_infos["vdevice"]; + for (int i = 0; i < static_cast(vdevices.size()); ++i) { + if (scope == vdevices[i].as().value()->memory_scope) { + return i; + } + } + return 0; +} + inline int GetDeviceIndex(const IRModule& mod, const VDevice& vdevice) { ffi::Array vdevices = mod->global_infos["vdevice"]; for (int i = 0; i < static_cast(vdevices.size()); ++i) { @@ -394,6 +408,17 @@ inline int GetDeviceIndex(const IRModule& mod, const VDevice& vdevice) { return -1; } +inline ffi::Optional GetGlobalVDevice(const IRModule& mod, const int index) { + ffi::Optional ret; + if (mod->global_infos.find("vdevice") != mod->global_infos.end()) { + ffi::Array vdevices = mod->global_infos["vdevice"]; + if (index < static_cast(vdevices.size())) { + ret = vdevices[index].as(); + } + } + return ret; +} + /* \brief Eliminate common subexpressions * * Utility for simplifying relax expressions by removing common diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index 933cd0b7a7cf..99d5f2be2a9d 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -187,6 +187,8 @@ inline const char* CLGetErrorString(cl_int error) { inline cl_channel_type DTypeToOpenCLChannelType(DLDataType data_type) { DataType dtype(data_type); + dtype = dtype.with_lanes(1); + if (dtype == DataType::Float(32)) { return CL_FLOAT; } else if (dtype == DataType::Float(16)) { @@ -343,7 +345,6 @@ class OpenCLWorkspace : public DeviceAPI { void* AllocDataSpaceView(Device dev, 
void* data, ffi::Shape shape, DLDataType dtype, ffi::Optional mem_scope = std::nullopt); void FreeDataSpaceView(Device dev, void* ptr); - cl_device_id GetCLDeviceID(int device_id); // override device API void SetDevice(Device dev) final; @@ -351,7 +352,7 @@ class OpenCLWorkspace : public DeviceAPI { void* AllocDataSpace(Device dev, size_t size, size_t alignment, DLDataType type_hint) final; void* AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, ffi::Optional mem_scope = std::nullopt) final; - void* AllocDataSpace(Device dev, size_t width, size_t height, DLDataType type_hint, + void* AllocDataSpace(Device dev, size_t width, size_t height, size_t depth, DLDataType type_hint, ffi::Optional mem_scope = std::nullopt); void* GetNativePtr(const tvm::runtime::Tensor& narr); void SetNativePtr(const tvm::runtime::Tensor& narr, void* host_ptr, size_t buf_size); @@ -365,8 +366,8 @@ class OpenCLWorkspace : public DeviceAPI { // cl_mem alloc utils void* AllocCLBuffer(Device dev, size_t size, size_t alignment, DLDataType type_hint); - void* AllocCLImage(Device dev, void* back_buffer, size_t width, size_t height, size_t row_pitch, - DLDataType type_hint, ffi::Optional mem_scope); + void* AllocCLImage(Device dev, void* back_buffer, size_t width, size_t height, size_t depth, + size_t row_pitch, DLDataType type_hint, ffi::Optional mem_scope); /*! * \brief Get the thread local ThreadEntry diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index 8b6fba24988e..4a022b07b30c 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -71,7 +71,7 @@ ImageInfo GetImageInfo(const cl::BufferDescriptor* desc, const DLTensor* tensor) auto texture_shape = ApplyTexture2DFlattening(tensor->shape, tensor->ndim, axis); info.region[0] = texture_shape.width; info.region[1] = texture_shape.height; - info.region[2] = 1; + info.region[2] = texture_shape.depth; return info; } @@ -260,13 +260,13 @@ void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t size, size_t alignment, return AllocCLBuffer(dev, size, alignment, type_hint); } -void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t width, size_t height, DLDataType type_hint, - ffi::Optional mem_scope) { +void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t width, size_t height, size_t depth, + DLDataType type_hint, ffi::Optional mem_scope) { // Texture allocation given width and height cl_uint row_align = GetImageAlignment(dev.device_id); size_t pixel_size = (type_hint.bits * type_hint.lanes + 7) / 8; size_t row_pitch = ALIGN_UP(width * pixel_size * 4, row_align); // CL_RGBA = 4 - size_t mem_size = row_pitch * height; + size_t mem_size = row_pitch * height * depth; // Alloc back buffer from pool cl::BufferDescriptor* back_buffer = nullptr; @@ -280,7 +280,7 @@ void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t width, size_t height, D if (!mem_scope.has_value()) { mem_scope = ffi::String("global.texture"); } - return AllocCLImage(dev, back_buffer, width, height, row_pitch, type_hint, mem_scope); + return AllocCLImage(dev, back_buffer, width, height, depth, row_pitch, type_hint, mem_scope); } void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, @@ -298,7 +298,7 @@ void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* shape size_t axis = DefaultTextureLayoutSeparator(ndim, mem_scope.value()); auto texture = ApplyTexture2DFlattening(shape, ndim, axis); - return AllocDataSpace(dev, 
texture.width, texture.height, dtype, mem_scope); + return AllocDataSpace(dev, texture.width, texture.height, texture.depth, dtype, mem_scope); } void* OpenCLWorkspace::AllocCLBuffer(Device dev, size_t size, size_t alignment, @@ -320,7 +320,7 @@ void* OpenCLWorkspace::AllocCLBuffer(Device dev, size_t size, size_t alignment, } void* OpenCLWorkspace::AllocCLImage(Device dev, void* back_buffer, size_t width, size_t height, - size_t row_pitch, DLDataType type_hint, + size_t depth, size_t row_pitch, DLDataType type_hint, ffi::Optional mem_scope) { this->Init(); ICHECK(std::string(mem_scope.value()).find("texture") != std::string::npos) @@ -331,7 +331,7 @@ void* OpenCLWorkspace::AllocCLImage(Device dev, void* back_buffer, size_t width, cl_int err_code; cl_channel_type cl_type = DTypeToOpenCLChannelType(type_hint); cl_image_format format = {CL_RGBA, cl_type}; - cl_image_desc descriptor = {CL_MEM_OBJECT_IMAGE2D, width, height, 0, 0, 0, 0, 0, 0}; + cl_image_desc descriptor = {CL_MEM_OBJECT_IMAGE2D_ARRAY, width, height, 0, depth, 0, 0, 0, 0}; if (IsBufferToImageSupported(dev.device_id)) { descriptor.image_row_pitch = row_pitch; @@ -344,7 +344,6 @@ void* OpenCLWorkspace::AllocCLImage(Device dev, void* back_buffer, size_t width, cl::BufferDescriptor* desc = new cl::BufferDescriptor(mem_scope); desc->buffer = mptr; desc->back_buffer = back_desc; - return desc; } @@ -383,7 +382,7 @@ void* OpenCLWorkspace::AllocDataSpaceView(Device dev, void* data, ffi::Shape sha size_t row_pitch = ALIGN_UP(texture.width * pixel_size * 4, row_align); // CL_RGBA = 4 ret_desc = static_cast(OpenCLWorkspace::Global()->AllocCLImage( - dev, nullptr, texture.width, texture.height, row_pitch, dtype, mem_scope)); + dev, nullptr, texture.width, texture.height, texture.depth, row_pitch, dtype, mem_scope)); ret_desc->is_compat_view = true; } return ret_desc; @@ -414,7 +413,7 @@ void* OpenCLWorkspace::AllocDataSpaceView(Device dev, void* data, ffi::Shape sha } return (cl::BufferDescriptor*)AllocCLImage(dev, back_buffer, texture.width, texture.height, - row_pitch, dtype, mem_scope); + texture.depth, row_pitch, dtype, mem_scope); } void OpenCLWorkspace::FreeDataSpaceView(Device dev, void* ptr) { @@ -768,28 +767,27 @@ TVM_FFI_STATIC_INIT_BLOCK() { [](ffi::PackedArgs args, ffi::Any* rv) { int32_t device_type = args[0].cast(); int32_t device_id = args[1].cast(); - int32_t dtype_code_hint = args[2].cast(); - int32_t dtype_bits_hint = args[3].cast(); auto scope = args[4].cast(); CHECK(scope.find("texture") != std::string::npos); int64_t ndim = args[5].cast(); - CHECK_EQ(ndim, 2); + CHECK_EQ(ndim, 3); int64_t* shape = static_cast(args[6].cast()); int64_t width = shape[0]; int64_t height = shape[1]; - + int64_t depth = shape[2]; + int64_t channel_size = args[7].cast(); + DataType channel_type = GetChannelType(channel_size); Device dev; dev.device_type = static_cast(device_type); dev.device_id = device_id; - DLDataType type_hint; - type_hint.code = static_cast(dtype_code_hint); - type_hint.bits = static_cast(dtype_bits_hint); - type_hint.lanes = 1; + type_hint.code = channel_type.code(); + type_hint.bits = channel_type.bits(); + type_hint.lanes = channel_type.lanes(); *rv = OpenCLWorkspace::Global()->AllocDataSpace( - dev, static_cast(width), static_cast(height), type_hint, - ffi::String("global.texture")); + dev, static_cast(width), static_cast(height), + static_cast(depth), type_hint, ffi::String("global.texture")); }) .def_packed("device_api.opencl.free_nd", [](ffi::PackedArgs args, ffi::Any* rv) { @@ -857,7 +855,7 @@ class 
OpenCLPooledAllocator final : public memory::PooledAllocator { Buffer Alloc(Device dev, ffi::Shape shape, DLDataType type_hint, const std::string& mem_scope) override { if (AllowMemoryScope(mem_scope)) { - size_t size = ffi::GetDataSize(shape.Product(), type_hint); + size_t size = GetMemObjectSize(dev, shape.size(), shape.data(), type_hint); Buffer buf; buf.device = dev; buf.size = size; diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc index 3f9dadbb3af1..980c6bb3994c 100644 --- a/src/runtime/opencl/opencl_module.cc +++ b/src/runtime/opencl/opencl_module.cc @@ -50,6 +50,7 @@ class OpenCLWrappedFunc { arg_size_ = arg_size; launch_param_config_.Init(arg_size.size(), launch_param_tags); } + // invoke the function with void arguments void operator()(ffi::PackedArgs args, ffi::Any* rv, void** void_args) const { ICHECK(w_->devices.size() > 0) << "No OpenCL device"; @@ -63,6 +64,8 @@ class OpenCLWrappedFunc { if (kernel == nullptr || e.version != entry_.version) { kernel = m_->InstallKernel(w_, t, func_name_, entry_); } + ThreadWorkLoad wl = launch_param_config_.Extract(args); + cl_uint work_dim = static_cast(launch_param_config_.work_dim()); // setup arguments. for (cl_uint i = 0; i < arg_size_.size(); ++i) { void* arg = nullptr; @@ -74,13 +77,10 @@ class OpenCLWrappedFunc { OPENCL_CALL(clSetKernelArg(kernel, i, arg_size_[i], arg)); } cl_command_queue queue = w_->GetQueue(t->device); - ThreadWorkLoad wl = launch_param_config_.Extract(args); - cl_uint work_dim = static_cast(launch_param_config_.work_dim()); for (cl_uint i = 0; i < work_dim; ++i) { wl.work_size[i] *= wl.work_size[i + 3]; } // launch kernel - if (w_->IsProfiling(t->device)) { w_->GetEventQueue(t->device).resize(w_->GetEventQueue(t->device).size() + 1); OPENCL_CALL(clEnqueueNDRangeKernel(queue, kernel, work_dim, nullptr, wl.work_size, diff --git a/src/runtime/texture.h b/src/runtime/texture.h index f3a827aa8792..e2b6d603ed50 100644 --- a/src/runtime/texture.h +++ b/src/runtime/texture.h @@ -40,6 +40,7 @@ template struct Texture2DShape { T width; T height; + T depth; T channel; }; @@ -60,11 +61,7 @@ inline size_t DefaultTextureLayoutSeparator(size_t shape_rank, } else if (convention == "global.texture-weight") { separator = 1; } else if (convention == "global.texture-nhwc") { - if (shape_rank == 3) { - separator = 1; - } else { - separator = 2; - } + separator = 2; } else { LOG(FATAL) << "Encountered unknown texture lowering convention: " << convention; } @@ -81,9 +78,11 @@ template Texture2DShape ApplyTexture2DFlattening(const S& shape, size_t rank, size_t axis) { ICHECK(axis < rank) << "Number of axes to flatten into rows must be less than shape rank for 2d flattening"; - Texture2DShape texture{1, 1, shape[rank - 1]}; + Texture2DShape texture{1, 1, 1, shape[rank - 1]}; for (size_t i = 0; i < rank - 1; i++) { - if (i < axis) { + if (i < (axis - 1)) { + texture.depth *= shape[i]; + } else if (i < axis) { texture.height *= shape[i]; } else { texture.width *= shape[i]; @@ -114,7 +113,23 @@ size_t GetTextureMemorySize(T shape, int bits, int lanes, std::string mem_scope, auto pack_size = shape[shape.size() - 1]; auto pixel_size = (bits * lanes + 7) / 8; size_t row_pitch = ALIGN_UP(tshape.width * pixel_size * pack_size, image_row_align); - return row_pitch * tshape.height; + return row_pitch * tshape.height * tshape.depth; +} + +/*! + * \brief Returns the standard channel datatype for any given type. 
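+ * \note Only 128-bit (float32x4) and 64-bit (float16x4) RGBA channels are
+ *       handled; any other channel size is a fatal error.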
+ * \param channel_size The Number of bits in a Channel + * \return DataType to be used in the codegen. + */ +inline DataType GetChannelType(size_t channel_size) { + DataType channel_type; + + if (channel_size == 128) + return DataType::Float(32, 4); + else if (channel_size == 64) + return DataType::Float(16, 4); + + LOG(FATAL) << "Unsupported Channel Size: " << channel_size; } } // namespace runtime diff --git a/src/runtime/vm/builtin.cc b/src/runtime/vm/builtin.cc index 1bd3084c210b..69a4c74169a4 100644 --- a/src/runtime/vm/builtin.cc +++ b/src/runtime/vm/builtin.cc @@ -390,7 +390,18 @@ TVM_FFI_STATIC_INIT_BLOCK() { namespace refl = tvm::ffi::reflection; refl::GlobalDef() .def("vm.builtin.alloc_storage", VMAllocStorage) - .def_method("vm.builtin.alloc_tensor", &StorageObj::AllocTensor); + .def_packed("vm.builtin.alloc_tensor", [](ffi::PackedArgs args, ffi::Any* rv) { + Storage sobj = args[0].cast(); + int64_t offset = args[1].cast(); + ffi::Shape shape = args[2].cast(); + DataType dtype = args[3].cast(); + if (args.size() == 5) { + ffi::String scope = args[4].cast(); + *rv = sobj->AllocTensorScoped(offset, shape, dtype, scope); + } else { + *rv = sobj->AllocTensor(offset, shape, dtype); + } + }); } //------------------------------------------------- @@ -516,9 +527,16 @@ TVM_FFI_STATIC_INIT_BLOCK() { "vm.builtin.reshape", [](Tensor data, ffi::Shape new_shape) { return data.CreateView(new_shape, data->dtype); }) .def("vm.builtin.null_value", []() -> std::nullptr_t { return nullptr; }) - .def("vm.builtin.to_device", [](Tensor data, int dev_type, int dev_id) { + .def_packed("vm.builtin.to_device", [](ffi::PackedArgs args, ffi::Any* rv) { + Tensor data = args[0].cast(); + int dev_type = args[1].cast(); + int dev_id = args[2].cast(); Device dst_device = {(DLDeviceType)dev_type, dev_id}; - return data.CopyTo(dst_device); + ffi::String mem_scope = "global"; + if (args.size() == 4) { + mem_scope = args[3].cast(); + } + *rv = data.CopyTo(dst_device, mem_scope); }); } diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc index 40edbc14c433..efa05fd38376 100644 --- a/src/runtime/vm/executable.cc +++ b/src/runtime/vm/executable.cc @@ -38,6 +38,7 @@ namespace vm { /*! \brief The magic number for the serialized VM bytecode file */ constexpr uint64_t kTVMVMBytecodeMagic = 0xD225DE2F4214151D; +constexpr uint64_t kTVMVMBytecodeMagicV2 = 0xD225DE2F4214151E; #define STREAM_CHECK(val, section) \ ICHECK(val) << "Invalid VM file format in the " << section << " section." \ @@ -143,22 +144,24 @@ Instruction VMExecutable::GetInstruction(Index i) const { } void SaveHeader(dmlc::Stream* strm) { - uint64_t header = kTVMVMBytecodeMagic; + uint64_t header = kTVMVMBytecodeMagicV2; strm->Write(header); std::string version = VM_VERSION; strm->Write(version); } -void LoadHeader(dmlc::Stream* strm) { +uint64_t LoadHeader(dmlc::Stream* strm) { // Check header. uint64_t header; STREAM_CHECK(strm->Read(&header), "header"); - STREAM_CHECK(header == kTVMVMBytecodeMagic, "header"); + STREAM_CHECK((header == kTVMVMBytecodeMagic) || (header == kTVMVMBytecodeMagicV2), "header"); // Check version. std::string version; STREAM_CHECK(strm->Read(&version), "version"); STREAM_CHECK(version == VM_VERSION, "version"); + + return header; } ffi::Bytes VMExecutable::SaveToBytes() const { @@ -172,6 +175,9 @@ ffi::Bytes VMExecutable::SaveToBytes() const { // Global section. SaveGlobalSection(&strm); + // Memory Scopes + SaveMemoryScopeSection(&strm); + // Constant section. 
SaveConstantSection(&strm); @@ -192,11 +198,16 @@ ffi::Module VMExecutable::LoadFromBytes(const ffi::Bytes& bytes) { ObjectPtr exec = ffi::make_object(); // Load header. - LoadHeader(&strm); + uint64_t header_magic = LoadHeader(&strm); // Global section. exec->LoadGlobalSection(&strm); + if (kTVMVMBytecodeMagicV2 == header_magic) { + // Memory Scopes + exec->LoadMemoryScopeSection(&strm); + } + // Constant section. exec->LoadConstantSection(&strm); @@ -245,6 +256,15 @@ bool VMFuncInfo::Load(dmlc::Stream* strm) { void VMExecutable::SaveGlobalSection(dmlc::Stream* strm) const { strm->Write(func_table); } +void VMExecutable::SaveMemoryScopeSection(dmlc::Stream* strm) const { + strm->Write(static_cast(this->memory_scopes.size())); + for (auto it = this->memory_scopes.begin(); it != this->memory_scopes.end(); it++) { + LOG(WARNING) << "Scope Saving:" << it->second; + strm->Write(it->first); + strm->Write(it->second); + } +} + void VMExecutable::SaveConstantSection(dmlc::Stream* strm) const { strm->Write(static_cast(this->constants.size())); for (const auto& it : this->constants) { @@ -293,6 +313,23 @@ void VMExecutable::LoadGlobalSection(dmlc::Stream* strm) { } } +void VMExecutable::LoadMemoryScopeSection(dmlc::Stream* strm) { + uint64_t sz; + // Load the number of memory scope entries. + STREAM_CHECK(strm->Read(&sz, sizeof(sz)), "memory scopes"); + + size_t size = static_cast(sz); + // Load each of the scopes. + for (size_t i = 0; i < size; i++) { + Index const_idx; + std::string scope; + STREAM_CHECK(strm->Read(&const_idx), "memory scopes"); + STREAM_CHECK(strm->Read(&scope), "memory scopes"); + LOG(WARNING) << "Scope Loaded:" << scope; + this->memory_scopes[const_idx] = scope; + } +} + void VMExecutable::LoadConstantSection(dmlc::Stream* strm) { uint64_t sz; // Load the number of constants. diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc index be981b205cbb..cd5475bf1ad5 100644 --- a/src/runtime/vm/vm.cc +++ b/src/runtime/vm/vm.cc @@ -84,19 +84,21 @@ ffi::Any IndexIntoNestedObject(ffi::Any obj, ffi::PackedArgs args, int starting_ return obj; } -Tensor ConvertTensorToDevice(Tensor src, const DLDevice& dev, Allocator* alloc) { +Tensor ConvertTensorToDevice(Tensor src, const DLDevice& dev, Allocator* alloc, + ffi::String scope = "global") { if (src->device.device_type == dev.device_type && src->device.device_id == dev.device_id) { return src; } else { - auto res = alloc->Empty(src.Shape(), src->dtype, dev); + auto res = alloc->Empty(src.Shape(), src->dtype, dev, scope); res.CopyFrom(src); return res; } } -Any ConvertObjectToDevice(Any src, const Device& dev, Allocator* alloc) { +Any ConvertObjectToDevice(Any src, const Device& dev, Allocator* alloc, + ffi::String scope = "global") { if (src.as()) { - return ConvertTensorToDevice(src.cast(), dev, alloc); + return ConvertTensorToDevice(src.cast(), dev, alloc, scope); } else if (src.as()) { std::vector ret; auto arr = src.cast>(); @@ -129,10 +131,11 @@ ffi::Any ConvertArgToDevice(ffi::AnyView input, Device dev, Allocator* alloc) { return ret; } -ffi::Any ConvertRegToDevice(ffi::Any input, Device dev, Allocator* alloc) { +ffi::Any ConvertRegToDevice(ffi::Any input, Device dev, Allocator* alloc, + ffi::String scope = "global") { ffi::Any ret; if (auto opt_obj = input.as()) { - ret = ConvertObjectToDevice(opt_obj.value(), dev, alloc); + ret = ConvertObjectToDevice(opt_obj.value(), dev, alloc, scope); } else { ret = input; } @@ -469,11 +472,12 @@ void VirtualMachineImpl::Init(const std::vector& devices, } // Setup constant sections. 
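+  // Honor any per-constant memory scope recorded by the exec builder when
+  // copying constants to the device.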
this->const_pool_.reserve(exec_->constants.size()); - for (const auto& constant : exec_->constants) { - if (auto opt_nd = constant.as()) { - this->const_pool_.push_back(ConvertRegToDevice(opt_nd.value(), devices[0], allocators[0])); + for (size_t i = 0; i < exec_->constants.size(); ++i) { + if (auto opt_nd = exec_->constants[i].as()) { + this->const_pool_.push_back( + ConvertRegToDevice(opt_nd.value(), devices[0], allocators[0], exec_->memory_scopes[i])); } else { - this->const_pool_.push_back(constant); + this->const_pool_.push_back(exec_->constants[i]); } } // Setup function sections. diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 8ea55b8ff5d8..b3926e106f6f 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -240,11 +240,6 @@ void CodeGenOpenCL::PrintType(DataType t, std::ostream& os) { // NOLINT(*) if (t.is_uint()) { os << 'u'; } - if (t.bits() == 8 && t.lanes() == 4) { - // directly 4 8 bit int in integer. - os << "int"; - return; - } switch (t.bits()) { case 8: os << "char"; @@ -279,7 +274,7 @@ void CodeGenOpenCL::PrintType(const Type& type, std::ostream& os) { // NOLINT(* return PrintType(ptr->dtype, os); } else if (auto* ptr = type.as()) { if (runtime::IsTextureStorage(std::string(ptr->storage_scope))) { - os << "image2d_t"; + os << "image2d_array_t"; } else { PrintType(ptr->element_type, os); os << '*'; @@ -428,47 +423,85 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { ICHECK(ptr_type != nullptr) << "Texture Var's must be of PointerType"; ICHECK(runtime::IsTextureStorage(std::string(ptr_type->storage_scope))) << "builtin::texture2d_store() only supports storing to texture buffers"; + const int channel_size = Downcast(op->args[4])->value; + ICHECK(channel_size == 64 || channel_size == 128) + << "Unsupported Channel Size: " << channel_size; + DataType channel_type = runtime::GetChannelType(channel_size); + DataType buffer_type = ptr_type->element_type.as()->dtype; - if (buffer_type.is_float16()) { + std::stringstream ss; + this->PrintExpr(op->args[5], ss); + std::string value; + value = this->SSAGetID(ss.str(), buffer_type.with_lanes(channel_size / buffer_type.bits())); + if (channel_size == 64) { os << "write_imageh("; - } else if (buffer_type.is_float()) { + } else if (channel_size == 128) { os << "write_imagef("; } else { - LOG(FATAL) << "Unsupported type: " << buffer_type - << ", currently only float and half are supported for image2d OpenCL codegen."; + LOG(FATAL) << "Unsupported Channel Size: " << channel_size; } this->PrintExpr(op->args[0], os); os << ", "; - os << "(int2)("; + os << "(int4)("; this->PrintExpr(op->args[1], os); os << ", "; this->PrintExpr(op->args[2], os); - os << "), "; + os << ", "; this->PrintExpr(op->args[3], os); + os << ", "; + this->PrintExpr(make_const(DataType::Int(32), 0), os); + os << "), "; + os << "as_"; + this->PrintType(channel_type, os); + os << "(" << value << ")"; os << ")"; } else if (op->op.same_as(builtin::texture2d_load())) { enable_compliant_texture_reads_ = true; std::stringstream ss; - if (op->dtype.is_float16()) { + const int channel_size = Downcast(op->args[4])->value; + const int data_lanes = channel_size / op->dtype.bits(); + ICHECK(channel_size == 64 || channel_size == 128) + << "Unsupported Channel Size: " << channel_size; + ss << "as_"; + this->PrintType(op->dtype.with_lanes(data_lanes), ss); + ss << "("; + if (channel_size == 64) { ss << "READ_IMAGEH("; - } else if (op->dtype.is_float()) { + } else if (channel_size 
== 128) { ss << "READ_IMAGEF("; } else { - LOG(FATAL) << "Unsupported type: " << op->dtype - << ", currently only float and half are supported for image2d OpenCL codegen."; + LOG(FATAL) << "Unsupported Channel Size: " << channel_size; } this->PrintExpr(op->args[0], ss); ss << ", "; ss << "image_sampler, "; - ss << "((int2)("; + ss << "((int4)("; this->PrintExpr(op->args[1], ss); ss << ", "; this->PrintExpr(op->args[2], ss); - ss << ")))"; - - std::string rhs = SSAGetID(ss.str(), op->dtype.with_lanes(4)); - if (op->args.back().as()) { - os << rhs; + ss << ", "; + this->PrintExpr(op->args[3], ss); + ss << ", "; + this->PrintExpr(make_const(DataType::Int(32), 0), ss); + ss << "))))"; + + std::string rhs = SSAGetID(ss.str(), op->dtype.with_lanes(data_lanes)); + if (auto ramp = op->args.back().as()) { + if (ramp->base.as() && *tir::as_const_int(ramp->base) == 0 && + *tir::as_const_int(ramp->lanes) == data_lanes && *tir::as_const_int(ramp->stride) == 1) { + os << rhs; + } else if (*tir::as_const_int(ramp->stride) == 1) { + os << "(*("; + this->PrintType(op->dtype.with_lanes(*tir::as_const_int(ramp->lanes)), os); + os << "*)"; + os << "(("; + this->PrintType(op->dtype.with_lanes(1), os); + os << "*)&" << rhs << " + "; + this->PrintExpr(ramp->base, os); + os << "))"; + } else { + LOG(FATAL) << "Unsupported Texture Load Args"; + } } else { os << "(("; this->PrintType(op->dtype.with_lanes(1), os); diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index d44173a2ae3c..96e90f17ac79 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -362,6 +362,7 @@ TVM_REGISTER_TARGET_KIND("opencl", kDLOpenCL) .add_attr_option("max_num_threads", 256) .add_attr_option("thread_warp_size", 1) .add_attr_option("texture_spatial_limit", 16384) + .add_attr_option("texture_depth_limit", 2048) // Faced that Qualcomm OpenCL runtime crashed without any error message in // the case when the number of kernel arguments was pretty big. OpenCL doesn't // specify any limitations on the number of kernel arguments. max_function_args diff --git a/src/tir/backend/adreno/inject_texture_alloc.cc b/src/tir/backend/adreno/inject_texture_alloc.cc new file mode 100644 index 000000000000..4799b7ce2643 --- /dev/null +++ b/src/tir/backend/adreno/inject_texture_alloc.cc @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file inject_texture_alloc.cc + */ + +#include +#include +#include +#include + +#include "../../../arith/ir_mutator_with_analyzer.h" +#include "../../../runtime/texture.h" +#include "../../transforms/ir_utils.h" + +namespace tvm { +namespace tir { +namespace backend { +namespace adreno { +using runtime::ApplyTexture2DFlattening; +using runtime::DefaultTextureLayoutSeparator; +using runtime::IsTextureStorage; + +/*! + * \brief Inject the Texture Alloc intrinsic right after an AllocateNode is realized. + */ +class TextureAllocInjector : public arith::IRMutatorWithAnalyzer { + public: + static PrimFunc Inject(PrimFunc func) { + arith::Analyzer ana; + auto pass = TextureAllocInjector(&ana); + auto writer = func.CopyOnWrite(); + pass.MarkBufferMapShapes(func); + writer->body = pass.VisitStmt(func->body); + return func; + } + + private: + using IRMutatorWithAnalyzer::VisitExpr; + using IRMutatorWithAnalyzer::VisitExpr_; + using IRMutatorWithAnalyzer::VisitStmt; + using IRMutatorWithAnalyzer::VisitStmt_; + + explicit TextureAllocInjector(arith::Analyzer* ana) : IRMutatorWithAnalyzer(ana) {} + + Stmt VisitStmt_(const AllocateNode* op) final { + Stmt stmt = StmtExprMutator::VisitStmt_(op); + std::string storage_scope = GetStorageScope(op->buffer_var); + if (IsTextureStorage(storage_scope)) { + op = stmt.as(); + ICHECK(op->extents.size() >= 3) << "Only 2D Array RGBA texture is currently supported"; + const int data_bits = op->dtype.bits(), + vec_length = static_cast(op->extents.back().as()->value); + const int channel_size = data_bits * vec_length; + ICHECK(channel_size == 128 || channel_size == 64) + << "Invalid Channel Size: " << channel_size << " bits"; + + size_t axis = DefaultTextureLayoutSeparator(op->extents.size(), storage_scope); + auto texture = ApplyTexture2DFlattening(op->extents, op->extents.size(), axis); + ffi::Array args; + args.push_back(StringImm(storage_scope)); + args.push_back(IntImm(DataType::Int(64), 3)); // 2d Array + args.push_back(Call(DataType::Handle(), builtin::tvm_stack_make_shape(), + {texture.width, texture.height, texture.depth})); + args.push_back(IntImm(DataType::Int(64), channel_size)); + stmt = + LetStmt(op->buffer_var, + Call(op->buffer_var.dtype(), builtin::nd_mem_alloc_with_scope(), args), op->body); + } + return stmt; + } + + protected: + std::string GetStorageScope(const Var& buffer_var) { + auto* ptr = buffer_var->type_annotation.as(); + ICHECK(ptr) << "Buffer Var's type annotation must be of PointerType"; + return ptr->storage_scope; + } +}; + +namespace transform { + +Pass InjectTextureAlloc() { + auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) { + return TextureAllocInjector::Inject(std::move(f)); + }; + return CreatePrimFuncPass(pass_func, 0, "tir.backend.adreno.InjectTextureAlloc", {}); +} + +TVM_FFI_STATIC_INIT_BLOCK() { + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tir.backend.adreno.transform.InjectTextureAlloc", InjectTextureAlloc); +} + +} // namespace transform + +} // namespace adreno +} // namespace backend +} // namespace tir +} // namespace tvm diff --git a/src/tir/backend/adreno/texture_flatten.cc b/src/tir/backend/adreno/texture_flatten.cc new file mode 100644 index 000000000000..5e8a922c30ca --- /dev/null +++ b/src/tir/backend/adreno/texture_flatten.cc @@ -0,0 +1,232 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file texture_flatten.cc + * \brief Flattens texture storage from multi-dimensional array + * to 2D (width, height, depth) array access + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "../../../arith/ir_visitor_with_analyzer.h" +#include "../../../runtime/texture.h" +#include "../../../runtime/thread_storage_scope.h" + +namespace tvm { +namespace tir { +namespace backend { +namespace adreno { +using arith::IRVisitorWithAnalyzer; +using runtime::ApplyTexture2DFlattening; +using runtime::DefaultTextureLayoutSeparator; +using runtime::IsTextureStorage; + +class TextureLoweringBase : public StmtExprMutator { + public: + explicit TextureLoweringBase(const ffi::Map& extern_buffer_map, + IRVisitorWithAnalyzer* bound_analyzer) + : bound_analyzer_{bound_analyzer} { + for (auto kv : extern_buffer_map) { + extern_buf_.insert(kv.second); + } + } + + inline PrimExpr SimplifyOffset(const ffi::Array& shape, + const ffi::Array& index) const { + PrimExpr base = make_const(DataType::Int(32), 0); + ICHECK_EQ(shape.size(), index.size()); + if (index.size() > 0) { + PrimExpr offset = index[0]; + for (size_t i = 1; i < index.size(); ++i) { + offset = bound_analyzer_->Simplify(offset * shape[i] + index[i]); + } + base = base + offset; + } + return base; + } + + protected: + std::string GetStorageScope(const Buffer& buffer) { + auto* ptr = buffer->data->type_annotation.as(); + ICHECK(ptr) << "Buffer Var's type annotation must be of PointerType"; + return ptr->storage_scope; + } + + // Set of all external input and output buffers + std::unordered_set extern_buf_; + // Bound analzer + IRVisitorWithAnalyzer* bound_analyzer_; +}; + +// Lower Nd storage access to 2d texture access using lowering convention +// specified by the buffers storage scope. 
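As an aside on the convention the class below implements: every buffer dimension except the last is folded into one of three texture coordinates, and the last dimension becomes the RGBA channel vector (its lane count times the element bit width must yield a 64- or 128-bit channel, per the checks in this patch). The following Python sketch only illustrates that grouping; the shape, indices, and axis value are hypothetical, and in the pass the axis comes from DefaultTextureLayoutSeparator.

# Illustration only: mirrors how GetTextureAccessArgs (below) splits an N-d
# access into flattened (row, col, depth) coordinates plus a channel index.
def flatten_offset(dims, indices):
    # Row-major flattening, like SimplifyOffset in TextureLoweringBase above.
    if not indices:
        return 0
    offset = indices[0]
    for extent, idx in zip(dims[1:], indices[1:]):
        offset = offset * extent + idx
    return offset

def texture_coords(shape, indices, axis):
    # dims before (axis - 1) -> depth, dim (axis - 1) -> col,
    # dims from axis up to the second-to-last -> row, last dim -> channel.
    depth = flatten_offset(shape[: axis - 1], indices[: axis - 1])
    col = flatten_offset(shape[axis - 1 : axis], indices[axis - 1 : axis])
    row = flatten_offset(shape[axis:-1], indices[axis:-1])
    return row, col, depth, indices[-1]

# Hypothetical 5-d NCHW4c-style access with axis = 2:
# N -> depth, C -> col, H and W flatten into row, the trailing 4 lanes -> channel.
print(texture_coords((1, 16, 28, 28, 4), (0, 5, 7, 9, 2), axis=2))  # (205, 5, 0, 2)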
+class TextureFlattener : public TextureLoweringBase { + public: + using StmtExprMutator::VisitStmt_; + explicit TextureFlattener(const ffi::Map& extern_buffer_map, + IRVisitorWithAnalyzer* bound_analyzer) + : TextureLoweringBase(extern_buffer_map, bound_analyzer) {} + + Stmt VisitStmt_(const BufferRealizeNode* op) final { + if (extern_buf_.count(op->buffer)) { + return this->VisitStmt(op->body); + } + + std::string storage_scope = GetStorageScope(op->buffer); + Var buffer_var(op->buffer->data->name_hint, + PointerType(PrimType(op->buffer->dtype), ffi::String(storage_scope))); + let_binding_.insert({op->buffer->data, buffer_var}); + + Stmt stmt = StmtExprMutator::VisitStmt_(op); + op = stmt.as(); + + // Rewrite any buffer realizations with storage scope to 2d texture allocations + if (IsTextureStorage(storage_scope)) { + Stmt body = this->VisitStmt(op->body); + ICHECK(op->bounds.size() >= 3) << "Only 2d RGBA texture is currently supported"; + const int bits = op->buffer->dtype.bits(), + lanes = static_cast(op->bounds.back()->extent.as()->value); + const int channel_size = bits * lanes; + ICHECK(channel_size == 128 || channel_size == 64) + << "Invalid Channel Size: " << channel_size << " bits"; + + struct ShapeFromRange { + const ffi::Array& bounds; + PrimExpr operator[](size_t i) const { return bounds[i]->extent; } + }; + size_t axis = DefaultTextureLayoutSeparator(op->bounds.size(), storage_scope); + auto texture = + ApplyTexture2DFlattening(ShapeFromRange{op->bounds}, op->bounds.size(), axis); + ffi::Array args; + args.push_back(StringImm(storage_scope)); + args.push_back(IntImm(DataType::Int(64), 3)); // 2D-Array + args.push_back(Call(DataType::Handle(), builtin::tvm_stack_make_shape(), + {texture.width, texture.height, texture.depth})); + args.push_back(IntImm(DataType::Int(64), channel_size)); + + stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::nd_mem_alloc_with_scope(), args), + body); + } + + return stmt; + } + + Stmt VisitStmt_(const BufferStoreNode* op) final { + Stmt stmt = StmtExprMutator::VisitStmt_(op); + op = stmt.as(); + std::string storage_scope = GetStorageScope(op->buffer); + // Lower to two dimensional access + if (IsTextureStorage(storage_scope)) { + ffi::Array args = GetTextureAccessArgs(op, op->buffer); + args.push_back(op->value); + stmt = Evaluate(Call(args[0]->dtype, builtin::texture2d_store(), args)); + } + + return stmt; + } + + PrimExpr VisitExpr_(const BufferLoadNode* op) final { + PrimExpr expr = StmtExprMutator::VisitExpr_(op); + op = expr.as(); + // Lower to two dimensional access + std::string storage_scope = GetStorageScope(op->buffer); + if (IsTextureStorage(storage_scope)) { + ffi::Array args = GetTextureAccessArgs(op, op->buffer); + args.push_back(op->indices.back()); + expr = Call(op->buffer->dtype, builtin::texture2d_load(), args); + } + + return expr; + } + + protected: + template + ffi::Array GetTextureAccessArgs(const T* op, const Buffer& buffer) { + ffi::Array args; + if (let_binding_.count(op->buffer->data)) { + args.push_back(let_binding_[op->buffer->data]); + } else { + args.push_back(buffer->data); + } + ffi::Array row_dims, row_indices, col_dims, col_indices, depth_dims, depth_indices; + size_t axis = DefaultTextureLayoutSeparator(op->buffer->shape.size(), GetStorageScope(buffer)); + for (size_t i = 0; i < op->buffer->shape.size() - 1; i++) { + if (i < (axis - 1)) { + depth_dims.push_back(op->buffer->shape[i]); + depth_indices.push_back(op->indices[i]); + } else if (i < axis) { + col_dims.push_back(op->buffer->shape[i]); + 
col_indices.push_back(op->indices[i]); + } else { + row_dims.push_back(op->buffer->shape[i]); + row_indices.push_back(op->indices[i]); + } + } + PrimExpr row_offset = SimplifyOffset(row_dims, row_indices); + PrimExpr col_offset = SimplifyOffset(col_dims, col_indices); + PrimExpr depth_offset = SimplifyOffset(depth_dims, depth_indices); + PrimExpr channel_size = IntImm(DataType::Int(32, 1), + *tir::as_const_int(buffer->shape.back()) * buffer->dtype.bits()); + args.push_back(row_offset); + args.push_back(col_offset); + args.push_back(depth_offset); + args.push_back(channel_size); + return args; + } + + // Bindings to new texture vars with texture pointer scope + std::unordered_map let_binding_; +}; + +PrimFunc TextureFlattenHandler(PrimFunc func) { + auto fptr = func.CopyOnWrite(); + IRVisitorWithAnalyzer bound_analyzer; + bound_analyzer(fptr->body); + fptr->body = TextureFlattener(fptr->buffer_map, &bound_analyzer)(std::move(fptr->body)); + return func; +} + +namespace transform { + +Pass TextureFlatten() { + auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) { + return TextureFlattenHandler(std::move(f)); + }; + return CreatePrimFuncPass(pass_func, 0, "tir.backend.adreno.TextureFlatten", {}); +} + +TVM_FFI_STATIC_INIT_BLOCK() { + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tir.backend.adreno.transform.TextureFlatten", TextureFlatten); +} + +} // namespace transform + +} // namespace adreno +} // namespace backend +} // namespace tir +} // namespace tvm diff --git a/src/tir/transforms/vectorize_loop.cc b/src/tir/transforms/vectorize_loop.cc index 068903baa814..331f556a9442 100644 --- a/src/tir/transforms/vectorize_loop.cc +++ b/src/tir/transforms/vectorize_loop.cc @@ -37,6 +37,8 @@ #include "../../src/arith/scalable_expression.h" #include "../../tir/analysis/check_contains.h" +#include "tvm/runtime/data_type.h" +#include "tvm/tir/buffer.h" namespace tvm { namespace tir { @@ -519,16 +521,32 @@ class Vectorizer : public StmtMutator, public ExprFunctorop.same_as(builtin::texture2d_load())) { int lane = 0; ffi::Array fcd = MutateArray({op->args.back()}, &lane); + auto dtype = op->args[0] + .as() + ->type_annotation.as() + ->element_type.as() + ->dtype; + ICHECK(lane * dtype.bits() <= op->args[4].as()->value) + << "Expected data to be read must be less than or equal to the texture load length"; + auto new_args = op->args; new_args.pop_back(); new_args.push_back(fcd[0]); - return Call(op->dtype.with_lanes(4), op->op, new_args); + return Call(op->dtype.with_lanes(lane), op->op, new_args); } else if (op->op.same_as(builtin::texture2d_store())) { int lane = 0; // Vectorize the value to store ffi::Array value{op->args.back()}; ffi::Array mutated_value = MutateArray(value, &lane); - ffi::Array new_args{op->args[0], op->args[1], op->args[2], mutated_value[0]}; + auto dtype = op->args[0] + .as() + ->type_annotation.as() + ->element_type.as() + ->dtype; + ICHECK(lane * dtype.bits() == op->args[4].as()->value) + << "Expected data to be written must equal the texture store length"; + ffi::Array new_args{op->args[0], op->args[1], op->args[2], + op->args[3], op->args[4], mutated_value[0]}; + return Call(op->dtype.with_lanes(lane), op->op, new_args); } else if (op->op.same_as(builtin::reinterpret())) { return MutateReinterpretExpr_(op); diff --git a/tests/python/relax/adreno/test_transform_annotate_custom_scope.py b/tests/python/relax/adreno/test_transform_annotate_custom_scope.py index 24b4cf66b888..57abfd081350 --- a/tests/python/relax/adreno/test_transform_annotate_custom_scope.py 
+++ b/tests/python/relax/adreno/test_transform_annotate_custom_scope.py @@ -123,8 +123,8 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-nhwc"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), - "conv2d_NCHWc_OIHWo_opencl": (["global.texture-nhwc", "global.texture-weight"], ["global"]), + "te_layout_transform1": (["global"], ["global"]), + "conv2d_NCHWc_OIHWo_opencl": (["global.texture-nhwc", "global"], ["global"]), "te_layout_transform2": (["global"], ["global"]), } @@ -151,9 +151,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -182,9 +182,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -213,9 +213,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -239,9 +239,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "fused_conv2d_NCHWc_OIHWo_opencl_relu": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -267,9 +267,9 @@ def main( Expected = { "relu": (["global"], ["global"]), "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "fused_conv2d_NCHWc_OIHWo_opencl_relu1": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -294,9 +294,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "fused_conv2d_NCHWc_OIHWo_opencl_relu_tir_tanh": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -321,10 +321,10 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "te_layout_transform2": (["global"], ["global.texture-weight"]), "fused_conv2d_NCHWc_OIHWo_opencl_add": ( - ["global.texture-weight", "global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global", 
"global.texture-weight"], ["global"], ), "te_layout_transform3": (["global"], ["global"]), @@ -352,16 +352,16 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), "relu": (["global"], ["global"]), "te_layout_transform3": (["global"], ["global.texture-weight"]), "conv2d_NCHWc_OIHWo1_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform4": (["global"], ["global"]), @@ -384,9 +384,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "sum": (["global"], ["global"]), @@ -410,9 +410,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "sum": (["global"], ["global"]), @@ -436,9 +436,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "sum": (["global"], ["global"]), @@ -462,9 +462,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -488,9 +488,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -514,9 +514,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -542,9 +542,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + 
["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -569,9 +569,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global.texture-weight"], ), "fused_relu_concatenate": (["global.texture-weight"], ["global"]), @@ -595,9 +595,9 @@ def main(x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "fl Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global.texture-weight"], ), "fused_relu_concatenate_split": (["global.texture-weight"], ["global", "global"]), @@ -625,9 +625,9 @@ def main(x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "fl Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global.texture-weight"], ), "fused_relu_concatenate_split": (["global.texture-weight"], ["global", "global"]), @@ -660,9 +660,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global.texture-weight"], ), "max_pool2d_opencl": (["global.texture-weight"], ["global"]), @@ -686,9 +686,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global.texture-weight"], ), "adaptive_avg_pool2d_opencl": (["global.texture-weight"], ["global"]), @@ -712,9 +712,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -743,9 +743,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "layer_norm": (["global", "global", "global"], ["global"]), @@ -771,9 +771,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": 
(["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -797,9 +797,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "fused_conv2d_NCHWc_OIHWo_opencl_add": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -852,20 +852,20 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "te_layout_transform2": (["global"], ["global.texture-weight"]), "fused_conv2d_NCHWc_OIHWo_opencl_add_relu": ( - ["global.texture-weight", "global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global", "global.texture-weight"], ["global.texture-weight"], ), - "te_layout_transform3": (["global"], ["global.texture-weight"]), + "te_layout_transform3": (["global"], ["global"]), "multiply": (["global"], ["global"]), "fused_conv2d_NCHWc_OIHWo1_opencl_add_relu": ( - ["global.texture-weight", "global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global", "global.texture-weight"], ["global.texture-weight"], ), "fused_conv2d_NCHWc_OIHWo2_opencl_relu1": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform4": (["global"], ["global"]), @@ -911,15 +911,15 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "te_layout_transform2": (["global"], ["global.texture-weight"]), "fused_conv2d_NCHWc_OIHWo_opencl_add_relu": ( - ["global.texture-weight", "global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global", "global.texture-weight"], ["global"], ), - "te_layout_transform3": (["global"], ["global.texture-weight"]), + "te_layout_transform3": (["global"], ["global"]), "conv2d_NCHWc_OIHWo1_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform4": (["global"], ["global"]), @@ -968,20 +968,20 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "te_layout_transform2": (["global"], ["global.texture-weight"]), "fused_conv2d_NCHWc_OIHWo_opencl_add_relu": ( - ["global.texture-weight", "global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global", "global.texture-weight"], ["global.texture-weight"], ), - "te_layout_transform3": (["global"], ["global.texture-weight"]), + "te_layout_transform3": (["global"], ["global"]), "conv2d_NCHWc_OIHWo1_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global.texture-weight"], ), - "te_layout_transform4": (["global"], ["global.texture-weight"]), + "te_layout_transform4": (["global"], ["global"]), "conv2d_NCHWc_OIHWo2_opencl": ( - ["global.texture-weight", "global.texture-weight"], + 
["global.texture-weight", "global"], ["global.texture-weight"], ), "concatenate": (["global.texture-weight", "global.texture-weight"], ["global"]), @@ -1041,23 +1041,23 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global.texture-weight"], ), "max_pool2d_opencl": (["global.texture-weight"], ["global.texture-weight"]), - "te_layout_transform2": (["global"], ["global.texture-weight"]), + "te_layout_transform2": (["global"], ["global"]), "conv2d_NCHWc_OIHWo2_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global.texture-weight"], ), "fused_conv2d_NCHWc_OIHWo1_opencl_add": ( - ["global.texture-weight", "global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global", "global.texture-weight"], ["global.texture-weight"], ), "fused_conv2d_NCHWc_OIHWo3_opencl_add": ( - ["global.texture-weight", "global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global", "global.texture-weight"], ["global"], ), "te_layout_transform3": (["global"], ["global"]), @@ -1111,9 +1111,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -1121,7 +1121,7 @@ def main( "fused_conv2d_NCHWc_OIHWo_opencl_add_multiply_add": ( [ "global.texture-weight", - "global.texture-weight", + "global", "global.texture-weight", "global.texture-weight", "global.texture-weight", @@ -1180,9 +1180,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -1190,7 +1190,7 @@ def main( "fused_conv2d_NCHWc_OIHWo_opencl_add_multiply_add": ( [ "global.texture-weight", - "global.texture-weight", + "global", "global.texture-weight", "global.texture-weight", ], diff --git a/tests/python/relax/backend/clml/test_op_exec_clml_codegen.py b/tests/python/relax/backend/clml/test_op_exec_clml_codegen.py index 4e5b4b652b45..d0a4a525adf7 100644 --- a/tests/python/relax/backend/clml/test_op_exec_clml_codegen.py +++ b/tests/python/relax/backend/clml/test_op_exec_clml_codegen.py @@ -20,6 +20,7 @@ import tvm import tvm.testing import json +import os from tvm import relax, rpc from tvm.script import relax as R @@ -73,7 +74,6 @@ def test_conv2d_offload( has_pad, is_depthwise, dtype, - rpc, ): low, high = 0, 1 data_shape = (1, *shape) @@ -117,7 +117,7 @@ def test_conv2d_offload( has_pad=has_pad, is_depthwise=is_depthwise, ) - run_compare(mod, inputs, params_np, rpc) + run_compare(mod, inputs, params_np) @tvm.testing.requires_openclml @@ -131,7 +131,7 @@ def test_conv2d_offload( [(1, 256, 1, 1), 1, 3e-4], ], ) -def test_batchnorm(dtype, trials, rpc): +def test_batchnorm(dtype, trials): low, high = 0, 
1 if clml.clml_sdk_version() < 3: print("Skip due to unsupported CLML version:", clml.clml_sdk_version()) @@ -159,7 +159,7 @@ def _get_axis_tuple(axis): inputs = [data] params_np = {"gamma": gamma, "beta": beta, "moving_mean": mean, "moving_var": variance} mod = get_batchnorm_mod(input_shape, channels, axis, epsilon, dtype) - run_compare(mod, inputs, params_np, rpc) + run_compare(mod, inputs, params_np) @tvm.testing.requires_openclml @@ -182,9 +182,9 @@ def _get_axis_tuple(axis): ], ) @tvm.testing.requires_openclml -def test_binary_ops(a_shape, b_shape, op, rpc, dtype): +def test_binary_ops(a_shape, b_shape, op, dtype): (mod, inputs) = get_binary_op_mod(a_shape, b_shape, op, dtype) - run_compare(mod, inputs, {}, rpc) + run_compare(mod, inputs, {}) @tvm.testing.requires_openclml @@ -204,9 +204,9 @@ def test_binary_ops(a_shape, b_shape, op, rpc, dtype): ], ) @tvm.testing.requires_openclml -def test_unary_ops(a_shape, op, rpc, dtype): +def test_unary_ops(a_shape, op, dtype): (mod, inputs) = get_unary_op_mod(a_shape, op, dtype) - run_compare(mod, inputs, {}, rpc) + run_compare(mod, inputs, {}) @tvm.testing.requires_openclml @@ -223,14 +223,14 @@ def test_unary_ops(a_shape, op, rpc, dtype): [(1, 32, 256, 256), (2, 2), (2, 2), (1, 1), (1, 0, 1, 0), True], ], ) -def test_max_pool(dtype, trials, rpc): +def test_max_pool(dtype, trials): low, high = -1, 1 (input_shape, pool_size, stride, dilation, padding, has_pad) = trials data = np.random.uniform(low, high, size=input_shape).astype(dtype) inputs = [data] mod = get_relax_maxpool_mod(input_shape, dtype, pool_size, stride, dilation, padding, has_pad) params_np = {} - run_compare(mod, inputs, params_np, rpc) + run_compare(mod, inputs, params_np) @tvm.testing.requires_openclml @@ -247,14 +247,14 @@ def test_max_pool(dtype, trials, rpc): [(1, 32, 256, 256), (2, 2), (2, 2), (1, 1), (1, 0, 1, 0), True], ], ) -def test_avg_pool(dtype, trials, rpc): +def test_avg_pool(dtype, trials): low, high = -1, 1 (input_shape, pool_size, stride, dilation, padding, has_pad) = trials data = np.random.uniform(low, high, size=input_shape).astype(dtype) inputs = [data] mod = get_relax_avgpool_mod(input_shape, dtype, pool_size, stride, dilation, padding, has_pad) params_np = {} - run_compare(mod, inputs, params_np, rpc) + run_compare(mod, inputs, params_np) @tvm.testing.requires_openclml @@ -267,14 +267,14 @@ def test_avg_pool(dtype, trials, rpc): [(1, 64, 3, 3), (1, 32, 3, -1)], ], ) -def test_reshape(dtype, trials, rpc): +def test_reshape(dtype, trials): low, high = -1, 1 (input_shape, output_shape) = trials data = np.random.uniform(low, high, size=input_shape).astype(dtype) inputs = [data] mod = get_relax_reshape_mod(input_shape, output_shape, dtype) params_np = {} - run_compare(mod, inputs, params_np, rpc) + run_compare(mod, inputs, params_np) @tvm.testing.requires_openclml @@ -288,7 +288,7 @@ def test_reshape(dtype, trials, rpc): [(1, 32, 256, 256), False], ], ) -def test_global_avg_pool(dtype, trials, rpc): +def test_global_avg_pool(dtype, trials): """Test function for global average pooling.""" low, high = -1, 1 (input_shape, keep_dims) = trials @@ -296,7 +296,7 @@ def test_global_avg_pool(dtype, trials, rpc): inputs = [data] mod = get_relax_global_avgpool_mod(input_shape, keep_dims, dtype) params_np = {} - run_compare(mod, inputs, params_np, rpc) + run_compare(mod, inputs, params_np) @tvm.testing.requires_openclml @@ -310,7 +310,7 @@ def test_global_avg_pool(dtype, trials, rpc): [(1, 32, 256, 256), False], ], ) -def test_global_max_pool(dtype, trials, rpc): +def 
test_global_max_pool(dtype, trials): """Test function for global average pooling.""" low, high = -1, 1 (input_shape, keep_dims) = trials @@ -322,7 +322,7 @@ def test_global_max_pool(dtype, trials, rpc): inputs = [data] mod = get_relax_global_maxpool_mod(input_shape, keep_dims, dtype) params_np = {} - run_compare(mod, inputs, params_np, rpc) + run_compare(mod, inputs, params_np) if __name__ == "__main__": diff --git a/tests/python/relax/backend/clml/utils.py b/tests/python/relax/backend/clml/utils.py index d32a2df38ffd..0f3516f96122 100644 --- a/tests/python/relax/backend/clml/utils.py +++ b/tests/python/relax/backend/clml/utils.py @@ -28,6 +28,25 @@ from tvm.contrib import utils, ndk from tvm.relax.backend.adreno.clml import OpenCLMLOffLoad +import os +from tvm import rpc as _rpc + + +def get_rpc(): + rpc_target = os.getenv("RPC_TARGET", None) + if rpc_target: + connection_type = "tracker" + host = os.getenv("TVM_TRACKER_HOST", "localhost") + port = int(os.getenv("TVM_TRACKER_PORT", 9090)) + target = "opencl" + target_host = "llvm -mtriple=aarch64-linux-gnu" + device_key = os.getenv("RPC_DEVICE_KEY", "android") + cross_compile = os.getenv("TVM_NDK_CC", "aarch64-linux-android-g++") + tracker = _rpc.connect_tracker(host, port) + return tracker.request(device_key, priority=1, session_timeout=1000) + else: + return None + def build_and_run( mod, @@ -35,40 +54,39 @@ def build_and_run( target, rpc=None, load_path="vm_library.so", - clml_enable=False, ): tgt = tvm.target.Target(target, host="llvm -mtriple=aarch64-linux-gnu") - pipeline = relax.pipeline.get_default_pipeline(tgt) - mod = pipeline(mod) - if rpc: - ex = tvm.compile(mod, tgt) - temp = utils.tempdir() - path = temp.relpath(load_path) - path = "./" + load_path - ex.export_library(path, fcompile=ndk.create_shared, options=["-shared", "-fPIC", "-lm"]) - rpc.upload(path) - rexec = rpc.load_module(load_path) - dev = rpc.cl(0) - vm = relax.VirtualMachine(rexec, dev) - else: - ex = tvm.compile(mod, target) - dev = tvm.device(target, 0) - vm = relax.VirtualMachine(ex, dev) + relax_pipeline = relax.pipeline.get_default_pipeline(tgt) + tir_pipeline = tvm.tir.get_default_tir_pipeline(tgt) + + ex = tvm.compile(mod, tgt, relax_pipeline=relax_pipeline, tir_pipeline=tir_pipeline) + temp = utils.tempdir() + path = temp.relpath(load_path) + path = "./" + load_path + ex.export_library(path, fcompile=ndk.create_shared, options=["-shared", "-fPIC", "-lm"]) + + rpc.upload(path) + rexec = rpc.load_module(load_path) + dev = rpc.cl(0) + vm = relax.VirtualMachine(rexec, dev) f = vm["main"] inputs = [tvm.runtime.tensor(inp, dev) for inp in inputs_np] vm.set_input("main", *inputs) vm.invoke_stateful("main") - tvm_output = vm.get_outputs("main") - return tvm_output.numpy() + tvm_output = vm.get_outputs("main").numpy() + if rpc: + rpc.get_function("CloseRPCConnection")() + return tvm_output -def run_compare(mod, inputs, params_np, rpc=None): +def run_compare(mod, inputs, params_np): clml_mod = copy.deepcopy(mod) mod = tvm.relax.transform.BindParams("main", params_np)(mod) clml_mod = tvm.relax.transform.BindParams("main", params_np)(clml_mod) - if not rpc: + rpc = get_rpc() + if rpc is None: return ref = build_and_run( @@ -78,12 +96,13 @@ def run_compare(mod, inputs, params_np, rpc=None): rpc=rpc, load_path="vm_library_opencl.so", ) + + rpc = get_rpc() out = build_and_run( clml_mod, inputs, - tvm.target.adreno(clml=True), + tvm.target.adreno(cfg="clml"), rpc=rpc, load_path="vm_library_clml.so", - clml_enable=True, ) np.testing.assert_allclose(out, ref, rtol=1e-5, 
atol=1e-5) diff --git a/tests/python/relax/test_transform.py b/tests/python/relax/test_transform.py index b0bec5e858af..78a52c66723e 100644 --- a/tests/python/relax/test_transform.py +++ b/tests/python/relax/test_transform.py @@ -441,8 +441,16 @@ def foo( R.Tensor((2, 3), "int32"), R.Tensor((2, 3), "int32"), R.Tensor((2, 3), dtype="int32") ): R.func_attr({"relax.force_pure": True}) - gv0 = R.builtin.alloc_tensor(R.shape([2, 3]), "int32", R.prim_value(0)) - gv1 = R.builtin.alloc_tensor(R.shape([2, 3]), "int32", R.prim_value(0)) + gv0: R.Tensor((2, 3), dtype="int32") = R.emit_with_sinfo( + "relax.builtin.alloc_tensor", + (R.shape([2, 3]), R.dtype("int32"), R.prim_value(0), R.str("global")), + (R.Tensor((2, 3), dtype="int32"),), + ) + gv1: R.Tensor((2, 3), dtype="int32") = R.emit_with_sinfo( + "relax.builtin.alloc_tensor", + (R.shape([2, 3]), R.dtype("int32"), R.prim_value(0), R.str("global")), + (R.Tensor((2, 3), dtype="int32"),), + ) _ = Expected.copy(x, y, z, gv0, gv1) gv2 = (x, gv0, gv1) return gv2 diff --git a/tests/python/relax/test_transform_legalize_ops_manipulate.py b/tests/python/relax/test_transform_legalize_ops_manipulate.py index ae865f1fb16b..17e0160c6183 100644 --- a/tests/python/relax/test_transform_legalize_ops_manipulate.py +++ b/tests/python/relax/test_transform_legalize_ops_manipulate.py @@ -1745,8 +1745,10 @@ def main( ): R.func_attr({"relax.force_pure": True}) cls = Expected - alloc: R.Tensor((4, 4), dtype="float32") = R.builtin.alloc_tensor( - R.shape([4, 4]), R.dtype("float32"), R.prim_value(0), R.str("global") + alloc: R.Tensor((4, 4), dtype="float32") = R.emit_with_sinfo( + "relax.builtin.alloc_tensor", + (R.shape([4, 4]), R.dtype("float32"), R.prim_value(0), R.str("global")), + (R.Tensor((4, 4), dtype="float32"),), ) cls.te_layout_transform(x, alloc) lv = alloc diff --git a/tests/python/relax/texture/adreno_utils.py b/tests/python/relax/texture/adreno_utils.py new file mode 100644 index 000000000000..b6ffd837145f --- /dev/null +++ b/tests/python/relax/texture/adreno_utils.py @@ -0,0 +1,195 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import tvm +import numpy as np +from tvm import relax +from tvm.contrib import utils, ndk +from tvm.script.parser import ir as I, relax as R, tir as T +from tvm.relax.transform.legalize_ops import adreno as legalize_adreno +from tvm.contrib import dlpack as dl +import tvm.testing +from tvm.rpc import connect_tracker + + +def get_target(backend, is_adreno=False): + """ + Get the target for the Adreno GPU. + + Returns + ------- + tvm.target.Target + The target for the Adreno GPU. 
+ """ + target = tvm.target.adreno(backend=backend) + if is_adreno: + target = tvm.target.adreno(cfg="texture", backend=backend) + return target + + +def get_rpc(): + """ + Establish an RPC connection to the remote device. + + Returns + ------- + tvm.rpc.RPCSession or None + The RPC session object if RPC_TARGET is set; otherwise, None. + """ + rpc_target = os.getenv("RPC_TARGET", None) + if rpc_target: + host = os.getenv("TVM_TRACKER_HOST", "localhost") + port = int(os.getenv("TVM_TRACKER_PORT", 9090)) + device_key = os.getenv("RPC_DEVICE_KEY", "android") + tracker = connect_tracker(host, port) + return tracker.request(device_key, priority=1, session_timeout=1000) + else: + return None + + +def get_unique_dso_lib(): + """ + Generate a unique shared library filename based on environment variables. + + Returns + ------- + str + The unique shared library filename. + """ + rpc_tracker_port = os.getenv("TVM_TRACKER_PORT", "") + device_port = os.getenv("DEVICE_LISTEN_PORT", "") + return f"dev_lib_cl-{rpc_tracker_port}-{device_port}.so" + + +def run_cpu(mod, inputs, save_lib=False): + """ + Run the Relax module on the local CPU for verification. + + Parameters + ---------- + mod : tvm.IRModule + The Relax IRModule to execute. + inputs : list of numpy.ndarray + The input data for the module. + save_lib : bool, optional + Whether to save the compiled library. Default is False. + + Returns + ------- + tvm.runtime.NDArray or tuple of tvm.runtime.NDArray + The output from the module execution. + """ + print("Running on local CPU for verification") + target = tvm.target.Target("llvm") + ex = relax.build(mod, target) + if save_lib: + ex.export_library("mod.so") + dev = tvm.cpu() + vm = relax.VirtualMachine(ex, dev) + inputs = [tvm.nd.array(inp, dev) for inp in inputs] + vm.set_input("main", *inputs) + vm.invoke_stateful("main") + tvm_output = vm.get_outputs("main") + return tvm_output + + +def build_run(mod, inputs, backend, is_adreno=False): + remote = get_rpc() + target = get_target(backend, is_adreno) + if remote is None: + tgt = tvm.target.Target(target, host="llvm") + else: + tgt = tvm.target.Target(target, host="llvm -mtriple=aarch64-linux-gnu") + relax_pipeline = relax.pipeline.get_default_pipeline(tgt) + tir_pipeline = tvm.tir.get_default_tir_pipeline(tgt) + mod = relax_pipeline(mod) + ex = tvm.compile(mod, tgt, tir_pipeline=tir_pipeline) + + if remote is None: + # local execution + if "opencl" in backend: + dev = tvm.opencl(0) + elif "vulkan" in backend: + dev = tvm.vulkan(0) + else: + raise RuntimeError("Unsupported backend") + + if "vdevice" in mod.global_infos: + device_arr = [dev for ii in range(len(mod.global_infos["vdevice"]))] + else: + device_arr = [dev] + vm = relax.VirtualMachine(ex, device_arr) + else: + # remote execution + temp = utils.tempdir() + filename = get_unique_dso_lib() + file_path = temp.relpath(filename) + ex.export_library( + file_path, fcompile=ndk.create_shared, options=["-shared", "-fPIC", "-lm"] + ) + + remote.upload(file_path) + rexec = remote.load_module(filename) + + if "opencl" in backend: + dev = remote.cl(0) + elif "vulkan" in backend: + dev = remote.vulkan(0) + else: + raise RuntimeError("Unsupported backend") + + if "vdevice" in mod.global_infos: + device_arr = [dev for ii in range(len(mod.global_infos["vdevice"]))] + else: + device_arr = [dev] + + vm = relax.VirtualMachine(rexec, device_arr) + + inputs = [tvm.runtime.tensor(inp, dev) for inp in inputs] + vm.set_input("main", *inputs) + vm.invoke_stateful("main") + tvm_output = vm.get_outputs("main") + + if 
isinstance(tvm_output, tuple): + tvm_output = tuple(out.numpy() for out in tvm_output) + else: + tvm_output = tvm_output.numpy() + + if remote: + remote.get_function("CloseRPCConnection")() + return tvm_output + + +def verify(mod, backend): + + if backend not in ["opencl", "vulkan"]: + raise ValueError(f"Unsupported API: {backend}. Must be 'opencl' or 'vulkan'.") + + inputs = [] + for arg in mod["main"].params: + shape = tuple(shape_val.value for shape_val in arg.struct_info.shape.values) + inputs.append(np.random.uniform(0, 1, size=shape).astype(arg.struct_info.dtype)) + + ret1 = build_run(mod, inputs, backend, True) + ret2 = build_run(mod, inputs, backend) + + if isinstance(ret1, tuple): + for val1, val2 in zip(ret1, ret2): + tvm.testing.assert_allclose(val1, val2, rtol=1e-3, atol=1e-3) + else: + tvm.testing.assert_allclose(ret1, ret2, rtol=1e-3, atol=1e-3) diff --git a/tests/python/relax/texture/test_network.py b/tests/python/relax/texture/test_network.py new file mode 100644 index 000000000000..aeb4f1248c10 --- /dev/null +++ b/tests/python/relax/texture/test_network.py @@ -0,0 +1,805 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
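Before the network-level test that follows, here is a minimal usage sketch of the adreno_utils helpers above: verify() builds the module twice, once with the texture-enabled Adreno target and once without, and compares the outputs. The single-conv2d module below is hypothetical and only illustrates the entry point; actually running it needs a local OpenCL device or the RPC_TARGET / TVM_TRACKER_HOST / TVM_TRACKER_PORT / RPC_DEVICE_KEY environment consumed by get_rpc().

# Hypothetical module used only to illustrate verify() from adreno_utils.py;
# it is not part of the test suite in this patch.
import tvm
from tvm.script import ir as I, relax as R

@I.ir_module
class SingleConv:
    @R.function
    def main(
        x: R.Tensor((1, 16, 28, 28), "float32"),
        w: R.Tensor((32, 16, 3, 3), "float32"),
    ) -> R.Tensor((1, 32, 26, 26), "float32"):
        with R.dataflow():
            gv = R.nn.conv2d(x, w, out_dtype="float32")
            R.output(gv)
        return gv

# Compiles the module with and without the Adreno texture config and asserts
# the two results agree (requires an OpenCL device or the RPC setup above):
# verify(SingleConv, "opencl")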
+ +import pytest +import numpy as np +import tvm +import tvm.testing +import onnx +import copy + +from tvm import relax +from tvm.script import relax as R +from tvm.script import ir as I +from tvm.script import tir as T +from adreno_utils import verify +from tvm.script.ir_builder import IRBuilder +from tvm.script.ir_builder import relax as relax_builder +from tvm.relax.frontend.onnx import from_onnx +from tvm.relax.transform.legalize_ops import adreno as legalize_adreno + +import pytest +import json +import copy + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_network_resnet(target): + @I.ir_module + class Resnet: + @R.function + def main( + data: R.Tensor((1, 3, 224, 224), dtype="float32"), + resnetv22_batchnorm0_gamma: R.Tensor((3,), dtype="float32"), + resnetv22_batchnorm0_beta: R.Tensor((3,), dtype="float32"), + resnetv22_batchnorm0_running_mea: R.Tensor((3,), dtype="float32"), + resnetv22_batchnorm0_running_var: R.Tensor((3,), dtype="float32"), + resnetv22_conv0_weight: R.Tensor((64, 3, 7, 7), dtype="float32"), + resnetv22_batchnorm1_gamma: R.Tensor((64,), dtype="float32"), + resnetv22_batchnorm1_beta: R.Tensor((64,), dtype="float32"), + resnetv22_batchnorm1_running_mea: R.Tensor((64,), dtype="float32"), + resnetv22_batchnorm1_running_var: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm0_gamma: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm0_beta: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm0_running_mea: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm0_running_var: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_conv0_weight: R.Tensor((64, 64, 3, 3), dtype="float32"), + resnetv22_stage1_batchnorm1_gamma: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm1_beta: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm1_running_mea: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm1_running_var: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_conv1_weight: R.Tensor((64, 64, 3, 3), dtype="float32"), + resnetv22_stage1_batchnorm2_gamma: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm2_beta: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm2_running_mea: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm2_running_var: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_conv2_weight: R.Tensor((64, 64, 3, 3), dtype="float32"), + resnetv22_stage1_batchnorm3_gamma: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm3_beta: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm3_running_mea: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm3_running_var: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_conv3_weight: R.Tensor((64, 64, 3, 3), dtype="float32"), + resnetv22_stage2_batchnorm0_gamma: R.Tensor((64,), dtype="float32"), + resnetv22_stage2_batchnorm0_beta: R.Tensor((64,), dtype="float32"), + resnetv22_stage2_batchnorm0_running_mea: R.Tensor((64,), dtype="float32"), + resnetv22_stage2_batchnorm0_running_var: R.Tensor((64,), dtype="float32"), + resnetv22_stage2_conv0_weight: R.Tensor((128, 64, 3, 3), dtype="float32"), + resnetv22_stage2_batchnorm1_gamma: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_batchnorm1_beta: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_batchnorm1_running_mea: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_batchnorm1_running_var: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_conv1_weight: R.Tensor((128, 128, 3, 3), 
dtype="float32"), + resnetv22_stage2_conv2_weight: R.Tensor((128, 64, 1, 1), dtype="float32"), + resnetv22_stage2_batchnorm2_gamma: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_batchnorm2_beta: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_batchnorm2_running_mea: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_batchnorm2_running_var: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_conv3_weight: R.Tensor((128, 128, 3, 3), dtype="float32"), + resnetv22_stage2_batchnorm3_gamma: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_batchnorm3_beta: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_batchnorm3_running_mea: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_batchnorm3_running_var: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_conv4_weight: R.Tensor((128, 128, 3, 3), dtype="float32"), + resnetv22_stage3_batchnorm0_gamma: R.Tensor((128,), dtype="float32"), + resnetv22_stage3_batchnorm0_beta: R.Tensor((128,), dtype="float32"), + resnetv22_stage3_batchnorm0_running_mea: R.Tensor((128,), dtype="float32"), + resnetv22_stage3_batchnorm0_running_var: R.Tensor((128,), dtype="float32"), + resnetv22_stage3_conv0_weight: R.Tensor((256, 128, 3, 3), dtype="float32"), + resnetv22_stage3_batchnorm1_gamma: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_batchnorm1_beta: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_batchnorm1_running_mea: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_batchnorm1_running_var: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_conv1_weight: R.Tensor((256, 256, 3, 3), dtype="float32"), + resnetv22_stage3_conv2_weight: R.Tensor((256, 128, 1, 1), dtype="float32"), + resnetv22_stage3_batchnorm2_gamma: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_batchnorm2_beta: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_batchnorm2_running_mea: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_batchnorm2_running_var: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_conv3_weight: R.Tensor((256, 256, 3, 3), dtype="float32"), + resnetv22_stage3_batchnorm3_gamma: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_batchnorm3_beta: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_batchnorm3_running_mea: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_batchnorm3_running_var: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_conv4_weight: R.Tensor((256, 256, 3, 3), dtype="float32"), + resnetv22_stage4_batchnorm0_gamma: R.Tensor((256,), dtype="float32"), + resnetv22_stage4_batchnorm0_beta: R.Tensor((256,), dtype="float32"), + resnetv22_stage4_batchnorm0_running_mea: R.Tensor((256,), dtype="float32"), + resnetv22_stage4_batchnorm0_running_var: R.Tensor((256,), dtype="float32"), + resnetv22_stage4_conv0_weight: R.Tensor((512, 256, 3, 3), dtype="float32"), + resnetv22_stage4_batchnorm1_gamma: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_batchnorm1_beta: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_batchnorm1_running_mea: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_batchnorm1_running_var: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_conv1_weight: R.Tensor((512, 512, 3, 3), dtype="float32"), + resnetv22_stage4_conv2_weight: R.Tensor((512, 256, 1, 1), dtype="float32"), + resnetv22_stage4_batchnorm2_gamma: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_batchnorm2_beta: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_batchnorm2_running_mea: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_batchnorm2_running_var: R.Tensor((512,), 
dtype="float32"), + resnetv22_stage4_conv3_weight: R.Tensor((512, 512, 3, 3), dtype="float32"), + resnetv22_stage4_batchnorm3_gamma: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_batchnorm3_beta: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_batchnorm3_running_mea: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_batchnorm3_running_var: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_conv4_weight: R.Tensor((512, 512, 3, 3), dtype="float32"), + resnetv22_batchnorm2_gamma: R.Tensor((512,), dtype="float32"), + resnetv22_batchnorm2_beta: R.Tensor((512,), dtype="float32"), + resnetv22_batchnorm2_running_mea: R.Tensor((512,), dtype="float32"), + resnetv22_batchnorm2_running_var: R.Tensor((512,), dtype="float32"), + reshape_attr_tensor164: R.Tensor((2,), dtype="int64"), + resnetv22_dense0_weight: R.Tensor((1000, 512), dtype="float32"), + resnetv22_dense0_bias: R.Tensor((1000,), dtype="float32"), + ) -> R.Tensor((1, 1000), dtype="float32"): + with R.dataflow(): + lv: R.Tuple( + R.Tensor((1, 3, 224, 224), dtype="float32"), + R.Tensor((3,), dtype="float32"), + R.Tensor((3,), dtype="float32"), + ) = R.nn.batch_norm( + data, + resnetv22_batchnorm0_gamma, + resnetv22_batchnorm0_beta, + resnetv22_batchnorm0_running_mea, + resnetv22_batchnorm0_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv1: R.Tensor((1, 3, 224, 224), dtype="float32") = lv[0] + lv2: R.Tensor((3,), dtype="float32") = lv[1] + lv3: R.Tensor((3,), dtype="float32") = lv[2] + lv4: R.Tensor((1, 64, 112, 112), dtype="float32") = R.nn.conv2d( + lv1, + resnetv22_conv0_weight, + strides=[2, 2], + padding=[3, 3, 3, 3], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv5: R.Tuple( + R.Tensor((1, 64, 112, 112), dtype="float32"), + R.Tensor((64,), dtype="float32"), + R.Tensor((64,), dtype="float32"), + ) = R.nn.batch_norm( + lv4, + resnetv22_batchnorm1_gamma, + resnetv22_batchnorm1_beta, + resnetv22_batchnorm1_running_mea, + resnetv22_batchnorm1_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv6: R.Tensor((1, 64, 112, 112), dtype="float32") = lv5[0] + lv7: R.Tensor((64,), dtype="float32") = lv5[1] + lv8: R.Tensor((64,), dtype="float32") = lv5[2] + lv9: R.Tensor((1, 64, 112, 112), dtype="float32") = R.nn.relu(lv6) + lv10: R.Tensor((1, 64, 56, 56), dtype="float32") = R.nn.max_pool2d( + lv9, + pool_size=[3, 3], + strides=[2, 2], + dilation=[1, 1], + padding=[1, 1, 1, 1], + ceil_mode=False, + count_include_pad=False, + layout="NCHW", + out_layout="NCHW", + ) + lv11: R.Tuple( + R.Tensor((1, 64, 56, 56), dtype="float32"), + R.Tensor((64,), dtype="float32"), + R.Tensor((64,), dtype="float32"), + ) = R.nn.batch_norm( + lv10, + resnetv22_stage1_batchnorm0_gamma, + resnetv22_stage1_batchnorm0_beta, + resnetv22_stage1_batchnorm0_running_mea, + resnetv22_stage1_batchnorm0_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv12: R.Tensor((1, 64, 56, 56), dtype="float32") = lv11[0] + lv13: R.Tensor((64,), dtype="float32") = lv11[1] + lv14: R.Tensor((64,), dtype="float32") = lv11[2] + lv15: R.Tensor((1, 64, 56, 56), dtype="float32") = R.nn.relu(lv12) + lv16: R.Tensor((1, 64, 56, 56), dtype="float32") = R.nn.conv2d( + lv15, + resnetv22_stage1_conv0_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + 
data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv17: R.Tuple( + R.Tensor((1, 64, 56, 56), dtype="float32"), + R.Tensor((64,), dtype="float32"), + R.Tensor((64,), dtype="float32"), + ) = R.nn.batch_norm( + lv16, + resnetv22_stage1_batchnorm1_gamma, + resnetv22_stage1_batchnorm1_beta, + resnetv22_stage1_batchnorm1_running_mea, + resnetv22_stage1_batchnorm1_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv18: R.Tensor((1, 64, 56, 56), dtype="float32") = lv17[0] + lv19: R.Tensor((64,), dtype="float32") = lv17[1] + lv20: R.Tensor((64,), dtype="float32") = lv17[2] + lv21: R.Tensor((1, 64, 56, 56), dtype="float32") = R.nn.relu(lv18) + lv22: R.Tensor((1, 64, 56, 56), dtype="float32") = R.nn.conv2d( + lv21, + resnetv22_stage1_conv1_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv23: R.Tensor((1, 64, 56, 56), dtype="float32") = R.add(lv22, lv10) + lv24: R.Tuple( + R.Tensor((1, 64, 56, 56), dtype="float32"), + R.Tensor((64,), dtype="float32"), + R.Tensor((64,), dtype="float32"), + ) = R.nn.batch_norm( + lv23, + resnetv22_stage1_batchnorm2_gamma, + resnetv22_stage1_batchnorm2_beta, + resnetv22_stage1_batchnorm2_running_mea, + resnetv22_stage1_batchnorm2_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv25: R.Tensor((1, 64, 56, 56), dtype="float32") = lv24[0] + lv26: R.Tensor((64,), dtype="float32") = lv24[1] + lv27: R.Tensor((64,), dtype="float32") = lv24[2] + lv28: R.Tensor((1, 64, 56, 56), dtype="float32") = R.nn.relu(lv25) + lv29: R.Tensor((1, 64, 56, 56), dtype="float32") = R.nn.conv2d( + lv28, + resnetv22_stage1_conv2_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv30: R.Tuple( + R.Tensor((1, 64, 56, 56), dtype="float32"), + R.Tensor((64,), dtype="float32"), + R.Tensor((64,), dtype="float32"), + ) = R.nn.batch_norm( + lv29, + resnetv22_stage1_batchnorm3_gamma, + resnetv22_stage1_batchnorm3_beta, + resnetv22_stage1_batchnorm3_running_mea, + resnetv22_stage1_batchnorm3_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv31: R.Tensor((1, 64, 56, 56), dtype="float32") = lv30[0] + lv32: R.Tensor((64,), dtype="float32") = lv30[1] + lv33: R.Tensor((64,), dtype="float32") = lv30[2] + lv34: R.Tensor((1, 64, 56, 56), dtype="float32") = R.nn.relu(lv31) + lv35: R.Tensor((1, 64, 56, 56), dtype="float32") = R.nn.conv2d( + lv34, + resnetv22_stage1_conv3_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv36: R.Tensor((1, 64, 56, 56), dtype="float32") = R.add(lv35, lv23) + lv37: R.Tuple( + R.Tensor((1, 64, 56, 56), dtype="float32"), + R.Tensor((64,), dtype="float32"), + R.Tensor((64,), dtype="float32"), + ) = R.nn.batch_norm( + lv36, + resnetv22_stage2_batchnorm0_gamma, + resnetv22_stage2_batchnorm0_beta, + resnetv22_stage2_batchnorm0_running_mea, + resnetv22_stage2_batchnorm0_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv38: R.Tensor((1, 64, 56, 56), dtype="float32") = lv37[0] + lv39: 
R.Tensor((64,), dtype="float32") = lv37[1] + lv40: R.Tensor((64,), dtype="float32") = lv37[2] + lv41: R.Tensor((1, 64, 56, 56), dtype="float32") = R.nn.relu(lv38) + lv42: R.Tensor((1, 128, 28, 28), dtype="float32") = R.nn.conv2d( + lv41, + resnetv22_stage2_conv0_weight, + strides=[2, 2], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv43: R.Tuple( + R.Tensor((1, 128, 28, 28), dtype="float32"), + R.Tensor((128,), dtype="float32"), + R.Tensor((128,), dtype="float32"), + ) = R.nn.batch_norm( + lv42, + resnetv22_stage2_batchnorm1_gamma, + resnetv22_stage2_batchnorm1_beta, + resnetv22_stage2_batchnorm1_running_mea, + resnetv22_stage2_batchnorm1_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv44: R.Tensor((1, 128, 28, 28), dtype="float32") = lv43[0] + lv45: R.Tensor((128,), dtype="float32") = lv43[1] + lv46: R.Tensor((128,), dtype="float32") = lv43[2] + lv47: R.Tensor((1, 128, 28, 28), dtype="float32") = R.nn.relu(lv44) + lv48: R.Tensor((1, 128, 28, 28), dtype="float32") = R.nn.conv2d( + lv47, + resnetv22_stage2_conv1_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv49: R.Tensor((1, 128, 28, 28), dtype="float32") = R.nn.conv2d( + lv41, + resnetv22_stage2_conv2_weight, + strides=[2, 2], + padding=[0, 0, 0, 0], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv50: R.Tensor((1, 128, 28, 28), dtype="float32") = R.add(lv48, lv49) + lv51: R.Tuple( + R.Tensor((1, 128, 28, 28), dtype="float32"), + R.Tensor((128,), dtype="float32"), + R.Tensor((128,), dtype="float32"), + ) = R.nn.batch_norm( + lv50, + resnetv22_stage2_batchnorm2_gamma, + resnetv22_stage2_batchnorm2_beta, + resnetv22_stage2_batchnorm2_running_mea, + resnetv22_stage2_batchnorm2_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv52: R.Tensor((1, 128, 28, 28), dtype="float32") = lv51[0] + lv53: R.Tensor((128,), dtype="float32") = lv51[1] + lv54: R.Tensor((128,), dtype="float32") = lv51[2] + lv55: R.Tensor((1, 128, 28, 28), dtype="float32") = R.nn.relu(lv52) + lv56: R.Tensor((1, 128, 28, 28), dtype="float32") = R.nn.conv2d( + lv55, + resnetv22_stage2_conv3_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv57: R.Tuple( + R.Tensor((1, 128, 28, 28), dtype="float32"), + R.Tensor((128,), dtype="float32"), + R.Tensor((128,), dtype="float32"), + ) = R.nn.batch_norm( + lv56, + resnetv22_stage2_batchnorm3_gamma, + resnetv22_stage2_batchnorm3_beta, + resnetv22_stage2_batchnorm3_running_mea, + resnetv22_stage2_batchnorm3_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv58: R.Tensor((1, 128, 28, 28), dtype="float32") = lv57[0] + lv59: R.Tensor((128,), dtype="float32") = lv57[1] + lv60: R.Tensor((128,), dtype="float32") = lv57[2] + lv61: R.Tensor((1, 128, 28, 28), dtype="float32") = R.nn.relu(lv58) + lv62: R.Tensor((1, 128, 28, 28), dtype="float32") = R.nn.conv2d( + lv61, + resnetv22_stage2_conv4_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + 
kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv63: R.Tensor((1, 128, 28, 28), dtype="float32") = R.add(lv62, lv50) + lv64: R.Tuple( + R.Tensor((1, 128, 28, 28), dtype="float32"), + R.Tensor((128,), dtype="float32"), + R.Tensor((128,), dtype="float32"), + ) = R.nn.batch_norm( + lv63, + resnetv22_stage3_batchnorm0_gamma, + resnetv22_stage3_batchnorm0_beta, + resnetv22_stage3_batchnorm0_running_mea, + resnetv22_stage3_batchnorm0_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv65: R.Tensor((1, 128, 28, 28), dtype="float32") = lv64[0] + lv66: R.Tensor((128,), dtype="float32") = lv64[1] + lv67: R.Tensor((128,), dtype="float32") = lv64[2] + lv68: R.Tensor((1, 128, 28, 28), dtype="float32") = R.nn.relu(lv65) + lv69: R.Tensor((1, 256, 14, 14), dtype="float32") = R.nn.conv2d( + lv68, + resnetv22_stage3_conv0_weight, + strides=[2, 2], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv70: R.Tuple( + R.Tensor((1, 256, 14, 14), dtype="float32"), + R.Tensor((256,), dtype="float32"), + R.Tensor((256,), dtype="float32"), + ) = R.nn.batch_norm( + lv69, + resnetv22_stage3_batchnorm1_gamma, + resnetv22_stage3_batchnorm1_beta, + resnetv22_stage3_batchnorm1_running_mea, + resnetv22_stage3_batchnorm1_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv71: R.Tensor((1, 256, 14, 14), dtype="float32") = lv70[0] + lv72: R.Tensor((256,), dtype="float32") = lv70[1] + lv73: R.Tensor((256,), dtype="float32") = lv70[2] + lv74: R.Tensor((1, 256, 14, 14), dtype="float32") = R.nn.relu(lv71) + lv75: R.Tensor((1, 256, 14, 14), dtype="float32") = R.nn.conv2d( + lv74, + resnetv22_stage3_conv1_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv76: R.Tensor((1, 256, 14, 14), dtype="float32") = R.nn.conv2d( + lv68, + resnetv22_stage3_conv2_weight, + strides=[2, 2], + padding=[0, 0, 0, 0], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv77: R.Tensor((1, 256, 14, 14), dtype="float32") = R.add(lv75, lv76) + lv78: R.Tuple( + R.Tensor((1, 256, 14, 14), dtype="float32"), + R.Tensor((256,), dtype="float32"), + R.Tensor((256,), dtype="float32"), + ) = R.nn.batch_norm( + lv77, + resnetv22_stage3_batchnorm2_gamma, + resnetv22_stage3_batchnorm2_beta, + resnetv22_stage3_batchnorm2_running_mea, + resnetv22_stage3_batchnorm2_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv79: R.Tensor((1, 256, 14, 14), dtype="float32") = lv78[0] + lv80: R.Tensor((256,), dtype="float32") = lv78[1] + lv81: R.Tensor((256,), dtype="float32") = lv78[2] + lv82: R.Tensor((1, 256, 14, 14), dtype="float32") = R.nn.relu(lv79) + lv83: R.Tensor((1, 256, 14, 14), dtype="float32") = R.nn.conv2d( + lv82, + resnetv22_stage3_conv3_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv84: R.Tuple( + R.Tensor((1, 256, 14, 14), dtype="float32"), + R.Tensor((256,), dtype="float32"), + R.Tensor((256,), dtype="float32"), + ) = R.nn.batch_norm( + lv83, + resnetv22_stage3_batchnorm3_gamma, + 
resnetv22_stage3_batchnorm3_beta, + resnetv22_stage3_batchnorm3_running_mea, + resnetv22_stage3_batchnorm3_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv85: R.Tensor((1, 256, 14, 14), dtype="float32") = lv84[0] + lv86: R.Tensor((256,), dtype="float32") = lv84[1] + lv87: R.Tensor((256,), dtype="float32") = lv84[2] + lv88: R.Tensor((1, 256, 14, 14), dtype="float32") = R.nn.relu(lv85) + lv89: R.Tensor((1, 256, 14, 14), dtype="float32") = R.nn.conv2d( + lv88, + resnetv22_stage3_conv4_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv90: R.Tensor((1, 256, 14, 14), dtype="float32") = R.add(lv89, lv77) + lv91: R.Tuple( + R.Tensor((1, 256, 14, 14), dtype="float32"), + R.Tensor((256,), dtype="float32"), + R.Tensor((256,), dtype="float32"), + ) = R.nn.batch_norm( + lv90, + resnetv22_stage4_batchnorm0_gamma, + resnetv22_stage4_batchnorm0_beta, + resnetv22_stage4_batchnorm0_running_mea, + resnetv22_stage4_batchnorm0_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv92: R.Tensor((1, 256, 14, 14), dtype="float32") = lv91[0] + lv93: R.Tensor((256,), dtype="float32") = lv91[1] + lv94: R.Tensor((256,), dtype="float32") = lv91[2] + lv95: R.Tensor((1, 256, 14, 14), dtype="float32") = R.nn.relu(lv92) + lv96: R.Tensor((1, 512, 7, 7), dtype="float32") = R.nn.conv2d( + lv95, + resnetv22_stage4_conv0_weight, + strides=[2, 2], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv97: R.Tuple( + R.Tensor((1, 512, 7, 7), dtype="float32"), + R.Tensor((512,), dtype="float32"), + R.Tensor((512,), dtype="float32"), + ) = R.nn.batch_norm( + lv96, + resnetv22_stage4_batchnorm1_gamma, + resnetv22_stage4_batchnorm1_beta, + resnetv22_stage4_batchnorm1_running_mea, + resnetv22_stage4_batchnorm1_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv98: R.Tensor((1, 512, 7, 7), dtype="float32") = lv97[0] + lv99: R.Tensor((512,), dtype="float32") = lv97[1] + lv100: R.Tensor((512,), dtype="float32") = lv97[2] + lv101: R.Tensor((1, 512, 7, 7), dtype="float32") = R.nn.relu(lv98) + lv102: R.Tensor((1, 512, 7, 7), dtype="float32") = R.nn.conv2d( + lv101, + resnetv22_stage4_conv1_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv103: R.Tensor((1, 512, 7, 7), dtype="float32") = R.nn.conv2d( + lv95, + resnetv22_stage4_conv2_weight, + strides=[2, 2], + padding=[0, 0, 0, 0], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv104: R.Tensor((1, 512, 7, 7), dtype="float32") = R.add(lv102, lv103) + lv105: R.Tuple( + R.Tensor((1, 512, 7, 7), dtype="float32"), + R.Tensor((512,), dtype="float32"), + R.Tensor((512,), dtype="float32"), + ) = R.nn.batch_norm( + lv104, + resnetv22_stage4_batchnorm2_gamma, + resnetv22_stage4_batchnorm2_beta, + resnetv22_stage4_batchnorm2_running_mea, + resnetv22_stage4_batchnorm2_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv106: R.Tensor((1, 512, 7, 7), dtype="float32") = lv105[0] + lv107: 
R.Tensor((512,), dtype="float32") = lv105[1] + lv108: R.Tensor((512,), dtype="float32") = lv105[2] + lv109: R.Tensor((1, 512, 7, 7), dtype="float32") = R.nn.relu(lv106) + lv110: R.Tensor((1, 512, 7, 7), dtype="float32") = R.nn.conv2d( + lv109, + resnetv22_stage4_conv3_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv111: R.Tuple( + R.Tensor((1, 512, 7, 7), dtype="float32"), + R.Tensor((512,), dtype="float32"), + R.Tensor((512,), dtype="float32"), + ) = R.nn.batch_norm( + lv110, + resnetv22_stage4_batchnorm3_gamma, + resnetv22_stage4_batchnorm3_beta, + resnetv22_stage4_batchnorm3_running_mea, + resnetv22_stage4_batchnorm3_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv112: R.Tensor((1, 512, 7, 7), dtype="float32") = lv111[0] + lv113: R.Tensor((512,), dtype="float32") = lv111[1] + lv114: R.Tensor((512,), dtype="float32") = lv111[2] + lv115: R.Tensor((1, 512, 7, 7), dtype="float32") = R.nn.relu(lv112) + lv116: R.Tensor((1, 512, 7, 7), dtype="float32") = R.nn.conv2d( + lv115, + resnetv22_stage4_conv4_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv117: R.Tensor((1, 512, 7, 7), dtype="float32") = R.add(lv116, lv104) + lv118: R.Tuple( + R.Tensor((1, 512, 7, 7), dtype="float32"), + R.Tensor((512,), dtype="float32"), + R.Tensor((512,), dtype="float32"), + ) = R.nn.batch_norm( + lv117, + resnetv22_batchnorm2_gamma, + resnetv22_batchnorm2_beta, + resnetv22_batchnorm2_running_mea, + resnetv22_batchnorm2_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv119: R.Tensor((1, 512, 7, 7), dtype="float32") = lv118[0] + lv120: R.Tensor((512,), dtype="float32") = lv118[1] + lv121: R.Tensor((512,), dtype="float32") = lv118[2] + lv122: R.Tensor((1, 512, 7, 7), dtype="float32") = R.nn.relu(lv119) + lv123: R.Tensor((1, 512, 1, 1), dtype="float32") = R.mean( + lv122, axis=[2, 3], keepdims=True + ) + lv124: R.Tensor((1, 512), dtype="float32") = R.reshape(lv123, R.shape([1, 512])) + lv125: R.Tensor((512, 1000), dtype="float32") = R.permute_dims( + resnetv22_dense0_weight, axes=[1, 0] + ) + lv126: R.Tensor((1, 1000), dtype="float32") = R.matmul( + lv124, lv125, out_dtype="void" + ) + gv: R.Tensor((1, 1000), dtype="float32") = R.add(lv126, resnetv22_dense0_bias) + R.output(gv) + return gv + + verify(Resnet, target) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/relax/texture/test_ops.py b/tests/python/relax/texture/test_ops.py new file mode 100644 index 000000000000..7011852aa3ab --- /dev/null +++ b/tests/python/relax/texture/test_ops.py @@ -0,0 +1,714 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import tvm +import tvm.testing +from tvm.relax.transform import ConvertLayout, Normalize +from tvm.script.parser import ir as I, relax as R, tir as T +from tvm.relax.transform.legalize_ops import adreno as legalize_adreno +from adreno_utils import verify + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 64, 56, 56), "float32"), w: R.Tensor((32, 64, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 32, 54, 54), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + R.output(gv) + return gv + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_relu(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 26, 26), "float32") = R.nn.relu(gv) + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_relu_conv2d_relu(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + x0: R.Tensor((2, 16, 28, 28), "float32") = R.nn.relu(x) + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x0, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 26, 26), "float32") = R.nn.relu(gv) + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_relu_tanh(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 26, 26), "float32") = R.nn.relu(gv) + gv3: R.Tensor((2, 4, 26, 26), "float32") = R.tanh(gv2) + R.output(gv3) + return gv3 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_add(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), + w: R.Tensor((4, 16, 3, 3), "float32"), + bias: R.Tensor((2, 4, 26, 26), "float32"), + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 26, 26), "float32") = R.add(gv, bias) + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_sum(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), 
"float32") + ) -> R.Tensor(None, "float32", ndim=2): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4), "float32") = R.sum(gv, axis=[2, 3]) + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_sum_keepdims(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=2): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 1, 1), "float32") = R.sum(gv, axis=[2, 3], keepdims=True) + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_sum_reduce(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=2): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 26), "float32") = R.sum(gv, axis=[1, 2]) + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_transpose(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((26, 26, 4, 2), "float32") = R.permute_dims(gv, axes=[3, 2, 1, 0]) + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_expand_dims(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=6): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 1, 4, 1, 26, 26), "float32") = R.expand_dims(gv, axis=(-3, 1)) + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_squeeze(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((1, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=3): + with R.dataflow(): + gv: R.Tensor((1, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((4, 26, 26), "float32") = R.squeeze(gv, axis=[0]) + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_strided_slice(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 2, 9, 7), dtype="float32") = R.strided_slice( + gv, begin=[0, 0, 0], end=[4, 26, 26], strides=[2, 3, 4], axes=[1, 2, 3] + ) + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl 
+@tvm.testing.parametrize_targets("opencl") +def test_conv2d_relu_concat(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 26, 26), "float32") = R.nn.relu(gv) + gv3: R.Tensor((2, 8, 26, 26), "float32") = R.concat((gv, gv2), axis=1) + R.output(gv3) + return gv3 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_relu_concat_split(target): + @I.ir_module + class Input: + @R.function + def main(x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32")): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 26, 26), "float32") = R.nn.relu(gv) + gv3: R.Tensor((2, 8, 26, 26), "float32") = R.concat((gv, gv2), axis=1) + gv4 = R.split(gv3, indices_or_sections=2, axis=1) + # TODO @Siva: Multi value return have an issue at runtime. + gv5 = gv4[0] + R.output(gv5) + return gv5 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_relu_concat_split_transpose_concat(target): + @I.ir_module + class Input: + @R.function + def main(x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32")): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 26, 26), "float32") = R.nn.relu(gv) + gv3: R.Tensor((2, 8, 26, 26), "float32") = R.concat((gv, gv2), axis=1) + gv4 = R.split(gv3, indices_or_sections=2, axis=1) + gv5: R.Tensor((26, 26, 4, 2), "float32") = R.permute_dims(gv4[0], axes=[3, 2, 1, 0]) + gv6: R.Tensor((26, 26, 4, 2), "float32") = R.permute_dims(gv4[1], axes=[3, 2, 1, 0]) + gv7: R.Tensor((26, 26, 8, 2), "float32") = R.concat((gv5, gv6), axis=2) + R.output(gv7) + return gv7 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_maxpool2d(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2 = R.nn.max_pool2d( + gv, + pool_size=[2, 2], + strides=[2, 2], + padding=[0, 0], + layout="NCHW", + out_layout="NCHW", + ) + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_avgpool2d(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2 = R.nn.adaptive_avg_pool2d(gv, output_size=[13, 13], layout="NCHW") + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_softmax(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), 
"float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2 = R.nn.softmax(gv, axis=1) + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_layernorm(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), + w: R.Tensor((4, 16, 3, 3), "float32"), + gamma: R.Tensor((26, 26), dtype="float32"), + beta: R.Tensor((26, 26), dtype="float32"), + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 26, 26), dtype="float32") = R.nn.layer_norm( + gv, gamma, beta, axes=[-2, -1] + ) + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_binary_broadcast(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), + w: R.Tensor((4, 16, 3, 3), "float32"), + bias: R.Tensor((26, 26), "float32"), + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 26, 26), "float32") = R.add(gv, bias) + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_binary_ewise_scalar(target): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 26, 26), "float32") = R.add(gv, R.const(1, "float32")) + R.output(gv2) + return gv2 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_residual_block(target): + """ + - some kind of residual block followed by convolution to have texture after residual block + - scalar data type verification which should be mapped to global memory scope + layout_transform (NCHW->NCHW4c) + | <- buffer + conv2d (1) <- to get textures as output + / \ + conv2d (2) | + \ / + add <- add should be fused into conv2d (2) + multiply to scalar <- buffer to the input of multiply scalar value + relu + | <- texture in intermediate tensor + conv2d (3) + relu + | <- buffer + layout_transform (NCHW4c->NCHW) + """ + + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 32, 40, 40), "float32"), + w1: R.Tensor((32, 32, 2, 2), "float32"), + w2: R.Tensor((32, 32, 1, 1), "float32"), + w3: R.Tensor((32, 32, 2, 2), "float32"), + bias: R.Tensor((1, 32, 1, 1), "float32"), + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv = R.nn.conv2d(x, w1, strides=[2, 2], out_dtype="float32") + gv1 = R.add(gv, bias) + gv2 = R.nn.relu(gv1) + gv3 = R.nn.conv2d(gv2, w2, strides=[1, 1], out_dtype="float32") + bias_1 = R.multiply(bias, R.const(0.15, "float32")) + gv4 = R.add(gv3, bias_1) + gv5 = R.nn.relu(gv4) + gv6 = R.nn.conv2d(gv5, w3, strides=[2, 2], out_dtype="float32") + gv7 = R.nn.relu(gv6) + R.output(gv7) + return gv7 + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_conv2d_fallback_to_buffer_conv2d(target): + """ + layout_transform (NCHW->NCHW4c) + | <- texture + conv2d (1) <- textures as output + / \ + conv2d (2) conv2d (3) <- conv2d (2) 
emits texture, conv2d (3) emits buffer + \ / <- concat shouldn't support textures here + concatenation + | <- buffer + layout_transform (NCHW4c->NCHW) + """ + + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 32, 40, 40), "float32"), + w1: R.Tensor((96, 32, 2, 2), "float32"), + w2: R.Tensor((32, 96, 2, 2), "float32"), + w3: R.Tensor((5, 96, 2, 2), "float32"), + bias1: R.Tensor((1, 96, 1, 1), "float32"), + bias2: R.Tensor((1, 32, 1, 1), "float32"), + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv = R.nn.conv2d(x, w1, strides=[2, 2], out_dtype="float32") + gv1 = R.add(gv, bias1) + gv2 = R.nn.relu(gv1) + gv3 = R.nn.conv2d(gv2, w2, strides=[2, 2], out_dtype="float32") + gv4 = R.add(gv3, bias2) + gv5 = R.nn.relu(gv4) + gv6 = R.nn.conv2d(gv2, w3, strides=[2, 2], out_dtype="float32") + gv7 = R.concat((gv3, gv6), axis=1) + R.output(gv7) + return gv7 + + verify(Input, "opencl") + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_conv2d_conv2d_concat(target): + """ + layout_transform (NCHW->NCHW4c) + | <- texture + conv2d (1) <- textures as output + / \ + conv2d (2) conv2d (3) + \ / <- concat does support textures here + concatenation + | <- buffer + layout_transform (NCHW4c->NCHW) + """ + + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 32, 40, 40), "float32"), + w1: R.Tensor((96, 32, 2, 2), "float32"), + w2: R.Tensor((32, 96, 2, 2), "float32"), + w3: R.Tensor((8, 96, 2, 2), "float32"), + bias1: R.Tensor((1, 96, 1, 1), "float32"), + bias2: R.Tensor((1, 32, 1, 1), "float32"), + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv = R.nn.conv2d(x, w1, strides=[2, 2], out_dtype="float32") + gv1 = R.add(gv, bias1) + gv2 = R.nn.relu(gv1) + gv3 = R.nn.conv2d(gv2, w2, strides=[2, 2], out_dtype="float32") + gv4 = R.add(gv3, bias2) + gv5 = R.nn.relu(gv4) + gv6 = R.nn.conv2d(gv2, w3, strides=[2, 2], out_dtype="float32") + gv7 = R.concat((gv3, gv6), axis=1) + R.output(gv7) + return gv7 + + verify(Input, "opencl") + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_pooling_branching_texture_params(target): + """ + Verification of the pooling and many branches having textures + layout_transform (NCHW->NCHW4c) + | <- texture + conv2d (0) <- to get textures + | <- textures + pooling + / \ \ <- textures + conv2d (1) conv2d (2) conv2d (3) + \ / | + add | <- to have the only one output, will be fused + \ / + add <- to have the only one output, will be fused + | <- buffer + layout_transform (NCHW4c->NCHW) + """ + + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 32, 40, 40), "float32"), + w1: R.Tensor((32, 32, 1, 1), "float32"), + w2: R.Tensor((32, 32, 2, 2), "float32"), + w3: R.Tensor((32, 32, 1, 1), "float32"), + w4: R.Tensor((32, 32, 2, 2), "float32"), + bias1: R.Tensor((1, 32, 1, 1), "float32"), + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv = R.nn.conv2d(x, w1, strides=[1, 1], out_dtype="float32") + gv1 = R.nn.max_pool2d(gv, pool_size=[2, 2], strides=[2, 2]) + gv2 = R.nn.conv2d( + gv1, w2, padding=[0, 0, 1, 1], strides=[1, 1], out_dtype="float32" + ) + gv3 = R.add(gv2, bias1) + gv4 = R.nn.relu(gv3) + gv5 = R.nn.conv2d( + gv1, w3, padding=[0, 0, 0, 0], strides=[1, 1], out_dtype="float32" + ) + gv6 = R.nn.conv2d( + gv1, w4, padding=[0, 1, 1, 0], strides=[1, 1], out_dtype="float32" + ) + gv7 = R.nn.relu(gv6) + gv8 = R.add(gv2, gv5) + gv9 = R.add(gv8, gv6) + R.output(gv9) + return gv9 + + verify(Input, target) + + 
+@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_injective_inputs1(target): + """ + Input + / \ + / | + | / + conv2d (1) / + | / + conv2d (2) mean + / \ / + | | \ / + | | (3) add + | | | + | \ / + \ mul + \ / + add + + """ + + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((1, 4, 40, 40), "float32"), + w1: R.Tensor((4, 4, 3, 3), "float32"), + w2: R.Tensor((4, 4, 3, 3), "float32"), + w3: R.Tensor((4, 4, 3, 3), "float32"), + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + mean = R.mean(x, axis=1, keepdims=True) + conv1 = R.nn.conv2d( + x, w1, padding=[1, 1, 1, 1], strides=[1, 1], out_dtype="float32" + ) + conv2 = R.nn.conv2d( + conv1, w2, padding=[1, 1, 1, 1], strides=[1, 1], out_dtype="float32" + ) + ad3 = R.add(conv1, conv2) + ad1 = R.add(mean, conv1) + ad2 = R.multiply(ad1, conv2) + gv = R.add(ad3, ad2) + R.output(gv) + return gv + + verify(Input, target) + + +@tvm.testing.requires_opencl +@tvm.testing.parametrize_targets("opencl") +def test_injective_nwo_inputs2(target): + """ + Input + / \ + | \ + conv2d \ + | / + conv2d mean / + / \ / + add | \ | + | | \ | + | | \ / + | | (3) add + | | | + | \ / + | \ / + \ mul + \ / + add + + """ + + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((1, 4, 40, 40), "float32"), + w1: R.Tensor((4, 4, 3, 3), "float32"), + w2: R.Tensor((4, 4, 3, 3), "float32"), + w3: R.Tensor((4, 4, 3, 3), "float32"), + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + mean = R.mean(x, axis=1, keepdims=True) + conv1 = R.nn.conv2d( + x, w1, padding=[1, 1, 1, 1], strides=[1, 1], out_dtype="float32" + ) + conv2 = R.nn.conv2d( + conv1, w2, padding=[1, 1, 1, 1], strides=[1, 1], out_dtype="float32" + ) + ad3 = R.add(conv1, conv2) + ad1 = R.add(mean, conv1) + ad2 = R.multiply(ad1, conv2) + gv = R.add(ad2, ad3) + R.output(gv) + return gv + + verify(Input, target) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/relax/texture/test_texture_nd.py b/tests/python/relax/texture/test_texture_nd.py new file mode 100644 index 000000000000..520f309c6e41 --- /dev/null +++ b/tests/python/relax/texture/test_texture_nd.py @@ -0,0 +1,202 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import tvm +import tvm.testing +import pytest +import tempfile +import numpy as np + +from tvm import ( + relax, + IRModule, +) +from tvm.relax.transform.legalize_ops import adreno as legalize_adreno +from tvm.script import ir as I, tir as T +from tvm.target import Target +from tvm.contrib import ndk +from tvm import tir, DataType +from tvm.rpc import connect_tracker + + +def get_rpc(): + """ + Establish an RPC connection to the remote device. 
+ + Returns + ------- + tvm.rpc.RPCSession or None + The RPC session object if RPC_TARGET is set; otherwise, None. + """ + rpc_target = os.getenv("RPC_TARGET", None) + if rpc_target: + host = os.getenv("TVM_TRACKER_HOST", "localhost") + port = int(os.getenv("TVM_TRACKER_PORT", 9090)) + device_key = os.getenv("RPC_DEVICE_KEY", "android") + tracker = connect_tracker(host, port) + return tracker.request(device_key, priority=1, session_timeout=1000) + else: + return None + + +def preprocess_pipeline(mod: IRModule) -> IRModule: + desired_layouts = {"relax.nn.conv2d": ["NCHW16c", "OIHW16o", "NCHW16c"]} + seq = tvm.transform.Sequential( + [ + tvm.tir.transform.BindTarget(Target.current(allow_none=False)), + tvm.relax.transform.FoldConstant(), + tvm.relax.transform.DecomposeOpsForInference(), + tvm.relax.transform.FoldConstant(), + tvm.tir.transform.BindTarget(tvm.target.Target.current(allow_none=False)), + tvm.relax.transform.ConvertLayout(desired_layouts), + tvm.relax.transform.Normalize(), + tvm.relax.transform.FoldConstant(), + tvm.relax.transform.LegalizeOps(), + tvm.relax.transform.LegalizeOps( + {"relax.nn.conv2d": legalize_adreno.conv2d_NCHWc_OIHWo} + ), + tvm.relax.transform.FoldConstant(), + tvm.relax.transform.AnnotateTIROpPattern(), + tvm.relax.transform.FuseOps(), + tvm.relax.transform.FuseTIR(), + tvm.relax.transform.DeadCodeElimination(), + tvm.relax.transform.Normalize(), + ] + ) + mod = seq(mod) + return mod + + +def postprocess_pipeline(mod: IRModule) -> IRModule: + seq = tvm.transform.Sequential( + [ + tvm.relax.transform.ToNonDataflow(), + tvm.relax.transform.RemovePurityChecking(), + tvm.relax.transform.CallTIRRewrite(), + tvm.relax.transform.Normalize(), + tvm.relax.transform.StaticPlanBlockMemory(), + tvm.relax.transform.LowerAllocTensor(), + tvm.relax.transform.KillAfterLastUse(), + tvm.relax.transform.LowerRuntimeBuiltin(), + tvm.relax.transform.VMShapeLower(), + tvm.relax.transform.AttachGlobalSymbol(), + ] + ) + mod = seq(mod) + return mod + + +@tvm.testing.requires_rpc +@tvm.testing.requires_adreno_opencl +@pytest.mark.parametrize("backend", ["opencl"]) +@pytest.mark.parametrize("dtype", ["int8", "float16", "int16", "float32", "int32"]) +@pytest.mark.parametrize("channel_size", [64, 128]) +@pytest.mark.parametrize("read_width", [1, 2, 4, 8, 16]) +def test_texture_copy(backend, dtype, channel_size, read_width): + remote = get_rpc() + M, N, K = (256, 1024, 128) + lanes = channel_size // DataType(dtype).bits + if read_width > lanes: + return + + @I.ir_module + class TextureCopy: + @T.prim_func + def main(A: T.Buffer((M, N), dtype), B: T.Buffer((M, N), dtype)): + T.func_attr({"global_symbol": "main"}) + for li, lj in T.grid(M, N): + with T.block("Copy"): + i, j = T.axis.remap("SS", [li, lj]) + B[i, j] = A[i, j] + + def schedule_texture_read(sch: tir.Schedule): + B_blk = sch.get_block("Copy") + Ai_block = sch.cache_read(B_blk, 0, "global.texture") + sch.transform_layout(Ai_block, ("write", 0), lambda i, j: (i, j // lanes, j % lanes)) + + def schedule_default(blk, lanes): + i, j = sch.get_loops(blk) + jo, jv = sch.split(j, [None, lanes]) + + b = sch.fuse(i, jo) + bx, tx = sch.split(b, [None, 256]) + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") + + sch.vectorize(jv) + + schedule_default(Ai_block, lanes) + schedule_default(B_blk, read_width) + + mod = TextureCopy + + if remote is None: + target = Target(backend + " -device=adreno") + else: + target = Target(backend + " -device=adreno", "llvm -mtriple=aarch64-linux-android") + + with target: + mod = 
preprocess_pipeline(mod) + sch = tir.Schedule(mod) + schedule_texture_read(sch) + mod = postprocess_pipeline(sch.mod) + + ex = relax.build(mod, target) + load_path = "vm_library.so" + inputs = [np.random.randint(0, 128, (M, N)).astype(dtype), np.zeros((M, N), dtype)] + with tempfile.TemporaryDirectory() as temp_dir: + if remote is not None: + path = temp_dir + "/" + load_path + ex.export_library(path, fcompile=ndk.create_shared, options=["-shared", "-fPIC", "-lm"]) + remote.upload(path) + rexec = remote.load_module(load_path) + dev = remote.cl() + if "vdevice" in mod.global_infos: + device_arr = [dev for ii in range(len(mod.global_infos["vdevice"]))] + else: + device_arr = [dev] + vm = relax.VirtualMachine(rexec, device_arr) + else: + # local execution + if "opencl" in backend: + dev = tvm.opencl(0) + elif "vulkan" in backend: + dev = tvm.vulkan(0) + else: + raise RuntimeError("Unsupported backend") + + if "vdevice" in mod.global_infos: + device_arr = [dev for ii in range(len(mod.global_infos["vdevice"]))] + else: + device_arr = [dev] + vm = relax.VirtualMachine(ex, device_arr) + + inps = [tvm.runtime.tensor(inp, dev) for inp in inputs] + vm["main"](*inps) + + out1 = inps[-1].numpy() + out2 = inps[0].numpy() + + if remote: + remote.get_function("CloseRPCConnection")() + + np.testing.assert_equal(out1, out2) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/scripts/setup-adreno-env.sh b/tests/scripts/setup-adreno-env.sh index a35a633e1dfd..a401fb808acb 100755 --- a/tests/scripts/setup-adreno-env.sh +++ b/tests/scripts/setup-adreno-env.sh @@ -103,6 +103,7 @@ case ${ENVIRONMENT} in adb shell "mkdir -p ${TARGET_FOLDER}" adb push build-adreno-target/tvm_rpc ${TARGET_FOLDER}/tvm_rpc-${USER} adb push build-adreno-target/libtvm_runtime.so ${TARGET_FOLDER} + adb push build-adreno-target/lib/libtvm_ffi.so ${TARGET_FOLDER} if [ -f ${CPP_LIB} ] ; then adb push ${CPP_LIB} ${TARGET_FOLDER} fi diff --git a/tests/scripts/task_build.py b/tests/scripts/task_build.py index 96fc781744f9..2f9f91df8a0e 100755 --- a/tests/scripts/task_build.py +++ b/tests/scripts/task_build.py @@ -58,6 +58,8 @@ logging.info(f"Using sccache region: {env['SCCACHE_REGION']}") else: logging.info(f"No sccache bucket set, using local cache") + if "SCCACHE_SERVER_PORT" in os.environ: + env["SCCACHE_SERVER_PORT"] = os.getenv("SCCACHE_SERVER_PORT") env["CXX"] = "/opt/sccache/c++" env["CC"] = "/opt/sccache/cc" diff --git a/tests/scripts/task_build_adreno_bins.sh b/tests/scripts/task_build_adreno_bins.sh index 8b85a27277e0..bc5a075b2325 100755 --- a/tests/scripts/task_build_adreno_bins.sh +++ b/tests/scripts/task_build_adreno_bins.sh @@ -39,7 +39,6 @@ echo set\(USE_OPENCL ON\) >> config.cmake fi echo set\(USE_RPC ON\) >> config.cmake echo set\(USE_CPP_RPC ON\) >> config.cmake -#echo set\(USE_CPP_RTVM ON\) >> config.cmake echo set\(USE_LIBBACKTRACE AUTO\) >> config.cmake echo set\(USE_KALLOC_ALIGNMENT 32\) >> config.cmake @@ -52,6 +51,9 @@ echo set\(USE_OPENCL_GTEST ON\) >> config.cmake echo set\(USE_OPENCL_EXTN_QCOM ON\) >> config.cmake cmake -DANDROID_ABI=arm64-v8a \ + -DCMAKE_SYSTEM_NAME=Android \ + -DCMAKE_ANDROID_ARCH_ABI="arm64-v8a" \ + -DCMAKE_SYSROOT="${ANDROID_NDK_HOME}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/" \ -DANDROID_PLATFORM=android-28 \ -DCMAKE_SYSTEM_VERSION=1 \ -DCMAKE_FIND_ROOT_PATH="${ADRENO_OPENCL}" \ diff --git a/tests/scripts/task_python_adreno.sh b/tests/scripts/task_python_adreno.sh index 1714a3c06358..b381fddc2427 100755 --- a/tests/scripts/task_python_adreno.sh +++ 
b/tests/scripts/task_python_adreno.sh
@@ -84,5 +84,14 @@ for node_id in $RELAX_TESTS; do
   i=$((i+1))
 done
 
+# Texture tests
+RELAX_TESTS=$(./ci/scripts/jenkins/pytest_ids.py --folder tests/python/relax/texture 2> /dev/null | grep -v dlerror)
+i=0
+for node_id in $RELAX_TESTS; do
+  echo "$node_id"
+  CXX=${TVM_NDK_CC} run_pytest ctypes "$TVM_INTEGRATION_TESTSUITE_NAME-texture-relax-$i" "$node_id" --reruns=0
+  i=$((i+1))
+done
+
 kill ${TRACKER_PID}
 kill ${DEVICE_PID}
diff --git a/tests/scripts/task_python_unittest_gpuonly.sh b/tests/scripts/task_python_unittest_gpuonly.sh
index 6d28d8f30f8b..776d29fda07f 100755
--- a/tests/scripts/task_python_unittest_gpuonly.sh
+++ b/tests/scripts/task_python_unittest_gpuonly.sh
@@ -34,3 +34,13 @@ export TVM_UNITTEST_TESTSUITE_NAME=python-codegen-vulkan
 source tests/scripts/setup-pytest-env.sh
 
 run_pytest ${TVM_UNITTEST_TESTSUITE_NAME}-1 tests/python/codegen/test_target_codegen_vulkan.py
+
+
+# Adreno: run the comprehensive texture tests (via OpenCL on the Nvidia GPU CI node) and the CLML codegen tests.
+export PYTEST_ADDOPTS=""
+export TVM_TEST_TARGETS="opencl"
+export TVM_UNITTEST_TESTSUITE_NAME=python-codegen-clml-texture
+
+source tests/scripts/setup-pytest-env.sh
+
+run_pytest ${TVM_UNITTEST_TESTSUITE_NAME}-1 tests/python/relax/texture/
diff --git a/tests/scripts/unity/task_python_relax_gpuonly.sh b/tests/scripts/unity/task_python_relax_gpuonly.sh
index acbcce44f279..19ab942a1193 100755
--- a/tests/scripts/unity/task_python_relax_gpuonly.sh
+++ b/tests/scripts/unity/task_python_relax_gpuonly.sh
@@ -18,7 +18,7 @@ export TVM_TEST_TARGETS="llvm;cuda"
 export PYTEST_ADDOPTS="-m gpu $PYTEST_ADDOPTS"
 
-export TVM_RELAY_TEST_TARGETS="cuda"
+export TVM_RELAY_TEST_TARGETS="cuda;opencl"
 export TVM_INTEGRATION_TESTSUITE_NAME=python-integration-gpu
 export TVM_INTEGRATION_GPU_ONLY=1
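
For anyone reproducing these suites outside the Jenkins scripts above, the commands below are a minimal sketch rather than part of the patch: they assume an OpenCL-capable local device (or a tracker-connected Adreno board), that the adreno_utils helper imported by the tests is importable from the test folder, and they reuse the environment variables read by get_rpc() in test_texture_nd.py; the tracker host, port, and device key values are placeholders.

  # Run the new texture op tests on a local OpenCL device.
  export TVM_TEST_TARGETS="opencl"
  python -m pytest tests/python/relax/texture/test_ops.py -v

  # Or drive a remote Adreno board through the RPC tracker (placeholder values,
  # matching the defaults in get_rpc()).
  export RPC_TARGET=adreno
  export TVM_TRACKER_HOST=localhost
  export TVM_TRACKER_PORT=9090
  export RPC_DEVICE_KEY=android
  python -m pytest tests/python/relax/texture/test_texture_nd.py -v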