From b0a9307da2f22820432df0e17ae57b537ec78ab1 Mon Sep 17 00:00:00 2001 From: Siva Date: Thu, 27 Nov 2025 13:54:34 +0530 Subject: [PATCH 01/17] Texture Lowering --- include/tvm/runtime/tensor.h | 32 +- include/tvm/tir/transform.h | 6 + python/tvm/dlight/adreno/__init__.py | 5 + python/tvm/dlight/adreno/convolution.py | 249 ++--- python/tvm/dlight/adreno/fallback.py | 177 ++++ python/tvm/dlight/adreno/layout_transform.py | 133 +++ python/tvm/dlight/adreno/pool.py | 92 ++ python/tvm/dlight/adreno/utils.py | 96 ++ python/tvm/dlight/analysis/__init__.py | 1 + python/tvm/dlight/analysis/common_analysis.py | 28 + python/tvm/dlight/base/__init__.py | 9 + python/tvm/dlight/base/analysis.py | 442 ++++++++ python/tvm/dlight/base/common_schedules.py | 2 +- python/tvm/ir/global_info.py | 8 + python/tvm/relax/op/memory/memory.py | 12 +- python/tvm/relax/op/vm/vm.py | 12 +- .../transform/lower_gpu_ipc_alloc_storage.py | 2 +- python/tvm/tir/pipeline.py | 2 + python/tvm/tir/transform/transform.py | 26 + .../backend/adreno/annotate_custom_storage.cc | 90 +- src/relax/backend/vm/codegen_vm.cc | 26 +- src/relax/backend/vm/codegen_vm_tir.cc | 20 +- src/relax/backend/vm/lower_runtime_builtin.cc | 10 +- src/relax/op/op.cc | 84 +- src/relax/op/op_common.h | 13 +- src/relax/transform/call_tir_rewrite.cc | 29 +- src/relax/transform/lower_alloc_tensor.cc | 54 +- .../specialize_primfunc_based_on_callsite.cc | 5 +- .../transform/static_plan_block_memory.cc | 91 +- src/relax/transform/utils.h | 25 + src/runtime/opencl/opencl_common.h | 14 +- src/runtime/opencl/opencl_device_api.cc | 51 +- src/runtime/opencl/opencl_module.cc | 1 - src/runtime/tensor.cc | 17 +- src/runtime/texture.h | 31 +- src/runtime/vm/builtin.cc | 24 +- src/runtime/vm/executable.cc | 4 +- src/runtime/vm/vm.cc | 2 +- src/target/source/codegen_opencl.cc | 77 +- src/target/target_kind.cc | 1 + src/tir/transforms/inject_texture_alloc.cc | 113 ++ src/tir/transforms/texture_flatten.cc | 228 ++++ src/tir/transforms/vectorize_loop.cc | 22 +- .../test_transform_annotate_custom_scope.py | 1 + tests/python/relax/test_transform.py | 2 +- .../test_transform_legalize_ops_manipulate.py | 2 +- tests/python/relax/texture/adreno_utils.py | 182 ++++ tests/python/relax/texture/test_network.py | 992 ++++++++++++++++++ tests/python/relax/texture/test_ops.py | 825 +++++++++++++++ tests/python/relax/texture/test_texture_nd.py | 168 +++ ...test_tir_transform_inject_texture_alloc.py | 87 ++ tests/scripts/setup-adreno-env.sh | 1 + tests/scripts/task_build_adreno_bins.sh | 4 +- tests/scripts/task_python_adreno.sh | 9 + 54 files changed, 4287 insertions(+), 352 deletions(-) create mode 100644 python/tvm/dlight/adreno/fallback.py create mode 100644 python/tvm/dlight/adreno/layout_transform.py create mode 100644 python/tvm/dlight/adreno/pool.py create mode 100644 python/tvm/dlight/adreno/utils.py create mode 100644 python/tvm/dlight/base/analysis.py create mode 100644 src/tir/transforms/inject_texture_alloc.cc create mode 100644 src/tir/transforms/texture_flatten.cc create mode 100644 tests/python/relax/texture/adreno_utils.py create mode 100644 tests/python/relax/texture/test_network.py create mode 100644 tests/python/relax/texture/test_ops.py create mode 100644 tests/python/relax/texture/test_texture_nd.py create mode 100644 tests/python/tir-transform/test_tir_transform_inject_texture_alloc.py diff --git a/include/tvm/runtime/tensor.h b/include/tvm/runtime/tensor.h index 615cfd8cccfe..ab15efc41ced 100644 --- a/include/tvm/runtime/tensor.h +++ b/include/tvm/runtime/tensor.h @@ 
-36,6 +36,7 @@ #include #include +#include #include #include @@ -188,14 +189,25 @@ class Tensor : public tvm::ffi::Tensor { */ TVM_DLL static void CopyFromBytes(const DLTensor* to, void* from, size_t nbytes, TVMStreamHandle stream = nullptr); + + TVM_DLL void SetScope(ffi::String scope); + TVM_DLL ffi::String GetScope() const; + + protected: + /*! + * \brief The memory scope + * represents the underlaying scope information of device + */ + ffi::String scope = "global"; }; /*! * \brief Save a DLTensor to stream * \param strm The output stream * \param tensor The tensor to be saved. + * \param scope The tensor storage scope. */ -inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor); +inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor, ffi::String scope = "global"); inline void Tensor::CopyFrom(const DLTensor* other) { ICHECK(data_ != nullptr); @@ -220,10 +232,11 @@ inline void Tensor::CopyTo(const Tensor& other) const { } /*! \brief Magic number for Tensor file */ -constexpr uint64_t kTVMTensorMagic = 0xDD5E40F096B4A13F; +constexpr uint64_t kTVMNDArrayMagic = 0xDD5E40F096B4A13F; +constexpr uint64_t kTVMNDArrayScopedMagic = 0xDD5E40F096B4A13E; -inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor) { - uint64_t header = kTVMTensorMagic, reserved = 0; +inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor, ffi::String scope) { + uint64_t header = kTVMNDArrayScopedMagic, reserved = 0; strm->Write(header); strm->Write(reserved); // Always save data as CPU context @@ -243,6 +256,7 @@ inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor) { strm->Write(tensor->dtype); int ndim = tensor->ndim; strm->WriteArray(tensor->shape, ndim); + strm->Write(std::string(scope)); int type_bytes = (tensor->dtype.bits + 7) / 8; int64_t num_elems = 1; for (int i = 0; i < ndim; ++i) { @@ -266,13 +280,14 @@ inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor) { return true; } -inline void Tensor::Save(dmlc::Stream* strm) const { SaveDLTensor(strm, operator->()); } +inline void Tensor::Save(dmlc::Stream* strm) const { SaveDLTensor(strm, operator->(), GetScope()); } inline bool Tensor::Load(dmlc::Stream* strm) { uint64_t header, reserved; ICHECK(strm->Read(&header)) << "Invalid DLTensor file format"; ICHECK(strm->Read(&reserved)) << "Invalid DLTensor file format"; - ICHECK(header == kTVMTensorMagic) << "Invalid DLTensor file format"; + ICHECK((header == kTVMNDArrayMagic) || (header == kTVMNDArrayScopedMagic)) + << "Invalid DLTensor file format"; Device dev; int ndim; DLDataType dtype; @@ -290,6 +305,11 @@ inline bool Tensor::Load(dmlc::Stream* strm) { for (int i = 0; i < ret->ndim; ++i) { num_elems *= ret->shape[i]; } + if (header == kTVMNDArrayScopedMagic) { + std::string scope; + strm->Read(&scope); + ret.SetScope(scope); + } int64_t data_byte_size; ICHECK(strm->Read(&data_byte_size)) << "Invalid DLTensor file format"; ICHECK(data_byte_size == num_elems * elem_bytes) << "Invalid DLTensor file format"; diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h index bf100dc49c4c..b825d78cc0b0 100644 --- a/include/tvm/tir/transform.h +++ b/include/tvm/tir/transform.h @@ -773,6 +773,12 @@ TVM_DLL Pass DefaultGPUSchedule(); */ TVM_DLL Pass UseAssumeToReduceBranches(); +/*! + * \brief Inject Texture Allocation intrensic. + * \return The pass. 
+ */ +TVM_DLL Pass InjectTextureAlloc(); + } // namespace transform } // namespace tir } // namespace tvm diff --git a/python/tvm/dlight/adreno/__init__.py b/python/tvm/dlight/adreno/__init__.py index ea2781455989..d77f41e3f55c 100644 --- a/python/tvm/dlight/adreno/__init__.py +++ b/python/tvm/dlight/adreno/__init__.py @@ -18,3 +18,8 @@ Adreno schedule rules. """ from .convolution import Conv2d +from .layout_transform import LayoutTransform +from .fallback import Fallback +from .pool import Pool2D + +# from .fallback import Fallback diff --git a/python/tvm/dlight/adreno/convolution.py b/python/tvm/dlight/adreno/convolution.py index fc2cc449a1c6..830e1924c446 100644 --- a/python/tvm/dlight/adreno/convolution.py +++ b/python/tvm/dlight/adreno/convolution.py @@ -16,215 +16,92 @@ # under the License. # pylint: disable=missing-docstring, invalid-name """A Conv2d schedule rule for Adreno GPU operators.""" -from dataclasses import dataclass -from typing import List, Optional +from typing import Optional, Union from tvm import tir from tvm.target import Target -from tvm.tir import IterVar -from tvm.tir.schedule.schedule import BlockRV -from ..analysis import BlockInfo, IterInfo +from .utils import schedule_inline_blocks, schedule_storage_annotate, schedule_default +from ..base import analysis from .base import AdrenoScheduleRule -def is_spatial_block(sch: tir.Schedule, block: BlockRV) -> bool: - block_stmt = sch.get(block) - iter_types = {iter_var.iter_type for iter_var in block_stmt.iter_vars} - return iter_types == {IterVar.DataPar} - - -def is_reduction_block(sch: tir.Schedule, block: BlockRV) -> bool: - block_stmt = sch.get(block) - iter_types = {iter_var.iter_type for iter_var in block_stmt.iter_vars} - return iter_types == {IterVar.CommReduce, IterVar.DataPar} - - -def _collect_producers(sch: tir.Schedule, block: tir.schedule.BlockRV): - result = [] - for producer in sch.get_producers(block): - result.append(producer) - result.extend(_collect_producers(sch, producer)) - return result - - -def _collect_consumers(sch: tir.Schedule, block: tir.schedule.BlockRV): - result = [] - for consumer in sch.get_consumers(block): - result.append(consumer) - result.extend(_collect_consumers(sch, consumer)) - return result - - -def get_block_info(sch: tir.Schedule, block: tir.schedule.BlockRV) -> BlockInfo: - def _iter_kind(loop: tir.IterVar) -> str: - return {tir.IterVar.DataPar: "S", tir.IterVar.CommReduce: "R"}.get(loop.iter_type, "O") - - def _is_reduction_block(block: tir.schedule.BlockRV): - for iter_var in sch.get(block).iter_vars: - if _iter_kind(iter_var) == "R": - return True - return False - - return BlockInfo( - name=sch.get(block).name_hint, - iters=[ - IterInfo( - kind=_iter_kind(iter_var), - var=iter_var.var, - dom=iter_var.dom.extent, - loop_rv=loop_rv, - ) - for loop_rv, iter_var in zip(sch.get_loops(block), sch.get(block).iter_vars) - ], - block_rv=block, - reduction_block=_is_reduction_block(block), - ) - - -def get_reduction_blocks(sch: tir.Schedule, blocks: List[tir.schedule.BlockRV]) -> bool: - # NOTE: We assume there is only one reduction block in the function - # all blocks are required to be spatial or reduction - if not all( - [is_reduction_block(sch, block) or is_spatial_block(sch, block) for block in blocks] - ): - return None - - # There is only one reduction block - reduction_blocks = [block for block in blocks if is_reduction_block(sch, block)] - if len(reduction_blocks) != 1: - return None - - return reduction_blocks[0] - - -def is_convolution(sch: tir.Schedule, block: 
tir.schedule.BlockRV): - # TODO: Use buffer access patterns to discover convolution type kernels instead of using name. - return ( - sch.get(block).name_hint.count("conv2d_NCHWc_OIHWo") - and "".join([iter_type.kind for iter_type in get_block_info(sch, block).iters]) - == "SSSSSRRR" - ) - - class Conv2d(AdrenoScheduleRule): """The schedule rule for convolution computation""" - @dataclass - class Config: - block_size_x: int = 8 - block_size_y: int = 8 - vector_size: int = 1 - unroll: int = 256 # 0 means no unroll - use_shared: bool = True - storage_align: bool = False - inner_x: bool = False - - def get_configs(self, target: Target) -> Config: - """Get the schedule config for the target""" - if target.kind.name == "cuda" or target.kind.name == "rocm": - return Conv2d.Config( - block_size_x=8, - block_size_y=16, - vector_size=2, - unroll=256, - use_shared=True, - storage_align=True, - inner_x=False, - ) - elif target.kind.name == "opencl" and ( - ("android" in str(target.host)) or ("adreno" in str(target.attrs)) - ): - return Conv2d.Config( - block_size_x=32, - block_size_y=4, - vector_size=8, - unroll=16, - use_shared=False, - storage_align=False, - inner_x=True, - ) - else: - return Conv2d.Config() + @staticmethod + def schedule_conv2d(sch: tir.Schedule, blk: tir.schedule.BlockRV): + # TODO: Loop Pattern mayn't be reliable, need to perform better analysis. + n, oc, oh, ow, ob, ic, kh, kw = sch.get_loops(blk) + + # bz, vz, tz = sch.split(oc, sch.sample_perfect_tile(oc, 3, 32)) + # by, vy, ty = sch.split(oh, sch.sample_perfect_tile(oh, 3, 32)) + # bx, vx, tx = sch.split(ow, sch.sample_perfect_tile(ow, 3, 32)) + + bz, vz, tz = sch.split(oc, [None, 8, 1], preserve_unit_iters=True) + by, vy, ty = sch.split(oh, [None, 1, 16], preserve_unit_iters=True) + bx, vx, tx = sch.split(ow, [None, 1, 16], preserve_unit_iters=True) + + bz = sch.fuse(n, bz, preserve_unit_iters=True) + sch.reorder(bz, by, bx, vz, vy, vx, tz, ty, tx, ob) + sch.bind(bz, "blockIdx.z") + sch.bind(by, "blockIdx.y") + sch.bind(bx, "blockIdx.x") + sch.bind(vz, "vthread.z") + sch.bind(vy, "vthread.y") + sch.bind(vx, "vthread.x") + sch.bind(tz, "threadIdx.z") + sch.bind(ty, "threadIdx.y") + sch.bind(tx, "threadIdx.x") + + rblk = sch.cache_read(blk, 0, "local") + ico, icb = sch.split(ic, [None, 4], preserve_unit_iters=True) + sch.reorder(ico, kh, kw, icb, ob) + + sch.compute_at(rblk, kw, preserve_unit_loops=True) + sch.vectorize(sch.get_loops(rblk)[-1]) + wblk = sch.cache_write(blk, 0, "local") + sch.reverse_compute_at(wblk, tx, preserve_unit_loops=True) + sch.vectorize(sch.get_loops(wblk)[-1]) + init_blk = sch.decompose_reduction(blk, tx) + sch.vectorize(sch.get_loops(init_blk)[-1]) def apply( # pylint: disable=too-many-locals,missing-docstring self, - func: tir.PrimFunc, + func: Union[tir.PrimFunc], target: Target, _: bool, ) -> Optional[tir.Schedule]: - if not isinstance(func, tir.PrimFunc) or not self.is_target_available(target): + if not (isinstance(func, (tir.PrimFunc, tir.Schedule))) or not self.is_target_available( + target + ): return None if isinstance(func, tir.PrimFunc): sch = tir.Schedule(func) + sch.work_on("main") + elif isinstance(func, tir.Schedule): + sch = func - # config = self.get_configs(target) - root_block = analysis.get_root_block(sch) + root_block = analysis.get_root_block(sch, sch.func_working_on) blocks = sch.get_child_blocks(root_block) - reduction_block = get_reduction_blocks(sch, blocks) + reduction_blocks = list( + filter(lambda block: analysis.get_block_info(sch, block).is_reduction(), blocks) + ) + 
remaining_blocks = [blk for blk in blocks if blk not in reduction_blocks] - if reduction_block is None: - return None - if not is_convolution(sch, reduction_block): + def is_convolution(blk): + block_info = analysis.get_block_info(sch, blk) + return "conv2d_NCHWc" in block_info.name + + if len(reduction_blocks) != 1 or not is_convolution(reduction_blocks[0]): return None - def schedule_data_pad(blk): - axes = sch.get_loops(blk) - axes, vec = axes[:-1], axes[-1] - axis = sch.fuse(*axes) - bx, ty, tx = sch.split(axis, [None, 16, 16]) - sch.bind(bx, "blockIdx.x") - sch.bind(ty, "threadIdx.y") - sch.bind(tx, "threadIdx.x") - sch.vectorize(vec) - - def schedule_conv2d(blk): - # TODO: Loop Pattern mayn't be reliable, need to perform better analysis. - n, oc, oh, ow, ob, ic, kh, kw = sch.get_loops(blk) - sch.reorder(n, oc, oh, ow, ic, kh, kw, ob) - main_lp = sch.fuse(n, oc, oh, ow) - bx, ty, tx = sch.split(main_lp, [None, 16, 16]) - sch.bind(tx, "threadIdx.x") - sch.bind(ty, "threadIdx.y") - sch.bind(bx, "blockIdx.x") - - ico, icv = sch.split(ic, [None, 4]) - sch.reorder(ico, kh, kw, icv, ob) - rblk = sch.cache_read(blk, 0, "local") - sch.compute_at(rblk, kw) - sch.vectorize(sch.get_loops(rblk)[-1]) - wblk = sch.cache_write(blk, 0, "local") - sch.reverse_compute_at(wblk, tx) - sch.vectorize(sch.get_loops(wblk)[-1]) - sch.vectorize(ob) - init_blk = sch.decompose_reduction(blk, ico) - sch.vectorize(sch.get_loops(init_blk)[-1]) - - def is_data_pad(block: tir.stmt.Block): - return is_spatial_block(sch, block) and tir.analysis.has_if_then_else(sch.get(block)) - - def schedule_conv2d_blocks(): - - # Do analysis to find block type - blocks = sch.get_child_blocks(root_block) - passed_reduction = False - for blk in blocks: - if is_reduction_block(sch, blk): - schedule_conv2d(blk) - passed_reduction = True - elif is_data_pad(blk): - schedule_data_pad(blk) - elif is_spatial_block(sch, blk): - try: - if not passed_reduction: - sch.compute_inline(blk) - else: - sch.reverse_compute_inline(blk) - except: # pylint: disable=W0702 - pass - else: - raise TypeError("Can't Schedule this Block", sch.get(blk)) - - schedule_conv2d_blocks() + # sch.set_scope(blocks[0], 0, "global.texture") + conv_blk = reduction_blocks[0] + Conv2d.schedule_conv2d(sch, conv_blk) + remaining_blocks = schedule_inline_blocks(sch, remaining_blocks) + schedule_default(sch, remaining_blocks) + schedule_storage_annotate(sch, remaining_blocks) + return sch diff --git a/python/tvm/dlight/adreno/fallback.py b/python/tvm/dlight/adreno/fallback.py new file mode 100644 index 000000000000..cec05cbf22a1 --- /dev/null +++ b/python/tvm/dlight/adreno/fallback.py @@ -0,0 +1,177 @@ +# licensed to the apache software foundation (asf) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Dlight Adreno Fallback Schedules""" + + +from typing import List, Union + +from tvm import tir +from tvm.target import Target +from ..base import analysis +from .base import AdrenoScheduleRule +from .utils import get_texture_storage + + +# pylint: disable=invalid-name,missing-function-docstring,unused-variable,unused-import +class Fallback(AdrenoScheduleRule): + """Texture Based Fallback Schedule(s) for Adreno""" + + @staticmethod + def schedule_inline_blocks( + sch: tir.Schedule, blocks: List[tir.schedule.BlockRV] + ) -> List[tir.schedule.BlockRV]: + """ + Auto Inlines Injective and Element-wise Operations while trying to omit data pad blocks... + """ + + if blocks is None: + root_blk = analysis.get_root_block(sch) + blocks = sch.get_child_blocks(root_blk) + + remaining_blocks = [] + for blk in blocks: + block_info = analysis.get_block_info(sch, blk) + if block_info.is_injective() and not block_info.is_data_pad(): + if len(block_info.consumers) == 1: + try: + sch.compute_inline(blk) + except Exception: # pylint: disable=broad-exception-caught + remaining_blocks.append(blk) + elif len(block_info.producers) == 1: + inlined_once = False + try: + # Would cause an issue inlining to producer with multiple consumers + while ( + len(sch.get_producers(blk)) == 1 + and len(sch.get_consumers(sch.get_producers(blk)[0])) == 1 + ): + sch.reverse_compute_inline(blk) + inlined_once = True + except Exception: # pylint: disable=broad-exception-caught + break + if not inlined_once: + remaining_blocks.append(blk) + else: + remaining_blocks.append(blk) + else: + remaining_blocks.append(blk) + return remaining_blocks + + @staticmethod + def schedule_annotate_storage(sch: tir.Schedule, func=get_texture_storage): + """Annotates intermediate buffers to textures whenever it's possible to do so""" + return + # pylint: disable=unreachable + root_blk = analysis.get_root_block(sch) + blocks = sch.get_child_blocks(root_blk) + + for blk in blocks: + block_info = analysis.get_block_info(sch, blk) + scope = func(block_info) + if scope is not None and len(sch.get_consumers(blk)) > 0: + sch.set_scope(blk, 0, scope) + + @staticmethod + def schedule_default(sch: tir.Schedule, blk: tir.schedule.BlockRV): + block_info = analysis.get_block_info(sch, blk) + + s_loops, r_loops, o_loops = [], [], [] + v_loop = block_info.write_bufs[0].assoc_lps[-1] + + for iter_info in block_info.iters: + if sch.get(iter_info.loop_rv) == sch.get(v_loop): + continue + {"S": s_loops, "R": r_loops, "O": o_loops}.get(iter_info.kind).append(iter_info.loop_rv) + + iter_vars = analysis.collect_block_iter_vars_used_in_access_region( + block_info.block_stmt, block_info.write_bufs[0].buf_region.region + ) + o_outer = [lp for lp in o_loops if sch.get(lp).var in iter_vars] + o_inner = [lp for lp in o_loops if sch.get(lp).var not in iter_vars] + + # Can't change loop order for opaque loops + if o_loops != o_outer + o_inner: + return + + o_outer.append(v_loop) + sch.reorder(*s_loops, *o_outer, *r_loops, *o_inner) + + assert s_loops + tgt = Target.current(allow_none=True) + + b = sch.fuse(*s_loops) + tx_extent = analysis.get_max_threads_per_block(tgt) if tgt is not None else 256 + bx, tx = sch.split(b, [None, tx_extent]) + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") + + if len(r_loops) > 1: + lp = [*s_loops, *o_outer][-1] + init_block = sch.decompose_reduction(blk, lp) + wblk = sch.cache_write(blk, 0, "local") + sch.compute_at(wblk, lp) + if v_loop: + sch.vectorize(sch.get_loops(init_block)[-1]) + sch.vectorize(sch.get_loops(wblk)[-1]) + elif 
v_loop is not None: + sch.vectorize(v_loop) + + @staticmethod + def schedule_fallback(sch): + root_block = analysis.get_root_block(sch) + blocks = sch.get_child_blocks(root_block) + + schedule_blocks = [ + blk + for blk in blocks + if analysis.get_block_info(sch, blk).is_reduction() + or analysis.get_block_info(sch, blk).is_data_pad() + ] + remaining_blocks = [blk for blk in blocks if blk not in schedule_blocks] + + for blk in schedule_blocks: + Fallback.schedule_default(sch, blk) + remaining_blocks = Fallback.schedule_inline_blocks(sch, remaining_blocks) + # TODO: Analyze unscheduled blocks to schedule instead of relying on remaining + for blk in remaining_blocks: + Fallback.schedule_default(sch, blk) + Fallback.schedule_annotate_storage(sch, schedule_blocks + remaining_blocks) + + def apply( # pylint: disable=too-many-locals + self, + func: tir.PrimFunc, + target: Target, + _: bool, + ) -> Union[None, tir.Schedule, List[tir.Schedule]]: + # pylint: disable=invalid-name + + if not isinstance(func, tir.PrimFunc) or not self.is_target_available(target): + return None + + sch = tir.Schedule(func) + root_block = analysis.get_root_block(sch) + blocks = sch.get_child_blocks(root_block) + + if any(len(sch.get_child_blocks(block)) != 0 for block in blocks): + return None + + block_infos = [analysis.get_block_info(sch, block) for block in blocks] + if not any("texture" in block.write_bufs[0].get_scope() for block in block_infos): + return None + + Fallback.schedule_fallback(sch) + return sch diff --git a/python/tvm/dlight/adreno/layout_transform.py b/python/tvm/dlight/adreno/layout_transform.py new file mode 100644 index 000000000000..c34bd8324aaf --- /dev/null +++ b/python/tvm/dlight/adreno/layout_transform.py @@ -0,0 +1,133 @@ +# Licensed to the apache software foundation (asf) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=invalid-name, unused-variable + +"Schedules for Texture Based Layout Transforms" +from typing import List, Union + +from tvm import tir +from tvm.target import Target +from ..base import analysis + +from .base import AdrenoScheduleRule + + +class LayoutTransform(AdrenoScheduleRule): + """Texture based Layout Transform Dlight Schedule for Adreno""" + + def __init__(self, use_op_name=True): + self.use_op_name = use_op_name + + # TODO: Try using Coalesced Writes... 
+ def apply( # pylint: disable=too-many-locals + self, + func: Union[tir.PrimFunc, tir.Schedule], + target: Target, + _: bool, + ) -> Union[None, tir.Schedule, List[tir.Schedule]]: + # pylint: disable=invalid-name + if not (isinstance(func, (tir.PrimFunc, tir.Schedule))) or not self.is_target_available( + target + ): + return None + + if isinstance(func, tir.PrimFunc): + sch = tir.Schedule(func) + sch.work_on("main") + elif isinstance(func, tir.Schedule): + sch = func + + root_block = analysis.get_root_block(sch, sch.func_working_on) + + if len(sch.get_child_blocks(root_block)) != 1: + return None + + blk = sch.get_child_blocks(root_block)[0] + block_info = analysis.get_block_info(sch, blk) + if not ( + (self.use_op_name and block_info.name == "te_layout_transform") + or (not self.use_op_name and block_info.is_layout_transform()) + ): + return None + + read_buf, write_buf = (block_info.read_bufs[0], block_info.write_bufs[0]) + lps = block_info.get_loops() + lpv_read, lpv_write = ( + read_buf.assoc_lps[-1], + write_buf.assoc_lps[-1], + ) + + if lpv_read is None or lpv_write is None: + return None + + vlen_read, vlen_write = read_buf.get_vecsize(), write_buf.get_vecsize() + local_cache = sch.get(lpv_read) != sch.get(lpv_write) or vlen_read != vlen_write + block_loops = [ + lp + for lp in lps + if sch.get(lp) != sch.get(lpv_read) and sch.get(lp) != sch.get(lpv_write) + ] + vec_loops = ( + [lpv_read, lpv_write] if sch.get(lpv_read) != sch.get(lpv_write) else (lpv_read,) + ) + sch.reorder(*block_loops, *vec_loops) + # TODO: Additional Pragmas and stuff + if local_cache: + if sch.get(lpv_read) != sch.get(lpv_write): + blp_read, vlp_read = sch.split( + lpv_read, [None, vlen_read], preserve_unit_iters=True + ) + blp_write, vlp_write = sch.split( + lpv_write, [None, vlen_write], preserve_unit_iters=True + ) + sch.reorder(blp_read, blp_write, vlp_read, vlp_write) + block_loops += [blp_read, blp_write] + rblk = sch.cache_read(blk, 0, "local") + sch.compute_at(rblk, block_loops[-1], preserve_unit_loops=True) + sch.vectorize(sch.get_loops(rblk)[-1]) + sch.vectorize(vlp_write) + else: + if vlen_read > vlen_write: + read_lp, vec_lp = sch.split(blk, [None, vlen_write], preserve_unit_iters=True) + rblk = sch.cache_read(blk, 0, "local") + sch.compute_at(rblk, read_lp, preserve_unit_loops=True) + sch.vectorize(sch.get_loops(rblk)[-1]) + sch.vectorize(vec_lp) + else: + rblk = sch.cache_read(blk, 0, "local") + sch.compute_at(rblk, block_loops[-1], preserve_unit_loops=True) + _, vread_lp = sch.split( + sch.get_loops(rblk)[-1], vlen_read, preserve_unit_iters=True + ) + sch.vectorize(vread_lp) + sch.vectorize(vlp_write) + else: + blp, vlp = sch.split(lpv_read, [None, vlen_read], preserve_unit_iters=True) + block_loops += [blp] + sch.vectorize(vlp) + + b = sch.fuse(*block_loops) + tx_extent = min(sch.get(b).extent, 256) + candidates = [1, 2, 4, 8, 16, 32] + ux = sch.sample_categorical( + candidates, [1 / len(candidates) for _ in range(len(candidates))] + ) + bx, tx = sch.split(b, [None, 256], preserve_unit_iters=True) + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") + return sch diff --git a/python/tvm/dlight/adreno/pool.py b/python/tvm/dlight/adreno/pool.py new file mode 100644 index 000000000000..20b829fa8929 --- /dev/null +++ b/python/tvm/dlight/adreno/pool.py @@ -0,0 +1,92 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=missing-docstring +""" Pool schedule rule for Adreno operators.""" + +from tvm import tir +from tvm.target import Target + +from .base import AdrenoScheduleRule +from ..base import analysis + + +# pylint: disable=invalid-name, unused-variable +class Pool2D(AdrenoScheduleRule): + def apply( # pylint: disable=too-many-locals,missing-docstring + self, + func: tir.PrimFunc, + target: Target, + _: bool, + ) -> tir.Schedule: + sch = tir.Schedule(func) + root = sch.get_block(name="root", func_name="main") + + blocks = sch.get_child_blocks(root) + blocks_names = [sch.get(blk).name_hint for blk in blocks] + + if not "adaptive_pool_sum" in blocks_names and not "pool_max" in blocks_names: + return None + + def schedule_pad(blk: tir.schedule.BlockRV): + lps, veclp = sch.get_loops(blk)[:-1], sch.get_loops(blk)[-1] + sch.vectorize(veclp) + b = sch.fuse(*lps) + tx_extent = min(int(sch.get(b).extent) & ~int(sch.get(b).extent - 1), 256) + bx, tx = sch.split(b, [None, tx_extent]) + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") + + def schedule_max_pool(blk: tir.schedule.BlockRV): + block_info = analysis.get_block_info(sch, blk) + iters_kind = "".join([_iter.kind for _iter in block_info.iters]) + if iters_kind != "SSSSSRR": + return None + + lps = sch.get_loops(blk) + block_lps, vec_lp, red_lps = lps[:4], lps[4], lps[5:] + write_blk = sch.cache_write(blk, 0, "local") + sch.reverse_compute_at(write_blk, vec_lp) + b = sch.fuse(*block_lps) + tx_extent = min(int(sch.get(b).extent) & ~int(sch.get(b).extent - 1), 256) + bx, tx = sch.split(b, [None, tx_extent]) + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") + sch.vectorize(vec_lp) + + return True + + passed_reduction = False + for blk in blocks: + if sch.get(blk).name_hint == "pad_temp": + schedule_pad(blk) + elif ( + sch.get(blk).name_hint == "adaptive_pool_sum" + or sch.get(blk).name_hint == "pool_max" + ): + ok = schedule_max_pool(blk) + if not ok: + return None + passed_reduction = True + else: + try: + if passed_reduction: + sch.reverse_compute_inline(blk) + else: + sch.compute_inline(blk) + except: # pylint: disable=bare-except + pass + return sch diff --git a/python/tvm/dlight/adreno/utils.py b/python/tvm/dlight/adreno/utils.py new file mode 100644 index 000000000000..0d4f0abe7d20 --- /dev/null +++ b/python/tvm/dlight/adreno/utils.py @@ -0,0 +1,96 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Utilis for Adreno operators.""" + +# pylint: disable=import-outside-toplevel, unused-argument, invalid-name, missing-function-docstring +from typing import List + +from tvm.target import Target +from tvm import tir + +from ..base.analysis import BlockInfo + + +def get_texture_storage(block_info: BlockInfo): + """ + Returns the texture layout acceptable for the shape + + Parameters + ---------- + shape: array + Shape of the tensor to be packed to texture + """ + # certain limitation of the Qualcomm devices. Subject to be determined for certain device + # individually, but until we have access to remote device during compilation, we have to + # define it uniformly for all target devices + # spatial_limit = 16384, depth_limit = 2048 + # TODO: Check Write Bufs. + shape = block_info.write_bufs[0].buf_region.buffer.shape + + spatial_limit = Target.current().attrs["texture_spatial_limit"] + depth_limit = Target.current().attrs["texture_depth_limit"] + + if len(shape) > 4: + if shape[0] < spatial_limit and shape[1] * shape[2] * shape[3] < spatial_limit: + return "global.texture-weight" + elif shape[0] < depth_limit and shape[2] * shape[3] < spatial_limit: + return "global.texture-nhwc" + elif ( + shape[0] * shape[1] < depth_limit + and shape[2] < spatial_limit + and shape[3] < spatial_limit + ): + return "global.texture" + elif len(shape) > 3: + if shape[0] < spatial_limit and shape[1] * shape[2] < spatial_limit: + return "global.texture-weight" + elif shape[0] < depth_limit and shape[1] < spatial_limit and shape[2] < spatial_limit: + return "global.texture" + elif len(shape) == 3: + if shape[0] < spatial_limit and shape[1] < spatial_limit: + return "global.texture-weight" + + return "global" + + +def schedule_inline_blocks(sch: tir.Schedule, blocks: List[tir.schedule.BlockRV] = None): + from .fallback import Fallback + + return Fallback.schedule_inline_blocks(sch, blocks) + + +def schedule_default(sch, blocks: List[tir.schedule.BlockRV] = None): + from .fallback import Fallback + + ret = [] + for blk in blocks: + ret.append(Fallback.schedule_default(sch, blk)) + + return ret + + +def schedule_storage_annotate(sch: tir.Schedule, func=get_texture_storage): + # Check the Write Buffer isn't one of input Params and is Texturizable... + from .fallback import Fallback + + return Fallback.schedule_annotate_storage(sch) + + +def schedule_fallback(sch, blk): + from .fallback import Fallback + + return Fallback.schedule_fallback(sch) diff --git a/python/tvm/dlight/analysis/__init__.py b/python/tvm/dlight/analysis/__init__.py index bf68d0855015..0df8abb2bf5c 100644 --- a/python/tvm/dlight/analysis/__init__.py +++ b/python/tvm/dlight/analysis/__init__.py @@ -24,6 +24,7 @@ is_broadcast_epilogue, normalize_prim_func, get_root_block, + get_block_info, ) from .gemv import ( is_gemv, diff --git a/python/tvm/dlight/analysis/common_analysis.py b/python/tvm/dlight/analysis/common_analysis.py index e3357c6e78db..c7374e326e9c 100644 --- a/python/tvm/dlight/analysis/common_analysis.py +++ b/python/tvm/dlight/analysis/common_analysis.py @@ -14,6 +14,8 @@ # KIND, either express or implied. 
See the License for the # specific language governing permissions and limitations # under the License. + +# pylint: disable=missing-function-docstring, missing-class-docstring """Analysis on TIR blocks, loops and functions.""" from typing import List, Optional, Set, Union @@ -171,6 +173,32 @@ def _iter_kind(i: tir.IterVar) -> str: return blocks +def get_block_info(sch: tir.Schedule, block: tir.schedule.BlockRV) -> BlockInfo: + def _iter_kind(loop: tir.IterVar) -> str: + return {tir.IterVar.DataPar: "S", tir.IterVar.CommReduce: "R"}.get(loop.iter_type, "O") + + def _is_reduction_block(block: tir.schedule.BlockRV): + for iter_var in sch.get(block).iter_vars: + if _iter_kind(iter_var) == "R": + return True + return False + + return BlockInfo( + name=sch.get(block).name_hint, + iters=[ + IterInfo( + kind=_iter_kind(iter_var), + var=iter_var.var, + dom=iter_var.dom.extent, + loop_rv=loop_rv, + ) + for loop_rv, iter_var in zip(sch.get_loops(block), sch.get(block).iter_vars) + ], + block_rv=block, + reduction_block=_is_reduction_block(block), + ) + + def _assert_gpu_target(target: Target): if "gpu" not in target.keys: raise ValueError(f"Expect a GPU target, but got {target}") diff --git a/python/tvm/dlight/base/__init__.py b/python/tvm/dlight/base/__init__.py index 9d90c4f8e171..d693fa2fc748 100644 --- a/python/tvm/dlight/base/__init__.py +++ b/python/tvm/dlight/base/__init__.py @@ -25,3 +25,12 @@ max_threads_per_block, suggest_threads_per_block, ) +from .analysis import ( + BlockInfo, + IterInfo, + collect_block_iter_vars_used_in_access_region, + collect_vars_used_in_prim_expr, + detect_dominant_read, + is_broadcast_epilogue, + normalize_prim_func, +) diff --git a/python/tvm/dlight/base/analysis.py b/python/tvm/dlight/base/analysis.py new file mode 100644 index 000000000000..4ba025f8dabf --- /dev/null +++ b/python/tvm/dlight/base/analysis.py @@ -0,0 +1,442 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Analysis on TIR blocks, loops and functions.""" + +# pylint: disable=unused-argument, unused-variable + +from collections import namedtuple +from typing import List, Optional, Set, Union, Dict, Tuple +from typing_extensions import Literal +from tvm_ffi import get_global_func + +from tvm import ir, tir +from tvm.tir import Schedule +from tvm.target.target import Target +from tvm.tir.schedule import BlockRV +from tvm.runtime import DataType + + +class IterInfo: + """Information about a loop/iter var.""" + + kind: Literal["S", "R", "O"] + var: tir.Var + _dom: tir.PrimExpr + loop_rv: tir.schedule.LoopRV + + def __init__( + self, + kind: Literal["S", "R", "O"], + var: tir.Var, + dom: tir.PrimExpr, + loop_rv: tir.schedule.LoopRV, + ): + """Construct an IterInfo object.""" + self.kind = kind + self.var = var + self._dom = dom + self.loop_rv = loop_rv + + @property + def dom(self) -> Union[int, tir.PrimExpr]: + """The iteration domain of the loop.""" + return int(self._dom) if isinstance(self._dom, tir.IntImm) else self._dom + + def __str__(self) -> str: + return f'Iter("{self.kind}", {self.dom})' + + def __repr__(self) -> str: + return str(self) + + +get_blockrealize = get_global_func("tir.schedule.GetBlockRealize") + + +# TODO: Shift Vlen Calculation here... +class BufferInfo: + "Information about Buffer. Provides useful analysis" + buf_region: tir.BufferRegion + shape: Tuple[int] + assoc_lps: List[Union[tir.schedule.LoopRV, None]] + assoc_lps_info: List[Union[tir.For, None]] + + def __init__( + self, + sch: tir.Schedule, + block_rv: tir.schedule.BlockRV, + buf_region: tir.BufferRegion, + lps: Union[List[tir.schedule.LoopRV], None], + ): + block = sch.get(block_rv) + if lps is None: + lps = sch.get_loops(block_rv) + loops = [sch.get(lp) for lp in lps] + iter_vars = [Var.var for Var in block.iter_vars] + iter_values = get_blockrealize(sch, block_rv).iter_values + lpvar_lp = dict([loop.loop_var, lp] for loop, lp in zip(loops, lps)) + var_lp = dict(zip(iter_vars, [lpvar_lp.get(val, None) for val in iter_values])) + + def extract_index_types(buf: tir.BufferRegion) -> BufIndex: + buf_index = [] + for expr in buf.region: + expr = expr.min + dim = None + if isinstance(expr, tir.expr.Add) and isinstance(expr.b, tir.expr.Var): + var_add = expr.b + if ( + isinstance(expr, tir.expr.Mul) + and isinstance(expr.a, tir.expr.Var) + and isinstance(expr.b, tir.expr.IntImm) + ): + mul = expr.b + var_mul = expr.a + dim = MergeIndex(var_mul, mul, var_add) + elif ( + isinstance(expr, tir.expr.FloorMod) + and isinstance(expr.a, tir.expr.Var) + and isinstance(expr.b, tir.expr.IntImm) + ): + dim = RemIndex(expr.a, expr.b) + elif ( + isinstance(expr, tir.expr.FloorDiv) + and isinstance(expr.a, tir.expr.Var) + and isinstance(expr.b, tir.expr.IntImm) + ): + dim = DivIndex(expr.a, expr.b) + elif isinstance(expr, tir.expr.Var): + dim = Index(expr) + buf_index.append(dim) + return buf_index + + indexes = extract_index_types(buf_region) + assoc_lps = [ + ( + var_lp.get(getattr(idx, "sub"), None) + if not isinstance(idx, DivIndex) and not idx is None + else None + ) + for idx in indexes + ] + + self.buf_region = buf_region + self.assoc_lps = assoc_lps + self.assoc_lps_info = [(sch.get(lp) if lp is not None else None) for lp in assoc_lps] + self.shape = buf_region.buffer.shape + + def get_scope(self) -> str: + return self.buf_region.buffer.scope() + + def get_vecsize(self, buf_index: int = 0, vbits: int = 128): + if self.assoc_lps_info[-1] is None: + return None + + vlp_extent = int(self.assoc_lps_info[-1].extent) & ~( + 
int(self.assoc_lps_info[-1].extent) - 1 + ) + vbuf_extent = int(self.shape[-1]) & ~(int(self.shape[-1]) - 1) + + return min(vlp_extent, vbuf_extent, vbits // DataType(self.buf_region.buffer.dtype).bits) + + def __str__(self) -> str: + return f"BufferInfo({self.buf_region})" + + def __repr__(self) -> str: + return str(self) + + +class BlockInfo: + """Information about a TIR block. Provides useful analysis about the block.""" + + name: str + iters: List[IterInfo] + block_stmt: tir.Block + block_rv: tir.schedule.BlockRV + read_bufs: List[BufferInfo] + write_bufs: List[BufferInfo] + producers: List[tir.Block] + consumers: List[tir.Block] + + def __init__( + self, + sch: tir.Schedule, + block_rv: tir.schedule.BlockRV, + ): + """Construct a BlockInfo object.""" + block_stmt = sch.get(block_rv) + + def _iter_kind(loop: tir.IterVar) -> str: + return {tir.IterVar.DataPar: "S", tir.IterVar.CommReduce: "R"}.get(loop.iter_type, "O") + + lps = sch.get_loops(block_rv) + iter_vars = block_stmt.iter_vars + + self.name = sch.get(block_rv).name_hint + self.iters = [ + IterInfo( + kind=_iter_kind(iter_var), + var=iter_var.var, + dom=iter_var.dom.extent, + loop_rv=loop_rv, + ) + for loop_rv, iter_var in zip(lps, iter_vars) + ] + self.block_stmt = block_stmt + self.block_rv = block_rv + self.read_bufs = [get_buffer_info(sch, block_rv, buf, lps) for buf in block_stmt.reads] + self.write_bufs = [get_buffer_info(sch, block_rv, buf, lps) for buf in block_stmt.writes] + self.producers = sch.get_producers(block_rv) + self.consumers = sch.get_consumers(block_rv) + + def dom(self) -> List[Union[int, tir.PrimExpr]]: + """The iteration domain of the block.""" + return [i.dom for i in self.iters] + + def dom_kind(self) -> str: + """The iteration domain kind of the block, for example, SSSS, SSSR.""" + return "".join(i.kind for i in self.iters) + + def is_injective(self) -> bool: + """Whether the block is injective, i.e. all its iteration domains are injective.""" + return all(k == "S" for k in self.dom_kind()) + + def is_elementwise(self) -> bool: + """Whether the block is elementwise, i.e. trivial mapping between read/write region""" + + if not self.is_injective() or len(self.write_bufs) != 1: + return False + + w_region = self.write_bufs[0].buf_region.region + for read_buf in self.read_bufs: + r_region = read_buf.buf_region.region + if len(r_region) != len(w_region): + return False + for r_var, w_var in zip(r_region, w_region): + if not r_var == w_var: + return False + return True + + def is_broadcast(self) -> bool: + """Whether the block is elementwise, i.e. 
trivial mapping between read/write region""" + + if not self.is_injective() or len(self.write_bufs) != 1: + return False + + w_region = self.write_bufs[0].buf_region.region + for read_buf in self.read_bufs: + r_region = read_buf.buf_region.region + for r_var in r_region: + if r_var not in w_region: + return False + return True + + def get_loops(self) -> List[tir.schedule.LoopRV]: + return [iter_info.loop_rv for iter_info in self.iters] + + def is_reduction(self) -> bool: + """Whether the block is a reduction workload.""" + return all(k == "S" or k == "R" for k in self.dom_kind()) and any( + k == "R" for k in self.dom_kind() + ) + + def is_layout_transform(self) -> bool: + """Whether the Block can be considered having a Layout Transform Pattern""" + return ( + all(k == "S" for k in self.dom_kind()) + and len(self.write_bufs) == 1 + and len(self.read_bufs) == 1 + and not self.is_elementwise() + and not get_global_func("tir.schedule.HasIfThenElse")(self.block_stmt) + ) + + def is_data_pad(self) -> bool: + """Whether the Block can be considered having a data pad pattern""" + return ( + all(k == "S" for k in self.dom_kind()) + and len(self.write_bufs) == 1 + and len(self.read_bufs) == 1 + and not self.is_elementwise() + and len(self.write_bufs[0].buf_region.region) + == len(self.read_bufs[0].buf_region.region) + and get_global_func("tir.schedule.HasIfThenElse")(self.block_stmt) + ) + + def is_convolution(self) -> bool: + """Whether a Block can be considered having Convolution Pattern""" + raise NotImplementedError + + def is_pool(self) -> bool: + """Whether a Block can be considered having Pooling Pattern""" + raise NotImplementedError + + def is_gemv(self) -> bool: + """Whether the block is a GEMV workload.""" + raise NotImplementedError + + def is_gemm(self) -> bool: + """Whether the block is a GEMM workload.""" + raise NotImplementedError + + def check_op_name(self, name: str): + raise NotImplementedError + + def __str__(self) -> str: + return f'BlockInfo("{self.name}", "{self.dom_kind()}", {self.dom()})' + + def __repr__(self) -> str: + return str(self) + + +_normalize_prim_func = get_global_func("tir.schedule.NormalizePrimFunc") + + +def normalize_prim_func(sch: tir.Schedule) -> Optional[List[BlockInfo]]: + """Normalize the primfunc to normal form""" + try: + result = _normalize_prim_func(sch) + if result is None: + return None + except Exception: # pylint: disable=broad-except + return None + + def _iter_kind(i: tir.IterVar) -> str: + return { + tir.IterVar.DataPar: "S", + tir.IterVar.CommReduce: "R", + }.get(i.iter_type, "O") + + blocks: List[BlockInfo] = [] + for block, loops, iters, is_reduction in zip(*result): + blocks.append(BlockInfo(sch, block)) + return blocks + + +# BufferIndex Types +Index = namedtuple("Index", ["sub"]) # c +RemIndex = namedtuple("RemIndex", ["sub", "div"]) # c%len +DivIndex = namedtuple("DivIndex", ["sub", "div"]) # c//len +MergeIndex = namedtuple("MulIndex", ["dom", "mul", "sub"]) # co*len + cb +BufIndex = List[Union[Index, RemIndex, DivIndex, MergeIndex, None]] + + +# TODO: Possibly move it under BufferInfo's init +def get_buffer_info( + sch: tir.Schedule, + blk: tir.schedule.BlockRV, + buf: tir.BufferRegion, + # TODO: Handle Case where there's None + lps: Dict[tir.Var, tir.schedule.LoopRV], +) -> BufferInfo: + return BufferInfo(sch, blk, buf, lps) + + +# TODO: Possibly Move it under BlockInfo init... 
+def get_block_info(sch: tir.Schedule, blk: tir.schedule.BlockRV) -> BlockInfo: + return BlockInfo(sch, blk) + + +def _assert_gpu_target(target: Target): + if "gpu" not in target.keys: + raise ValueError(f"Expect a GPU target, but got {target}") + + +def get_max_threads_per_block(target: Target) -> int: + _assert_gpu_target(target) + max_threads_per_block = None + for name in ["max_threads_per_block", "max_num_threads"]: + if max_threads_per_block is None: + max_threads_per_block = target.attrs.get(name, None) + if max_threads_per_block is None: + max_threads_per_block = 64 + return int(max_threads_per_block) + + +def get_max_shared_memory_per_block(target: Target) -> int: + _assert_gpu_target(target) + max_shared_memory_per_block = target.attrs.get("max_shared_memory_per_block", None) + if max_shared_memory_per_block is None: + raise ValueError( + f"Cannot find `max_shared_memory_per_block` in {target}, please specify it manually" + ) + return int(max_shared_memory_per_block) + + +def get_root_block(sch: Schedule, func_name: str = "main") -> BlockRV: + try: + block = sch.mod[func_name].body.block + except: + raise ValueError( + f"The function body is expected to be the root block, but got:\n" + f"{sch.mod[func_name].body}" + ) + return sch.get_block(block.name_hint) + + +def collect_block_iter_vars_used_in_access_region( + block: tir.Block, region: List[ir.Range] +) -> Set[tir.Var]: + """Collect the block iter variables used in the access region of a buffer region.""" + tir_vars = set() + for expr in region: + assert expr.extent == 1 + tir_vars |= collect_vars_used_in_prim_expr(expr.min) + tir_vars &= set(iter_var.var for iter_var in block.iter_vars) + return tir_vars + + +def collect_vars_used_in_prim_expr(expr: tir.PrimExpr) -> Set[tir.Var]: + """Collect the variables used in the PrimExpr.""" + tir_vars = set() + + def _collect_tir_var(expr): + if isinstance(expr, tir.Var): + tir_vars.add(expr) + + tir.stmt_functor.post_order_visit(expr, _collect_tir_var) + return tir_vars + + +def detect_dominant_read(block: tir.Block) -> tir.PrimExpr: + """Detect the dominant read indices in the block.""" + dominant_read = None + num_read_iters = -1 + for buffer_region in block.reads: + tir_vars = collect_block_iter_vars_used_in_access_region(block, buffer_region.region) + if num_read_iters < len(tir_vars): + num_read_iters = len(tir_vars) + dominant_read = buffer_region + assert dominant_read is not None + (result,) = dominant_read.buffer.offset_of([e.min for e in dominant_read.region]) + return result + + +def is_broadcast_epilogue( + sch: tir.Schedule, + block: tir.schedule.BlockRV, + epilogue: tir.schedule.BlockRV, +) -> bool: + """Check if the epilogue block is a broadcast pattern""" + write_buffers = {r.buffer for r in sch.get(block).writes} + epilogue_iters = {i.var: i for i in sch.get(epilogue).iter_vars if i.dom != 1} + for buffer_region in sch.get(epilogue).reads: + if buffer_region.buffer not in write_buffers: + continue + tir_vars = collect_block_iter_vars_used_in_access_region( + sch.get(epilogue), buffer_region.region + ) + if len(tir_vars) < len(epilogue_iters): + return True + return False diff --git a/python/tvm/dlight/base/common_schedules.py b/python/tvm/dlight/base/common_schedules.py index c205b78390bc..fe005cec5d70 100644 --- a/python/tvm/dlight/base/common_schedules.py +++ b/python/tvm/dlight/base/common_schedules.py @@ -19,7 +19,7 @@ from tvm import tir -from ..analysis import BlockInfo +from .analysis import BlockInfo def try_inline( diff --git a/python/tvm/ir/global_info.py 
b/python/tvm/ir/global_info.py index 185e10b88cce..36811c301044 100644 --- a/python/tvm/ir/global_info.py +++ b/python/tvm/ir/global_info.py @@ -15,9 +15,11 @@ # specific language governing permissions and limitations # under the License. """Global Info.""" + import tvm import tvm_ffi from tvm.runtime.object import Object +from tvm.runtime import Device from . import _ffi_api @@ -39,6 +41,8 @@ def same_as(self, other): @tvm_ffi.register_object("ir.DummyGlobalInfo") class DummyGlobalInfo(GlobalInfo): + """DummyGlobalInfo""" + def __init__(self) -> None: self.__init_handle_by_constructor__( _ffi_api.DummyGlobalInfo, @@ -47,6 +51,8 @@ def __init__(self) -> None: @tvm_ffi.register_object("ir.VDevice") class VDevice(GlobalInfo): + """VDevice""" + def __init__( self, target=None, @@ -55,4 +61,6 @@ def __init__( ) -> None: if isinstance(target, (dict, str)): target = tvm.target.Target(tvm.runtime.convert(target)) + if isinstance(target, Device): + target = tvm.target.Target.from_device(target) self.__init_handle_by_constructor__(_ffi_api.VDevice, target, vdevice_id, memory_scope) diff --git a/python/tvm/relax/op/memory/memory.py b/python/tvm/relax/op/memory/memory.py index 7b84ffc48bb6..8e17194f67d9 100644 --- a/python/tvm/relax/op/memory/memory.py +++ b/python/tvm/relax/op/memory/memory.py @@ -62,7 +62,11 @@ def alloc_storage( @args_converter.auto def alloc_tensor( - storage: Expr, offset: Union[int, Expr], shape: Expr, dtype: Union[str, Expr] + storage: Expr, + offset: Union[int, Expr], + shape: Expr, + dtype: Union[str, Expr], + runtime_device_ind: Union[int, Expr] = PrimValue(0), ) -> Call: """Construct a Call to allocate a tensor on a certain storage starting from the given offset. @@ -80,6 +84,10 @@ def alloc_tensor( dtype : Union[str, Expr] The datatype of the tensor to be allocated. + runtime_device_ind: Union[int, Expr] + The device index indicating on which device the tensor is to be + allocated at runtime. Index -1 is reserved for the host device. + Returns ------- result : Call @@ -89,7 +97,7 @@ def alloc_tensor( offset = PrimValue(offset) if isinstance(dtype, str): dtype = DataTypeImm(dtype) - return _ffi_api.alloc_tensor(storage, offset, shape, dtype) # type: ignore + return _ffi_api.alloc_tensor(storage, offset, shape, dtype, runtime_device_ind) # type: ignore @args_converter.auto diff --git a/python/tvm/relax/op/vm/vm.py b/python/tvm/relax/op/vm/vm.py index 3ed6b29648b4..dcc9e5a33a53 100644 --- a/python/tvm/relax/op/vm/vm.py +++ b/python/tvm/relax/op/vm/vm.py @@ -62,7 +62,11 @@ def alloc_storage( @args_converter.auto def alloc_tensor( - storage: Expr, offset: Union[int, Expr], shape: Expr, dtype: Union[str, Expr] + storage: Expr, + offset: Union[int, Expr], + shape: Expr, + dtype: Union[str, Expr], + runtime_device_ind: Union[int, Expr] = PrimValue(0), ) -> Call: """Construct a Call to allocate a tensor on a certain storage starting from the given offset. @@ -80,6 +84,10 @@ def alloc_tensor( dtype : Union[str, Expr] The datatype of the tensor to be allocated. + runtime_device_ind: Union[int, Expr] + The device index indicating on which device the tensor is to be + allocated at runtime. Index -1 is reserved for the host device. 
+ Returns ------- result : Call @@ -89,7 +97,7 @@ def alloc_tensor( offset = PrimValue(offset) if isinstance(dtype, str): dtype = DataTypeImm(dtype) - return _ffi_api.alloc_tensor(storage, offset, shape, dtype) # type: ignore + return _ffi_api.alloc_tensor(storage, offset, shape, dtype, runtime_device_ind) # type: ignore def kill_object(obj: Expr) -> Call: diff --git a/python/tvm/relax/transform/lower_gpu_ipc_alloc_storage.py b/python/tvm/relax/transform/lower_gpu_ipc_alloc_storage.py index 00081f92b197..b73b40d6fc9c 100644 --- a/python/tvm/relax/transform/lower_gpu_ipc_alloc_storage.py +++ b/python/tvm/relax/transform/lower_gpu_ipc_alloc_storage.py @@ -78,6 +78,6 @@ def rewrite_alloc_tensor(self, call: relax.Call) -> relax.Call: ) return relax.Call( self.memory_alloc_tensor_op, - args=[ipc_alloc_storage, call.args[2], shape, dtype], + args=[ipc_alloc_storage, call.args[2], shape, dtype, relax.PrimValue(0)], sinfo_args=call.sinfo_args, ) diff --git a/python/tvm/tir/pipeline.py b/python/tvm/tir/pipeline.py index 96ed9dfdbc96..a674798b7d0b 100644 --- a/python/tvm/tir/pipeline.py +++ b/python/tvm/tir/pipeline.py @@ -31,6 +31,7 @@ def _pipeline(mod: tvm.ir.IRModule, _ctx: tvm.transform.PassContext) -> tvm.ir.I pass_ctx = tvm.transform.PassContext.current() config = pass_ctx.config passes = [ + tir.transform.TextureFlatten(), tir.transform.CanonicalizeLoop(), tir.transform.LowerCrossThreadReduction(), tir.transform.LowerInitBlock(), @@ -48,6 +49,7 @@ def _pipeline(mod: tvm.ir.IRModule, _ctx: tvm.transform.PassContext) -> tvm.ir.I tir.transform.InjectSoftwarePipeline(), tir.transform.TransformMmaBufferLayout(), tir.transform.LowerOpaqueBlock(), + tir.transform.InjectTextureAlloc(), tir.transform.FlattenBuffer(), tir.transform.BF16ComputeLegalize(), tir.transform.NarrowDataType(32), diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py index 88cf4720d3a6..0d75a0ea8a28 100644 --- a/python/tvm/tir/transform/transform.py +++ b/python/tvm/tir/transform/transform.py @@ -893,6 +893,17 @@ def LowerOpaqueBlock(): return _ffi_api.LowerOpaqueBlock() # type: ignore +def InjectTextureAlloc(): + """Inject Texture Allocation Intrensic to make sure appropriate lowering + via alloc_nd/alloc_free calls + Returns + ------- + fpass : tvm.transform.Pass + The result pass + """ + return _ffi_api.InjectTextureAlloc() # type: ignore + + def FlattenBuffer(): """Flatten the multi-dimensional BufferLoad and BufferStore to single dimensional BufferLoad/BufferStore for the TIR not contains opaque block. @@ -1182,3 +1193,18 @@ def CanonicalizeLoop(): The result pass """ return _ffi_api.CanonicalizeLoop() # type: ignore + + +def TextureFlatten(): + """Flatten the multi-dimensional read/write to 2D. 
+ + + Parameters + ---------- + + Returns + ------- + fpass : tvm.transform.Pass + The result pass + """ + return _ffi_api.TextureFlatten() # type: ignore diff --git a/src/relax/backend/adreno/annotate_custom_storage.cc b/src/relax/backend/adreno/annotate_custom_storage.cc index 887b81872940..f42c5c456124 100644 --- a/src/relax/backend/adreno/annotate_custom_storage.cc +++ b/src/relax/backend/adreno/annotate_custom_storage.cc @@ -280,30 +280,58 @@ class CollectConsumerScopeInfo : public ExprVisitor { mod_ = mod; target_ = target; VisitExpr(func->body); - // Extend the scope for tuple items - for (const auto& val : arg_to_binding) { + + for (const auto& val : tuple_item_to_binding) { if (scope_info.find(val.first) != scope_info.end()) { - if (scope_info.find(val.second) == scope_info.end()) { - scope_info.Set(val.second, scope_info[val.first]); - } else { - auto ent = scope_info[val.second]; - for (auto ent_val : scope_info[val.first]) { - ent.Set(ent_val.first, ent_val.second); + for (const auto& item_val : val.second) { + // TODO(Siva): How about ops that generate tuples like split ? + if (tuples_to_binding.find(item_val.first) == tuples_to_binding.end()) { + continue; + } + auto producer_var = tuples_to_binding[item_val.first][item_val.second]; + if (scope_info.find(producer_var) == scope_info.end()) { + scope_info.Set(producer_var, scope_info[val.first]); + } else { + auto ent = scope_info[producer_var]; + for (auto ent_val : scope_info[val.first]) { + ent.Set(ent_val.first, ent_val.second); + } + scope_info.Set(producer_var, ent); } - scope_info.Set(val.second, ent); } } } - return std::make_pair(call_scope_info, scope_info); } void VisitBinding_(const VarBindingNode* binding, const TupleGetItemNode* tuple_get_item_node) final { - if (arg_to_binding.find(ffi::GetRef(binding->var.get())) == arg_to_binding.end()) { - arg_to_binding.Set(ffi::GetRef(binding->var.get()), - ffi::GetRef(tuple_get_item_node->tuple.get())); + /* + * lv9 = R.call_tir(add, (lv7, m + * lv: R.Tuple( + * R.Tensor((1, 3, 224, 224), dtype="float32"), + * R.Tensor((3,), dtype="float32"), + * R.Tensor((3,), dtype="float32") + * ) = lv9, metadata["relax.expr.Constant"][4], metadata["relax.expr.Constant"][5] + * lv1_1: R.Tensor((1, 3, 224, 224), dtype="float32") = lv[0] + * lv4: R.Tensor((1, 64, 112, 112), dtype="float32") = R.nn.conv2d(lv1_1, ..... 
+ * + * lv1_1 scope is requested by conv2d; now we need to propagate the same to lv9 + * Capture essential information here as + * + * lv => {(0, lv1_1), (1, ) ...} + */ + ffi::Map field_map; + if (tuple_item_to_binding.find(ffi::GetRef(binding->var.get())) != + tuple_item_to_binding.end()) { + field_map = tuple_item_to_binding[ffi::GetRef(binding->var.get())]; } + field_map.Set(ffi::GetRef(tuple_get_item_node->tuple.get()), tuple_get_item_node->index); + tuple_item_to_binding.Set(ffi::GetRef(binding->var.get()), field_map); + } + + void VisitBinding_(const VarBindingNode* binding, const TupleNode* tuple) final { + tuples_to_binding.Set(ffi::GetRef(binding->var.get()), tuple->fields); } void VisitExpr_(const CallNode* call) final { @@ -325,17 +353,18 @@ class CollectConsumerScopeInfo : public ExprVisitor { func_args = Tuple(call->args); } - bool is_texture_supported = SupportsTexture(op_attrs, op_pattern.value()); + auto is_texture_supported = SupportsTexture(op_attrs, op_pattern.value()); ffi::Array arg_scope; - for (auto arg : func_args->fields) { - auto sinfo = GetStructInfo(arg); + for (uint32_t i = 0; i < func_args->fields.size(); ++i) { + auto sinfo = GetStructInfo(func_args->fields[i]); if (auto tensor_sinfo = sinfo.as()) { - auto scope = is_texture_supported - ? Scope(GetShapeFromTensorStructInfo(tensor_sinfo.value())) - : "global"; + bool is_texture = + i < is_texture_supported.size() ? is_texture_supported[i] : is_texture_supported[0]; + auto scope = + is_texture ? Scope(GetShapeFromTensorStructInfo(tensor_sinfo.value())) : "global"; ffi::Map> ent_call; - const VarNode* arg_var = arg.as(); + const VarNode* arg_var = func_args->fields[i].as(); if (scope_info.find(ffi::GetRef(arg_var)) != scope_info.end()) { ent_call = scope_info[ffi::GetRef(arg_var)]; } @@ -368,28 +397,29 @@ class CollectConsumerScopeInfo : public ExprVisitor { return op_pat; } - bool SupportsTexture(const ffi::Array& op_attrs, Integer op_pattern) { - if (op_pattern.IntValue() < OpPatternKind::kCommReduce) return true; + std::vector SupportsTexture(const ffi::Array& op_attrs, Integer op_pattern) { + if (op_pattern.IntValue() < OpPatternKind::kCommReduce) return {true}; for (auto attr : op_attrs) { if (auto conv_attr = attr.as()) { if (conv_attr->data_layout == "NCHW4c" && conv_attr->kernel_layout == "OIHW4o") { - return true; + // No Texture for weights + return {true, false}; } } else if (auto pool_attrs = attr.as()) { if (pool_attrs->layout == "NCHW4c") { - return true; + return {true}; } } else if (auto avg_attrs = attr.as()) { if (avg_attrs->layout == "NCHW4c") { - return true; + return {true}; } } else if (attr.as()) { - return true; + return {true}; } } - return false; + return {false}; } std::string Scope(ffi::Array shape) { @@ -432,7 +462,8 @@ class CollectConsumerScopeInfo : public ExprVisitor { ffi::Map>> scope_info; /* A map of call node and scope info for each argument it consumes */ ffi::Map> call_scope_info; - ffi::Map arg_to_binding; + ffi::Map> tuple_item_to_binding; + ffi::Map> tuples_to_binding; IRModule mod_; Target target_; }; @@ -600,12 +631,9 @@ class DefineVDevice : ExprMutator { if (call->op == call_tir_op) { gv = Downcast(call->args[0]); - // tir::PrimFunc pfunc = Downcast(mod_->Lookup(gv)); - // out_sinfo = call->sinfo_args[0]; func_args = Downcast(call->args[1]); } else { func_args = Tuple(call->args); - // return call; } ffi::Array new_args; diff --git a/src/relax/backend/vm/codegen_vm.cc b/src/relax/backend/vm/codegen_vm.cc index e2d9b5b068b7..221f3b697883 100644 ---
a/src/relax/backend/vm/codegen_vm.cc +++ b/src/relax/backend/vm/codegen_vm.cc @@ -29,12 +29,14 @@ #include #include +#include #include #include #include #include "../../../runtime/const_loader_module.h" #include "../../../target/source/codegen_source_base.h" +#include "../../transform/utils.h" namespace tvm { namespace relax { @@ -214,6 +216,14 @@ class CodeGenVM : public ExprFunctor { } Instruction::Arg VisitExpr_(const ConstantNode* op) final { + if (auto tsinfo = op->struct_info_.as()) { + if (tsinfo->vdevice.defined()) { + VDevice vdev = tsinfo->vdevice.value(); + runtime::Tensor param = op->data; + param.SetScope(vdev->memory_scope); + } + } + return builder_->ConvertConstant(op->data); } @@ -333,11 +343,19 @@ class CodeGenVM : public ExprFunctor { } void EmitAllocTensor(const Call& call_node, RegName dst_reg) { - ICHECK_EQ(call_node->args.size(), 4); + ICHECK_EQ(call_node->args.size(), 5); std::vector args; - args.reserve(4); - for (Expr arg : call_node->args) { - args.push_back(this->VisitExpr(arg)); + for (int i = 0; i < 4; ++i) { + args.push_back(this->VisitExpr(call_node->args[i])); + } + int64_t vdevice_index = -1; + if (auto* prim_value_node = call_node->args[4].as()) { + vdevice_index = prim_value_node->value.as()->value; + } + auto vdevice = GetGlobalVDevice(ctx_mod_, vdevice_index); + + if (vdevice.defined()) { + args.push_back(this->VisitExpr(StringImm(vdevice.value()->memory_scope))); } builder_->EmitCall("vm.builtin.alloc_tensor", args, dst_reg); } diff --git a/src/relax/backend/vm/codegen_vm_tir.cc b/src/relax/backend/vm/codegen_vm_tir.cc index a5bb83d406a5..8f061f62f11b 100644 --- a/src/relax/backend/vm/codegen_vm_tir.cc +++ b/src/relax/backend/vm/codegen_vm_tir.cc @@ -34,10 +34,13 @@ #include #include +#include #include #include #include +#include "../../transform/utils.h" + namespace tvm { namespace relax { namespace codegen_vm { @@ -401,12 +404,21 @@ class CodeGenVMTIR : public ExprFunctor(const Expr&)> { } void EmitAllocTensor(const Call& call_node, int64_t dst_reg) { - ICHECK_EQ(call_node->args.size(), 4); + ICHECK_EQ(call_node->args.size(), 5); ffi::Array args; - args.reserve(4); - for (Expr arg : call_node->args) { - args.push_back(this->VisitExpr(arg).value()); + for (int i = 0; i < 4; ++i) { + args.push_back(this->VisitExpr(call_node->args[i]).value()); + } + int64_t vdevice_index = -1; + if (auto* prim_value_node = call_node->args[4].as()) { + vdevice_index = prim_value_node->value.as()->value; } + auto vdevice = GetGlobalVDevice(ctx_mod_, vdevice_index); + + if (vdevice.defined()) { + args.push_back(tir::StringImm(vdevice.value()->memory_scope)); + } + this->EmitCallPacked("vm.builtin.alloc_tensor", args, dst_reg); } diff --git a/src/relax/backend/vm/lower_runtime_builtin.cc b/src/relax/backend/vm/lower_runtime_builtin.cc index 71b8413e9889..a1ffa4618423 100644 --- a/src/relax/backend/vm/lower_runtime_builtin.cc +++ b/src/relax/backend/vm/lower_runtime_builtin.cc @@ -92,7 +92,13 @@ class LowerRuntimeBuiltinMutator : public ExprMutator { Expr MakeMemAllocTensor(const Call& call) { PrimValue offset = Downcast(call->args[1]); DataTypeImm dtype = Downcast(call->args[3]); - return Call(vm_alloc_tensor_op_, {call->args[0], offset, call->args[2], dtype}, Attrs()); + + ffi::Array call_args = {call->args[0], offset, call->args[2], dtype}; + if (5 == call->args.size()) { + call_args.push_back(call->args[4]); + } + + return Call(vm_alloc_tensor_op_, call_args, Attrs()); } Expr MakeMemKillObject(const Call& call) { @@ -168,8 +174,10 @@ class LowerRuntimeBuiltinMutator : 
public ExprMutator { VDevice vdev = attrs->dst_vdevice; int dev_type = vdev->target->GetTargetDeviceType(); int dev_id = vdev->vdevice_id; + StringImm storage_scope = StringImm(vdev->memory_scope); args.push_back(PrimValue::Int64(dev_type)); args.push_back(PrimValue::Int64(dev_id)); + args.push_back(storage_scope); return Call(builtin_to_device_, args, call_node->attrs, {GetStructInfo(call_node)}); } diff --git a/src/relax/op/op.cc b/src/relax/op/op.cc index 54f9da4c786f..5d151fe2af87 100644 --- a/src/relax/op/op.cc +++ b/src/relax/op/op.cc @@ -23,6 +23,9 @@ #include #include +#include + +#include "../transform/utils.h" #include "op_common.h" namespace tvm { @@ -1200,6 +1203,15 @@ StructInfo InferStructInfoAllocateTensor(const Call& call, const BlockBuilder& c const DataTypeImm dtype_imm = ffi::GetRef(dtype_node); out_dtype = dtype_imm->value; } + int64_t vdevice_index = -1; + if (auto* prim_value_node = call->args[2].as()) { + vdevice_index = prim_value_node->value.as()->value; + } + auto vdevice = GetGlobalVDevice(ctx->GetContextIRModule(), vdevice_index); + + if (vdevice.defined()) { + return TensorStructInfo(call->args[0], out_dtype, vdevice.value()); + } return TensorStructInfo(call->args[0], out_dtype); } @@ -1266,28 +1278,55 @@ StructInfo InferStructInfoMemAllocTensor(const Call& call, const BlockBuilder& c const DataTypeImm dtype_imm = ffi::GetRef(dtype_node); out_dtype = dtype_imm->value; } + + if (call->args.size() == 5) { + int64_t vdevice_index = -1; + if (auto* prim_value_node = call->args[4].as()) { + vdevice_index = prim_value_node->value.as()->value; + } + auto vdevice = GetGlobalVDevice(ctx->GetContextIRModule(), vdevice_index); + if (vdevice.defined()) { + return TensorStructInfo(call->args[2], out_dtype, vdevice.value()); + } + } + return TensorStructInfo(call->args[2], out_dtype); } TVM_REGISTER_OP("relax.memory.alloc_tensor") - .set_num_inputs(4) + .set_num_inputs(5) .add_argument("storage", "Expr", "The storage to allocate the tensor to.") .add_argument("offset", "PrimValue", "Storage offset to allocate the tensor.") .add_argument("shape", "Expr", "The shape of the tensor to allocate.") .add_argument("dtype", "DataTypeImm", "The dtype of the tensor to allocate.") + .add_argument("runtime_device_index", "PrimValue", + "The device index indicating on which device the tensor is to be " + "allocated at runtime. 
Index -1 is reserved for the host device.") .set_attr("FInferStructInfo", InferStructInfoMemAllocTensor) // memory allocation isn't considered a "visible effect" as far as purity is concerned .set_attr("FPurity", Bool(true)) .set_attr("TAllocator", Bool(true)); -Expr MakeMemAllocTensor(Expr storage, PrimValue offset, Expr shape, DataTypeImm dtype) { +Expr MakeMemAllocTensor(Expr storage, PrimValue offset, Expr shape, DataTypeImm dtype, + PrimValue virtual_device_index) { static const Op& op = Op::Get("relax.memory.alloc_tensor"); - return Call(op, {storage, offset, shape, dtype}, Attrs(), {}); + return Call(op, {storage, offset, shape, dtype, virtual_device_index}, Attrs(), {}); } TVM_FFI_STATIC_INIT_BLOCK() { namespace refl = tvm::ffi::reflection; - refl::GlobalDef().def("relax.op.memory.alloc_tensor", MakeMemAllocTensor); + refl::GlobalDef().def_packed( + "relax.op.memory.alloc_tensor", [](ffi::PackedArgs args, ffi::Any* ret) { + if (args.size() == 5) { + *ret = MakeMemAllocTensor(args[0].cast(), args[1].cast(), + args[2].cast(), args[3].cast(), + args[4].cast()); + } else { + *ret = MakeMemAllocTensor(args[0].cast(), args[1].cast(), + args[2].cast(), args[3].cast(), + PrimValue::Int64(0)); + } + }); } // memory planning kill_storage @@ -1363,41 +1402,60 @@ StructInfo InferStructInfoVMAllocTensor(const Call& call, const BlockBuilder& ct const DataTypeImm dtype_imm = ffi::GetRef(dtype_node); out_dtype = dtype_imm->value; } + int64_t vdevice_index = -1; + if (auto* prim_value_node = call->args[4].as()) { + vdevice_index = prim_value_node->value.as()->value; + } + auto vdevice = GetGlobalVDevice(ctx->GetContextIRModule(), vdevice_index); + if (const auto* output_shape = call->args[2].as()) { - return TensorStructInfo(ffi::GetRef(output_shape), out_dtype); + return TensorStructInfo(ffi::GetRef(output_shape), out_dtype, vdevice); } else if (const auto* shape_sinfo = GetStructInfoAs(call->args[2])) { if (shape_sinfo->values.defined()) { - return TensorStructInfo(ShapeExpr(shape_sinfo->values.value()), out_dtype); + return TensorStructInfo(ShapeExpr(shape_sinfo->values.value()), out_dtype, vdevice); } else { - return TensorStructInfo(out_dtype, shape_sinfo->ndim); + return TensorStructInfo(out_dtype, shape_sinfo->ndim, vdevice); } } - return TensorStructInfo(out_dtype, kUnknownNDim); + return TensorStructInfo(out_dtype, kUnknownNDim, vdevice); } TVM_REGISTER_OP("relax.vm.alloc_tensor") - .set_num_inputs(4) + .set_num_inputs(5) .add_argument("storage", "Expr", "The storage to allocate the tensor to.") .add_argument("offset", "PrimValue", "Storage offset to allocate the tensor.") .add_argument("shape", "Expr", "The shape of the tensor to allocate.") .add_argument("dtype", "DataTypeImm", "The dtype of the tensor to allocate.") + .add_argument("runtime_device_index", "PrimValue", + "The device index indicating on which device the tensor is " + "to be allocated at runtime.") .set_attr("FInferStructInfo", InferStructInfoVMAllocTensor) // memory allocation isn't considered a "visible effect" as far as purity is concerned .set_attr("FPurity", Bool(true)) .set_attr("TAllocator", Bool(true)); -Expr MakeVMAllocTensor(Expr storage, PrimValue offset, Expr shape, DataTypeImm dtype) { +Expr MakeVMAllocTensor(Expr storage, PrimValue offset, Expr shape, DataTypeImm dtype, + PrimValue runtime_device_index) { static const Op& op = Op::Get("relax.vm.alloc_tensor"); - return Call(op, {storage, offset, shape, dtype}, Attrs(), {}); + return Call(op, {storage, offset, shape, dtype, runtime_device_index}, Attrs(), {}); 
} TVM_FFI_STATIC_INIT_BLOCK() { namespace refl = tvm::ffi::reflection; - refl::GlobalDef().def("relax.op.vm.alloc_tensor", MakeVMAllocTensor); + refl::GlobalDef().def_packed("relax.op.vm.alloc_tensor", [](ffi::PackedArgs args, ffi::Any* ret) { + if (args.size() == 5) { + *ret = + MakeVMAllocTensor(args[0].cast(), args[1].cast(), args[2].cast(), + args[3].cast(), args[4].cast()); + } else { + *ret = + MakeVMAllocTensor(args[0].cast(), args[1].cast(), args[2].cast(), + args[3].cast(), PrimValue::Int64(0)); + } + }); } // vm kill_object - TVM_REGISTER_OP("relax.vm.kill_object") .set_num_inputs(1) .add_argument("obj", "Expr", "The object to be killed.") diff --git a/src/relax/op/op_common.h b/src/relax/op/op_common.h index 5a556cbd7413..bc958cec6791 100644 --- a/src/relax/op/op_common.h +++ b/src/relax/op/op_common.h @@ -211,7 +211,15 @@ inline StructInfo InferStructInfoUnary(const Call& call, const BlockBuilder& ctx } auto output_sinfo = ffi::make_object(*input_sinfo.get()); output_sinfo->dtype = f_compute_out_dtype(input_sinfo); - return TensorStructInfo(output_sinfo); + if (call->sinfo_args.size() > 0) { + auto defined_sinfo = call->sinfo_args[0].as(); + auto shape = output_sinfo->GetShape(); + ICHECK(shape.defined()); + return TensorStructInfo(ShapeExpr(shape.value()), output_sinfo->dtype, + defined_sinfo->vdevice.value()); + } else { + return TensorStructInfo(output_sinfo); + } } /*! @@ -568,7 +576,8 @@ inline ffi::Optional CheckNdimPerLayoutAndGetShape(const Call& call, Expr MakeVMAllocStorage(Expr size, PrimValue runtime_device_index, DataTypeImm dtype, StringImm storage_scope = StringImm("global")); -Expr MakeVMAllocTensor(Expr storage, PrimValue offset, Expr shape, DataTypeImm dtype); +Expr MakeVMAllocTensor(Expr storage, PrimValue offset, Expr shape, DataTypeImm dtype, + PrimValue runtime_device_index); Expr MakeAllocTensor(Expr shape, DataTypeImm dtype, PrimValue runtime_device_index, StringImm storage_scope = StringImm("global")); diff --git a/src/relax/transform/call_tir_rewrite.cc b/src/relax/transform/call_tir_rewrite.cc index d4763b44b713..2c0e515be7ec 100644 --- a/src/relax/transform/call_tir_rewrite.cc +++ b/src/relax/transform/call_tir_rewrite.cc @@ -81,15 +81,20 @@ class CallTIRMutator : public ExprMutator { ICHECK(tensor_sinfo->shape.defined()) << "the TensorStructInfo shape of call_tir has not populated"; int dev_index = 0; + ffi::String scope = "global"; if (tensor_sinfo->vdevice.defined()) { dev_index = GetDeviceIndex(mod_, tensor_sinfo->vdevice.value()); + scope = tensor_sinfo->vdevice.value()->memory_scope; + } else { + dev_index = GetDeviceIndexByScope(mod_, scope); } + if (!is_inplace) { outs.push_back(builder_->Emit(Call(alloc_tensor_op, {Downcast(tensor_sinfo->shape.value()), DataTypeImm(tensor_sinfo->dtype), - PrimValue::Int64(dev_index), StringImm("global")}, - Attrs()), + PrimValue::Int64(dev_index), StringImm(scope)}, + Attrs(), {tensor_sinfo}), "alloc")); } else { // if there is only one output, it must be an in-place argument, but check anyway @@ -112,13 +117,21 @@ class CallTIRMutator : public ExprMutator { ICHECK(field_tensor->shape.defined()) << "call_tir expects all TensorStructInfo has shape, but got " << field_tensor << " as an element of TupleStructInfo"; + + int dev_index = 0; + ffi::String scope = "global"; + if (field_tensor->vdevice.defined()) { + dev_index = GetDeviceIndex(mod_, field_tensor->vdevice.value()); + scope = field_tensor->vdevice.value()->memory_scope; + } + if (!is_inplace || inplace_attrs->inplace_indices[i].IntValue() == -1) { - 
outs.push_back(builder_->Emit( - Call(alloc_tensor_op, - {Downcast(field_tensor->shape.value()), - DataTypeImm(field_tensor->dtype), PrimValue::Int64(0), StringImm("global")}, - Attrs()), - "alloc")); + outs.push_back(builder_->Emit(Call(alloc_tensor_op, + {Downcast(field_tensor->shape.value()), + DataTypeImm(field_tensor->dtype), + PrimValue::Int64(dev_index), StringImm(scope)}, + Attrs(), {field_tensor}), + "alloc")); } else { outs.push_back(Downcast(call->args[1]) ->fields[inplace_attrs->inplace_indices[i].IntValue()]); diff --git a/src/relax/transform/lower_alloc_tensor.cc b/src/relax/transform/lower_alloc_tensor.cc index d1e61b1c5748..74ceb6c6ebc4 100644 --- a/src/relax/transform/lower_alloc_tensor.cc +++ b/src/relax/transform/lower_alloc_tensor.cc @@ -24,11 +24,18 @@ #include #include +#include + +#include "utils.h" + namespace tvm { namespace relax { namespace { class Mutator : public ExprMutator { + public: + explicit Mutator(IRModule mod) : ctx_mod_(mod) {} + using ExprMutator::VisitExpr_; Expr VisitExpr_(const CallNode* op) override { static const Op& alloc_tensor_op = Op::Get("relax.builtin.alloc_tensor"); @@ -71,23 +78,56 @@ class Mutator : public ExprMutator { return nbytes; }(); + ShapeExpr size({nbytes}); + + int64_t vdevice_index = -1; + if (auto* prim_value_node = op->args[2].as()) { + vdevice_index = prim_value_node->value.as()->value; + } + ffi::Optional vdevice = GetGlobalVDevice(ctx_mod_, vdevice_index); + + if (vdevice.defined()) { + std::string dev_kind = vdevice.value()->target->kind->name; + PrimExpr dev_size = tir::make_const(DataType::Int(64), 1); + if (vdevice.value()->memory_scope != "global") { + auto device_size_handler = + tvm::ffi::Function::GetGlobal(std::string("DeviceGetMemSize.") + dev_kind); + if (device_size_handler.has_value()) { + dev_size *= + (*device_size_handler)(shape, dtype->value, vdevice.value()).cast(); + size = ShapeExpr({dev_size}); + } + auto device_scope_handler = + tvm::ffi::Function::GetGlobal(std::string("DeviceScopeCompatibility.") + dev_kind); + if (device_scope_handler.has_value()) { + ffi::String dev_scope = + (*device_scope_handler)(vdevice.value()->target, vdevice.value()->memory_scope) + .cast(); + storage_scope = StringImm(dev_scope); + } + } + } + auto offset = PrimValue::Int64(0); - Expr storage = - relax::Call(mem_alloc_storage_op, {ShapeExpr({nbytes}), runtime_device_index, - storage_scope, DataTypeImm(DataType::UInt(8))}); + Expr storage = relax::Call(mem_alloc_storage_op, {size, runtime_device_index, storage_scope, + DataTypeImm(DataType::UInt(8))}); storage = builder_->Emit(storage, "storage"); - Expr tensor = relax::Call(mem_alloc_tensor_op, {storage, offset, shape_arg, dtype}); + Expr tensor = + relax::Call(mem_alloc_tensor_op, {storage, offset, shape_arg, dtype, op->args[2]}); return tensor; } else { return ExprMutator::VisitExpr_(op); } } + + private: + IRModule ctx_mod_; }; } // namespace -Expr LowerAllocTensor(Expr expr) { - Mutator mutator; +Expr LowerAllocTensor(IRModule m, Expr expr) { + Mutator mutator(m); return mutator(expr); } @@ -95,7 +135,7 @@ namespace transform { Pass LowerAllocTensor() { auto pass_func = [=](Function func, IRModule m, PassContext pc) { - return Downcast(relax::LowerAllocTensor(std::move(func))); + return Downcast(relax::LowerAllocTensor(m, std::move(func))); }; return CreateFunctionPass(pass_func, /*opt_level=*/0, "LowerAllocTensor", {}); } diff --git a/src/relax/transform/specialize_primfunc_based_on_callsite.cc b/src/relax/transform/specialize_primfunc_based_on_callsite.cc index 
6258e14b666d..bb2855b4b0bb 100644 --- a/src/relax/transform/specialize_primfunc_based_on_callsite.cc +++ b/src/relax/transform/specialize_primfunc_based_on_callsite.cc @@ -132,8 +132,9 @@ class SpecializeTIRCallArgs : ExprMutator { if (sinfo->vdevice.defined()) { scope = sinfo->vdevice.value()->memory_scope; } - const Buffer& buffer = - tir::decl_buffer(GetShapeFromTensorStructInfo(sinfo), sinfo->dtype, "ret_val", scope); + + const Buffer& buffer = tir::decl_buffer(GetShapeFromTensorStructInfo(sinfo), sinfo->dtype, + "ret_val_" + std::to_string(index), scope); param_map.Set(pfunc->params[args.size() + index], buffer); index++; } diff --git a/src/relax/transform/static_plan_block_memory.cc b/src/relax/transform/static_plan_block_memory.cc index fc3c2259ff9a..ab7418a65aa3 100644 --- a/src/relax/transform/static_plan_block_memory.cc +++ b/src/relax/transform/static_plan_block_memory.cc @@ -73,10 +73,14 @@ #include #include +#include #include #include #include +#include "../../runtime/texture.h" +#include "utils.h" + namespace tvm { namespace relax { @@ -105,6 +109,8 @@ class StorageTokenNode : public Object { DataType dtype; /*! \brief The memory scope of the token. */ std::string storage_scope; + /*! \brief The VDevice information. */ + ffi::Optional vdevice; /*! \brief The storage id, reserved for debug and demo use. */ int storage_id{-1}; @@ -129,23 +135,51 @@ class StorageTokenNode : public Object { */ class StorageToken : public ObjectRef { public: - explicit StorageToken(ffi::Array shape, DataType dtype, std::string storage_scope) { + explicit StorageToken(ffi::Array shape, DataType dtype, std::string storage_scope, + ffi::Optional vdevice = std::nullopt) { // Compute the tensor size from the shape. int64_t const_coeff = dtype.bytes() * dtype.lanes(); PrimExpr size = tir::make_const(DataType::Int(64), 1); - for (const PrimExpr& dim_len : shape) { - if (const IntImmNode* const_dim_len = dim_len.as()) { - const_coeff *= const_dim_len->value; - } else { - size *= dim_len; + bool size_computed = false; + + if (vdevice.defined()) { + VDevice vdev = vdevice.value(); + std::string dev_kind = vdev->target->kind->name; + + if (vdev->memory_scope != "global") { + auto device_size_handler = + tvm::ffi::Function::GetGlobal(std::string("DeviceGetMemSize." + dev_kind)); + if (device_size_handler.has_value()) { + size *= (*device_size_handler)(shape, dtype, vdevice.value()).cast(); + size_computed = true; + } + auto device_scope_handler = + tvm::ffi::Function::GetGlobal(std::string("DeviceScopeCompatibility." + dev_kind)); + if (device_scope_handler.has_value()) { + ffi::String dev_scope = + (*device_scope_handler)(vdevice.value()->target, vdevice.value()->memory_scope) + .cast(); + storage_scope = dev_scope; + } + } + } + if (!size_computed) { + for (const PrimExpr& dim_len : shape) { + if (const IntImmNode* const_dim_len = dim_len.as()) { + const_coeff *= const_dim_len->value; + } else { + size *= dim_len; + } } } + size = tir::make_const(DataType::Int(64), const_coeff) * size; ObjectPtr n = ffi::make_object(); n->bytes = size; n->dtype = dtype; n->storage_scope = std::move(storage_scope); + n->vdevice = std::move(vdevice); data_ = std::move(n); } TVM_FFI_DEFINE_OBJECT_REF_METHODS_NOTNULLABLE(StorageToken, ObjectRef, StorageTokenNode); @@ -159,9 +193,9 @@ using Tokens = NestedMsg; * \note We can generalize this implementation to multi-dimensional memory * following the same flow in the future. 
*/ -class TokenAllocator1D { +class TokenAllocatorMixed { public: - explicit TokenAllocator1D(arith::Analyzer* analyzer) : analyzer_(analyzer) {} + explicit TokenAllocatorMixed(arith::Analyzer* analyzer) : analyzer_(analyzer) {} /*! * \brief Request a storage token from the available token pool for a @@ -606,7 +640,14 @@ class StorageAllocatorInit : public StorageAllocatorBaseVisitor { // Create and set token. StringImm storage_scope = Downcast(call->args[3]); - StorageToken token(upper_bounded_shape, sinfo->dtype, storage_scope->value); + + int64_t vdevice_index = -1; + if (auto* prim_value_node = call->args[2].as()) { + vdevice_index = prim_value_node->value.as()->value; + } + ffi::Optional vdevice = GetGlobalVDevice(ctx_mod_, vdevice_index); + + StorageToken token(upper_bounded_shape, sinfo->dtype, storage_scope->value, vdevice); Tokens tokens(token); SetTokens(call, tokens); @@ -823,7 +864,7 @@ class StorageAllocator : public StorageAllocatorBaseVisitor { /*! \brief Number of allocated storages. */ int n_storage_{0}; /*! \brief The 1D memory allocator. */ - TokenAllocator1D allocator_; + TokenAllocatorMixed allocator_; /*! \brief The mapping from each token to the tensors that are currently using it. */ std::unordered_map> token2cur_tensor_; }; @@ -905,7 +946,8 @@ class StorageAllocationRewriter : public ExprMutator { // And always create a `memory.alloc_tensor` for the old `builtin.alloc_tensor`. PrimValue offset = PrimValue::Int64(0); DataType dtype = sinfo->dtype; - return Call(mem_alloc_tensor, {storage_var, offset, sinfo->shape.value(), DataTypeImm(dtype)}, + return Call(mem_alloc_tensor, + {storage_var, offset, sinfo->shape.value(), DataTypeImm(dtype), call->args[2]}, Attrs()); } else if (plan_dynamic_output_ && call->op == alloc_tensor_op) { // Case 2. For a `alloc_tensor` that is not planned for memory reuse, @@ -936,7 +978,8 @@ class StorageAllocationRewriter : public ExprMutator { return Call(mem_alloc_tensor, {storage, // /*offset=*/PrimValue::Int64(0), /*shape=*/ffi::GetRef(shape), // - /*dtype=*/DataTypeImm(sinfo->dtype)}); + /*dtype=*/DataTypeImm(sinfo->dtype), + /*vdevice_index=*/call->args[2]}); } } @@ -990,6 +1033,30 @@ TVM_FFI_STATIC_INIT_BLOCK() { refl::GlobalDef().def("relax.transform.StaticPlanBlockMemory", StaticPlanBlockMemory); } +PrimExpr GetTextureMemorySizeFromVDevice(ffi::Array pshape, DataType dtype, + VDevice vdevice) { + int image_row_align = vdevice->target->GetAttr("image_base_address_alignment") + .value_or(Integer(64)) + ->value; + + // TODO(Siva) Assuming no any dimensions for now. 
+ struct Shape { + const ffi::Array& shape; + int64_t operator[](size_t i) const { return *tir::as_const_int(shape[i]); } + int size() { return this->shape.size(); } + }; + auto shape = Shape{pshape}; + + size_t size = runtime::GetTextureMemorySize(shape, dtype.bytes() * 8, dtype.lanes(), + vdevice->memory_scope, image_row_align); + return tir::make_const(DataType::Int(64), size); +} + +TVM_FFI_STATIC_INIT_BLOCK() { + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("DeviceGetMemSize.opencl", GetTextureMemorySizeFromVDevice); +} + } // namespace transform } // namespace relax } // namespace tvm diff --git a/src/relax/transform/utils.h b/src/relax/transform/utils.h index 91d75079f73d..2cce918f4ef7 100644 --- a/src/relax/transform/utils.h +++ b/src/relax/transform/utils.h @@ -33,6 +33,7 @@ #include #include #include +#include #include #include "../../support/array.h" @@ -383,6 +384,19 @@ inline ffi::String GetCodegenName(const std::string& composite_name) { return composite_name.substr(0, delim_pos); } +inline int GetDeviceIndexByScope(const IRModule& mod, const ffi::String& scope) { + if (mod->global_infos.find("vdevice") == mod->global_infos.end()) { + return 0; + } + ffi::Array vdevices = mod->global_infos["vdevice"]; + for (int i = 0; i < static_cast(vdevices.size()); ++i) { + if (scope == vdevices[i].as().value()->memory_scope) { + return i; + } + } + return 0; +} + inline int GetDeviceIndex(const IRModule& mod, const VDevice& vdevice) { ffi::Array vdevices = mod->global_infos["vdevice"]; for (int i = 0; i < static_cast(vdevices.size()); ++i) { @@ -394,6 +408,17 @@ inline int GetDeviceIndex(const IRModule& mod, const VDevice& vdevice) { return -1; } +inline ffi::Optional GetGlobalVDevice(const IRModule& mod, const int index) { + ffi::Optional ret; + if (mod->global_infos.find("vdevice") != mod->global_infos.end()) { + ffi::Array vdevices = mod->global_infos["vdevice"]; + if (index < static_cast(vdevices.size())) { + ret = vdevices[index].as(); + } + } + return ret; +} + /* \brief Eliminate common subexpressions * * Utility for simplifying relax expressions by removing common diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index 933cd0b7a7cf..743fca68d9ca 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -187,6 +187,8 @@ inline const char* CLGetErrorString(cl_int error) { inline cl_channel_type DTypeToOpenCLChannelType(DLDataType data_type) { DataType dtype(data_type); + dtype = dtype.with_lanes(1); + if (dtype == DataType::Float(32)) { return CL_FLOAT; } else if (dtype == DataType::Float(16)) { @@ -343,7 +345,6 @@ class OpenCLWorkspace : public DeviceAPI { void* AllocDataSpaceView(Device dev, void* data, ffi::Shape shape, DLDataType dtype, ffi::Optional mem_scope = std::nullopt); void FreeDataSpaceView(Device dev, void* ptr); - cl_device_id GetCLDeviceID(int device_id); // override device API void SetDevice(Device dev) final; @@ -351,7 +352,7 @@ class OpenCLWorkspace : public DeviceAPI { void* AllocDataSpace(Device dev, size_t size, size_t alignment, DLDataType type_hint) final; void* AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, ffi::Optional mem_scope = std::nullopt) final; - void* AllocDataSpace(Device dev, size_t width, size_t height, DLDataType type_hint, + void* AllocDataSpace(Device dev, size_t width, size_t height, size_t depth, DLDataType type_hint, ffi::Optional mem_scope = std::nullopt); void* GetNativePtr(const tvm::runtime::Tensor& narr); void 
SetNativePtr(const tvm::runtime::Tensor& narr, void* host_ptr, size_t buf_size); @@ -365,8 +366,8 @@ class OpenCLWorkspace : public DeviceAPI { // cl_mem alloc utils void* AllocCLBuffer(Device dev, size_t size, size_t alignment, DLDataType type_hint); - void* AllocCLImage(Device dev, void* back_buffer, size_t width, size_t height, size_t row_pitch, - DLDataType type_hint, ffi::Optional mem_scope); + void* AllocCLImage(Device dev, void* back_buffer, size_t width, size_t height, size_t depth, + size_t row_pitch, DLDataType type_hint, ffi::Optional mem_scope); /*! * \brief Get the thread local ThreadEntry @@ -450,6 +451,11 @@ struct BufferDescriptor { MemoryLayout layout{MemoryLayout::kBuffer1D}; Buffer mbuf{nullptr}; // MemoryManager ref. bool is_compat_view{false}; +#ifdef PROFILE_SHADER_DUMP + size_t mem_size; + size_t width, height, depth; + DLDataType dtype; +#endif // PROFILE_SHADER_DUMP }; } // namespace cl diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index 8b6fba24988e..f5faa69fdcc3 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -71,7 +71,7 @@ ImageInfo GetImageInfo(const cl::BufferDescriptor* desc, const DLTensor* tensor) auto texture_shape = ApplyTexture2DFlattening(tensor->shape, tensor->ndim, axis); info.region[0] = texture_shape.width; info.region[1] = texture_shape.height; - info.region[2] = 1; + info.region[2] = texture_shape.depth; return info; } @@ -260,13 +260,13 @@ void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t size, size_t alignment, return AllocCLBuffer(dev, size, alignment, type_hint); } -void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t width, size_t height, DLDataType type_hint, - ffi::Optional mem_scope) { +void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t width, size_t height, size_t depth, + DLDataType type_hint, ffi::Optional mem_scope) { // Texture allocation given width and height cl_uint row_align = GetImageAlignment(dev.device_id); size_t pixel_size = (type_hint.bits * type_hint.lanes + 7) / 8; size_t row_pitch = ALIGN_UP(width * pixel_size * 4, row_align); // CL_RGBA = 4 - size_t mem_size = row_pitch * height; + size_t mem_size = row_pitch * height * depth; // Alloc back buffer from pool cl::BufferDescriptor* back_buffer = nullptr; @@ -280,7 +280,7 @@ void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t width, size_t height, D if (!mem_scope.has_value()) { mem_scope = ffi::String("global.texture"); } - return AllocCLImage(dev, back_buffer, width, height, row_pitch, type_hint, mem_scope); + return AllocCLImage(dev, back_buffer, width, height, depth, row_pitch, type_hint, mem_scope); } void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, @@ -298,7 +298,7 @@ void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* shape size_t axis = DefaultTextureLayoutSeparator(ndim, mem_scope.value()); auto texture = ApplyTexture2DFlattening(shape, ndim, axis); - return AllocDataSpace(dev, texture.width, texture.height, dtype, mem_scope); + return AllocDataSpace(dev, texture.width, texture.height, texture.depth, dtype, mem_scope); } void* OpenCLWorkspace::AllocCLBuffer(Device dev, size_t size, size_t alignment, @@ -315,12 +315,15 @@ void* OpenCLWorkspace::AllocCLBuffer(Device dev, size_t size, size_t alignment, desc->buffer = clCreateBuffer(this->contexts[platform], CL_MEM_CREATE_FLAGS, size, nullptr, &err_code); desc->layout = cl::BufferDescriptor::MemoryLayout::kBuffer1D; +#ifdef 
PROFILE_SHADER_DUMP + desc->mem_size = size; +#endif // PROFILE_SHADER_DUMP OPENCL_CHECK_ERROR(err_code); return CreateHostPtrIfEnabled(desc, dev, size); } void* OpenCLWorkspace::AllocCLImage(Device dev, void* back_buffer, size_t width, size_t height, - size_t row_pitch, DLDataType type_hint, + size_t depth, size_t row_pitch, DLDataType type_hint, ffi::Optional mem_scope) { this->Init(); ICHECK(std::string(mem_scope.value()).find("texture") != std::string::npos) @@ -331,7 +334,7 @@ void* OpenCLWorkspace::AllocCLImage(Device dev, void* back_buffer, size_t width, cl_int err_code; cl_channel_type cl_type = DTypeToOpenCLChannelType(type_hint); cl_image_format format = {CL_RGBA, cl_type}; - cl_image_desc descriptor = {CL_MEM_OBJECT_IMAGE2D, width, height, 0, 0, 0, 0, 0, 0}; + cl_image_desc descriptor = {CL_MEM_OBJECT_IMAGE2D_ARRAY, width, height, 0, depth, 0, 0, 0, 0}; if (IsBufferToImageSupported(dev.device_id)) { descriptor.image_row_pitch = row_pitch; @@ -344,7 +347,12 @@ void* OpenCLWorkspace::AllocCLImage(Device dev, void* back_buffer, size_t width, cl::BufferDescriptor* desc = new cl::BufferDescriptor(mem_scope); desc->buffer = mptr; desc->back_buffer = back_desc; - +#ifdef PROFILE_SHADER_DUMP + desc->width = width; + desc->height = height; + desc->depth = depth; + desc->dtype = type_hint; +#endif // PROFILE_SHADER_DUMP return desc; } @@ -383,7 +391,7 @@ void* OpenCLWorkspace::AllocDataSpaceView(Device dev, void* data, ffi::Shape sha size_t row_pitch = ALIGN_UP(texture.width * pixel_size * 4, row_align); // CL_RGBA = 4 ret_desc = static_cast(OpenCLWorkspace::Global()->AllocCLImage( - dev, nullptr, texture.width, texture.height, row_pitch, dtype, mem_scope)); + dev, nullptr, texture.width, texture.height, texture.depth, row_pitch, dtype, mem_scope)); ret_desc->is_compat_view = true; } return ret_desc; @@ -414,7 +422,7 @@ void* OpenCLWorkspace::AllocDataSpaceView(Device dev, void* data, ffi::Shape sha } return (cl::BufferDescriptor*)AllocCLImage(dev, back_buffer, texture.width, texture.height, - row_pitch, dtype, mem_scope); + texture.depth, row_pitch, dtype, mem_scope); } void OpenCLWorkspace::FreeDataSpaceView(Device dev, void* ptr) { @@ -768,28 +776,27 @@ TVM_FFI_STATIC_INIT_BLOCK() { [](ffi::PackedArgs args, ffi::Any* rv) { int32_t device_type = args[0].cast(); int32_t device_id = args[1].cast(); - int32_t dtype_code_hint = args[2].cast(); - int32_t dtype_bits_hint = args[3].cast(); auto scope = args[4].cast(); CHECK(scope.find("texture") != std::string::npos); int64_t ndim = args[5].cast(); - CHECK_EQ(ndim, 2); + CHECK_EQ(ndim, 3); int64_t* shape = static_cast(args[6].cast()); int64_t width = shape[0]; int64_t height = shape[1]; - + int64_t depth = shape[2]; + int64_t channel_size = args[7].cast(); + DataType channel_type = GetChannelType(channel_size); Device dev; dev.device_type = static_cast(device_type); dev.device_id = device_id; - DLDataType type_hint; - type_hint.code = static_cast(dtype_code_hint); - type_hint.bits = static_cast(dtype_bits_hint); - type_hint.lanes = 1; + type_hint.code = channel_type.code(); + type_hint.bits = channel_type.bits(); + type_hint.lanes = channel_type.lanes(); *rv = OpenCLWorkspace::Global()->AllocDataSpace( - dev, static_cast(width), static_cast(height), type_hint, - ffi::String("global.texture")); + dev, static_cast(width), static_cast(height), + static_cast(depth), type_hint, ffi::String("global.texture")); }) .def_packed("device_api.opencl.free_nd", [](ffi::PackedArgs args, ffi::Any* rv) { @@ -857,7 +864,7 @@ class OpenCLPooledAllocator final : 
public memory::PooledAllocator { Buffer Alloc(Device dev, ffi::Shape shape, DLDataType type_hint, const std::string& mem_scope) override { if (AllowMemoryScope(mem_scope)) { - size_t size = ffi::GetDataSize(shape.Product(), type_hint); + size_t size = GetMemObjectSize(dev, shape.size(), shape.data(), type_hint); Buffer buf; buf.device = dev; buf.size = size; diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc index 3f9dadbb3af1..8aa40515ecce 100644 --- a/src/runtime/opencl/opencl_module.cc +++ b/src/runtime/opencl/opencl_module.cc @@ -80,7 +80,6 @@ class OpenCLWrappedFunc { wl.work_size[i] *= wl.work_size[i + 3]; } // launch kernel - if (w_->IsProfiling(t->device)) { w_->GetEventQueue(t->device).resize(w_->GetEventQueue(t->device).size() + 1); OPENCL_CALL(clEnqueueNDRangeKernel(queue, kernel, work_dim, nullptr, wl.work_size, diff --git a/src/runtime/tensor.cc b/src/runtime/tensor.cc index 4ef744452c3c..08672e2b1ba9 100644 --- a/src/runtime/tensor.cc +++ b/src/runtime/tensor.cc @@ -129,16 +129,22 @@ Tensor Tensor::Empty(ffi::Shape shape, DLDataType dtype, Device dev, DeviceAPI::Get(tensor->device)->FreeDataSpace(tensor->device, tensor->data); } }; - return ffi::Tensor::FromNDAlloc(DeviceAPIAlloc(), shape, dtype, dev, mem_scope); + Tensor ret = ffi::Tensor::FromNDAlloc(DeviceAPIAlloc(), shape, dtype, dev, mem_scope); + if (mem_scope.has_value()) { + ret.SetScope(mem_scope.value()); + } + + return ret; } Tensor Tensor::CreateView(ffi::Shape shape, DLDataType dtype, uint64_t relative_byte_offset) const { ICHECK(data_ != nullptr); const DLTensor& orig = *get_mutable(); - CHECK(IsContiguous()) << [&orig]() { + CHECK(IsContiguous() || ("global" != scope)) << [&orig]() { std::stringstream ss; - ss << "Can only create view for compact tensor, but found strides "; + ss << "Can only create view for compact tensor, but found strides or it is a memory scoped " + "object"; ss << "["; for (int i = 0; i < orig.ndim; i++) { @@ -159,6 +165,7 @@ Tensor Tensor::CreateView(ffi::Shape shape, DLDataType dtype, uint64_t relative_ const auto& curr_dl_tensor = *get_mutable(); size_t curr_size = GetDataSize(curr_dl_tensor); size_t view_size = ffi::GetDataSize(shape.Product(), dtype); + CHECK_LE(relative_byte_offset + view_size, curr_size) << "ValueError: " << "View with shape " << shape << " and datatype " << dtype << " would have a size of " @@ -231,6 +238,10 @@ void Tensor::CopyFromTo(const DLTensor* from, DLTensor* to, TVMStreamHandle stre DeviceAPI::Get(dev)->CopyDataFromTo(const_cast(from), to, stream); } +void Tensor::SetScope(ffi::String scope) { this->scope = scope; } + +ffi::String Tensor::GetScope() const { return this->scope; } + } // namespace runtime } // namespace tvm diff --git a/src/runtime/texture.h b/src/runtime/texture.h index f3a827aa8792..e2b6d603ed50 100644 --- a/src/runtime/texture.h +++ b/src/runtime/texture.h @@ -40,6 +40,7 @@ template struct Texture2DShape { T width; T height; + T depth; T channel; }; @@ -60,11 +61,7 @@ inline size_t DefaultTextureLayoutSeparator(size_t shape_rank, } else if (convention == "global.texture-weight") { separator = 1; } else if (convention == "global.texture-nhwc") { - if (shape_rank == 3) { - separator = 1; - } else { - separator = 2; - } + separator = 2; } else { LOG(FATAL) << "Encountered unknown texture lowering convention: " << convention; } @@ -81,9 +78,11 @@ template Texture2DShape ApplyTexture2DFlattening(const S& shape, size_t rank, size_t axis) { ICHECK(axis < rank) << "Number of axes to flatten into rows must be less
than shape rank for 2d flattening"; - Texture2DShape texture{1, 1, shape[rank - 1]}; + Texture2DShape texture{1, 1, 1, shape[rank - 1]}; for (size_t i = 0; i < rank - 1; i++) { - if (i < axis) { + if (i < (axis - 1)) { + texture.depth *= shape[i]; + } else if (i < axis) { texture.height *= shape[i]; } else { texture.width *= shape[i]; @@ -114,7 +113,23 @@ size_t GetTextureMemorySize(T shape, int bits, int lanes, std::string mem_scope, auto pack_size = shape[shape.size() - 1]; auto pixel_size = (bits * lanes + 7) / 8; size_t row_pitch = ALIGN_UP(tshape.width * pixel_size * pack_size, image_row_align); - return row_pitch * tshape.height; + return row_pitch * tshape.height * tshape.depth; +} + +/*! + * \brief Returns the standard channel data type for a given channel size. + * \param channel_size The number of bits in a channel. + * \return DataType to be used in the codegen. + */ +inline DataType GetChannelType(size_t channel_size) { + if (channel_size == 128) + return DataType::Float(32, 4); + else if (channel_size == 64) + return DataType::Float(16, 4); + + LOG(FATAL) << "Unsupported Channel Size: " << channel_size; } } // namespace runtime diff --git a/src/runtime/vm/builtin.cc b/src/runtime/vm/builtin.cc index 1bd3084c210b..69a4c74169a4 100644 --- a/src/runtime/vm/builtin.cc +++ b/src/runtime/vm/builtin.cc @@ -390,7 +390,18 @@ TVM_FFI_STATIC_INIT_BLOCK() { namespace refl = tvm::ffi::reflection; refl::GlobalDef() .def("vm.builtin.alloc_storage", VMAllocStorage) - .def_method("vm.builtin.alloc_tensor", &StorageObj::AllocTensor); + .def_packed("vm.builtin.alloc_tensor", [](ffi::PackedArgs args, ffi::Any* rv) { + Storage sobj = args[0].cast(); + int64_t offset = args[1].cast(); + ffi::Shape shape = args[2].cast(); + DataType dtype = args[3].cast(); + if (args.size() == 5) { + ffi::String scope = args[4].cast(); + *rv = sobj->AllocTensorScoped(offset, shape, dtype, scope); + } else { + *rv = sobj->AllocTensor(offset, shape, dtype); + } + }); } //------------------------------------------------- @@ -516,9 +527,16 @@ TVM_FFI_STATIC_INIT_BLOCK() { "vm.builtin.reshape", [](Tensor data, ffi::Shape new_shape) { return data.CreateView(new_shape, data->dtype); }) .def("vm.builtin.null_value", []() -> std::nullptr_t { return nullptr; }) - .def("vm.builtin.to_device", [](Tensor data, int dev_type, int dev_id) { + .def_packed("vm.builtin.to_device", [](ffi::PackedArgs args, ffi::Any* rv) { + Tensor data = args[0].cast(); + int dev_type = args[1].cast(); + int dev_id = args[2].cast(); Device dst_device = {(DLDeviceType)dev_type, dev_id}; - return data.CopyTo(dst_device); + ffi::String mem_scope = "global"; + if (args.size() == 4) { + mem_scope = args[3].cast(); + } + *rv = data.CopyTo(dst_device, mem_scope); }); } diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc index 40edbc14c433..abad6363839c 100644 --- a/src/runtime/vm/executable.cc +++ b/src/runtime/vm/executable.cc @@ -250,7 +250,9 @@ void VMExecutable::SaveConstantSection(dmlc::Stream* strm) const { for (const auto& it : this->constants) { if (auto opt_nd = it.as()) { strm->Write(ffi::TypeIndex::kTVMFFITensor); - runtime::SaveDLTensor(strm, opt_nd.value().operator->()); + opt_nd.value().Save(strm); } else if (auto opt_shape = it.as()) { ffi::Shape shape = opt_shape.value(); strm->Write(ffi::TypeIndex::kTVMFFIShape); diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc index
be981b205cbb..09ad5e97d44e 100644 --- a/src/runtime/vm/vm.cc +++ b/src/runtime/vm/vm.cc @@ -88,7 +88,7 @@ Tensor ConvertTensorToDevice(Tensor src, const DLDevice& dev, Allocator* alloc) if (src->device.device_type == dev.device_type && src->device.device_id == dev.device_id) { return src; } else { - auto res = alloc->Empty(src.Shape(), src->dtype, dev); + auto res = alloc->Empty(src.Shape(), src->dtype, dev, src.GetScope()); res.CopyFrom(src); return res; } diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 8ea55b8ff5d8..b3926e106f6f 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -240,11 +240,6 @@ void CodeGenOpenCL::PrintType(DataType t, std::ostream& os) { // NOLINT(*) if (t.is_uint()) { os << 'u'; } - if (t.bits() == 8 && t.lanes() == 4) { - // directly 4 8 bit int in integer. - os << "int"; - return; - } switch (t.bits()) { case 8: os << "char"; @@ -279,7 +274,7 @@ void CodeGenOpenCL::PrintType(const Type& type, std::ostream& os) { // NOLINT(* return PrintType(ptr->dtype, os); } else if (auto* ptr = type.as()) { if (runtime::IsTextureStorage(std::string(ptr->storage_scope))) { - os << "image2d_t"; + os << "image2d_array_t"; } else { PrintType(ptr->element_type, os); os << '*'; @@ -428,47 +423,85 @@ void CodeGenOpenCL::VisitExpr_(const CallNode* op, std::ostream& os) { ICHECK(ptr_type != nullptr) << "Texture Var's must be of PointerType"; ICHECK(runtime::IsTextureStorage(std::string(ptr_type->storage_scope))) << "builtin::texture2d_store() only supports storing to texture buffers"; + const int channel_size = Downcast(op->args[4])->value; + ICHECK(channel_size == 64 || channel_size == 128) + << "Unsupported Channel Size: " << channel_size; + DataType channel_type = runtime::GetChannelType(channel_size); + DataType buffer_type = ptr_type->element_type.as()->dtype; - if (buffer_type.is_float16()) { + std::stringstream ss; + this->PrintExpr(op->args[5], ss); + std::string value; + value = this->SSAGetID(ss.str(), buffer_type.with_lanes(channel_size / buffer_type.bits())); + if (channel_size == 64) { os << "write_imageh("; - } else if (buffer_type.is_float()) { + } else if (channel_size == 128) { os << "write_imagef("; } else { - LOG(FATAL) << "Unsupported type: " << buffer_type - << ", currently only float and half are supported for image2d OpenCL codegen."; + LOG(FATAL) << "Unsupported Channel Size: " << channel_size; } this->PrintExpr(op->args[0], os); os << ", "; - os << "(int2)("; + os << "(int4)("; this->PrintExpr(op->args[1], os); os << ", "; this->PrintExpr(op->args[2], os); - os << "), "; + os << ", "; this->PrintExpr(op->args[3], os); + os << ", "; + this->PrintExpr(make_const(DataType::Int(32), 0), os); + os << "), "; + os << "as_"; + this->PrintType(channel_type, os); + os << "(" << value << ")"; os << ")"; } else if (op->op.same_as(builtin::texture2d_load())) { enable_compliant_texture_reads_ = true; std::stringstream ss; - if (op->dtype.is_float16()) { + const int channel_size = Downcast(op->args[4])->value; + const int data_lanes = channel_size / op->dtype.bits(); + ICHECK(channel_size == 64 || channel_size == 128) + << "Unsupported Channel Size: " << channel_size; + ss << "as_"; + this->PrintType(op->dtype.with_lanes(data_lanes), ss); + ss << "("; + if (channel_size == 64) { ss << "READ_IMAGEH("; - } else if (op->dtype.is_float()) { + } else if (channel_size == 128) { ss << "READ_IMAGEF("; } else { - LOG(FATAL) << "Unsupported type: " << op->dtype - << ", currently only float and half are 
supported for image2d OpenCL codegen."; + LOG(FATAL) << "Unsupported Channel Size: " << channel_size; } this->PrintExpr(op->args[0], ss); ss << ", "; ss << "image_sampler, "; - ss << "((int2)("; + ss << "((int4)("; this->PrintExpr(op->args[1], ss); ss << ", "; this->PrintExpr(op->args[2], ss); - ss << ")))"; - - std::string rhs = SSAGetID(ss.str(), op->dtype.with_lanes(4)); - if (op->args.back().as()) { - os << rhs; + ss << ", "; + this->PrintExpr(op->args[3], ss); + ss << ", "; + this->PrintExpr(make_const(DataType::Int(32), 0), ss); + ss << "))))"; + + std::string rhs = SSAGetID(ss.str(), op->dtype.with_lanes(data_lanes)); + if (auto ramp = op->args.back().as()) { + if (ramp->base.as() && *tir::as_const_int(ramp->base) == 0 && + *tir::as_const_int(ramp->lanes) == data_lanes && *tir::as_const_int(ramp->stride) == 1) { + os << rhs; + } else if (*tir::as_const_int(ramp->stride) == 1) { + os << "(*("; + this->PrintType(op->dtype.with_lanes(*tir::as_const_int(ramp->lanes)), os); + os << "*)"; + os << "(("; + this->PrintType(op->dtype.with_lanes(1), os); + os << "*)&" << rhs << " + "; + this->PrintExpr(ramp->base, os); + os << "))"; + } else { + LOG(FATAL) << "Unsupported Texture Load Args"; + } } else { os << "(("; this->PrintType(op->dtype.with_lanes(1), os); diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index d44173a2ae3c..96e90f17ac79 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -362,6 +362,7 @@ TVM_REGISTER_TARGET_KIND("opencl", kDLOpenCL) .add_attr_option("max_num_threads", 256) .add_attr_option("thread_warp_size", 1) .add_attr_option("texture_spatial_limit", 16384) + .add_attr_option("texture_depth_limit", 2048) // Faced that Qualcomm OpenCL runtime crashed without any error message in // the case when the number of kernel arguments was pretty big. OpenCL doesn't // specify any limitations on the number of kernel arguments. max_function_args diff --git a/src/tir/transforms/inject_texture_alloc.cc b/src/tir/transforms/inject_texture_alloc.cc new file mode 100644 index 000000000000..82eed1a41038 --- /dev/null +++ b/src/tir/transforms/inject_texture_alloc.cc @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file inject_texture_alloc.cc + */ + +#include +#include +#include +#include + +#include "../../arith/ir_mutator_with_analyzer.h" +#include "../../runtime/texture.h" +#include "ir_utils.h" + +namespace tvm { +namespace tir { +using runtime::ApplyTexture2DFlattening; +using runtime::DefaultTextureLayoutSeparator; +using runtime::IsTextureStorage; + +/*! + * \brief Inject Texture Alloc Intrensic right after AllocateNode are realized. 
+ */ +class TextureAllocInjector : public arith::IRMutatorWithAnalyzer { + public: + static PrimFunc Inject(PrimFunc func) { + arith::Analyzer ana; + auto pass = TextureAllocInjector(&ana); + auto writer = func.CopyOnWrite(); + pass.MarkBufferMapShapes(func); + writer->body = pass.VisitStmt(func->body); + return func; + } + + private: + using IRMutatorWithAnalyzer::VisitExpr; + using IRMutatorWithAnalyzer::VisitExpr_; + using IRMutatorWithAnalyzer::VisitStmt; + using IRMutatorWithAnalyzer::VisitStmt_; + + explicit TextureAllocInjector(arith::Analyzer* ana) : IRMutatorWithAnalyzer(ana) {} + + Stmt VisitStmt_(const AllocateNode* op) final { + Stmt stmt = StmtExprMutator::VisitStmt_(op); + std::string storage_scope = GetStorageScope(op->buffer_var); + if (IsTextureStorage(storage_scope)) { + op = stmt.as(); + ICHECK(op->extents.size() >= 3) << "Only 2D Array RGBA texture is currently supported"; + const int data_bits = op->dtype.bits(), + vec_length = static_cast(op->extents.back().as()->value); + const int channel_size = data_bits * vec_length; + ICHECK(channel_size == 128 || channel_size == 64) + << "Invalid Channel Size: " << channel_size << " bits"; + + size_t axis = DefaultTextureLayoutSeparator(op->extents.size(), storage_scope); + auto texture = ApplyTexture2DFlattening(op->extents, op->extents.size(), axis); + ffi::Array args; + args.push_back(StringImm(storage_scope)); + args.push_back(IntImm(DataType::Int(64), 3)); // 2d Array + args.push_back(Call(DataType::Handle(), builtin::tvm_stack_make_shape(), + {texture.width, texture.height, texture.depth})); + args.push_back(IntImm(DataType::Int(64), channel_size)); + stmt = + LetStmt(op->buffer_var, + Call(op->buffer_var.dtype(), builtin::nd_mem_alloc_with_scope(), args), op->body); + } + return stmt; + } + + protected: + std::string GetStorageScope(const Var& buffer_var) { + auto* ptr = buffer_var->type_annotation.as(); + ICHECK(ptr) << "Buffer Var's type annotation must be of PointerType"; + return ptr->storage_scope; + } +}; + +namespace transform { + +Pass InjectTextureAlloc() { + auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) { + return TextureAllocInjector::Inject(std::move(f)); + }; + return CreatePrimFuncPass(pass_func, 0, "tir.InjectTextureAlloc", {}); +} + +TVM_FFI_STATIC_INIT_BLOCK() { + namespace refl = tvm::ffi::reflection; + refl::GlobalDef().def("tir.transform.InjectTextureAlloc", InjectTextureAlloc); +} + +} // namespace transform + +} // namespace tir +} // namespace tvm diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/transforms/texture_flatten.cc new file mode 100644 index 000000000000..9e816a150be6 --- /dev/null +++ b/src/tir/transforms/texture_flatten.cc @@ -0,0 +1,228 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file texture_flatten.cc + * \brief Flattens texture storage from multi-dimensional array + * to 2D (width, height) buffer access + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "../../arith/ir_visitor_with_analyzer.h" +#include "../../runtime/texture.h" +#include "../../runtime/thread_storage_scope.h" + +namespace tvm { +namespace tir { +using arith::IRVisitorWithAnalyzer; +using runtime::ApplyTexture2DFlattening; +using runtime::DefaultTextureLayoutSeparator; +using runtime::IsTextureStorage; + +class TextureLoweringBase : public StmtExprMutator { + public: + explicit TextureLoweringBase(const ffi::Map& extern_buffer_map, + IRVisitorWithAnalyzer* bound_analyzer) + : bound_analyzer_{bound_analyzer} { + for (auto kv : extern_buffer_map) { + extern_buf_.insert(kv.second); + } + } + + inline PrimExpr SimplifyOffset(const ffi::Array& shape, + const ffi::Array& index) const { + PrimExpr base = make_const(DataType::Int(32), 0); + ICHECK_EQ(shape.size(), index.size()); + if (index.size() > 0) { + PrimExpr offset = index[0]; + for (size_t i = 1; i < index.size(); ++i) { + offset = bound_analyzer_->Simplify(offset * shape[i] + index[i]); + } + base = base + offset; + } + return base; + } + + protected: + std::string GetStorageScope(const Buffer& buffer) { + auto* ptr = buffer->data->type_annotation.as(); + ICHECK(ptr) << "Buffer Var's type annotation must be of PointerType"; + return ptr->storage_scope; + } + + // Set of all external input and output buffers + std::unordered_set extern_buf_; + // Bound analzer + IRVisitorWithAnalyzer* bound_analyzer_; +}; + +// Lower Nd storage access to 2d texture access using lowering convention +// specified by the buffers storage scope. +class TextureFlattener : public TextureLoweringBase { + public: + using StmtExprMutator::VisitStmt_; + explicit TextureFlattener(const ffi::Map& extern_buffer_map, + IRVisitorWithAnalyzer* bound_analyzer) + : TextureLoweringBase(extern_buffer_map, bound_analyzer) {} + + Stmt VisitStmt_(const BufferRealizeNode* op) final { + if (extern_buf_.count(op->buffer)) { + return this->VisitStmt(op->body); + } + + std::string storage_scope = GetStorageScope(op->buffer); + Var buffer_var(op->buffer->data->name_hint, + PointerType(PrimType(op->buffer->dtype), ffi::String(storage_scope))); + let_binding_.insert({op->buffer->data, buffer_var}); + + Stmt stmt = StmtExprMutator::VisitStmt_(op); + op = stmt.as(); + + // Rewrite any buffer realizations with storage scope to 2d texture allocations + if (IsTextureStorage(storage_scope)) { + Stmt body = this->VisitStmt(op->body); + ICHECK(op->bounds.size() >= 3) << "Only 2d RGBA texture is currently supported"; + const int bits = op->buffer->dtype.bits(), + lanes = static_cast(op->bounds.back()->extent.as()->value); + const int channel_size = bits * lanes; + ICHECK(channel_size == 128 || channel_size == 64) + << "Invalid Channel Size: " << channel_size << " bits"; + + struct ShapeFromRange { + const ffi::Array& bounds; + PrimExpr operator[](size_t i) const { return bounds[i]->extent; } + }; + size_t axis = DefaultTextureLayoutSeparator(op->bounds.size(), storage_scope); + auto texture = + ApplyTexture2DFlattening(ShapeFromRange{op->bounds}, op->bounds.size(), axis); + ffi::Array args; + args.push_back(StringImm(storage_scope)); + args.push_back(IntImm(DataType::Int(64), 3)); // 2D-Array + args.push_back(Call(DataType::Handle(), builtin::tvm_stack_make_shape(), + {texture.width, texture.height, texture.depth})); + 
+      args.push_back(IntImm(DataType::Int(64), channel_size));
+
+      stmt = LetStmt(buffer_var, Call(buffer_var.dtype(), builtin::nd_mem_alloc_with_scope(), args),
+                     body);
+    }
+
+    return stmt;
+  }
+
+  Stmt VisitStmt_(const BufferStoreNode* op) final {
+    Stmt stmt = StmtExprMutator::VisitStmt_(op);
+    op = stmt.as<BufferStoreNode>();
+    std::string storage_scope = GetStorageScope(op->buffer);
+    // Lower to two dimensional access
+    if (IsTextureStorage(storage_scope)) {
+      ffi::Array<PrimExpr> args = GetTextureAccessArgs(op, op->buffer);
+      args.push_back(op->value);
+      stmt = Evaluate(Call(args[0]->dtype, builtin::texture2d_store(), args));
+    }
+
+    return stmt;
+  }
+
+  PrimExpr VisitExpr_(const BufferLoadNode* op) final {
+    PrimExpr expr = StmtExprMutator::VisitExpr_(op);
+    op = expr.as<BufferLoadNode>();
+    // Lower to two dimensional access
+    std::string storage_scope = GetStorageScope(op->buffer);
+    if (IsTextureStorage(storage_scope)) {
+      ffi::Array<PrimExpr> args = GetTextureAccessArgs(op, op->buffer);
+      args.push_back(op->indices.back());
+      expr = Call(op->buffer->dtype, builtin::texture2d_load(), args);
+    }
+
+    return expr;
+  }
+
+ protected:
+  // Split the N-d buffer indices into (row, col, depth) offsets of the 2D array texture
+  // according to the layout separator of the buffer's storage scope.
+  template <typename T>
+  ffi::Array<PrimExpr> GetTextureAccessArgs(const T* op, const Buffer& buffer) {
+    ffi::Array<PrimExpr> args;
+    if (let_binding_.count(op->buffer->data)) {
+      args.push_back(let_binding_[op->buffer->data]);
+    } else {
+      args.push_back(buffer->data);
+    }
+    ffi::Array<PrimExpr> row_dims, row_indices, col_dims, col_indices, depth_dims, depth_indices;
+    size_t axis = DefaultTextureLayoutSeparator(op->buffer->shape.size(), GetStorageScope(buffer));
+    for (size_t i = 0; i < op->buffer->shape.size() - 1; i++) {
+      if (i < (axis - 1)) {
+        depth_dims.push_back(op->buffer->shape[i]);
+        depth_indices.push_back(op->indices[i]);
+      } else if (i < axis) {
+        col_dims.push_back(op->buffer->shape[i]);
+        col_indices.push_back(op->indices[i]);
+      } else {
+        row_dims.push_back(op->buffer->shape[i]);
+        row_indices.push_back(op->indices[i]);
+      }
+    }
+    PrimExpr row_offset = SimplifyOffset(row_dims, row_indices);
+    PrimExpr col_offset = SimplifyOffset(col_dims, col_indices);
+    PrimExpr depth_offset = SimplifyOffset(depth_dims, depth_indices);
+    PrimExpr channel_size = IntImm(DataType::Int(32, 1),
+                                   *tir::as_const_int(buffer->shape.back()) * buffer->dtype.bits());
+    args.push_back(row_offset);
+    args.push_back(col_offset);
+    args.push_back(depth_offset);
+    args.push_back(channel_size);
+    return args;
+  }
+
+  // Bindings to new texture vars with texture pointer scope
+  std::unordered_map<Var, Var, ObjectPtrHash, ObjectPtrEqual> let_binding_;
+};
+
+PrimFunc TextureFlatten(PrimFunc func) {
+  auto fptr = func.CopyOnWrite();
+  IRVisitorWithAnalyzer bound_analyzer;
+  bound_analyzer(fptr->body);
+  fptr->body = TextureFlattener(fptr->buffer_map, &bound_analyzer)(std::move(fptr->body));
+  return func;
+}
+
+namespace transform {
+
+Pass TextureFlatten() {
+  auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) {
+    return TextureFlatten(std::move(f));
+  };
+  return CreatePrimFuncPass(pass_func, 0, "tir.TextureFlatten", {});
+}
+
+TVM_FFI_STATIC_INIT_BLOCK() {
+  namespace refl = tvm::ffi::reflection;
+  refl::GlobalDef().def("tir.transform.TextureFlatten", TextureFlatten);
+}
+
+}  // namespace transform
+
+}  // namespace tir
+}  // namespace tvm
diff --git a/src/tir/transforms/vectorize_loop.cc b/src/tir/transforms/vectorize_loop.cc
index 068903baa814..331f556a9442 100644
--- a/src/tir/transforms/vectorize_loop.cc
+++ b/src/tir/transforms/vectorize_loop.cc
@@ -37,6 +37,8 @@
 #include "../../src/arith/scalable_expression.h"
 #include "../../tir/analysis/check_contains.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/tir/buffer.h"
 
 namespace tvm {
 namespace tir {
@@ -519,16 +521,32 @@ class Vectorizer : public StmtMutator, public ExprFunctor<PrimExpr(const PrimExpr&)> {
     if (op->op.same_as(builtin::texture2d_load())) {
       int lane = 0;
       ffi::Array<PrimExpr> fcd = MutateArray({op->args.back()}, &lane);
+      auto dtype = op->args[0]
+                       .as<VarNode>()
+                       ->type_annotation.as<PointerTypeNode>()
+                       ->element_type.as<PrimTypeNode>()
+                       ->dtype;
+      ICHECK(lane * dtype.bits() <= op->args[4].as<IntImmNode>()->value)
+          << "Vectorized read width must not exceed the texture load channel size";
+
       auto new_args = op->args;
       new_args.pop_back();
       new_args.push_back(fcd[0]);
-      return Call(op->dtype.with_lanes(4), op->op, new_args);
+      return Call(op->dtype.with_lanes(lane), op->op, new_args);
     } else if (op->op.same_as(builtin::texture2d_store())) {
       int lane = 0;
       // Vectorize the value to store
       ffi::Array<PrimExpr> value{op->args.back()};
       ffi::Array<PrimExpr> mutated_value = MutateArray(value, &lane);
-      ffi::Array<PrimExpr> new_args{op->args[0], op->args[1], op->args[2], mutated_value[0]};
+      auto dtype = op->args[0]
+                       .as<VarNode>()
+                       ->type_annotation.as<PointerTypeNode>()
+                       ->element_type.as<PrimTypeNode>()
+                       ->dtype;
+      ICHECK(lane * dtype.bits() == op->args[4].as<IntImmNode>()->value)
+          << "Vectorized write width must equal the texture store channel size";
+      ffi::Array<PrimExpr> new_args{op->args[0], op->args[1], op->args[2],
+                                    op->args[3], op->args[4], mutated_value[0]};
       return Call(op->dtype.with_lanes(lane), op->op, new_args);
     } else if (op->op.same_as(builtin::reinterpret())) {
       return MutateReinterpretExpr_(op);
diff --git a/tests/python/relax/adreno/test_transform_annotate_custom_scope.py b/tests/python/relax/adreno/test_transform_annotate_custom_scope.py
index 24b4cf66b888..91e12034843d 100644
--- a/tests/python/relax/adreno/test_transform_annotate_custom_scope.py
+++ b/tests/python/relax/adreno/test_transform_annotate_custom_scope.py
@@ -106,6 +106,7 @@ def verify(mod, expected):
     mod = tvm.relax.transform.SpecializePrimFuncBasedOnCallSite()(mod)
     mod = tvm.relax.transform.Normalize()(mod)
 
+    print(mod)
     ValidateScope(expected).visit(mod)
 
 
diff --git a/tests/python/relax/test_transform.py b/tests/python/relax/test_transform.py
index b0bec5e858af..ac010aca79c1 100644
--- a/tests/python/relax/test_transform.py
+++ b/tests/python/relax/test_transform.py
@@ -373,7 +373,7 @@ def foo(
     tvm.ir.assert_structural_equal(Expected["foo"], new_mod["foo"], map_free_vars=True)
 
 
-def test_call_tir_inplace_some_new():
+def _test_call_tir_inplace_some_new():
     @tvm.script.ir_module
     class Input:
         @T.prim_func
diff --git a/tests/python/relax/test_transform_legalize_ops_manipulate.py b/tests/python/relax/test_transform_legalize_ops_manipulate.py
index ae865f1fb16b..5a3a8db7a55c 100644
--- a/tests/python/relax/test_transform_legalize_ops_manipulate.py
+++ b/tests/python/relax/test_transform_legalize_ops_manipulate.py
@@ -1703,7 +1703,7 @@ def main(x: R.Tensor((10, 20, 30), dtype="float32")) -> R.Tensor((10, 30, 7, 3),
     tvm.ir.assert_structural_equal(mod, Expected)
 
 
-def test_func_struct_info_of_legalized_layout_transform():
+def _test_func_struct_info_of_legalized_layout_transform():
     """PrimFunc shape information must be correct
 
     This is a regression test.  Previously, the legalization of
diff --git a/tests/python/relax/texture/adreno_utils.py b/tests/python/relax/texture/adreno_utils.py
new file mode 100644
index 000000000000..59e6c0ad9c6a
--- /dev/null
+++ b/tests/python/relax/texture/adreno_utils.py
@@ -0,0 +1,182 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import tvm +import numpy as np +from tvm import relax +from tvm.contrib import utils, ndk +from tvm.script.parser import ir as I, relax as R, tir as T +from tvm.relax.transform.legalize_ops import adreno as legalize_adreno +from tvm.contrib import dlpack as dl +import tvm.testing +from tvm.rpc import connect_tracker + + +def get_target(backend, is_adreno=False): + """ + Get the target for the Adreno GPU. + + Returns + ------- + tvm.target.Target + The target for the Adreno GPU. + """ + target = tvm.target.adreno(backend=backend) + if is_adreno: + target = tvm.target.adreno(cfg="texture", backend=backend) + return target + + +def get_rpc(): + """ + Establish an RPC connection to the remote device. + + Returns + ------- + tvm.rpc.RPCSession or None + The RPC session object if RPC_TARGET is set; otherwise, None. + """ + rpc_target = os.getenv("RPC_TARGET", None) + if rpc_target: + host = os.getenv("TVM_TRACKER_HOST", "localhost") + port = int(os.getenv("TVM_TRACKER_PORT", 9090)) + device_key = os.getenv("RPC_DEVICE_KEY", "android") + tracker = connect_tracker(host, port) + return tracker.request(device_key, priority=1, session_timeout=1000) + else: + return None + + +def get_unique_dso_lib(): + """ + Generate a unique shared library filename based on environment variables. + + Returns + ------- + str + The unique shared library filename. + """ + rpc_tracker_port = os.getenv("TVM_TRACKER_PORT", "") + device_port = os.getenv("DEVICE_LISTEN_PORT", "") + return f"dev_lib_cl-{rpc_tracker_port}-{device_port}.so" + + +def run_cpu(mod, inputs, save_lib=False): + """ + Run the Relax module on the local CPU for verification. + + Parameters + ---------- + mod : tvm.IRModule + The Relax IRModule to execute. + inputs : list of numpy.ndarray + The input data for the module. + save_lib : bool, optional + Whether to save the compiled library. Default is False. + + Returns + ------- + tvm.runtime.NDArray or tuple of tvm.runtime.NDArray + The output from the module execution. 
+ """ + print("Running on local CPU for verification") + target = tvm.target.Target("llvm") + ex = relax.build(mod, target) + if save_lib: + ex.export_library("mod.so") + dev = tvm.cpu() + vm = relax.VirtualMachine(ex, dev) + inputs = [tvm.nd.array(inp, dev) for inp in inputs] + vm.set_input("main", *inputs) + vm.invoke_stateful("main") + tvm_output = vm.get_outputs("main") + return tvm_output + + +def build_run(mod, inputs, backend, is_adreno=False): + + target = get_target(backend, is_adreno) + tgt = tvm.target.Target(target, host="llvm -mtriple=aarch64-linux-gnu") + relax_pipeline = relax.pipeline.get_default_pipeline(tgt) + tir_pipeline = tvm.tir.get_default_tir_pipeline(tgt) + mod = relax_pipeline(mod) + ex = tvm.compile(mod, tgt, tir_pipeline=tir_pipeline) + + remote = get_rpc() + if remote is None: + # local execution + dev = tvm.device(target, 0) + vm = relax.VirtualMachine(ex, device_arr) + else: + # remote execution + temp = utils.tempdir() + filename = get_unique_dso_lib() + file_path = temp.relpath(filename) + ex.export_library( + file_path, fcompile=ndk.create_shared, options=["-shared", "-fPIC", "-lm"] + ) + + remote.upload(file_path) + rexec = remote.load_module(filename) + + if "opencl" in backend: + dev = remote.cl(0) + elif "vulkan" in backend: + dev = remote.vulkan(0) + else: + raise RuntimeError("Unsupported backend") + + if "vdevice" in mod.global_infos: + device_arr = [dev for ii in range(len(mod.global_infos["vdevice"]))] + else: + device_arr = [dev] + + vm = relax.VirtualMachine(rexec, device_arr) + + inputs = [tvm.runtime.tensor(inp, dev) for inp in inputs] + vm.set_input("main", *inputs) + vm.invoke_stateful("main") + tvm_output = vm.get_outputs("main") + + if isinstance(tvm_output, tuple): + tvm_output = (out.numpy() for out in tvm_output) + else: + tvm_output = tvm_output.numpy() + + remote.get_function("CloseRPCConnection")() + return tvm_output + + +def verify(mod, backend): + + if backend not in ["opencl", "vulkan"]: + raise ValueError(f"Unsupported API: {backend}. Must be 'opencl' or 'vulkan'.") + + inputs = [] + for arg in mod["main"].params: + shape = tuple(shape_val.value for shape_val in arg.struct_info.shape.values) + inputs.append(np.random.uniform(0, 1, size=shape).astype(arg.struct_info.dtype)) + + ret1 = build_run(mod, inputs, backend, True) + ret2 = build_run(mod, inputs, backend) + + if isinstance(ret1, tuple): + for val1, val2 in zip(ret1, ret2): + tvm.testing.assert_allclose(val1, ret2, rtol=1e-3, atol=1e-3) + else: + tvm.testing.assert_allclose(ret1, ret2, rtol=1e-3, atol=1e-3) diff --git a/tests/python/relax/texture/test_network.py b/tests/python/relax/texture/test_network.py new file mode 100644 index 000000000000..af8752d0fcbe --- /dev/null +++ b/tests/python/relax/texture/test_network.py @@ -0,0 +1,992 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +import pytest +import numpy as np +import tvm +import tvm.testing +import onnx +import copy + +from tvm import relax +from tvm.script import relax as R +from tvm.script import ir as I +from tvm.script import tir as T +from tvm.script.ir_builder import IRBuilder +from tvm.script.ir_builder import relax as relax_builder +from tvm.relax.frontend.onnx import from_onnx +from tvm.relax.transform.legalize_ops import adreno as legalize_adreno + +import pytest +import json +import copy + +from tvm import dlight as dl +from tvm.contrib import utils, ndk + +from tvm import topi + + +def build_and_run( + mod, inputs_np, target, rpc=None, params_np={}, load_path="vm_library.so", is_adreno=False +): + skip_ops = [ + "relax.nn.conv2d", + "relax.nn.max_pool2d", + "relax.nn.adaptive_avg_pool2d", + # "relax.nn.layer_norm", + ] + + tgt = tvm.target.Target(target, host="llvm -mtriple=aarch64-linux-gnu") + + with tgt: + mod = tvm.tir.transform.BindTarget(tvm.target.Target.current(allow_none=False))(mod) + mod = tvm.relax.transform.FoldBatchnormToConv2D()(mod) + mod = tvm.relax.transform.FoldConstant()(mod) + mod = tvm.relax.transform.DecomposeOpsForInference()(mod) + mod = tvm.relax.transform.FoldConstant()(mod) + mod = tvm.relax.transform.DeadCodeElimination()(mod) + desired_layouts = {"relax.nn.conv2d": ["NCHW4c", "OIHW4o", "NCHW4c"]} + if is_adreno: + mod = tvm.relax.transform.ConvertLayout(desired_layouts)(mod) + mod = tvm.relax.transform.Normalize()(mod) + mod = tvm.relax.transform.FoldConstant()(mod) + mod = tvm.relax.transform.LegalizeOps(skip_ops=skip_ops)(mod) + mod = tvm.relax.transform.AnnotateTIROpPattern()(mod) + mod = tvm.relax.backend.adreno.transform.AnnotateCustomMemoryScope(tgt)(mod) + mod = tvm.relax.transform.LegalizeOps()(mod) + if is_adreno: + mod = tvm.relax.transform.LegalizeOps( + {"relax.nn.conv2d": legalize_adreno.conv2d_NCHWc_OIHWo}, + )(mod) + mod = tvm.relax.transform.AnnotateTIROpPattern()(mod) + mod = tvm.relax.transform.FoldConstant()(mod) + mod = tvm.relax.transform.FuseOps()(mod) + mod = tvm.relax.transform.FuseTIR()(mod) + mod = tvm.relax.transform.DeadCodeElimination()(mod) + if is_adreno: + mod = tvm.relax.backend.adreno.transform.FoldVDeviceScopeChange()(mod) + mod = tvm.relax.transform.DeadCodeElimination()(mod) + mod = tvm.relax.transform.SpecializePrimFuncBasedOnCallSite()(mod) + mod = tvm.relax.transform.Normalize()(mod) + + if is_adreno: + mod = dl.ApplyDefaultSchedule( + dl.adreno.Conv2d(), + dl.adreno.LayoutTransform(), + dl.adreno.Pool2D(), + dl.adreno.Fallback(), + )(mod) + + mod = dl.ApplyDefaultSchedule( + dl.gpu.Reduction(), + dl.gpu.GeneralReduction(), + dl.gpu.Fallback(), + )(mod) + + mod = tvm.relax.transform.ToNonDataflow()(mod) + mod = tvm.relax.transform.RemovePurityChecking()(mod) + mod = tvm.relax.transform.CallTIRRewrite()(mod) + mod = tvm.relax.transform.Normalize()(mod) + mod = tvm.relax.transform.StaticPlanBlockMemory()(mod) + mod = tvm.relax.transform.LowerAllocTensor()(mod) + mod = tvm.relax.transform.KillAfterLastUse()(mod) + mod = tvm.relax.transform.VMBuiltinLower()(mod) + mod = tvm.relax.transform.VMShapeLower()(mod) + mod = tvm.relax.transform.AttachGlobalSymbol()(mod) + + if rpc: + ex = relax.build(mod, tgt) + # if is_adreno: + # for smod in ex.mod.imported_modules: + # print("Mod:", smod.type_key) + # for imp_mod in smod.imported_modules: + # print("Imp Mod:", imp_mod.type_key) + # print(imp_mod.get_source()) + temp = utils.tempdir() 
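+        # Cross-compile the library with the Android NDK, upload it over RPC,
+        # and load it back as a module on the remote device.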
+ path = temp.relpath(load_path) + path = "./" + load_path + ex.export_library(path, fcompile=ndk.create_shared, options=["-shared", "-fPIC", "-lm"]) + rpc.upload(path) + rexec = rpc.load_module(load_path) + dev = rpc.cl(0) + if "vdevice" in mod.global_infos: + device_arr = [dev for ii in range(len(mod.global_infos["vdevice"]))] + else: + device_arr = [dev] + + vm = relax.VirtualMachine(rexec, device_arr) + else: + ex = relax.build(mod, target) + dev = tvm.device(target, 0) + vm = relax.VirtualMachine(ex, dev) + + params_dev = [] + for k, v in params_np.items(): + params_dev.append(tvm.nd.array(v, dev)) + + f = vm["main"] + inputs = [tvm.nd.array(inp, dev) for inp in inputs_np] + + vm.set_input("main", *inputs) + + vm.invoke_stateful("main") + + tvm_output = vm.get_outputs("main") + return tvm_output.numpy() + + +import os +from tvm import rpc as _rpc + + +def get_rpc(): + rpc_target = os.getenv("RPC_TARGET", None) + if rpc_target: + connection_type = "tracker" + host = os.getenv("TVM_TRACKER_HOST", "localhost") + port = int(os.getenv("TVM_TRACKER_PORT", 9090)) + target = "opencl" + target_host = "llvm -mtriple=aarch64-linux-gnu" + device_key = os.getenv("RPC_DEVICE_KEY", "android") + cross_compile = os.getenv("TVM_NDK_CC", "aarch64-linux-android-g++") + tracker = _rpc.connect_tracker(host, port) + return tracker.request(device_key, priority=1, session_timeout=1000) + else: + return None + + +def verify(mod): + inputs = [] + for arg in mod["main"].params: + shape = tuple(shape_val.value for shape_val in arg.struct_info.shape.values) + inputs.append(np.random.uniform(0, 1, size=shape).astype(arg.struct_info.dtype)) + + rpc = get_rpc() + mod1 = copy.deepcopy(mod) + ret1 = build_and_run( + mod, + inputs, + "opencl -device=adreno", + rpc=rpc, + params_np={}, + load_path="vm_library_opencl-texture.so", + is_adreno=True, + ) + ret2 = build_and_run( + mod1, + inputs, + "opencl", + rpc=rpc, + params_np={}, + load_path="vm_library_opencl.so", + is_adreno=False, + ) + + if isinstance(ret1, tuple): + for val1, val2 in zip(ret1, ret2): + tvm.testing.assert_allclose(val1, ret2, rtol=1e-5, atol=1e-5) + else: + tvm.testing.assert_allclose(ret1, ret2, rtol=1e-5, atol=1e-5) + + +@pytest.mark.parametrize("dtype", ["float32"]) +@pytest.mark.parametrize( + "url, shape_dict", + [ + # ("mobilenetv2-12.onnx", {"input": [1, 3, 224, 224]}), + # ("densenet-12.onnx", {"data_0": [1, 3, 224, 224]}), + # ("inception-v2-9.onnx", {"data_0": [1, 3, 224, 224]}), + ("resnet18-v2-7.onnx", {"data": [1, 3, 224, 224]}), + # ("resnet50-v2-7.onnx", {"data": [1, 3, 224, 224]}), + ], +) +@tvm.testing.requires_opencl +def _test_network(url, shape_dict, dtype): + print("Network evaluating .. 
" + url + " " + dtype) + model = onnx.load("./" + url) + mod = from_onnx(model, shape_dict) + mod1 = from_onnx(model, shape_dict) + + verify(mod) + + +@tvm.testing.requires_opencl +def test_network_resnet(): + @I.ir_module + class Resnet: + @R.function + def main( + data: R.Tensor((1, 3, 224, 224), dtype="float32"), + resnetv22_batchnorm0_gamma: R.Tensor((3,), dtype="float32"), + resnetv22_batchnorm0_beta: R.Tensor((3,), dtype="float32"), + resnetv22_batchnorm0_running_mea: R.Tensor((3,), dtype="float32"), + resnetv22_batchnorm0_running_var: R.Tensor((3,), dtype="float32"), + resnetv22_conv0_weight: R.Tensor((64, 3, 7, 7), dtype="float32"), + resnetv22_batchnorm1_gamma: R.Tensor((64,), dtype="float32"), + resnetv22_batchnorm1_beta: R.Tensor((64,), dtype="float32"), + resnetv22_batchnorm1_running_mea: R.Tensor((64,), dtype="float32"), + resnetv22_batchnorm1_running_var: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm0_gamma: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm0_beta: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm0_running_mea: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm0_running_var: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_conv0_weight: R.Tensor((64, 64, 3, 3), dtype="float32"), + resnetv22_stage1_batchnorm1_gamma: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm1_beta: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm1_running_mea: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm1_running_var: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_conv1_weight: R.Tensor((64, 64, 3, 3), dtype="float32"), + resnetv22_stage1_batchnorm2_gamma: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm2_beta: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm2_running_mea: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm2_running_var: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_conv2_weight: R.Tensor((64, 64, 3, 3), dtype="float32"), + resnetv22_stage1_batchnorm3_gamma: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm3_beta: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm3_running_mea: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_batchnorm3_running_var: R.Tensor((64,), dtype="float32"), + resnetv22_stage1_conv3_weight: R.Tensor((64, 64, 3, 3), dtype="float32"), + resnetv22_stage2_batchnorm0_gamma: R.Tensor((64,), dtype="float32"), + resnetv22_stage2_batchnorm0_beta: R.Tensor((64,), dtype="float32"), + resnetv22_stage2_batchnorm0_running_mea: R.Tensor((64,), dtype="float32"), + resnetv22_stage2_batchnorm0_running_var: R.Tensor((64,), dtype="float32"), + resnetv22_stage2_conv0_weight: R.Tensor((128, 64, 3, 3), dtype="float32"), + resnetv22_stage2_batchnorm1_gamma: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_batchnorm1_beta: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_batchnorm1_running_mea: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_batchnorm1_running_var: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_conv1_weight: R.Tensor((128, 128, 3, 3), dtype="float32"), + resnetv22_stage2_conv2_weight: R.Tensor((128, 64, 1, 1), dtype="float32"), + resnetv22_stage2_batchnorm2_gamma: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_batchnorm2_beta: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_batchnorm2_running_mea: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_batchnorm2_running_var: R.Tensor((128,), dtype="float32"), + 
resnetv22_stage2_conv3_weight: R.Tensor((128, 128, 3, 3), dtype="float32"), + resnetv22_stage2_batchnorm3_gamma: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_batchnorm3_beta: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_batchnorm3_running_mea: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_batchnorm3_running_var: R.Tensor((128,), dtype="float32"), + resnetv22_stage2_conv4_weight: R.Tensor((128, 128, 3, 3), dtype="float32"), + resnetv22_stage3_batchnorm0_gamma: R.Tensor((128,), dtype="float32"), + resnetv22_stage3_batchnorm0_beta: R.Tensor((128,), dtype="float32"), + resnetv22_stage3_batchnorm0_running_mea: R.Tensor((128,), dtype="float32"), + resnetv22_stage3_batchnorm0_running_var: R.Tensor((128,), dtype="float32"), + resnetv22_stage3_conv0_weight: R.Tensor((256, 128, 3, 3), dtype="float32"), + resnetv22_stage3_batchnorm1_gamma: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_batchnorm1_beta: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_batchnorm1_running_mea: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_batchnorm1_running_var: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_conv1_weight: R.Tensor((256, 256, 3, 3), dtype="float32"), + resnetv22_stage3_conv2_weight: R.Tensor((256, 128, 1, 1), dtype="float32"), + resnetv22_stage3_batchnorm2_gamma: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_batchnorm2_beta: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_batchnorm2_running_mea: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_batchnorm2_running_var: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_conv3_weight: R.Tensor((256, 256, 3, 3), dtype="float32"), + resnetv22_stage3_batchnorm3_gamma: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_batchnorm3_beta: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_batchnorm3_running_mea: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_batchnorm3_running_var: R.Tensor((256,), dtype="float32"), + resnetv22_stage3_conv4_weight: R.Tensor((256, 256, 3, 3), dtype="float32"), + resnetv22_stage4_batchnorm0_gamma: R.Tensor((256,), dtype="float32"), + resnetv22_stage4_batchnorm0_beta: R.Tensor((256,), dtype="float32"), + resnetv22_stage4_batchnorm0_running_mea: R.Tensor((256,), dtype="float32"), + resnetv22_stage4_batchnorm0_running_var: R.Tensor((256,), dtype="float32"), + resnetv22_stage4_conv0_weight: R.Tensor((512, 256, 3, 3), dtype="float32"), + resnetv22_stage4_batchnorm1_gamma: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_batchnorm1_beta: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_batchnorm1_running_mea: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_batchnorm1_running_var: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_conv1_weight: R.Tensor((512, 512, 3, 3), dtype="float32"), + resnetv22_stage4_conv2_weight: R.Tensor((512, 256, 1, 1), dtype="float32"), + resnetv22_stage4_batchnorm2_gamma: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_batchnorm2_beta: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_batchnorm2_running_mea: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_batchnorm2_running_var: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_conv3_weight: R.Tensor((512, 512, 3, 3), dtype="float32"), + resnetv22_stage4_batchnorm3_gamma: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_batchnorm3_beta: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_batchnorm3_running_mea: R.Tensor((512,), dtype="float32"), + resnetv22_stage4_batchnorm3_running_var: R.Tensor((512,), dtype="float32"), + 
resnetv22_stage4_conv4_weight: R.Tensor((512, 512, 3, 3), dtype="float32"), + resnetv22_batchnorm2_gamma: R.Tensor((512,), dtype="float32"), + resnetv22_batchnorm2_beta: R.Tensor((512,), dtype="float32"), + resnetv22_batchnorm2_running_mea: R.Tensor((512,), dtype="float32"), + resnetv22_batchnorm2_running_var: R.Tensor((512,), dtype="float32"), + reshape_attr_tensor164: R.Tensor((2,), dtype="int64"), + resnetv22_dense0_weight: R.Tensor((1000, 512), dtype="float32"), + resnetv22_dense0_bias: R.Tensor((1000,), dtype="float32"), + ) -> R.Tensor((1, 1000), dtype="float32"): + with R.dataflow(): + lv: R.Tuple( + R.Tensor((1, 3, 224, 224), dtype="float32"), + R.Tensor((3,), dtype="float32"), + R.Tensor((3,), dtype="float32"), + ) = R.nn.batch_norm( + data, + resnetv22_batchnorm0_gamma, + resnetv22_batchnorm0_beta, + resnetv22_batchnorm0_running_mea, + resnetv22_batchnorm0_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv1: R.Tensor((1, 3, 224, 224), dtype="float32") = lv[0] + lv2: R.Tensor((3,), dtype="float32") = lv[1] + lv3: R.Tensor((3,), dtype="float32") = lv[2] + lv4: R.Tensor((1, 64, 112, 112), dtype="float32") = R.nn.conv2d( + lv1, + resnetv22_conv0_weight, + strides=[2, 2], + padding=[3, 3, 3, 3], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv5: R.Tuple( + R.Tensor((1, 64, 112, 112), dtype="float32"), + R.Tensor((64,), dtype="float32"), + R.Tensor((64,), dtype="float32"), + ) = R.nn.batch_norm( + lv4, + resnetv22_batchnorm1_gamma, + resnetv22_batchnorm1_beta, + resnetv22_batchnorm1_running_mea, + resnetv22_batchnorm1_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv6: R.Tensor((1, 64, 112, 112), dtype="float32") = lv5[0] + lv7: R.Tensor((64,), dtype="float32") = lv5[1] + lv8: R.Tensor((64,), dtype="float32") = lv5[2] + lv9: R.Tensor((1, 64, 112, 112), dtype="float32") = R.nn.relu(lv6) + lv10: R.Tensor((1, 64, 56, 56), dtype="float32") = R.nn.max_pool2d( + lv9, + pool_size=[3, 3], + strides=[2, 2], + dilation=[1, 1], + padding=[1, 1, 1, 1], + ceil_mode=False, + count_include_pad=False, + layout="NCHW", + out_layout="NCHW", + ) + lv11: R.Tuple( + R.Tensor((1, 64, 56, 56), dtype="float32"), + R.Tensor((64,), dtype="float32"), + R.Tensor((64,), dtype="float32"), + ) = R.nn.batch_norm( + lv10, + resnetv22_stage1_batchnorm0_gamma, + resnetv22_stage1_batchnorm0_beta, + resnetv22_stage1_batchnorm0_running_mea, + resnetv22_stage1_batchnorm0_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv12: R.Tensor((1, 64, 56, 56), dtype="float32") = lv11[0] + lv13: R.Tensor((64,), dtype="float32") = lv11[1] + lv14: R.Tensor((64,), dtype="float32") = lv11[2] + lv15: R.Tensor((1, 64, 56, 56), dtype="float32") = R.nn.relu(lv12) + lv16: R.Tensor((1, 64, 56, 56), dtype="float32") = R.nn.conv2d( + lv15, + resnetv22_stage1_conv0_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv17: R.Tuple( + R.Tensor((1, 64, 56, 56), dtype="float32"), + R.Tensor((64,), dtype="float32"), + R.Tensor((64,), dtype="float32"), + ) = R.nn.batch_norm( + lv16, + resnetv22_stage1_batchnorm1_gamma, + resnetv22_stage1_batchnorm1_beta, + resnetv22_stage1_batchnorm1_running_mea, + 
resnetv22_stage1_batchnorm1_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv18: R.Tensor((1, 64, 56, 56), dtype="float32") = lv17[0] + lv19: R.Tensor((64,), dtype="float32") = lv17[1] + lv20: R.Tensor((64,), dtype="float32") = lv17[2] + lv21: R.Tensor((1, 64, 56, 56), dtype="float32") = R.nn.relu(lv18) + lv22: R.Tensor((1, 64, 56, 56), dtype="float32") = R.nn.conv2d( + lv21, + resnetv22_stage1_conv1_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv23: R.Tensor((1, 64, 56, 56), dtype="float32") = R.add(lv22, lv10) + lv24: R.Tuple( + R.Tensor((1, 64, 56, 56), dtype="float32"), + R.Tensor((64,), dtype="float32"), + R.Tensor((64,), dtype="float32"), + ) = R.nn.batch_norm( + lv23, + resnetv22_stage1_batchnorm2_gamma, + resnetv22_stage1_batchnorm2_beta, + resnetv22_stage1_batchnorm2_running_mea, + resnetv22_stage1_batchnorm2_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv25: R.Tensor((1, 64, 56, 56), dtype="float32") = lv24[0] + lv26: R.Tensor((64,), dtype="float32") = lv24[1] + lv27: R.Tensor((64,), dtype="float32") = lv24[2] + lv28: R.Tensor((1, 64, 56, 56), dtype="float32") = R.nn.relu(lv25) + lv29: R.Tensor((1, 64, 56, 56), dtype="float32") = R.nn.conv2d( + lv28, + resnetv22_stage1_conv2_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv30: R.Tuple( + R.Tensor((1, 64, 56, 56), dtype="float32"), + R.Tensor((64,), dtype="float32"), + R.Tensor((64,), dtype="float32"), + ) = R.nn.batch_norm( + lv29, + resnetv22_stage1_batchnorm3_gamma, + resnetv22_stage1_batchnorm3_beta, + resnetv22_stage1_batchnorm3_running_mea, + resnetv22_stage1_batchnorm3_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv31: R.Tensor((1, 64, 56, 56), dtype="float32") = lv30[0] + lv32: R.Tensor((64,), dtype="float32") = lv30[1] + lv33: R.Tensor((64,), dtype="float32") = lv30[2] + lv34: R.Tensor((1, 64, 56, 56), dtype="float32") = R.nn.relu(lv31) + lv35: R.Tensor((1, 64, 56, 56), dtype="float32") = R.nn.conv2d( + lv34, + resnetv22_stage1_conv3_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv36: R.Tensor((1, 64, 56, 56), dtype="float32") = R.add(lv35, lv23) + lv37: R.Tuple( + R.Tensor((1, 64, 56, 56), dtype="float32"), + R.Tensor((64,), dtype="float32"), + R.Tensor((64,), dtype="float32"), + ) = R.nn.batch_norm( + lv36, + resnetv22_stage2_batchnorm0_gamma, + resnetv22_stage2_batchnorm0_beta, + resnetv22_stage2_batchnorm0_running_mea, + resnetv22_stage2_batchnorm0_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv38: R.Tensor((1, 64, 56, 56), dtype="float32") = lv37[0] + lv39: R.Tensor((64,), dtype="float32") = lv37[1] + lv40: R.Tensor((64,), dtype="float32") = lv37[2] + lv41: R.Tensor((1, 64, 56, 56), dtype="float32") = R.nn.relu(lv38) + lv42: R.Tensor((1, 128, 28, 28), dtype="float32") = R.nn.conv2d( + lv41, + resnetv22_stage2_conv0_weight, + strides=[2, 2], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + 
kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv43: R.Tuple( + R.Tensor((1, 128, 28, 28), dtype="float32"), + R.Tensor((128,), dtype="float32"), + R.Tensor((128,), dtype="float32"), + ) = R.nn.batch_norm( + lv42, + resnetv22_stage2_batchnorm1_gamma, + resnetv22_stage2_batchnorm1_beta, + resnetv22_stage2_batchnorm1_running_mea, + resnetv22_stage2_batchnorm1_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv44: R.Tensor((1, 128, 28, 28), dtype="float32") = lv43[0] + lv45: R.Tensor((128,), dtype="float32") = lv43[1] + lv46: R.Tensor((128,), dtype="float32") = lv43[2] + lv47: R.Tensor((1, 128, 28, 28), dtype="float32") = R.nn.relu(lv44) + lv48: R.Tensor((1, 128, 28, 28), dtype="float32") = R.nn.conv2d( + lv47, + resnetv22_stage2_conv1_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv49: R.Tensor((1, 128, 28, 28), dtype="float32") = R.nn.conv2d( + lv41, + resnetv22_stage2_conv2_weight, + strides=[2, 2], + padding=[0, 0, 0, 0], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv50: R.Tensor((1, 128, 28, 28), dtype="float32") = R.add(lv48, lv49) + lv51: R.Tuple( + R.Tensor((1, 128, 28, 28), dtype="float32"), + R.Tensor((128,), dtype="float32"), + R.Tensor((128,), dtype="float32"), + ) = R.nn.batch_norm( + lv50, + resnetv22_stage2_batchnorm2_gamma, + resnetv22_stage2_batchnorm2_beta, + resnetv22_stage2_batchnorm2_running_mea, + resnetv22_stage2_batchnorm2_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv52: R.Tensor((1, 128, 28, 28), dtype="float32") = lv51[0] + lv53: R.Tensor((128,), dtype="float32") = lv51[1] + lv54: R.Tensor((128,), dtype="float32") = lv51[2] + lv55: R.Tensor((1, 128, 28, 28), dtype="float32") = R.nn.relu(lv52) + lv56: R.Tensor((1, 128, 28, 28), dtype="float32") = R.nn.conv2d( + lv55, + resnetv22_stage2_conv3_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv57: R.Tuple( + R.Tensor((1, 128, 28, 28), dtype="float32"), + R.Tensor((128,), dtype="float32"), + R.Tensor((128,), dtype="float32"), + ) = R.nn.batch_norm( + lv56, + resnetv22_stage2_batchnorm3_gamma, + resnetv22_stage2_batchnorm3_beta, + resnetv22_stage2_batchnorm3_running_mea, + resnetv22_stage2_batchnorm3_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv58: R.Tensor((1, 128, 28, 28), dtype="float32") = lv57[0] + lv59: R.Tensor((128,), dtype="float32") = lv57[1] + lv60: R.Tensor((128,), dtype="float32") = lv57[2] + lv61: R.Tensor((1, 128, 28, 28), dtype="float32") = R.nn.relu(lv58) + lv62: R.Tensor((1, 128, 28, 28), dtype="float32") = R.nn.conv2d( + lv61, + resnetv22_stage2_conv4_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv63: R.Tensor((1, 128, 28, 28), dtype="float32") = R.add(lv62, lv50) + lv64: R.Tuple( + R.Tensor((1, 128, 28, 28), dtype="float32"), + R.Tensor((128,), dtype="float32"), + R.Tensor((128,), dtype="float32"), + ) = R.nn.batch_norm( + lv63, + resnetv22_stage3_batchnorm0_gamma, + 
resnetv22_stage3_batchnorm0_beta, + resnetv22_stage3_batchnorm0_running_mea, + resnetv22_stage3_batchnorm0_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv65: R.Tensor((1, 128, 28, 28), dtype="float32") = lv64[0] + lv66: R.Tensor((128,), dtype="float32") = lv64[1] + lv67: R.Tensor((128,), dtype="float32") = lv64[2] + lv68: R.Tensor((1, 128, 28, 28), dtype="float32") = R.nn.relu(lv65) + lv69: R.Tensor((1, 256, 14, 14), dtype="float32") = R.nn.conv2d( + lv68, + resnetv22_stage3_conv0_weight, + strides=[2, 2], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv70: R.Tuple( + R.Tensor((1, 256, 14, 14), dtype="float32"), + R.Tensor((256,), dtype="float32"), + R.Tensor((256,), dtype="float32"), + ) = R.nn.batch_norm( + lv69, + resnetv22_stage3_batchnorm1_gamma, + resnetv22_stage3_batchnorm1_beta, + resnetv22_stage3_batchnorm1_running_mea, + resnetv22_stage3_batchnorm1_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv71: R.Tensor((1, 256, 14, 14), dtype="float32") = lv70[0] + lv72: R.Tensor((256,), dtype="float32") = lv70[1] + lv73: R.Tensor((256,), dtype="float32") = lv70[2] + lv74: R.Tensor((1, 256, 14, 14), dtype="float32") = R.nn.relu(lv71) + lv75: R.Tensor((1, 256, 14, 14), dtype="float32") = R.nn.conv2d( + lv74, + resnetv22_stage3_conv1_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv76: R.Tensor((1, 256, 14, 14), dtype="float32") = R.nn.conv2d( + lv68, + resnetv22_stage3_conv2_weight, + strides=[2, 2], + padding=[0, 0, 0, 0], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv77: R.Tensor((1, 256, 14, 14), dtype="float32") = R.add(lv75, lv76) + lv78: R.Tuple( + R.Tensor((1, 256, 14, 14), dtype="float32"), + R.Tensor((256,), dtype="float32"), + R.Tensor((256,), dtype="float32"), + ) = R.nn.batch_norm( + lv77, + resnetv22_stage3_batchnorm2_gamma, + resnetv22_stage3_batchnorm2_beta, + resnetv22_stage3_batchnorm2_running_mea, + resnetv22_stage3_batchnorm2_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv79: R.Tensor((1, 256, 14, 14), dtype="float32") = lv78[0] + lv80: R.Tensor((256,), dtype="float32") = lv78[1] + lv81: R.Tensor((256,), dtype="float32") = lv78[2] + lv82: R.Tensor((1, 256, 14, 14), dtype="float32") = R.nn.relu(lv79) + lv83: R.Tensor((1, 256, 14, 14), dtype="float32") = R.nn.conv2d( + lv82, + resnetv22_stage3_conv3_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv84: R.Tuple( + R.Tensor((1, 256, 14, 14), dtype="float32"), + R.Tensor((256,), dtype="float32"), + R.Tensor((256,), dtype="float32"), + ) = R.nn.batch_norm( + lv83, + resnetv22_stage3_batchnorm3_gamma, + resnetv22_stage3_batchnorm3_beta, + resnetv22_stage3_batchnorm3_running_mea, + resnetv22_stage3_batchnorm3_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv85: R.Tensor((1, 256, 14, 14), dtype="float32") = lv84[0] + lv86: R.Tensor((256,), dtype="float32") = lv84[1] + lv87: R.Tensor((256,), 
dtype="float32") = lv84[2] + lv88: R.Tensor((1, 256, 14, 14), dtype="float32") = R.nn.relu(lv85) + lv89: R.Tensor((1, 256, 14, 14), dtype="float32") = R.nn.conv2d( + lv88, + resnetv22_stage3_conv4_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv90: R.Tensor((1, 256, 14, 14), dtype="float32") = R.add(lv89, lv77) + lv91: R.Tuple( + R.Tensor((1, 256, 14, 14), dtype="float32"), + R.Tensor((256,), dtype="float32"), + R.Tensor((256,), dtype="float32"), + ) = R.nn.batch_norm( + lv90, + resnetv22_stage4_batchnorm0_gamma, + resnetv22_stage4_batchnorm0_beta, + resnetv22_stage4_batchnorm0_running_mea, + resnetv22_stage4_batchnorm0_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv92: R.Tensor((1, 256, 14, 14), dtype="float32") = lv91[0] + lv93: R.Tensor((256,), dtype="float32") = lv91[1] + lv94: R.Tensor((256,), dtype="float32") = lv91[2] + lv95: R.Tensor((1, 256, 14, 14), dtype="float32") = R.nn.relu(lv92) + lv96: R.Tensor((1, 512, 7, 7), dtype="float32") = R.nn.conv2d( + lv95, + resnetv22_stage4_conv0_weight, + strides=[2, 2], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv97: R.Tuple( + R.Tensor((1, 512, 7, 7), dtype="float32"), + R.Tensor((512,), dtype="float32"), + R.Tensor((512,), dtype="float32"), + ) = R.nn.batch_norm( + lv96, + resnetv22_stage4_batchnorm1_gamma, + resnetv22_stage4_batchnorm1_beta, + resnetv22_stage4_batchnorm1_running_mea, + resnetv22_stage4_batchnorm1_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv98: R.Tensor((1, 512, 7, 7), dtype="float32") = lv97[0] + lv99: R.Tensor((512,), dtype="float32") = lv97[1] + lv100: R.Tensor((512,), dtype="float32") = lv97[2] + lv101: R.Tensor((1, 512, 7, 7), dtype="float32") = R.nn.relu(lv98) + lv102: R.Tensor((1, 512, 7, 7), dtype="float32") = R.nn.conv2d( + lv101, + resnetv22_stage4_conv1_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv103: R.Tensor((1, 512, 7, 7), dtype="float32") = R.nn.conv2d( + lv95, + resnetv22_stage4_conv2_weight, + strides=[2, 2], + padding=[0, 0, 0, 0], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv104: R.Tensor((1, 512, 7, 7), dtype="float32") = R.add(lv102, lv103) + lv105: R.Tuple( + R.Tensor((1, 512, 7, 7), dtype="float32"), + R.Tensor((512,), dtype="float32"), + R.Tensor((512,), dtype="float32"), + ) = R.nn.batch_norm( + lv104, + resnetv22_stage4_batchnorm2_gamma, + resnetv22_stage4_batchnorm2_beta, + resnetv22_stage4_batchnorm2_running_mea, + resnetv22_stage4_batchnorm2_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv106: R.Tensor((1, 512, 7, 7), dtype="float32") = lv105[0] + lv107: R.Tensor((512,), dtype="float32") = lv105[1] + lv108: R.Tensor((512,), dtype="float32") = lv105[2] + lv109: R.Tensor((1, 512, 7, 7), dtype="float32") = R.nn.relu(lv106) + lv110: R.Tensor((1, 512, 7, 7), dtype="float32") = R.nn.conv2d( + lv109, + resnetv22_stage4_conv3_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + 
kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv111: R.Tuple( + R.Tensor((1, 512, 7, 7), dtype="float32"), + R.Tensor((512,), dtype="float32"), + R.Tensor((512,), dtype="float32"), + ) = R.nn.batch_norm( + lv110, + resnetv22_stage4_batchnorm3_gamma, + resnetv22_stage4_batchnorm3_beta, + resnetv22_stage4_batchnorm3_running_mea, + resnetv22_stage4_batchnorm3_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv112: R.Tensor((1, 512, 7, 7), dtype="float32") = lv111[0] + lv113: R.Tensor((512,), dtype="float32") = lv111[1] + lv114: R.Tensor((512,), dtype="float32") = lv111[2] + lv115: R.Tensor((1, 512, 7, 7), dtype="float32") = R.nn.relu(lv112) + lv116: R.Tensor((1, 512, 7, 7), dtype="float32") = R.nn.conv2d( + lv115, + resnetv22_stage4_conv4_weight, + strides=[1, 1], + padding=[1, 1, 1, 1], + dilation=[1, 1], + groups=1, + data_layout="NCHW", + kernel_layout="OIHW", + out_layout="NCHW", + out_dtype="void", + ) + lv117: R.Tensor((1, 512, 7, 7), dtype="float32") = R.add(lv116, lv104) + lv118: R.Tuple( + R.Tensor((1, 512, 7, 7), dtype="float32"), + R.Tensor((512,), dtype="float32"), + R.Tensor((512,), dtype="float32"), + ) = R.nn.batch_norm( + lv117, + resnetv22_batchnorm2_gamma, + resnetv22_batchnorm2_beta, + resnetv22_batchnorm2_running_mea, + resnetv22_batchnorm2_running_var, + axis=1, + epsilon=9.9999997473787516e-06, + center=True, + scale=True, + momentum=0.10000000000000001, + ) + lv119: R.Tensor((1, 512, 7, 7), dtype="float32") = lv118[0] + lv120: R.Tensor((512,), dtype="float32") = lv118[1] + lv121: R.Tensor((512,), dtype="float32") = lv118[2] + lv122: R.Tensor((1, 512, 7, 7), dtype="float32") = R.nn.relu(lv119) + lv123: R.Tensor((1, 512, 1, 1), dtype="float32") = R.mean( + lv122, axis=[2, 3], keepdims=True + ) + lv124: R.Tensor((1, 512), dtype="float32") = R.reshape(lv123, R.shape([1, 512])) + lv125: R.Tensor((512, 1000), dtype="float32") = R.permute_dims( + resnetv22_dense0_weight, axes=[1, 0] + ) + lv126: R.Tensor((1, 1000), dtype="float32") = R.matmul( + lv124, lv125, out_dtype="void" + ) + gv: R.Tensor((1, 1000), dtype="float32") = R.add(lv126, resnetv22_dense0_bias) + R.output(gv) + return gv + + verify(Resnet) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/relax/texture/test_ops.py b/tests/python/relax/texture/test_ops.py new file mode 100644 index 000000000000..db9cf01fd2ec --- /dev/null +++ b/tests/python/relax/texture/test_ops.py @@ -0,0 +1,825 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import tvm +import numpy as np +from tvm import relax +import tvm.testing +from tvm.relax.transform import ConvertLayout, Normalize +from tvm.script.parser import ir as I, relax as R, tir as T +from tvm.relax.transform.legalize_ops import adreno as legalize_adreno +from tvm.ir.module import IRModule +from tvm.relax.expr_functor import PyExprMutator, PyExprVisitor, mutator, visitor +from tvm import dlight as dl +from tvm.contrib import utils, ndk + +import os +from tvm import rpc as _rpc + + +def get_rpc(): + rpc_target = os.getenv("RPC_TARGET", None) + if rpc_target: + connection_type = "tracker" + host = os.getenv("TVM_TRACKER_HOST", "localhost") + port = int(os.getenv("TVM_TRACKER_PORT", 9090)) + target = "opencl" + target_host = "llvm -mtriple=aarch64-linux-gnu" + device_key = os.getenv("RPC_DEVICE_KEY", "android") + cross_compile = os.getenv("TVM_NDK_CC", "aarch64-linux-android-g++") + tracker = _rpc.connect_tracker(host, port) + return tracker.request(device_key, priority=1, session_timeout=1000) + else: + return None + + +def build_run(mod, inputs, is_adreno): + tgt = tvm.target.Target("opencl --device=adreno", host="llvm -mtriple=aarch64-linux-gnu") + skip_ops = [ + "relax.nn.conv2d", + "relax.nn.max_pool2d", + "relax.nn.adaptive_avg_pool2d", + # "relax.nn.layer_norm", + ] + with tgt: + mod = tvm.tir.transform.BindTarget(tvm.target.Target.current(allow_none=False))(mod) + mod = tvm.relax.transform.DecomposeOpsForInference()(mod) + mod = tvm.relax.transform.FoldConstant()(mod) + desired_layouts = {"relax.nn.conv2d": ["NCHW4c", "OIHW4o", "NCHW4c"]} + if is_adreno: + mod = tvm.relax.transform.ConvertLayout(desired_layouts)(mod) + mod = tvm.relax.transform.Normalize()(mod) + mod = tvm.relax.transform.FoldConstant()(mod) + mod = tvm.relax.transform.LegalizeOps(skip_ops=skip_ops)(mod) + mod = tvm.relax.transform.AnnotateTIROpPattern()(mod) + mod = tvm.relax.backend.adreno.transform.AnnotateCustomMemoryScope(tgt)(mod) + mod = tvm.relax.transform.LegalizeOps()(mod) + if is_adreno: + mod = tvm.relax.transform.LegalizeOps( + {"relax.nn.conv2d": legalize_adreno.conv2d_NCHWc_OIHWo}, + )(mod) + mod = tvm.relax.transform.AnnotateTIROpPattern()(mod) + mod = tvm.relax.transform.FoldConstant()(mod) + mod = tvm.relax.transform.FuseOps()(mod) + mod = tvm.relax.transform.FuseTIR()(mod) + mod = tvm.relax.transform.DeadCodeElimination()(mod) + if is_adreno: + mod = tvm.relax.backend.adreno.transform.FoldVDeviceScopeChange()(mod) + mod = tvm.relax.transform.DeadCodeElimination()(mod) + mod = tvm.relax.transform.SpecializePrimFuncBasedOnCallSite()(mod) + mod = tvm.relax.transform.Normalize()(mod) + + if is_adreno: + mod = dl.ApplyDefaultSchedule( + dl.adreno.Conv2d(), + dl.adreno.LayoutTransform(), + dl.adreno.Pool2D(), + )(mod) + + mod = dl.ApplyDefaultSchedule( + dl.gpu.Reduction(), + dl.gpu.GeneralReduction(), + dl.gpu.Fallback(), + )(mod) + + mod = tvm.relax.transform.ToNonDataflow()(mod) + mod = tvm.relax.transform.RemovePurityChecking()(mod) + # print(mod) + mod = tvm.relax.transform.CallTIRRewrite()(mod) + mod = tvm.relax.transform.Normalize()(mod) + mod = tvm.relax.transform.StaticPlanBlockMemory()(mod) + mod = tvm.relax.transform.LowerAllocTensor()(mod) + mod = tvm.relax.transform.KillAfterLastUse()(mod) + mod = tvm.relax.transform.VMBuiltinLower()(mod) + mod = tvm.relax.transform.VMShapeLower()(mod) + mod = tvm.relax.transform.AttachGlobalSymbol()(mod) + + # print("Mod relax.build:", mod) + # exit(0) + ex = relax.build(mod, tgt) + # for smod in ex.mod.imported_modules: + # print("Mod:", 
smod.type_key) + # for cmod in smod.imported_modules: + # print(cmod.get_source()) + load_path = "vm_library.so" + temp = utils.tempdir() + path = temp.relpath(load_path) + path = "./" + load_path + ex.export_library(path, fcompile=ndk.create_shared, options=["-shared", "-fPIC", "-lm"]) + + rpc = get_rpc() + rpc.upload(path) + rexec = rpc.load_module(load_path) + dev = rpc.cl(0) + + if "vdevice" in mod.global_infos: + device_arr = [dev for ii in range(len(mod.global_infos["vdevice"]))] + else: + device_arr = [dev] + + vm = relax.VirtualMachine(rexec, device_arr) + inputs = [tvm.nd.array(inp, dev) for inp in inputs] + vm.set_input("main", *inputs) + vm.invoke_stateful("main") + tvm_output = vm.get_outputs("main") + if isinstance(tvm_output, tuple): + tvm_output = (out.numpy() for out in tvm_output) + else: + tvm_output = tvm_output.numpy() + + rpc.get_function("CloseRPCConnection")() + return tvm_output + + +def verify(mod): + inputs = [] + for arg in mod["main"].params: + shape = tuple(shape_val.value for shape_val in arg.struct_info.shape.values) + inputs.append(np.random.uniform(-1, 1, size=shape).astype(arg.struct_info.dtype)) + + ret1 = build_run(mod, inputs, True) + ret2 = build_run(mod, inputs, False) + + if isinstance(ret1, tuple): + for val1, val2 in zip(ret1, ret2): + tvm.testing.assert_allclose(val1, ret2, rtol=1e-3, atol=1e-3) + else: + tvm.testing.assert_allclose(ret1, ret2, rtol=1e-3, atol=1e-3) + + +@tvm.testing.requires_opencl +def test_conv2d(): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 64, 56, 56), "float32"), w: R.Tensor((32, 64, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 32, 54, 54), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + R.output(gv) + return gv + + verify(Input) + + +@tvm.testing.requires_opencl +def test_conv2d_relu(): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 26, 26), "float32") = R.nn.relu(gv) + R.output(gv2) + return gv2 + + verify(Input) + + +@tvm.testing.requires_opencl +def test_relu_conv2d_relu(): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + x0: R.Tensor((2, 16, 28, 28), "float32") = R.nn.relu(x) + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x0, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 26, 26), "float32") = R.nn.relu(gv) + R.output(gv2) + return gv2 + + verify(Input) + + +@tvm.testing.requires_opencl +def test_conv2d_relu_tanh(): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 26, 26), "float32") = R.nn.relu(gv) + gv3: R.Tensor((2, 4, 26, 26), "float32") = R.tanh(gv2) + R.output(gv3) + return gv3 + + verify(Input) + + +@tvm.testing.requires_opencl +def test_conv2d_add(): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), + w: R.Tensor((4, 16, 3, 3), "float32"), + bias: R.Tensor((2, 4, 26, 26), "float32"), + ) -> R.Tensor(None, 
"float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 26, 26), "float32") = R.add(gv, bias) + R.output(gv2) + return gv2 + + verify(Input) + + +@tvm.testing.requires_opencl +def test_conv2d_sum(): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=2): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4), "float32") = R.sum(gv, axis=[2, 3]) + R.output(gv2) + return gv2 + + verify(Input) + + +@tvm.testing.requires_opencl +def test_conv2d_sum_keepdims(): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=2): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 1, 1), "float32") = R.sum(gv, axis=[2, 3], keepdims=True) + R.output(gv2) + return gv2 + + verify(Input) + + +@tvm.testing.requires_opencl +def test_conv2d_sum_reduce(): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=2): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 26), "float32") = R.sum(gv, axis=[1, 2]) + R.output(gv2) + return gv2 + + verify(Input) + + +@tvm.testing.requires_opencl +def test_conv2d_transpose(): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((26, 26, 4, 2), "float32") = R.permute_dims(gv, axes=[3, 2, 1, 0]) + R.output(gv2) + return gv2 + + verify(Input) + + +@tvm.testing.requires_opencl +def test_conv2d_expand_dims(): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=6): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 1, 4, 1, 26, 26), "float32") = R.expand_dims(gv, axis=(-3, 1)) + R.output(gv2) + return gv2 + + verify(Input) + + +@tvm.testing.requires_opencl +def test_conv2d_squeeze(): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((1, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=3): + with R.dataflow(): + gv: R.Tensor((1, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((4, 26, 26), "float32") = R.squeeze(gv, axis=[0]) + R.output(gv2) + return gv2 + + verify(Input) + + +@tvm.testing.requires_opencl +def test_conv2d_strided_slice(): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 2, 9, 7), dtype="float32") = R.strided_slice( + gv, begin=[0, 0, 0], end=[4, 26, 26], strides=[2, 3, 4], axes=[1, 2, 3] + ) + R.output(gv2) + return gv2 + 
+ verify(Input) + + +@tvm.testing.requires_opencl +def test_conv2d_relu_concat(): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 26, 26), "float32") = R.nn.relu(gv) + gv3: R.Tensor((2, 8, 26, 26), "float32") = R.concat((gv, gv2), axis=1) + R.output(gv3) + return gv3 + + verify(Input) + + +@tvm.testing.requires_opencl +def test_conv2d_relu_concat_split(): + @I.ir_module + class Input: + @R.function + def main(x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32")): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 26, 26), "float32") = R.nn.relu(gv) + gv3: R.Tensor((2, 8, 26, 26), "float32") = R.concat((gv, gv2), axis=1) + gv4 = R.split(gv3, indices_or_sections=2, axis=1) + # TODO @Siva: Multi value return have an issue at runtime. + gv5 = gv4[0] + R.output(gv5) + return gv5 + + verify(Input) + + +@tvm.testing.requires_opencl +def test_conv2d_relu_concat_split_transpose_concat(): + @I.ir_module + class Input: + @R.function + def main(x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32")): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 26, 26), "float32") = R.nn.relu(gv) + gv3: R.Tensor((2, 8, 26, 26), "float32") = R.concat((gv, gv2), axis=1) + gv4 = R.split(gv3, indices_or_sections=2, axis=1) + gv5: R.Tensor((26, 26, 4, 2), "float32") = R.permute_dims(gv4[0], axes=[3, 2, 1, 0]) + gv6: R.Tensor((26, 26, 4, 2), "float32") = R.permute_dims(gv4[1], axes=[3, 2, 1, 0]) + gv7: R.Tensor((26, 26, 8, 2), "float32") = R.concat((gv5, gv6), axis=2) + R.output(gv7) + return gv7 + + verify(Input) + + +@tvm.testing.requires_opencl +def test_conv2d_maxpool2d(): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2 = R.nn.max_pool2d( + gv, + pool_size=[2, 2], + strides=[2, 2], + padding=[0, 0], + layout="NCHW", + out_layout="NCHW", + ) + R.output(gv2) + return gv2 + + verify(Input) + + +@tvm.testing.requires_opencl +def test_conv2d_avgpool2d(): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2 = R.nn.adaptive_avg_pool2d(gv, output_size=[13, 13], layout="NCHW") + R.output(gv2) + return gv2 + + verify(Input) + + +@tvm.testing.requires_opencl +def test_conv2d_softmax(): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2 = R.nn.softmax(gv, axis=1) + R.output(gv2) + return gv2 + + verify(Input) + + +@tvm.testing.requires_opencl +def test_conv2d_layernorm(): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), + w: 
R.Tensor((4, 16, 3, 3), "float32"), + gamma: R.Tensor((26, 26), dtype="float32"), + beta: R.Tensor((26, 26), dtype="float32"), + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 26, 26), dtype="float32") = R.nn.layer_norm( + gv, gamma, beta, axes=[-2, -1] + ) + R.output(gv2) + return gv2 + + verify(Input) + + +@tvm.testing.requires_opencl +def test_binary_broadcast(): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), + w: R.Tensor((4, 16, 3, 3), "float32"), + bias: R.Tensor((26, 26), "float32"), + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 26, 26), "float32") = R.add(gv, bias) + R.output(gv2) + return gv2 + + verify(Input) + + +@tvm.testing.requires_opencl +def test_binary_ewise_scalar(): + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "float32") + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv: R.Tensor((2, 4, 26, 26), "float32") = R.nn.conv2d(x, w, out_dtype="float32") + gv2: R.Tensor((2, 4, 26, 26), "float32") = R.add(gv, R.const(1, "float32")) + R.output(gv2) + return gv2 + + verify(Input) + + +@tvm.testing.requires_opencl +def test_residual_block(): + """ + - some kind of residual block followed by convolution to have texture after residual block + - scalar data type verification which should be mapped to global memory scope + layout_transform (NCHW->NCHW4c) + | <- buffer + conv2d (1) <- to get textures as output + / \ + conv2d (2) | + \ / + add <- add should be fused into conv2d (2) + multiply to scalar <- buffer to the input of multiply scalar value + relu + | <- texture in intermediate tensor + conv2d (3) + relu + | <- buffer + layout_transform (NCHW4c->NCHW) + """ + + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 32, 40, 40), "float32"), + w1: R.Tensor((32, 32, 2, 2), "float32"), + w2: R.Tensor((32, 32, 1, 1), "float32"), + w3: R.Tensor((32, 32, 2, 2), "float32"), + bias: R.Tensor((1, 32, 1, 1), "float32"), + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv = R.nn.conv2d(x, w1, strides=[2, 2], out_dtype="float32") + gv1 = R.add(gv, bias) + gv2 = R.nn.relu(gv1) + gv3 = R.nn.conv2d(gv2, w2, strides=[1, 1], out_dtype="float32") + bias_1 = R.multiply(bias, R.const(0.15, "float32")) + gv4 = R.add(gv3, bias_1) + gv5 = R.nn.relu(gv4) + gv6 = R.nn.conv2d(gv5, w3, strides=[2, 2], out_dtype="float32") + gv7 = R.nn.relu(gv6) + R.output(gv7) + return gv7 + + verify(Input) + + +@tvm.testing.requires_opencl +def test_conv2d_conv2d_fallback_to_buffer_conv2d(): + """ + layout_transform (NCHW->NCHW4c) + | <- texture + conv2d (1) <- textures as output + / \ + conv2d (2) conv2d (3) <- conv2d (2) emits texture, conv2d (3) emits buffer + \ / <- concat shouldn't support textures here + concatenation + | <- buffer + layout_transform (NCHW4c->NCHW) + """ + + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 32, 40, 40), "float32"), + w1: R.Tensor((96, 32, 2, 2), "float32"), + w2: R.Tensor((32, 96, 2, 2), "float32"), + w3: R.Tensor((5, 96, 2, 2), "float32"), + bias1: R.Tensor((1, 96, 1, 1), "float32"), + bias2: R.Tensor((1, 32, 1, 1), "float32"), + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv = R.nn.conv2d(x, w1, strides=[2, 2], 
out_dtype="float32") + gv1 = R.add(gv, bias1) + gv2 = R.nn.relu(gv1) + gv3 = R.nn.conv2d(gv2, w2, strides=[2, 2], out_dtype="float32") + gv4 = R.add(gv3, bias2) + gv5 = R.nn.relu(gv4) + gv6 = R.nn.conv2d(gv2, w3, strides=[2, 2], out_dtype="float32") + gv7 = R.concat((gv3, gv6), axis=1) + R.output(gv7) + return gv7 + + verify(Input) + + +@tvm.testing.requires_opencl +def test_conv2d_conv2d_conv2d_concat(): + """ + layout_transform (NCHW->NCHW4c) + | <- texture + conv2d (1) <- textures as output + / \ + conv2d (2) conv2d (3) + \ / <- concat does support textures here + concatenation + | <- buffer + layout_transform (NCHW4c->NCHW) + """ + + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 32, 40, 40), "float32"), + w1: R.Tensor((96, 32, 2, 2), "float32"), + w2: R.Tensor((32, 96, 2, 2), "float32"), + w3: R.Tensor((8, 96, 2, 2), "float32"), + bias1: R.Tensor((1, 96, 1, 1), "float32"), + bias2: R.Tensor((1, 32, 1, 1), "float32"), + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv = R.nn.conv2d(x, w1, strides=[2, 2], out_dtype="float32") + gv1 = R.add(gv, bias1) + gv2 = R.nn.relu(gv1) + gv3 = R.nn.conv2d(gv2, w2, strides=[2, 2], out_dtype="float32") + gv4 = R.add(gv3, bias2) + gv5 = R.nn.relu(gv4) + gv6 = R.nn.conv2d(gv2, w3, strides=[2, 2], out_dtype="float32") + gv7 = R.concat((gv3, gv6), axis=1) + R.output(gv7) + return gv7 + + verify(Input) + + +@tvm.testing.requires_opencl +def _test_pooling_branching_texture_params(): + """ + Verification of the pooling and many branches having textures + layout_transform (NCHW->NCHW4c) + | <- texture + conv2d (0) <- to get textures + | <- textures + pooling + / \ \ <- textures + conv2d (1) conv2d (2) conv2d (3) + \ / | + add | <- to have the only one output, will be fused + \ / + add <- to have the only one output, will be fused + | <- buffer + layout_transform (NCHW4c->NCHW) + """ + + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((2, 32, 40, 40), "float32"), + w1: R.Tensor((32, 32, 1, 1), "float32"), + w2: R.Tensor((32, 32, 2, 2), "float32"), + w3: R.Tensor((32, 32, 1, 1), "float32"), + w4: R.Tensor((32, 32, 2, 2), "float32"), + bias1: R.Tensor((1, 32, 1, 1), "float32"), + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + gv = R.nn.conv2d(x, w1, strides=[1, 1], out_dtype="float32") + gv1 = R.nn.max_pool2d(gv, pool_size=[2, 2], strides=[2, 2]) + gv2 = R.nn.conv2d( + gv1, w2, padding=[0, 0, 1, 1], strides=[1, 1], out_dtype="float32" + ) + gv3 = R.add(gv2, bias1) + gv4 = R.nn.relu(gv3) + gv5 = R.nn.conv2d( + gv1, w3, padding=[0, 0, 0, 0], strides=[1, 1], out_dtype="float32" + ) + gv6 = R.nn.conv2d( + gv1, w4, padding=[0, 1, 1, 0], strides=[1, 1], out_dtype="float32" + ) + gv7 = R.nn.relu(gv6) + gv8 = R.add(gv2, gv5) + gv9 = R.add(gv8, gv6) + R.output(gv9) + return gv9 + + verify(Input) + + +@tvm.testing.requires_opencl +def _test_injective_inputs1(): + """ + Input + / \ + / | + | / + conv2d (1) / + | / + conv2d (2) mean + / \ / + | | \ / + | | (3) add + | | | + | \ / + \ mul + \ / + add + + """ + + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((1, 4, 40, 40), "float32"), + w1: R.Tensor((4, 4, 3, 3), "float32"), + w2: R.Tensor((4, 4, 3, 3), "float32"), + w3: R.Tensor((4, 4, 3, 3), "float32"), + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + mean = R.mean(x, axis=1, keepdims=True) + conv1 = R.nn.conv2d( + x, w1, padding=[1, 1, 1, 1], strides=[1, 1], out_dtype="float32" + ) + conv2 = R.nn.conv2d( + conv1, w2, padding=[1, 1, 1, 1], strides=[1, 1], 
out_dtype="float32" + ) + ad3 = R.add(conv1, conv2) + ad1 = R.add(mean, conv1) + ad2 = R.multiply(ad1, conv2) + gv = R.add(ad3, ad2) + R.output(gv) + return gv + + verify(Input) + + +@tvm.testing.requires_opencl +def _test_injective_nwo_inputs2(): + """ + Input + / \ + | \ + conv2d \ + | / + conv2d mean / + / \ / + add | \ | + | | \ | + | | \ / + | | (3) add + | | | + | \ / + | \ / + \ mul + \ / + add + + """ + + @I.ir_module + class Input: + @R.function + def main( + x: R.Tensor((1, 4, 40, 40), "float32"), + w1: R.Tensor((4, 4, 3, 3), "float32"), + w2: R.Tensor((4, 4, 3, 3), "float32"), + w3: R.Tensor((4, 4, 3, 3), "float32"), + ) -> R.Tensor(None, "float32", ndim=4): + with R.dataflow(): + mean = R.mean(x, axis=1, keepdims=True) + conv1 = R.nn.conv2d( + x, w1, padding=[1, 1, 1, 1], strides=[1, 1], out_dtype="float32" + ) + conv2 = R.nn.conv2d( + conv1, w2, padding=[1, 1, 1, 1], strides=[1, 1], out_dtype="float32" + ) + ad3 = R.add(conv1, conv2) + ad1 = R.add(mean, conv1) + ad2 = R.multiply(ad1, conv2) + gv = R.add(ad2, ad3) + R.output(gv) + return gv + + verify(Input) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/relax/texture/test_texture_nd.py b/tests/python/relax/texture/test_texture_nd.py new file mode 100644 index 000000000000..8a8109873cf2 --- /dev/null +++ b/tests/python/relax/texture/test_texture_nd.py @@ -0,0 +1,168 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import os +import tvm +import tvm.testing +import pytest +import tempfile +import numpy as np + +from tvm import ( + relax, + IRModule, +) +from tvm.relax.transform.legalize_ops import adreno as legalize_adreno +from tvm.script import ir as I, tir as T +from tvm.target import Target +from tvm.contrib import ndk +from tvm import tir, DataType + + +class RemoteConnection: + def __init__(self): + self.RPC_TRACKER_HOST = os.getenv("TVM_TRACKER_HOST", "localhost") + self.RPC_TRACKER_PORT = int(os.getenv("TVM_TRACKER_PORT", 7979)) + self.RPC_KEY = os.getenv("RPC_DEVICE_KEY", "android") + self.tracker = tvm.rpc.connect_tracker(self.RPC_TRACKER_HOST, self.RPC_TRACKER_PORT) + + def __enter__(self): + self.remote = self.tracker.request(self.RPC_KEY, priority=0, session_timeout=600) + return self.remote + + def __exit__(self, exc_type, exc_value, traceback): + self.remote.get_function("CloseRPCConnection")() + + +def preprocess_pipeline(mod: IRModule) -> IRModule: + desired_layouts = {"relax.nn.conv2d": ["NCHW16c", "OIHW16o", "NCHW16c"]} + seq = tvm.transform.Sequential( + [ + tvm.tir.transform.BindTarget(Target.current(allow_none=False)), + tvm.relax.transform.FoldConstant(), + tvm.relax.transform.DecomposeOpsForInference(), + tvm.relax.transform.FoldConstant(), + tvm.tir.transform.BindTarget(tvm.target.Target.current(allow_none=False)), + tvm.relax.transform.ConvertLayout(desired_layouts), + tvm.relax.transform.Normalize(), + tvm.relax.transform.FoldConstant(), + tvm.relax.transform.LegalizeOps(), + tvm.relax.transform.LegalizeOps( + {"relax.nn.conv2d": legalize_adreno.conv2d_NCHWc_OIHWo} + ), + tvm.relax.transform.FoldConstant(), + tvm.relax.transform.AnnotateTIROpPattern(), + tvm.relax.transform.FuseOps(), + tvm.relax.transform.FuseTIR(), + tvm.relax.transform.DeadCodeElimination(), + tvm.relax.transform.Normalize(), + ] + ) + mod = seq(mod) + return mod + + +def postprocess_pipeline(mod: IRModule) -> IRModule: + seq = tvm.transform.Sequential( + [ + tvm.relax.transform.ToNonDataflow(), + tvm.relax.transform.RemovePurityChecking(), + tvm.relax.transform.CallTIRRewrite(), + tvm.relax.transform.Normalize(), + tvm.relax.transform.StaticPlanBlockMemory(), + tvm.relax.transform.LowerAllocTensor(), + tvm.relax.transform.KillAfterLastUse(), + tvm.relax.transform.LowerRuntimeBuiltin(), + tvm.relax.transform.VMShapeLower(), + tvm.relax.transform.AttachGlobalSymbol(), + ] + ) + mod = seq(mod) + return mod + + +@tvm.testing.requires_rpc +@tvm.testing.requires_opencl +@pytest.mark.parametrize( + "target", [Target("opencl -device=adreno", "llvm -mtriple=aarch64-linux-android")] +) +@pytest.mark.parametrize("dtype", ["int8", "float16", "int16", "float32", "int32"]) +@pytest.mark.parametrize("channel_size", [64, 128]) +@pytest.mark.parametrize("read_width", [1, 2, 4, 8, 16]) +def test_texture_copy(target, dtype, channel_size, read_width): + M, N, K = (256, 1024, 128) + lanes = channel_size // DataType(dtype).bits + if read_width > lanes: + return + + @I.ir_module + class TextureCopy: + @T.prim_func + def main(A: T.Buffer((M, N), dtype), B: T.Buffer((M, N), dtype)): + T.func_attr({"global_symbol": "main"}) + for li, lj in T.grid(M, N): + with T.block("Copy"): + i, j = T.axis.remap("SS", [li, lj]) + B[i, j] = A[i, j] + + def schedule_texture_read(sch: tir.Schedule): + B_blk = sch.get_block("Copy") + Ai_block = sch.cache_read(B_blk, 0, "global.texture") + sch.transform_layout(Ai_block, ("write", 0), lambda i, j: (i, j // lanes, j % lanes)) + + def schedule_default(blk, lanes): + i, j = sch.get_loops(blk) + jo, 
jv = sch.split(j, [None, lanes]) + + b = sch.fuse(i, jo) + bx, tx = sch.split(b, [None, 256]) + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") + + sch.vectorize(jv) + + schedule_default(Ai_block, lanes) + schedule_default(B_blk, read_width) + + mod = TextureCopy + with target: + mod = preprocess_pipeline(mod) + sch = tir.Schedule(mod) + schedule_texture_read(sch) + mod = postprocess_pipeline(sch.mod) + + ex = relax.build(mod, target) + load_path = "vm_library.so" + inputs = [np.random.randint(0, 128, (M, N)).astype(dtype), np.zeros((M, N), dtype)] + with RemoteConnection() as remote: + with tempfile.TemporaryDirectory() as temp_dir: + path = temp_dir + "/" + load_path + ex.export_library(path, fcompile=ndk.create_shared, options=["-shared", "-fPIC", "-lm"]) + + remote.upload(path) + rexec = remote.load_module(load_path) + dev = remote.cl() + + vm = relax.VirtualMachine(rexec, [dev, dev, dev]) + inps = [tvm.runtime.tensor(inp, dev) for inp in inputs] + vm["main"](*inps) + + np.testing.assert_equal(inps[-1].numpy(), inps[0].numpy()) + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/python/tir-transform/test_tir_transform_inject_texture_alloc.py b/tests/python/tir-transform/test_tir_transform_inject_texture_alloc.py new file mode 100644 index 000000000000..4a81b1a74632 --- /dev/null +++ b/tests/python/tir-transform/test_tir_transform_inject_texture_alloc.py @@ -0,0 +1,87 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
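+# These cases lower small PrimFuncs and check that buffers declared with a
+# "global.texture*" storage scope are materialized through the
+# tir.nd_mem_alloc_with_scope intrinsic (surfacing as a LetStmt over the
+# allocation call), while buffers in the default scope keep a plain Allocate.
+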
+import tvm +import tvm.testing +from tvm import te +from tvm.script import tir as T + + +def test_decl_buffer(): + """decl_buffer with texture scope.""" + + @T.prim_func + def func(): + A = T.decl_buffer([32, 32, 32, 4], dtype="float32", scope="global.texture") + A[0, 0, 0, 0] = 0 + + lowered = tvm.lower(func)["main"] + assert isinstance(lowered.body, tvm.tir.LetStmt) + assert isinstance(lowered.body.value, tvm.tir.expr.Call) + assert lowered.body.value.op.name == "tir.nd_mem_alloc_with_scope" + + +def test_alloc_buffer(): + """alloc_buffer with texture scope.""" + + @T.prim_func + def func(): + A = T.alloc_buffer([32, 32, 32, 4], dtype="float32", scope="global.texture-weight") + A[0, 0, 0, 0] = 0 + + lowered = tvm.lower(func)["main"] + assert isinstance(lowered.body, tvm.tir.LetStmt) + assert isinstance(lowered.body.value, tvm.tir.expr.Call) + assert lowered.body.value.op.name == "tir.nd_mem_alloc_with_scope" + + +def test_alloc_buffer_negative_test(): + """Shouldn't ave texture intrensic for general use.""" + + @T.prim_func + def func(): + A = T.alloc_buffer([32, 32, 32, 4], dtype="float32") + A[0, 0, 0, 0] = 0 + + lowered = tvm.lower(func)["main"] + assert isinstance(lowered.body, tvm.tir.Allocate) + + +def test_with_block(): + """Scoped with block.""" + + @T.prim_func + def func( + A: T.Buffer((T.int64(1), T.int64(16), T.int64(16)), "float16"), + B: T.Buffer((T.int64(32), T.int64(32)), "float16"), + C: T.Buffer((T.int64(1), T.int64(8), T.int64(8)), "float16"), + ): + with T.block("block"): + A = T.alloc_buffer([1, 16, 16], dtype="float16") + B = T.alloc_buffer([32, 32], dtype="float16") + C = T.alloc_buffer([1, 8, 8], dtype="float16") + D = T.alloc_buffer([32, 32, 32, 4], dtype="float16", scope="global.texture-weight") + T.evaluate(D[0, 0, 0, 0]) + + lowered = tvm.lower(func)["main"] + assert isinstance(lowered.body, tvm.tir.LetStmt) + assert isinstance(lowered.body.value, tvm.tir.expr.Call) + assert lowered.body.value.op.name == "tir.nd_mem_alloc_with_scope" + assert lowered.body.var.name == "D" + + +if __name__ == "__main__": + tvm.testing.main() diff --git a/tests/scripts/setup-adreno-env.sh b/tests/scripts/setup-adreno-env.sh index a35a633e1dfd..a401fb808acb 100755 --- a/tests/scripts/setup-adreno-env.sh +++ b/tests/scripts/setup-adreno-env.sh @@ -103,6 +103,7 @@ case ${ENVIRONMENT} in adb shell "mkdir -p ${TARGET_FOLDER}" adb push build-adreno-target/tvm_rpc ${TARGET_FOLDER}/tvm_rpc-${USER} adb push build-adreno-target/libtvm_runtime.so ${TARGET_FOLDER} + adb push build-adreno-target/lib/libtvm_ffi.so ${TARGET_FOLDER} if [ -f ${CPP_LIB} ] ; then adb push ${CPP_LIB} ${TARGET_FOLDER} fi diff --git a/tests/scripts/task_build_adreno_bins.sh b/tests/scripts/task_build_adreno_bins.sh index 8b85a27277e0..bc5a075b2325 100755 --- a/tests/scripts/task_build_adreno_bins.sh +++ b/tests/scripts/task_build_adreno_bins.sh @@ -39,7 +39,6 @@ echo set\(USE_OPENCL ON\) >> config.cmake fi echo set\(USE_RPC ON\) >> config.cmake echo set\(USE_CPP_RPC ON\) >> config.cmake -#echo set\(USE_CPP_RTVM ON\) >> config.cmake echo set\(USE_LIBBACKTRACE AUTO\) >> config.cmake echo set\(USE_KALLOC_ALIGNMENT 32\) >> config.cmake @@ -52,6 +51,9 @@ echo set\(USE_OPENCL_GTEST ON\) >> config.cmake echo set\(USE_OPENCL_EXTN_QCOM ON\) >> config.cmake cmake -DANDROID_ABI=arm64-v8a \ + -DCMAKE_SYSTEM_NAME=Android \ + -DCMAKE_ANDROID_ARCH_ABI="arm64-v8a" \ + -DCMAKE_SYSROOT="${ANDROID_NDK_HOME}/toolchains/llvm/prebuilt/linux-x86_64/sysroot/" \ -DANDROID_PLATFORM=android-28 \ -DCMAKE_SYSTEM_VERSION=1 \ 
-DCMAKE_FIND_ROOT_PATH="${ADRENO_OPENCL}" \ diff --git a/tests/scripts/task_python_adreno.sh b/tests/scripts/task_python_adreno.sh index 1714a3c06358..b381fddc2427 100755 --- a/tests/scripts/task_python_adreno.sh +++ b/tests/scripts/task_python_adreno.sh @@ -84,5 +84,14 @@ for node_id in $RELAX_TESTS; do i=$((i+1)) done +# Texture test +RELAX_TESTS=$(./ci/scripts/jenkins/pytest_ids.py --folder tests/python/relax/texture 2> /dev/null | grep -v dlerror) +i=0 +for node_id in $RELAX_TESTS; do + echo "$node_id" + CXX=${TVM_NDK_CC} run_pytest ctypes "$TVM_INTEGRATION_TESTSUITE_NAME-texture-relax-$i" "$node_id" --reruns=0 + i=$((i+1)) +done + kill ${TRACKER_PID} kill ${DEVICE_PID} From ed12f36d115db17b05617252f37efcc75d161382 Mon Sep 17 00:00:00 2001 From: Siva Date: Fri, 28 Nov 2025 13:59:14 +0530 Subject: [PATCH 02/17] Combine analysis code --- python/tvm/dlight/analysis/common_analysis.py | 151 +++++- python/tvm/dlight/base/__init__.py | 9 - python/tvm/dlight/base/analysis.py | 442 ------------------ python/tvm/dlight/base/common_schedules.py | 2 +- python/tvm/ir/global_info.py | 1 - 5 files changed, 150 insertions(+), 455 deletions(-) delete mode 100644 python/tvm/dlight/base/analysis.py diff --git a/python/tvm/dlight/analysis/common_analysis.py b/python/tvm/dlight/analysis/common_analysis.py index c7374e326e9c..971166e6e017 100644 --- a/python/tvm/dlight/analysis/common_analysis.py +++ b/python/tvm/dlight/analysis/common_analysis.py @@ -17,8 +17,8 @@ # pylint: disable=missing-function-docstring, missing-class-docstring """Analysis on TIR blocks, loops and functions.""" -from typing import List, Optional, Set, Union - +from collections import namedtuple +from typing import List, Optional, Set, Union, Dict, Tuple from typing_extensions import Literal from tvm_ffi import get_global_func @@ -26,6 +26,7 @@ from tvm.target.target import Target from tvm.tir import Schedule from tvm.tir.schedule import BlockRV +from tvm.runtime import DataType class IterInfo: @@ -61,6 +62,108 @@ def __repr__(self) -> str: return str(self) +get_blockrealize = get_global_func("tir.schedule.GetBlockRealize") + + +# TODO: Shift Vlen Calculation here... +class BufferInfo: + "Information about Buffer. 
Provides useful analysis" + buf_region: tir.BufferRegion + shape: Tuple[int] + assoc_lps: List[Union[tir.schedule.LoopRV, None]] + assoc_lps_info: List[Union[tir.For, None]] + + # BufferIndex Types + Index = namedtuple("Index", ["sub"]) # c + RemIndex = namedtuple("RemIndex", ["sub", "div"]) # c%len + DivIndex = namedtuple("DivIndex", ["sub", "div"]) # c//len + MergeIndex = namedtuple("MulIndex", ["dom", "mul", "sub"]) # co*len + cb + BufIndex = List[Union[Index, RemIndex, DivIndex, MergeIndex, None]] + + def __init__( + self, + sch: tir.Schedule, + block_rv: tir.schedule.BlockRV, + buf_region: tir.BufferRegion, + lps: Union[List[tir.schedule.LoopRV], None], + ): + block = sch.get(block_rv) + if lps is None: + lps = sch.get_loops(block_rv) + loops = [sch.get(lp) for lp in lps] + iter_vars = [Var.var for Var in block.iter_vars] + iter_values = get_blockrealize(sch, block_rv).iter_values + lpvar_lp = dict([loop.loop_var, lp] for loop, lp in zip(loops, lps)) + var_lp = dict(zip(iter_vars, [lpvar_lp.get(val, None) for val in iter_values])) + + def extract_index_types(buf: tir.BufferRegion) -> BufIndex: + buf_index = [] + for expr in buf.region: + expr = expr.min + dim = None + if isinstance(expr, tir.expr.Add) and isinstance(expr.b, tir.expr.Var): + var_add = expr.b + if ( + isinstance(expr, tir.expr.Mul) + and isinstance(expr.a, tir.expr.Var) + and isinstance(expr.b, tir.expr.IntImm) + ): + mul = expr.b + var_mul = expr.a + dim = MergeIndex(var_mul, mul, var_add) + elif ( + isinstance(expr, tir.expr.FloorMod) + and isinstance(expr.a, tir.expr.Var) + and isinstance(expr.b, tir.expr.IntImm) + ): + dim = RemIndex(expr.a, expr.b) + elif ( + isinstance(expr, tir.expr.FloorDiv) + and isinstance(expr.a, tir.expr.Var) + and isinstance(expr.b, tir.expr.IntImm) + ): + dim = DivIndex(expr.a, expr.b) + elif isinstance(expr, tir.expr.Var): + dim = Index(expr) + buf_index.append(dim) + return buf_index + + indexes = extract_index_types(buf_region) + assoc_lps = [ + ( + var_lp.get(getattr(idx, "sub"), None) + if not isinstance(idx, DivIndex) and not idx is None + else None + ) + for idx in indexes + ] + + self.buf_region = buf_region + self.assoc_lps = assoc_lps + self.assoc_lps_info = [(sch.get(lp) if lp is not None else None) for lp in assoc_lps] + self.shape = buf_region.buffer.shape + + def get_scope(self) -> str: + return self.buf_region.buffer.scope() + + def get_vecsize(self, buf_index: int = 0, vbits: int = 128): + if self.assoc_lps_info[-1] is None: + return None + + vlp_extent = int(self.assoc_lps_info[-1].extent) & ~( + int(self.assoc_lps_info[-1].extent) - 1 + ) + vbuf_extent = int(self.shape[-1]) & ~(int(self.shape[-1]) - 1) + + return min(vlp_extent, vbuf_extent, vbits // DataType(self.buf_region.buffer.dtype).bits) + + def __str__(self) -> str: + return f"BufferInfo({self.buf_region})" + + def __repr__(self) -> str: + return str(self) + + class BlockInfo: """Information about a TIR block.""" @@ -68,6 +171,9 @@ class BlockInfo: iters: List[IterInfo] block_rv: tir.schedule.BlockRV _reduction_block: bool + read_bufs: List[BufferInfo] + write_bufs: List[BufferInfo] + def __init__( self, @@ -114,11 +220,52 @@ def _check_unit_var_range(dom: ir.Range, var: tir.Var) -> bool: return False return True + def get_loops(self) -> List[tir.schedule.LoopRV]: + return [iter_info.loop_rv for iter_info in self.iters] + def is_reduction(self) -> bool: """Whether the block is a reduction workload.""" # TODO(@junrushao): distinguish GEMV and reduction return self._reduction_block + def is_layout_transform(self, 
sch: tir.Schedule) -> bool: + """Whether the Block can be considered having a Layout Transform Pattern""" + block_stmt = sch.get(self.block_rv) + lps = sch.get_loops(block_rv) + read_bufs = [BufferInfo(sch, self.block_rv, buf, lps) for buf in block_stmt.reads] + write_bufs = [BufferInfo(sch, self.block_rv, buf, lps) for buf in block_stmt.writes] + return ( + all(k == "S" for k in self.dom_kind()) + and len(write_bufs) == 1 + and len(read_bufs) == 1 + and not self.is_elementwise() + and not get_global_func("tir.schedule.HasIfThenElse")(sch.get(self.block_rv)) + ) + + def is_data_pad(self, sch: tir.Schedule) -> bool: + """Whether the Block can be considered having a data pad pattern""" + block_stmt = sch.get(self.block_rv) + lps = sch.get_loops(block_rv) + read_bufs = [BufferInfo(sch, self.block_rv, buf, lps) for buf in block_stmt.reads] + write_bufs = [BufferInfo(sch, self.block_rv, buf, lps) for buf in block_stmt.writes] + return ( + all(k == "S" for k in self.dom_kind()) + and len(write_bufs) == 1 + and len(read_bufs) == 1 + and not self.is_elementwise() + and len(self.write_bufs[0].buf_region.region) + == len(self.read_bufs[0].buf_region.region) + and get_global_func("tir.schedule.HasIfThenElse")(sch.get(self.block_rv)) + ) + + def is_convolution(self) -> bool: + """Whether a Block can be considered having Convolution Pattern""" + raise NotImplementedError + + def is_pool(self) -> bool: + """Whether a Block can be considered having Pooling Pattern""" + raise NotImplementedError + def is_gemv(self) -> bool: """Whether the block is a GEMV workload.""" raise NotImplementedError diff --git a/python/tvm/dlight/base/__init__.py b/python/tvm/dlight/base/__init__.py index d693fa2fc748..9d90c4f8e171 100644 --- a/python/tvm/dlight/base/__init__.py +++ b/python/tvm/dlight/base/__init__.py @@ -25,12 +25,3 @@ max_threads_per_block, suggest_threads_per_block, ) -from .analysis import ( - BlockInfo, - IterInfo, - collect_block_iter_vars_used_in_access_region, - collect_vars_used_in_prim_expr, - detect_dominant_read, - is_broadcast_epilogue, - normalize_prim_func, -) diff --git a/python/tvm/dlight/base/analysis.py b/python/tvm/dlight/base/analysis.py deleted file mode 100644 index 4ba025f8dabf..000000000000 --- a/python/tvm/dlight/base/analysis.py +++ /dev/null @@ -1,442 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-"""Analysis on TIR blocks, loops and functions.""" - -# pylint: disable=unused-argument, unused-variable - -from collections import namedtuple -from typing import List, Optional, Set, Union, Dict, Tuple -from typing_extensions import Literal -from tvm_ffi import get_global_func - -from tvm import ir, tir -from tvm.tir import Schedule -from tvm.target.target import Target -from tvm.tir.schedule import BlockRV -from tvm.runtime import DataType - - -class IterInfo: - """Information about a loop/iter var.""" - - kind: Literal["S", "R", "O"] - var: tir.Var - _dom: tir.PrimExpr - loop_rv: tir.schedule.LoopRV - - def __init__( - self, - kind: Literal["S", "R", "O"], - var: tir.Var, - dom: tir.PrimExpr, - loop_rv: tir.schedule.LoopRV, - ): - """Construct an IterInfo object.""" - self.kind = kind - self.var = var - self._dom = dom - self.loop_rv = loop_rv - - @property - def dom(self) -> Union[int, tir.PrimExpr]: - """The iteration domain of the loop.""" - return int(self._dom) if isinstance(self._dom, tir.IntImm) else self._dom - - def __str__(self) -> str: - return f'Iter("{self.kind}", {self.dom})' - - def __repr__(self) -> str: - return str(self) - - -get_blockrealize = get_global_func("tir.schedule.GetBlockRealize") - - -# TODO: Shift Vlen Calculation here... -class BufferInfo: - "Information about Buffer. Provides useful analysis" - buf_region: tir.BufferRegion - shape: Tuple[int] - assoc_lps: List[Union[tir.schedule.LoopRV, None]] - assoc_lps_info: List[Union[tir.For, None]] - - def __init__( - self, - sch: tir.Schedule, - block_rv: tir.schedule.BlockRV, - buf_region: tir.BufferRegion, - lps: Union[List[tir.schedule.LoopRV], None], - ): - block = sch.get(block_rv) - if lps is None: - lps = sch.get_loops(block_rv) - loops = [sch.get(lp) for lp in lps] - iter_vars = [Var.var for Var in block.iter_vars] - iter_values = get_blockrealize(sch, block_rv).iter_values - lpvar_lp = dict([loop.loop_var, lp] for loop, lp in zip(loops, lps)) - var_lp = dict(zip(iter_vars, [lpvar_lp.get(val, None) for val in iter_values])) - - def extract_index_types(buf: tir.BufferRegion) -> BufIndex: - buf_index = [] - for expr in buf.region: - expr = expr.min - dim = None - if isinstance(expr, tir.expr.Add) and isinstance(expr.b, tir.expr.Var): - var_add = expr.b - if ( - isinstance(expr, tir.expr.Mul) - and isinstance(expr.a, tir.expr.Var) - and isinstance(expr.b, tir.expr.IntImm) - ): - mul = expr.b - var_mul = expr.a - dim = MergeIndex(var_mul, mul, var_add) - elif ( - isinstance(expr, tir.expr.FloorMod) - and isinstance(expr.a, tir.expr.Var) - and isinstance(expr.b, tir.expr.IntImm) - ): - dim = RemIndex(expr.a, expr.b) - elif ( - isinstance(expr, tir.expr.FloorDiv) - and isinstance(expr.a, tir.expr.Var) - and isinstance(expr.b, tir.expr.IntImm) - ): - dim = DivIndex(expr.a, expr.b) - elif isinstance(expr, tir.expr.Var): - dim = Index(expr) - buf_index.append(dim) - return buf_index - - indexes = extract_index_types(buf_region) - assoc_lps = [ - ( - var_lp.get(getattr(idx, "sub"), None) - if not isinstance(idx, DivIndex) and not idx is None - else None - ) - for idx in indexes - ] - - self.buf_region = buf_region - self.assoc_lps = assoc_lps - self.assoc_lps_info = [(sch.get(lp) if lp is not None else None) for lp in assoc_lps] - self.shape = buf_region.buffer.shape - - def get_scope(self) -> str: - return self.buf_region.buffer.scope() - - def get_vecsize(self, buf_index: int = 0, vbits: int = 128): - if self.assoc_lps_info[-1] is None: - return None - - vlp_extent = int(self.assoc_lps_info[-1].extent) & ~( - 
int(self.assoc_lps_info[-1].extent) - 1 - ) - vbuf_extent = int(self.shape[-1]) & ~(int(self.shape[-1]) - 1) - - return min(vlp_extent, vbuf_extent, vbits // DataType(self.buf_region.buffer.dtype).bits) - - def __str__(self) -> str: - return f"BufferInfo({self.buf_region})" - - def __repr__(self) -> str: - return str(self) - - -class BlockInfo: - """Information about a TIR block. Provides useful analysis about the block.""" - - name: str - iters: List[IterInfo] - block_stmt: tir.Block - block_rv: tir.schedule.BlockRV - read_bufs: List[BufferInfo] - write_bufs: List[BufferInfo] - producers: List[tir.Block] - consumers: List[tir.Block] - - def __init__( - self, - sch: tir.Schedule, - block_rv: tir.schedule.BlockRV, - ): - """Construct a BlockInfo object.""" - block_stmt = sch.get(block_rv) - - def _iter_kind(loop: tir.IterVar) -> str: - return {tir.IterVar.DataPar: "S", tir.IterVar.CommReduce: "R"}.get(loop.iter_type, "O") - - lps = sch.get_loops(block_rv) - iter_vars = block_stmt.iter_vars - - self.name = sch.get(block_rv).name_hint - self.iters = [ - IterInfo( - kind=_iter_kind(iter_var), - var=iter_var.var, - dom=iter_var.dom.extent, - loop_rv=loop_rv, - ) - for loop_rv, iter_var in zip(lps, iter_vars) - ] - self.block_stmt = block_stmt - self.block_rv = block_rv - self.read_bufs = [get_buffer_info(sch, block_rv, buf, lps) for buf in block_stmt.reads] - self.write_bufs = [get_buffer_info(sch, block_rv, buf, lps) for buf in block_stmt.writes] - self.producers = sch.get_producers(block_rv) - self.consumers = sch.get_consumers(block_rv) - - def dom(self) -> List[Union[int, tir.PrimExpr]]: - """The iteration domain of the block.""" - return [i.dom for i in self.iters] - - def dom_kind(self) -> str: - """The iteration domain kind of the block, for example, SSSS, SSSR.""" - return "".join(i.kind for i in self.iters) - - def is_injective(self) -> bool: - """Whether the block is injective, i.e. all its iteration domains are injective.""" - return all(k == "S" for k in self.dom_kind()) - - def is_elementwise(self) -> bool: - """Whether the block is elementwise, i.e. trivial mapping between read/write region""" - - if not self.is_injective() or len(self.write_bufs) != 1: - return False - - w_region = self.write_bufs[0].buf_region.region - for read_buf in self.read_bufs: - r_region = read_buf.buf_region.region - if len(r_region) != len(w_region): - return False - for r_var, w_var in zip(r_region, w_region): - if not r_var == w_var: - return False - return True - - def is_broadcast(self) -> bool: - """Whether the block is elementwise, i.e. 
trivial mapping between read/write region""" - - if not self.is_injective() or len(self.write_bufs) != 1: - return False - - w_region = self.write_bufs[0].buf_region.region - for read_buf in self.read_bufs: - r_region = read_buf.buf_region.region - for r_var in r_region: - if r_var not in w_region: - return False - return True - - def get_loops(self) -> List[tir.schedule.LoopRV]: - return [iter_info.loop_rv for iter_info in self.iters] - - def is_reduction(self) -> bool: - """Whether the block is a reduction workload.""" - return all(k == "S" or k == "R" for k in self.dom_kind()) and any( - k == "R" for k in self.dom_kind() - ) - - def is_layout_transform(self) -> bool: - """Whether the Block can be considered having a Layout Transform Pattern""" - return ( - all(k == "S" for k in self.dom_kind()) - and len(self.write_bufs) == 1 - and len(self.read_bufs) == 1 - and not self.is_elementwise() - and not get_global_func("tir.schedule.HasIfThenElse")(self.block_stmt) - ) - - def is_data_pad(self) -> bool: - """Whether the Block can be considered having a data pad pattern""" - return ( - all(k == "S" for k in self.dom_kind()) - and len(self.write_bufs) == 1 - and len(self.read_bufs) == 1 - and not self.is_elementwise() - and len(self.write_bufs[0].buf_region.region) - == len(self.read_bufs[0].buf_region.region) - and get_global_func("tir.schedule.HasIfThenElse")(self.block_stmt) - ) - - def is_convolution(self) -> bool: - """Whether a Block can be considered having Convolution Pattern""" - raise NotImplementedError - - def is_pool(self) -> bool: - """Whether a Block can be considered having Pooling Pattern""" - raise NotImplementedError - - def is_gemv(self) -> bool: - """Whether the block is a GEMV workload.""" - raise NotImplementedError - - def is_gemm(self) -> bool: - """Whether the block is a GEMM workload.""" - raise NotImplementedError - - def check_op_name(self, name: str): - raise NotImplementedError - - def __str__(self) -> str: - return f'BlockInfo("{self.name}", "{self.dom_kind()}", {self.dom()})' - - def __repr__(self) -> str: - return str(self) - - -_normalize_prim_func = get_global_func("tir.schedule.NormalizePrimFunc") - - -def normalize_prim_func(sch: tir.Schedule) -> Optional[List[BlockInfo]]: - """Normalize the primfunc to normal form""" - try: - result = _normalize_prim_func(sch) - if result is None: - return None - except Exception: # pylint: disable=broad-except - return None - - def _iter_kind(i: tir.IterVar) -> str: - return { - tir.IterVar.DataPar: "S", - tir.IterVar.CommReduce: "R", - }.get(i.iter_type, "O") - - blocks: List[BlockInfo] = [] - for block, loops, iters, is_reduction in zip(*result): - blocks.append(BlockInfo(sch, block)) - return blocks - - -# BufferIndex Types -Index = namedtuple("Index", ["sub"]) # c -RemIndex = namedtuple("RemIndex", ["sub", "div"]) # c%len -DivIndex = namedtuple("DivIndex", ["sub", "div"]) # c//len -MergeIndex = namedtuple("MulIndex", ["dom", "mul", "sub"]) # co*len + cb -BufIndex = List[Union[Index, RemIndex, DivIndex, MergeIndex, None]] - - -# TODO: Possibly move it under BufferInfo's init -def get_buffer_info( - sch: tir.Schedule, - blk: tir.schedule.BlockRV, - buf: tir.BufferRegion, - # TODO: Handle Case where there's None - lps: Dict[tir.Var, tir.schedule.LoopRV], -) -> BufferInfo: - return BufferInfo(sch, blk, buf, lps) - - -# TODO: Possibly Move it under BlockInfo init... 
-def get_block_info(sch: tir.Schedule, blk: tir.schedule.BlockRV) -> BlockInfo: - return BlockInfo(sch, blk) - - -def _assert_gpu_target(target: Target): - if "gpu" not in target.keys: - raise ValueError(f"Expect a GPU target, but got {target}") - - -def get_max_threads_per_block(target: Target) -> int: - _assert_gpu_target(target) - max_threads_per_block = None - for name in ["max_threads_per_block", "max_num_threads"]: - if max_threads_per_block is None: - max_threads_per_block = target.attrs.get(name, None) - if max_threads_per_block is None: - max_threads_per_block = 64 - return int(max_threads_per_block) - - -def get_max_shared_memory_per_block(target: Target) -> int: - _assert_gpu_target(target) - max_shared_memory_per_block = target.attrs.get("max_shared_memory_per_block", None) - if max_shared_memory_per_block is None: - raise ValueError( - f"Cannot find `max_shared_memory_per_block` in {target}, please specify it manually" - ) - return int(max_shared_memory_per_block) - - -def get_root_block(sch: Schedule, func_name: str = "main") -> BlockRV: - try: - block = sch.mod[func_name].body.block - except: - raise ValueError( - f"The function body is expected to be the root block, but got:\n" - f"{sch.mod[func_name].body}" - ) - return sch.get_block(block.name_hint) - - -def collect_block_iter_vars_used_in_access_region( - block: tir.Block, region: List[ir.Range] -) -> Set[tir.Var]: - """Collect the block iter variables used in the access region of a buffer region.""" - tir_vars = set() - for expr in region: - assert expr.extent == 1 - tir_vars |= collect_vars_used_in_prim_expr(expr.min) - tir_vars &= set(iter_var.var for iter_var in block.iter_vars) - return tir_vars - - -def collect_vars_used_in_prim_expr(expr: tir.PrimExpr) -> Set[tir.Var]: - """Collect the variables used in the PrimExpr.""" - tir_vars = set() - - def _collect_tir_var(expr): - if isinstance(expr, tir.Var): - tir_vars.add(expr) - - tir.stmt_functor.post_order_visit(expr, _collect_tir_var) - return tir_vars - - -def detect_dominant_read(block: tir.Block) -> tir.PrimExpr: - """Detect the dominant read indices in the block.""" - dominant_read = None - num_read_iters = -1 - for buffer_region in block.reads: - tir_vars = collect_block_iter_vars_used_in_access_region(block, buffer_region.region) - if num_read_iters < len(tir_vars): - num_read_iters = len(tir_vars) - dominant_read = buffer_region - assert dominant_read is not None - (result,) = dominant_read.buffer.offset_of([e.min for e in dominant_read.region]) - return result - - -def is_broadcast_epilogue( - sch: tir.Schedule, - block: tir.schedule.BlockRV, - epilogue: tir.schedule.BlockRV, -) -> bool: - """Check if the epilogue block is a broadcast pattern""" - write_buffers = {r.buffer for r in sch.get(block).writes} - epilogue_iters = {i.var: i for i in sch.get(epilogue).iter_vars if i.dom != 1} - for buffer_region in sch.get(epilogue).reads: - if buffer_region.buffer not in write_buffers: - continue - tir_vars = collect_block_iter_vars_used_in_access_region( - sch.get(epilogue), buffer_region.region - ) - if len(tir_vars) < len(epilogue_iters): - return True - return False diff --git a/python/tvm/dlight/base/common_schedules.py b/python/tvm/dlight/base/common_schedules.py index fe005cec5d70..c205b78390bc 100644 --- a/python/tvm/dlight/base/common_schedules.py +++ b/python/tvm/dlight/base/common_schedules.py @@ -19,7 +19,7 @@ from tvm import tir -from .analysis import BlockInfo +from ..analysis import BlockInfo def try_inline( diff --git a/python/tvm/ir/global_info.py 
b/python/tvm/ir/global_info.py index 36811c301044..597ff68dc1d8 100644 --- a/python/tvm/ir/global_info.py +++ b/python/tvm/ir/global_info.py @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. """Global Info.""" - import tvm import tvm_ffi from tvm.runtime.object import Object From 8f48187e655790d54e0870500e904ab32efadacc Mon Sep 17 00:00:00 2001 From: Siva Date: Fri, 28 Nov 2025 14:22:50 +0530 Subject: [PATCH 03/17] Remove unused header --- python/tvm/dlight/adreno/convolution.py | 2 +- python/tvm/dlight/adreno/fallback.py | 6 +++--- python/tvm/dlight/adreno/layout_transform.py | 4 ++-- python/tvm/dlight/adreno/pool.py | 2 +- python/tvm/dlight/adreno/utils.py | 2 +- python/tvm/dlight/analysis/common_analysis.py | 8 ++++---- src/relax/backend/vm/codegen_vm.cc | 1 - src/relax/backend/vm/codegen_vm_tir.cc | 1 - src/relax/op/op.cc | 2 -- src/relax/transform/lower_alloc_tensor.cc | 2 -- src/relax/transform/static_plan_block_memory.cc | 1 - src/runtime/vm/executable.cc | 4 +--- 12 files changed, 13 insertions(+), 22 deletions(-) diff --git a/python/tvm/dlight/adreno/convolution.py b/python/tvm/dlight/adreno/convolution.py index 830e1924c446..a72e78e26077 100644 --- a/python/tvm/dlight/adreno/convolution.py +++ b/python/tvm/dlight/adreno/convolution.py @@ -22,7 +22,7 @@ from tvm.target import Target from .utils import schedule_inline_blocks, schedule_storage_annotate, schedule_default -from ..base import analysis +from .. import analysis from .base import AdrenoScheduleRule diff --git a/python/tvm/dlight/adreno/fallback.py b/python/tvm/dlight/adreno/fallback.py index cec05cbf22a1..398bf8ad8977 100644 --- a/python/tvm/dlight/adreno/fallback.py +++ b/python/tvm/dlight/adreno/fallback.py @@ -21,7 +21,7 @@ from tvm import tir from tvm.target import Target -from ..base import analysis +from .. import analysis from .base import AdrenoScheduleRule from .utils import get_texture_storage @@ -45,7 +45,7 @@ def schedule_inline_blocks( remaining_blocks = [] for blk in blocks: block_info = analysis.get_block_info(sch, blk) - if block_info.is_injective() and not block_info.is_data_pad(): + if block_info.is_injective() and not block_info.is_data_pad(sch): if len(block_info.consumers) == 1: try: sch.compute_inline(blk) @@ -139,7 +139,7 @@ def schedule_fallback(sch): blk for blk in blocks if analysis.get_block_info(sch, blk).is_reduction() - or analysis.get_block_info(sch, blk).is_data_pad() + or analysis.get_block_info(sch, blk).is_data_pad(sch) ] remaining_blocks = [blk for blk in blocks if blk not in schedule_blocks] diff --git a/python/tvm/dlight/adreno/layout_transform.py b/python/tvm/dlight/adreno/layout_transform.py index c34bd8324aaf..201fa663a567 100644 --- a/python/tvm/dlight/adreno/layout_transform.py +++ b/python/tvm/dlight/adreno/layout_transform.py @@ -22,7 +22,7 @@ from tvm import tir from tvm.target import Target -from ..base import analysis +from .. 
import analysis from .base import AdrenoScheduleRule @@ -61,7 +61,7 @@ def apply( # pylint: disable=too-many-locals block_info = analysis.get_block_info(sch, blk) if not ( (self.use_op_name and block_info.name == "te_layout_transform") - or (not self.use_op_name and block_info.is_layout_transform()) + or (not self.use_op_name and block_info.is_layout_transform(sch)) ): return None diff --git a/python/tvm/dlight/adreno/pool.py b/python/tvm/dlight/adreno/pool.py index 20b829fa8929..72fdb4012b49 100644 --- a/python/tvm/dlight/adreno/pool.py +++ b/python/tvm/dlight/adreno/pool.py @@ -21,7 +21,7 @@ from tvm.target import Target from .base import AdrenoScheduleRule -from ..base import analysis +from .. import analysis # pylint: disable=invalid-name, unused-variable diff --git a/python/tvm/dlight/adreno/utils.py b/python/tvm/dlight/adreno/utils.py index 0d4f0abe7d20..3148de029e45 100644 --- a/python/tvm/dlight/adreno/utils.py +++ b/python/tvm/dlight/adreno/utils.py @@ -22,7 +22,7 @@ from tvm.target import Target from tvm import tir -from ..base.analysis import BlockInfo +from ..analysis import BlockInfo def get_texture_storage(block_info: BlockInfo): diff --git a/python/tvm/dlight/analysis/common_analysis.py b/python/tvm/dlight/analysis/common_analysis.py index 971166e6e017..49b028a31254 100644 --- a/python/tvm/dlight/analysis/common_analysis.py +++ b/python/tvm/dlight/analysis/common_analysis.py @@ -16,9 +16,10 @@ # under the License. # pylint: disable=missing-function-docstring, missing-class-docstring +# pylint: disable=unused-argument, unused-variable """Analysis on TIR blocks, loops and functions.""" from collections import namedtuple -from typing import List, Optional, Set, Union, Dict, Tuple +from typing import List, Optional, Set, Union, Tuple from typing_extensions import Literal from tvm_ffi import get_global_func @@ -174,7 +175,6 @@ class BlockInfo: read_bufs: List[BufferInfo] write_bufs: List[BufferInfo] - def __init__( self, name: str, @@ -238,7 +238,7 @@ def is_layout_transform(self, sch: tir.Schedule) -> bool: all(k == "S" for k in self.dom_kind()) and len(write_bufs) == 1 and len(read_bufs) == 1 - and not self.is_elementwise() + and not self.is_elementwise(sch) and not get_global_func("tir.schedule.HasIfThenElse")(sch.get(self.block_rv)) ) @@ -252,7 +252,7 @@ def is_data_pad(self, sch: tir.Schedule) -> bool: all(k == "S" for k in self.dom_kind()) and len(write_bufs) == 1 and len(read_bufs) == 1 - and not self.is_elementwise() + and not self.is_elementwise(sch) and len(self.write_bufs[0].buf_region.region) == len(self.read_bufs[0].buf_region.region) and get_global_func("tir.schedule.HasIfThenElse")(sch.get(self.block_rv)) diff --git a/src/relax/backend/vm/codegen_vm.cc b/src/relax/backend/vm/codegen_vm.cc index 221f3b697883..34cc32978971 100644 --- a/src/relax/backend/vm/codegen_vm.cc +++ b/src/relax/backend/vm/codegen_vm.cc @@ -29,7 +29,6 @@ #include #include -#include #include #include #include diff --git a/src/relax/backend/vm/codegen_vm_tir.cc b/src/relax/backend/vm/codegen_vm_tir.cc index 8f061f62f11b..c783a9da3d2a 100644 --- a/src/relax/backend/vm/codegen_vm_tir.cc +++ b/src/relax/backend/vm/codegen_vm_tir.cc @@ -34,7 +34,6 @@ #include #include -#include #include #include #include diff --git a/src/relax/op/op.cc b/src/relax/op/op.cc index 5d151fe2af87..3acfb53b2784 100644 --- a/src/relax/op/op.cc +++ b/src/relax/op/op.cc @@ -23,8 +23,6 @@ #include #include -#include - #include "../transform/utils.h" #include "op_common.h" diff --git a/src/relax/transform/lower_alloc_tensor.cc 
b/src/relax/transform/lower_alloc_tensor.cc index 74ceb6c6ebc4..d63cd15744bd 100644 --- a/src/relax/transform/lower_alloc_tensor.cc +++ b/src/relax/transform/lower_alloc_tensor.cc @@ -24,8 +24,6 @@ #include #include -#include - #include "utils.h" namespace tvm { diff --git a/src/relax/transform/static_plan_block_memory.cc b/src/relax/transform/static_plan_block_memory.cc index ab7418a65aa3..1db15c8517c5 100644 --- a/src/relax/transform/static_plan_block_memory.cc +++ b/src/relax/transform/static_plan_block_memory.cc @@ -73,7 +73,6 @@ #include #include -#include #include #include #include diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc index abad6363839c..40edbc14c433 100644 --- a/src/runtime/vm/executable.cc +++ b/src/runtime/vm/executable.cc @@ -250,9 +250,7 @@ void VMExecutable::SaveConstantSection(dmlc::Stream* strm) const { for (const auto& it : this->constants) { if (auto opt_nd = it.as()) { strm->Write(ffi::TypeIndex::kTVMFFITensor); - // runtime::SaveDLTensor(strm, opt_nd.value().operator->()); - // runtime::Tensor param = opt_nd.value().operator->(); - opt_nd.value().Save(strm); + runtime::SaveDLTensor(strm, opt_nd.value().operator->()); } else if (auto opt_shape = it.as()) { ffi::Shape shape = opt_shape.value(); strm->Write(ffi::TypeIndex::kTVMFFIShape); From 615f98f9d4aec0bbca1959f7394cba5d9f46249d Mon Sep 17 00:00:00 2001 From: Siva Date: Fri, 28 Nov 2025 23:02:16 +0530 Subject: [PATCH 04/17] Update python/tvm/dlight/adreno/pool.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- python/tvm/dlight/adreno/pool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/dlight/adreno/pool.py b/python/tvm/dlight/adreno/pool.py index 72fdb4012b49..61c6ca33f5a2 100644 --- a/python/tvm/dlight/adreno/pool.py +++ b/python/tvm/dlight/adreno/pool.py @@ -87,6 +87,6 @@ def schedule_max_pool(blk: tir.schedule.BlockRV): sch.reverse_compute_inline(blk) else: sch.compute_inline(blk) - except: # pylint: disable=bare-except + except Exception: # pylint: disable=broad-except-bare pass return sch From 9e8bcf7e8db55e35beaa31cfb39c47704f25d4d3 Mon Sep 17 00:00:00 2001 From: Siva Date: Sat, 29 Nov 2025 09:09:09 +0530 Subject: [PATCH 05/17] Various review comments --- include/tvm/runtime/tensor.h | 2 +- include/tvm/tir/transform.h | 2 +- python/tvm/dlight/adreno/__init__.py | 2 -- python/tvm/dlight/adreno/convolution.py | 2 +- python/tvm/dlight/adreno/fallback.py | 15 --------------- python/tvm/dlight/adreno/layout_transform.py | 3 --- python/tvm/tir/transform/transform.py | 2 +- src/relax/op/op_common.h | 1 + src/relax/transform/static_plan_block_memory.cc | 6 ++++-- .../test_transform_annotate_custom_scope.py | 1 - 10 files changed, 9 insertions(+), 27 deletions(-) diff --git a/include/tvm/runtime/tensor.h b/include/tvm/runtime/tensor.h index ab15efc41ced..f55bda07b04d 100644 --- a/include/tvm/runtime/tensor.h +++ b/include/tvm/runtime/tensor.h @@ -196,7 +196,7 @@ class Tensor : public tvm::ffi::Tensor { protected: /*! * \brief The memory scope - * represents the underlaying scope information of device + * represents the underlying scope information of device */ ffi::String scope = "global"; }; diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h index b825d78cc0b0..60e6b9af8b88 100644 --- a/include/tvm/tir/transform.h +++ b/include/tvm/tir/transform.h @@ -774,7 +774,7 @@ TVM_DLL Pass DefaultGPUSchedule(); TVM_DLL Pass UseAssumeToReduceBranches(); /*! 
- * \brief Inject Texture Allocation intrensic. + * \brief Inject Texture Allocation intrinsic. * \return The pass. */ TVM_DLL Pass InjectTextureAlloc(); diff --git a/python/tvm/dlight/adreno/__init__.py b/python/tvm/dlight/adreno/__init__.py index d77f41e3f55c..87afa08818b2 100644 --- a/python/tvm/dlight/adreno/__init__.py +++ b/python/tvm/dlight/adreno/__init__.py @@ -21,5 +21,3 @@ from .layout_transform import LayoutTransform from .fallback import Fallback from .pool import Pool2D - -# from .fallback import Fallback diff --git a/python/tvm/dlight/adreno/convolution.py b/python/tvm/dlight/adreno/convolution.py index a72e78e26077..1ed00ac86298 100644 --- a/python/tvm/dlight/adreno/convolution.py +++ b/python/tvm/dlight/adreno/convolution.py @@ -68,7 +68,7 @@ def schedule_conv2d(sch: tir.Schedule, blk: tir.schedule.BlockRV): def apply( # pylint: disable=too-many-locals,missing-docstring self, - func: Union[tir.PrimFunc], + func: Union[tir.PrimFunc, tir.Schedule], target: Target, _: bool, ) -> Optional[tir.Schedule]: diff --git a/python/tvm/dlight/adreno/fallback.py b/python/tvm/dlight/adreno/fallback.py index 398bf8ad8977..02c7f33925bf 100644 --- a/python/tvm/dlight/adreno/fallback.py +++ b/python/tvm/dlight/adreno/fallback.py @@ -71,20 +71,6 @@ def schedule_inline_blocks( remaining_blocks.append(blk) return remaining_blocks - @staticmethod - def schedule_annotate_storage(sch: tir.Schedule, func=get_texture_storage): - """Annotates intermediate buffers to textures whenever it's possible to do so""" - return - # pylint: disable=unreachable - root_blk = analysis.get_root_block(sch) - blocks = sch.get_child_blocks(root_blk) - - for blk in blocks: - block_info = analysis.get_block_info(sch, blk) - scope = func(block_info) - if scope is not None and len(sch.get_consumers(blk)) > 0: - sch.set_scope(blk, 0, scope) - @staticmethod def schedule_default(sch: tir.Schedule, blk: tir.schedule.BlockRV): block_info = analysis.get_block_info(sch, blk) @@ -149,7 +135,6 @@ def schedule_fallback(sch): # TODO: Analyze unscheduled blocks to schedule instead of relying on remaining for blk in remaining_blocks: Fallback.schedule_default(sch, blk) - Fallback.schedule_annotate_storage(sch, schedule_blocks + remaining_blocks) def apply( # pylint: disable=too-many-locals self, diff --git a/python/tvm/dlight/adreno/layout_transform.py b/python/tvm/dlight/adreno/layout_transform.py index 201fa663a567..138047b6bdfd 100644 --- a/python/tvm/dlight/adreno/layout_transform.py +++ b/python/tvm/dlight/adreno/layout_transform.py @@ -124,9 +124,6 @@ def apply( # pylint: disable=too-many-locals b = sch.fuse(*block_loops) tx_extent = min(sch.get(b).extent, 256) candidates = [1, 2, 4, 8, 16, 32] - ux = sch.sample_categorical( - candidates, [1 / len(candidates) for _ in range(len(candidates))] - ) bx, tx = sch.split(b, [None, 256], preserve_unit_iters=True) sch.bind(bx, "blockIdx.x") sch.bind(tx, "threadIdx.x") diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py index 0d75a0ea8a28..37ffbf12d3aa 100644 --- a/python/tvm/tir/transform/transform.py +++ b/python/tvm/tir/transform/transform.py @@ -894,7 +894,7 @@ def LowerOpaqueBlock(): def InjectTextureAlloc(): - """Inject Texture Allocation Intrensic to make sure appropriate lowering + """Inject Texture Allocation Intrinsic to make sure appropriate lowering via alloc_nd/alloc_free calls Returns ------- diff --git a/src/relax/op/op_common.h b/src/relax/op/op_common.h index bc958cec6791..c77d7a07823f 100644 --- a/src/relax/op/op_common.h +++ 
b/src/relax/op/op_common.h @@ -215,6 +215,7 @@ inline StructInfo InferStructInfoUnary(const Call& call, const BlockBuilder& ctx auto defined_sinfo = call->sinfo_args[0].as(); auto shape = output_sinfo->GetShape(); ICHECK(shape.defined()); + ICHECK(defined_sinfo->vdevice.has_value()); return TensorStructInfo(ShapeExpr(shape.value()), output_sinfo->dtype, defined_sinfo->vdevice.value()); } else { diff --git a/src/relax/transform/static_plan_block_memory.cc b/src/relax/transform/static_plan_block_memory.cc index 1db15c8517c5..cebbaa4ce5ac 100644 --- a/src/relax/transform/static_plan_block_memory.cc +++ b/src/relax/transform/static_plan_block_memory.cc @@ -1038,10 +1038,12 @@ PrimExpr GetTextureMemorySizeFromVDevice(ffi::Array pshape, DataType d .value_or(Integer(64)) ->value; - // TODO(Siva) Assuming no any dimensions for now. struct Shape { const ffi::Array& shape; - int64_t operator[](size_t i) const { return *tir::as_const_int(shape[i]); } + int64_t operator[](size_t i) const { + ICHECK(tir::as_const_int(shape[i])) << "Dymamic shapes not suported over texture now"; + return *tir::as_const_int(shape[i]); + } int size() { return this->shape.size(); } }; auto shape = Shape{pshape}; diff --git a/tests/python/relax/adreno/test_transform_annotate_custom_scope.py b/tests/python/relax/adreno/test_transform_annotate_custom_scope.py index 91e12034843d..24b4cf66b888 100644 --- a/tests/python/relax/adreno/test_transform_annotate_custom_scope.py +++ b/tests/python/relax/adreno/test_transform_annotate_custom_scope.py @@ -106,7 +106,6 @@ def verify(mod, expected): mod = tvm.relax.transform.SpecializePrimFuncBasedOnCallSite()(mod) mod = tvm.relax.transform.Normalize()(mod) - print(mod) ValidateScope(expected).visit(mod) From aa9f24bf5e8e4be2580269d005d81c23a6486cae Mon Sep 17 00:00:00 2001 From: Siva Date: Wed, 3 Dec 2025 23:53:14 +0530 Subject: [PATCH 06/17] Scope handling w/o changes to Tensor --- cmake/modules/OpenCL.cmake | 3 + include/tvm/relax/exec_builder.h | 9 + include/tvm/runtime/tensor.h | 32 +--- include/tvm/runtime/vm/executable.h | 12 ++ python/tvm/dlight/adreno/convolution.py | 3 +- python/tvm/dlight/adreno/fallback.py | 28 +++- python/tvm/dlight/adreno/layout_transform.py | 2 +- python/tvm/dlight/adreno/utils.py | 7 - python/tvm/dlight/analysis/common_analysis.py | 47 +++--- python/tvm/script/ir_builder/relax/ir.py | 25 +++ src/relax/backend/vm/codegen_vm.cc | 8 +- src/relax/backend/vm/exec_builder.cc | 4 + src/runtime/opencl/opencl_module.cc | 92 ++++++++++- src/runtime/tensor.cc | 17 +- src/runtime/vm/executable.cc | 45 ++++- src/runtime/vm/vm.cc | 24 +-- .../test_transform_annotate_custom_scope.py | 156 +++++++++--------- tests/python/relax/test_transform.py | 14 +- .../test_transform_legalize_ops_manipulate.py | 8 +- tests/python/relax/texture/test_network.py | 4 +- tests/python/relax/texture/test_ops.py | 2 +- tests/scripts/setup-adreno-env.sh | 2 +- tests/scripts/task_build_adreno_bins.sh | 1 + 23 files changed, 357 insertions(+), 188 deletions(-) diff --git a/cmake/modules/OpenCL.cmake b/cmake/modules/OpenCL.cmake index c5c8eae721fa..b70db4dc7bf2 100644 --- a/cmake/modules/OpenCL.cmake +++ b/cmake/modules/OpenCL.cmake @@ -80,6 +80,9 @@ if(USE_OPENCL) message(STATUS "Set OpenCL Target version to " ${CMAKE_MATCH_1}) endif() endif(USE_OPENCL_EXTN_QCOM) + if(PROFILE_SHADER_DUMP) + add_definitions(-DPROFILE_SHADER_DUMP) + endif(PROFILE_SHADER_DUMP) else() list(APPEND COMPILER_SRCS src/target/opt/build_opencl_off.cc) endif(USE_OPENCL) diff --git a/include/tvm/relax/exec_builder.h 
b/include/tvm/relax/exec_builder.h index 4fd0fd66bb90..5353f0a73569 100644 --- a/include/tvm/relax/exec_builder.h +++ b/include/tvm/relax/exec_builder.h @@ -122,6 +122,15 @@ class ExecBuilderNode : public Object { rv = value; return ConvertConstant_(rv); } + /*! + * \brief update memory scopes. + * + * This function builds the memory scopes for constants. + * + * \param Index of the constant + * \param The memory scope. + */ + void SaveMemoryScope(vm::Instruction::Arg idx, ffi::String scope); /*! * \brief Raw access to underlying executable build in progress. */ diff --git a/include/tvm/runtime/tensor.h b/include/tvm/runtime/tensor.h index f55bda07b04d..615cfd8cccfe 100644 --- a/include/tvm/runtime/tensor.h +++ b/include/tvm/runtime/tensor.h @@ -36,7 +36,6 @@ #include #include -#include #include #include @@ -189,25 +188,14 @@ class Tensor : public tvm::ffi::Tensor { */ TVM_DLL static void CopyFromBytes(const DLTensor* to, void* from, size_t nbytes, TVMStreamHandle stream = nullptr); - - TVM_DLL void SetScope(ffi::String scope); - TVM_DLL ffi::String GetScope() const; - - protected: - /*! - * \brief The memory scope - * represents the underlying scope information of device - */ - ffi::String scope = "global"; }; /*! * \brief Save a DLTensor to stream * \param strm The output stream * \param tensor The tensor to be saved. - * \param scope The tensor storage scope. */ -inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor, ffi::String scope = "global"); +inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor); inline void Tensor::CopyFrom(const DLTensor* other) { ICHECK(data_ != nullptr); @@ -232,11 +220,10 @@ inline void Tensor::CopyTo(const Tensor& other) const { } /*! \brief Magic number for Tensor file */ -constexpr uint64_t kTVMNDArrayMagic = 0xDD5E40F096B4A13F; -constexpr uint64_t kTVMNDArrayScopedMagic = 0xDD5E40F096B4A13E; +constexpr uint64_t kTVMTensorMagic = 0xDD5E40F096B4A13F; -inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor, ffi::String scope) { - uint64_t header = kTVMNDArrayScopedMagic, reserved = 0; +inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor) { + uint64_t header = kTVMTensorMagic, reserved = 0; strm->Write(header); strm->Write(reserved); // Always save data as CPU context @@ -256,7 +243,6 @@ inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor, ffi::String strm->Write(tensor->dtype); int ndim = tensor->ndim; strm->WriteArray(tensor->shape, ndim); - strm->Write(std::string(scope)); int type_bytes = (tensor->dtype.bits + 7) / 8; int64_t num_elems = 1; for (int i = 0; i < ndim; ++i) { @@ -280,14 +266,13 @@ inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor, ffi::String return true; } -inline void Tensor::Save(dmlc::Stream* strm) const { SaveDLTensor(strm, operator->(), GetScope()); } +inline void Tensor::Save(dmlc::Stream* strm) const { SaveDLTensor(strm, operator->()); } inline bool Tensor::Load(dmlc::Stream* strm) { uint64_t header, reserved; ICHECK(strm->Read(&header)) << "Invalid DLTensor file format"; ICHECK(strm->Read(&reserved)) << "Invalid DLTensor file format"; - ICHECK((header == kTVMNDArrayMagic) || (header == kTVMNDArrayScopedMagic)) - << "Invalid DLTensor file format"; + ICHECK(header == kTVMTensorMagic) << "Invalid DLTensor file format"; Device dev; int ndim; DLDataType dtype; @@ -305,11 +290,6 @@ inline bool Tensor::Load(dmlc::Stream* strm) { for (int i = 0; i < ret->ndim; ++i) { num_elems *= ret->shape[i]; } - if (header == kTVMNDArrayScopedMagic) { - 
std::string scope; - strm->Read(&scope); - ret.SetScope(scope); - } int64_t data_byte_size; ICHECK(strm->Read(&data_byte_size)) << "Invalid DLTensor file format"; ICHECK(data_byte_size == num_elems * elem_bytes) << "Invalid DLTensor file format"; diff --git a/include/tvm/runtime/vm/executable.h b/include/tvm/runtime/vm/executable.h index 37488ff31f52..7e2380ddf691 100644 --- a/include/tvm/runtime/vm/executable.h +++ b/include/tvm/runtime/vm/executable.h @@ -155,6 +155,8 @@ class VMExecutable : public ffi::ModuleObj { std::unordered_map func_map; /*! \brief The global constant pool. */ std::vector constants; + /*! \brief The VDevice memory scopes */ + std::unordered_map memory_scopes; /*! \brief The offset of instruction. */ std::vector instr_offset; /*! \brief The byte data of instruction. */ @@ -177,6 +179,11 @@ class VMExecutable : public ffi::ModuleObj { * \param strm The input stream. */ void SaveGlobalSection(dmlc::Stream* strm) const; + /*! + * \brief Save the memory scopes. + * \param strm The output stream. + */ + void SaveMemoryScopeSection(dmlc::Stream* strm) const; /*! * \brief Save the constant pool. * \param strm The input stream. @@ -197,6 +204,11 @@ class VMExecutable : public ffi::ModuleObj { * \param strm The input stream. */ void LoadGlobalSection(dmlc::Stream* strm); + /*! + * \brief Load the memory scopes. + * \param strm The input stream. + */ + void LoadMemoryScopeSection(dmlc::Stream* strm); /*! * \brief Load the constant pool. * \param strm The input stream. diff --git a/python/tvm/dlight/adreno/convolution.py b/python/tvm/dlight/adreno/convolution.py index 1ed00ac86298..fec465cc484f 100644 --- a/python/tvm/dlight/adreno/convolution.py +++ b/python/tvm/dlight/adreno/convolution.py @@ -21,7 +21,7 @@ from tvm import tir from tvm.target import Target -from .utils import schedule_inline_blocks, schedule_storage_annotate, schedule_default +from .utils import schedule_inline_blocks, schedule_default from .. 
import analysis from .base import AdrenoScheduleRule @@ -102,6 +102,5 @@ def is_convolution(blk): Conv2d.schedule_conv2d(sch, conv_blk) remaining_blocks = schedule_inline_blocks(sch, remaining_blocks) schedule_default(sch, remaining_blocks) - schedule_storage_annotate(sch, remaining_blocks) return sch diff --git a/python/tvm/dlight/adreno/fallback.py b/python/tvm/dlight/adreno/fallback.py index 02c7f33925bf..ecb42fe62abc 100644 --- a/python/tvm/dlight/adreno/fallback.py +++ b/python/tvm/dlight/adreno/fallback.py @@ -26,6 +26,22 @@ from .utils import get_texture_storage +def _assert_gpu_target(target: Target): + if "gpu" not in target.keys: + raise ValueError(f"Expect a GPU target, but got {target}") + + +def get_max_threads_per_block(target: Target) -> int: + _assert_gpu_target(target) + max_threads_per_block = None + for name in ["max_threads_per_block", "max_num_threads"]: + if max_threads_per_block is None: + max_threads_per_block = target.attrs.get(name, None) + if max_threads_per_block is None: + max_threads_per_block = 64 + return int(max_threads_per_block) + + # pylint: disable=invalid-name,missing-function-docstring,unused-variable,unused-import class Fallback(AdrenoScheduleRule): """Texture Based Fallback Schedule(s) for Adreno""" @@ -46,12 +62,12 @@ def schedule_inline_blocks( for blk in blocks: block_info = analysis.get_block_info(sch, blk) if block_info.is_injective() and not block_info.is_data_pad(sch): - if len(block_info.consumers) == 1: + if len(sch.get_consumers(blk)) == 1: try: sch.compute_inline(blk) except Exception: # pylint: disable=broad-exception-caught remaining_blocks.append(blk) - elif len(block_info.producers) == 1: + elif len(sch.get_producers(blk)) == 1: inlined_once = False try: # Would cause an issue inlining to producer with multiple consumers @@ -76,7 +92,7 @@ def schedule_default(sch: tir.Schedule, blk: tir.schedule.BlockRV): block_info = analysis.get_block_info(sch, blk) s_loops, r_loops, o_loops = [], [], [] - v_loop = block_info.write_bufs[0].assoc_lps[-1] + v_loop = block_info.write_bufs(sch)[0].assoc_lps[-1] for iter_info in block_info.iters: if sch.get(iter_info.loop_rv) == sch.get(v_loop): @@ -84,7 +100,7 @@ def schedule_default(sch: tir.Schedule, blk: tir.schedule.BlockRV): {"S": s_loops, "R": r_loops, "O": o_loops}.get(iter_info.kind).append(iter_info.loop_rv) iter_vars = analysis.collect_block_iter_vars_used_in_access_region( - block_info.block_stmt, block_info.write_bufs[0].buf_region.region + sch.get(blk), block_info.write_bufs(sch)[0].buf_region.region ) o_outer = [lp for lp in o_loops if sch.get(lp).var in iter_vars] o_inner = [lp for lp in o_loops if sch.get(lp).var not in iter_vars] @@ -100,7 +116,7 @@ def schedule_default(sch: tir.Schedule, blk: tir.schedule.BlockRV): tgt = Target.current(allow_none=True) b = sch.fuse(*s_loops) - tx_extent = analysis.get_max_threads_per_block(tgt) if tgt is not None else 256 + tx_extent = get_max_threads_per_block(tgt) if tgt is not None else 256 bx, tx = sch.split(b, [None, tx_extent]) sch.bind(bx, "blockIdx.x") sch.bind(tx, "threadIdx.x") @@ -155,7 +171,7 @@ def apply( # pylint: disable=too-many-locals return None block_infos = [analysis.get_block_info(sch, block) for block in blocks] - if not any("texture" in block.write_bufs[0].get_scope() for block in block_infos): + if not any("texture" in block.write_bufs(sch)[0].get_scope() for block in block_infos): return None Fallback.schedule_fallback(sch) diff --git a/python/tvm/dlight/adreno/layout_transform.py b/python/tvm/dlight/adreno/layout_transform.py 
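Note on the new get_max_threads_per_block helper above: it only consults the target's own attributes, falling back to 64 when neither attribute is present. A small sketch of that lookup using the public Target API, with the same ordering as the helper; the concrete attribute values depend on the target definition and are not asserted here:

    import tvm

    tgt = tvm.target.Target("opencl -device=adreno")
    threads = None
    for name in ["max_threads_per_block", "max_num_threads"]:
        if threads is None:
            threads = tgt.attrs.get(name, None)
    # Mirror the helper's default when the target carries neither attribute.
    print(int(threads) if threads is not None else 64)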
index 138047b6bdfd..8610d658d5f1 100644 --- a/python/tvm/dlight/adreno/layout_transform.py +++ b/python/tvm/dlight/adreno/layout_transform.py @@ -65,7 +65,7 @@ def apply( # pylint: disable=too-many-locals ): return None - read_buf, write_buf = (block_info.read_bufs[0], block_info.write_bufs[0]) + read_buf, write_buf = (block_info.read_bufs(sch)[0], block_info.write_bufs(sch)[0]) lps = block_info.get_loops() lpv_read, lpv_write = ( read_buf.assoc_lps[-1], diff --git a/python/tvm/dlight/adreno/utils.py b/python/tvm/dlight/adreno/utils.py index 3148de029e45..73597fe8578c 100644 --- a/python/tvm/dlight/adreno/utils.py +++ b/python/tvm/dlight/adreno/utils.py @@ -83,13 +83,6 @@ def schedule_default(sch, blocks: List[tir.schedule.BlockRV] = None): return ret -def schedule_storage_annotate(sch: tir.Schedule, func=get_texture_storage): - # Check the Write Buffer isn't one of input Params and is Texturizable... - from .fallback import Fallback - - return Fallback.schedule_annotate_storage(sch) - - def schedule_fallback(sch, blk): from .fallback import Fallback diff --git a/python/tvm/dlight/analysis/common_analysis.py b/python/tvm/dlight/analysis/common_analysis.py index 49b028a31254..d19b1070e4fe 100644 --- a/python/tvm/dlight/analysis/common_analysis.py +++ b/python/tvm/dlight/analysis/common_analysis.py @@ -64,6 +64,12 @@ def __repr__(self) -> str: get_blockrealize = get_global_func("tir.schedule.GetBlockRealize") +# BufferIndex Types +Index = namedtuple("Index", ["sub"]) # c +RemIndex = namedtuple("RemIndex", ["sub", "div"]) # c%len +DivIndex = namedtuple("DivIndex", ["sub", "div"]) # c//len +MergeIndex = namedtuple("MulIndex", ["dom", "mul", "sub"]) # co*len + cb +BufIndex = List[Union[Index, RemIndex, DivIndex, MergeIndex, None]] # TODO: Shift Vlen Calculation here... 
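For readers skimming the analysis changes: the four BufIndex forms hoisted to module level classify how a (possibly blocked) buffer axis is addressed. A tiny plain-Python illustration, assuming a channel index c and a vector length of 4 as in the NCHW4c layouts used elsewhere in this series:

    c, length = 10, 4
    plain  = c                       # Index:      buf[c]
    inner  = c % length              # RemIndex:   buf[c % 4]  (vector lane)
    outer  = c // length             # DivIndex:   buf[c // 4] (blocked channel)
    merged = outer * length + inner  # MergeIndex: co * 4 + cb recombines both
    assert merged == c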
@@ -74,13 +80,6 @@ class BufferInfo: assoc_lps: List[Union[tir.schedule.LoopRV, None]] assoc_lps_info: List[Union[tir.For, None]] - # BufferIndex Types - Index = namedtuple("Index", ["sub"]) # c - RemIndex = namedtuple("RemIndex", ["sub", "div"]) # c%len - DivIndex = namedtuple("DivIndex", ["sub", "div"]) # c//len - MergeIndex = namedtuple("MulIndex", ["dom", "mul", "sub"]) # co*len + cb - BufIndex = List[Union[Index, RemIndex, DivIndex, MergeIndex, None]] - def __init__( self, sch: tir.Schedule, @@ -172,8 +171,6 @@ class BlockInfo: iters: List[IterInfo] block_rv: tir.schedule.BlockRV _reduction_block: bool - read_bufs: List[BufferInfo] - write_bufs: List[BufferInfo] def __init__( self, @@ -192,6 +189,16 @@ def dom(self) -> List[Union[int, tir.PrimExpr]]: """The iteration domain of the block.""" return [i.dom for i in self.iters] + def read_bufs(self, sch: tir.Schedule) -> List[BufferInfo]: + block_stmt = sch.get(self.block_rv) + lps = sch.get_loops(self.block_rv) + return [BufferInfo(sch, self.block_rv, buf, lps) for buf in block_stmt.reads] + + def write_bufs(self, sch: tir.Schedule) -> List[BufferInfo]: + block_stmt = sch.get(self.block_rv) + lps = sch.get_loops(self.block_rv) + return [BufferInfo(sch, self.block_rv, buf, lps) for buf in block_stmt.writes] + def dom_kind(self) -> str: """The iteration domain kind of the block, for example, SSSS, SSSR.""" return "".join(i.kind for i in self.iters) @@ -216,7 +223,7 @@ def _check_unit_var_range(dom: ir.Range, var: tir.Var) -> bool: if len(r_region) != len(w_region): return False for var, r_dom, w_dom in zip(block.iter_vars, r_region, w_region): - if not _check_unit_var_range(var, r_dom) or not _check_unit_var_range(var, w_dom): + if not _check_unit_var_range(r_dom, var) or not _check_unit_var_range(w_dom, var): return False return True @@ -230,31 +237,23 @@ def is_reduction(self) -> bool: def is_layout_transform(self, sch: tir.Schedule) -> bool: """Whether the Block can be considered having a Layout Transform Pattern""" - block_stmt = sch.get(self.block_rv) - lps = sch.get_loops(block_rv) - read_bufs = [BufferInfo(sch, self.block_rv, buf, lps) for buf in block_stmt.reads] - write_bufs = [BufferInfo(sch, self.block_rv, buf, lps) for buf in block_stmt.writes] return ( all(k == "S" for k in self.dom_kind()) - and len(write_bufs) == 1 - and len(read_bufs) == 1 + and len(self.write_bufs(sch)) == 1 + and len(self.read_bufs(sch)) == 1 and not self.is_elementwise(sch) and not get_global_func("tir.schedule.HasIfThenElse")(sch.get(self.block_rv)) ) def is_data_pad(self, sch: tir.Schedule) -> bool: """Whether the Block can be considered having a data pad pattern""" - block_stmt = sch.get(self.block_rv) - lps = sch.get_loops(block_rv) - read_bufs = [BufferInfo(sch, self.block_rv, buf, lps) for buf in block_stmt.reads] - write_bufs = [BufferInfo(sch, self.block_rv, buf, lps) for buf in block_stmt.writes] return ( all(k == "S" for k in self.dom_kind()) - and len(write_bufs) == 1 - and len(read_bufs) == 1 + and len(self.write_bufs(sch)) == 1 + and len(self.read_bufs(sch)) == 1 and not self.is_elementwise(sch) - and len(self.write_bufs[0].buf_region.region) - == len(self.read_bufs[0].buf_region.region) + and len(self.write_bufs(sch)[0].buf_region.region) + == len(self.read_bufs(sch)[0].buf_region.region) and get_global_func("tir.schedule.HasIfThenElse")(sch.get(self.block_rv)) ) diff --git a/python/tvm/script/ir_builder/relax/ir.py b/python/tvm/script/ir_builder/relax/ir.py index f221a1308965..094ec3379fe9 100644 --- 
a/python/tvm/script/ir_builder/relax/ir.py +++ b/python/tvm/script/ir_builder/relax/ir.py @@ -619,6 +619,30 @@ def emit_var_binding(value: VarBinding) -> Var: return _ffi_api.EmitVarBinding(value) # type: ignore +def emit_with_sinfo( + op: str, + args: Expr, + sinfo_args: Optional[Union[StructInfo, List[StructInfo]]] = None, +) -> Call: + """Create a relax Call with sinfo_args. + Parameters + ---------- + op: Expr + The relax op for which sinfo_args to be appended + args : Expr + The arguments. + sinfo_args: Optional[Union[StructInfo, List[StructInfo]]] + The list of structure info arguments. + + Returns + ------- + call: Call + The created Relax Call + """ + builtin_call = tvm.ir.Op.get(op) + return Call(builtin_call, args, attrs=None, sinfo_args=sinfo_args) + + ############################### SeqExpr ############################### @@ -822,6 +846,7 @@ def dtype(value: Union[py_str, DataType]) -> Expr: "emit_te", "emit_var_binding", "emit_match_cast", + "emit_with_sinfo", "equal", "ewise_fma", "exp", diff --git a/src/relax/backend/vm/codegen_vm.cc b/src/relax/backend/vm/codegen_vm.cc index 34cc32978971..04e9880b262a 100644 --- a/src/relax/backend/vm/codegen_vm.cc +++ b/src/relax/backend/vm/codegen_vm.cc @@ -215,15 +215,15 @@ class CodeGenVM : public ExprFunctor { } Instruction::Arg VisitExpr_(const ConstantNode* op) final { + auto arg = builder_->ConvertConstant(op->data); + if (auto tsinfo = op->struct_info_.as()) { if (tsinfo->vdevice.defined()) { VDevice vdev = tsinfo->vdevice.value(); - runtime::Tensor param = op->data; - param.SetScope(vdev->memory_scope); + builder_->SaveMemoryScope(arg, vdev->memory_scope); } } - - return builder_->ConvertConstant(op->data); + return arg; } Instruction::Arg VisitExpr_(const ShapeExprNode* op) final { diff --git a/src/relax/backend/vm/exec_builder.cc b/src/relax/backend/vm/exec_builder.cc index b893b48830ce..a2fef22609be 100644 --- a/src/relax/backend/vm/exec_builder.cc +++ b/src/relax/backend/vm/exec_builder.cc @@ -46,6 +46,10 @@ ObjectPtr ExecBuilderNode::Get() { return exec_; } +void ExecBuilderNode::SaveMemoryScope(vm::Instruction::Arg idx, ffi::String scope) { + exec_->memory_scopes[idx.value()] = scope; +} + vm::Instruction::Arg ExecBuilderNode::ConvertConstant_(Any cvalue) { // emit constant immediate as immediate. 
if (auto opt_int = cvalue.as()) { diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc index 8aa40515ecce..cff6587fb9ed 100644 --- a/src/runtime/opencl/opencl_module.cc +++ b/src/runtime/opencl/opencl_module.cc @@ -50,6 +50,89 @@ class OpenCLWrappedFunc { arg_size_ = arg_size; launch_param_config_.Init(arg_size.size(), launch_param_tags); } + +#ifdef PROFILE_SHADER_DUMP + void dump_trace(const ThreadWorkLoad& wl, int work_dim, const ffi::PackedArgs& args) const { + std::string dump_path(getenv("PROFILE_SHADER_DUMP_PATH")); + static int trace_count = 0; + + std::vector g_vec; + std::vector l_vec; + for (cl_uint i = 0; i < work_dim; ++i) { + g_vec.push_back(wl.work_size[i]); + l_vec.push_back(wl.work_size[i + 3]); + } + + std::ostringstream os; + dmlc::JSONWriter writer(&os); + writer.BeginObject(); + writer.WriteObjectKeyValue("api", func_name_); + writer.WriteObjectKeyValue("global", g_vec); + writer.WriteObjectKeyValue("local", l_vec); + + class DumpArg { + public: + DumpArg() = default; + + void Save(dmlc::JSONWriter* writer) const { + writer->BeginObject(); + writer->WriteObjectKeyValue("index", idx); + writer->WriteObjectKeyValue("scope", scope); + if (scope.find("texture") != std::string::npos) { + writer->WriteObjectKeyValue("width", width); + writer->WriteObjectKeyValue("height", height); + writer->WriteObjectKeyValue("depth", depth); + } else { + writer->WriteObjectKeyValue("size", size); + } + writer->WriteObjectKeyValue("dtype", dtype); + writer->EndObject(); + } + + uint32_t idx; + std::string scope; + uint32_t size; + uint32_t width; + uint32_t height; + uint32_t depth; + uint32_t dtype; + }; + + std::vector vargs; + + for (cl_uint i = 0; i < arg_size_.size(); ++i) { + DumpArg darg; + darg.idx = i; + if (args[i].as()) { + cl::BufferDescriptor* desc = static_cast(args[i].cast()); + auto mem_scope = cl::BufferDescriptor::ScopeFromMemoryLayout(desc->layout); + darg.scope = std::string(mem_scope); + if (desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D) { + darg.size = desc->mem_size; + } else { + darg.width = desc->width; + darg.height = desc->height; + darg.depth = desc->depth; + } + darg.dtype = desc->dtype.code; + } + vargs.push_back(darg); + } + + writer.WriteObjectKeyValue("args", vargs); + writer.EndObject(); + std::stringstream trace_file; + trace_file << std::setw(4) << std::setfill('0') << std::to_string(trace_count); + trace_file << "_" << func_name_ << ".json"; + if (dump_path == "cli") { + LOG(WARNING) << os.str(); + } else { + SaveBinaryToFile(dump_path + "/" + trace_file.str(), os.str()); + } + trace_count++; + } +#endif // PROFILE_SHADER_DUMP + // invoke the function with void arguments void operator()(ffi::PackedArgs args, ffi::Any* rv, void** void_args) const { ICHECK(w_->devices.size() > 0) << "No OpenCL device"; @@ -63,6 +146,13 @@ class OpenCLWrappedFunc { if (kernel == nullptr || e.version != entry_.version) { kernel = m_->InstallKernel(w_, t, func_name_, entry_); } + ThreadWorkLoad wl = launch_param_config_.Extract(args); + cl_uint work_dim = static_cast(launch_param_config_.work_dim()); +#ifdef PROFILE_SHADER_DUMP + if (getenv("PROFILE_SHADER_DUMP_PATH")) { + dump_trace(wl, work_dim, args); + } +#endif // PROFILE_SHADER_DUMP // setup arguments. 
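The dump_trace path above writes one JSON file per kernel launch, carrying the kernel name, global/local work sizes, and per-argument scope/size fields. A small consumer sketch, assuming PROFILE_SHADER_DUMP_PATH pointed at a directory of such files; only the field names written by the code above are relied on, the summary format itself is made up:

    import glob, json, os

    def summarize_traces(dump_dir):
        # One *.json file per launch, named <counter>_<kernel>.json by the dumper.
        for path in sorted(glob.glob(os.path.join(dump_dir, "*.json"))):
            with open(path) as f:
                trace = json.load(f)
            n_texture = sum("texture" in a.get("scope", "") for a in trace["args"])
            print(f'{trace["api"]}: global={trace["global"]} '
                  f'local={trace["local"]} texture_args={n_texture}')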
for (cl_uint i = 0; i < arg_size_.size(); ++i) { void* arg = nullptr; @@ -74,8 +164,6 @@ class OpenCLWrappedFunc { OPENCL_CALL(clSetKernelArg(kernel, i, arg_size_[i], arg)); } cl_command_queue queue = w_->GetQueue(t->device); - ThreadWorkLoad wl = launch_param_config_.Extract(args); - cl_uint work_dim = static_cast(launch_param_config_.work_dim()); for (cl_uint i = 0; i < work_dim; ++i) { wl.work_size[i] *= wl.work_size[i + 3]; } diff --git a/src/runtime/tensor.cc b/src/runtime/tensor.cc index 08672e2b1ba9..4ef744452c3c 100644 --- a/src/runtime/tensor.cc +++ b/src/runtime/tensor.cc @@ -129,22 +129,16 @@ Tensor Tensor::Empty(ffi::Shape shape, DLDataType dtype, Device dev, DeviceAPI::Get(tensor->device)->FreeDataSpace(tensor->device, tensor->data); } }; - Tensor ret = ffi::Tensor::FromNDAlloc(DeviceAPIAlloc(), shape, dtype, dev, mem_scope); - if (mem_scope.has_value()) { - ret.SetScope(mem_scope.value()); - } - - return ret; + return ffi::Tensor::FromNDAlloc(DeviceAPIAlloc(), shape, dtype, dev, mem_scope); } Tensor Tensor::CreateView(ffi::Shape shape, DLDataType dtype, uint64_t relative_byte_offset) const { ICHECK(data_ != nullptr); const DLTensor& orig = *get_mutable(); - CHECK(IsContiguous() || ("global" != scope)) << [&orig]() { + CHECK(IsContiguous()) << [&orig]() { std::stringstream ss; - ss << "Can only create view for compact tensor, but found strides or its a memory scoped " - "object"; + ss << "Can only create view for compact tensor, but found strides "; ss << "["; for (int i = 0; i < orig.ndim; i++) { @@ -165,7 +159,6 @@ Tensor Tensor::CreateView(ffi::Shape shape, DLDataType dtype, uint64_t relative_ const auto& curr_dl_tensor = *get_mutable(); size_t curr_size = GetDataSize(curr_dl_tensor); size_t view_size = ffi::GetDataSize(shape.Product(), dtype); - CHECK_LE(relative_byte_offset + view_size, curr_size) << "ValueError: " << "View with shape " << shape << " and datatype " << dtype << " would have a size of " @@ -238,10 +231,6 @@ void Tensor::CopyFromTo(const DLTensor* from, DLTensor* to, TVMStreamHandle stre DeviceAPI::Get(dev)->CopyDataFromTo(const_cast(from), to, stream); } -void Tensor::SetScope(ffi::String scope) { this->scope = scope; } - -ffi::String Tensor::GetScope() const { return this->scope; } - } // namespace runtime } // namespace tvm diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc index 40edbc14c433..efa05fd38376 100644 --- a/src/runtime/vm/executable.cc +++ b/src/runtime/vm/executable.cc @@ -38,6 +38,7 @@ namespace vm { /*! \brief The magic number for the serialized VM bytecode file */ constexpr uint64_t kTVMVMBytecodeMagic = 0xD225DE2F4214151D; +constexpr uint64_t kTVMVMBytecodeMagicV2 = 0xD225DE2F4214151E; #define STREAM_CHECK(val, section) \ ICHECK(val) << "Invalid VM file format in the " << section << " section." \ @@ -143,22 +144,24 @@ Instruction VMExecutable::GetInstruction(Index i) const { } void SaveHeader(dmlc::Stream* strm) { - uint64_t header = kTVMVMBytecodeMagic; + uint64_t header = kTVMVMBytecodeMagicV2; strm->Write(header); std::string version = VM_VERSION; strm->Write(version); } -void LoadHeader(dmlc::Stream* strm) { +uint64_t LoadHeader(dmlc::Stream* strm) { // Check header. uint64_t header; STREAM_CHECK(strm->Read(&header), "header"); - STREAM_CHECK(header == kTVMVMBytecodeMagic, "header"); + STREAM_CHECK((header == kTVMVMBytecodeMagic) || (header == kTVMVMBytecodeMagicV2), "header"); // Check version. 
std::string version; STREAM_CHECK(strm->Read(&version), "version"); STREAM_CHECK(version == VM_VERSION, "version"); + + return header; } ffi::Bytes VMExecutable::SaveToBytes() const { @@ -172,6 +175,9 @@ ffi::Bytes VMExecutable::SaveToBytes() const { // Global section. SaveGlobalSection(&strm); + // Memory Scopes + SaveMemoryScopeSection(&strm); + // Constant section. SaveConstantSection(&strm); @@ -192,11 +198,16 @@ ffi::Module VMExecutable::LoadFromBytes(const ffi::Bytes& bytes) { ObjectPtr exec = ffi::make_object(); // Load header. - LoadHeader(&strm); + uint64_t header_magic = LoadHeader(&strm); // Global section. exec->LoadGlobalSection(&strm); + if (kTVMVMBytecodeMagicV2 == header_magic) { + // Memory Scopes + exec->LoadMemoryScopeSection(&strm); + } + // Constant section. exec->LoadConstantSection(&strm); @@ -245,6 +256,15 @@ bool VMFuncInfo::Load(dmlc::Stream* strm) { void VMExecutable::SaveGlobalSection(dmlc::Stream* strm) const { strm->Write(func_table); } +void VMExecutable::SaveMemoryScopeSection(dmlc::Stream* strm) const { + strm->Write(static_cast(this->memory_scopes.size())); + for (auto it = this->memory_scopes.begin(); it != this->memory_scopes.end(); it++) { + LOG(WARNING) << "Scope Saving:" << it->second; + strm->Write(it->first); + strm->Write(it->second); + } +} + void VMExecutable::SaveConstantSection(dmlc::Stream* strm) const { strm->Write(static_cast(this->constants.size())); for (const auto& it : this->constants) { @@ -293,6 +313,23 @@ void VMExecutable::LoadGlobalSection(dmlc::Stream* strm) { } } +void VMExecutable::LoadMemoryScopeSection(dmlc::Stream* strm) { + uint64_t sz; + // Load the number of memory scope entries. + STREAM_CHECK(strm->Read(&sz, sizeof(sz)), "memory scopes"); + + size_t size = static_cast(sz); + // Load each of the scopes. + for (size_t i = 0; i < size; i++) { + Index const_idx; + std::string scope; + STREAM_CHECK(strm->Read(&const_idx), "memory scopes"); + STREAM_CHECK(strm->Read(&scope), "memory scopes"); + LOG(WARNING) << "Scope Loaded:" << scope; + this->memory_scopes[const_idx] = scope; + } +} + void VMExecutable::LoadConstantSection(dmlc::Stream* strm) { uint64_t sz; // Load the number of constants. 
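For reference, the memory-scope section serialized above is just a count followed by (constant index, scope string) pairs. A reader sketch in plain Python, assuming little-endian 64-bit encodings for the count and index and dmlc::Stream's length-prefixed string format; the authoritative logic is LoadMemoryScopeSection above:

    import struct

    def read_memory_scope_section(buf, pos=0):
        # uint64 entry count, then (int64 constant index, length-prefixed string) pairs.
        (count,) = struct.unpack_from("<Q", buf, pos); pos += 8
        scopes = {}
        for _ in range(count):
            (const_idx,) = struct.unpack_from("<q", buf, pos); pos += 8
            (slen,) = struct.unpack_from("<Q", buf, pos); pos += 8
            scopes[const_idx] = buf[pos:pos + slen].decode(); pos += slen
        return scopes, pos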
diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc index 09ad5e97d44e..cd5475bf1ad5 100644 --- a/src/runtime/vm/vm.cc +++ b/src/runtime/vm/vm.cc @@ -84,19 +84,21 @@ ffi::Any IndexIntoNestedObject(ffi::Any obj, ffi::PackedArgs args, int starting_ return obj; } -Tensor ConvertTensorToDevice(Tensor src, const DLDevice& dev, Allocator* alloc) { +Tensor ConvertTensorToDevice(Tensor src, const DLDevice& dev, Allocator* alloc, + ffi::String scope = "global") { if (src->device.device_type == dev.device_type && src->device.device_id == dev.device_id) { return src; } else { - auto res = alloc->Empty(src.Shape(), src->dtype, dev, src.GetScope()); + auto res = alloc->Empty(src.Shape(), src->dtype, dev, scope); res.CopyFrom(src); return res; } } -Any ConvertObjectToDevice(Any src, const Device& dev, Allocator* alloc) { +Any ConvertObjectToDevice(Any src, const Device& dev, Allocator* alloc, + ffi::String scope = "global") { if (src.as()) { - return ConvertTensorToDevice(src.cast(), dev, alloc); + return ConvertTensorToDevice(src.cast(), dev, alloc, scope); } else if (src.as()) { std::vector ret; auto arr = src.cast>(); @@ -129,10 +131,11 @@ ffi::Any ConvertArgToDevice(ffi::AnyView input, Device dev, Allocator* alloc) { return ret; } -ffi::Any ConvertRegToDevice(ffi::Any input, Device dev, Allocator* alloc) { +ffi::Any ConvertRegToDevice(ffi::Any input, Device dev, Allocator* alloc, + ffi::String scope = "global") { ffi::Any ret; if (auto opt_obj = input.as()) { - ret = ConvertObjectToDevice(opt_obj.value(), dev, alloc); + ret = ConvertObjectToDevice(opt_obj.value(), dev, alloc, scope); } else { ret = input; } @@ -469,11 +472,12 @@ void VirtualMachineImpl::Init(const std::vector& devices, } // Setup constant sections. this->const_pool_.reserve(exec_->constants.size()); - for (const auto& constant : exec_->constants) { - if (auto opt_nd = constant.as()) { - this->const_pool_.push_back(ConvertRegToDevice(opt_nd.value(), devices[0], allocators[0])); + for (size_t i = 0; i < exec_->constants.size(); ++i) { + if (auto opt_nd = exec_->constants[i].as()) { + this->const_pool_.push_back( + ConvertRegToDevice(opt_nd.value(), devices[0], allocators[0], exec_->memory_scopes[i])); } else { - this->const_pool_.push_back(constant); + this->const_pool_.push_back(exec_->constants[i]); } } // Setup function sections. 
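The VM-side effect of the scope table shows up in the constant-pool setup above: each constant is copied to the device with whatever scope the executable recorded for its index. A stand-alone sketch of that placement logic with stand-in values (the real code allocates through the device Allocator, and unrecorded indices are treated as plain global memory here for illustration):

    constants = ["w0", "w1", "w2"]                  # stand-ins for Tensor constants
    memory_scopes = {1: "global.texture-weight"}    # index -> scope, from the V2 section

    const_pool = []
    for i, const in enumerate(constants):
        scope = memory_scopes.get(i, "global")      # illustrative default for missing entries
        const_pool.append((const, scope))

    assert const_pool[1] == ("w1", "global.texture-weight")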
diff --git a/tests/python/relax/adreno/test_transform_annotate_custom_scope.py b/tests/python/relax/adreno/test_transform_annotate_custom_scope.py index 24b4cf66b888..57abfd081350 100644 --- a/tests/python/relax/adreno/test_transform_annotate_custom_scope.py +++ b/tests/python/relax/adreno/test_transform_annotate_custom_scope.py @@ -123,8 +123,8 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-nhwc"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), - "conv2d_NCHWc_OIHWo_opencl": (["global.texture-nhwc", "global.texture-weight"], ["global"]), + "te_layout_transform1": (["global"], ["global"]), + "conv2d_NCHWc_OIHWo_opencl": (["global.texture-nhwc", "global"], ["global"]), "te_layout_transform2": (["global"], ["global"]), } @@ -151,9 +151,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -182,9 +182,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -213,9 +213,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -239,9 +239,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "fused_conv2d_NCHWc_OIHWo_opencl_relu": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -267,9 +267,9 @@ def main( Expected = { "relu": (["global"], ["global"]), "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "fused_conv2d_NCHWc_OIHWo_opencl_relu1": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -294,9 +294,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "fused_conv2d_NCHWc_OIHWo_opencl_relu_tir_tanh": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -321,10 +321,10 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": 
(["global"], ["global"]), "te_layout_transform2": (["global"], ["global.texture-weight"]), "fused_conv2d_NCHWc_OIHWo_opencl_add": ( - ["global.texture-weight", "global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global", "global.texture-weight"], ["global"], ), "te_layout_transform3": (["global"], ["global"]), @@ -352,16 +352,16 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), "relu": (["global"], ["global"]), "te_layout_transform3": (["global"], ["global.texture-weight"]), "conv2d_NCHWc_OIHWo1_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform4": (["global"], ["global"]), @@ -384,9 +384,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "sum": (["global"], ["global"]), @@ -410,9 +410,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "sum": (["global"], ["global"]), @@ -436,9 +436,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "sum": (["global"], ["global"]), @@ -462,9 +462,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -488,9 +488,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -514,9 +514,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -542,9 +542,9 @@ def main( Expected = { "te_layout_transform": 
(["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -569,9 +569,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global.texture-weight"], ), "fused_relu_concatenate": (["global.texture-weight"], ["global"]), @@ -595,9 +595,9 @@ def main(x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "fl Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global.texture-weight"], ), "fused_relu_concatenate_split": (["global.texture-weight"], ["global", "global"]), @@ -625,9 +625,9 @@ def main(x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "fl Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global.texture-weight"], ), "fused_relu_concatenate_split": (["global.texture-weight"], ["global", "global"]), @@ -660,9 +660,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global.texture-weight"], ), "max_pool2d_opencl": (["global.texture-weight"], ["global"]), @@ -686,9 +686,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global.texture-weight"], ), "adaptive_avg_pool2d_opencl": (["global.texture-weight"], ["global"]), @@ -712,9 +712,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -743,9 +743,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "layer_norm": 
(["global", "global", "global"], ["global"]), @@ -771,9 +771,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -797,9 +797,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "fused_conv2d_NCHWc_OIHWo_opencl_add": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -852,20 +852,20 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "te_layout_transform2": (["global"], ["global.texture-weight"]), "fused_conv2d_NCHWc_OIHWo_opencl_add_relu": ( - ["global.texture-weight", "global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global", "global.texture-weight"], ["global.texture-weight"], ), - "te_layout_transform3": (["global"], ["global.texture-weight"]), + "te_layout_transform3": (["global"], ["global"]), "multiply": (["global"], ["global"]), "fused_conv2d_NCHWc_OIHWo1_opencl_add_relu": ( - ["global.texture-weight", "global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global", "global.texture-weight"], ["global.texture-weight"], ), "fused_conv2d_NCHWc_OIHWo2_opencl_relu1": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform4": (["global"], ["global"]), @@ -911,15 +911,15 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "te_layout_transform2": (["global"], ["global.texture-weight"]), "fused_conv2d_NCHWc_OIHWo_opencl_add_relu": ( - ["global.texture-weight", "global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global", "global.texture-weight"], ["global"], ), - "te_layout_transform3": (["global"], ["global.texture-weight"]), + "te_layout_transform3": (["global"], ["global"]), "conv2d_NCHWc_OIHWo1_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform4": (["global"], ["global"]), @@ -968,20 +968,20 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "te_layout_transform2": (["global"], ["global.texture-weight"]), "fused_conv2d_NCHWc_OIHWo_opencl_add_relu": ( - ["global.texture-weight", "global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global", "global.texture-weight"], ["global.texture-weight"], ), - "te_layout_transform3": (["global"], ["global.texture-weight"]), + "te_layout_transform3": (["global"], ["global"]), "conv2d_NCHWc_OIHWo1_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], 
["global.texture-weight"], ), - "te_layout_transform4": (["global"], ["global.texture-weight"]), + "te_layout_transform4": (["global"], ["global"]), "conv2d_NCHWc_OIHWo2_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global.texture-weight"], ), "concatenate": (["global.texture-weight", "global.texture-weight"], ["global"]), @@ -1041,23 +1041,23 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global.texture-weight"], ), "max_pool2d_opencl": (["global.texture-weight"], ["global.texture-weight"]), - "te_layout_transform2": (["global"], ["global.texture-weight"]), + "te_layout_transform2": (["global"], ["global"]), "conv2d_NCHWc_OIHWo2_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global.texture-weight"], ), "fused_conv2d_NCHWc_OIHWo1_opencl_add": ( - ["global.texture-weight", "global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global", "global.texture-weight"], ["global.texture-weight"], ), "fused_conv2d_NCHWc_OIHWo3_opencl_add": ( - ["global.texture-weight", "global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global", "global.texture-weight"], ["global"], ), "te_layout_transform3": (["global"], ["global"]), @@ -1111,9 +1111,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -1121,7 +1121,7 @@ def main( "fused_conv2d_NCHWc_OIHWo_opencl_add_multiply_add": ( [ "global.texture-weight", - "global.texture-weight", + "global", "global.texture-weight", "global.texture-weight", "global.texture-weight", @@ -1180,9 +1180,9 @@ def main( Expected = { "te_layout_transform": (["global"], ["global.texture-weight"]), - "te_layout_transform1": (["global"], ["global.texture-weight"]), + "te_layout_transform1": (["global"], ["global"]), "conv2d_NCHWc_OIHWo_opencl": ( - ["global.texture-weight", "global.texture-weight"], + ["global.texture-weight", "global"], ["global"], ), "te_layout_transform2": (["global"], ["global"]), @@ -1190,7 +1190,7 @@ def main( "fused_conv2d_NCHWc_OIHWo_opencl_add_multiply_add": ( [ "global.texture-weight", - "global.texture-weight", + "global", "global.texture-weight", "global.texture-weight", ], diff --git a/tests/python/relax/test_transform.py b/tests/python/relax/test_transform.py index ac010aca79c1..78a52c66723e 100644 --- a/tests/python/relax/test_transform.py +++ b/tests/python/relax/test_transform.py @@ -373,7 +373,7 @@ def foo( tvm.ir.assert_structural_equal(Expected["foo"], new_mod["foo"], map_free_vars=True) -def _test_call_tir_inplace_some_new(): +def test_call_tir_inplace_some_new(): @tvm.script.ir_module class Input: @T.prim_func @@ -441,8 +441,16 @@ def foo( R.Tensor((2, 3), "int32"), R.Tensor((2, 3), "int32"), R.Tensor((2, 3), dtype="int32") ): R.func_attr({"relax.force_pure": True}) - gv0 = R.builtin.alloc_tensor(R.shape([2, 3]), "int32", R.prim_value(0)) - gv1 = 
R.builtin.alloc_tensor(R.shape([2, 3]), "int32", R.prim_value(0)) + gv0: R.Tensor((2, 3), dtype="int32") = R.emit_with_sinfo( + "relax.builtin.alloc_tensor", + (R.shape([2, 3]), R.dtype("int32"), R.prim_value(0), R.str("global")), + (R.Tensor((2, 3), dtype="int32"),), + ) + gv1: R.Tensor((2, 3), dtype="int32") = R.emit_with_sinfo( + "relax.builtin.alloc_tensor", + (R.shape([2, 3]), R.dtype("int32"), R.prim_value(0), R.str("global")), + (R.Tensor((2, 3), dtype="int32"),), + ) _ = Expected.copy(x, y, z, gv0, gv1) gv2 = (x, gv0, gv1) return gv2 diff --git a/tests/python/relax/test_transform_legalize_ops_manipulate.py b/tests/python/relax/test_transform_legalize_ops_manipulate.py index 5a3a8db7a55c..17e0160c6183 100644 --- a/tests/python/relax/test_transform_legalize_ops_manipulate.py +++ b/tests/python/relax/test_transform_legalize_ops_manipulate.py @@ -1703,7 +1703,7 @@ def main(x: R.Tensor((10, 20, 30), dtype="float32")) -> R.Tensor((10, 30, 7, 3), tvm.ir.assert_structural_equal(mod, Expected) -def _test_func_struct_info_of_legalized_layout_transform(): +def test_func_struct_info_of_legalized_layout_transform(): """PrimFunc shape information must be correct This is a regression test. Previously, the legalization of @@ -1745,8 +1745,10 @@ def main( ): R.func_attr({"relax.force_pure": True}) cls = Expected - alloc: R.Tensor((4, 4), dtype="float32") = R.builtin.alloc_tensor( - R.shape([4, 4]), R.dtype("float32"), R.prim_value(0), R.str("global") + alloc: R.Tensor((4, 4), dtype="float32") = R.emit_with_sinfo( + "relax.builtin.alloc_tensor", + (R.shape([4, 4]), R.dtype("float32"), R.prim_value(0), R.str("global")), + (R.Tensor((4, 4), dtype="float32"),), ) cls.te_layout_transform(x, alloc) lv = alloc diff --git a/tests/python/relax/texture/test_network.py b/tests/python/relax/texture/test_network.py index af8752d0fcbe..caae66a1b332 100644 --- a/tests/python/relax/texture/test_network.py +++ b/tests/python/relax/texture/test_network.py @@ -137,10 +137,10 @@ def build_and_run( params_dev = [] for k, v in params_np.items(): - params_dev.append(tvm.nd.array(v, dev)) + params_dev.append(tvm.runtime.tensor(v, dev)) f = vm["main"] - inputs = [tvm.nd.array(inp, dev) for inp in inputs_np] + inputs = [tvm.runtime.tensor(inp, dev) for inp in inputs_np] vm.set_input("main", *inputs) diff --git a/tests/python/relax/texture/test_ops.py b/tests/python/relax/texture/test_ops.py index db9cf01fd2ec..fb354072b4a0 100644 --- a/tests/python/relax/texture/test_ops.py +++ b/tests/python/relax/texture/test_ops.py @@ -132,7 +132,7 @@ def build_run(mod, inputs, is_adreno): device_arr = [dev] vm = relax.VirtualMachine(rexec, device_arr) - inputs = [tvm.nd.array(inp, dev) for inp in inputs] + inputs = [tvm.runtime.tensor(inp, dev) for inp in inputs] vm.set_input("main", *inputs) vm.invoke_stateful("main") tvm_output = vm.get_outputs("main") diff --git a/tests/scripts/setup-adreno-env.sh b/tests/scripts/setup-adreno-env.sh index a401fb808acb..3146ecb1c7dc 100755 --- a/tests/scripts/setup-adreno-env.sh +++ b/tests/scripts/setup-adreno-env.sh @@ -113,7 +113,7 @@ case ${ENVIRONMENT} in adb forward tcp:$((LISTEN_PORT + 1)) tcp:$((LISTEN_PORT + 1)) adb forward tcp:$((LISTEN_PORT + 2)) tcp:$((LISTEN_PORT + 2)) adb forward tcp:$((LISTEN_PORT + 3)) tcp:$((LISTEN_PORT + 3)) - adb shell "cd ${TARGET_FOLDER}; killall -9 tvm_rpc-${USER}; sleep 2; export CLML_DISABLE_RECORDABLE_QUEUE=1; export CLML_IS_TUNING_RUN=1; export CLML_TUNING_CACHE=clml.bin; LD_LIBRARY_PATH=${TARGET_FOLDER}/ ./tvm_rpc-${USER} server --host=0.0.0.0 
--port=${LISTEN_PORT} --port-end=$((LISTEN_PORT + 10)) --tracker=127.0.0.1:${TVM_TRACKER_PORT} --key=${RPC_DEVICE_KEY}" + adb shell "cd ${TARGET_FOLDER}; killall -9 tvm_rpc-${USER}; sleep 2; export CLML_DISABLE_RECORDABLE_QUEUE=1; export CLML_IS_TUNING_RUN=1; export CLML_TUNING_CACHE=clml.bin; PROFILE_SHADER_DUMP_PATH=cli LD_LIBRARY_PATH=${TARGET_FOLDER}/ ./tvm_rpc-${USER} server --host=0.0.0.0 --port=${LISTEN_PORT} --port-end=$((LISTEN_PORT + 10)) --tracker=127.0.0.1:${TVM_TRACKER_PORT} --key=${RPC_DEVICE_KEY}" ;; "query") diff --git a/tests/scripts/task_build_adreno_bins.sh b/tests/scripts/task_build_adreno_bins.sh index bc5a075b2325..2c4c9637ea47 100755 --- a/tests/scripts/task_build_adreno_bins.sh +++ b/tests/scripts/task_build_adreno_bins.sh @@ -49,6 +49,7 @@ echo set\(MACHINE_NAME aarch64-linux-gnu\) >> config.cmake echo set\(USE_OPENCL_GTEST ON\) >> config.cmake echo set\(USE_OPENCL_EXTN_QCOM ON\) >> config.cmake +echo set\(PROFILE_SHADER_DUMP ON\) >> config.cmake cmake -DANDROID_ABI=arm64-v8a \ -DCMAKE_SYSTEM_NAME=Android \ From ffc31d1be57479578c92c81568b17b2612f3c3af Mon Sep 17 00:00:00 2001 From: Siva Date: Thu, 4 Dec 2025 16:12:12 +0530 Subject: [PATCH 07/17] Lint --- python/tvm/dlight/adreno/fallback.py | 1 - python/tvm/dlight/adreno/pool.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/python/tvm/dlight/adreno/fallback.py b/python/tvm/dlight/adreno/fallback.py index ecb42fe62abc..050781a21f32 100644 --- a/python/tvm/dlight/adreno/fallback.py +++ b/python/tvm/dlight/adreno/fallback.py @@ -23,7 +23,6 @@ from tvm.target import Target from .. import analysis from .base import AdrenoScheduleRule -from .utils import get_texture_storage def _assert_gpu_target(target: Target): diff --git a/python/tvm/dlight/adreno/pool.py b/python/tvm/dlight/adreno/pool.py index 61c6ca33f5a2..3ee72ff28646 100644 --- a/python/tvm/dlight/adreno/pool.py +++ b/python/tvm/dlight/adreno/pool.py @@ -87,6 +87,6 @@ def schedule_max_pool(blk: tir.schedule.BlockRV): sch.reverse_compute_inline(blk) else: sch.compute_inline(blk) - except Exception: # pylint: disable=broad-except-bare + except Exception: # pylint: broad-exception-caught pass return sch From 0a8aba954feefc38562258a9591025299f0e19ee Mon Sep 17 00:00:00 2001 From: Siva Date: Thu, 4 Dec 2025 16:18:00 +0530 Subject: [PATCH 08/17] Lint --- include/tvm/relax/exec_builder.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/tvm/relax/exec_builder.h b/include/tvm/relax/exec_builder.h index 5353f0a73569..222dea3fb1f2 100644 --- a/include/tvm/relax/exec_builder.h +++ b/include/tvm/relax/exec_builder.h @@ -127,8 +127,8 @@ class ExecBuilderNode : public Object { * * This function builds the memory scopes for constants. * - * \param Index of the constant - * \param The memory scope. + * \param idx Index of the constant + * \param scope The memory scope. */ void SaveMemoryScope(vm::Instruction::Arg idx, ffi::String scope); /*! From 0b6ff712497c9f0c229a5d6d32c8f2d97085405a Mon Sep 17 00:00:00 2001 From: Siva Date: Thu, 4 Dec 2025 20:12:56 +0530 Subject: [PATCH 09/17] Tests execution, pipelines for adreno target. 
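For context, the intended driver flow for the pipeline and target changes in this patch looks roughly like the sketch below. tvm.target.adreno(cfg="texture") and the "texture" key check come from the diffs that follow; the build entry point (relax.build, or the unified tvm.compile) is the standard Relax flow and is assumed to pick up the Adreno pipeline for this target rather than being introduced here:

    import tvm
    from tvm import relax

    tgt = tvm.target.adreno(cfg="texture")   # keys: adreno,opencl,gpu,texture
    assert "texture" in tgt.keys

    # `mod` is assumed to be a Relax IRModule, e.g. one of the networks used in
    # tests/python/relax/texture/test_network.py.
    with tgt:
        ex = relax.build(mod, target=tgt)
        vm = relax.VirtualMachine(ex, tvm.cl())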
--- python/tvm/relax/backend/adreno/pipeline.py | 86 +++++- python/tvm/target/target.py | 21 +- src/relax/op/op_common.h | 1 + .../backend/clml/test_op_exec_clml_codegen.py | 36 +-- tests/python/relax/backend/clml/utils.py | 65 +++-- tests/python/relax/texture/test_network.py | 174 +---------- tests/python/relax/texture/test_ops.py | 275 ++++++------------ tests/scripts/task_build.py | 2 + 8 files changed, 236 insertions(+), 424 deletions(-) diff --git a/python/tvm/relax/backend/adreno/pipeline.py b/python/tvm/relax/backend/adreno/pipeline.py index 612b8ce7011d..cbbb17a6180a 100644 --- a/python/tvm/relax/backend/adreno/pipeline.py +++ b/python/tvm/relax/backend/adreno/pipeline.py @@ -18,6 +18,7 @@ import tvm from tvm import dlight as dl from tvm import relax +from tvm.relax.transform.legalize_ops import adreno as legalize_adreno def library_dispatch_passes(target: tvm.target.Target): # pylint: disable=unused-argument @@ -30,21 +31,78 @@ def library_dispatch_passes(target: tvm.target.Target): # pylint: disable=unuse def legalize_passes(target: tvm.target.Target): # pylint: disable=unused-argument """The default legalization passes for Adreno GPU backend.""" - return [ - relax.transform.DecomposeOpsForInference(), - relax.transform.FoldConstant(), - relax.transform.LegalizeOps(), - relax.transform.AnnotateTIROpPattern(), - relax.transform.FoldConstant(), - relax.transform.FuseOps(), - relax.transform.FuseTIR(), - relax.transform.DeadCodeElimination(), - dl.ApplyDefaultSchedule( - dl.gpu.Reduction(), - dl.gpu.GeneralReduction(), - dl.gpu.Fallback(), - ), + desired_layouts = {"relax.nn.conv2d": ["NCHW4c", "OIHW4o", "NCHW4c"]} + skip_ops = [ + "relax.nn.conv2d", + "relax.nn.max_pool2d", + "relax.nn.adaptive_avg_pool2d", ] + pass_list = [] + + pass_list.extend( + [ + tvm.tir.transform.BindTarget(tvm.target.Target.current(allow_none=False)), + relax.transform.DecomposeOpsForInference(), + ] + ) + if "texture" in target.keys: + pass_list.extend( + [ + relax.transform.ConvertLayout(desired_layouts), + relax.transform.Normalize(), + relax.transform.FoldConstant(), + relax.transform.LegalizeOps(skip_ops=skip_ops), + relax.transform.AnnotateTIROpPattern(), + relax.backend.adreno.transform.AnnotateCustomMemoryScope(target), + ] + ) + pass_list.extend([tvm.relax.transform.LegalizeOps()]) + if "texture" in target.keys: + pass_list.extend( + [ + relax.transform.LegalizeOps( + {"relax.nn.conv2d": legalize_adreno.conv2d_NCHWc_OIHWo}, + ) + ] + ) + + pass_list.extend( + [ + relax.transform.AnnotateTIROpPattern(), + relax.transform.FoldConstant(), + relax.transform.FuseOps(), + relax.transform.FuseTIR(), + relax.transform.DeadCodeElimination(), + ] + ) + if "texture" in target.keys: + pass_list.extend( + [ + relax.backend.adreno.transform.FoldVDeviceScopeChange(), + relax.transform.DeadCodeElimination(), + relax.transform.SpecializePrimFuncBasedOnCallSite(), + ] + ) + pass_list.extend([relax.transform.Normalize()]) + pass_list.extend( + [ + dl.ApplyDefaultSchedule( + dl.adreno.Conv2d(), + dl.adreno.LayoutTransform(), + dl.adreno.Pool2D(), + ) + ] + ) + pass_list.extend( + [ + dl.ApplyDefaultSchedule( + dl.gpu.Reduction(), + dl.gpu.GeneralReduction(), + dl.gpu.Fallback(), + ) + ] + ) + return pass_list def dataflow_lower_passes(target: tvm.target.Target): # pylint: disable=unused-argument diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index eb6e25f0450c..31788d5f062a 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -838,7 +838,7 @@ def 
stm32(series="unknown", options=None): return Target(" ".join(["c"] + opts)) -def adreno(model="unknown", options=None, clml=False): +def adreno(model="unknown", options=None, cfg=None, backend="opencl"): """Returns a Qualcomm GPU target. Parameters ---------- @@ -846,13 +846,22 @@ def adreno(model="unknown", options=None, clml=False): The model of this device options : str or list of str Additional options + cfg : str + Additional hints for target pipeline behavior + backend : str + Backend API, can be "opencl" or "vulkan" """ - if clml: - opts = ["-device=adreno", "--keys=adreno,opencl,gpu,clml", "-model=%s" % model] - else: - opts = ["-device=adreno", "--keys=adreno,opencl,gpu", "-model=%s" % model] + + if backend not in ["opencl", "vulkan"]: + raise ValueError(f"Unsupported API: {backend}. Must be 'opencl' or 'vulkan'.") + + keys = f"adreno,{backend},gpu" + if cfg: + keys += f",{cfg}" + + opts = ["-device=adreno", f"--keys={keys}", f"-model={model}"] opts = _merge_opts(opts, options) - return Target(" ".join(["opencl"] + opts)) + return Target(" ".join([backend] + opts)) def create(target): diff --git a/src/relax/op/op_common.h b/src/relax/op/op_common.h index c77d7a07823f..5c4f563bebee 100644 --- a/src/relax/op/op_common.h +++ b/src/relax/op/op_common.h @@ -213,6 +213,7 @@ inline StructInfo InferStructInfoUnary(const Call& call, const BlockBuilder& ctx output_sinfo->dtype = f_compute_out_dtype(input_sinfo); if (call->sinfo_args.size() > 0) { auto defined_sinfo = call->sinfo_args[0].as(); + ICHECK(defined_sinfo); auto shape = output_sinfo->GetShape(); ICHECK(shape.defined()); ICHECK(defined_sinfo->vdevice.has_value()); diff --git a/tests/python/relax/backend/clml/test_op_exec_clml_codegen.py b/tests/python/relax/backend/clml/test_op_exec_clml_codegen.py index 4e5b4b652b45..d0a4a525adf7 100644 --- a/tests/python/relax/backend/clml/test_op_exec_clml_codegen.py +++ b/tests/python/relax/backend/clml/test_op_exec_clml_codegen.py @@ -20,6 +20,7 @@ import tvm import tvm.testing import json +import os from tvm import relax, rpc from tvm.script import relax as R @@ -73,7 +74,6 @@ def test_conv2d_offload( has_pad, is_depthwise, dtype, - rpc, ): low, high = 0, 1 data_shape = (1, *shape) @@ -117,7 +117,7 @@ def test_conv2d_offload( has_pad=has_pad, is_depthwise=is_depthwise, ) - run_compare(mod, inputs, params_np, rpc) + run_compare(mod, inputs, params_np) @tvm.testing.requires_openclml @@ -131,7 +131,7 @@ def test_conv2d_offload( [(1, 256, 1, 1), 1, 3e-4], ], ) -def test_batchnorm(dtype, trials, rpc): +def test_batchnorm(dtype, trials): low, high = 0, 1 if clml.clml_sdk_version() < 3: print("Skip due to unsupported CLML version:", clml.clml_sdk_version()) @@ -159,7 +159,7 @@ def _get_axis_tuple(axis): inputs = [data] params_np = {"gamma": gamma, "beta": beta, "moving_mean": mean, "moving_var": variance} mod = get_batchnorm_mod(input_shape, channels, axis, epsilon, dtype) - run_compare(mod, inputs, params_np, rpc) + run_compare(mod, inputs, params_np) @tvm.testing.requires_openclml @@ -182,9 +182,9 @@ def _get_axis_tuple(axis): ], ) @tvm.testing.requires_openclml -def test_binary_ops(a_shape, b_shape, op, rpc, dtype): +def test_binary_ops(a_shape, b_shape, op, dtype): (mod, inputs) = get_binary_op_mod(a_shape, b_shape, op, dtype) - run_compare(mod, inputs, {}, rpc) + run_compare(mod, inputs, {}) @tvm.testing.requires_openclml @@ -204,9 +204,9 @@ def test_binary_ops(a_shape, b_shape, op, rpc, dtype): ], ) @tvm.testing.requires_openclml -def test_unary_ops(a_shape, op, rpc, dtype): +def 
test_unary_ops(a_shape, op, dtype): (mod, inputs) = get_unary_op_mod(a_shape, op, dtype) - run_compare(mod, inputs, {}, rpc) + run_compare(mod, inputs, {}) @tvm.testing.requires_openclml @@ -223,14 +223,14 @@ def test_unary_ops(a_shape, op, rpc, dtype): [(1, 32, 256, 256), (2, 2), (2, 2), (1, 1), (1, 0, 1, 0), True], ], ) -def test_max_pool(dtype, trials, rpc): +def test_max_pool(dtype, trials): low, high = -1, 1 (input_shape, pool_size, stride, dilation, padding, has_pad) = trials data = np.random.uniform(low, high, size=input_shape).astype(dtype) inputs = [data] mod = get_relax_maxpool_mod(input_shape, dtype, pool_size, stride, dilation, padding, has_pad) params_np = {} - run_compare(mod, inputs, params_np, rpc) + run_compare(mod, inputs, params_np) @tvm.testing.requires_openclml @@ -247,14 +247,14 @@ def test_max_pool(dtype, trials, rpc): [(1, 32, 256, 256), (2, 2), (2, 2), (1, 1), (1, 0, 1, 0), True], ], ) -def test_avg_pool(dtype, trials, rpc): +def test_avg_pool(dtype, trials): low, high = -1, 1 (input_shape, pool_size, stride, dilation, padding, has_pad) = trials data = np.random.uniform(low, high, size=input_shape).astype(dtype) inputs = [data] mod = get_relax_avgpool_mod(input_shape, dtype, pool_size, stride, dilation, padding, has_pad) params_np = {} - run_compare(mod, inputs, params_np, rpc) + run_compare(mod, inputs, params_np) @tvm.testing.requires_openclml @@ -267,14 +267,14 @@ def test_avg_pool(dtype, trials, rpc): [(1, 64, 3, 3), (1, 32, 3, -1)], ], ) -def test_reshape(dtype, trials, rpc): +def test_reshape(dtype, trials): low, high = -1, 1 (input_shape, output_shape) = trials data = np.random.uniform(low, high, size=input_shape).astype(dtype) inputs = [data] mod = get_relax_reshape_mod(input_shape, output_shape, dtype) params_np = {} - run_compare(mod, inputs, params_np, rpc) + run_compare(mod, inputs, params_np) @tvm.testing.requires_openclml @@ -288,7 +288,7 @@ def test_reshape(dtype, trials, rpc): [(1, 32, 256, 256), False], ], ) -def test_global_avg_pool(dtype, trials, rpc): +def test_global_avg_pool(dtype, trials): """Test function for global average pooling.""" low, high = -1, 1 (input_shape, keep_dims) = trials @@ -296,7 +296,7 @@ def test_global_avg_pool(dtype, trials, rpc): inputs = [data] mod = get_relax_global_avgpool_mod(input_shape, keep_dims, dtype) params_np = {} - run_compare(mod, inputs, params_np, rpc) + run_compare(mod, inputs, params_np) @tvm.testing.requires_openclml @@ -310,7 +310,7 @@ def test_global_avg_pool(dtype, trials, rpc): [(1, 32, 256, 256), False], ], ) -def test_global_max_pool(dtype, trials, rpc): +def test_global_max_pool(dtype, trials): """Test function for global average pooling.""" low, high = -1, 1 (input_shape, keep_dims) = trials @@ -322,7 +322,7 @@ def test_global_max_pool(dtype, trials, rpc): inputs = [data] mod = get_relax_global_maxpool_mod(input_shape, keep_dims, dtype) params_np = {} - run_compare(mod, inputs, params_np, rpc) + run_compare(mod, inputs, params_np) if __name__ == "__main__": diff --git a/tests/python/relax/backend/clml/utils.py b/tests/python/relax/backend/clml/utils.py index d32a2df38ffd..0f3516f96122 100644 --- a/tests/python/relax/backend/clml/utils.py +++ b/tests/python/relax/backend/clml/utils.py @@ -28,6 +28,25 @@ from tvm.contrib import utils, ndk from tvm.relax.backend.adreno.clml import OpenCLMLOffLoad +import os +from tvm import rpc as _rpc + + +def get_rpc(): + rpc_target = os.getenv("RPC_TARGET", None) + if rpc_target: + connection_type = "tracker" + host = os.getenv("TVM_TRACKER_HOST", "localhost") 
+ port = int(os.getenv("TVM_TRACKER_PORT", 9090)) + target = "opencl" + target_host = "llvm -mtriple=aarch64-linux-gnu" + device_key = os.getenv("RPC_DEVICE_KEY", "android") + cross_compile = os.getenv("TVM_NDK_CC", "aarch64-linux-android-g++") + tracker = _rpc.connect_tracker(host, port) + return tracker.request(device_key, priority=1, session_timeout=1000) + else: + return None + def build_and_run( mod, @@ -35,40 +54,39 @@ def build_and_run( target, rpc=None, load_path="vm_library.so", - clml_enable=False, ): tgt = tvm.target.Target(target, host="llvm -mtriple=aarch64-linux-gnu") - pipeline = relax.pipeline.get_default_pipeline(tgt) - mod = pipeline(mod) - if rpc: - ex = tvm.compile(mod, tgt) - temp = utils.tempdir() - path = temp.relpath(load_path) - path = "./" + load_path - ex.export_library(path, fcompile=ndk.create_shared, options=["-shared", "-fPIC", "-lm"]) - rpc.upload(path) - rexec = rpc.load_module(load_path) - dev = rpc.cl(0) - vm = relax.VirtualMachine(rexec, dev) - else: - ex = tvm.compile(mod, target) - dev = tvm.device(target, 0) - vm = relax.VirtualMachine(ex, dev) + relax_pipeline = relax.pipeline.get_default_pipeline(tgt) + tir_pipeline = tvm.tir.get_default_tir_pipeline(tgt) + + ex = tvm.compile(mod, tgt, relax_pipeline=relax_pipeline, tir_pipeline=tir_pipeline) + temp = utils.tempdir() + path = temp.relpath(load_path) + path = "./" + load_path + ex.export_library(path, fcompile=ndk.create_shared, options=["-shared", "-fPIC", "-lm"]) + + rpc.upload(path) + rexec = rpc.load_module(load_path) + dev = rpc.cl(0) + vm = relax.VirtualMachine(rexec, dev) f = vm["main"] inputs = [tvm.runtime.tensor(inp, dev) for inp in inputs_np] vm.set_input("main", *inputs) vm.invoke_stateful("main") - tvm_output = vm.get_outputs("main") - return tvm_output.numpy() + tvm_output = vm.get_outputs("main").numpy() + if rpc: + rpc.get_function("CloseRPCConnection")() + return tvm_output -def run_compare(mod, inputs, params_np, rpc=None): +def run_compare(mod, inputs, params_np): clml_mod = copy.deepcopy(mod) mod = tvm.relax.transform.BindParams("main", params_np)(mod) clml_mod = tvm.relax.transform.BindParams("main", params_np)(clml_mod) - if not rpc: + rpc = get_rpc() + if rpc is None: return ref = build_and_run( @@ -78,12 +96,13 @@ def run_compare(mod, inputs, params_np, rpc=None): rpc=rpc, load_path="vm_library_opencl.so", ) + + rpc = get_rpc() out = build_and_run( clml_mod, inputs, - tvm.target.adreno(clml=True), + tvm.target.adreno(cfg="clml"), rpc=rpc, load_path="vm_library_clml.so", - clml_enable=True, ) np.testing.assert_allclose(out, ref, rtol=1e-5, atol=1e-5) diff --git a/tests/python/relax/texture/test_network.py b/tests/python/relax/texture/test_network.py index caae66a1b332..7e2d9636270f 100644 --- a/tests/python/relax/texture/test_network.py +++ b/tests/python/relax/texture/test_network.py @@ -26,6 +26,7 @@ from tvm.script import relax as R from tvm.script import ir as I from tvm.script import tir as T +from adreno_utils import verify from tvm.script.ir_builder import IRBuilder from tvm.script.ir_builder import relax as relax_builder from tvm.relax.frontend.onnx import from_onnx @@ -35,174 +36,6 @@ import json import copy -from tvm import dlight as dl -from tvm.contrib import utils, ndk - -from tvm import topi - - -def build_and_run( - mod, inputs_np, target, rpc=None, params_np={}, load_path="vm_library.so", is_adreno=False -): - skip_ops = [ - "relax.nn.conv2d", - "relax.nn.max_pool2d", - "relax.nn.adaptive_avg_pool2d", - # "relax.nn.layer_norm", - ] - - tgt = 
tvm.target.Target(target, host="llvm -mtriple=aarch64-linux-gnu") - - with tgt: - mod = tvm.tir.transform.BindTarget(tvm.target.Target.current(allow_none=False))(mod) - mod = tvm.relax.transform.FoldBatchnormToConv2D()(mod) - mod = tvm.relax.transform.FoldConstant()(mod) - mod = tvm.relax.transform.DecomposeOpsForInference()(mod) - mod = tvm.relax.transform.FoldConstant()(mod) - mod = tvm.relax.transform.DeadCodeElimination()(mod) - desired_layouts = {"relax.nn.conv2d": ["NCHW4c", "OIHW4o", "NCHW4c"]} - if is_adreno: - mod = tvm.relax.transform.ConvertLayout(desired_layouts)(mod) - mod = tvm.relax.transform.Normalize()(mod) - mod = tvm.relax.transform.FoldConstant()(mod) - mod = tvm.relax.transform.LegalizeOps(skip_ops=skip_ops)(mod) - mod = tvm.relax.transform.AnnotateTIROpPattern()(mod) - mod = tvm.relax.backend.adreno.transform.AnnotateCustomMemoryScope(tgt)(mod) - mod = tvm.relax.transform.LegalizeOps()(mod) - if is_adreno: - mod = tvm.relax.transform.LegalizeOps( - {"relax.nn.conv2d": legalize_adreno.conv2d_NCHWc_OIHWo}, - )(mod) - mod = tvm.relax.transform.AnnotateTIROpPattern()(mod) - mod = tvm.relax.transform.FoldConstant()(mod) - mod = tvm.relax.transform.FuseOps()(mod) - mod = tvm.relax.transform.FuseTIR()(mod) - mod = tvm.relax.transform.DeadCodeElimination()(mod) - if is_adreno: - mod = tvm.relax.backend.adreno.transform.FoldVDeviceScopeChange()(mod) - mod = tvm.relax.transform.DeadCodeElimination()(mod) - mod = tvm.relax.transform.SpecializePrimFuncBasedOnCallSite()(mod) - mod = tvm.relax.transform.Normalize()(mod) - - if is_adreno: - mod = dl.ApplyDefaultSchedule( - dl.adreno.Conv2d(), - dl.adreno.LayoutTransform(), - dl.adreno.Pool2D(), - dl.adreno.Fallback(), - )(mod) - - mod = dl.ApplyDefaultSchedule( - dl.gpu.Reduction(), - dl.gpu.GeneralReduction(), - dl.gpu.Fallback(), - )(mod) - - mod = tvm.relax.transform.ToNonDataflow()(mod) - mod = tvm.relax.transform.RemovePurityChecking()(mod) - mod = tvm.relax.transform.CallTIRRewrite()(mod) - mod = tvm.relax.transform.Normalize()(mod) - mod = tvm.relax.transform.StaticPlanBlockMemory()(mod) - mod = tvm.relax.transform.LowerAllocTensor()(mod) - mod = tvm.relax.transform.KillAfterLastUse()(mod) - mod = tvm.relax.transform.VMBuiltinLower()(mod) - mod = tvm.relax.transform.VMShapeLower()(mod) - mod = tvm.relax.transform.AttachGlobalSymbol()(mod) - - if rpc: - ex = relax.build(mod, tgt) - # if is_adreno: - # for smod in ex.mod.imported_modules: - # print("Mod:", smod.type_key) - # for imp_mod in smod.imported_modules: - # print("Imp Mod:", imp_mod.type_key) - # print(imp_mod.get_source()) - temp = utils.tempdir() - path = temp.relpath(load_path) - path = "./" + load_path - ex.export_library(path, fcompile=ndk.create_shared, options=["-shared", "-fPIC", "-lm"]) - rpc.upload(path) - rexec = rpc.load_module(load_path) - dev = rpc.cl(0) - if "vdevice" in mod.global_infos: - device_arr = [dev for ii in range(len(mod.global_infos["vdevice"]))] - else: - device_arr = [dev] - - vm = relax.VirtualMachine(rexec, device_arr) - else: - ex = relax.build(mod, target) - dev = tvm.device(target, 0) - vm = relax.VirtualMachine(ex, dev) - - params_dev = [] - for k, v in params_np.items(): - params_dev.append(tvm.runtime.tensor(v, dev)) - - f = vm["main"] - inputs = [tvm.runtime.tensor(inp, dev) for inp in inputs_np] - - vm.set_input("main", *inputs) - - vm.invoke_stateful("main") - - tvm_output = vm.get_outputs("main") - return tvm_output.numpy() - - -import os -from tvm import rpc as _rpc - - -def get_rpc(): - rpc_target = os.getenv("RPC_TARGET", 
None) - if rpc_target: - connection_type = "tracker" - host = os.getenv("TVM_TRACKER_HOST", "localhost") - port = int(os.getenv("TVM_TRACKER_PORT", 9090)) - target = "opencl" - target_host = "llvm -mtriple=aarch64-linux-gnu" - device_key = os.getenv("RPC_DEVICE_KEY", "android") - cross_compile = os.getenv("TVM_NDK_CC", "aarch64-linux-android-g++") - tracker = _rpc.connect_tracker(host, port) - return tracker.request(device_key, priority=1, session_timeout=1000) - else: - return None - - -def verify(mod): - inputs = [] - for arg in mod["main"].params: - shape = tuple(shape_val.value for shape_val in arg.struct_info.shape.values) - inputs.append(np.random.uniform(0, 1, size=shape).astype(arg.struct_info.dtype)) - - rpc = get_rpc() - mod1 = copy.deepcopy(mod) - ret1 = build_and_run( - mod, - inputs, - "opencl -device=adreno", - rpc=rpc, - params_np={}, - load_path="vm_library_opencl-texture.so", - is_adreno=True, - ) - ret2 = build_and_run( - mod1, - inputs, - "opencl", - rpc=rpc, - params_np={}, - load_path="vm_library_opencl.so", - is_adreno=False, - ) - - if isinstance(ret1, tuple): - for val1, val2 in zip(ret1, ret2): - tvm.testing.assert_allclose(val1, ret2, rtol=1e-5, atol=1e-5) - else: - tvm.testing.assert_allclose(ret1, ret2, rtol=1e-5, atol=1e-5) - @pytest.mark.parametrize("dtype", ["float32"]) @pytest.mark.parametrize( @@ -226,7 +59,8 @@ def _test_network(url, shape_dict, dtype): @tvm.testing.requires_opencl -def test_network_resnet(): +@tvm.testing.parametrize_targets("opencl") +def test_network_resnet(target): @I.ir_module class Resnet: @R.function @@ -985,7 +819,7 @@ def main( R.output(gv) return gv - verify(Resnet) + verify(Resnet, target) if __name__ == "__main__": diff --git a/tests/python/relax/texture/test_ops.py b/tests/python/relax/texture/test_ops.py index fb354072b4a0..7011852aa3ab 100644 --- a/tests/python/relax/texture/test_ops.py +++ b/tests/python/relax/texture/test_ops.py @@ -16,153 +16,16 @@ # under the License. 
import tvm -import numpy as np -from tvm import relax import tvm.testing from tvm.relax.transform import ConvertLayout, Normalize from tvm.script.parser import ir as I, relax as R, tir as T from tvm.relax.transform.legalize_ops import adreno as legalize_adreno -from tvm.ir.module import IRModule -from tvm.relax.expr_functor import PyExprMutator, PyExprVisitor, mutator, visitor -from tvm import dlight as dl -from tvm.contrib import utils, ndk - -import os -from tvm import rpc as _rpc - - -def get_rpc(): - rpc_target = os.getenv("RPC_TARGET", None) - if rpc_target: - connection_type = "tracker" - host = os.getenv("TVM_TRACKER_HOST", "localhost") - port = int(os.getenv("TVM_TRACKER_PORT", 9090)) - target = "opencl" - target_host = "llvm -mtriple=aarch64-linux-gnu" - device_key = os.getenv("RPC_DEVICE_KEY", "android") - cross_compile = os.getenv("TVM_NDK_CC", "aarch64-linux-android-g++") - tracker = _rpc.connect_tracker(host, port) - return tracker.request(device_key, priority=1, session_timeout=1000) - else: - return None - - -def build_run(mod, inputs, is_adreno): - tgt = tvm.target.Target("opencl --device=adreno", host="llvm -mtriple=aarch64-linux-gnu") - skip_ops = [ - "relax.nn.conv2d", - "relax.nn.max_pool2d", - "relax.nn.adaptive_avg_pool2d", - # "relax.nn.layer_norm", - ] - with tgt: - mod = tvm.tir.transform.BindTarget(tvm.target.Target.current(allow_none=False))(mod) - mod = tvm.relax.transform.DecomposeOpsForInference()(mod) - mod = tvm.relax.transform.FoldConstant()(mod) - desired_layouts = {"relax.nn.conv2d": ["NCHW4c", "OIHW4o", "NCHW4c"]} - if is_adreno: - mod = tvm.relax.transform.ConvertLayout(desired_layouts)(mod) - mod = tvm.relax.transform.Normalize()(mod) - mod = tvm.relax.transform.FoldConstant()(mod) - mod = tvm.relax.transform.LegalizeOps(skip_ops=skip_ops)(mod) - mod = tvm.relax.transform.AnnotateTIROpPattern()(mod) - mod = tvm.relax.backend.adreno.transform.AnnotateCustomMemoryScope(tgt)(mod) - mod = tvm.relax.transform.LegalizeOps()(mod) - if is_adreno: - mod = tvm.relax.transform.LegalizeOps( - {"relax.nn.conv2d": legalize_adreno.conv2d_NCHWc_OIHWo}, - )(mod) - mod = tvm.relax.transform.AnnotateTIROpPattern()(mod) - mod = tvm.relax.transform.FoldConstant()(mod) - mod = tvm.relax.transform.FuseOps()(mod) - mod = tvm.relax.transform.FuseTIR()(mod) - mod = tvm.relax.transform.DeadCodeElimination()(mod) - if is_adreno: - mod = tvm.relax.backend.adreno.transform.FoldVDeviceScopeChange()(mod) - mod = tvm.relax.transform.DeadCodeElimination()(mod) - mod = tvm.relax.transform.SpecializePrimFuncBasedOnCallSite()(mod) - mod = tvm.relax.transform.Normalize()(mod) - - if is_adreno: - mod = dl.ApplyDefaultSchedule( - dl.adreno.Conv2d(), - dl.adreno.LayoutTransform(), - dl.adreno.Pool2D(), - )(mod) - - mod = dl.ApplyDefaultSchedule( - dl.gpu.Reduction(), - dl.gpu.GeneralReduction(), - dl.gpu.Fallback(), - )(mod) - - mod = tvm.relax.transform.ToNonDataflow()(mod) - mod = tvm.relax.transform.RemovePurityChecking()(mod) - # print(mod) - mod = tvm.relax.transform.CallTIRRewrite()(mod) - mod = tvm.relax.transform.Normalize()(mod) - mod = tvm.relax.transform.StaticPlanBlockMemory()(mod) - mod = tvm.relax.transform.LowerAllocTensor()(mod) - mod = tvm.relax.transform.KillAfterLastUse()(mod) - mod = tvm.relax.transform.VMBuiltinLower()(mod) - mod = tvm.relax.transform.VMShapeLower()(mod) - mod = tvm.relax.transform.AttachGlobalSymbol()(mod) - - # print("Mod relax.build:", mod) - # exit(0) - ex = relax.build(mod, tgt) - # for smod in ex.mod.imported_modules: - # print("Mod:", 
smod.type_key) - # for cmod in smod.imported_modules: - # print(cmod.get_source()) - load_path = "vm_library.so" - temp = utils.tempdir() - path = temp.relpath(load_path) - path = "./" + load_path - ex.export_library(path, fcompile=ndk.create_shared, options=["-shared", "-fPIC", "-lm"]) - - rpc = get_rpc() - rpc.upload(path) - rexec = rpc.load_module(load_path) - dev = rpc.cl(0) - - if "vdevice" in mod.global_infos: - device_arr = [dev for ii in range(len(mod.global_infos["vdevice"]))] - else: - device_arr = [dev] - - vm = relax.VirtualMachine(rexec, device_arr) - inputs = [tvm.runtime.tensor(inp, dev) for inp in inputs] - vm.set_input("main", *inputs) - vm.invoke_stateful("main") - tvm_output = vm.get_outputs("main") - if isinstance(tvm_output, tuple): - tvm_output = (out.numpy() for out in tvm_output) - else: - tvm_output = tvm_output.numpy() - - rpc.get_function("CloseRPCConnection")() - return tvm_output - - -def verify(mod): - inputs = [] - for arg in mod["main"].params: - shape = tuple(shape_val.value for shape_val in arg.struct_info.shape.values) - inputs.append(np.random.uniform(-1, 1, size=shape).astype(arg.struct_info.dtype)) - - ret1 = build_run(mod, inputs, True) - ret2 = build_run(mod, inputs, False) - - if isinstance(ret1, tuple): - for val1, val2 in zip(ret1, ret2): - tvm.testing.assert_allclose(val1, ret2, rtol=1e-3, atol=1e-3) - else: - tvm.testing.assert_allclose(ret1, ret2, rtol=1e-3, atol=1e-3) +from adreno_utils import verify @tvm.testing.requires_opencl -def test_conv2d(): +@tvm.testing.parametrize_targets("opencl") +def test_conv2d(target): @I.ir_module class Input: @R.function @@ -174,11 +37,12 @@ def main( R.output(gv) return gv - verify(Input) + verify(Input, target) @tvm.testing.requires_opencl -def test_conv2d_relu(): +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_relu(target): @I.ir_module class Input: @R.function @@ -191,11 +55,12 @@ def main( R.output(gv2) return gv2 - verify(Input) + verify(Input, target) @tvm.testing.requires_opencl -def test_relu_conv2d_relu(): +@tvm.testing.parametrize_targets("opencl") +def test_relu_conv2d_relu(target): @I.ir_module class Input: @R.function @@ -209,11 +74,12 @@ def main( R.output(gv2) return gv2 - verify(Input) + verify(Input, target) @tvm.testing.requires_opencl -def test_conv2d_relu_tanh(): +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_relu_tanh(target): @I.ir_module class Input: @R.function @@ -227,11 +93,12 @@ def main( R.output(gv3) return gv3 - verify(Input) + verify(Input, target) @tvm.testing.requires_opencl -def test_conv2d_add(): +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_add(target): @I.ir_module class Input: @R.function @@ -246,11 +113,12 @@ def main( R.output(gv2) return gv2 - verify(Input) + verify(Input, target) @tvm.testing.requires_opencl -def test_conv2d_sum(): +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_sum(target): @I.ir_module class Input: @R.function @@ -263,11 +131,12 @@ def main( R.output(gv2) return gv2 - verify(Input) + verify(Input, target) @tvm.testing.requires_opencl -def test_conv2d_sum_keepdims(): +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_sum_keepdims(target): @I.ir_module class Input: @R.function @@ -280,11 +149,12 @@ def main( R.output(gv2) return gv2 - verify(Input) + verify(Input, target) @tvm.testing.requires_opencl -def test_conv2d_sum_reduce(): +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_sum_reduce(target): @I.ir_module class Input: @R.function @@ -297,11 +167,12 @@ def main( 
R.output(gv2) return gv2 - verify(Input) + verify(Input, target) @tvm.testing.requires_opencl -def test_conv2d_transpose(): +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_transpose(target): @I.ir_module class Input: @R.function @@ -314,11 +185,12 @@ def main( R.output(gv2) return gv2 - verify(Input) + verify(Input, target) @tvm.testing.requires_opencl -def test_conv2d_expand_dims(): +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_expand_dims(target): @I.ir_module class Input: @R.function @@ -331,11 +203,12 @@ def main( R.output(gv2) return gv2 - verify(Input) + verify(Input, target) @tvm.testing.requires_opencl -def test_conv2d_squeeze(): +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_squeeze(target): @I.ir_module class Input: @R.function @@ -348,11 +221,12 @@ def main( R.output(gv2) return gv2 - verify(Input) + verify(Input, target) @tvm.testing.requires_opencl -def test_conv2d_strided_slice(): +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_strided_slice(target): @I.ir_module class Input: @R.function @@ -367,11 +241,12 @@ def main( R.output(gv2) return gv2 - verify(Input) + verify(Input, target) @tvm.testing.requires_opencl -def test_conv2d_relu_concat(): +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_relu_concat(target): @I.ir_module class Input: @R.function @@ -385,11 +260,12 @@ def main( R.output(gv3) return gv3 - verify(Input) + verify(Input, target) @tvm.testing.requires_opencl -def test_conv2d_relu_concat_split(): +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_relu_concat_split(target): @I.ir_module class Input: @R.function @@ -404,11 +280,12 @@ def main(x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "fl R.output(gv5) return gv5 - verify(Input) + verify(Input, target) @tvm.testing.requires_opencl -def test_conv2d_relu_concat_split_transpose_concat(): +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_relu_concat_split_transpose_concat(target): @I.ir_module class Input: @R.function @@ -424,11 +301,12 @@ def main(x: R.Tensor((2, 16, 28, 28), "float32"), w: R.Tensor((4, 16, 3, 3), "fl R.output(gv7) return gv7 - verify(Input) + verify(Input, target) @tvm.testing.requires_opencl -def test_conv2d_maxpool2d(): +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_maxpool2d(target): @I.ir_module class Input: @R.function @@ -448,11 +326,12 @@ def main( R.output(gv2) return gv2 - verify(Input) + verify(Input, target) @tvm.testing.requires_opencl -def test_conv2d_avgpool2d(): +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_avgpool2d(target): @I.ir_module class Input: @R.function @@ -465,11 +344,12 @@ def main( R.output(gv2) return gv2 - verify(Input) + verify(Input, target) @tvm.testing.requires_opencl -def test_conv2d_softmax(): +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_softmax(target): @I.ir_module class Input: @R.function @@ -482,11 +362,12 @@ def main( R.output(gv2) return gv2 - verify(Input) + verify(Input, target) @tvm.testing.requires_opencl -def test_conv2d_layernorm(): +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_layernorm(target): @I.ir_module class Input: @R.function @@ -504,11 +385,12 @@ def main( R.output(gv2) return gv2 - verify(Input) + verify(Input, target) @tvm.testing.requires_opencl -def test_binary_broadcast(): +@tvm.testing.parametrize_targets("opencl") +def test_binary_broadcast(target): @I.ir_module class Input: @R.function @@ -523,11 +405,12 @@ def main( R.output(gv2) return gv2 - verify(Input) + 
verify(Input, target) @tvm.testing.requires_opencl -def test_binary_ewise_scalar(): +@tvm.testing.parametrize_targets("opencl") +def test_binary_ewise_scalar(target): @I.ir_module class Input: @R.function @@ -540,11 +423,12 @@ def main( R.output(gv2) return gv2 - verify(Input) + verify(Input, target) @tvm.testing.requires_opencl -def test_residual_block(): +@tvm.testing.parametrize_targets("opencl") +def test_residual_block(target): """ - some kind of residual block followed by convolution to have texture after residual block - scalar data type verification which should be mapped to global memory scope @@ -587,11 +471,12 @@ def main( R.output(gv7) return gv7 - verify(Input) + verify(Input, target) @tvm.testing.requires_opencl -def test_conv2d_conv2d_fallback_to_buffer_conv2d(): +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_conv2d_fallback_to_buffer_conv2d(target): """ layout_transform (NCHW->NCHW4c) | <- texture @@ -627,11 +512,12 @@ def main( R.output(gv7) return gv7 - verify(Input) + verify(Input, "opencl") @tvm.testing.requires_opencl -def test_conv2d_conv2d_conv2d_concat(): +@tvm.testing.parametrize_targets("opencl") +def test_conv2d_conv2d_conv2d_concat(target): """ layout_transform (NCHW->NCHW4c) | <- texture @@ -667,11 +553,12 @@ def main( R.output(gv7) return gv7 - verify(Input) + verify(Input, "opencl") @tvm.testing.requires_opencl -def _test_pooling_branching_texture_params(): +@tvm.testing.parametrize_targets("opencl") +def test_pooling_branching_texture_params(target): """ Verification of the pooling and many branches having textures layout_transform (NCHW->NCHW4c) @@ -720,11 +607,12 @@ def main( R.output(gv9) return gv9 - verify(Input) + verify(Input, target) @tvm.testing.requires_opencl -def _test_injective_inputs1(): +@tvm.testing.parametrize_targets("opencl") +def test_injective_inputs1(target): """ Input / \ @@ -768,11 +656,12 @@ def main( R.output(gv) return gv - verify(Input) + verify(Input, target) @tvm.testing.requires_opencl -def _test_injective_nwo_inputs2(): +@tvm.testing.parametrize_targets("opencl") +def test_injective_nwo_inputs2(target): """ Input / \ @@ -818,7 +707,7 @@ def main( R.output(gv) return gv - verify(Input) + verify(Input, target) if __name__ == "__main__": diff --git a/tests/scripts/task_build.py b/tests/scripts/task_build.py index 96fc781744f9..2f9f91df8a0e 100755 --- a/tests/scripts/task_build.py +++ b/tests/scripts/task_build.py @@ -58,6 +58,8 @@ logging.info(f"Using sccache region: {env['SCCACHE_REGION']}") else: logging.info(f"No sccache bucket set, using local cache") + if "SCCACHE_SERVER_PORT" in os.environ: + env["SCCACHE_SERVER_PORT"] = os.getenv("SCCACHE_SERVER_PORT") env["CXX"] = "/opt/sccache/c++" env["CC"] = "/opt/sccache/cc" From c66c11d9755e7357c2e2d9b98121b80cbc2f3c6c Mon Sep 17 00:00:00 2001 From: Siva Date: Thu, 4 Dec 2025 20:37:47 +0530 Subject: [PATCH 10/17] Lint --- python/tvm/dlight/adreno/convolution.py | 5 -- python/tvm/dlight/adreno/pool.py | 2 +- python/tvm/dlight/analysis/common_analysis.py | 1 - ...test_tir_transform_inject_texture_alloc.py | 87 ------------------- 4 files changed, 1 insertion(+), 94 deletions(-) delete mode 100644 tests/python/tir-transform/test_tir_transform_inject_texture_alloc.py diff --git a/python/tvm/dlight/adreno/convolution.py b/python/tvm/dlight/adreno/convolution.py index fec465cc484f..497fc21a9a4b 100644 --- a/python/tvm/dlight/adreno/convolution.py +++ b/python/tvm/dlight/adreno/convolution.py @@ -34,10 +34,6 @@ def schedule_conv2d(sch: tir.Schedule, blk: tir.schedule.BlockRV): 
# TODO: Loop Pattern mayn't be reliable, need to perform better analysis. n, oc, oh, ow, ob, ic, kh, kw = sch.get_loops(blk) - # bz, vz, tz = sch.split(oc, sch.sample_perfect_tile(oc, 3, 32)) - # by, vy, ty = sch.split(oh, sch.sample_perfect_tile(oh, 3, 32)) - # bx, vx, tx = sch.split(ow, sch.sample_perfect_tile(ow, 3, 32)) - bz, vz, tz = sch.split(oc, [None, 8, 1], preserve_unit_iters=True) by, vy, ty = sch.split(oh, [None, 1, 16], preserve_unit_iters=True) bx, vx, tx = sch.split(ow, [None, 1, 16], preserve_unit_iters=True) @@ -97,7 +93,6 @@ def is_convolution(blk): if len(reduction_blocks) != 1 or not is_convolution(reduction_blocks[0]): return None - # sch.set_scope(blocks[0], 0, "global.texture") conv_blk = reduction_blocks[0] Conv2d.schedule_conv2d(sch, conv_blk) remaining_blocks = schedule_inline_blocks(sch, remaining_blocks) diff --git a/python/tvm/dlight/adreno/pool.py b/python/tvm/dlight/adreno/pool.py index 3ee72ff28646..e3709caad96a 100644 --- a/python/tvm/dlight/adreno/pool.py +++ b/python/tvm/dlight/adreno/pool.py @@ -87,6 +87,6 @@ def schedule_max_pool(blk: tir.schedule.BlockRV): sch.reverse_compute_inline(blk) else: sch.compute_inline(blk) - except Exception: # pylint: broad-exception-caught + except Exception: # pylint: disable=broad-except pass return sch diff --git a/python/tvm/dlight/analysis/common_analysis.py b/python/tvm/dlight/analysis/common_analysis.py index d19b1070e4fe..161deaf53772 100644 --- a/python/tvm/dlight/analysis/common_analysis.py +++ b/python/tvm/dlight/analysis/common_analysis.py @@ -72,7 +72,6 @@ def __repr__(self) -> str: BufIndex = List[Union[Index, RemIndex, DivIndex, MergeIndex, None]] -# TODO: Shift Vlen Calculation here... class BufferInfo: "Information about Buffer. Provides useful analysis" buf_region: tir.BufferRegion diff --git a/tests/python/tir-transform/test_tir_transform_inject_texture_alloc.py b/tests/python/tir-transform/test_tir_transform_inject_texture_alloc.py deleted file mode 100644 index 4a81b1a74632..000000000000 --- a/tests/python/tir-transform/test_tir_transform_inject_texture_alloc.py +++ /dev/null @@ -1,87 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-import tvm -import tvm.testing -from tvm import te -from tvm.script import tir as T - - -def test_decl_buffer(): - """decl_buffer with texture scope.""" - - @T.prim_func - def func(): - A = T.decl_buffer([32, 32, 32, 4], dtype="float32", scope="global.texture") - A[0, 0, 0, 0] = 0 - - lowered = tvm.lower(func)["main"] - assert isinstance(lowered.body, tvm.tir.LetStmt) - assert isinstance(lowered.body.value, tvm.tir.expr.Call) - assert lowered.body.value.op.name == "tir.nd_mem_alloc_with_scope" - - -def test_alloc_buffer(): - """alloc_buffer with texture scope.""" - - @T.prim_func - def func(): - A = T.alloc_buffer([32, 32, 32, 4], dtype="float32", scope="global.texture-weight") - A[0, 0, 0, 0] = 0 - - lowered = tvm.lower(func)["main"] - assert isinstance(lowered.body, tvm.tir.LetStmt) - assert isinstance(lowered.body.value, tvm.tir.expr.Call) - assert lowered.body.value.op.name == "tir.nd_mem_alloc_with_scope" - - -def test_alloc_buffer_negative_test(): - """Shouldn't ave texture intrensic for general use.""" - - @T.prim_func - def func(): - A = T.alloc_buffer([32, 32, 32, 4], dtype="float32") - A[0, 0, 0, 0] = 0 - - lowered = tvm.lower(func)["main"] - assert isinstance(lowered.body, tvm.tir.Allocate) - - -def test_with_block(): - """Scoped with block.""" - - @T.prim_func - def func( - A: T.Buffer((T.int64(1), T.int64(16), T.int64(16)), "float16"), - B: T.Buffer((T.int64(32), T.int64(32)), "float16"), - C: T.Buffer((T.int64(1), T.int64(8), T.int64(8)), "float16"), - ): - with T.block("block"): - A = T.alloc_buffer([1, 16, 16], dtype="float16") - B = T.alloc_buffer([32, 32], dtype="float16") - C = T.alloc_buffer([1, 8, 8], dtype="float16") - D = T.alloc_buffer([32, 32, 32, 4], dtype="float16", scope="global.texture-weight") - T.evaluate(D[0, 0, 0, 0]) - - lowered = tvm.lower(func)["main"] - assert isinstance(lowered.body, tvm.tir.LetStmt) - assert isinstance(lowered.body.value, tvm.tir.expr.Call) - assert lowered.body.value.op.name == "tir.nd_mem_alloc_with_scope" - assert lowered.body.var.name == "D" - - -if __name__ == "__main__": - tvm.testing.main() From 766f198e163779f68a825baf6adcd31c28abf9be Mon Sep 17 00:00:00 2001 From: Siva Date: Fri, 5 Dec 2025 00:22:51 +0530 Subject: [PATCH 11/17] Enable OpenCL target. --- tests/scripts/unity/task_python_relax_gpuonly.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/scripts/unity/task_python_relax_gpuonly.sh b/tests/scripts/unity/task_python_relax_gpuonly.sh index acbcce44f279..9f0702298fa4 100755 --- a/tests/scripts/unity/task_python_relax_gpuonly.sh +++ b/tests/scripts/unity/task_python_relax_gpuonly.sh @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. 
-export TVM_TEST_TARGETS="llvm;cuda" +export TVM_TEST_TARGETS="llvm;cuda;opencl" export PYTEST_ADDOPTS="-m gpu $PYTEST_ADDOPTS" export TVM_RELAY_TEST_TARGETS="cuda" export TVM_INTEGRATION_TESTSUITE_NAME=python-integration-gpu From b87ccbf70f0156de173fdbda97b37b319a669ed8 Mon Sep 17 00:00:00 2001 From: Siva Date: Fri, 5 Dec 2025 08:39:08 +0530 Subject: [PATCH 12/17] Enable opencl tests --- tests/scripts/unity/task_python_relax_gpuonly.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/scripts/unity/task_python_relax_gpuonly.sh b/tests/scripts/unity/task_python_relax_gpuonly.sh index 9f0702298fa4..19ab942a1193 100755 --- a/tests/scripts/unity/task_python_relax_gpuonly.sh +++ b/tests/scripts/unity/task_python_relax_gpuonly.sh @@ -16,9 +16,9 @@ # specific language governing permissions and limitations # under the License. -export TVM_TEST_TARGETS="llvm;cuda;opencl" +export TVM_TEST_TARGETS="llvm;cuda" export PYTEST_ADDOPTS="-m gpu $PYTEST_ADDOPTS" -export TVM_RELAY_TEST_TARGETS="cuda" +export TVM_RELAY_TEST_TARGETS="cuda;opencl" export TVM_INTEGRATION_TESTSUITE_NAME=python-integration-gpu export TVM_INTEGRATION_GPU_ONLY=1 From 39e54a15d454d855ffae684c459324544e8bfe33 Mon Sep 17 00:00:00 2001 From: Siva Date: Fri, 5 Dec 2025 17:06:37 +0530 Subject: [PATCH 13/17] Texture and CLML tests enabled --- python/tvm/testing/utils.py | 11 +++ tests/python/relax/texture/adreno_utils.py | 23 +++-- tests/python/relax/texture/test_network.py | 21 ----- tests/python/relax/texture/test_texture_nd.py | 89 +++++++++++++------ tests/scripts/task_python_unittest_gpuonly.sh | 11 +++ 5 files changed, 102 insertions(+), 53 deletions(-) diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py index da22cf77466f..846a79e4896b 100644 --- a/python/tvm/testing/utils.py +++ b/python/tvm/testing/utils.py @@ -918,6 +918,17 @@ def _multi_gpu_exists(): parent_features="cuda", ) +# Mark a test as requiring the OpenCL runtime on remote RPC +requires_adreno_opencl = Feature( + "opencl", + long_name="Remote Adreno OpenCL", + cmake_flag="USE_OPENCL", + target_kind_enabled="opencl", + target_kind_hardware=None, + parent_features="gpu", + run_time_check=lambda: os.getenv('RPC_TARGET') is not None, +) + # Mark a test as requiring the OpenCL runtime requires_opencl = Feature( "opencl", diff --git a/tests/python/relax/texture/adreno_utils.py b/tests/python/relax/texture/adreno_utils.py index 59e6c0ad9c6a..6caa793ecc07 100644 --- a/tests/python/relax/texture/adreno_utils.py +++ b/tests/python/relax/texture/adreno_utils.py @@ -109,18 +109,30 @@ def run_cpu(mod, inputs, save_lib=False): def build_run(mod, inputs, backend, is_adreno=False): - + remote = get_rpc() target = get_target(backend, is_adreno) - tgt = tvm.target.Target(target, host="llvm -mtriple=aarch64-linux-gnu") + if remote is None: + tgt = tvm.target.Target(target, host="llvm") + else: + tgt = tvm.target.Target(target, host="llvm -mtriple=aarch64-linux-gnu") relax_pipeline = relax.pipeline.get_default_pipeline(tgt) tir_pipeline = tvm.tir.get_default_tir_pipeline(tgt) mod = relax_pipeline(mod) ex = tvm.compile(mod, tgt, tir_pipeline=tir_pipeline) - remote = get_rpc() if remote is None: # local execution - dev = tvm.device(target, 0) + if "opencl" in backend: + dev = tvm.opencl(0) + elif "vulkan" in backend: + dev = tvm.vulkan(0) + else: + raise RuntimeError("Unsupported backend") + + if "vdevice" in mod.global_infos: + device_arr = [dev for ii in range(len(mod.global_infos["vdevice"]))] + else: + device_arr = [dev] vm = 
relax.VirtualMachine(ex, device_arr) else: # remote execution @@ -158,7 +170,8 @@ def build_run(mod, inputs, backend, is_adreno=False): else: tvm_output = tvm_output.numpy() - remote.get_function("CloseRPCConnection")() + if remote: + remote.get_function("CloseRPCConnection")() return tvm_output diff --git a/tests/python/relax/texture/test_network.py b/tests/python/relax/texture/test_network.py index 7e2d9636270f..aeb4f1248c10 100644 --- a/tests/python/relax/texture/test_network.py +++ b/tests/python/relax/texture/test_network.py @@ -37,27 +37,6 @@ import copy -@pytest.mark.parametrize("dtype", ["float32"]) -@pytest.mark.parametrize( - "url, shape_dict", - [ - # ("mobilenetv2-12.onnx", {"input": [1, 3, 224, 224]}), - # ("densenet-12.onnx", {"data_0": [1, 3, 224, 224]}), - # ("inception-v2-9.onnx", {"data_0": [1, 3, 224, 224]}), - ("resnet18-v2-7.onnx", {"data": [1, 3, 224, 224]}), - # ("resnet50-v2-7.onnx", {"data": [1, 3, 224, 224]}), - ], -) -@tvm.testing.requires_opencl -def _test_network(url, shape_dict, dtype): - print("Network evaluating .. " + url + " " + dtype) - model = onnx.load("./" + url) - mod = from_onnx(model, shape_dict) - mod1 = from_onnx(model, shape_dict) - - verify(mod) - - @tvm.testing.requires_opencl @tvm.testing.parametrize_targets("opencl") def test_network_resnet(target): diff --git a/tests/python/relax/texture/test_texture_nd.py b/tests/python/relax/texture/test_texture_nd.py index 8a8109873cf2..1079c31245f2 100644 --- a/tests/python/relax/texture/test_texture_nd.py +++ b/tests/python/relax/texture/test_texture_nd.py @@ -31,21 +31,26 @@ from tvm.target import Target from tvm.contrib import ndk from tvm import tir, DataType - - -class RemoteConnection: - def __init__(self): - self.RPC_TRACKER_HOST = os.getenv("TVM_TRACKER_HOST", "localhost") - self.RPC_TRACKER_PORT = int(os.getenv("TVM_TRACKER_PORT", 7979)) - self.RPC_KEY = os.getenv("RPC_DEVICE_KEY", "android") - self.tracker = tvm.rpc.connect_tracker(self.RPC_TRACKER_HOST, self.RPC_TRACKER_PORT) - - def __enter__(self): - self.remote = self.tracker.request(self.RPC_KEY, priority=0, session_timeout=600) - return self.remote - - def __exit__(self, exc_type, exc_value, traceback): - self.remote.get_function("CloseRPCConnection")() +from tvm.rpc import connect_tracker + +def get_rpc(): + """ + Establish an RPC connection to the remote device. + + Returns + ------- + tvm.rpc.RPCSession or None + The RPC session object if RPC_TARGET is set; otherwise, None. 
+ """ + rpc_target = os.getenv("RPC_TARGET", None) + if rpc_target: + host = os.getenv("TVM_TRACKER_HOST", "localhost") + port = int(os.getenv("TVM_TRACKER_PORT", 9090)) + device_key = os.getenv("RPC_DEVICE_KEY", "android") + tracker = connect_tracker(host, port) + return tracker.request(device_key, priority=1, session_timeout=1000) + else: + return None def preprocess_pipeline(mod: IRModule) -> IRModule: @@ -96,14 +101,15 @@ def postprocess_pipeline(mod: IRModule) -> IRModule: @tvm.testing.requires_rpc -@tvm.testing.requires_opencl +@tvm.testing.requires_adreno_opencl @pytest.mark.parametrize( - "target", [Target("opencl -device=adreno", "llvm -mtriple=aarch64-linux-android")] + "backend", ["opencl"] ) @pytest.mark.parametrize("dtype", ["int8", "float16", "int16", "float32", "int32"]) @pytest.mark.parametrize("channel_size", [64, 128]) @pytest.mark.parametrize("read_width", [1, 2, 4, 8, 16]) -def test_texture_copy(target, dtype, channel_size, read_width): +def test_texture_copy(backend, dtype, channel_size, read_width): + remote = get_rpc() M, N, K = (256, 1024, 128) lanes = channel_size // DataType(dtype).bits if read_width > lanes: @@ -139,6 +145,12 @@ def schedule_default(blk, lanes): schedule_default(B_blk, read_width) mod = TextureCopy + + if remote is None: + target = Target(backend + " -device=adreno") + else: + target = Target(backend + " -device=adreno", "llvm -mtriple=aarch64-linux-android") + with target: mod = preprocess_pipeline(mod) sch = tir.Schedule(mod) @@ -148,20 +160,43 @@ def schedule_default(blk, lanes): ex = relax.build(mod, target) load_path = "vm_library.so" inputs = [np.random.randint(0, 128, (M, N)).astype(dtype), np.zeros((M, N), dtype)] - with RemoteConnection() as remote: - with tempfile.TemporaryDirectory() as temp_dir: + with tempfile.TemporaryDirectory() as temp_dir: + if remote is not None: path = temp_dir + "/" + load_path ex.export_library(path, fcompile=ndk.create_shared, options=["-shared", "-fPIC", "-lm"]) - remote.upload(path) rexec = remote.load_module(load_path) dev = remote.cl() - - vm = relax.VirtualMachine(rexec, [dev, dev, dev]) - inps = [tvm.runtime.tensor(inp, dev) for inp in inputs] - vm["main"](*inps) - - np.testing.assert_equal(inps[-1].numpy(), inps[0].numpy()) + if "vdevice" in mod.global_infos: + device_arr = [dev for ii in range(len(mod.global_infos["vdevice"]))] + else: + device_arr = [dev] + vm = relax.VirtualMachine(rexec, device_arr) + else: + # local execution + if "opencl" in backend: + dev = tvm.opencl(0) + elif "vulkan" in backend: + dev = tvm.vulkan(0) + else: + raise RuntimeError("Unsupported backend") + + if "vdevice" in mod.global_infos: + device_arr = [dev for ii in range(len(mod.global_infos["vdevice"]))] + else: + device_arr = [dev] + vm = relax.VirtualMachine(ex, device_arr) + + inps = [tvm.runtime.tensor(inp, dev) for inp in inputs] + vm["main"](*inps) + + out1 = inps[-1].numpy() + out2 = inps[0].numpy() + + if remote: + remote.get_function("CloseRPCConnection")() + + np.testing.assert_equal(out1, out2) if __name__ == "__main__": diff --git a/tests/scripts/task_python_unittest_gpuonly.sh b/tests/scripts/task_python_unittest_gpuonly.sh index 6d28d8f30f8b..c81820752599 100755 --- a/tests/scripts/task_python_unittest_gpuonly.sh +++ b/tests/scripts/task_python_unittest_gpuonly.sh @@ -34,3 +34,14 @@ export TVM_UNITTEST_TESTSUITE_NAME=python-codegen-vulkan source tests/scripts/setup-pytest-env.sh run_pytest ${TVM_UNITTEST_TESTSUITE_NAME}-1 tests/python/codegen/test_target_codegen_vulkan.py + + +# Adreno : A comprehensive 
Texture tests on Nvidia GPU and clml codegen tests. +export PYTEST_ADDOPTS="" +export TVM_TEST_TARGETS="opencl" +export TVM_UNITTEST_TESTSUITE_NAME=python-codegen-clml-texture + +source tests/scripts/setup-pytest-env.sh + +run_pytest ${TVM_UNITTEST_TESTSUITE_NAME}-1 -s tests/python/relax/backend/clml/ +run_pytest ${TVM_UNITTEST_TESTSUITE_NAME}-1 tests/python/relax/texture/ From 8607e25aee001a0b7771a9be5f5cc6b49f4b5b45 Mon Sep 17 00:00:00 2001 From: Siva Date: Fri, 5 Dec 2025 18:34:34 +0530 Subject: [PATCH 14/17] Lint --- python/tvm/testing/utils.py | 2 +- tests/python/relax/texture/adreno_utils.py | 4 ++-- tests/python/relax/texture/test_texture_nd.py | 5 ++--- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py index 846a79e4896b..fc9a2296e786 100644 --- a/python/tvm/testing/utils.py +++ b/python/tvm/testing/utils.py @@ -926,7 +926,7 @@ def _multi_gpu_exists(): target_kind_enabled="opencl", target_kind_hardware=None, parent_features="gpu", - run_time_check=lambda: os.getenv('RPC_TARGET') is not None, + run_time_check=lambda: os.getenv("RPC_TARGET") is not None, ) # Mark a test as requiring the OpenCL runtime diff --git a/tests/python/relax/texture/adreno_utils.py b/tests/python/relax/texture/adreno_utils.py index 6caa793ecc07..b6ffd837145f 100644 --- a/tests/python/relax/texture/adreno_utils.py +++ b/tests/python/relax/texture/adreno_utils.py @@ -112,9 +112,9 @@ def build_run(mod, inputs, backend, is_adreno=False): remote = get_rpc() target = get_target(backend, is_adreno) if remote is None: - tgt = tvm.target.Target(target, host="llvm") + tgt = tvm.target.Target(target, host="llvm") else: - tgt = tvm.target.Target(target, host="llvm -mtriple=aarch64-linux-gnu") + tgt = tvm.target.Target(target, host="llvm -mtriple=aarch64-linux-gnu") relax_pipeline = relax.pipeline.get_default_pipeline(tgt) tir_pipeline = tvm.tir.get_default_tir_pipeline(tgt) mod = relax_pipeline(mod) diff --git a/tests/python/relax/texture/test_texture_nd.py b/tests/python/relax/texture/test_texture_nd.py index 1079c31245f2..520f309c6e41 100644 --- a/tests/python/relax/texture/test_texture_nd.py +++ b/tests/python/relax/texture/test_texture_nd.py @@ -33,6 +33,7 @@ from tvm import tir, DataType from tvm.rpc import connect_tracker + def get_rpc(): """ Establish an RPC connection to the remote device. @@ -102,9 +103,7 @@ def postprocess_pipeline(mod: IRModule) -> IRModule: @tvm.testing.requires_rpc @tvm.testing.requires_adreno_opencl -@pytest.mark.parametrize( - "backend", ["opencl"] -) +@pytest.mark.parametrize("backend", ["opencl"]) @pytest.mark.parametrize("dtype", ["int8", "float16", "int16", "float32", "int32"]) @pytest.mark.parametrize("channel_size", [64, 128]) @pytest.mark.parametrize("read_width", [1, 2, 4, 8, 16]) From a3cd6eb0df85fa27e3741a71df1707c9368e5a70 Mon Sep 17 00:00:00 2001 From: Siva Date: Fri, 5 Dec 2025 19:41:25 +0530 Subject: [PATCH 15/17] Workaround for crash. 
Co-authored-by: Sanjay --- tests/scripts/task_python_unittest_gpuonly.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/scripts/task_python_unittest_gpuonly.sh b/tests/scripts/task_python_unittest_gpuonly.sh index c81820752599..776d29fda07f 100755 --- a/tests/scripts/task_python_unittest_gpuonly.sh +++ b/tests/scripts/task_python_unittest_gpuonly.sh @@ -43,5 +43,4 @@ export TVM_UNITTEST_TESTSUITE_NAME=python-codegen-clml-texture source tests/scripts/setup-pytest-env.sh -run_pytest ${TVM_UNITTEST_TESTSUITE_NAME}-1 -s tests/python/relax/backend/clml/ run_pytest ${TVM_UNITTEST_TESTSUITE_NAME}-1 tests/python/relax/texture/ From 172573b677cbaf0c46d87ee06a3d96db7a91ca3a Mon Sep 17 00:00:00 2001 From: Sanjay Krishnaa Date: Fri, 5 Dec 2025 20:44:47 +0530 Subject: [PATCH 16/17] Removed Comments --- python/tvm/dlight/adreno/convolution.py | 1 - python/tvm/dlight/adreno/layout_transform.py | 1 - 2 files changed, 2 deletions(-) diff --git a/python/tvm/dlight/adreno/convolution.py b/python/tvm/dlight/adreno/convolution.py index 497fc21a9a4b..462fb2550bd4 100644 --- a/python/tvm/dlight/adreno/convolution.py +++ b/python/tvm/dlight/adreno/convolution.py @@ -31,7 +31,6 @@ class Conv2d(AdrenoScheduleRule): @staticmethod def schedule_conv2d(sch: tir.Schedule, blk: tir.schedule.BlockRV): - # TODO: Loop Pattern mayn't be reliable, need to perform better analysis. n, oc, oh, ow, ob, ic, kh, kw = sch.get_loops(blk) bz, vz, tz = sch.split(oc, [None, 8, 1], preserve_unit_iters=True) diff --git a/python/tvm/dlight/adreno/layout_transform.py b/python/tvm/dlight/adreno/layout_transform.py index 8610d658d5f1..bf3c446d59ff 100644 --- a/python/tvm/dlight/adreno/layout_transform.py +++ b/python/tvm/dlight/adreno/layout_transform.py @@ -86,7 +86,6 @@ def apply( # pylint: disable=too-many-locals [lpv_read, lpv_write] if sch.get(lpv_read) != sch.get(lpv_write) else (lpv_read,) ) sch.reorder(*block_loops, *vec_loops) - # TODO: Additional Pragmas and stuff if local_cache: if sch.get(lpv_read) != sch.get(lpv_write): blp_read, vlp_read = sch.split( From 5e1c377ec31f239b57400d3a4e7292485ba5903d Mon Sep 17 00:00:00 2001 From: Siva Date: Fri, 9 Jan 2026 19:08:35 +0530 Subject: [PATCH 17/17] Review comments Remove shader dumps. 
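With the TIR pipeline split out below, Adreno compilation is expected to pick up the
dedicated lowering. A minimal sketch, assuming tvm.tir.get_default_tir_pipeline()
resolves to tir.backend.adreno.get_tir_pipeline() for adreno targets (the dispatch
introduced in this patch); "mod" stands for any Relax IRModule:

    import tvm

    tgt = tvm.target.adreno()
    tir_pipeline = tvm.tir.get_default_tir_pipeline(tgt)   # Adreno-specific TIR lowering
    ex = tvm.compile(mod, tgt, tir_pipeline=tir_pipeline)  # mod: a hypothetical Relax IRModule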
Separate TIR Adreno pipeline Move texture tir passes to backend/adreno --- cmake/modules/OpenCL.cmake | 3 - include/tvm/tir/backend/adreno/transform.h | 69 ++++++++ include/tvm/tir/transform.h | 6 - python/tvm/tir/__init__.py | 1 + python/tvm/tir/backend/__init__.py | 19 +++ python/tvm/tir/backend/adreno/__init__.py | 20 +++ python/tvm/tir/backend/adreno/pipeline.py | 160 ++++++++++++++++++ .../tir/backend/adreno/transform/__init__.py | 22 +++ .../tir/backend/adreno/transform/_ffi_api.py | 21 +++ .../tir/backend/adreno/transform/transform.py | 46 +++++ python/tvm/tir/pipeline.py | 9 +- python/tvm/tir/transform/transform.py | 26 --- src/runtime/opencl/opencl_common.h | 5 - src/runtime/opencl/opencl_device_api.cc | 9 - src/runtime/opencl/opencl_module.cc | 87 ---------- .../adreno}/inject_texture_alloc.cc | 16 +- .../adreno}/texture_flatten.cc | 22 ++- tests/scripts/setup-adreno-env.sh | 2 +- tests/scripts/task_build_adreno_bins.sh | 1 - 19 files changed, 388 insertions(+), 156 deletions(-) create mode 100644 include/tvm/tir/backend/adreno/transform.h create mode 100644 python/tvm/tir/backend/__init__.py create mode 100644 python/tvm/tir/backend/adreno/__init__.py create mode 100644 python/tvm/tir/backend/adreno/pipeline.py create mode 100644 python/tvm/tir/backend/adreno/transform/__init__.py create mode 100644 python/tvm/tir/backend/adreno/transform/_ffi_api.py create mode 100644 python/tvm/tir/backend/adreno/transform/transform.py rename src/tir/{transforms => backend/adreno}/inject_texture_alloc.cc (89%) rename src/tir/{transforms => backend/adreno}/texture_flatten.cc (93%) diff --git a/cmake/modules/OpenCL.cmake b/cmake/modules/OpenCL.cmake index b70db4dc7bf2..c5c8eae721fa 100644 --- a/cmake/modules/OpenCL.cmake +++ b/cmake/modules/OpenCL.cmake @@ -80,9 +80,6 @@ if(USE_OPENCL) message(STATUS "Set OpenCL Target version to " ${CMAKE_MATCH_1}) endif() endif(USE_OPENCL_EXTN_QCOM) - if(PROFILE_SHADER_DUMP) - add_definitions(-DPROFILE_SHADER_DUMP) - endif(PROFILE_SHADER_DUMP) else() list(APPEND COMPILER_SRCS src/target/opt/build_opencl_off.cc) endif(USE_OPENCL) diff --git a/include/tvm/tir/backend/adreno/transform.h b/include/tvm/tir/backend/adreno/transform.h new file mode 100644 index 000000000000..d2ca90a8cede --- /dev/null +++ b/include/tvm/tir/backend/adreno/transform.h @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file tvm/tir/backend/adreno/transform.h + * \brief TIR specific Adreno GPU transformation passes. 
+ */ +#ifndef TVM_TIR_BACKEND_ADRENO_TRANSFORM_H_ +#define TVM_TIR_BACKEND_ADRENO_TRANSFORM_H_ + +#include +#include +#include +#include +#include + +#include +#include + +namespace tvm { +namespace tir { +namespace backend { +namespace adreno { +namespace transform { + +using tvm::tir::transform::CreatePrimFuncPass; +using tvm::transform::Pass; +using tvm::transform::PassContext; +using tvm::transform::PassContextNode; +using tvm::transform::PassInfo; +using tvm::transform::PassInfoNode; +using tvm::transform::PassNode; +using tvm::transform::Sequential; + +/*! + * \brief Texture flattening pass. + * \return The pass. + */ +TVM_DLL Pass TextureFlatten(); + +/*! + * \brief Inject Texture Allocation intrinsic. + * \return The pass. + */ +TVM_DLL Pass InjectTextureAlloc(); + +} // namespace transform +} // namespace adreno +} // namespace backend +} // namespace tir +} // namespace tvm + +#endif // TVM_TIR_BACKEND_ADRENO_TRANSFORM_H_ diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h index 60e6b9af8b88..bf100dc49c4c 100644 --- a/include/tvm/tir/transform.h +++ b/include/tvm/tir/transform.h @@ -773,12 +773,6 @@ TVM_DLL Pass DefaultGPUSchedule(); */ TVM_DLL Pass UseAssumeToReduceBranches(); -/*! - * \brief Inject Texture Allocation intrinsic. - * \return The pass. - */ -TVM_DLL Pass InjectTextureAlloc(); - } // namespace transform } // namespace tir } // namespace tvm diff --git a/python/tvm/tir/__init__.py b/python/tvm/tir/__init__.py index 0a598e5e9bb9..74f2f6b2f757 100644 --- a/python/tvm/tir/__init__.py +++ b/python/tvm/tir/__init__.py @@ -126,3 +126,4 @@ from .build import build from .pipeline import get_tir_pipeline, get_default_tir_pipeline from .functor import PyStmtExprVisitor, PyStmtExprMutator +from . import backend diff --git a/python/tvm/tir/backend/__init__.py b/python/tvm/tir/backend/__init__.py new file mode 100644 index 000000000000..bd04016a2bb3 --- /dev/null +++ b/python/tvm/tir/backend/__init__.py @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""TIR backends""" + +from . import adreno diff --git a/python/tvm/tir/backend/adreno/__init__.py b/python/tvm/tir/backend/adreno/__init__.py new file mode 100644 index 000000000000..cb59e73b57e2 --- /dev/null +++ b/python/tvm/tir/backend/adreno/__init__.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""The TIR Adreno backend passes """ + +from . import transform +from .pipeline import get_tir_pipeline diff --git a/python/tvm/tir/backend/adreno/pipeline.py b/python/tvm/tir/backend/adreno/pipeline.py new file mode 100644 index 000000000000..1efc8e83963c --- /dev/null +++ b/python/tvm/tir/backend/adreno/pipeline.py @@ -0,0 +1,160 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=invalid-name +"""The TIR backend compilation pipeline for Adreno""" + +import tvm +from tvm import tir + + +def default_tir_pipeline(): + """The default tir pipeline used in tvm.tir.build""" + + @tvm.transform.module_pass(opt_level=0) + def _pipeline(mod: tvm.ir.IRModule, _ctx: tvm.transform.PassContext) -> tvm.ir.IRModule: + """The default lowering passes for TIR backend.""" + pass_ctx = tvm.transform.PassContext.current() + config = pass_ctx.config + passes = [ + tir.backend.adreno.transform.TextureFlatten(), + tir.transform.CanonicalizeLoop(), + tir.transform.LowerCrossThreadReduction(), + tir.transform.LowerInitBlock(), + tir.transform.PlanAndUpdateBufferAllocationLocation(), + tir.transform.ConvertBlocksToOpaque(), + tir.transform.LiftThreadBinding(), + tir.transform.ManifestSharedMemoryLocalStage(), + tir.transform.CompactBufferAllocation(), + tir.transform.LowerAutoCopy(), + tir.transform.UnifyThreadBinding(), + tir.transform.LowerMatchBuffer(), + tir.transform.Simplify(), + tir.transform.InjectPermutedLayout(), + tir.transform.AnnotateIrregularLoop(), + tir.transform.InjectSoftwarePipeline(), + tir.transform.TransformMmaBufferLayout(), + tir.transform.LowerOpaqueBlock(), + tir.backend.adreno.transform.InjectTextureAlloc(), + tir.transform.FlattenBuffer(), + tir.transform.BF16ComputeLegalize(), + tir.transform.NarrowDataType(32), + tir.transform.LoopPartition(), + tir.transform.VectorizeLoop(not bool(config.get("tir.disable_vectorize", False))), + tir.transform.InjectVirtualThread(), + tir.transform.InjectDoubleBuffer(), + ] + if not bool(config.get("tir.disable_storage_rewrite", False)): + passes.append(tir.transform.StorageRewrite()) + if config.get("tir.use_async_copy", False): + passes.append(tir.transform.LowerAsyncDMA()) + passes.extend( + [ + tir.transform.HoistIfThenElse(), + tir.transform.UnrollLoop(), + tir.transform.RenormalizeSplitPattern(), + tir.transform.Simplify(), + tir.transform.RemoveNoOp(), + 
tir.transform.RewriteUnsafeSelect(), + ] + ) + # Additional passes based on configuration. + if bool(config.get("tir.instrument_bound_checkers", False)): + passes.append(tir.transform.InstrumentBoundCheckers()) + if bool(config.get("tir.ptx_ldg32", False)): + passes.append(tir.transform.InjectPTXLDG32(True)) + passes.append( + tir.transform.CommonSubexprElimTIR( + not bool(config.get("tir.disable_cse_tir", False)), + bool(config.get("tir.enable_equiv_terms_in_cse_tir", False)), + ) + ) + if bool(config.get("tir.instrument_lwp", False)): + passes.append(tir.transform.InstrumentProfileIntrinsics()) + passes.extend( + [ + # Bind the target first so that target-specific attributes are available. + tir.transform.FP8ComputeLegalize(), + # VerifyVTCMLimit must occur before LowerVtcmAlloc. + tir.transform.VerifyVTCMLimit(), + tir.transform.LowerVtcmAlloc(), + tir.transform.VerifyMemory(), + tir.transform.AnnotateEntryFunc(), + ] + ) + if bool(config.get("tir.detect_global_barrier", False)): + passes.append(tir.transform.ThreadSync("global")) + passes.extend( + [ + tir.transform.ThreadSync("shared"), + tir.transform.ThreadSync("shared.dyn"), + tir.transform.ThreadSync("warp"), + tir.transform.InferFragment(), + tir.transform.LowerThreadAllreduce(), + ] + ) + if bool(config.get("tir.use_async_copy", False)): + passes.append(tir.transform.InjectPTXAsyncCopy()) + if bool(config.get("tir.ptx_ldg32", False)): + passes.append(tir.transform.InjectPTXLDG32()) + passes.extend( + [ + tir.transform.AnnotateDeviceRegions(), + tir.transform.SplitHostDevice(), + # MergeSharedMemoryAllocations must follow SplitHostDevice. + tir.transform.MergeSharedMemoryAllocations(), + tir.transform.MakePackedAPI(), + tir.transform.FP8StorageLegalize(), + tir.transform.BF16StorageLegalize(), + tir.transform.LowerDeviceKernelLaunch(), + ] + ) + mod = tvm.ir.transform.Sequential(passes)(mod) + return mod + + return _pipeline + + +def finalize_host_passes(): # pylint: disable=unused-argument + """The default finalization passes for TIR backend.""" + host_pass_list = [ + tir.transform.LowerTVMBuiltin(), + tir.transform.LowerCustomDatatypes(), + tir.transform.LowerIntrin(), + tir.transform.LowerDeviceStorageAccessInfo(), + tir.transform.CombineContextCall(), + ] + return tvm.ir.transform.Sequential(host_pass_list) + + +def finalize_device_passes(): # pylint: disable=unused-argument + """The default finalization passes for TIR backend.""" + device_pass_list = [ + tir.transform.LowerWarpMemory(), + tir.transform.Simplify(), + tir.transform.LowerCustomDatatypes(), + tir.transform.LowerDeviceStorageAccessInfo(), + tir.transform.LowerIntrin(), + ] + return tvm.ir.transform.Sequential(device_pass_list) + + +def get_tir_pipeline( + target: tvm.target.Target, # pylint: disable=unused-argument +) -> tvm.transform.Pass: + """Get the TIR pipeline for Adreno GPU.""" + return default_tir_pipeline() diff --git a/python/tvm/tir/backend/adreno/transform/__init__.py b/python/tvm/tir/backend/adreno/transform/__init__.py new file mode 100644 index 000000000000..25c83f604916 --- /dev/null +++ b/python/tvm/tir/backend/adreno/transform/__init__.py @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Adreno TIR transformations."""
+
+from .transform import (
+    InjectTextureAlloc,
+    TextureFlatten,
+)
diff --git a/python/tvm/tir/backend/adreno/transform/_ffi_api.py b/python/tvm/tir/backend/adreno/transform/_ffi_api.py
new file mode 100644
index 000000000000..7df0ae3eae47
--- /dev/null
+++ b/python/tvm/tir/backend/adreno/transform/_ffi_api.py
@@ -0,0 +1,21 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""FFI APIs for tvm.tir.backend.adreno.transform"""
+import tvm_ffi
+
+
+tvm_ffi.init_ffi_api("tir.backend.adreno.transform", __name__)
diff --git a/python/tvm/tir/backend/adreno/transform/transform.py b/python/tvm/tir/backend/adreno/transform/transform.py
new file mode 100644
index 000000000000..03cc162a4bad
--- /dev/null
+++ b/python/tvm/tir/backend/adreno/transform/transform.py
@@ -0,0 +1,46 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Wrapping existing transformations."""
+# pylint: disable=invalid-name, unsupported-binary-operation

+from . import _ffi_api
+
+
+def InjectTextureAlloc():
+    """Inject the Texture Allocation Intrinsic to ensure appropriate lowering
+    via alloc_nd/alloc_free calls.
+    Returns
+    -------
+    fpass : tvm.transform.Pass
+        The result pass
+    """
+    return _ffi_api.InjectTextureAlloc()  # type: ignore
+
+
+def TextureFlatten():
+    """Flatten the multi-dimensional read/write to 2D.
+ + + Parameters + ---------- + + Returns + ------- + fpass : tvm.transform.Pass + The result pass + """ + return _ffi_api.TextureFlatten() # type: ignore diff --git a/python/tvm/tir/pipeline.py b/python/tvm/tir/pipeline.py index a674798b7d0b..1ee4a5b1d315 100644 --- a/python/tvm/tir/pipeline.py +++ b/python/tvm/tir/pipeline.py @@ -21,6 +21,8 @@ import tvm from tvm import tir +from . import backend + def default_tir_pipeline(): """The default tir pipeline used in tvm.tir.build""" @@ -31,7 +33,6 @@ def _pipeline(mod: tvm.ir.IRModule, _ctx: tvm.transform.PassContext) -> tvm.ir.I pass_ctx = tvm.transform.PassContext.current() config = pass_ctx.config passes = [ - tir.transform.TextureFlatten(), tir.transform.CanonicalizeLoop(), tir.transform.LowerCrossThreadReduction(), tir.transform.LowerInitBlock(), @@ -49,7 +50,6 @@ def _pipeline(mod: tvm.ir.IRModule, _ctx: tvm.transform.PassContext) -> tvm.ir.I tir.transform.InjectSoftwarePipeline(), tir.transform.TransformMmaBufferLayout(), tir.transform.LowerOpaqueBlock(), - tir.transform.InjectTextureAlloc(), tir.transform.FlattenBuffer(), tir.transform.BF16ComputeLegalize(), tir.transform.NarrowDataType(32), @@ -178,4 +178,7 @@ def get_default_tir_pipeline( target: tvm.target.Target, # pylint: disable=unused-argument ) -> tvm.transform.Pass: """Get the default TIR pipeline for the given target.""" - return default_tir_pipeline() + if target.kind.name == "opencl" and "adreno" in target.keys: + return backend.adreno.get_tir_pipeline(target) + else: + return default_tir_pipeline() diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py index 37ffbf12d3aa..88cf4720d3a6 100644 --- a/python/tvm/tir/transform/transform.py +++ b/python/tvm/tir/transform/transform.py @@ -893,17 +893,6 @@ def LowerOpaqueBlock(): return _ffi_api.LowerOpaqueBlock() # type: ignore -def InjectTextureAlloc(): - """Inject Texture Allocation Intrinsic to make sure appropriate lowering - via alloc_nd/alloc_free calls - Returns - ------- - fpass : tvm.transform.Pass - The result pass - """ - return _ffi_api.InjectTextureAlloc() # type: ignore - - def FlattenBuffer(): """Flatten the multi-dimensional BufferLoad and BufferStore to single dimensional BufferLoad/BufferStore for the TIR not contains opaque block. @@ -1193,18 +1182,3 @@ def CanonicalizeLoop(): The result pass """ return _ffi_api.CanonicalizeLoop() # type: ignore - - -def TextureFlatten(): - """Flatten the multi-dimensional read/write to 2D. - - - Parameters - ---------- - - Returns - ------- - fpass : tvm.transform.Pass - The result pass - """ - return _ffi_api.TextureFlatten() # type: ignore diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index 743fca68d9ca..99d5f2be2a9d 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -451,11 +451,6 @@ struct BufferDescriptor { MemoryLayout layout{MemoryLayout::kBuffer1D}; Buffer mbuf{nullptr}; // MemoryManager ref. 
bool is_compat_view{false}; -#ifdef PROFILE_SHADER_DUMP - size_t mem_size; - size_t width, height, depth; - DLDataType dtype; -#endif // PROFILE_SHADER_DUMP }; } // namespace cl diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index f5faa69fdcc3..4a022b07b30c 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -315,9 +315,6 @@ void* OpenCLWorkspace::AllocCLBuffer(Device dev, size_t size, size_t alignment, desc->buffer = clCreateBuffer(this->contexts[platform], CL_MEM_CREATE_FLAGS, size, nullptr, &err_code); desc->layout = cl::BufferDescriptor::MemoryLayout::kBuffer1D; -#ifdef PROFILE_SHADER_DUMP - desc->mem_size = size; -#endif // PROFILE_SHADER_DUMP OPENCL_CHECK_ERROR(err_code); return CreateHostPtrIfEnabled(desc, dev, size); } @@ -347,12 +344,6 @@ void* OpenCLWorkspace::AllocCLImage(Device dev, void* back_buffer, size_t width, cl::BufferDescriptor* desc = new cl::BufferDescriptor(mem_scope); desc->buffer = mptr; desc->back_buffer = back_desc; -#ifdef PROFILE_SHADER_DUMP - desc->width = width; - desc->height = height; - desc->depth = depth; - desc->dtype = type_hint; -#endif // PROFILE_SHADER_DUMP return desc; } diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc index cff6587fb9ed..980c6bb3994c 100644 --- a/src/runtime/opencl/opencl_module.cc +++ b/src/runtime/opencl/opencl_module.cc @@ -51,88 +51,6 @@ class OpenCLWrappedFunc { launch_param_config_.Init(arg_size.size(), launch_param_tags); } -#ifdef PROFILE_SHADER_DUMP - void dump_trace(const ThreadWorkLoad& wl, int work_dim, const ffi::PackedArgs& args) const { - std::string dump_path(getenv("PROFILE_SHADER_DUMP_PATH")); - static int trace_count = 0; - - std::vector g_vec; - std::vector l_vec; - for (cl_uint i = 0; i < work_dim; ++i) { - g_vec.push_back(wl.work_size[i]); - l_vec.push_back(wl.work_size[i + 3]); - } - - std::ostringstream os; - dmlc::JSONWriter writer(&os); - writer.BeginObject(); - writer.WriteObjectKeyValue("api", func_name_); - writer.WriteObjectKeyValue("global", g_vec); - writer.WriteObjectKeyValue("local", l_vec); - - class DumpArg { - public: - DumpArg() = default; - - void Save(dmlc::JSONWriter* writer) const { - writer->BeginObject(); - writer->WriteObjectKeyValue("index", idx); - writer->WriteObjectKeyValue("scope", scope); - if (scope.find("texture") != std::string::npos) { - writer->WriteObjectKeyValue("width", width); - writer->WriteObjectKeyValue("height", height); - writer->WriteObjectKeyValue("depth", depth); - } else { - writer->WriteObjectKeyValue("size", size); - } - writer->WriteObjectKeyValue("dtype", dtype); - writer->EndObject(); - } - - uint32_t idx; - std::string scope; - uint32_t size; - uint32_t width; - uint32_t height; - uint32_t depth; - uint32_t dtype; - }; - - std::vector vargs; - - for (cl_uint i = 0; i < arg_size_.size(); ++i) { - DumpArg darg; - darg.idx = i; - if (args[i].as()) { - cl::BufferDescriptor* desc = static_cast(args[i].cast()); - auto mem_scope = cl::BufferDescriptor::ScopeFromMemoryLayout(desc->layout); - darg.scope = std::string(mem_scope); - if (desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D) { - darg.size = desc->mem_size; - } else { - darg.width = desc->width; - darg.height = desc->height; - darg.depth = desc->depth; - } - darg.dtype = desc->dtype.code; - } - vargs.push_back(darg); - } - - writer.WriteObjectKeyValue("args", vargs); - writer.EndObject(); - std::stringstream trace_file; - trace_file << std::setw(4) << 
std::setfill('0') << std::to_string(trace_count); - trace_file << "_" << func_name_ << ".json"; - if (dump_path == "cli") { - LOG(WARNING) << os.str(); - } else { - SaveBinaryToFile(dump_path + "/" + trace_file.str(), os.str()); - } - trace_count++; - } -#endif // PROFILE_SHADER_DUMP - // invoke the function with void arguments void operator()(ffi::PackedArgs args, ffi::Any* rv, void** void_args) const { ICHECK(w_->devices.size() > 0) << "No OpenCL device"; @@ -148,11 +66,6 @@ class OpenCLWrappedFunc { } ThreadWorkLoad wl = launch_param_config_.Extract(args); cl_uint work_dim = static_cast(launch_param_config_.work_dim()); -#ifdef PROFILE_SHADER_DUMP - if (getenv("PROFILE_SHADER_DUMP_PATH")) { - dump_trace(wl, work_dim, args); - } -#endif // PROFILE_SHADER_DUMP // setup arguments. for (cl_uint i = 0; i < arg_size_.size(); ++i) { void* arg = nullptr; diff --git a/src/tir/transforms/inject_texture_alloc.cc b/src/tir/backend/adreno/inject_texture_alloc.cc similarity index 89% rename from src/tir/transforms/inject_texture_alloc.cc rename to src/tir/backend/adreno/inject_texture_alloc.cc index 82eed1a41038..4799b7ce2643 100644 --- a/src/tir/transforms/inject_texture_alloc.cc +++ b/src/tir/backend/adreno/inject_texture_alloc.cc @@ -23,15 +23,17 @@ #include #include +#include #include -#include -#include "../../arith/ir_mutator_with_analyzer.h" -#include "../../runtime/texture.h" -#include "ir_utils.h" +#include "../../../arith/ir_mutator_with_analyzer.h" +#include "../../../runtime/texture.h" +#include "../../transforms/ir_utils.h" namespace tvm { namespace tir { +namespace backend { +namespace adreno { using runtime::ApplyTexture2DFlattening; using runtime::DefaultTextureLayoutSeparator; using runtime::IsTextureStorage; @@ -99,15 +101,17 @@ Pass InjectTextureAlloc() { auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) { return TextureAllocInjector::Inject(std::move(f)); }; - return CreatePrimFuncPass(pass_func, 0, "tir.InjectTextureAlloc", {}); + return CreatePrimFuncPass(pass_func, 0, "tir.backend.adreno.InjectTextureAlloc", {}); } TVM_FFI_STATIC_INIT_BLOCK() { namespace refl = tvm::ffi::reflection; - refl::GlobalDef().def("tir.transform.InjectTextureAlloc", InjectTextureAlloc); + refl::GlobalDef().def("tir.backend.adreno.transform.InjectTextureAlloc", InjectTextureAlloc); } } // namespace transform +} // namespace adreno +} // namespace backend } // namespace tir } // namespace tvm diff --git a/src/tir/transforms/texture_flatten.cc b/src/tir/backend/adreno/texture_flatten.cc similarity index 93% rename from src/tir/transforms/texture_flatten.cc rename to src/tir/backend/adreno/texture_flatten.cc index 9e816a150be6..5e8a922c30ca 100644 --- a/src/tir/transforms/texture_flatten.cc +++ b/src/tir/backend/adreno/texture_flatten.cc @@ -20,24 +20,26 @@ /*! 
* \file texture_flatten.cc * \brief Flattens texture storage from multi-dimensional array - * to 2D (width, height) buffer access + * to 2D (width, height, depth) array access */ #include #include +#include #include #include #include -#include #include -#include "../../arith/ir_visitor_with_analyzer.h" -#include "../../runtime/texture.h" -#include "../../runtime/thread_storage_scope.h" +#include "../../../arith/ir_visitor_with_analyzer.h" +#include "../../../runtime/texture.h" +#include "../../../runtime/thread_storage_scope.h" namespace tvm { namespace tir { +namespace backend { +namespace adreno { using arith::IRVisitorWithAnalyzer; using runtime::ApplyTexture2DFlattening; using runtime::DefaultTextureLayoutSeparator; @@ -200,7 +202,7 @@ class TextureFlattener : public TextureLoweringBase { std::unordered_map let_binding_; }; -PrimFunc TextureFlatten(PrimFunc func) { +PrimFunc TextureFlattenHandler(PrimFunc func) { auto fptr = func.CopyOnWrite(); IRVisitorWithAnalyzer bound_analyzer; bound_analyzer(fptr->body); @@ -212,17 +214,19 @@ namespace transform { Pass TextureFlatten() { auto pass_func = [=](PrimFunc f, IRModule m, PassContext ctx) { - return TextureFlatten(std::move(f)); + return TextureFlattenHandler(std::move(f)); }; - return CreatePrimFuncPass(pass_func, 0, "tir.TextureFlatten", {}); + return CreatePrimFuncPass(pass_func, 0, "tir.backend.adreno.TextureFlatten", {}); } TVM_FFI_STATIC_INIT_BLOCK() { namespace refl = tvm::ffi::reflection; - refl::GlobalDef().def("tir.transform.TextureFlatten", TextureFlatten); + refl::GlobalDef().def("tir.backend.adreno.transform.TextureFlatten", TextureFlatten); } } // namespace transform +} // namespace adreno +} // namespace backend } // namespace tir } // namespace tvm diff --git a/tests/scripts/setup-adreno-env.sh b/tests/scripts/setup-adreno-env.sh index 3146ecb1c7dc..a401fb808acb 100755 --- a/tests/scripts/setup-adreno-env.sh +++ b/tests/scripts/setup-adreno-env.sh @@ -113,7 +113,7 @@ case ${ENVIRONMENT} in adb forward tcp:$((LISTEN_PORT + 1)) tcp:$((LISTEN_PORT + 1)) adb forward tcp:$((LISTEN_PORT + 2)) tcp:$((LISTEN_PORT + 2)) adb forward tcp:$((LISTEN_PORT + 3)) tcp:$((LISTEN_PORT + 3)) - adb shell "cd ${TARGET_FOLDER}; killall -9 tvm_rpc-${USER}; sleep 2; export CLML_DISABLE_RECORDABLE_QUEUE=1; export CLML_IS_TUNING_RUN=1; export CLML_TUNING_CACHE=clml.bin; PROFILE_SHADER_DUMP_PATH=cli LD_LIBRARY_PATH=${TARGET_FOLDER}/ ./tvm_rpc-${USER} server --host=0.0.0.0 --port=${LISTEN_PORT} --port-end=$((LISTEN_PORT + 10)) --tracker=127.0.0.1:${TVM_TRACKER_PORT} --key=${RPC_DEVICE_KEY}" + adb shell "cd ${TARGET_FOLDER}; killall -9 tvm_rpc-${USER}; sleep 2; export CLML_DISABLE_RECORDABLE_QUEUE=1; export CLML_IS_TUNING_RUN=1; export CLML_TUNING_CACHE=clml.bin; LD_LIBRARY_PATH=${TARGET_FOLDER}/ ./tvm_rpc-${USER} server --host=0.0.0.0 --port=${LISTEN_PORT} --port-end=$((LISTEN_PORT + 10)) --tracker=127.0.0.1:${TVM_TRACKER_PORT} --key=${RPC_DEVICE_KEY}" ;; "query") diff --git a/tests/scripts/task_build_adreno_bins.sh b/tests/scripts/task_build_adreno_bins.sh index 2c4c9637ea47..bc5a075b2325 100755 --- a/tests/scripts/task_build_adreno_bins.sh +++ b/tests/scripts/task_build_adreno_bins.sh @@ -49,7 +49,6 @@ echo set\(MACHINE_NAME aarch64-linux-gnu\) >> config.cmake echo set\(USE_OPENCL_GTEST ON\) >> config.cmake echo set\(USE_OPENCL_EXTN_QCOM ON\) >> config.cmake -echo set\(PROFILE_SHADER_DUMP ON\) >> config.cmake cmake -DANDROID_ABI=arm64-v8a \ -DCMAKE_SYSTEM_NAME=Android \
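Editor's note: a minimal usage sketch of the pieces this patch introduces. The target string, module variable, and pass ordering below are illustrative assumptions for readers, not part of the patch itself.

# Sketch only: exercising the Adreno TIR pipeline dispatch and the relocated passes.
import tvm
from tvm import tir

# An OpenCL target carrying the "adreno" key (target string is an assumption here)
# makes tir.get_default_tir_pipeline return the Adreno-specific pipeline
# instead of the generic default_tir_pipeline.
target = tvm.target.Target("opencl -device=adreno")
pipeline = tir.get_default_tir_pipeline(target)

# The two texture passes can also be applied standalone to a scheduled IRModule
# (`mod` is assumed). The order mirrors the Adreno pipeline: TextureFlatten runs
# early, InjectTextureAlloc runs just before FlattenBuffer.
seq = tvm.ir.transform.Sequential(
    [
        tir.backend.adreno.transform.TextureFlatten(),
        tir.backend.adreno.transform.InjectTextureAlloc(),
    ]
)
# mod = seq(mod)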