diff --git a/docs/DXIL.rst b/docs/DXIL.rst index e9514212dd..795828b33a 100644 --- a/docs/DXIL.rst +++ b/docs/DXIL.rst @@ -1758,6 +1758,15 @@ The following signature shows the operation syntax:: The call respects SM5.1 OOB and alignment rules. +The ``alignment`` parameter specifies the **absolute alignment** of the +effective address (``base address + index``). For regular ``Load`` operations, +this defaults to 4 bytes for raw buffers. For templated ``Load`` operations, +this defaults to the size of the largest scalar component contained in the +aggregate template parameter type. The HLSL +``AlignedLoad(offset, alignment)`` intrinsic allows applications to specify +custom alignment values when they can guarantee higher alignment, enabling +backend compiler optimizations. + ==================== ===================================================== Valid resource type # of active coordinates ==================== ===================================================== @@ -1816,6 +1825,15 @@ The call respects SM5.1 OOB and alignment rules. The write mask indicates which components are written (x - 1, y - 2, z - 4, w - 8), similar to DXBC. For RWTypedBuffer, the mask must cover all resource components. For RWRawBuffer and RWStructuredBuffer, valid masks are: x, xy, xyz, xyzw. +The ``alignment`` parameter specifies the **absolute alignment** of the +effective address (``base address + index``). For regular ``Store`` operations, +this defaults to 4 bytes for raw buffers. For templated ``Store`` operations, +this defaults to the size of the largest scalar component contained in the +aggregate template parameter type. The HLSL +``AlignedStore(offset, alignment, value)`` intrinsic allows applications to +specify custom alignment values when they can guarantee higher alignment, +enabling backend compiler optimizations. 
+ ==================== ===================================================== Valid resource type # of active coordinates ==================== ===================================================== diff --git a/docs/SPIR-V.rst b/docs/SPIR-V.rst index 71935e1757..38535125af 100644 --- a/docs/SPIR-V.rst +++ b/docs/SPIR-V.rst @@ -2768,6 +2768,15 @@ used to store a 32-bit unsigned integer. For ``Store2``, ``Store3``, and ``Store done 2, 3, and 4 times, respectively. Each time the word offset is incremented by 1 before performing ``OpAccessChain``. +``.AlignedLoad()``, ``.AlignedStore()`` +++++++++++++++++++++++++++++++++++++++++++++++ +These functions work identically to their non-aligned counterparts (``Load`` and ``Store``), +but accept an additional ``alignment`` parameter that specifies the guaranteed alignment of +the effective address. The alignment value is passed to SPIR-V load/store operations via +memory operands (``Aligned`` memory access qualifier) to enable backend optimizations. +The alignment parameter must be a compile-time constant power-of-two value that is greater +than or equal to the largest scalar type size and less than or equal to 4096 bytes. 
+ ``.Interlocked*()`` +++++++++++++++++++ diff --git a/include/dxc/HlslIntrinsicOp.h b/include/dxc/HlslIntrinsicOp.h index aae269cfa1..03cd62f1fb 100644 --- a/include/dxc/HlslIntrinsicOp.h +++ b/include/dxc/HlslIntrinsicOp.h @@ -270,9 +270,11 @@ enum class IntrinsicOp { MOP_GatherRaw = 250, MOP_GatherRed = 251, MOP_GetSamplePosition = 252, + MOP_AlignedLoad = 405, MOP_Load2 = 253, MOP_Load3 = 254, MOP_Load4 = 255, + MOP_AlignedStore = 406, MOP_InterlockedAdd = 256, MOP_InterlockedAdd64 = 257, MOP_InterlockedAnd = 258, @@ -411,7 +413,7 @@ enum class IntrinsicOp { IOP_usign = 355, MOP_InterlockedUMax = 356, MOP_InterlockedUMin = 357, - Num_Intrinsics = 405, + Num_Intrinsics = 407, }; inline bool HasUnsignedIntrinsicOpcode(IntrinsicOp opcode) { switch (opcode) { diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp index f8a7907c91..70ee43a387 100644 --- a/lib/HLSL/HLOperationLower.cpp +++ b/lib/HLSL/HLOperationLower.cpp @@ -4100,7 +4100,8 @@ struct ResLoadHelper { ResLoadHelper(Instruction *Inst, DxilResource::Kind RK, Value *h, Value *idx, Value *Offset, Value *status = nullptr, Value *mip = nullptr) : intrinsicOpCode(IntrinsicOp::Num_Intrinsics), handle(h), retVal(Inst), - addr(idx), offset(Offset), status(status), mipLevel(mip) { + addr(idx), offset(Offset), status(status), mipLevel(mip), + customAlignment(0) { opcode = LoadOpFromResKind(RK); Type *Ty = Inst->getType(); if (opcode == OP::OpCode::RawBufferLoad && Ty->isVectorTy() && @@ -4118,6 +4119,8 @@ struct ResLoadHelper { Value *offset; Value *status; Value *mipLevel; + unsigned + customAlignment; // For AlignedLoad/AlignedStore - 0 means use default }; // Uses CI arguments to determine the index, offset, and mipLevel also depending @@ -4129,7 +4132,8 @@ struct ResLoadHelper { ResLoadHelper::ResLoadHelper(CallInst *CI, DxilResource::Kind RK, DxilResourceBase::Class RC, Value *hdl, IntrinsicOp IOP, LoadInst *TyBufSubLoad) - : intrinsicOpCode(IOP), handle(hdl), offset(nullptr), 
status(nullptr) { + : intrinsicOpCode(IOP), handle(hdl), offset(nullptr), status(nullptr), + customAlignment(0) { opcode = LoadOpFromResKind(RK); bool bForSubscript = false; if (TyBufSubLoad) { @@ -4144,6 +4148,26 @@ ResLoadHelper::ResLoadHelper(CallInst *CI, DxilResource::Kind RK, unsigned StatusIdx = HLOperandIndex::kBufLoadStatusOpIdx; unsigned OffsetIdx = HLOperandIndex::kInvalidIdx; + // Extract alignment for AlignedLoad operations + // AlignedLoad CallInst has: (opcode, handle, addr, alignment [, status]) + // Regular Load has: (opcode, handle, addr [, status]) + if (IOP == IntrinsicOp::MOP_AlignedLoad) { + // alignment is at index 3 (after opcode, handle, addr) + const unsigned kAlignmentIdx = kAddrIdx + 1; + if (argc > kAlignmentIdx) { + if (ConstantInt *AlignConst = + dyn_cast(CI->getArgOperand(kAlignmentIdx))) { + customAlignment = AlignConst->getZExtValue(); + } + } + // Status is at index 4 for AlignedLoad (if present) + if (argc > kAlignmentIdx + 1) { + StatusIdx = kAlignmentIdx + 1; + } else { + StatusIdx = HLOperandIndex::kInvalidIdx; + } + } + if (opcode == OP::OpCode::TextureLoad) { bool IsMS = (RK == DxilResource::Kind::Texture2DMS || RK == DxilResource::Kind::Texture2DMSArray); @@ -4191,7 +4215,7 @@ ResLoadHelper::ResLoadHelper(CallInst *CI, DxilResource::Kind RK, // Structured buffers receive no exterior offset in this constructor, // but may need to increment it later. offset = ConstantInt::get(i32Ty, 0U); - else if (argc > OffsetIdx) + else if (argc > OffsetIdx && OffsetIdx != HLOperandIndex::kInvalidIdx) // Textures may set the offset from an explicit argument. offset = CI->getArgOperand(OffsetIdx); else @@ -4199,7 +4223,7 @@ ResLoadHelper::ResLoadHelper(CallInst *CI, DxilResource::Kind RK, offset = UndefValue::get(i32Ty); // Retrieve status value if provided. 
- if (argc > StatusIdx) + if (StatusIdx != HLOperandIndex::kInvalidIdx && argc > StatusIdx) status = CI->getArgOperand(StatusIdx); } @@ -4246,8 +4270,15 @@ static SmallVector GetBufLoadArgs(ResLoadHelper helper, OP::OpCode opcode = helper.opcode; llvm::Constant *opArg = Builder.getInt32((uint32_t)opcode); - unsigned alignment = RK == DxilResource::Kind::RawBuffer ? 4U : 8U; - alignment = std::min(alignment, LdSize); + // Use custom alignment if provided (for AlignedLoad), otherwise calculate + // default + unsigned alignment; + if (helper.customAlignment != 0) { + alignment = helper.customAlignment; + } else { + alignment = RK == DxilResource::Kind::RawBuffer ? 4U : 8U; + alignment = std::min(alignment, LdSize); + } Constant *alignmentVal = Builder.getInt32(alignment); // Assemble args specific to the type bab/struct/typed: @@ -4516,7 +4547,8 @@ void Split64bitValForStore(Type *EltTy, ArrayRef vals, unsigned size, void TranslateStore(DxilResource::Kind RK, Value *handle, Value *val, Value *Idx, Value *offset, IRBuilder<> &Builder, - hlsl::OP *OP, Value *sampIdx = nullptr) { + hlsl::OP *OP, Value *sampIdx = nullptr, + unsigned customAlignment = 0) { Type *Ty = val->getType(); OP::OpCode opcode = OP::OpCode::NumOpCodes; bool IsTyped = true; @@ -4560,11 +4592,18 @@ void TranslateStore(DxilResource::Kind RK, Value *handle, Value *val, val = Builder.CreateZExt(val, Ty); } - // If RawBuffer store of 64-bit value, don't set alignment to 8, - // since buffer alignment isn't known to be anything over 4. - unsigned alignValue = OP->GetAllocSizeForType(EltTy); - if (RK == HLResource::Kind::RawBuffer && alignValue > 4) - alignValue = 4; + // Use custom alignment if provided (for AlignedStore), otherwise calculate + // default + unsigned alignValue; + if (customAlignment != 0) { + alignValue = customAlignment; + } else { + // If RawBuffer store of 64-bit value, don't set alignment to 8, + // since buffer alignment isn't known to be anything over 4. 
+ alignValue = OP->GetAllocSizeForType(EltTy); + if (RK == HLResource::Kind::RawBuffer && alignValue > 4) + alignValue = 4; + } Constant *Alignment = OP->GetI32Const(alignValue); bool is64 = EltTy == i64Ty || EltTy == doubleTy; if (is64 && IsTyped) { @@ -4758,10 +4797,30 @@ Value *TranslateResourceStore(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, IRBuilder<> Builder(CI); DXIL::ResourceKind RK = pObjHelper->GetRK(handle); - Value *val = CI->getArgOperand(HLOperandIndex::kStoreValOpIdx); - Value *offset = CI->getArgOperand(HLOperandIndex::kStoreOffsetOpIdx); + // Extract custom alignment for AlignedStore + unsigned customAlignment = 0; + unsigned valueArgIdx = HLOperandIndex::kStoreValOpIdx; + unsigned offsetArgIdx = HLOperandIndex::kStoreOffsetOpIdx; + + if (IOP == IntrinsicOp::MOP_AlignedStore) { + // AlignedStore CallInst has: (opcode, handle, offset, alignment, value) + // Regular Store has: (opcode, handle, offset, value) + const unsigned kAlignmentIdx = HLOperandIndex::kStoreOffsetOpIdx + 1; // = 3 + if (CI->getNumArgOperands() > kAlignmentIdx) { + if (ConstantInt *AlignConst = + dyn_cast(CI->getArgOperand(kAlignmentIdx))) { + customAlignment = AlignConst->getZExtValue(); + } + } + valueArgIdx = + kAlignmentIdx + 1; // Value is after alignment for AlignedStore + } + + Value *val = CI->getArgOperand(valueArgIdx); + Value *offset = CI->getArgOperand(offsetArgIdx); Value *UndefI = UndefValue::get(Builder.getInt32Ty()); - TranslateStore(RK, handle, val, offset, UndefI, Builder, hlslOP); + TranslateStore(RK, handle, val, offset, UndefI, Builder, hlslOP, nullptr, + customAlignment); return nullptr; } @@ -7514,7 +7573,6 @@ constexpr IntrinsicLower gLowerTable[] = { DXIL::OpCode::VectorAccumulate}, {IntrinsicOp::IOP_isnormal, TrivialIsSpecialFloat, DXIL::OpCode::IsNormal}, - {IntrinsicOp::IOP_GetGroupWaveCount, EmptyLower, DXIL::OpCode::GetGroupWaveCount}, {IntrinsicOp::IOP_GetGroupWaveIndex, EmptyLower, @@ -7536,6 +7594,11 @@ constexpr IntrinsicLower 
gLowerTable[] = { DXIL::OpCode::RayQuery_CommittedTriangleObjectPosition}, {IntrinsicOp::MOP_DxHitObject_TriangleObjectPosition, EmptyLower, DXIL::OpCode::HitObject_TriangleObjectPosition}, + + {IntrinsicOp::MOP_AlignedLoad, TranslateResourceLoad, + DXIL::OpCode::NumOpCodes}, + {IntrinsicOp::MOP_AlignedStore, TranslateResourceStore, + DXIL::OpCode::NumOpCodes}, }; constexpr size_t NumLowerTableEntries = sizeof(gLowerTable) / sizeof(gLowerTable[0]); diff --git a/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td b/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td index 34a2195cbc..17ef549040 100644 --- a/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/tools/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -7606,6 +7606,15 @@ def err_hlsl_unsupported_buffer_packoffset : Error< "packoffset is only allowed within a constant buffer, not on the constant buffer declaration">; def err_hlsl_unsupported_buffer_slot_target_specific : Error< "user defined constant buffer slots cannot be target specific">; +def err_hlsl_aligned_buffer_unsupported_type : Error< + "AlignedLoad/AlignedStore functions cannot be used with %0. 
" + "Supported types are ByteAddressBuffer and RWByteAddressBuffer">; +def err_hlsl_aligned_buffer_invalid_alignment : Error< + "Alignment values require compile-time constant power-of-two values " + "that are >= largest scalar type size and <= 4096">; +def err_hlsl_aligned_buffer_alignment_too_small : Error< + "Alignment parameter of %0 bytes must be >= the largest scalar type size " + "%1 bytes for %2 element type">; def err_hlsl_unsupported_typedbuffer_template_parameter : Error< "elements of typed buffers and textures must be scalars or vectors">; def err_hlsl_unsupported_typedbuffer_template_parameter_size : Error< diff --git a/tools/clang/lib/SPIRV/RawBufferMethods.cpp b/tools/clang/lib/SPIRV/RawBufferMethods.cpp index 87409e7ccc..8c7e81dd8f 100644 --- a/tools/clang/lib/SPIRV/RawBufferMethods.cpp +++ b/tools/clang/lib/SPIRV/RawBufferMethods.cpp @@ -148,7 +148,7 @@ SpirvInstruction *RawBufferHandler::load64Bits(SpirvInstruction *buffer, SpirvInstruction *RawBufferHandler::processTemplatedLoadFromBuffer( SpirvInstruction *buffer, BufferAddress &address, const QualType targetType, - SourceRange range) { + SourceRange range, uint32_t alignment) { const auto loc = buffer->getSourceLocation(); SpirvInstruction *result = nullptr; @@ -188,8 +188,8 @@ SpirvInstruction *RawBufferHandler::processTemplatedLoadFromBuffer( if (isVectorType(targetType, &elemType, &elemCount)) { llvm::SmallVector loadedElems; for (uint32_t i = 0; i < elemCount; ++i) { - loadedElems.push_back( - processTemplatedLoadFromBuffer(buffer, address, elemType, range)); + loadedElems.push_back(processTemplatedLoadFromBuffer( + buffer, address, elemType, range, alignment)); } result = spvBuilder.createCompositeConstruct(targetType, loadedElems, loc, range); @@ -207,8 +207,8 @@ SpirvInstruction *RawBufferHandler::processTemplatedLoadFromBuffer( elemType = arrType->getElementType(); llvm::SmallVector loadedElems; for (uint32_t i = 0; i < elemCount; ++i) { - loadedElems.push_back( - 
processTemplatedLoadFromBuffer(buffer, address, elemType, range)); + loadedElems.push_back(processTemplatedLoadFromBuffer( + buffer, address, elemType, range, alignment)); } result = spvBuilder.createCompositeConstruct(targetType, loadedElems, loc, range); @@ -241,8 +241,8 @@ SpirvInstruction *RawBufferHandler::processTemplatedLoadFromBuffer( const uint32_t numElements = numRows * numCols; llvm::SmallVector loadedElems(numElements); for (uint32_t i = 0; i != numElements; ++i) - loadedElems[i] = - processTemplatedLoadFromBuffer(buffer, address, elemType, range); + loadedElems[i] = processTemplatedLoadFromBuffer( + buffer, address, elemType, range, alignment); llvm::SmallVector loadedRows; for (uint32_t i = 0; i < numRows; ++i) { @@ -279,7 +279,7 @@ SpirvInstruction *RawBufferHandler::processTemplatedLoadFromBuffer( llvm::SmallVector loadedElems; forEachSpirvField( structType, spvType, - [this, &buffer, &address, range, + [this, &buffer, &address, range, alignment, &loadedElems](size_t spirvFieldIndex, const QualType &fieldType, const auto &field) { auto *baseOffset = address.getByteAddress(); @@ -294,7 +294,7 @@ SpirvInstruction *RawBufferHandler::processTemplatedLoadFromBuffer( } loadedElems.push_back(processTemplatedLoadFromBuffer( - buffer, baseOffset, fieldType, range)); + buffer, baseOffset, fieldType, range, alignment)); return true; }); @@ -328,10 +328,11 @@ SpirvInstruction *RawBufferHandler::processTemplatedLoadFromBuffer( SpirvInstruction *RawBufferHandler::processTemplatedLoadFromBuffer( SpirvInstruction *buffer, SpirvInstruction *byteAddress, - const QualType targetType, SourceRange range) { + const QualType targetType, SourceRange range, uint32_t alignment) { BufferAddress address(byteAddress, theEmitter); - return processTemplatedLoadFromBuffer(buffer, address, targetType, range); + return processTemplatedLoadFromBuffer(buffer, address, targetType, range, + alignment); } void RawBufferHandler::store16Bits(SpirvInstruction *value, @@ -535,11 +536,9 @@ 
QualType RawBufferHandler::serializeToScalarsOrStruct( llvm_unreachable("unhandled type when serializing an array"); } -void RawBufferHandler::processTemplatedStoreToBuffer(SpirvInstruction *value, - SpirvInstruction *buffer, - BufferAddress &address, - const QualType valueType, - SourceRange range) { +void RawBufferHandler::processTemplatedStoreToBuffer( + SpirvInstruction *value, SpirvInstruction *buffer, BufferAddress &address, + const QualType valueType, SourceRange range, uint32_t alignment) { const auto loc = buffer->getSourceLocation(); // Scalar types @@ -575,7 +574,7 @@ void RawBufferHandler::processTemplatedStoreToBuffer(SpirvInstruction *value, if (isScalarType(serializedType) || serializedType->getAs()) { for (auto elem : elems) processTemplatedStoreToBuffer(elem, buffer, address, serializedType, - range); + range, alignment); } return; } @@ -595,9 +594,9 @@ void RawBufferHandler::processTemplatedStoreToBuffer(SpirvInstruction *value, assert(spvType); forEachSpirvField( structType, spvType, - [this, &address, loc, range, buffer, value](size_t spirvFieldIndex, - const QualType &fieldType, - const auto &field) { + [this, &address, loc, range, buffer, value, + alignment](size_t spirvFieldIndex, const QualType &fieldType, + const auto &field) { auto *baseOffset = address.getByteAddress(); if (field.offset.hasValue() && field.offset.getValue() != 0) { SpirvConstant *offset = spvBuilder.getConstantInt( @@ -612,7 +611,7 @@ void RawBufferHandler::processTemplatedStoreToBuffer(SpirvInstruction *value, spvBuilder.createCompositeExtract( fieldType, value, {static_cast(spirvFieldIndex)}, loc, range), - buffer, baseOffset, fieldType, range); + buffer, baseOffset, fieldType, range, alignment); return true; }); @@ -645,11 +644,12 @@ void RawBufferHandler::processTemplatedStoreToBuffer(SpirvInstruction *value, void RawBufferHandler::processTemplatedStoreToBuffer( SpirvInstruction *value, SpirvInstruction *buffer, - SpirvInstruction *&byteAddress, const QualType 
valueType, - SourceRange range) { + SpirvInstruction *&byteAddress, const QualType valueType, SourceRange range, + uint32_t alignment) { BufferAddress address(byteAddress, theEmitter); - processTemplatedStoreToBuffer(value, buffer, address, valueType, range); + processTemplatedStoreToBuffer(value, buffer, address, valueType, range, + alignment); } SpirvInstruction *RawBufferHandler::BufferAddress::getByteAddress() { diff --git a/tools/clang/lib/SPIRV/RawBufferMethods.h b/tools/clang/lib/SPIRV/RawBufferMethods.h index f089f2df5c..477a765b4e 100644 --- a/tools/clang/lib/SPIRV/RawBufferMethods.h +++ b/tools/clang/lib/SPIRV/RawBufferMethods.h @@ -36,7 +36,8 @@ class RawBufferHandler { /// --> Load the first 16-bit uint starting at byte address 0. SpirvInstruction *processTemplatedLoadFromBuffer( SpirvInstruction *buffer, SpirvInstruction *byteAddress, - const QualType targetType, SourceRange range = {}); + const QualType targetType, SourceRange range = {}, + uint32_t alignment = 0); /// \brief Performs RWByteAddressBuffer.Store(address, value). 
/// RWByteAddressBuffers are represented in SPIR-V as structs with only one @@ -51,7 +52,8 @@ class RawBufferHandler { SpirvInstruction *buffer, SpirvInstruction *&byteAddress, const QualType valueType, - SourceRange range = {}); + SourceRange range = {}, + uint32_t alignment = 0); private: class BufferAddress { @@ -81,12 +83,11 @@ class RawBufferHandler { SpirvInstruction *processTemplatedLoadFromBuffer(SpirvInstruction *buffer, BufferAddress &address, const QualType targetType, - SourceRange range = {}); - void processTemplatedStoreToBuffer(SpirvInstruction *value, - SpirvInstruction *buffer, - BufferAddress &address, - const QualType valueType, - SourceRange range = {}); + SourceRange range = {}, + uint32_t alignment = 0); + void processTemplatedStoreToBuffer( + SpirvInstruction *value, SpirvInstruction *buffer, BufferAddress &address, + const QualType valueType, SourceRange range = {}, uint32_t alignment = 0); SpirvInstruction *load16Bits(SpirvInstruction *buffer, BufferAddress &address, QualType target16BitType, diff --git a/tools/clang/lib/SPIRV/SpirvEmitter.cpp b/tools/clang/lib/SPIRV/SpirvEmitter.cpp index 1400104d3d..4916be77f7 100644 --- a/tools/clang/lib/SPIRV/SpirvEmitter.cpp +++ b/tools/clang/lib/SPIRV/SpirvEmitter.cpp @@ -4740,34 +4740,65 @@ SpirvInstruction *SpirvEmitter::processBufferTextureLoad( return retVal; } -SpirvInstruction *SpirvEmitter::processByteAddressBufferLoadStore( - const CXXMemberCallExpr *expr, uint32_t numWords, bool doStore) { +SpirvInstruction * +SpirvEmitter::processByteAddressBufferLoadStore(const CXXMemberCallExpr *expr, + uint32_t numWords, bool doStore, + bool isAligned) { SpirvInstruction *result = nullptr; const auto object = expr->getImplicitObjectArgument(); auto *objectInfo = loadIfAliasVarRef(object); assert(numWords >= 1 && numWords <= 4); + + // Extract alignment parameter if this is an aligned operation + uint32_t alignment = 0; + uint32_t addressArgIndex = 0; // offset/address is first arg + uint32_t valueArgIndex 
= 1; // value is second arg (for store) + + if (isAligned) { + // For AlignedLoad/AlignedStore: args are (offset, alignment [, value] [, + // status]) offset is arg 0, alignment is arg 1 + if (expr->getNumArgs() < 2) { + emitError("AlignedLoad/AlignedStore requires alignment parameter", + expr->getExprLoc()); + return nullptr; + } + const Expr *alignmentExpr = expr->getArg(1); + alignment = getRawBufferAlignment(alignmentExpr); + + // For AlignedStore, the value is the 3rd argument (after offset and + // alignment) + if (doStore) { + valueArgIndex = 2; + } + } + if (doStore) { assert(isRWByteAddressBuffer(object->getType())); - assert(expr->getNumArgs() == 2); + uint32_t expectedArgs = + isAligned ? 3 : 2; // AlignedStore has 3 args (offset, alignment, value) + assert(expr->getNumArgs() == expectedArgs); } else { assert(isRWByteAddressBuffer(object->getType()) || isByteAddressBuffer(object->getType())); - if (expr->getNumArgs() == 2) { + // Regular Load with status has 2 args, AlignedLoad with status has 3 args + uint32_t maxArgs = isAligned ? 3 : 2; + if (expr->getNumArgs() == maxArgs && !isAligned) { emitError( "(RW)ByteAddressBuffer::Load(in address, out status) not supported", expr->getExprLoc()); return 0; } } - const Expr *addressExpr = expr->getArg(0); + const Expr *addressExpr = expr->getArg(addressArgIndex); auto *byteAddress = doExpr(addressExpr); const QualType addressType = addressExpr->getType(); // The front-end prevents usage of templated Load2, Load3, Load4, Store2, // Store3, Store4 intrinsic functions. const bool isTemplatedLoadOrStore = (numWords == 1) && - (doStore ? !expr->getArg(1)->getType()->isSpecificBuiltinType( - BuiltinType::UInt) + (doStore ? 
!expr->getArg(valueArgIndex) + ->getType() + ->isSpecificBuiltinType(BuiltinType::UInt) : !expr->getType()->isSpecificBuiltinType(BuiltinType::UInt)); const auto range = expr->getSourceRange(); @@ -4782,14 +4813,15 @@ SpirvInstruction *SpirvEmitter::processByteAddressBufferLoadStore( } if (doStore) { - auto *values = doExpr(expr->getArg(1)); + auto *values = doExpr(expr->getArg(valueArgIndex)); RawBufferHandler(*this).processTemplatedStoreToBuffer( - values, objectInfo, byteAddress, expr->getArg(1)->getType(), range); + values, objectInfo, byteAddress, + expr->getArg(valueArgIndex)->getType(), range, alignment); result = nullptr; } else { RawBufferHandler rawBufferHandler(*this); result = rawBufferHandler.processTemplatedLoadFromBuffer( - objectInfo, byteAddress, expr->getType(), range); + objectInfo, byteAddress, expr->getType(), range, alignment); } if (rasterizerOrder) { @@ -5556,6 +5588,9 @@ SpirvEmitter::processIntrinsicMemberCall(const CXXMemberCallExpr *expr, return processByteAddressBufferLoadStore(expr, 3, /*doStore*/ false); case IntrinsicOp::MOP_Load4: return processByteAddressBufferLoadStore(expr, 4, /*doStore*/ false); + case IntrinsicOp::MOP_AlignedLoad: + return processByteAddressBufferLoadStore(expr, 1, /*doStore*/ false, + /*isAligned*/ true); case IntrinsicOp::MOP_Store: return processByteAddressBufferLoadStore(expr, 1, /*doStore*/ true); case IntrinsicOp::MOP_Store2: @@ -5564,6 +5599,9 @@ SpirvEmitter::processIntrinsicMemberCall(const CXXMemberCallExpr *expr, return processByteAddressBufferLoadStore(expr, 3, /*doStore*/ true); case IntrinsicOp::MOP_Store4: return processByteAddressBufferLoadStore(expr, 4, /*doStore*/ true); + case IntrinsicOp::MOP_AlignedStore: + return processByteAddressBufferLoadStore(expr, 1, /*doStore*/ true, + /*isAligned*/ true); case IntrinsicOp::MOP_GetDimensions: retVal = processGetDimensions(expr); break; diff --git a/tools/clang/lib/SPIRV/SpirvEmitter.h b/tools/clang/lib/SPIRV/SpirvEmitter.h index 
9b890d3af4..eb80bd31dc 100644 --- a/tools/clang/lib/SPIRV/SpirvEmitter.h +++ b/tools/clang/lib/SPIRV/SpirvEmitter.h @@ -1240,7 +1240,8 @@ class SpirvEmitter : public ASTConsumer { /// Panics if it is not the case. SpirvInstruction *processByteAddressBufferLoadStore(const CXXMemberCallExpr *, uint32_t numWords, - bool doStore); + bool doStore, + bool isAligned = false); /// \brief Processes the GetDimensions intrinsic function call on a /// (RW)ByteAddressBuffer by querying the image in the given expr. diff --git a/tools/clang/lib/Sema/SemaHLSL.cpp b/tools/clang/lib/Sema/SemaHLSL.cpp index e9c8c90a2d..47bffb10b4 100644 --- a/tools/clang/lib/Sema/SemaHLSL.cpp +++ b/tools/clang/lib/Sema/SemaHLSL.cpp @@ -7119,9 +7119,10 @@ bool HLSLExternalSource::MatchArguments( } else if (pArgument->uTemplateId == INTRIN_TEMPLATE_FROM_FUNCTION) { if (functionTemplateTypeArg.isNull()) { if (i == 0) { - // [RW]ByteAddressBuffer.Load, default to uint + // [RW]ByteAddressBuffer.Load/AlignedLoad, default to uint pNewType = m_context->UnsignedIntTy; - if (builtinOp != hlsl::IntrinsicOp::MOP_Load) + if (builtinOp != hlsl::IntrinsicOp::MOP_Load && + builtinOp != hlsl::IntrinsicOp::MOP_AlignedLoad) badArgIdx = std::min(badArgIdx, i); } else { // [RW]ByteAddressBuffer.Store, default to argument type @@ -10077,6 +10078,108 @@ bool HLSLExternalSource::ValidateTypeRequirements(SourceLocation loc, return true; } +// Get the largest scalar type size in bytes for a given type (for +// AlignedLoad/AlignedStore validation) +static UINT GetLargestScalarTypeSize(QualType Ty, ASTContext &Ctx) { + if (Ty.isNull()) + return 0; + + // Strip off reference types + Ty = Ty.getNonReferenceType(); + + // Handle scalar types + if (const BuiltinType *BT = Ty->getAs()) { + switch (BT->getKind()) { + case BuiltinType::Bool: + return 1; + case BuiltinType::Half: + case BuiltinType::Short: + case BuiltinType::UShort: + case BuiltinType::Min16Float: + case BuiltinType::Min16Int: + case BuiltinType::Min16UInt: + case 
BuiltinType::Min10Float: + case BuiltinType::Min12Int: + return 2; + case BuiltinType::Int: + case BuiltinType::UInt: + case BuiltinType::Float: + case BuiltinType::LitInt: + case BuiltinType::LitFloat: + return 4; + case BuiltinType::Double: + case BuiltinType::LongLong: + case BuiltinType::ULongLong: + return 8; + default: + break; + } + } + + // Handle vector types + if (const ExtVectorType *VT = Ty->getAs()) { + return GetLargestScalarTypeSize(VT->getElementType(), Ctx); + } + + // Handle array types + if (const ConstantArrayType *AT = Ctx.getAsConstantArrayType(Ty)) { + return GetLargestScalarTypeSize(AT->getElementType(), Ctx); + } + + // Handle record (struct) types - find the largest field + if (const RecordType *RT = Ty->getAs()) { + UINT maxSize = 0; + RecordDecl *RD = RT->getDecl(); + for (const FieldDecl *FD : RD->fields()) { + UINT fieldSize = GetLargestScalarTypeSize(FD->getType(), Ctx); + if (fieldSize > maxSize) + maxSize = fieldSize; + } + return maxSize; + } + + // Default to 4 bytes + return 4; +} + +// Validate alignment parameter for AlignedLoad/AlignedStore +static bool ValidateAlignmentParameter(Sema &S, const Expr *AlignmentExpr, + QualType TemplateType, + SourceLocation Loc) { + // Alignment must be a compile-time constant + llvm::APSInt alignmentValue; + if (!AlignmentExpr->isIntegerConstantExpr(alignmentValue, + S.getASTContext())) { + S.Diag(Loc, diag::err_hlsl_aligned_buffer_invalid_alignment); + return false; + } + + UINT alignment = alignmentValue.getZExtValue(); + + // Alignment must be a power of two + if (alignment == 0 || (alignment & (alignment - 1)) != 0) { + S.Diag(Loc, diag::err_hlsl_aligned_buffer_invalid_alignment); + return false; + } + + // Alignment must be <= 4096 + if (alignment > 4096) { + S.Diag(Loc, diag::err_hlsl_aligned_buffer_invalid_alignment); + return false; + } + + // Alignment must be >= largest scalar type size + UINT largestScalarSize = + GetLargestScalarTypeSize(TemplateType, S.getASTContext()); + if 
(alignment < largestScalarSize) { + S.Diag(Loc, diag::err_hlsl_aligned_buffer_alignment_too_small) + << alignment << largestScalarSize << TemplateType; + return false; + } + + return true; +} + bool HLSLExternalSource::ValidatePrimitiveTypeForOperand( SourceLocation loc, QualType type, ArTypeObjectKind kind) { bool isValid = true; @@ -10973,13 +11076,43 @@ HLSLExternalSource::DeduceTemplateArgumentsForHLSL( objectName == g_ArBasicTypeNames[AR_OBJECT_RWBYTEADDRESS_BUFFER]; bool IsBABLoad = false; bool IsBABStore = false; + bool IsBABAlignedLoad = false; + bool IsBABAlignedStore = false; if (IsBuiltinTable(tableName) && IsBAB) { IsBABLoad = intrinsicOp == (UINT)IntrinsicOp::MOP_Load; IsBABStore = intrinsicOp == (UINT)IntrinsicOp::MOP_Store; + IsBABAlignedLoad = intrinsicOp == (UINT)IntrinsicOp::MOP_AlignedLoad; + IsBABAlignedStore = intrinsicOp == (UINT)IntrinsicOp::MOP_AlignedStore; + } + + // Validate alignment parameter for AlignedLoad/AlignedStore + if (IsBABAlignedLoad || IsBABAlignedStore) { + // AlignedLoad/AlignedStore have alignment as second parameter (after + // offset) + if (Args.size() < 2) { + getSema()->Diag(Args[0]->getExprLoc(), + diag::err_ovl_no_viable_member_function_in_call) + << intrinsicName; + return Sema::TemplateDeductionResult::TDK_Invalid; + } + + const Expr *AlignmentExpr = Args[1]; + SourceLocation AlignmentLoc = AlignmentExpr->getExprLoc(); + + // If we have a template type, validate alignment against it + if (!functionTemplateTypeArg.isNull()) { + if (!ValidateAlignmentParameter(*getSema(), AlignmentExpr, + functionTemplateTypeArg, + AlignmentLoc)) { + return Sema::TemplateDeductionResult::TDK_Invalid; + } + } } + if (ExplicitTemplateArgs && ExplicitTemplateArgs->size() >= 1) { SourceLocation Loc = ExplicitTemplateArgs->getLAngleLoc(); - if (!IsBABLoad && !IsBABStore) { + if (!IsBABLoad && !IsBABStore && !IsBABAlignedLoad && + !IsBABAlignedStore) { getSema()->Diag(Loc, diag::err_hlsl_intrinsic_template_arg_unsupported) << 
intrinsicName; return Sema::TemplateDeductionResult::TDK_Invalid; @@ -10992,7 +11125,7 @@ HLSLExternalSource::DeduceTemplateArgumentsForHLSL( return Sema::TemplateDeductionResult::TDK_Invalid; } - if (IsBABLoad || IsBABStore) { + if (IsBABLoad || IsBABStore || IsBABAlignedLoad || IsBABAlignedStore) { const bool IsNull = functionTemplateTypeArg.isNull(); // Incomplete type is diagnosed elsewhere, so just fail if incomplete. if (!IsNull && @@ -11008,10 +11141,22 @@ HLSLExternalSource::DeduceTemplateArgumentsForHLSL( TypeDiagContext::Valid /*LongVecDiagContext*/); return Sema::TemplateDeductionResult::TDK_Invalid; } + + // Re-validate alignment with the now-known template type for + // AlignedLoad/AlignedStore + if ((IsBABAlignedLoad || IsBABAlignedStore) && Args.size() >= 2) { + const Expr *AlignmentExpr = Args[1]; + SourceLocation AlignmentLoc = AlignmentExpr->getExprLoc(); + if (!ValidateAlignmentParameter(*getSema(), AlignmentExpr, + functionTemplateTypeArg, + AlignmentLoc)) { + return Sema::TemplateDeductionResult::TDK_Invalid; + } + } } - } else if (IsBABStore) { + } else if (IsBABStore || IsBABAlignedStore) { // Prior to HLSL 2018, Store operation only stored scalar uint. 
- if (!Is2018) { + if (!Is2018 && !IsBABAlignedStore) { if (GetNumElements(argTypes[2]) != 1) { getSema()->Diag(Args[1]->getLocStart(), diag::err_ovl_no_viable_member_function_in_call) @@ -12365,6 +12510,16 @@ void Sema::CheckHLSLFunctionCall(FunctionDecl *FDecl, CallExpr *TheCall) { hlsl::IntrinsicOp opCode = (hlsl::IntrinsicOp)IntrinsicAttr->getOpcode(); switch (opCode) { + case hlsl::IntrinsicOp::MOP_AlignedLoad: + case hlsl::IntrinsicOp::MOP_AlignedStore: + // AlignedLoad/AlignedStore require SM 6.2+ (DXIL 1.2+) for + // RawBufferLoad/Store + if (SM->GetMajor() < 6 || (SM->GetMajor() == 6 && SM->GetMinor() < 2)) { + Diag(TheCall->getLocStart(), + diag::warn_hlsl_intrinsic_in_wrong_shader_model) + << FDecl->getName() << FDecl << "6.2"; + } + break; case hlsl::IntrinsicOp::MOP_FinishedCrossGroupSharing: CheckFinishedCrossGroupSharingCall(*this, cast(FDecl), TheCall->getLocStart()); diff --git a/tools/clang/test/HLSLFileCheck/hlsl/objects/ByteAddressBuffer/aligned_load_types.hlsl b/tools/clang/test/HLSLFileCheck/hlsl/objects/ByteAddressBuffer/aligned_load_types.hlsl new file mode 100644 index 0000000000..cbd8323089 --- /dev/null +++ b/tools/clang/test/HLSLFileCheck/hlsl/objects/ByteAddressBuffer/aligned_load_types.hlsl @@ -0,0 +1,159 @@ +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=float -DALIGN=4 -DCHKSTATUS %s | FileCheck %s -check-prefix=CHK_FLOAT32_A4_RO +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=float -DALIGN=16 %s | FileCheck %s -check-prefix=CHK_FLOAT32_A16_RO +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=float -DALIGN=8 -DSRCRW %s | FileCheck %s -check-prefix=CHK_FLOAT32_A8_RW +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=float -DALIGN=32 -DSRCRW -DCHKSTATUS %s | FileCheck %s -check-prefix=CHK_FLOAT32_A32_RW + +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=float4 -DALIGN=4 -DCHKSTATUS %s | FileCheck %s -check-prefix=CHK_FLOAT32x4_A4_RO +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=float4 
-DALIGN=16 %s | FileCheck %s -check-prefix=CHK_FLOAT32x4_A16_RO +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=float4 -DALIGN=8 -DSRCRW %s | FileCheck %s -check-prefix=CHK_FLOAT32x4_A8_RW +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=float4 -DALIGN=64 -DSRCRW -DCHKSTATUS %s | FileCheck %s -check-prefix=CHK_FLOAT32x4_A64_RW + +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=float16_t -DALIGN=2 -DCHKSTATUS %s | FileCheck %s -check-prefix=CHK_FLOAT16_A2_RO +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=float16_t -DALIGN=8 %s | FileCheck %s -check-prefix=CHK_FLOAT16_A8_RO +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=float16_t -DALIGN=4 -DSRCRW %s | FileCheck %s -check-prefix=CHK_FLOAT16_A4_RW +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=float16_t -DALIGN=16 -DSRCRW -DCHKSTATUS %s | FileCheck %s -check-prefix=CHK_FLOAT16_A16_RW + +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=double -DALIGN=8 -DCHKSTATUS %s | FileCheck %s -check-prefix=CHK_FLOAT64_A8_RO +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=double -DALIGN=32 %s | FileCheck %s -check-prefix=CHK_FLOAT64_A32_RO +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=double -DALIGN=16 -DSRCRW %s | FileCheck %s -check-prefix=CHK_FLOAT64_A16_RW +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=double -DALIGN=64 -DSRCRW -DCHKSTATUS %s | FileCheck %s -check-prefix=CHK_FLOAT64_A64_RW + +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=uint -DALIGN=4 -DCHKSTATUS %s | FileCheck %s -check-prefix=CHK_UINT32_A4_RO +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=uint -DALIGN=16 %s | FileCheck %s -check-prefix=CHK_UINT32_A16_RO +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=uint -DALIGN=8 -DSRCRW %s | FileCheck %s -check-prefix=CHK_UINT32_A8_RW +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=uint -DALIGN=32 -DSRCRW -DCHKSTATUS %s | FileCheck %s -check-prefix=CHK_UINT32_A32_RW + +// RUN: %dxc -E main -T 
cs_6_2 -enable-16bit-types -DTY=uint3 -DALIGN=4 -DCHKSTATUS %s | FileCheck %s -check-prefix=CHK_UINT32x3_A4_RO +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=uint3 -DALIGN=16 %s | FileCheck %s -check-prefix=CHK_UINT32x3_A16_RO +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=uint3 -DALIGN=8 -DSRCRW %s | FileCheck %s -check-prefix=CHK_UINT32x3_A8_RW +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=uint3 -DALIGN=32 -DSRCRW -DCHKSTATUS %s | FileCheck %s -check-prefix=CHK_UINT32x3_A32_RW + +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=uint16_t -DALIGN=2 -DCHKSTATUS %s | FileCheck %s -check-prefix=CHK_UINT16_A2_RO +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=uint16_t -DALIGN=8 %s | FileCheck %s -check-prefix=CHK_UINT16_A8_RO +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=uint16_t -DALIGN=4 -DSRCRW %s | FileCheck %s -check-prefix=CHK_UINT16_A4_RW +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=uint16_t -DALIGN=16 -DSRCRW -DCHKSTATUS %s | FileCheck %s -check-prefix=CHK_UINT16_A16_RW + +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=int64_t -DALIGN=8 -DCHKSTATUS %s | FileCheck %s -check-prefix=CHK_INT64_A8_RO +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=int64_t -DALIGN=32 %s | FileCheck %s -check-prefix=CHK_INT64_A32_RO +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=int64_t -DALIGN=16 -DSRCRW %s | FileCheck %s -check-prefix=CHK_INT64_A16_RW +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=int64_t -DALIGN=64 -DSRCRW -DCHKSTATUS %s | FileCheck %s -check-prefix=CHK_INT64_A64_RW + + +// CHK_FLOAT32_A4_RO: call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %srcbuf_texture_rawbuf, i32 %mul, i32 undef, i8 1, i32 4) +// CHK_FLOAT32_A4_RO: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, float %{{.*}}, float undef, float undef, float undef, i8 1, i32 4) + +// CHK_FLOAT32_A16_RO: call 
%dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %srcbuf_texture_rawbuf, i32 %mul, i32 undef, i8 1, i32 16) +// CHK_FLOAT32_A16_RO: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, float %{{.*}}, float undef, float undef, float undef, i8 1, i32 16) + +// CHK_FLOAT32_A8_RW: call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %srcbuf_UAV_rawbuf, i32 %mul, i32 undef, i8 1, i32 8) +// CHK_FLOAT32_A8_RW: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, float %{{.*}}, float undef, float undef, float undef, i8 1, i32 8) + +// CHK_FLOAT32_A32_RW: call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %srcbuf_UAV_rawbuf, i32 %mul, i32 undef, i8 1, i32 32) +// CHK_FLOAT32_A32_RW: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, float %{{.*}}, float undef, float undef, float undef, i8 1, i32 32) + +// CHK_FLOAT32x4_A4_RO: call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %srcbuf_texture_rawbuf, i32 %mul, i32 undef, i8 15, i32 4) +// CHK_FLOAT32x4_A4_RO: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, i8 15, i32 4) + +// CHK_FLOAT32x4_A16_RO: call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %srcbuf_texture_rawbuf, i32 %mul, i32 undef, i8 15, i32 16) +// CHK_FLOAT32x4_A16_RO: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, i8 15, i32 16) + +// CHK_FLOAT32x4_A8_RW: call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %srcbuf_UAV_rawbuf, i32 %mul, i32 undef, i8 15, i32 8) +// CHK_FLOAT32x4_A8_RW: call void @dx.op.rawBufferStore.f32(i32 140, 
%dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, i8 15, i32 8) + +// CHK_FLOAT32x4_A64_RW: call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32(i32 139, %dx.types.Handle %srcbuf_UAV_rawbuf, i32 %mul, i32 undef, i8 15, i32 64) +// CHK_FLOAT32x4_A64_RW: call void @dx.op.rawBufferStore.f32(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, i8 15, i32 64) + +// CHK_FLOAT16_A2_RO: call %dx.types.ResRet.f16 @dx.op.rawBufferLoad.f16(i32 139, %dx.types.Handle %srcbuf_texture_rawbuf, i32 %mul, i32 undef, i8 1, i32 2) +// CHK_FLOAT16_A2_RO: call void @dx.op.rawBufferStore.f16(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, half %{{.*}}, half undef, half undef, half undef, i8 1, i32 2) + +// CHK_FLOAT16_A8_RO: call %dx.types.ResRet.f16 @dx.op.rawBufferLoad.f16(i32 139, %dx.types.Handle %srcbuf_texture_rawbuf, i32 %mul, i32 undef, i8 1, i32 8) +// CHK_FLOAT16_A8_RO: call void @dx.op.rawBufferStore.f16(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, half %{{.*}}, half undef, half undef, half undef, i8 1, i32 8) + +// CHK_FLOAT16_A4_RW: call %dx.types.ResRet.f16 @dx.op.rawBufferLoad.f16(i32 139, %dx.types.Handle %srcbuf_UAV_rawbuf, i32 %mul, i32 undef, i8 1, i32 4) +// CHK_FLOAT16_A4_RW: call void @dx.op.rawBufferStore.f16(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, half %{{.*}}, half undef, half undef, half undef, i8 1, i32 4) + +// CHK_FLOAT16_A16_RW: call %dx.types.ResRet.f16 @dx.op.rawBufferLoad.f16(i32 139, %dx.types.Handle %srcbuf_UAV_rawbuf, i32 %mul, i32 undef, i8 1, i32 16) +// CHK_FLOAT16_A16_RW: call void @dx.op.rawBufferStore.f16(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, half %{{.*}}, half undef, half undef, half undef, i8 1, i32 16) + +// CHK_FLOAT64_A8_RO: call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle 
%srcbuf_texture_rawbuf, i32 %mul, i32 undef, i8 3, i32 8) +// CHK_FLOAT64_A8_RO: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, i32 %{{.*}}, i32 %{{.*}}, i32 undef, i32 undef, i8 3, i32 8) + +// CHK_FLOAT64_A32_RO: call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %srcbuf_texture_rawbuf, i32 %mul, i32 undef, i8 3, i32 32) +// CHK_FLOAT64_A32_RO: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, i32 %{{.*}}, i32 %{{.*}}, i32 undef, i32 undef, i8 3, i32 32) + +// CHK_FLOAT64_A16_RW: call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %srcbuf_UAV_rawbuf, i32 %mul, i32 undef, i8 3, i32 16) +// CHK_FLOAT64_A16_RW: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, i32 %{{.*}}, i32 %{{.*}}, i32 undef, i32 undef, i8 3, i32 16) + +// CHK_FLOAT64_A64_RW: call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %srcbuf_UAV_rawbuf, i32 %mul, i32 undef, i8 3, i32 64) +// CHK_FLOAT64_A64_RW: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, i32 %{{.*}}, i32 %{{.*}}, i32 undef, i32 undef, i8 3, i32 64) + +// CHK_UINT32_A4_RO: call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %srcbuf_texture_rawbuf, i32 %mul, i32 undef, i8 1, i32 4) +// CHK_UINT32_A4_RO: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, i32 %{{.*}}, i32 undef, i32 undef, i32 undef, i8 1, i32 4) + +// CHK_UINT32_A16_RO: call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %srcbuf_texture_rawbuf, i32 %mul, i32 undef, i8 1, i32 16) +// CHK_UINT32_A16_RO: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, i32 %{{.*}}, i32 undef, i32 undef, i32 undef, i8 1, i32 16) + +// 
CHK_UINT32_A8_RW: call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %srcbuf_UAV_rawbuf, i32 %mul, i32 undef, i8 1, i32 8) +// CHK_UINT32_A8_RW: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, i32 %{{.*}}, i32 undef, i32 undef, i32 undef, i8 1, i32 8) + +// CHK_UINT32_A32_RW: call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %srcbuf_UAV_rawbuf, i32 %mul, i32 undef, i8 1, i32 32) +// CHK_UINT32_A32_RW: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, i32 %{{.*}}, i32 undef, i32 undef, i32 undef, i8 1, i32 32) + +// CHK_UINT32x3_A4_RO: call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %srcbuf_texture_rawbuf, i32 %mul, i32 undef, i8 7, i32 4) +// CHK_UINT32x3_A4_RO: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 undef, i8 7, i32 4) + +// CHK_UINT32x3_A16_RO: call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %srcbuf_texture_rawbuf, i32 %mul, i32 undef, i8 7, i32 16) +// CHK_UINT32x3_A16_RO: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 undef, i8 7, i32 16) + +// CHK_UINT32x3_A8_RW: call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %srcbuf_UAV_rawbuf, i32 %mul, i32 undef, i8 7, i32 8) +// CHK_UINT32x3_A8_RW: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 undef, i8 7, i32 8) + +// CHK_UINT32x3_A32_RW: call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %srcbuf_UAV_rawbuf, i32 %mul, i32 undef, i8 7, i32 32) +// CHK_UINT32x3_A32_RW: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 
%mul, i32 undef, i32 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 undef, i8 7, i32 32) + +// CHK_UINT16_A2_RO: call %dx.types.ResRet.i16 @dx.op.rawBufferLoad.i16(i32 139, %dx.types.Handle %srcbuf_texture_rawbuf, i32 %mul, i32 undef, i8 1, i32 2) +// CHK_UINT16_A2_RO: call void @dx.op.rawBufferStore.i16(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, i16 %{{.*}}, i16 undef, i16 undef, i16 undef, i8 1, i32 2) + +// CHK_UINT16_A8_RO: call %dx.types.ResRet.i16 @dx.op.rawBufferLoad.i16(i32 139, %dx.types.Handle %srcbuf_texture_rawbuf, i32 %mul, i32 undef, i8 1, i32 8) +// CHK_UINT16_A8_RO: call void @dx.op.rawBufferStore.i16(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, i16 %{{.*}}, i16 undef, i16 undef, i16 undef, i8 1, i32 8) + +// CHK_UINT16_A4_RW: call %dx.types.ResRet.i16 @dx.op.rawBufferLoad.i16(i32 139, %dx.types.Handle %srcbuf_UAV_rawbuf, i32 %mul, i32 undef, i8 1, i32 4) +// CHK_UINT16_A4_RW: call void @dx.op.rawBufferStore.i16(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, i16 %{{.*}}, i16 undef, i16 undef, i16 undef, i8 1, i32 4) + +// CHK_UINT16_A16_RW: call %dx.types.ResRet.i16 @dx.op.rawBufferLoad.i16(i32 139, %dx.types.Handle %srcbuf_UAV_rawbuf, i32 %mul, i32 undef, i8 1, i32 16) +// CHK_UINT16_A16_RW: call void @dx.op.rawBufferStore.i16(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, i16 %{{.*}}, i16 undef, i16 undef, i16 undef, i8 1, i32 16) + +// CHK_INT64_A8_RO: call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %srcbuf_texture_rawbuf, i32 %mul, i32 undef, i8 3, i32 8) +// CHK_INT64_A8_RO: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, i32 %{{.*}}, i32 %{{.*}}, i32 undef, i32 undef, i8 3, i32 8) + +// CHK_INT64_A32_RO: call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %srcbuf_texture_rawbuf, i32 %mul, i32 undef, i8 3, i32 32) +// CHK_INT64_A32_RO: call void 
@dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, i32 %{{.*}}, i32 %{{.*}}, i32 undef, i32 undef, i8 3, i32 32) + +// CHK_INT64_A16_RW: call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %srcbuf_UAV_rawbuf, i32 %mul, i32 undef, i8 3, i32 16) +// CHK_INT64_A16_RW: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, i32 %{{.*}}, i32 %{{.*}}, i32 undef, i32 undef, i8 3, i32 16) + +// CHK_INT64_A64_RW: call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32(i32 139, %dx.types.Handle %srcbuf_UAV_rawbuf, i32 %mul, i32 undef, i8 3, i32 64) +// CHK_INT64_A64_RW: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle %dstbuf_UAV_rawbuf, i32 %mul, i32 undef, i32 %{{.*}}, i32 %{{.*}}, i32 undef, i32 undef, i8 3, i32 64) + + +#ifdef SRCRW +RWByteAddressBuffer srcbuf : register(u0); +RWByteAddressBuffer dstbuf : register(u1); +#else +ByteAddressBuffer srcbuf : register(t0); +RWByteAddressBuffer dstbuf : register(u0); +#endif + +[numthreads(1, 1, 1)] +void main(uint3 tid : SV_DispatchThreadID) { + const uint offset = tid.x * ALIGN; +#ifdef CHKSTATUS + uint status = 0; + TY data = srcbuf.AlignedLoad(offset, ALIGN, status); + if (!CheckAccessFullyMapped(status)) return; +#else + TY data = srcbuf.AlignedLoad(offset, ALIGN); +#endif + dstbuf.AlignedStore(offset, ALIGN, data); +} + diff --git a/tools/clang/test/SemaHLSL/aligned_load_buffer_types.hlsl b/tools/clang/test/SemaHLSL/aligned_load_buffer_types.hlsl new file mode 100644 index 0000000000..a6e4dece46 --- /dev/null +++ b/tools/clang/test/SemaHLSL/aligned_load_buffer_types.hlsl @@ -0,0 +1,65 @@ +// RUN: %dxc -E main -T cs_6_2 %s -verify + +// Test that AlignedLoad/AlignedStore only work with ByteAddressBuffer and RWByteAddressBuffer + +ByteAddressBuffer bab; +RWByteAddressBuffer rwbab; + +// Invalid buffer types +Texture2D tex2d; +RWTexture2D rwtex2d; +Buffer typedBuffer; +RWBuffer rwTypedBuffer; 
+StructuredBuffer structuredBuffer; +RWStructuredBuffer rwStructuredBuffer; +AppendStructuredBuffer appendBuffer; +ConsumeStructuredBuffer consumeBuffer; + +[numthreads(1,1,1)] +void main() +{ + uint offset = 0; + uint data; + + // Valid - these should work + data = bab.AlignedLoad(offset, 4); + rwbab.AlignedStore(offset, 4, data); + + // Invalid buffer types - Texture2D + // expected-error@+2 {{no member named 'AlignedLoad' in 'Texture2D >'}} + // expected-error@+1 {{unexpected type name 'uint': expected expression}} + data = tex2d.AlignedLoad(offset, 4); + + // expected-error@+2 {{no member named 'AlignedStore' in 'RWTexture2D >'}} + // expected-error@+1 {{unexpected type name 'uint': expected expression}} + rwtex2d.AlignedStore(offset, 4, data); + + // Invalid buffer types - Buffer (typed) + // expected-error@+2 {{no member named 'AlignedLoad' in 'Buffer >'}} + // expected-error@+1 {{unexpected type name 'uint': expected expression}} + data = typedBuffer.AlignedLoad(offset, 4); + + // expected-error@+2 {{no member named 'AlignedStore' in 'RWBuffer >'}} + // expected-error@+1 {{unexpected type name 'uint': expected expression}} + rwTypedBuffer.AlignedStore(offset, 4, data); + + // Invalid buffer types - StructuredBuffer + // expected-error@+2 {{no member named 'AlignedLoad' in 'StructuredBuffer >'}} + // expected-error@+1 {{unexpected type name 'uint': expected expression}} + data = structuredBuffer.AlignedLoad(offset, 4); + + // expected-error@+2 {{no member named 'AlignedStore' in 'RWStructuredBuffer >'}} + // expected-error@+1 {{unexpected type name 'uint': expected expression}} + rwStructuredBuffer.AlignedStore(offset, 4, data); + + // Invalid buffer types - AppendStructuredBuffer + // expected-error@+2 {{no member named 'AlignedStore' in 'AppendStructuredBuffer >'}} + // expected-error@+1 {{unexpected type name 'uint': expected expression}} + appendBuffer.AlignedStore(offset, 4, data); + + // Invalid buffer types - ConsumeStructuredBuffer + // 
expected-error@+2 {{no member named 'AlignedLoad' in 'ConsumeStructuredBuffer >'}} + // expected-error@+1 {{unexpected type name 'uint': expected expression}} + data = consumeBuffer.AlignedLoad(offset, 4); +} + diff --git a/tools/clang/test/SemaHLSL/aligned_load_errors.hlsl b/tools/clang/test/SemaHLSL/aligned_load_errors.hlsl new file mode 100644 index 0000000000..55746b8519 --- /dev/null +++ b/tools/clang/test/SemaHLSL/aligned_load_errors.hlsl @@ -0,0 +1,82 @@ +// RUN: %dxc -E main -T cs_6_2 -DTY=uint -DALIGN=4 %s -verify +// RUN: %dxc -E main -T cs_6_2 -DTY=uint3 -DALIGN=4 %s -verify +// RUN: %dxc -E main -T cs_6_2 -DTY=float4 -DALIGN=4 %s -verify +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=uint16_t -DALIGN=2 %s -verify +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=int64_t -DALIGN=8 %s -verify +// RUN: %dxc -E main -T cs_6_2 -enable-16bit-types -DTY=double -DALIGN=8 %s -verify + +// Test alignment validation for AlignedLoad/AlignedStore with various types +// ALIGN is set to match the largest scalar type size for each type + +ByteAddressBuffer srcbuf; +RWByteAddressBuffer dstbuf; + +[numthreads(1,1,1)] +void main(uint3 tid : SV_DispatchThreadID) +{ + uint offset = tid.x * ALIGN; + int dynAlign = ALIGN; + TY data; + + // Error: Non-constant alignment value + // expected-error@+1 {{Alignment values require compile-time constant}} + data = srcbuf.AlignedLoad(offset, dynAlign); + + // expected-error@+1 {{Alignment values require compile-time constant}} + dstbuf.AlignedStore(offset, dynAlign, data); + offset += ALIGN; + + // Error: Non-power-of-two alignment (ALIGN - 1) + // When ALIGN=4, this is 3 (not power of 2) + // When ALIGN=2, this is 1 (power of 2, but less than scalar size) + // When ALIGN=8, this is 7 (not power of 2) +#if ALIGN == 2 + // For 16-bit types: ALIGN=2, ALIGN-1=1 + // 1 is power-of-two, but less than scalar size + // expected-error@+1 {{Alignment parameter of 1 bytes must be >= the largest scalar type size 2 bytes}} + data 
= srcbuf.AlignedLoad(offset, ALIGN - 1); + + // expected-error@+1 {{Alignment parameter of 1 bytes must be >= the largest scalar type size 2 bytes}} + dstbuf.AlignedStore(offset, ALIGN - 1, data); +#else + // For 32-bit and 64-bit types: ALIGN-1 is not power-of-two + // expected-error@+1 {{Alignment values require compile-time constant power-of-two}} + data = srcbuf.AlignedLoad(offset, ALIGN - 1); + + // expected-error@+1 {{Alignment values require compile-time constant power-of-two}} + dstbuf.AlignedStore(offset, ALIGN - 1, data); +#endif + offset += ALIGN; + + // Error: Alignment greater than 4096 + // expected-error@+1 {{Alignment values require compile-time constant power-of-two values that are >= largest scalar type size and <= 4096}} + data = srcbuf.AlignedLoad(offset, 4096 * 2); + + // expected-error@+1 {{Alignment values require compile-time constant power-of-two values that are >= largest scalar type size and <= 4096}} + dstbuf.AlignedStore(offset, 4096 * 2, data); + offset += ALIGN; + + // Error: Alignment less than largest scalar type size (ALIGN / 2) + // For ALIGN=4: ALIGN/2=2, error shows "2 bytes must be >= 4 bytes" + // For ALIGN=2: ALIGN/2=1, error shows "1 bytes must be >= 2 bytes" + // For ALIGN=8: ALIGN/2=4, error shows "4 bytes must be >= 8 bytes" +#if ALIGN == 4 + // expected-error@+1 {{Alignment parameter of 2 bytes must be >= the largest scalar type size 4 bytes}} + data = srcbuf.AlignedLoad(offset, ALIGN / 2); + + // expected-error@+1 {{Alignment parameter of 2 bytes must be >= the largest scalar type size 4 bytes}} + dstbuf.AlignedStore(offset, ALIGN / 2, data); +#elif ALIGN == 2 + // expected-error@+1 {{Alignment parameter of 1 bytes must be >= the largest scalar type size 2 bytes}} + data = srcbuf.AlignedLoad(offset, ALIGN / 2); + + // expected-error@+1 {{Alignment parameter of 1 bytes must be >= the largest scalar type size 2 bytes}} + dstbuf.AlignedStore(offset, ALIGN / 2, data); +#elif ALIGN == 8 + // expected-error@+1 {{Alignment 
parameter of 4 bytes must be >= the largest scalar type size 8 bytes}} + data = srcbuf.AlignedLoad(offset, ALIGN / 2); + + // expected-error@+1 {{Alignment parameter of 4 bytes must be >= the largest scalar type size 8 bytes}} + dstbuf.AlignedStore(offset, ALIGN / 2, data); +#endif +} diff --git a/tools/clang/test/SemaHLSL/aligned_load_shader_model.hlsl b/tools/clang/test/SemaHLSL/aligned_load_shader_model.hlsl new file mode 100644 index 0000000000..cdee41cb85 --- /dev/null +++ b/tools/clang/test/SemaHLSL/aligned_load_shader_model.hlsl @@ -0,0 +1,18 @@ +// RUN: %dxc -E main -T cs_6_0 %s -verify +// RUN: %dxc -E main -T cs_6_1 %s -verify + +// Test that AlignedLoad/AlignedStore require SM 6.2+ + +ByteAddressBuffer inputBuffer; +RWByteAddressBuffer outputBuffer; + +[numthreads(1, 1, 1)] +void main(uint3 tid : SV_DispatchThreadID) +{ + // expected-error@+1 {{intrinsic AlignedLoad potentially used by ''AlignedLoad'' requires shader model 6.2 or greater}} + uint value = inputBuffer.AlignedLoad(0, 4); + + // expected-error@+1 {{intrinsic AlignedStore potentially used by ''AlignedStore'' requires shader model 6.2 or greater}} + outputBuffer.AlignedStore(0, 4, value); +} + diff --git a/utils/hct/gen_intrin_main.txt b/utils/hct/gen_intrin_main.txt index 8b10f733a8..ed248adfbf 100644 --- a/utils/hct/gen_intrin_main.txt +++ b/utils/hct/gen_intrin_main.txt @@ -952,6 +952,8 @@ $funcT [[]] Load(in uint byteOffset, out uint_only status) : byteaddress_load_s; uint<2> [[]] Load2(in uint byteOffset, out uint_only status) : byteaddress_load_s; uint<3> [[]] Load3(in uint byteOffset, out uint_only status) : byteaddress_load_s; uint<4> [[]] Load4(in uint byteOffset, out uint_only status) : byteaddress_load_s; +$funcT [[ro]] AlignedLoad(in uint byteOffset, in uint alignment) : byteaddress_alignedload; +$funcT [[]] AlignedLoad(in uint byteOffset, in uint alignment, out uint_only status) : byteaddress_alignedload_s; } namespace @@ -966,10 +968,13 @@ $funcT [[]] Load(in uint byteOffset, out 
uint_only status) : byteaddress_load_s; uint<2> [[]] Load2(in uint byteOffset, out uint_only status) : byteaddress_load_s; uint<3> [[]] Load3(in uint byteOffset, out uint_only status) : byteaddress_load_s; uint<4> [[]] Load4(in uint byteOffset, out uint_only status) : byteaddress_load_s; +$funcT [[ro]] AlignedLoad(in uint byteOffset, in uint alignment) : byteaddress_alignedload; +$funcT [[]] AlignedLoad(in uint byteOffset, in uint alignment, out uint_only status) : byteaddress_alignedload_s; void [[]] Store(in uint byteOffset, in $funcT value) : byteaddress_store; void [[]] Store2(in uint byteOffset, in uint<2> value) : byteaddress_store; void [[]] Store3(in uint byteOffset, in uint<3> value) : byteaddress_store; void [[]] Store4(in uint byteOffset, in uint<4> value) : byteaddress_store; +void [[]] AlignedStore(in uint byteOffset, in uint alignment, in $funcT value) : byteaddress_alignedstore; // 64-bit integer interlocks void [[]] InterlockedAdd64(in uint byteOffset, in u64 value); void [[]] InterlockedAdd64(in uint byteOffset, in u64 value, out any_int64 original) : interlockedadd_immediate; diff --git a/utils/hct/hlsl_intrinsic_opcodes.json b/utils/hct/hlsl_intrinsic_opcodes.json index e2d3c6f290..645a621ae3 100644 --- a/utils/hct/hlsl_intrinsic_opcodes.json +++ b/utils/hct/hlsl_intrinsic_opcodes.json @@ -1,6 +1,6 @@ { "IntrinsicOpCodes": { - "Num_Intrinsics": 405, + "Num_Intrinsics": 407, "IOP_AcceptHitAndEndSearch": 0, "IOP_AddUint64": 1, "IOP_AllMemoryBarrier": 2, @@ -405,6 +405,8 @@ "IOP_TriangleObjectPosition": 401, "MOP_CandidateTriangleObjectPosition": 402, "MOP_CommittedTriangleObjectPosition": 403, - "MOP_DxHitObject_TriangleObjectPosition": 404 + "MOP_DxHitObject_TriangleObjectPosition": 404, + "MOP_AlignedLoad": 405, + "MOP_AlignedStore": 406 } }