
Conversation

@vzakhari
Contributor


If an operation deallocates memory allocated inside the loop,
then it is as safe for parallelization as the allocation,
which is already handled by `isLocallyDefined`.
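
For illustration (an editorial sketch based on the tests in the diff below, not part of the patch; the function name `@dealloc_examples` is made up), the first loop is now recognized as memory-parallel, while the second remains serial because the freed memref is defined outside the loop:

```mlir
// Sketch only: mirrors the updated @local_alloc and the new @local_dealloc tests.
func.func @dealloc_examples() {
  %cst = arith.constant 0.0 : f32

  // Parallelizable after this change: the dealloc frees a memref that is
  // allocated inside the loop body, so freeing it is as loop-local as the
  // allocation itself.
  affine.for %i = 0 to 100 {
    %m = memref.alloc() : memref<1xf32>
    affine.store %cst, %m[0] : memref<1xf32>
    memref.dealloc %m : memref<1xf32>
  }

  // Still treated as serial: the dealloc frees a memref defined outside
  // the loop, so different iterations would free the same allocation.
  %m0 = memref.alloc() : memref<1xf32>
  affine.for %j = 0 to 1 {
    affine.store %cst, %m0[%j] : memref<1xf32>
    memref.dealloc %m0 : memref<1xf32>
  }
  return
}
```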
@llvmbot
Member

llvmbot commented Dec 16, 2025

@llvm/pr-subscribers-mlir

Author: Slava Zakharin (vzakhari)

Changes

If an operation deallocates memory allocated inside the loop,
then it is as safe for parallelization as the allocation,
which is already handled by isLocallyDefined.


Full diff: https://github.com/llvm/llvm-project/pull/172388.diff

2 Files Affected:

  • (modified) mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp (+20)
  • (modified) mlir/test/Dialect/Affine/parallelize.mlir (+19-2)
diff --git a/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp
index 3d1a73417d1ea..b126909439b88 100644
--- a/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp
+++ b/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp
@@ -134,6 +134,25 @@ static bool isLocallyDefined(Value v, Operation *enclosingOp) {
   return viewOp && isLocallyDefined(viewOp.getViewSource(), enclosingOp);
 }
 
+/// Returns true if `op` has only Free memory effects on the values
+/// that are locally defined (i.e. they are allocated by operations
+/// nested within `enclosingOp`).
+static bool isDeallocationOfLocallyDefined(Operation *op,
+                                           Operation *enclosingOp) {
+  std::optional<SmallVector<MemoryEffects::EffectInstance>> effects =
+      getEffectsRecursively(op);
+  if (!effects)
+    return false;
+
+  for (const MemoryEffects::EffectInstance &effect : *effects) {
+    Value freed = effect.getValue();
+    if (!isa<MemoryEffects::Free>(effect.getEffect()) || !freed ||
+        !isLocallyDefined(freed, enclosingOp))
+      return false;
+  }
+  return true;
+}
+
 bool mlir::affine::isLoopMemoryParallel(AffineForOp forOp) {
   // Any memref-typed iteration arguments are treated as serializing.
   if (llvm::any_of(forOp.getResultTypes(), llvm::IsaPred<BaseMemRefType>))
@@ -152,6 +171,7 @@ bool mlir::affine::isLoopMemoryParallel(AffineForOp forOp) {
         loadAndStoreOps.push_back(op);
     } else if (!isa<AffineForOp, AffineYieldOp, AffineIfOp>(op) &&
                !hasSingleEffect<MemoryEffects::Allocate>(op) &&
+               !isDeallocationOfLocallyDefined(op, forOp) &&
                !isMemoryEffectFree(op)) {
       // Alloc-like ops inside `forOp` are fine (they don't impact parallelism)
       // as long as they don't escape the loop (which has been checked above).
diff --git a/mlir/test/Dialect/Affine/parallelize.mlir b/mlir/test/Dialect/Affine/parallelize.mlir
index bfd1720959861..95cf8f3a45f3f 100644
--- a/mlir/test/Dialect/Affine/parallelize.mlir
+++ b/mlir/test/Dialect/Affine/parallelize.mlir
@@ -272,19 +272,20 @@ func.func @nested_min_max(%m: memref<?xf32>, %lb0: index,
 
 // Test in the presence of locally allocated memrefs.
 
-// CHECK: func @local_alloc
+// CHECK-LABEL: func @local_alloc
 func.func @local_alloc() {
   %cst = arith.constant 0.0 : f32
   affine.for %i = 0 to 100 {
     %m = memref.alloc() : memref<1xf32>
     %ma = memref.alloca() : memref<1xf32>
     affine.store %cst, %m[0] : memref<1xf32>
+    memref.dealloc %m : memref<1xf32>
   }
   // CHECK: affine.parallel
   return
 }
 
-// CHECK: func @local_alloc_cast
+// CHECK-LABEL: func @local_alloc_cast
 func.func @local_alloc_cast() {
   %cst = arith.constant 0.0 : f32
   affine.for %i = 0 to 100 {
@@ -300,6 +301,7 @@ func.func @local_alloc_cast() {
     affine.for %j = 0 to 8 {
       affine.store %cst, %r[%j, %j] : memref<8x16xf32>
     }
+    memref.dealloc %m : memref<128xf32>
   }
   // CHECK: affine.parallel
   // CHECK:   affine.parallel
@@ -313,6 +315,21 @@ func.func @local_alloc_cast() {
   return
 }
 
+// When memref.dealloc deallocates an out-of-loop allocation,
+// the loop should not be parallelized. This test is quite
+// artificial though.
+// CHECK-LABEL: func @local_dealloc
+func.func @local_dealloc() {
+  %cst = arith.constant 0.0 : f32
+  %m = memref.alloc() : memref<1xf32>
+  affine.for %i = 0 to 1 {
+    affine.store %cst, %m[%i] : memref<1xf32>
+    memref.dealloc %m : memref<1xf32>
+  }
+  // CHECK-NOT: affine.parallel
+  return
+}
+
 // CHECK-LABEL: @iter_arg_memrefs
 func.func @iter_arg_memrefs(%in: memref<10xf32>) {
   %mi = memref.alloc() : memref<f32>
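
For reference, an editorial sketch (not taken from an actual pass run; SSA names are illustrative) of what the updated @local_alloc test is expected to produce under the affine parallelization pass (-affine-parallelize):

```mlir
// Hypothetical post-pass IR: the loop becomes an affine.parallel because
// the memref.dealloc only frees a loop-local allocation.
func.func @local_alloc() {
  %cst = arith.constant 0.000000e+00 : f32
  affine.parallel (%i) = (0) to (100) {
    %m = memref.alloc() : memref<1xf32>
    %ma = memref.alloca() : memref<1xf32>
    affine.store %cst, %m[0] : memref<1xf32>
    memref.dealloc %m : memref<1xf32>
  }
  return
}
```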

@llvmbot
Member

llvmbot commented Dec 16, 2025

@llvm/pr-subscribers-mlir-affine
