diff --git a/CLAUDE.md b/CLAUDE.md
index b10a33b..08b345f 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -87,7 +87,7 @@ uv run ruff format gpu_test/
 
 - **Stack Type**: `!forth.stack` - untyped stack, programmer ensures type safety
 - **Operations**: All take stack as input and produce stack as output (except `forth.stack`)
-- **Supported Words**: literals, `DUP DROP SWAP OVER ROT NIP TUCK PICK ROLL`, `+ - * / MOD`, `AND OR XOR NOT LSHIFT RSHIFT`, `= < > <> <= >= 0=`, `@ !`, `CELLS`, `IF ELSE THEN`, `BEGIN UNTIL`, `BEGIN WHILE REPEAT`, `DO LOOP I J K`, `TID-X/Y/Z BID-X/Y/Z BDIM-X/Y/Z GDIM-X/Y/Z GLOBAL-ID` (GPU indexing).
+- **Supported Words**: literals, `DUP DROP SWAP OVER ROT NIP TUCK PICK ROLL`, `+ - * / MOD`, `AND OR XOR NOT LSHIFT RSHIFT`, `= < > <> <= >= 0=`, `@ !`, `CELLS`, `IF ELSE THEN`, `BEGIN UNTIL`, `BEGIN WHILE REPEAT`, `DO LOOP +LOOP I J K`, `LEAVE UNLOOP EXIT`, `TID-X/Y/Z BID-X/Y/Z BDIM-X/Y/Z GDIM-X/Y/Z GLOBAL-ID` (GPU indexing).
 - **Kernel Parameters**: Declared with `PARAM <name> <size>`, each becomes a `memref<Nxi64>` function argument with `forth.param_name` attribute. Using a param name in code pushes its byte address onto the stack via `forth.param_ref`
 - **Conversion**: `!forth.stack` → `memref<256xi64>` with explicit stack pointer
 - **GPU**: Functions wrapped in `gpu.module`, `main` gets `gpu.kernel` attribute, configured with bare pointers for NVVM conversion
diff --git a/gpu_test/test_kernels.py b/gpu_test/test_kernels.py
index 5a99cfa..99ab192 100644
--- a/gpu_test/test_kernels.py
+++ b/gpu_test/test_kernels.py
@@ -154,6 +154,24 @@ def test_do_loop(kernel_runner: KernelRunner) -> None:
     assert result == [0, 1, 2, 3, 4]
 
 
+def test_do_plus_loop(kernel_runner: KernelRunner) -> None:
+    """DO/+LOOP, step 2: store I = 0, 2, 4, 6, 8 in DATA[0..4]; loop ends when I hits 10."""
+    result = kernel_runner.run(
+        forth_source=("PARAM DATA 256\n0\n10 0 DO\n  I OVER CELLS DATA + !\n  1 +\n2 +LOOP\nDROP"),
+        output_count=5,
+    )
+    assert result == [0, 2, 4, 6, 8]
+
+
+def test_do_plus_loop_negative(kernel_runner: KernelRunner) -> None:
+    """DO/+LOOP, step -1: I runs 10 down to 0 (limit included); only the first 10 values are checked."""
+    result = kernel_runner.run(
+        forth_source=("PARAM DATA 256\n0\n0 10 DO\n  I OVER CELLS DATA + !\n  1 +\n-1 +LOOP\nDROP"),
+        output_count=10,
+    )
+    assert result == [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
+
+
 def test_multi_while(kernel_runner: KernelRunner) -> None:
     """Multi-WHILE: two exit conditions from the same loop (interleaved CF).
 
diff --git a/lib/Translation/ForthToMLIR/ForthToMLIR.cpp b/lib/Translation/ForthToMLIR/ForthToMLIR.cpp
index edbe11f..fb992b2 100644
--- a/lib/Translation/ForthToMLIR/ForthToMLIR.cpp
+++ b/lib/Translation/ForthToMLIR/ForthToMLIR.cpp
@@ -397,6 +397,36 @@ std::pair<Value, Value> ForthParser::emitPopFlag(Location loc, Value stack) {
   return {popFlag.getOutputStack(), popFlag.getFlag()};
 }
 
+void ForthParser::emitLoopEnd(Location loc, const LoopContext &ctx, Value step,
+                              Value &stack) {
+  auto i64Type = builder.getI64Type();
+
+  // Advance the counter: load old index, compute new = old + step, store it.
+  Value c0 = builder.create<arith::ConstantIndexOp>(loc, 0);
+  Value oldIdx =
+      builder.create<memref::LoadOp>(loc, ctx.counter, ValueRange{c0});
+  Value newIdx = builder.create<arith::AddIOp>(loc, oldIdx, step);
+  builder.create<memref::StoreOp>(loc, newIdx, ctx.counter, ValueRange{c0});
+
+  // Crossing test: ((oldIdx - limit) XOR (newIdx - limit)) < 0, i.e. the sign
+  // of (index - limit) flipped; this holds for positive and negative steps.
+  Value oldDiff = builder.create<arith::SubIOp>(loc, oldIdx, ctx.limit);
+  Value newDiff = builder.create<arith::SubIOp>(loc, newIdx, ctx.limit);
+  Value xorVal = builder.create<arith::XOrIOp>(loc, oldDiff, newDiff);
+  Value zero = builder.create<arith::ConstantOp>(loc, i64Type,
+                                                 builder.getI64IntegerAttr(0));
+  Value crossed = builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::slt,
+                                                xorVal, zero);
+
+  // If crossed → exit, otherwise → loop back to body (post-test back-edge).
+  builder.create<cf::CondBranchOp>(loc, crossed, ctx.exit, ValueRange{stack},
+                                   ctx.body, ValueRange{stack});
+
+  // Leave the builder at the start of the exit block; `stack` becomes its arg.
+  builder.setInsertionPointToStart(ctx.exit);
+  stack = ctx.exit->getArgument(0);
+}
+
 LogicalResult ForthParser::parseBody(Value &stack) {
   Type stackType = forth::StackType::get(context);
 
@@ -644,27 +674,15 @@ LogicalResult ForthParser::parseBody(Value &stack) {
         Value c0 = builder.create<arith::ConstantIndexOp>(loc, 0);
         builder.create<memref::StoreOp>(loc, start, counter, ValueRange{c0});
 
-        // Create check, body, and exit blocks.
-        auto *checkBlock = createStackBlock(parentRegion, loc);
+        // Create body and exit blocks (post-test loop: always enters once).
         auto *bodyBlock = createStackBlock(parentRegion, loc);
         auto *exitBlock = createStackBlock(parentRegion, loc);
 
-        // Branch to check.
-        builder.create<cf::BranchOp>(loc, checkBlock, ValueRange{s2});
-
-        // --- Check block: load counter, compare < limit ---
-        builder.setInsertionPointToStart(checkBlock);
-        Value checkC0 = builder.create<arith::ConstantIndexOp>(loc, 0);
-        Value idx =
-            builder.create<memref::LoadOp>(loc, counter, ValueRange{checkC0});
-        Value cond = builder.create<arith::CmpIOp>(
-            loc, arith::CmpIPredicate::slt, idx, limit);
-        builder.create<cf::CondBranchOp>(
-            loc, cond, bodyBlock, ValueRange{checkBlock->getArgument(0)},
-            exitBlock, ValueRange{checkBlock->getArgument(0)});
+        // Branch directly to body.
+        builder.create<cf::BranchOp>(loc, bodyBlock, ValueRange{s2});
 
         // Push loop context for I/J/K.
-        loopStack.push_back({counter, limit, checkBlock, exitBlock});
+        loopStack.push_back({counter, limit, bodyBlock, exitBlock});
 
         // Continue parsing in body.
         builder.setInsertionPointToStart(bodyBlock);
@@ -673,29 +691,32 @@ LogicalResult ForthParser::parseBody(Value &stack) {
         //=== LOOP ===
       } else if (word == "LOOP") {
         consume();
-        auto i64Type = builder.getI64Type();
 
         if (loopStack.empty()) {
           return emitError("LOOP without matching DO");
         }
 
         auto ctx = loopStack.pop_back_val();
-
-        // Increment counter: load, add 1, store.
-        Value c0 = builder.create<arith::ConstantIndexOp>(loc, 0);
-        Value idx =
-            builder.create<memref::LoadOp>(loc, ctx.counter, ValueRange{c0});
         Value one = builder.create<arith::ConstantOp>(
-            loc, i64Type, builder.getI64IntegerAttr(1));
-        Value next = builder.create<arith::AddIOp>(loc, idx, one);
-        builder.create<memref::StoreOp>(loc, next, ctx.counter, ValueRange{c0});
+            loc, builder.getI64Type(), builder.getI64IntegerAttr(1));
+        emitLoopEnd(loc, ctx, one, stack);
 
-        // Branch back to check.
-        builder.create<cf::BranchOp>(loc, ctx.check, ValueRange{stack});
+        //=== +LOOP ===
+      } else if (word == "+LOOP") {
+        consume();
 
-        // Continue after exit.
-        builder.setInsertionPointToStart(ctx.exit);
-        stack = ctx.exit->getArgument(0);
+        if (loopStack.empty()) {
+          return emitError("+LOOP without matching DO");
+        }
+
+        auto ctx = loopStack.pop_back_val();
+
+        // Pop step from data stack.
+        auto popOp = builder.create<forth::PopOp>(
+            loc, forth::StackType::get(context), builder.getI64Type(), stack);
+        stack = popOp.getOutputStack();
+        Value step = popOp.getValue();
+        emitLoopEnd(loc, ctx, step, stack);
 
         //=== Normal word ===
       } else {
diff --git a/lib/Translation/ForthToMLIR/ForthToMLIR.h b/lib/Translation/ForthToMLIR/ForthToMLIR.h
index 264175b..c860dd2 100644
--- a/lib/Translation/ForthToMLIR/ForthToMLIR.h
+++ b/lib/Translation/ForthToMLIR/ForthToMLIR.h
@@ -92,7 +92,7 @@ class ForthParser {
   struct LoopContext {
     Value counter; // memref<1xi64> alloca for the loop counter
     Value limit;   // i64 loop limit
-    Block *check;  // condition check block
+    Block *body;   // loop body block; LOOP/+LOOP branch back here
     Block *exit;   // loop exit block
   };
   SmallVector<LoopContext> loopStack;
@@ -127,6 +127,11 @@ class ForthParser {
   /// Parse a sequence of Forth operations, handling control flow inline.
   LogicalResult parseBody(Value &stack);
 
+  /// Emit the shared tail of LOOP/+LOOP: counter += step, crossing test, and
+  /// cond_br to exit or body; then continue in exit, updating `stack`.
+  void emitLoopEnd(Location loc, const LoopContext &ctx, Value step,
+                   Value &stack);
+
   /// Parse a user-defined word definition.
   LogicalResult parseWordDefinition();
 };
diff --git a/test/Conversion/ForthToMemRef/do-loop.mlir b/test/Conversion/ForthToMemRef/do-loop.mlir
index 62fa347..034807a 100644
--- a/test/Conversion/ForthToMemRef/do-loop.mlir
+++ b/test/Conversion/ForthToMemRef/do-loop.mlir
@@ -1,6 +1,6 @@
 // RUN: %warpforth-opt --convert-forth-to-memref %s | %FileCheck %s
 
-// Test: DO...LOOP with I conversion to memref with CF-based control flow
+// Test: DO...LOOP with I conversion to memref with post-test crossing check
 // Forth: 10 0 DO I LOOP
 
 // CHECK-LABEL: func.func private @main
@@ -23,24 +23,20 @@
 // CHECK: memref.store %{{.*}}, %[[COUNTER]]
 // CHECK: cf.br ^bb1
 
-// Loop header: load counter, compare < limit, cond_br
+// Loop body: push I (load counter, push to stack), crossing test
 // CHECK: ^bb1(%{{.*}}: memref<256xi64>, %{{.*}}: index):
 // CHECK: memref.load %[[COUNTER]]
-// CHECK: arith.cmpi slt
-// CHECK: cf.cond_br %{{.*}}, ^bb2(%{{.*}}: memref<256xi64>, index), ^bb3(%{{.*}}: memref<256xi64>, index)
-
-// Loop body: push I (load counter, push to stack), increment counter
-// CHECK: ^bb2(%{{.*}}: memref<256xi64>, %{{.*}}: index):
-// CHECK: memref.load %[[COUNTER]]
 // CHECK: memref.store
-// CHECK: memref.load %[[COUNTER]]
-// CHECK: arith.constant 1 : i64
 // CHECK: arith.addi
 // CHECK: memref.store %{{.*}}, %[[COUNTER]]
-// CHECK: cf.br ^bb1
+// CHECK: arith.subi
+// CHECK: arith.subi
+// CHECK: arith.xori
+// CHECK: arith.cmpi slt
+// CHECK: cf.cond_br
 
 // Exit block
-// CHECK: ^bb3(%{{.*}}: memref<256xi64>, %{{.*}}: index):
+// CHECK: ^bb2(%{{.*}}: memref<256xi64>, %{{.*}}: index):
 // CHECK: return
 
 module {
@@ -57,19 +53,19 @@ module {
   ^bb1(%3: !forth.stack):
     %c0_2 = arith.constant 0 : index
     %4 = memref.load %alloca[%c0_2] : memref<1xi64>
-    %5 = arith.cmpi slt, %4, %value_1 : i64
-    cf.cond_br %5, ^bb2(%3 : !forth.stack), ^bb3(%3 : !forth.stack)
-  ^bb2(%6: !forth.stack):
-    %c0_3 = arith.constant 0 : index
-    %7 = memref.load %alloca[%c0_3] : memref<1xi64>
-    %8 = forth.push_value %6, %7 : !forth.stack, i64 -> !forth.stack
-    %c0_4 = arith.constant 0 : index
-    %9 = memref.load %alloca[%c0_4] : memref<1xi64>
+    %5 = forth.push_value %3, %4 : !forth.stack, i64 -> !forth.stack
     %c1_i64 = arith.constant 1 : i64
-    %10 = arith.addi %9, %c1_i64 : i64
-    memref.store %10, %alloca[%c0_4] : memref<1xi64>
-    cf.br ^bb1(%8 : !forth.stack)
-  ^bb3(%11: !forth.stack):
+    %c0_3 = arith.constant 0 : index
+    %6 = memref.load %alloca[%c0_3] : memref<1xi64>
+    %7 = arith.addi %6, %c1_i64 : i64
+    memref.store %7, %alloca[%c0_3] : memref<1xi64>
+    %8 = arith.subi %6, %value_1 : i64
+    %9 = arith.subi %7, %value_1 : i64
+    %10 = arith.xori %8, %9 : i64
+    %c0_i64 = arith.constant 0 : i64
+    %11 = arith.cmpi slt, %10, %c0_i64 : i64
+    cf.cond_br %11, ^bb2(%5 : !forth.stack), ^bb1(%5 : !forth.stack)
+  ^bb2(%12: !forth.stack):
     return
   }
 }
diff --git a/test/Conversion/ForthToMemRef/leave.mlir b/test/Conversion/ForthToMemRef/leave.mlir
index 60f4979..2c93ddc 100644
--- a/test/Conversion/ForthToMemRef/leave.mlir
+++ b/test/Conversion/ForthToMemRef/leave.mlir
@@ -7,10 +7,8 @@
 // CHECK: %[[STACK:.*]] = memref.alloca() : memref<256xi64>
 // CHECK: cf.br ^bb1(%[[STACK]], %{{.*}} : memref<256xi64>, index)
 // CHECK: ^bb1(%{{.*}}: memref<256xi64>, %{{.*}}: index):
-// CHECK: cf.cond_br %{{.*}}, ^bb2(%{{.*}}: memref<256xi64>, index), ^bb3(%{{.*}}: memref<256xi64>, index)
+// CHECK: cf.cond_br %true, ^bb2(%{{.*}}: memref<256xi64>, index), ^bb3(%{{.*}}: memref<256xi64>, index)
 // CHECK: ^bb2(%{{.*}}: memref<256xi64>, %{{.*}}: index):
-// CHECK-NEXT: cf.br ^bb3(%{{.*}}: memref<256xi64>, index)
-// CHECK: ^bb3(%{{.*}}: memref<256xi64>, %{{.*}}: index):
 // CHECK: return
 
 module {
@@ -25,13 +23,21 @@ module {
     memref.store %value, %alloca[%c0] : memref<1xi64>
     cf.br ^bb1(%output_stack_0 : !forth.stack)
   ^bb1(%3: !forth.stack):
-    %c0_2 = arith.constant 0 : index
-    %4 = memref.load %alloca[%c0_2] : memref<1xi64>
-    %5 = arith.cmpi slt, %4, %value_1 : i64
-    cf.cond_br %5, ^bb2(%3 : !forth.stack), ^bb3(%3 : !forth.stack)
-  ^bb2(%6: !forth.stack):
-    cf.br ^bb3(%6 : !forth.stack)
-  ^bb3(%7: !forth.stack):
+    %true = arith.constant true
+    cf.cond_br %true, ^bb2(%3 : !forth.stack), ^bb3(%3 : !forth.stack)
+  ^bb2(%4: !forth.stack):
     return
+  ^bb3(%5: !forth.stack):
+    %c1_i64 = arith.constant 1 : i64
+    %c0_2 = arith.constant 0 : index
+    %6 = memref.load %alloca[%c0_2] : memref<1xi64>
+    %7 = arith.addi %6, %c1_i64 : i64
+    memref.store %7, %alloca[%c0_2] : memref<1xi64>
+    %8 = arith.subi %6, %value_1 : i64
+    %9 = arith.subi %7, %value_1 : i64
+    %10 = arith.xori %8, %9 : i64
+    %c0_i64 = arith.constant 0 : i64
+    %11 = arith.cmpi slt, %10, %c0_i64 : i64
+    cf.cond_br %11, ^bb2(%5 : !forth.stack), ^bb1(%5 : !forth.stack)
   }
 }
diff --git a/test/Pipeline/nested-control-flow.forth b/test/Pipeline/nested-control-flow.forth
index 3e11155..41a520b 100644
--- a/test/Pipeline/nested-control-flow.forth
+++ b/test/Pipeline/nested-control-flow.forth
@@ -8,8 +8,7 @@
 \ MID: gpu.module @warpforth_module
 \ MID: gpu.func @main(%arg0: memref<4xi64> {forth.param_name = "DATA"}) kernel
 \ MID: cf.br
-\ MID: cf.cond_br
-\ MID: gpu.return
+\ MID: arith.xori
 
 PARAM DATA 4
 3 0 DO 4 0 DO J I + LOOP LOOP DATA 0 CELLS + !
diff --git a/test/Pipeline/plus-loop-negative.forth b/test/Pipeline/plus-loop-negative.forth
new file mode 100644
index 0000000..50afe63
--- /dev/null
+++ b/test/Pipeline/plus-loop-negative.forth
@@ -0,0 +1,7 @@
+\ RUN: %warpforth-translate --forth-to-mlir %s | %warpforth-opt --warpforth-pipeline | %FileCheck %s
+
+\ Verify that +LOOP with negative step through the full pipeline produces a gpu.binary
+\ CHECK: gpu.binary @warpforth_module
+
+PARAM DATA 4
+0 10 DO I DATA 0 CELLS + ! -1 +LOOP
diff --git a/test/Pipeline/plus-loop.forth b/test/Pipeline/plus-loop.forth
new file mode 100644
index 0000000..6eb32ed
--- /dev/null
+++ b/test/Pipeline/plus-loop.forth
@@ -0,0 +1,7 @@
+\ RUN: %warpforth-translate --forth-to-mlir %s | %warpforth-opt --warpforth-pipeline | %FileCheck %s
+
+\ Verify that +LOOP through the full pipeline produces a gpu.binary
+\ CHECK: gpu.binary @warpforth_module
+
+PARAM DATA 4
+10 0 DO I DATA 0 CELLS + ! 2 +LOOP
diff --git a/test/Translation/Forth/do-loop.forth b/test/Translation/Forth/do-loop.forth
index 4a4a7ff..4c0b9bc 100644
--- a/test/Translation/Forth/do-loop.forth
+++ b/test/Translation/Forth/do-loop.forth
@@ -1,6 +1,6 @@
 \ RUN: %warpforth-translate --forth-to-mlir %s | %FileCheck %s
 
-\ Verify DO/LOOP generates loop counter with memref.alloca, pop, cmpi, cond_br
+\ Verify DO/LOOP generates post-test loop with crossing test
 
 \ CHECK:       %[[S0:.*]] = forth.stack !forth.stack
 \ CHECK-NEXT:  %[[S1:.*]] = forth.literal %[[S0]] 10 : !forth.stack -> !forth.stack
@@ -14,18 +14,18 @@
 \ CHECK:     ^bb1(%[[B1:.*]]: !forth.stack):
 \ CHECK-NEXT:  %[[C0_2:.*]] = arith.constant 0 : index
 \ CHECK-NEXT:  %[[LOAD1:.*]] = memref.load %[[ALLOCA]][%[[C0_2]]] : memref<1xi64>
-\ CHECK-NEXT:  %[[CMP:.*]] = arith.cmpi slt, %[[LOAD1]], %[[LIM]] : i64
-\ CHECK-NEXT:  cf.cond_br %[[CMP]], ^bb2(%[[B1]] : !forth.stack), ^bb3(%[[B1]] : !forth.stack)
-\ CHECK:     ^bb2(%[[B2:.*]]: !forth.stack):
-\ CHECK-NEXT:  %[[C0_3:.*]] = arith.constant 0 : index
-\ CHECK-NEXT:  %[[LOAD2:.*]] = memref.load %[[ALLOCA]][%[[C0_3]]] : memref<1xi64>
-\ CHECK-NEXT:  %[[PUSH:.*]] = forth.push_value %[[B2]], %[[LOAD2]] : !forth.stack, i64 -> !forth.stack
-\ CHECK-NEXT:  %[[C0_4:.*]] = arith.constant 0 : index
-\ CHECK-NEXT:  %[[LOAD3:.*]] = memref.load %[[ALLOCA]][%[[C0_4]]] : memref<1xi64>
+\ CHECK-NEXT:  %[[PUSH:.*]] = forth.push_value %[[B1]], %[[LOAD1]] : !forth.stack, i64 -> !forth.stack
 \ CHECK-NEXT:  %[[C1:.*]] = arith.constant 1 : i64
-\ CHECK-NEXT:  %[[ADDI:.*]] = arith.addi %[[LOAD3]], %[[C1]] : i64
-\ CHECK-NEXT:  memref.store %[[ADDI]], %[[ALLOCA]][%[[C0_4]]] : memref<1xi64>
-\ CHECK-NEXT:  cf.br ^bb1(%[[PUSH]] : !forth.stack)
-\ CHECK:     ^bb3(%[[B3:.*]]: !forth.stack):
+\ CHECK-NEXT:  %[[C0_3:.*]] = arith.constant 0 : index
+\ CHECK-NEXT:  %[[OLD:.*]] = memref.load %[[ALLOCA]][%[[C0_3]]] : memref<1xi64>
+\ CHECK-NEXT:  %[[NEW:.*]] = arith.addi %[[OLD]], %[[C1]] : i64
+\ CHECK-NEXT:  memref.store %[[NEW]], %[[ALLOCA]][%[[C0_3]]] : memref<1xi64>
+\ CHECK-NEXT:  %[[D1:.*]] = arith.subi %[[OLD]], %[[LIM]] : i64
+\ CHECK-NEXT:  %[[D2:.*]] = arith.subi %[[NEW]], %[[LIM]] : i64
+\ CHECK-NEXT:  %[[XOR:.*]] = arith.xori %[[D1]], %[[D2]] : i64
+\ CHECK-NEXT:  %[[ZERO:.*]] = arith.constant 0 : i64
+\ CHECK-NEXT:  %[[CROSSED:.*]] = arith.cmpi slt, %[[XOR]], %[[ZERO]] : i64
+\ CHECK-NEXT:  cf.cond_br %[[CROSSED]], ^bb2(%[[PUSH]] : !forth.stack), ^bb1(%[[PUSH]] : !forth.stack)
+\ CHECK:     ^bb2(%[[B2:.*]]: !forth.stack):
 \ CHECK-NEXT:  return
 10 0 DO I LOOP
diff --git a/test/Translation/Forth/leave-conditional.forth b/test/Translation/Forth/leave-conditional.forth
index 1f3f327..5d56a65 100644
--- a/test/Translation/Forth/leave-conditional.forth
+++ b/test/Translation/Forth/leave-conditional.forth
@@ -2,19 +2,31 @@
 
 \ Verify conditional LEAVE preserves the loop backedge for non-LEAVE paths.
 
+\ Branch directly to body (post-test loop)
 \ CHECK:       cf.br ^bb1(%{{.*}} : !forth.stack)
-\ CHECK:     ^bb1(%[[CHK:.*]]: !forth.stack):
-\ CHECK:       cf.cond_br %{{.*}}, ^bb2(%[[CHK]] : !forth.stack), ^bb[[EXIT:[0-9]+]](%[[CHK]] : !forth.stack)
-\ CHECK:     ^bb2(%[[B:.*]]: !forth.stack):
-\ CHECK:       cf.cond_br %{{.*}}, ^bb[[LEAVE:[0-9]+]](%{{.*}} : !forth.stack), ^bb[[JOIN:[0-9]+]](%{{.*}} : !forth.stack)
-\ CHECK:     ^bb[[EXIT]](%{{.*}}: !forth.stack):
+
+\ Body: I 5 = IF → cond_br to LEAVE or THEN merge
+\ CHECK:     ^bb1(%[[B:.*]]: !forth.stack):
+\ CHECK:       forth.pop_flag
+\ CHECK-NEXT:  cf.cond_br %{{[^,]*}}, ^bb[[LEAVE:[0-9]+]](%{{[^)]*}} : !forth.stack), ^bb[[JOIN:[0-9]+]](%{{[^)]*}} : !forth.stack)
+
+\ Exit: return
+\ CHECK:     ^bb[[EXIT:[0-9]+]](%{{.*}}: !forth.stack):
 \ CHECK:       return
+
+\ LEAVE branch: always-taken jump to exit (cond_br on constant true)
 \ CHECK:     ^bb[[LEAVE]](%{{.*}}: !forth.stack):
-\ CHECK:       cf.cond_br %{{.*}}, ^bb[[EXIT]](%{{.*}} : !forth.stack), ^bb[[DEAD:[0-9]+]](%{{.*}} : !forth.stack)
+\ CHECK:       cf.cond_br %true, ^bb[[EXIT]](%{{.*}} : !forth.stack), ^bb[[DEAD:[0-9]+]](%{{.*}} : !forth.stack)
+
+\ Join (THEN merge): 1 DROP, crossing test, loop back to body or exit
 \ CHECK:     ^bb[[JOIN]](%{{.*}}: !forth.stack):
-\ CHECK:       cf.br ^bb1(%{{.*}} : !forth.stack)
+\ CHECK:       arith.xori
+\ CHECK:       arith.cmpi slt
+\ CHECK:       cf.cond_br
+
+\ Dead block from LEAVE
 \ CHECK:     ^bb[[DEAD]](%{{.*}}: !forth.stack):
-\ CHECK:       cf.br ^bb[[JOIN]](%{{.*}} : !forth.stack)
+\ CHECK:       cf.br ^bb[[JOIN]]
 
 10 0 DO
   I 5 = IF LEAVE THEN
diff --git a/test/Translation/Forth/leave.forth b/test/Translation/Forth/leave.forth
index 0a185c0..242b3c4 100644
--- a/test/Translation/Forth/leave.forth
+++ b/test/Translation/Forth/leave.forth
@@ -7,10 +7,8 @@
 \ CHECK-NEXT:  %[[S2:.*]] = forth.literal %[[S1]] 0 : !forth.stack -> !forth.stack
 \ CHECK:       cf.br ^bb1(%{{.*}} : !forth.stack)
 \ CHECK:     ^bb1(%[[B1:.*]]: !forth.stack):
-\ CHECK:       cf.cond_br %{{.*}}, ^bb2(%[[B1]] : !forth.stack), ^bb[[EXIT:[0-9]+]](%[[B1]] : !forth.stack)
-\ CHECK:     ^bb2(%[[B2:.*]]: !forth.stack):
 \ CHECK-NEXT:  %[[TRUE:.*]] = arith.constant true
-\ CHECK-NEXT:  cf.cond_br %[[TRUE]], ^bb[[EXIT:[0-9]+]](%[[B2]] : !forth.stack), ^bb{{[0-9]+}}(%[[B2]] : !forth.stack)
+\ CHECK-NEXT:  cf.cond_br %[[TRUE]], ^bb[[EXIT:[0-9]+]](%[[B1]] : !forth.stack), ^bb{{[0-9]+}}(%[[B1]] : !forth.stack)
 \ CHECK:     ^bb[[EXIT]](%[[B3:.*]]: !forth.stack):
 \ CHECK-NEXT:  return
 
diff --git a/test/Translation/Forth/nested-control-flow.forth b/test/Translation/Forth/nested-control-flow.forth
index 2cbed2f..8cb108c 100644
--- a/test/Translation/Forth/nested-control-flow.forth
+++ b/test/Translation/Forth/nested-control-flow.forth
@@ -12,6 +12,7 @@
 1 IF 2 IF 3 THEN THEN
 
 \ === IF inside DO ===
+\ After IF/THEN merge, set up DO loop: 10 0 DO
 \ CHECK:     ^bb2(%[[B2:.*]]: !forth.stack):
 \ CHECK-NEXT:  %[[L10:.*]] = forth.literal %[[B2]] 10 : !forth.stack -> !forth.stack
 \ CHECK-NEXT:  %[[L0A:.*]] = forth.literal %[[L10]] 0 : !forth.stack -> !forth.stack
@@ -30,43 +31,35 @@
 \ CHECK:     ^bb4(%[[B4:.*]]: !forth.stack):
 \ CHECK-NEXT:  cf.br ^bb2(%[[B4]] : !forth.stack)
 
-\ DO loop header: check index < limit
+\ DO loop body (post-test: no check block): I 5 > IF I THEN
 \ CHECK:     ^bb5(%[[B5:.*]]: !forth.stack):
-\ CHECK:       arith.cmpi slt
-\ CHECK-NEXT:  cf.cond_br %{{.*}}, ^bb6(%[[B5]] : !forth.stack), ^bb7(%[[B5]] : !forth.stack)
-
-\ DO loop body: I 5 > IF I THEN
-\ CHECK:     ^bb6(%[[B6:.*]]: !forth.stack):
-\ CHECK:       forth.push_value %[[B6]]
+\ CHECK:       forth.push_value %[[B5]]
 \ CHECK:       forth.literal %{{.*}} 5
 \ CHECK-NEXT:  %{{.*}} = forth.gt
 \ CHECK:       forth.pop_flag
-\ CHECK-NEXT:  cf.cond_br %{{.*}}, ^bb8(%{{.*}} : !forth.stack), ^bb9(%{{.*}} : !forth.stack)
+\ CHECK-NEXT:  cf.cond_br %{{[^,]*}}, ^bb7(%{{[^)]*}} : !forth.stack), ^bb8(%{{[^)]*}} : !forth.stack)
 
 \ === Nested DO with J ===
-\ After first DO loop exits: bb7 sets up nested DO (3 0 DO)
-\ CHECK:     ^bb7(%[[B7:.*]]: !forth.stack):
-\ CHECK-NEXT:  %{{.*}} = forth.literal %[[B7]] 3
+\ After first DO loop exits: sets up nested DO (3 0 DO)
+\ CHECK:     ^bb6(%[[B6:.*]]: !forth.stack):
+\ CHECK-NEXT:  %{{.*}} = forth.literal %[[B6]] 3
 3 0 DO 4 0 DO J I + LOOP LOOP
 
 \ IF I true branch: push loop index
-\ CHECK:     ^bb8(%[[B8:.*]]: !forth.stack):
-\ CHECK:       forth.push_value %[[B8]]
-\ CHECK-NEXT:  cf.br ^bb9
+\ CHECK:     ^bb7(%[[B7:.*]]: !forth.stack):
+\ CHECK:       forth.push_value %[[B7]]
+\ CHECK-NEXT:  cf.br ^bb8
 
-\ Loop increment and back-edge
-\ CHECK:     ^bb9(%{{.*}}: !forth.stack):
+\ Loop end with crossing test and back-edge
+\ CHECK:     ^bb8(%{{.*}}: !forth.stack):
 \ CHECK:       arith.addi
 \ CHECK:       memref.store
-\ CHECK:       cf.br ^bb5
-
-\ Outer DO loop (3 0 DO) header
-\ CHECK:     ^bb10(%{{.*}}: !forth.stack):
+\ CHECK:       arith.xori
 \ CHECK:       arith.cmpi slt
 \ CHECK:       cf.cond_br
 
-\ Inner DO setup (4 0 DO)
-\ CHECK:     ^bb11(%{{.*}}: !forth.stack):
+\ Outer DO body (3 0 DO) with inner DO setup (4 0 DO)
+\ CHECK:     ^bb9(%{{.*}}: !forth.stack):
 \ CHECK:       forth.literal %{{.*}} 4
 \ CHECK:       forth.literal %{{.*}} 0
 \ CHECK:       forth.pop
@@ -75,78 +68,75 @@
 
 \ === Triple-nested DO with K ===
 \ After nested DO exits: sets up triple-nested DO (2 0 DO)
-\ CHECK:     ^bb12(%{{.*}}: !forth.stack):
+\ CHECK:     ^bb10(%{{.*}}: !forth.stack):
 \ CHECK:       forth.literal %{{.*}} 2
 2 0 DO 2 0 DO 2 0 DO K J I + + LOOP LOOP LOOP
 
-\ Inner loop of J I + (bb13 header, bb14 body)
-\ CHECK:     ^bb13(%{{.*}}: !forth.stack):
-\ CHECK:       arith.cmpi slt
-\ CHECK:       cf.cond_br
-
-\ J I + body
-\ CHECK:     ^bb14(%{{.*}}: !forth.stack):
+\ Inner loop of J I + (bb11 body)
+\ CHECK:     ^bb11(%{{.*}}: !forth.stack):
 \ CHECK:       forth.push_value
 \ CHECK:       forth.push_value
 \ CHECK:       forth.add
 
-\ Outer loop increment (bb15)
-\ CHECK:     ^bb15(%{{.*}}: !forth.stack):
-\ CHECK:       arith.addi
-\ CHECK:       cf.br ^bb10
+\ Inner loop crossing test
+\ CHECK:       arith.xori
+\ CHECK:       arith.cmpi slt
+\ CHECK:       cf.cond_br
 
-\ Triple-nested outer loop header (bb16)
-\ CHECK:     ^bb16(%{{.*}}: !forth.stack):
+\ Outer loop increment (bb12)
+\ CHECK:     ^bb12(%{{.*}}: !forth.stack):
+\ CHECK:       arith.addi
+\ CHECK:       arith.xori
 \ CHECK:       arith.cmpi slt
 \ CHECK:       cf.cond_br
 
-\ Triple-nested middle loop setup (bb17)
-\ CHECK:     ^bb17(%{{.*}}: !forth.stack):
+\ Triple-nested outer loop body (bb13)
+\ CHECK:     ^bb13(%{{.*}}: !forth.stack):
 \ CHECK:       forth.literal %{{.*}} 2
 \ CHECK:       forth.literal %{{.*}} 0
 
 \ === BEGIN/WHILE inside IF ===
 \ After triple-nested exits: 5 IF BEGIN DUP WHILE 1 - REPEAT THEN
-\ CHECK:     ^bb18(%{{.*}}: !forth.stack):
+\ CHECK:     ^bb14(%{{.*}}: !forth.stack):
 \ CHECK:       forth.literal %{{.*}} 5
 \ CHECK:       forth.pop_flag
 \ CHECK-NEXT:  cf.cond_br
 5 IF BEGIN DUP WHILE 1 - REPEAT THEN
 
-\ bb25: IF true branch -> jump to begin/while header
-\ CHECK:     ^bb25(%{{.*}}: !forth.stack):
-\ CHECK-NEXT:  cf.br ^bb27
+\ bb19: IF true branch -> jump to begin/while header
+\ CHECK:     ^bb19(%{{.*}}: !forth.stack):
+\ CHECK-NEXT:  cf.br ^bb21
 
-\ bb26: IF false branch (and WHILE exit) -> jump to BEGIN/UNTIL
-\ CHECK:     ^bb26(%{{.*}}: !forth.stack):
-\ CHECK-NEXT:  cf.br ^bb30
+\ bb20: IF false branch (and WHILE exit) -> jump to BEGIN/UNTIL
+\ CHECK:     ^bb20(%{{.*}}: !forth.stack):
+\ CHECK-NEXT:  cf.br ^bb24
 
 \ WHILE condition: DUP + pop_flag
-\ CHECK:     ^bb27(%{{.*}}: !forth.stack):
+\ CHECK:     ^bb21(%{{.*}}: !forth.stack):
 \ CHECK:       forth.dup
 \ CHECK:       forth.pop_flag
 \ CHECK-NEXT:  cf.cond_br
 
 \ WHILE body: 1 -
-\ CHECK:     ^bb28(%[[B28:.*]]: !forth.stack):
-\ CHECK-NEXT:  %{{.*}} = forth.literal %[[B28]] 1
+\ CHECK:     ^bb22(%[[B22:.*]]: !forth.stack):
+\ CHECK-NEXT:  %{{.*}} = forth.literal %[[B22]] 1
 \ CHECK-NEXT:  %{{.*}} = forth.sub
 
 \ === IF inside BEGIN/UNTIL ===
 \ BEGIN/UNTIL header: DUP 10 <
-\ CHECK:     ^bb30(%{{.*}}: !forth.stack):
+\ CHECK:     ^bb24(%{{.*}}: !forth.stack):
 \ CHECK:       forth.dup
 \ CHECK:       forth.literal %{{.*}} 10
 \ CHECK-NEXT:  %{{.*}} = forth.lt
 BEGIN DUP 10 < IF 1 + THEN DUP 20 = UNTIL
 
 \ IF true branch: 1 +
-\ CHECK:     ^bb31(%[[B31:.*]]: !forth.stack):
-\ CHECK-NEXT:  %{{.*}} = forth.literal %[[B31]] 1
+\ CHECK:     ^bb25(%[[B25:.*]]: !forth.stack):
+\ CHECK-NEXT:  %{{.*}} = forth.literal %[[B25]] 1
 \ CHECK-NEXT:  %{{.*}} = forth.add
 
 \ UNTIL condition: DUP 20 =
-\ CHECK:     ^bb32(%{{.*}}: !forth.stack):
+\ CHECK:     ^bb26(%{{.*}}: !forth.stack):
 \ CHECK:       forth.dup
 \ CHECK:       forth.literal %{{.*}} 20
 \ CHECK-NEXT:  %{{.*}} = forth.eq
diff --git a/test/Translation/Forth/plus-loop-negative.forth b/test/Translation/Forth/plus-loop-negative.forth
new file mode 100644
index 0000000..233af08
--- /dev/null
+++ b/test/Translation/Forth/plus-loop-negative.forth
@@ -0,0 +1,24 @@
+\ RUN: %warpforth-translate --forth-to-mlir %s | %FileCheck %s
+
+\ Verify +LOOP with negative step uses crossing test (handles negative direction)
+
+\ CHECK:       %[[S0:.*]] = forth.stack !forth.stack
+\ CHECK-NEXT:  %[[S1:.*]] = forth.literal %[[S0]] 0 : !forth.stack -> !forth.stack
+\ CHECK-NEXT:  %[[S2:.*]] = forth.literal %[[S1]] 10 : !forth.stack -> !forth.stack
+\ CHECK-NEXT:  %[[OS:.*]], %[[VAL:.*]] = forth.pop %[[S2]] : !forth.stack -> !forth.stack, i64
+\ CHECK-NEXT:  %[[OS2:.*]], %[[LIM:.*]] = forth.pop %[[OS]] : !forth.stack -> !forth.stack, i64
+\ CHECK:       cf.br ^bb1(%[[OS2]] : !forth.stack)
+\ CHECK:     ^bb1(%[[B1:.*]]: !forth.stack):
+\ CHECK:       %[[STEP_S:.*]] = forth.literal %[[B1]] -1 : !forth.stack -> !forth.stack
+\ CHECK-NEXT:  %[[POP_S:.*]], %[[STEP:.*]] = forth.pop %[[STEP_S]] : !forth.stack -> !forth.stack, i64
+\ CHECK:       %[[OLD:.*]] = memref.load
+\ CHECK:       %[[NEW:.*]] = arith.addi %[[OLD]], %[[STEP]] : i64
+\ CHECK:       %[[D1:.*]] = arith.subi %[[OLD]], %[[LIM]] : i64
+\ CHECK-NEXT:  %[[D2:.*]] = arith.subi %[[NEW]], %[[LIM]] : i64
+\ CHECK-NEXT:  %[[XOR:.*]] = arith.xori %[[D1]], %[[D2]] : i64
+\ CHECK-NEXT:  %[[ZERO:.*]] = arith.constant 0 : i64
+\ CHECK-NEXT:  %[[CROSSED:.*]] = arith.cmpi slt, %[[XOR]], %[[ZERO]] : i64
+\ CHECK-NEXT:  cf.cond_br %[[CROSSED]], ^bb2(%[[POP_S]] : !forth.stack), ^bb1(%[[POP_S]] : !forth.stack)
+\ CHECK:     ^bb2(%{{.*}}: !forth.stack):
+\ CHECK-NEXT:  return
+0 10 DO -1 +LOOP
diff --git a/test/Translation/Forth/plus-loop-without-do-error.forth b/test/Translation/Forth/plus-loop-without-do-error.forth
new file mode 100644
index 0000000..06bc674
--- /dev/null
+++ b/test/Translation/Forth/plus-loop-without-do-error.forth
@@ -0,0 +1,3 @@
+\ RUN: %not %warpforth-translate --forth-to-mlir %s 2>&1 | %FileCheck %s
+\ CHECK: +LOOP without matching DO
++LOOP
diff --git a/test/Translation/Forth/plus-loop.forth b/test/Translation/Forth/plus-loop.forth
new file mode 100644
index 0000000..ac7f4ac
--- /dev/null
+++ b/test/Translation/Forth/plus-loop.forth
@@ -0,0 +1,29 @@
+\ RUN: %warpforth-translate --forth-to-mlir %s | %FileCheck %s
+
+\ Verify +LOOP pops step from data stack and uses it as increment
+
+\ CHECK:       %[[S0:.*]] = forth.stack !forth.stack
+\ CHECK-NEXT:  %[[S1:.*]] = forth.literal %[[S0]] 10 : !forth.stack -> !forth.stack
+\ CHECK-NEXT:  %[[S2:.*]] = forth.literal %[[S1]] 0 : !forth.stack -> !forth.stack
+\ CHECK-NEXT:  %[[OS:.*]], %[[VAL:.*]] = forth.pop %[[S2]] : !forth.stack -> !forth.stack, i64
+\ CHECK-NEXT:  %[[OS2:.*]], %[[LIM:.*]] = forth.pop %[[OS]] : !forth.stack -> !forth.stack, i64
+\ CHECK-NEXT:  %[[ALLOCA:.*]] = memref.alloca() : memref<1xi64>
+\ CHECK-NEXT:  %[[C0:.*]] = arith.constant 0 : index
+\ CHECK-NEXT:  memref.store %[[VAL]], %[[ALLOCA]][%[[C0]]] : memref<1xi64>
+\ CHECK-NEXT:  cf.br ^bb1(%[[OS2]] : !forth.stack)
+\ CHECK:     ^bb1(%[[B1:.*]]: !forth.stack):
+\ CHECK:       %[[STEP_S:.*]] = forth.literal %[[B1]] 2 : !forth.stack -> !forth.stack
+\ CHECK-NEXT:  %[[POP_S:.*]], %[[STEP:.*]] = forth.pop %[[STEP_S]] : !forth.stack -> !forth.stack, i64
+\ CHECK-NEXT:  %[[C0_2:.*]] = arith.constant 0 : index
+\ CHECK-NEXT:  %[[OLD:.*]] = memref.load %[[ALLOCA]][%[[C0_2]]] : memref<1xi64>
+\ CHECK-NEXT:  %[[NEW:.*]] = arith.addi %[[OLD]], %[[STEP]] : i64
+\ CHECK-NEXT:  memref.store %[[NEW]], %[[ALLOCA]][%[[C0_2]]] : memref<1xi64>
+\ CHECK-NEXT:  %[[D1:.*]] = arith.subi %[[OLD]], %[[LIM]] : i64
+\ CHECK-NEXT:  %[[D2:.*]] = arith.subi %[[NEW]], %[[LIM]] : i64
+\ CHECK-NEXT:  %[[XOR:.*]] = arith.xori %[[D1]], %[[D2]] : i64
+\ CHECK-NEXT:  %[[ZERO:.*]] = arith.constant 0 : i64
+\ CHECK-NEXT:  %[[CROSSED:.*]] = arith.cmpi slt, %[[XOR]], %[[ZERO]] : i64
+\ CHECK-NEXT:  cf.cond_br %[[CROSSED]], ^bb2(%[[POP_S]] : !forth.stack), ^bb1(%[[POP_S]] : !forth.stack)
+\ CHECK:     ^bb2(%[[B2:.*]]: !forth.stack):
+\ CHECK-NEXT:  return
+10 0 DO 2 +LOOP
diff --git a/test/Translation/Forth/unloop-exit.forth b/test/Translation/Forth/unloop-exit.forth
index 1718718..e5058a4 100644
--- a/test/Translation/Forth/unloop-exit.forth
+++ b/test/Translation/Forth/unloop-exit.forth
@@ -4,13 +4,11 @@
 
 \ CHECK: func.func private @FIND_FIVE(%{{.*}}: !forth.stack) -> !forth.stack
 \ CHECK: memref.alloca
-\ CHECK: cf.br ^bb[[#CHECK:]]
-\ CHECK: ^bb[[#CHECK]](%{{.*}}: !forth.stack):
-\ CHECK: cf.cond_br %{{.*}}, ^bb[[#BODY:]](%{{.*}}), ^bb[[#EXIT:]](%{{.*}})
+\ CHECK: cf.br ^bb[[#BODY:]]
 \ CHECK: ^bb[[#BODY]](%{{.*}}: !forth.stack):
 \ CHECK: forth.eq
 \ CHECK: cf.cond_br %{{.*}}, ^bb[[#THEN:]](%{{.*}}), ^bb[[#ENDIF:]](%{{.*}})
-\ CHECK: ^bb[[#EXIT]](%{{.*}}: !forth.stack):
+\ CHECK: ^bb[[#EXIT:]](%{{.*}}: !forth.stack):
 \ CHECK: return
 \ CHECK: ^bb[[#THEN]](%[[T:.*]]: !forth.stack):
 \ CHECK: cf.cond_br %true, ^bb[[#RET:]](%[[T]]{{.*}})