From 0519a95d2bdba53ad8ea77d058960945494adad1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?=
Date: Thu, 29 Jan 2026 16:34:29 +0100
Subject: [PATCH 1/2] LoRA: Optimise LoKr at runtime

---
 ggml_extend.hpp |  83 +++++++++++++++++++++++++++++++++-
 lora.hpp        | 116 ++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 195 insertions(+), 4 deletions(-)

diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 7dac03738..d7f0e8e9a 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -1577,7 +1577,7 @@ struct WeightAdapter {
             bool force_prec_f32 = false;
             float scale = 1.f;
         } linear;
-        struct {
+        struct conv2d_params_t {
             int s0 = 1;
             int s1 = 1;
             int p0 = 0;
@@ -2630,4 +2630,85 @@ class MultiheadAttention : public GGMLBlock {
     }
 };
 
+__STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward(
+    struct ggml_context* ctx,
+    struct ggml_tensor* h,    // Input: [q, batch] or [W, H, q, batch]
+    struct ggml_tensor* w1,   // Outer C (Full rank)
+    struct ggml_tensor* w1a,  // Outer A (Low rank part 1)
+    struct ggml_tensor* w1b,  // Outer B (Low rank part 2)
+    struct ggml_tensor* w2,   // Inner BA (Full rank)
+    struct ggml_tensor* w2a,  // Inner A (Low rank part 1)
+    struct ggml_tensor* w2b,  // Inner B (Low rank part 2)
+    bool is_conv,
+    WeightAdapter::ForwardParams::conv2d_params_t conv_params,
+    float scale) {
+
+    GGML_ASSERT((w1 != NULL || (w1a != NULL && w1b != NULL)));
+    GGML_ASSERT((w2 != NULL || (w2a != NULL && w2b != NULL)));
+
+    int vq = (w2 != NULL) ? w2->ne[0] : w2a->ne[0];
+    int vp = (w2 != NULL) ? w2->ne[1] : (is_conv ? w2b->ne[3] : w2b->ne[1]);
+
+    int uq = (w1 != NULL) ? w1->ne[0] : w1a->ne[0];
+    int up = (w1 != NULL) ? w1->ne[1] : w1b->ne[1];
+
+    int q_expected = uq * vq;
+    int q_actual = is_conv ? h->ne[2] : h->ne[0];
+    GGML_ASSERT(q_actual == q_expected && "Input dimension mismatch for LoKR split");
+
+    struct ggml_tensor* hb;
+
+    if (!is_conv) {
+        // Treat input as a grid: [vq, uq * batch]
+        struct ggml_tensor* h_mat = ggml_reshape_2d(ctx, h, vq, uq * h->ne[1]);
+
+        if (w2 != NULL) {
+            hb = ggml_mul_mat(ctx, w2, h_mat);
+        } else {
+            hb = ggml_mul_mat(ctx, w2b, ggml_mul_mat(ctx, w2a, h_mat));
+        }
+    } else {
+        // Reshape so uq is in the batch dimension: [W, H, vq, uq * batch]
+        struct ggml_tensor* h_grouped = ggml_reshape_4d(ctx, h, h->ne[0], h->ne[1], vq, uq * h->ne[3]);
+
+        if (w2 != NULL) {
+            hb = ggml_ext_conv_2d(ctx, w2, h_grouped,nullptr, conv_params.s0, conv_params.s1, conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1, conv_params.direct, conv_params.circular_x, conv_params.circular_y, conv_params.scale);
+        } else {
+            // w2a is [1, 1, vq, rank], w2b is [kw, kh, rank, vp]
+            struct ggml_tensor* tmp = ggml_conv_2d(ctx, w2a, h_grouped, 1, 1, 0, 0, 1, 1);
+            hb = ggml_ext_conv_2d(ctx, w2b, tmp, nullptr, conv_params.s0, conv_params.s1, conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1, conv_params.direct, conv_params.circular_x, conv_params.circular_y, conv_params.scale);
+        }
+    }
+
+    // At this point hb is [W_out, H_out, vp, uq * batch]
+    // We reshape to isolate uq for matrix multiplication
+    int w_out = is_conv ? hb->ne[0] : 1;
+    int h_out = is_conv ? hb->ne[1] : 1;
+    int batch = is_conv ? h->ne[3] : h->ne[1];
+
+    // Rearrange to [vp, uq, spatial*batch]
+    struct ggml_tensor* hb_unbundled = ggml_reshape_3d(ctx, hb, vp, uq, w_out * h_out * batch);
+
+    // Transpose so uq is ne[0] for ggml_mul_mat
+    struct ggml_tensor* hb_t = ggml_transpose(ctx, hb_unbundled);
+
+    struct ggml_tensor* hc;
+    if (w1 != NULL) {
+        hc = ggml_mul_mat(ctx, w1, hb_t);
+    } else {
+        hc = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_t));
+    }
+
+    struct ggml_tensor* hc_t = ggml_transpose(ctx, hc);
+    struct ggml_tensor* out;
+    if (is_conv) {
+        out = ggml_reshape_4d(ctx, hc_t, w_out, h_out, up * vp, batch);
+    } else {
+
+        out = ggml_reshape_2d(ctx, ggml_cont(ctx, hc_t), up * vp, batch);
+    }
+
+    return ggml_scale(ctx, out, scale);
+}
+
 #endif // __GGML_EXTEND__HPP__
diff --git a/lora.hpp b/lora.hpp
index e5d9906ff..fd461086a 100644
--- a/lora.hpp
+++ b/lora.hpp
@@ -483,7 +483,7 @@ struct LoraModel : public GGMLRunner {
             diff = get_loha_weight_diff(model_tensor_name, ctx);
         }
         // lokr
-        if (diff == nullptr) {
+        if (diff == nullptr && with_lora) {
             diff = get_lokr_weight_diff(model_tensor_name, ctx);
         }
         if (diff != nullptr) {
@@ -501,6 +501,8 @@ struct LoraModel : public GGMLRunner {
         return diff;
     }
 
+
+
     ggml_tensor* get_out_diff(ggml_context* ctx,
                               ggml_tensor* x,
                               WeightAdapter::ForwardParams forward_params,
@@ -514,6 +516,115 @@ struct LoraModel : public GGMLRunner {
         } else {
             key = model_tensor_name + "." + std::to_string(index);
         }
+        bool is_conv2d = forward_params.op_type == WeightAdapter::ForwardParams::op_type_t::OP_CONV2D;
+
+
+        std::string lokr_w1_name = "lora." + key + ".lokr_w1";
+        std::string lokr_w1_a_name = "lora." + key + ".lokr_w1_a";
+        // if either of these is found, then we have a lokr lora
+        auto iter = lora_tensors.find(lokr_w1_name);
+        auto iter_a = lora_tensors.find(lokr_w1_a_name);
+        if (iter != lora_tensors.end() || iter_a != lora_tensors.end()) {
+            std::string lokr_w1_b_name = "lora." + key + ".lokr_w1_b";
+            std::string lokr_w2_name = "lora." + key + ".lokr_w2";
+            std::string lokr_w2_a_name = "lora." + key + ".lokr_w2_a";
+            std::string lokr_w2_b_name = "lora." + key + ".lokr_w2_b";
+            std::string alpha_name = "lora." + key + ".alpha";
+ key + ".alpha"; + + ggml_tensor* lokr_w1 = nullptr; + ggml_tensor* lokr_w1_a = nullptr; + ggml_tensor* lokr_w1_b = nullptr; + ggml_tensor* lokr_w2 = nullptr; + ggml_tensor* lokr_w2_a = nullptr; + ggml_tensor* lokr_w2_b = nullptr; + + if (iter != lora_tensors.end()) { + lokr_w1 = iter->second; + if (is_conv2d && lokr_w1->type != GGML_TYPE_F16) { + lokr_w1 = ggml_cast(ctx, lokr_w1, GGML_TYPE_F16); + } + } + iter = iter_a; + if (iter != lora_tensors.end()) { + lokr_w1_a = iter->second; + if (is_conv2d && lokr_w1_a->type != GGML_TYPE_F16) { + lokr_w1_a = ggml_cast(ctx, lokr_w1_a, GGML_TYPE_F16); + } + } + iter = lora_tensors.find(lokr_w1_b_name); + if (iter != lora_tensors.end()) { + lokr_w1_b = iter->second; + if (is_conv2d && lokr_w1_b->type != GGML_TYPE_F16) { + lokr_w1_b = ggml_cast(ctx, lokr_w1_b, GGML_TYPE_F16); + } + } + + iter = lora_tensors.find(lokr_w2_name); + if (iter != lora_tensors.end()) { + lokr_w2 = iter->second; + if (is_conv2d && lokr_w2->type != GGML_TYPE_F16) { + lokr_w2 = ggml_cast(ctx, lokr_w2, GGML_TYPE_F16); + } + } + iter = lora_tensors.find(lokr_w2_a_name); + if (iter != lora_tensors.end()) { + lokr_w2_a = iter->second; + if (is_conv2d && lokr_w2_a->type != GGML_TYPE_F16) { + lokr_w2_a = ggml_cast(ctx, lokr_w2_a, GGML_TYPE_F16); + } + } + iter = lora_tensors.find(lokr_w2_b_name); + if (iter != lora_tensors.end()) { + lokr_w2_b = iter->second; + if (is_conv2d && lokr_w2_b->type != GGML_TYPE_F16) { + lokr_w2_b = ggml_cast(ctx, lokr_w2_b, GGML_TYPE_F16); + } + } + + int rank = 1; + if (lokr_w1_b) { + rank = lokr_w1_b->ne[ggml_n_dims(lokr_w1_b) - 1]; + } + if (lokr_w2_b) { + rank = lokr_w2_b->ne[ggml_n_dims(lokr_w2_b) - 1]; + } + + float scale_value = 1.0f; + iter = lora_tensors.find(alpha_name); + if (iter != lora_tensors.end()) { + float alpha = ggml_ext_backend_tensor_get_f32(iter->second); + scale_value = alpha / rank; + applied_lora_tensors.insert(alpha_name); + } + + if (rank == 1) { + scale_value = 1.0f; + } + scale_value *= multiplier; + + auto curr_out_diff = ggml_ext_lokr_forward(ctx, x, lokr_w1, lokr_w1_a, lokr_w1_b, lokr_w2, lokr_w2_a, lokr_w2_b, is_conv2d, forward_params.conv2d, scale_value); + if (out_diff == nullptr) { + out_diff = curr_out_diff; + } else { + out_diff = ggml_concat(ctx, out_diff, curr_out_diff, 0); + } + + if(lokr_w1) applied_lora_tensors.insert(lokr_w1_name); + if(lokr_w1_a) applied_lora_tensors.insert(lokr_w1_a_name); + if(lokr_w1_b) applied_lora_tensors.insert(lokr_w1_b_name); + if(lokr_w2) applied_lora_tensors.insert(lokr_w2_name); + if(lokr_w2_a) applied_lora_tensors.insert(lokr_w2_name); + if(lokr_w2_b) applied_lora_tensors.insert(lokr_w2_b_name); + applied_lora_tensors.insert(alpha_name); + + + index++; + continue; + } + + // not a lork, normal lora path + + std::string lora_down_name = "lora." + key + ".lora_down"; std::string lora_up_name = "lora." 
+ key + ".lora_up"; @@ -525,9 +636,8 @@ struct LoraModel : public GGMLRunner { ggml_tensor* lora_mid = nullptr; ggml_tensor* lora_down = nullptr; - bool is_conv2d = forward_params.op_type == WeightAdapter::ForwardParams::op_type_t::OP_CONV2D; - auto iter = lora_tensors.find(lora_up_name); + iter = lora_tensors.find(lora_up_name); if (iter != lora_tensors.end()) { lora_up = iter->second; if (is_conv2d && lora_up->type != GGML_TYPE_F16) { From b6c2f868d529db6d5718d907834a1d2f9fd53cb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Fri, 30 Jan 2026 21:06:12 +0100 Subject: [PATCH 2/2] lokr: fix convs --- ggml_extend.hpp | 96 +++++++++++++++++++++++++++++-------------------- 1 file changed, 57 insertions(+), 39 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index d7f0e8e9a..efd320f0a 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -2646,69 +2646,87 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_lokr_forward( GGML_ASSERT((w1 != NULL || (w1a != NULL && w1b != NULL))); GGML_ASSERT((w2 != NULL || (w2a != NULL && w2b != NULL))); - int vq = (w2 != NULL) ? w2->ne[0] : w2a->ne[0]; - int vp = (w2 != NULL) ? w2->ne[1] : (is_conv ? w2b->ne[3] : w2b->ne[1]); + int uq = (w1 != NULL) ? (int)w1->ne[0] : (int)w1a->ne[0]; + int up = (w1 != NULL) ? (int)w1->ne[1] : (int)w1b->ne[1]; - int uq = (w1 != NULL) ? w1->ne[0] : w1a->ne[0]; - int up = (w1 != NULL) ? w1->ne[1] : w1b->ne[1]; + int q_actual = is_conv ? (int)h->ne[2] : (int)h->ne[0]; + int vq = q_actual / uq; - int q_expected = uq * vq; - int q_actual = is_conv ? h->ne[2] : h->ne[0]; - GGML_ASSERT(q_actual == q_expected && "Input dimension mismatch for LoKR split"); + int vp = (w2 != NULL) ? (is_conv ? (int)w2->ne[3] : (int)w2->ne[1]) + : (int)w2a->ne[1]; + GGML_ASSERT(q_actual == (uq * vq) && "Input dimension mismatch for LoKR split"); struct ggml_tensor* hb; if (!is_conv) { - // Treat input as a grid: [vq, uq * batch] - struct ggml_tensor* h_mat = ggml_reshape_2d(ctx, h, vq, uq * h->ne[1]); + int batch = (int)h->ne[1]; + struct ggml_tensor* h_mat = ggml_reshape_2d(ctx, h, vq, uq * batch); if (w2 != NULL) { hb = ggml_mul_mat(ctx, w2, h_mat); } else { hb = ggml_mul_mat(ctx, w2b, ggml_mul_mat(ctx, w2a, h_mat)); } + + struct ggml_tensor* hb_unbundled = ggml_reshape_3d(ctx, hb, vp, uq, batch); + struct ggml_tensor* hb_t = ggml_transpose(ctx, hb_unbundled); + + struct ggml_tensor* hc; + if (w1 != NULL) { + hc = ggml_mul_mat(ctx, w1, hb_t); + } else { + hc = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_t)); + } + + struct ggml_tensor* hc_t = ggml_transpose(ctx, hc); + struct ggml_tensor* out = ggml_reshape_2d(ctx, ggml_cont(ctx, hc_t), up * vp, batch); + return ggml_scale(ctx, out, scale); + } else { - // Reshape so uq is in the batch dimension: [W, H, vq, uq * batch] - struct ggml_tensor* h_grouped = ggml_reshape_4d(ctx, h, h->ne[0], h->ne[1], vq, uq * h->ne[3]); + int batch = (int)h->ne[3]; + + // Reshape input: [W, H, vq*uq, batch] -> [W, H, vq, uq * batch] + struct ggml_tensor* h_grouped = ggml_reshape_4d(ctx, h, h->ne[0], h->ne[1], vq, uq * batch); if (w2 != NULL) { - hb = ggml_ext_conv_2d(ctx, w2, h_grouped,nullptr, conv_params.s0, conv_params.s1, conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1, conv_params.direct, conv_params.circular_x, conv_params.circular_y, conv_params.scale); + hb = ggml_conv_2d(ctx, w2, h_grouped, conv_params.s0, conv_params.s1, + conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1); } else { - // w2a is [1, 1, vq, rank], w2b is [kw, kh, rank, vp] - struct 
-            hb = ggml_ext_conv_2d(ctx, w2b, tmp, nullptr, conv_params.s0, conv_params.s1, conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1, conv_params.direct, conv_params.circular_x, conv_params.circular_y, conv_params.scale);
+            // Inner low-rank pair: w2b is the spatial down-projection kernel, w2a is the 1x1 up-projection
+            // w2b is reshaped to [k, k, vq, rank], w2a to [1, 1, rank, vp]
+            int rank = (int)w2b->ne[1];
+            int k = (int)sqrt(w2b->ne[0] / vq);
+
+            struct ggml_tensor* w2b_4d = (ggml_n_dims(w2b) < 3) ? ggml_reshape_4d(ctx, w2b, k, k, vq, rank) : w2b;
+            struct ggml_tensor* w2a_4d = (ggml_n_dims(w2a) < 3) ? ggml_reshape_4d(ctx, w2a, 1, 1, rank, vp) : w2a;
+
+            struct ggml_tensor* ha = ggml_conv_2d(ctx, w2b_4d, h_grouped, conv_params.s0, conv_params.s1,
+                                                  conv_params.p0, conv_params.p1, conv_params.d0, conv_params.d1);
+            hb = ggml_conv_2d(ctx, w2a_4d, ha, 1, 1, 0, 0, 1, 1);
         }
-    }
 
-    // At this point hb is [W_out, H_out, vp, uq * batch]
-    // We reshape to isolate uq for matrix multiplication
-    int w_out = is_conv ? hb->ne[0] : 1;
-    int h_out = is_conv ? hb->ne[1] : 1;
-    int batch = is_conv ? h->ne[3] : h->ne[1];
+        int w_out = (int)hb->ne[0];
+        int h_out = (int)hb->ne[1];
 
-    // Rearrange to [vp, uq, spatial*batch]
-    struct ggml_tensor* hb_unbundled = ggml_reshape_3d(ctx, hb, vp, uq, w_out * h_out * batch);
+        struct ggml_tensor* hb_flat = ggml_reshape_3d(ctx, hb, w_out * h_out * vp, uq, batch);
+        struct ggml_tensor* hb_t = ggml_transpose(ctx, hb_flat);
 
-    // Transpose so uq is ne[0] for ggml_mul_mat
-    struct ggml_tensor* hb_t = ggml_transpose(ctx, hb_unbundled);
+        struct ggml_tensor* hc;
+        struct ggml_tensor* w1_mat = (w1 != NULL) ? ggml_reshape_2d(ctx, w1, uq, up) : NULL;
 
-    struct ggml_tensor* hc;
-    if (w1 != NULL) {
-        hc = ggml_mul_mat(ctx, w1, hb_t);
-    } else {
-        hc = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_t));
-    }
+        if (w1_mat != NULL) {
+            hc = ggml_mul_mat(ctx, w1_mat, hb_t);
+        } else {
+            hc = ggml_mul_mat(ctx, w1b, ggml_mul_mat(ctx, w1a, hb_t));
+        }
 
-    struct ggml_tensor* hc_t = ggml_transpose(ctx, hc);
-    struct ggml_tensor* out;
-    if (is_conv) {
-        out = ggml_reshape_4d(ctx, hc_t, w_out, h_out, up * vp, batch);
-    } else {
+        struct ggml_tensor* hc_t = ggml_transpose(ctx, hc);
+        struct ggml_tensor* hc_res = ggml_reshape_4d(ctx, ggml_cont(ctx, hc_t), vp, w_out * h_out, up, batch);
+        struct ggml_tensor* hc_perm = ggml_permute(ctx, hc_res, 1, 2, 0, 3);
+        struct ggml_tensor* out = ggml_reshape_4d(ctx, ggml_cont(ctx, hc_perm), w_out, h_out, up * vp, batch);
 
-        out = ggml_reshape_2d(ctx, ggml_cont(ctx, hc_t), up * vp, batch);
+        return ggml_scale(ctx, out, scale);
     }
-
-    return ggml_scale(ctx, out, scale);
 }
 
 #endif // __GGML_EXTEND__HPP__
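
Note (not part of the patches above): the optimisation both commits rely on is the Kronecker-product identity. With the input x of length uq*vq read as a [uq x vq] matrix X, (W1 (x) W2) * x is the row-major flattening of W1 * X * W2^T, so the LoKr delta can be applied with two small matmuls instead of building the full [up*vp x uq*vq] weight the way a merge-time weight-diff path has to. The standalone C++ sketch below illustrates the linear (non-conv) case only; the name lokr_apply, the row-major [out x in] layout, and the sizes in main() are illustrative assumptions and do not mirror ggml's tensor layout or the functions in the patch.

// Editorial sketch (hypothetical names, not part of ggml_extend.hpp / lora.hpp):
// apply y = (W1 (x) W2) * x without materialising the Kronecker product.
// W1 is [up x uq] and W2 is [vp x vq], both row-major; x has length uq*vq and is
// read as a [uq x vq] matrix X, so y is the row-major flattening of W1 * X * W2^T.
#include <cstdio>
#include <vector>

static std::vector<float> lokr_apply(const std::vector<float>& W1, int up, int uq,
                                     const std::vector<float>& W2, int vp, int vq,
                                     const std::vector<float>& x) {
    // Inner factor: H(i, r) = sum_c W2(r, c) * X(i, c) -- one small matmul per block i,
    // corresponding to the w2 / w2a,w2b stage of the runtime forward.
    std::vector<float> H(uq * vp, 0.0f);
    for (int i = 0; i < uq; i++)
        for (int r = 0; r < vp; r++)
            for (int c = 0; c < vq; c++)
                H[i * vp + r] += W2[r * vq + c] * x[i * vq + c];

    // Outer factor: y(o, r) = sum_i W1(o, i) * H(i, r) -- mixes the uq blocks,
    // corresponding to the w1 / w1a,w1b stage.
    std::vector<float> y(up * vp, 0.0f);
    for (int o = 0; o < up; o++)
        for (int r = 0; r < vp; r++)
            for (int i = 0; i < uq; i++)
                y[o * vp + r] += W1[o * uq + i] * H[i * vp + r];
    return y;
}

// Self-check against the explicitly materialised Kronecker product on tiny sizes.
int main() {
    const int up = 2, uq = 3, vp = 2, vq = 2;
    std::vector<float> W1(up * uq), W2(vp * vq), x(uq * vq);
    for (size_t k = 0; k < W1.size(); k++) W1[k] = 0.1f * (float)(k + 1);
    for (size_t k = 0; k < W2.size(); k++) W2[k] = 0.2f * (float)(k + 1);
    for (size_t k = 0; k < x.size(); k++)  x[k]  = 1.0f + (float)k;

    std::vector<float> y = lokr_apply(W1, up, uq, W2, vp, vq, x);

    for (int o = 0; o < up; o++) {
        for (int r = 0; r < vp; r++) {
            float ref = 0.0f;  // ref = row (o*vp + r) of (W1 (x) W2) times x
            for (int i = 0; i < uq; i++)
                for (int c = 0; c < vq; c++)
                    ref += W1[o * uq + i] * W2[r * vq + c] * x[i * vq + c];
            std::printf("y[%d] = %.4f (reference %.4f)\n", o * vp + r, y[o * vp + r], ref);
        }
    }
    return 0;
}

The conv branch of ggml_ext_lokr_forward follows the same pattern: the inner factor runs as a convolution over the vq input channels with uq folded into the batch dimension, and the outer factor is then applied as a plain matrix multiplication across the uq groups before the result is reshaped back to [W_out, H_out, up * vp, batch].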