mirror of https://github.com/ollama/ollama.git
llama/compat: add glm-ocr text handler + text-loader load-op hook
glm-ocr (text side):
* Arch rename `glmocr` → `glm4` (incl. KV prefix); upstream supports
GLM-OCR via LLM_ARCH_GLM4 with n_layer=17 (16 main + 1 nextn). We
report n_layer=16 and leave nextn_predict_layers absent — the
Ollama blob doesn't ship the nextn layer's weights.
* M-RoPE: pad `rope.mrope_section` (3 elements) →
`rope.dimension_sections` (4 elements with trailing 0).
* Inject `rope.dimension_count = key_length`.
* Tokenizer pre-tokenizer rename `llama-bpe` → `chatglm-bpe`.
* Tensor renames: `attn_out`→`attn_output`, `post_attn_norm`→
`post_attention_norm`, `post_ffn_norm`→`post_ffw_norm`.
* **Per-block FFN concat**: GLM4 expects fused
`ffn_up.weight: [n_embd, n_ff*2]` (gate || up). Ollama writes
separate `ffn_gate.weight` + `ffn_up.weight` (each `[n_embd, n_ff]`).
Register a load-time concat op that stitches gate+up into the fused
upstream slot, then add a per-block skip-prefix for the orphan
`blk.X.ffn_gate.` so the n_tensors check lines up (a byte-level
sketch of the stitch follows this list).
* Hide embedded `v.*`/`mm.*` from the text loader.
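For illustration, a minimal standalone sketch of the byte-level stitch. It assumes the usual ggml layout (row-major with ne[0] contiguous, so fusing along ne[1] is a plain byte concatenation of the two sources); `read_at` and `concat_load` are hypothetical helpers for this sketch, not the real load-op API:

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Hypothetical helper (sketch only): read `size` bytes at absolute
    // file offset `off` into `dst`.
    static bool read_at(std::FILE * f, long off, void * dst, std::size_t size) {
        return std::fseek(f, off, SEEK_SET) == 0 &&
               std::fread(dst, 1, size, f) == size;
    }

    // Stitch gate || up into one contiguous buffer. Gate goes first so
    // upstream's swiglu, silu(first_half) * second_half, yields
    // silu(gate) * up.
    static bool concat_load(std::FILE * f,
                            long gate_off, std::size_t gate_size,
                            long up_off,   std::size_t up_size,
                            std::vector<char> & dst) {
        dst.resize(gate_size + up_size);
        return read_at(f, gate_off, dst.data(), gate_size) &&
               read_at(f, up_off,   dst.data() + gate_size, up_size);
    }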
This is the first text-side compat that needs custom load-time tensor
data (the FFN concat). Until now load-op support only covered the
clip side. New plumbing:
* `set_loader_path(ml, fname)` — store the model file path on a
per-loader registry, called from the loader constructor.
* `maybe_load_text_tensor(ml, cur, off, buft)` — the text-side
counterpart to `maybe_load_tensor`; looks up the path from the
registry then delegates to the existing load-op machinery.
* Upstream patch grows two new lines: a `set_loader_path` call in
the constructor and a `maybe_load_text_tensor` hook in
`load_all_data` (before the use_mmap branch). A condensed
sketch of the registry pattern follows this list.
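Condensed, the registry is just shared state keyed by the loader pointer. A sketch with a stand-in `Loader` type (the real code keys on `const llama_model_loader *`):

    #include <mutex>
    #include <string>
    #include <unordered_map>

    struct Loader {};  // stand-in for llama_model_loader (sketch only)

    static std::mutex s_mu;
    static std::unordered_map<const Loader *, std::string> s_paths;

    // Called where the file name is in scope (the loader constructor).
    void remember_path(const Loader * l, const char * fname) {
        std::lock_guard<std::mutex> lk(s_mu);
        s_paths[l] = fname ? fname : "";
    }

    // Called at the read site, which only has the loader pointer.
    bool recall_path(const Loader * l, std::string & out) {
        std::lock_guard<std::mutex> lk(s_mu);
        auto it = s_paths.find(l);
        if (it == s_paths.end() || it->second.empty()) return false;
        out = it->second;
        return true;
    }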
Verified: with --no-mmap, glm-ocr's blk.X.ffn_up.weight load fires
the concat op (28MB per block on the 1B variant) and the model emits
coherent text. Through `ollama run` the proper chat template applies.
Note: vision (clip) handler is a follow-up.
parent 5d45391016
commit 7e07653271
3 changed files with 157 additions and 2 deletions
136 llama/compat/llama-ollama-compat.cpp (vendored)
@@ -5,7 +5,9 @@
#include <cstdio>
#include <cstring>
#include <mutex>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
@@ -397,6 +399,115 @@ void handle_llama4(const llama_model_loader * ml, gguf_context * meta, ggml_cont
    add_skip_prefix(ml, "mm.");
}

// =========================================================================
// glm-ocr (text side)
// =========================================================================
//
// Ollama uses arch name "glmocr" / KV prefix "glmocr.*" with 16 blocks.
// Upstream uses "glm4" / "glm4.*" — the GLM-OCR variant of LLM_ARCH_GLM4
// is identified by `n_layer = 17` (16 main + 1 nextn predict layer).
// Ollama drops the nextn layer entirely, so we report n_layer = 16 and
// leave `nextn_predict_layers` absent (defaults to 0 = no nextn path).
//
// Bigger surgery: GLM4 expects fused gate+up MLP weights stored at
// `blk.X.ffn_up.weight` with shape `[n_embd, n_ff*2]`. Ollama writes
// the gate and up halves as separate `ffn_gate.weight` / `ffn_up.weight`
// tensors (each `[n_embd, n_ff]`). We register a concat load op that
// reads gate+up bytes and stitches them into the fused upstream slot.

// Per-block: register a concat load that fuses Ollama's separate
// ffn_gate + ffn_up into upstream's single `blk.X.ffn_up.weight`
// tensor with doubled out dim. Capture source file offsets BEFORE any
// renames invalidate them (same pattern as qwen35moe QKV merge).
void register_glm4_ffn_concat(gguf_context * meta, ggml_context * ctx, int block_idx) {
    char gate_n[64], up_n[64];
    std::snprintf(gate_n, sizeof(gate_n), "blk.%d.ffn_gate.weight", block_idx);
    std::snprintf(up_n,   sizeof(up_n),   "blk.%d.ffn_up.weight",   block_idx);

    if (!ggml_get_tensor(ctx, gate_n) || !ggml_get_tensor(ctx, up_n)) return;

    // GLM4's fused ffn_up has gate as first half, up as second half
    // (so ggml_swiglu's silu(first_half) * second_half gives silu(gate) * up).
    register_concat_load(meta, up_n, {gate_n, up_n});

    if (ggml_tensor * t = ggml_get_tensor(ctx, up_n)) {
        set_tensor_shape(t, {t->ne[0], t->ne[1] * 2});
    }
}

bool detect_ollama_glmocr(const gguf_context * meta) {
    const int64_t arch_kid = gguf_find_key(meta, "general.architecture");
    if (arch_kid < 0) return false;
    return std::strcmp(gguf_get_val_str(meta, arch_kid), "glmocr") == 0;
}

void handle_glmocr(const llama_model_loader * ml, gguf_context * meta,
                   ggml_context * ctx, std::string & arch_name) {
    if (!detect_ollama_glmocr(meta)) return;

    LLAMA_LOG_INFO("%s: detected Ollama-format glmocr GGUF; applying compatibility fixes\n", __func__);

    gguf_set_val_str(meta, "general.architecture", "glm4");
    rename_kv_prefix(meta, "glmocr.", "glm4.");
    arch_name = "glm4";

    // M-RoPE: Ollama writes a 3-element `rope.mrope_section`, upstream expects
    // a 4-element `rope.dimension_sections` (pad trailing 0).
    {
        const int64_t kid = gguf_find_key(meta, "glm4.rope.mrope_section");
        if (kid >= 0 && gguf_get_arr_n(meta, kid) == 3) {
            const auto * src = static_cast<const int32_t *>(gguf_get_arr_data(meta, kid));
            const int32_t padded[4] = { src[0], src[1], src[2], 0 };
            gguf_set_arr_data(meta, "glm4.rope.dimension_sections",
                              GGUF_TYPE_INT32, padded, 4);
        }
    }

    // Inject `rope.dimension_count` from key_length (used as the rope dim).
    if (!has_key(meta, "glm4.rope.dimension_count")) {
        const int64_t kid = gguf_find_key(meta, "glm4.attention.key_length");
        if (kid >= 0) {
            gguf_set_val_u32(meta, "glm4.rope.dimension_count",
                             gguf_get_val_u32(meta, kid));
        }
    }

    // Tokenizer pre-tokenizer: Ollama wrote `llama-bpe`, but glm-ocr uses
    // `chatglm-bpe` (different regex split — wrong pre-tokenization can
    // fragment GLM's special tokens).
    {
        const int64_t kid = gguf_find_key(meta, "tokenizer.ggml.pre");
        if (kid >= 0) {
            const char * cur = gguf_get_val_str(meta, kid);
            if (cur && std::strcmp(cur, "chatglm-bpe") != 0) {
                gguf_set_val_str(meta, "tokenizer.ggml.pre", "chatglm-bpe");
            }
        }
    }

    // Tensor renames (substring): each leaf appears once per block and
    // doesn't overlap the others.
    rename_tensors_containing(meta, ctx, ".attn_out",       ".attn_output");
    rename_tensors_containing(meta, ctx, ".post_attn_norm", ".post_attention_norm");
    rename_tensors_containing(meta, ctx, ".post_ffn_norm",  ".post_ffw_norm");

    // Fuse ffn_gate + ffn_up → ffn_up[:, 2*n_ff] for every block, then mark
    // the orphan ffn_gate tensors as skip so n_tensors lines up.
    {
        const int64_t n_blk_kid = gguf_find_key(meta, "glm4.block_count");
        const uint32_t n_blocks = n_blk_kid >= 0 ? gguf_get_val_u32(meta, n_blk_kid) : 16;
        for (uint32_t b = 0; b < n_blocks; ++b) {
            register_glm4_ffn_concat(meta, ctx, (int) b);
            char skip_pref[64];
            std::snprintf(skip_pref, sizeof(skip_pref), "blk.%u.ffn_gate.", b);
            add_skip_prefix(ml, skip_pref);
        }
    }

    // Hide embedded vision + projector tensors from the text loader.
    add_skip_prefix(ml, "v.");
    add_skip_prefix(ml, "mm.");
}

// =========================================================================
// gpt-oss (text only)
// =========================================================================
@@ -1137,6 +1248,7 @@ void translate_metadata(const llama_model_loader * ml,
    if (arch_name == "deepseekocr")    handle_deepseekocr   (ml, meta, ctx, arch_name);
    if (arch_name == "nemotron_h_moe") handle_nemotron_h_moe(ml, meta, ctx);
    if (arch_name == "llama4")         handle_llama4        (ml, meta, ctx);
    if (arch_name == "glmocr")         handle_glmocr        (ml, meta, ctx, arch_name);
    // Dispatch. Add more arches as they are wired up.
}
@@ -1198,4 +1310,28 @@ bool maybe_load_tensor(ggml_tensor * cur,
    return true;
}

namespace {
std::mutex g_loader_path_mutex;
std::unordered_map<const llama_model_loader *, std::string> g_loader_paths;
}

void set_loader_path(const llama_model_loader * ml, const char * fname) {
    std::lock_guard<std::mutex> lk(g_loader_path_mutex);
    g_loader_paths[ml] = fname ? fname : "";
}

bool maybe_load_text_tensor(const llama_model_loader * ml,
                            ggml_tensor * cur,
                            size_t file_offset,
                            ggml_backend_buffer_type_t buft) {
    std::string path;
    {
        std::lock_guard<std::mutex> lk(g_loader_path_mutex);
        auto it = g_loader_paths.find(ml);
        if (it == g_loader_paths.end() || it->second.empty()) return false;
        path = it->second;
    }
    return maybe_load_tensor(cur, path.c_str(), file_offset, buft);
}

} // namespace llama_ollama_compat
11 llama/compat/llama-ollama-compat.h (vendored)
@@ -60,4 +60,15 @@ bool maybe_load_tensor(ggml_tensor * cur,
                       size_t file_offset,
                       ggml_backend_buffer_type_t buft);

// Same as maybe_load_tensor but for the text-side llama_model_loader,
// which doesn't have the clip loader's `fname` in scope at the read
// site. Looks up the model's file path from a per-loader registry
// populated by `set_loader_path` (called from the model loader's
// constructor right after `fname` is in scope).
bool maybe_load_text_tensor(const llama_model_loader * ml,
                            ggml_tensor * cur,
                            size_t file_offset,
                            ggml_backend_buffer_type_t buft);
void set_loader_path(const llama_model_loader * ml, const char * fname);

} // namespace llama_ollama_compat
@@ -10,10 +10,11 @@ index 4e65a45a5..75836c683 100644
#include <algorithm>
#include <array>
-@@ -549,6 +550,7 @@ llama_model_loader::llama_model_loader(
+@@ -549,6 +550,8 @@ llama_model_loader::llama_model_loader(
    }

    get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
+   llama_ollama_compat::set_loader_path(this, fname.c_str());
+   llama_ollama_compat::translate_metadata(this, metadata, ctx, arch_name);
    llm_kv = LLM_KV(llm_arch_from_string(arch_name));

@@ -38,6 +39,13 @@ index 4e65a45a5..75836c683 100644
    // make sure there is no duplicated tensor names
    if (weights_map.find(tensor_name) != weights_map.end()) {
        throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur)));
@@ -1535,3 +1543,6 @@ bool llama_model_loader::load_all_data(
        size_t n_size = ggml_nbytes(cur);

+       if (llama_ollama_compat::maybe_load_text_tensor(this, cur, weight->offs, cur->buffer ? ggml_backend_buffer_get_type(cur->buffer) : nullptr)) {
+           continue;
+       }
        if (use_mmap) {
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index f0e8786b6..35defa89d 100644
--- a/tools/mtmd/clip.cpp