llama/compat: shrink clip.cpp injection from 18 lines to 1

The clip.cpp tensor-read loop was the fattest hook in the patch — it
duplicated the host-vs-device buffer dispatch around a call into the
compat layer. Move that dispatch into our code (maybe_load_tensor),
so the upstream patch is a single conditional call.

Net: upstream patch drops from 48 lines across 6 files to 34 lines.
Every remaining edit is either a 1-line include, a 1-line function call,
or the gguf_rename_tensor shim (which accesses gguf_context internals
and has to live in gguf.cpp).

Verified end-to-end: text + vision both still correct after rebuild.
This commit is contained in:
jmorganca 2026-04-19 10:50:34 -07:00
parent 8c2c9d4c89
commit 021389f7bb
3 changed files with 38 additions and 50 deletions

View file

@@ -393,50 +393,53 @@ void translate_clip_metadata(gguf_context * meta, ggml_context * ctx) {
}
}
bool supply_promoted_tensor_data(const ggml_tensor * cur,
const char * source_file,
size_t file_offset,
std::vector<uint8_t> & out) {
bool maybe_load_tensor(ggml_tensor * cur,
const char * source_file,
size_t file_offset,
ggml_backend_buffer_type_t buft) {
// Check registry: is this tensor marked for F16→F32 promotion?
{
std::lock_guard<std::mutex> lk(g_promote_mutex);
if (g_promote_f16_to_f32.find(ggml_get_name(cur)) == g_promote_f16_to_f32.end()) {
return false;
}
}
// cur->type is F32 (after promotion). Source bytes are F16 at file_offset.
if (cur->type != GGML_TYPE_F32) {
return false;
}
// Destination was promoted to F32 by translate_clip_metadata. Source
// bytes on disk are still F16 at file_offset.
if (cur->type != GGML_TYPE_F32) return false;
const size_t n_elem = ggml_nelements(cur);
const size_t src_bytes = n_elem * sizeof(uint16_t);
const size_t dst_bytes = n_elem * sizeof(float);
const size_t n_elem = ggml_nelements(cur);
const size_t src_size = n_elem * sizeof(uint16_t);
const size_t dst_size = n_elem * sizeof(float);
std::vector<uint8_t> src(src_bytes);
std::vector<uint8_t> src(src_size);
FILE * f = std::fopen(source_file, "rb");
if (!f) {
LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, source_file);
return false;
}
if (std::fseek(f, (long) file_offset, SEEK_SET) != 0) {
std::fclose(f);
LLAMA_LOG_ERROR("%s: failed to seek in '%s'\n", __func__, source_file);
return false;
}
if (std::fread(src.data(), 1, src_bytes, f) != src_bytes) {
if (std::fseek(f, (long) file_offset, SEEK_SET) != 0 ||
std::fread(src.data(), 1, src_size, f) != src_size) {
std::fclose(f);
LLAMA_LOG_ERROR("%s: failed to read %zu bytes for '%s'\n",
__func__, src_bytes, ggml_get_name(cur));
__func__, src_size, ggml_get_name(cur));
return false;
}
std::fclose(f);
out.resize(dst_bytes);
std::vector<uint8_t> dst(dst_size);
convert_f16_to_f32(reinterpret_cast<const uint16_t *>(src.data()),
reinterpret_cast<float *>(out.data()),
reinterpret_cast<float *>(dst.data()),
n_elem);
// Deliver the converted bytes to the tensor's final backend buffer.
if (ggml_backend_buft_is_host(buft)) {
std::memcpy(cur->data, dst.data(), dst_size);
} else {
ggml_backend_tensor_set(cur, dst.data(), 0, dst_size);
}
LLAMA_LOG_INFO("%s: promoted F16->F32 for %s (%zu elems)\n",
__func__, ggml_get_name(cur), n_elem);
return true;

View file

@@ -18,6 +18,8 @@
#include <string>
#include <vector>
#include "ggml-backend.h" // for ggml_backend_buffer_type_t
struct gguf_context;
struct ggml_context;
struct ggml_tensor;
@@ -64,15 +66,17 @@ void apply_tensor_transforms(const llama_model_loader * ml, ggml_context * ctx);
void translate_clip_metadata(gguf_context * meta, ggml_context * ctx);

// Called from clip.cpp's tensor-loading loop, before reading bytes from the
// file. If this tensor was marked for type promotion by translate_clip_metadata
// (e.g. F16→F32), reads the source bytes, converts them, and writes the
// result directly into `cur` (choosing host copy vs. backend upload based
// on `buft`). Returns true if the tensor was handled — caller should skip
// its normal file-read path. Returns false otherwise; caller loads normally.
//
// `file_offset` is the absolute file offset of the original (pre-promotion)
// tensor data in the source GGUF.
bool maybe_load_tensor(ggml_tensor * cur,
                       const char * source_file,
                       size_t file_offset,
                       ggml_backend_buffer_type_t buft);
} // namespace llama_ollama_compat

View file

@ -108,7 +108,7 @@ index 4ded484dd..7d3509c23 100644
if (use_mmap_buffer) {
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index f0e8786b6..ec2a7d320 100644
index f0e8786b6..1e6319ca0 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -10,6 +10,8 @@
@@ -132,30 +132,11 @@ index f0e8786b6..ec2a7d320 100644
const int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
// print gguf info
@@ -2358,11 +2365,25 @@ struct clip_model_loader {
@@ -2358,6 +2365,7 @@ struct clip_model_loader {
auto it_off = tensor_offset.find(t->name);
GGML_ASSERT(it_off != tensor_offset.end() && "no offset for tensor");
const size_t offset = it_off->second;
+ size_t num_bytes = ggml_nbytes(cur);
+
+ // Ollama-compat: let the compat layer supply promoted tensor
+ // data (e.g. F16→F32 for conv weights) instead of reading
+ // bytes directly from the file.
+ std::vector<uint8_t> compat_buf;
+ if (llama_ollama_compat::supply_promoted_tensor_data(cur, fname.c_str(), offset, compat_buf)) {
+ if (ggml_backend_buft_is_host(buft)) {
+ std::memcpy(cur->data, compat_buf.data(), num_bytes);
+ } else {
+ ggml_backend_tensor_set(cur, compat_buf.data(), 0, num_bytes);
+ }
+ continue;
+ }
+
+ if (llama_ollama_compat::maybe_load_tensor(cur, fname.c_str(), offset, buft)) continue;
fin.seekg(offset, std::ios::beg);
if (!fin) {
throw std::runtime_error(string_format("%s: failed to seek for tensor %s\n", __func__, t->name));
}
- size_t num_bytes = ggml_nbytes(cur);
if (ggml_backend_buft_is_host(buft)) {
// for the CPU and Metal backend, we can read directly into the tensor
fin.read(reinterpret_cast<char *>(cur->data), num_bytes);