Mirror of https://github.com/likelovewant/ollama-for-amd.git, synced 2025-12-21 22:33:56 +00:00
ggml update to b7108 (#12992)
* Revert "vulkan: temporary cary of vulkan fixes (#12971)"
  This reverts commit 3a9e8e9fd4.
* ggml update to b7087
* fix argsort on metal
* update to b7108
* fix bakllava regression
  This model lacks the metadata for the projector type.
* update to b7209
* fix TopK perf
* only build arm code on arm
llama/llama.cpp/tools/mtmd/mtmd.cpp (vendored): 68 changed lines
@@ -5,12 +5,20 @@
 #include "llama.h"
 
+// fix problem with std::min and std::max
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+# define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
 #include <algorithm>
 #include <cerrno>
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
 #include <limits>
 #include <vector>
 
 // represents raw image data, layout is RGBRGBRGB...
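Note: the NOMINMAX block added above exists because <windows.h> otherwise defines function-style min/max macros, and the preprocessor then mangles calls such as std::min(a, b) into invalid tokens. A minimal standalone sketch of the failure mode (illustration only, not part of the patch):

    // Without NOMINMAX, <windows.h> turns min/max into macros, so
    // std::min(3, 7) would expand into invalid tokens and fail to compile.
    #define WIN32_LEAN_AND_MEAN
    #ifndef NOMINMAX
    #   define NOMINMAX    // keep min/max as ordinary identifiers
    #endif
    #if defined(_WIN32)
    #include <windows.h>
    #endif

    #include <algorithm>
    #include <cstdio>

    int main() {
        std::printf("%d %d\n", std::min(3, 7), std::max(3, 7));
        return 0;
    }

The same guard pattern is used elsewhere in llama.cpp; the usual alternative workaround is to parenthesize the call as (std::min)(a, b), which suppresses macro expansion at each call site.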
@@ -93,14 +101,26 @@ const char * mtmd_default_marker() {
     return "<__media__>";
 }
 
+static clip_flash_attn_type mtmd_get_clip_flash_attn_type(enum llama_flash_attn_type flash_attn_type) {
+    switch (flash_attn_type) {
+        case LLAMA_FLASH_ATTN_TYPE_AUTO:     return CLIP_FLASH_ATTN_TYPE_AUTO;
+        case LLAMA_FLASH_ATTN_TYPE_DISABLED: return CLIP_FLASH_ATTN_TYPE_DISABLED;
+        case LLAMA_FLASH_ATTN_TYPE_ENABLED:  return CLIP_FLASH_ATTN_TYPE_ENABLED;
+    }
+    return CLIP_FLASH_ATTN_TYPE_AUTO;
+}
+
 mtmd_context_params mtmd_context_params_default() {
-    mtmd_context_params params;
-    params.use_gpu = true;
-    params.print_timings = true;
-    params.n_threads = 4;
-    params.verbosity = GGML_LOG_LEVEL_INFO;
-    params.image_marker = MTMD_DEFAULT_IMAGE_MARKER;
-    params.media_marker = mtmd_default_marker();
+    mtmd_context_params params {
+        /* use_gpu          */ true,
+        /* print_timings    */ true,
+        /* n_threads        */ 4,
+        /* verbosity        */ GGML_LOG_LEVEL_INFO,
+        /* image_marker     */ MTMD_DEFAULT_IMAGE_MARKER,
+        /* media_marker     */ mtmd_default_marker(),
+        /* flash_attn_type  */ LLAMA_FLASH_ATTN_TYPE_AUTO,
+        /* image_min_tokens */ -1,
+        /* image_max_tokens */ -1,
+    };
     return params;
 }
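Note: the rewrite above replaces member-by-member assignment with aggregate initialization. The /* name */ comments are documentation only; values bind strictly by the order of the struct's field declarations, which is why every field, including the new flash_attn_type and image token limits, must appear in sequence. A standalone sketch of the pattern, using a hypothetical struct rather than the real mtmd_context_params:

    #include <cstdio>

    // Hypothetical stand-in for a params struct (illustration only).
    struct demo_params {
        bool use_gpu;
        int  n_threads;
        int  image_max_tokens;
    };

    demo_params demo_params_default() {
        // Aggregate init binds by position; the comments merely label intent.
        // Reordering the struct's fields without updating this list would
        // silently hand values to the wrong members.
        demo_params params {
            /* use_gpu          */ true,
            /* n_threads        */ 4,
            /* image_max_tokens */ -1,
        };
        return params;
    }

    int main() {
        demo_params p = demo_params_default();
        std::printf("gpu=%d threads=%d max=%d\n", p.use_gpu, p.n_threads, p.image_max_tokens);
        return 0;
    }

One upside over the old assignment style: if a field is omitted at the end of the list it is value-initialized rather than left indeterminate, and -Wmissing-field-initializers will flag the stale init site, whereas a forgotten assignment previously left the member genuinely uninitialized.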
@@ -152,7 +172,7 @@ struct mtmd_context {
     print_timings(ctx_params.print_timings),
     n_threads    (ctx_params.n_threads),
     media_marker (ctx_params.media_marker),
-    n_embd_text  (llama_model_n_embd(text_model))
+    n_embd_text  (llama_model_n_embd_inp(text_model))
 {
     if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
         throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
@@ -162,9 +182,13 @@ struct mtmd_context {
         throw std::runtime_error("media_marker must not be empty");
     }
 
-    clip_context_params ctx_clip_params;
-    ctx_clip_params.use_gpu   = ctx_params.use_gpu;
-    ctx_clip_params.verbosity = ctx_params.verbosity;
+    clip_context_params ctx_clip_params {
+        /* use_gpu          */ ctx_params.use_gpu,
+        /* verbosity        */ ctx_params.verbosity,
+        /* flash_attn_type  */ CLIP_FLASH_ATTN_TYPE_AUTO,
+        /* image_min_tokens */ ctx_params.image_min_tokens,
+        /* image_max_tokens */ ctx_params.image_max_tokens,
+    };
     auto res = clip_init(mmproj_fname, ctx_clip_params);
     ctx_v = res.ctx_v;
     ctx_a = res.ctx_a;
@@ -268,7 +292,7 @@ struct mtmd_context {
         // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
         img_end = "[IMG_END]";
 
-    } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL) {
+    } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL) {
         // <|vision_start|> ... (image embeddings) ... <|vision_end|>
         img_beg = "<|vision_start|>";
         img_end = "<|vision_end|>";
@@ -285,6 +309,11 @@ struct mtmd_context {
         img_beg = "<img>";
         img_end = "</img>";
 
+    } else if (proj == PROJECTOR_TYPE_LIGHTONOCR) {
+        // <|im_start|> ... (image embeddings) ... <|im_end|>
+        img_beg = "<|im_start|>";
+        img_end = "<|im_end|>";
+
     }
 }
 
@@ -374,9 +403,7 @@ mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
 }
 
 void mtmd_free(mtmd_context * ctx) {
-    if (ctx) {
-        delete ctx;
-    }
+    delete ctx;
 }
 
 struct mtmd_tokenizer {
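Note: the simplification of mtmd_free above relies on a standard C++ guarantee: applying delete to a null pointer is a no-op, so the if (ctx) guard was redundant. A minimal standalone demonstration (hypothetical type, illustration only):

    #include <cstdio>

    struct widget {
        ~widget() { std::printf("destroyed\n"); }
    };

    int main() {
        widget * w = nullptr;
        delete w;              // guaranteed no-op for null pointers
        delete new widget();   // destructor runs as usual
        return 0;
    }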
@@ -1036,7 +1063,9 @@ const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
 
 llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
     if (image_tokens->use_mrope_pos) {
-        return 1; // for M-RoPE, the whole image is 1 in temporal dimension
+        // for M-RoPE, temporal dimension = max(t,h,w)
+        // t is omitted as we don't support video input
+        return std::max(image_tokens->nx, image_tokens->ny);
     }
     return image_tokens->n_tokens();
 }
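Note: the change above affects how many llama_pos positions an image occupies under M-RoPE: previously the whole image advanced the position counter by 1, while now it advances by max(nx, ny), the larger side of the token grid. A hedged sketch of the arithmetic, using a simplified stand-in for mtmd_image_tokens:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Simplified stand-in for mtmd_image_tokens (illustration only).
    struct image_tokens_demo {
        uint32_t nx;             // token grid width
        uint32_t ny;             // token grid height
        bool     use_mrope_pos;  // true for Qwen-VL style M-RoPE models
    };

    int32_t n_pos(const image_tokens_demo & t) {
        if (t.use_mrope_pos) {
            // M-RoPE: temporal extent = max(t, h, w); t omitted (no video)
            return (int32_t) std::max(t.nx, t.ny);
        }
        return (int32_t) (t.nx * t.ny); // linear RoPE: one position per token
    }

    int main() {
        image_tokens_demo img { /* nx */ 16, /* ny */ 24, /* use_mrope_pos */ true };
        std::printf("M-RoPE positions: %d\n", n_pos(img));   // 24 (was 1 before)
        img.use_mrope_pos = false;
        std::printf("linear positions: %d\n", n_pos(img));   // 384
        return 0;
    }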
@@ -1075,3 +1104,8 @@ mtmd_input_chunks * mtmd_test_create_input_chunks() {
 
     return chunks;
 }
+
+void mtmd_log_set(ggml_log_callback log_callback, void * user_data) {
+    g_logger_state.log_callback = log_callback ? log_callback : clip_log_callback_default;
+    g_logger_state.log_callback_user_data = user_data;
+}
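Note: mtmd_log_set, added above, follows the same convention as llama_log_set: a null callback restores the default logger. A hedged usage sketch, assuming the mtmd.h and ggml.h headers from this tree and the standard ggml_log_callback signature:

    #include <cstdio>

    #include "ggml.h"   // ggml_log_callback, ggml_log_level
    #include "mtmd.h"   // mtmd_log_set

    // Forward mtmd log lines to stderr, dropping anything below WARN.
    static void my_logger(enum ggml_log_level level, const char * text, void * user_data) {
        (void) user_data;
        if (level >= GGML_LOG_LEVEL_WARN) {
            std::fprintf(stderr, "[mtmd] %s", text);
        }
    }

    int main() {
        mtmd_log_set(my_logger, /* user_data */ nullptr);
        // ... load the projector, tokenize media, etc. ...
        mtmd_log_set(nullptr, nullptr);  // restore the default callback
        return 0;
    }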