mirror of
https://github.com/likelovewant/ollama-for-amd.git
synced 2025-12-22 14:53:56 +00:00
Update GGML to b6646 (#12245)
Notable EOLs with this change: - MacOS v12 and v13 are no longer supported (v14+ required) - AMD gfx900 and gfx906 are no longer supported
This commit is contained in:
6
llama/llama.cpp/tools/mtmd/clip-impl.h
vendored
6
llama/llama.cpp/tools/mtmd/clip-impl.h
vendored
@@ -44,6 +44,7 @@
|
||||
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
|
||||
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
|
||||
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
|
||||
#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
|
||||
|
||||
// audio-specific
|
||||
#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"
|
||||
@@ -81,6 +82,7 @@
|
||||
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
|
||||
#define TN_IMAGE_NEWLINE "model.image_newline"
|
||||
#define TN_MM_INP_NORM "mm.input_norm.weight"
|
||||
#define TN_MM_INP_NORM_B "mm.input_norm.bias"
|
||||
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
|
||||
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
|
||||
#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3
|
||||
@@ -132,6 +134,8 @@ enum projector_type {
|
||||
PROJECTOR_TYPE_QWEN2A,
|
||||
PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
|
||||
PROJECTOR_TYPE_VOXTRAL,
|
||||
PROJECTOR_TYPE_LFM2,
|
||||
PROJECTOR_TYPE_KIMIVL,
|
||||
PROJECTOR_TYPE_UNKNOWN,
|
||||
};
|
||||
|
||||
@@ -152,6 +156,8 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||
{ PROJECTOR_TYPE_QWEN2A, "qwen2a"},
|
||||
{ PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
|
||||
{ PROJECTOR_TYPE_VOXTRAL, "voxtral"},
|
||||
{ PROJECTOR_TYPE_LFM2, "lfm2"},
|
||||
{ PROJECTOR_TYPE_KIMIVL, "kimivl"},
|
||||
};
|
||||
|
||||
static projector_type clip_projector_type_from_string(const std::string & str) {
|
||||
|
||||
429
llama/llama.cpp/tools/mtmd/clip.cpp
vendored
429
llama/llama.cpp/tools/mtmd/clip.cpp
vendored
@@ -214,6 +214,7 @@ struct clip_hparams {
|
||||
// legacy
|
||||
bool has_llava_projector = false;
|
||||
int minicpmv_version = 0;
|
||||
int32_t minicpmv_query_num = 0; // MiniCPM-V query number
|
||||
};
|
||||
|
||||
struct clip_layer {
|
||||
@@ -277,6 +278,7 @@ struct clip_model {
|
||||
|
||||
// LLaVA projection
|
||||
ggml_tensor * mm_input_norm_w = nullptr;
|
||||
ggml_tensor * mm_input_norm_b = nullptr;
|
||||
ggml_tensor * mm_0_w = nullptr;
|
||||
ggml_tensor * mm_0_b = nullptr;
|
||||
ggml_tensor * mm_2_w = nullptr;
|
||||
@@ -417,6 +419,7 @@ struct clip_ctx {
|
||||
}
|
||||
if (!backend) {
|
||||
backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr);
|
||||
backend = backend ? backend : ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU, nullptr);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -500,11 +503,17 @@ struct clip_graph {
|
||||
|
||||
ggml_cgraph * build_siglip() {
|
||||
ggml_tensor * inp = build_inp();
|
||||
|
||||
ggml_tensor * learned_pos_embd = model.position_embeddings;
|
||||
if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
|
||||
learned_pos_embd = resize_position_embeddings();
|
||||
}
|
||||
|
||||
ggml_tensor * cur = build_vit(
|
||||
inp, n_patches,
|
||||
NORM_TYPE_NORMAL,
|
||||
hparams.ffn_op,
|
||||
model.position_embeddings,
|
||||
learned_pos_embd,
|
||||
nullptr);
|
||||
|
||||
if (ctx->proj_type() == PROJECTOR_TYPE_GEMMA3) {
|
||||
@@ -513,8 +522,8 @@ struct clip_graph {
|
||||
const int patches_per_image = n_patches_x;
|
||||
const int kernel_size = hparams.proj_scale_factor;
|
||||
|
||||
cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
|
||||
cur = ggml_reshape_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
|
||||
cur = ggml_transpose(ctx0, cur);
|
||||
cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
|
||||
|
||||
// doing a pool2d to reduce the number of output tokens
|
||||
cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
|
||||
@@ -531,29 +540,27 @@ struct clip_graph {
|
||||
cur);
|
||||
|
||||
} else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) {
|
||||
// pixel_shuffle
|
||||
// https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
|
||||
|
||||
const int scale_factor = model.hparams.proj_scale_factor;
|
||||
const int n_embd = cur->ne[0];
|
||||
const int seq = cur->ne[1];
|
||||
const int bsz = 1; // batch size, always 1 for now since we don't support batching
|
||||
const int height = std::sqrt(seq);
|
||||
const int width = std::sqrt(seq);
|
||||
GGML_ASSERT(scale_factor != 0);
|
||||
cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz);
|
||||
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
|
||||
cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
|
||||
n_embd * scale_factor * scale_factor,
|
||||
height / scale_factor,
|
||||
width / scale_factor,
|
||||
bsz);
|
||||
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
|
||||
cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur),
|
||||
n_embd * scale_factor * scale_factor,
|
||||
seq / (scale_factor * scale_factor),
|
||||
bsz);
|
||||
|
||||
cur = build_patch_merge_permute(cur, scale_factor);
|
||||
cur = ggml_mul_mat(ctx0, model.projection, cur);
|
||||
|
||||
} else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) {
|
||||
// pixel unshuffle block
|
||||
const int scale_factor = model.hparams.proj_scale_factor;
|
||||
cur = build_patch_merge_permute(cur, scale_factor);
|
||||
|
||||
// projection
|
||||
cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
|
||||
cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
|
||||
cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
|
||||
|
||||
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
|
||||
cur = ggml_add(ctx0, cur, model.mm_1_b);
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
|
||||
cur = ggml_add(ctx0, cur, model.mm_2_b);
|
||||
} else {
|
||||
GGML_ABORT("SigLIP: Unsupported projector type");
|
||||
}
|
||||
@@ -681,15 +688,15 @@ struct clip_graph {
|
||||
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
|
||||
inp = ggml_add(ctx0, inp, inp_1);
|
||||
|
||||
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b]
|
||||
inp = ggml_reshape_4d(
|
||||
inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
|
||||
inp = ggml_cont_4d(
|
||||
ctx0, inp,
|
||||
n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
|
||||
inp = ggml_reshape_4d(
|
||||
ctx0, inp,
|
||||
n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
|
||||
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
|
||||
inp = ggml_reshape_3d(
|
||||
inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
|
||||
inp = ggml_cont_3d(
|
||||
ctx0, inp,
|
||||
n_embd, n_patches_x * n_patches_y, batch_size);
|
||||
}
|
||||
@@ -879,21 +886,8 @@ struct clip_graph {
|
||||
int n_embd = clip_n_mmproj_embd(ctx);
|
||||
const int d_head = 128;
|
||||
int n_head = n_embd/d_head;
|
||||
int num_query = 96;
|
||||
if (ctx->model.hparams.minicpmv_version == 2) {
|
||||
// MiniCPM-V 2.5
|
||||
num_query = 96;
|
||||
} else if (ctx->model.hparams.minicpmv_version == 3) {
|
||||
// MiniCPM-V 2.6
|
||||
num_query = 64;
|
||||
} else if (ctx->model.hparams.minicpmv_version == 4) {
|
||||
// MiniCPM-o 2.6
|
||||
num_query = 64;
|
||||
} else if (ctx->model.hparams.minicpmv_version == 5) {
|
||||
// MiniCPM-V 4.0
|
||||
num_query = 64;
|
||||
}
|
||||
|
||||
// Use actual config value if available, otherwise fall back to hardcoded values
|
||||
int num_query = ctx->model.hparams.minicpmv_query_num;
|
||||
ggml_tensor * Q = ggml_add(ctx0,
|
||||
ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q),
|
||||
model.mm_model_attn_q_b);
|
||||
@@ -967,14 +961,14 @@ struct clip_graph {
|
||||
GGML_ASSERT(scale_factor > 0);
|
||||
cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz);
|
||||
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
|
||||
cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
|
||||
cur = ggml_cont_4d(ctx0, cur,
|
||||
n_embd * scale_factor * scale_factor,
|
||||
height / scale_factor,
|
||||
width / scale_factor,
|
||||
bsz);
|
||||
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
|
||||
// flatten to 2D
|
||||
cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, cur),
|
||||
cur = ggml_cont_2d(ctx0, cur,
|
||||
n_embd * scale_factor * scale_factor,
|
||||
cur->ne[1] * cur->ne[2]);
|
||||
}
|
||||
@@ -1060,14 +1054,14 @@ struct clip_graph {
|
||||
n_patches_y,
|
||||
bsz);
|
||||
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
|
||||
cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
|
||||
cur = ggml_cont_4d(ctx0, cur,
|
||||
n_embd * scale_factor * scale_factor,
|
||||
n_patches_x / scale_factor,
|
||||
n_patches_y / scale_factor,
|
||||
bsz);
|
||||
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
|
||||
//cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
|
||||
// flatten to 2D
|
||||
cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, cur),
|
||||
cur = ggml_cont_2d(ctx0, cur,
|
||||
n_embd * scale_factor * scale_factor,
|
||||
n_patches / scale_factor / scale_factor);
|
||||
cb(cur, "pixel_shuffle", -1);
|
||||
@@ -1092,6 +1086,67 @@ struct clip_graph {
|
||||
return gf;
|
||||
}
|
||||
|
||||
ggml_cgraph * build_kimivl() {
|
||||
// 2D input positions
|
||||
ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
|
||||
ggml_set_name(pos_h, "pos_h");
|
||||
ggml_set_input(pos_h);
|
||||
|
||||
ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
|
||||
ggml_set_name(pos_w, "pos_w");
|
||||
ggml_set_input(pos_w);
|
||||
|
||||
ggml_tensor * learned_pos_embd = resize_position_embeddings();
|
||||
|
||||
// build ViT with 2D position embeddings
|
||||
auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
|
||||
// first half is X axis and second half is Y axis
|
||||
return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
|
||||
};
|
||||
|
||||
ggml_tensor * inp = build_inp();
|
||||
ggml_tensor * cur = build_vit(
|
||||
inp, n_patches,
|
||||
NORM_TYPE_NORMAL,
|
||||
hparams.ffn_op,
|
||||
learned_pos_embd,
|
||||
add_pos);
|
||||
|
||||
cb(cur, "vit_out", -1);
|
||||
|
||||
{
|
||||
// patch_merger
|
||||
const int scale_factor = model.hparams.proj_scale_factor;
|
||||
cur = build_patch_merge_permute(cur, scale_factor);
|
||||
|
||||
// projection norm
|
||||
int proj_inp_dim = cur->ne[0];
|
||||
cur = ggml_view_2d(ctx0, cur,
|
||||
n_embd, cur->ne[1] * scale_factor * scale_factor,
|
||||
ggml_row_size(cur->type, n_embd), 0);
|
||||
cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
|
||||
cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
|
||||
cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
|
||||
cur = ggml_view_2d(ctx0, cur,
|
||||
proj_inp_dim, cur->ne[1] / scale_factor / scale_factor,
|
||||
ggml_row_size(cur->type, proj_inp_dim), 0);
|
||||
cb(cur, "proj_inp_normed", -1);
|
||||
|
||||
// projection mlp
|
||||
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
|
||||
cur = ggml_add(ctx0, cur, model.mm_1_b);
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
|
||||
cur = ggml_add(ctx0, cur, model.mm_2_b);
|
||||
cb(cur, "proj_out", -1);
|
||||
}
|
||||
|
||||
// build the graph
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
// this graph is used by llava, granite and glm
|
||||
// due to having embedding_stack (used by granite), we cannot reuse build_vit
|
||||
ggml_cgraph * build_llava() {
|
||||
@@ -1300,8 +1355,8 @@ struct clip_graph {
|
||||
ggml_tensor * block_1 = nullptr;
|
||||
{
|
||||
// transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
|
||||
mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
|
||||
mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
|
||||
mlp_3 = ggml_permute(ctx0, mlp_3, 1, 0, 2, 3);
|
||||
mlp_3 = ggml_cont_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
|
||||
// stride = 1, padding = 1, bias is nullptr
|
||||
block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
|
||||
|
||||
@@ -1406,9 +1461,9 @@ struct clip_graph {
|
||||
mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
|
||||
// mlp_2 ne = [2048, 576, 1, 1]
|
||||
// // AVG Pool Layer 2*2, strides = 2
|
||||
mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3));
|
||||
mlp_2 = ggml_permute(ctx0, mlp_2, 1, 0, 2, 3);
|
||||
// mlp_2 ne = [576, 2048, 1, 1]
|
||||
mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
|
||||
mlp_2 = ggml_cont_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
|
||||
// mlp_2 ne [24, 24, 2048, 1]
|
||||
mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
|
||||
// weight ne = [3, 3, 2048, 1]
|
||||
@@ -1428,8 +1483,8 @@ struct clip_graph {
|
||||
// glm projector
|
||||
else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE) {
|
||||
size_t gridsz = (size_t)sqrt(embeddings->ne[1]);
|
||||
embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3));
|
||||
embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
|
||||
embeddings = ggml_permute(ctx0,embeddings,1,0,2,3);
|
||||
embeddings = ggml_cont_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
|
||||
embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
|
||||
embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
|
||||
embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3));
|
||||
@@ -1585,6 +1640,29 @@ private:
|
||||
}
|
||||
}
|
||||
|
||||
// siglip2 naflex
|
||||
ggml_tensor * resize_position_embeddings() {
|
||||
ggml_tensor * pos_embd = model.position_embeddings;
|
||||
const int height = img.ny / patch_size;
|
||||
const int width = img.nx / patch_size;
|
||||
const uint32_t mode = GGML_SCALE_MODE_BILINEAR;
|
||||
const int n_per_side = (int)std::sqrt(pos_embd->ne[1]);
|
||||
|
||||
GGML_ASSERT(pos_embd);
|
||||
|
||||
if (height == n_per_side && width == n_per_side) {
|
||||
return pos_embd;
|
||||
}
|
||||
|
||||
pos_embd = ggml_reshape_3d(ctx0, pos_embd, n_embd, n_per_side, n_per_side); // -> (n_embd, n_per_side, n_per_side)
|
||||
pos_embd = ggml_permute(ctx0, pos_embd, 2, 0, 1, 3); // -> (n_per_side, n_per_side, n_embd)
|
||||
pos_embd = ggml_interpolate(ctx0, pos_embd, width, height, n_embd, 1, mode); // -> (width, height, n_embd)
|
||||
pos_embd = ggml_permute(ctx0, pos_embd, 1, 2, 0, 3); // -> (n_embd, width, height)
|
||||
pos_embd = ggml_cont_2d(ctx0, pos_embd, n_embd, width * height); // -> (n_embd, width * height)
|
||||
|
||||
return pos_embd;
|
||||
}
|
||||
|
||||
// build vision transformer (ViT) cgraph
|
||||
// this function should cover most of the models
|
||||
// if your model has specific features, you should probably duplicate this function
|
||||
@@ -1963,7 +2041,6 @@ private:
|
||||
ggml_row_size(cur->type, n_dim),
|
||||
ggml_row_size(cur->type, n_dim*n_head),
|
||||
n_dim/2 * ggml_element_size(cur));
|
||||
second = ggml_cont(ctx0, second); // copy, because ggml_rope don't play well with non-contiguous tensors
|
||||
second = ggml_rope_ext(
|
||||
ctx0,
|
||||
second,
|
||||
@@ -1980,6 +2057,39 @@ private:
|
||||
return cur;
|
||||
}
|
||||
|
||||
// aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
|
||||
// support dynamic resolution
|
||||
ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor) {
|
||||
GGML_ASSERT(scale_factor > 1);
|
||||
|
||||
const int n_embd = cur->ne[0];
|
||||
int width = img.nx / patch_size;
|
||||
int height = img.ny / patch_size;
|
||||
|
||||
// pad width and height to factor
|
||||
const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width;
|
||||
const int64_t pad_height = CLIP_ALIGN(height, scale_factor) - height;
|
||||
cur = ggml_reshape_3d(ctx0, cur, n_embd, width, height);
|
||||
if (pad_width || pad_height) {
|
||||
cur = ggml_pad(ctx0, cur, 0, pad_width, pad_height, 0);
|
||||
width += pad_width;
|
||||
height += pad_height;
|
||||
}
|
||||
|
||||
// unshuffle h
|
||||
cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height);
|
||||
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
|
||||
|
||||
// unshuffle w
|
||||
cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor);
|
||||
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
|
||||
|
||||
cur = ggml_cont_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
|
||||
cb(cur, "pixel_shuffle", -1);
|
||||
|
||||
return cur;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
|
||||
@@ -1991,6 +2101,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
switch (ctx->proj_type()) {
|
||||
case PROJECTOR_TYPE_GEMMA3:
|
||||
case PROJECTOR_TYPE_IDEFICS3:
|
||||
case PROJECTOR_TYPE_LFM2:
|
||||
{
|
||||
res = graph.build_siglip();
|
||||
} break;
|
||||
@@ -2021,6 +2132,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||
{
|
||||
res = graph.build_whisper_enc();
|
||||
} break;
|
||||
case PROJECTOR_TYPE_KIMIVL:
|
||||
{
|
||||
res = graph.build_kimivl();
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
res = graph.build_llava();
|
||||
@@ -2151,7 +2266,21 @@ struct clip_model_loader {
|
||||
get_u32(KEY_PATCH_SIZE, hparams.patch_size);
|
||||
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
|
||||
get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
|
||||
|
||||
get_u32(KEY_MINICPMV_QUERY_NUM, hparams.minicpmv_query_num, false);
|
||||
if (hparams.minicpmv_query_num == 0) {
|
||||
// Fallback to hardcoded values for legacy models
|
||||
if (hparams.minicpmv_version == 3) {
|
||||
hparams.minicpmv_query_num = 64;
|
||||
} else if (hparams.minicpmv_version == 4) {
|
||||
hparams.minicpmv_query_num = 64;
|
||||
} else if (hparams.minicpmv_version == 5) {
|
||||
hparams.minicpmv_query_num = 64;
|
||||
} else if (hparams.minicpmv_version == 6) {
|
||||
hparams.minicpmv_query_num = 64;
|
||||
} else {
|
||||
hparams.minicpmv_query_num = 96;
|
||||
}
|
||||
}
|
||||
} else if (is_audio) {
|
||||
get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins);
|
||||
|
||||
@@ -2243,6 +2372,7 @@ struct clip_model_loader {
|
||||
}
|
||||
} break;
|
||||
case PROJECTOR_TYPE_IDEFICS3:
|
||||
case PROJECTOR_TYPE_LFM2:
|
||||
case PROJECTOR_TYPE_INTERNVL:
|
||||
{
|
||||
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
|
||||
@@ -2256,6 +2386,12 @@ struct clip_model_loader {
|
||||
hparams.image_size = 1024;
|
||||
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_KIMIVL:
|
||||
{
|
||||
hparams.rope_theta = 10000.0f;
|
||||
hparams.warmup_image_size = hparams.patch_size * 8;
|
||||
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3:
|
||||
{
|
||||
// default value (used by all model sizes in gemma 3 family)
|
||||
@@ -2420,7 +2556,20 @@ struct clip_model_loader {
|
||||
|
||||
// some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here
|
||||
// note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check!
|
||||
if (layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd) {
|
||||
bool is_ffn_swapped = (
|
||||
// only old models need this fix
|
||||
model.proj_type == PROJECTOR_TYPE_MLP
|
||||
|| model.proj_type == PROJECTOR_TYPE_MLP_NORM
|
||||
|| model.proj_type == PROJECTOR_TYPE_LDP
|
||||
|| model.proj_type == PROJECTOR_TYPE_LDPV2
|
||||
|| model.proj_type == PROJECTOR_TYPE_QWEN2VL
|
||||
|| model.proj_type == PROJECTOR_TYPE_QWEN25VL
|
||||
|| model.proj_type == PROJECTOR_TYPE_GLM_EDGE
|
||||
|| model.proj_type == PROJECTOR_TYPE_GEMMA3
|
||||
|| model.proj_type == PROJECTOR_TYPE_IDEFICS3
|
||||
|| model.proj_type == PROJECTOR_TYPE_MINICPMV
|
||||
) && layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd;
|
||||
if (is_ffn_swapped) {
|
||||
// swap up and down weights
|
||||
ggml_tensor * tmp = layer.ff_up_w;
|
||||
layer.ff_up_w = layer.ff_down_w;
|
||||
@@ -2429,6 +2578,9 @@ struct clip_model_loader {
|
||||
tmp = layer.ff_up_b;
|
||||
layer.ff_up_b = layer.ff_down_b;
|
||||
layer.ff_down_b = tmp;
|
||||
if (il == 0) {
|
||||
LOG_WRN("%s: ffn up/down are swapped\n", __func__);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2546,6 +2698,16 @@ struct clip_model_loader {
|
||||
{
|
||||
model.projection = get_tensor(TN_MM_PROJECTOR);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LFM2:
|
||||
case PROJECTOR_TYPE_KIMIVL:
|
||||
{
|
||||
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
|
||||
model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B);
|
||||
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
|
||||
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
|
||||
model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
|
||||
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PIXTRAL:
|
||||
{
|
||||
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
|
||||
@@ -2944,7 +3106,7 @@ struct image_manipulation {
|
||||
dst.buf.resize(3 * target_width * target_height);
|
||||
|
||||
float Cc;
|
||||
float C[5];
|
||||
float C[5] = {};
|
||||
float d0, d2, d3, a0, a1, a2, a3;
|
||||
int i, j, k, jj;
|
||||
int x, y;
|
||||
@@ -3467,6 +3629,45 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
|
||||
res_imgs->grid_y = inst.grid_size.height;
|
||||
return true;
|
||||
|
||||
} else if ( ctx->proj_type() == PROJECTOR_TYPE_LFM2
|
||||
|| ctx->proj_type() == PROJECTOR_TYPE_KIMIVL
|
||||
) {
|
||||
GGML_ASSERT(params.proj_scale_factor);
|
||||
|
||||
// smart resize
|
||||
const int width = img->nx;
|
||||
const int height = img->ny;
|
||||
const int total_factor = params.patch_size * params.proj_scale_factor;
|
||||
constexpr int min_image_tokens = 64;
|
||||
constexpr int max_image_tokens = 1024;
|
||||
const float min_pixels = min_image_tokens * total_factor * total_factor;
|
||||
const float max_pixels = max_image_tokens * total_factor * total_factor;
|
||||
|
||||
auto round_by_factor = [f = total_factor](float x) { return static_cast<int>(std::nearbyintf(x / static_cast<float>(f))) * f; };
|
||||
auto ceil_by_factor = [f = total_factor](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; };
|
||||
auto floor_by_factor = [f = total_factor](float x) { return static_cast<int>(std::floor(x / static_cast<float>(f))) * f; };
|
||||
|
||||
int h_bar = std::max(total_factor, round_by_factor(height));
|
||||
int w_bar = std::max(total_factor, round_by_factor(width));
|
||||
|
||||
if (h_bar * w_bar > max_pixels) {
|
||||
const auto beta = std::sqrt((height * width) / max_pixels);
|
||||
h_bar = std::max(total_factor, floor_by_factor(height / beta));
|
||||
w_bar = std::max(total_factor, floor_by_factor(width / beta));
|
||||
} else if (h_bar * w_bar < min_pixels) {
|
||||
const auto beta = std::sqrt(min_pixels / (height * width));
|
||||
h_bar = ceil_by_factor(height * beta);
|
||||
w_bar = ceil_by_factor(width * beta);
|
||||
}
|
||||
|
||||
const std::array<uint8_t, 3> pad_color = {122, 116, 104};
|
||||
|
||||
clip_image_u8 resized_img;
|
||||
image_manipulation::resize_and_pad_image(*img, resized_img, clip_image_size{w_bar, h_bar}, pad_color);
|
||||
clip_image_f32_ptr res(clip_image_f32_init());
|
||||
normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
|
||||
res_imgs->entries.push_back(std::move(res));
|
||||
return true;
|
||||
}
|
||||
|
||||
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
|
||||
@@ -3506,10 +3707,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
|
||||
}
|
||||
|
||||
return true;
|
||||
|
||||
} else {
|
||||
GGML_ABORT("Unknown image preprocessing type");
|
||||
}
|
||||
|
||||
GGML_ASSERT(false && "Unknown image preprocessing type");
|
||||
}
|
||||
|
||||
ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
|
||||
@@ -3573,8 +3774,9 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
|
||||
int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
|
||||
const auto & params = ctx->model.hparams;
|
||||
|
||||
// only for models using fixed size square images
|
||||
int n_patches_sq = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
|
||||
// for models with fixed size image, the input image is already pre-processed and resized to square
|
||||
int patch_size = params.patch_size;
|
||||
int n_patches = (img->nx / patch_size) * (img->ny / patch_size);
|
||||
|
||||
projector_type proj = ctx->proj_type();
|
||||
|
||||
@@ -3588,89 +3790,97 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
||||
case PROJECTOR_TYPE_LDPV2:
|
||||
case PROJECTOR_TYPE_GLM_EDGE:
|
||||
{
|
||||
n_patches_sq /= 4;
|
||||
n_patches /= 4;
|
||||
if (ctx->model.mm_glm_tok_boi) {
|
||||
n_patches_sq += 2; // for BOI and EOI token embeddings
|
||||
n_patches += 2; // for BOI and EOI token embeddings
|
||||
}
|
||||
} break;
|
||||
case PROJECTOR_TYPE_MINICPMV:
|
||||
{
|
||||
if (params.minicpmv_version == 2) {
|
||||
// MiniCPM-V 2.5
|
||||
n_patches_sq = 96;
|
||||
} else if (params.minicpmv_version == 3) {
|
||||
// MiniCPM-V 2.6
|
||||
n_patches_sq = 64;
|
||||
} else if (params.minicpmv_version == 4) {
|
||||
// MiniCPM-o 2.6
|
||||
n_patches_sq = 64;
|
||||
} else if (params.minicpmv_version == 5) {
|
||||
// MiniCPM-V 4.0
|
||||
n_patches_sq = 64;
|
||||
// Use actual config value if available, otherwise fall back to hardcoded values
|
||||
if (params.minicpmv_query_num > 0) {
|
||||
n_patches = params.minicpmv_query_num;
|
||||
} else {
|
||||
GGML_ABORT("Unknown minicpmv version");
|
||||
// Fallback to hardcoded values for legacy models
|
||||
if (params.minicpmv_version == 2) {
|
||||
n_patches = 96;
|
||||
} else if (params.minicpmv_version == 3) {
|
||||
n_patches = 64;
|
||||
} else if (params.minicpmv_version == 4) {
|
||||
n_patches = 64;
|
||||
} else if (params.minicpmv_version == 5) {
|
||||
// MiniCPM-V 4.0
|
||||
n_patches = 64;
|
||||
} else if (params.minicpmv_version == 6) {
|
||||
// MiniCPM-V 4.5
|
||||
n_patches = 64;
|
||||
} else {
|
||||
GGML_ABORT("Unknown minicpmv version");
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case PROJECTOR_TYPE_QWEN2VL:
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
{
|
||||
// dynamic size
|
||||
// dynamic size (2 conv, so double patch size)
|
||||
int patch_size = params.patch_size * 2;
|
||||
int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
|
||||
int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
|
||||
n_patches_sq = x_patch * y_patch;
|
||||
n_patches = x_patch * y_patch;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3:
|
||||
{
|
||||
int n_per_side = params.image_size / params.patch_size;
|
||||
int n_per_side_2d_pool = n_per_side / params.proj_scale_factor;
|
||||
n_patches_sq = n_per_side_2d_pool * n_per_side_2d_pool;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_IDEFICS3:
|
||||
case PROJECTOR_TYPE_INTERNVL:
|
||||
case PROJECTOR_TYPE_LLAMA4:
|
||||
{
|
||||
// both W and H are divided by proj_scale_factor
|
||||
n_patches_sq /= (params.proj_scale_factor * params.proj_scale_factor);
|
||||
// both X and Y are downscaled by the scale factor
|
||||
int scale_factor = ctx->model.hparams.proj_scale_factor;
|
||||
n_patches /= (scale_factor * scale_factor);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LFM2:
|
||||
case PROJECTOR_TYPE_KIMIVL:
|
||||
{
|
||||
// dynamic size
|
||||
int scale_factor = ctx->model.hparams.proj_scale_factor;
|
||||
int out_patch_size = params.patch_size * scale_factor;
|
||||
int x_patch = CLIP_ALIGN(img->nx, out_patch_size) / out_patch_size;
|
||||
int y_patch = CLIP_ALIGN(img->ny, out_patch_size) / out_patch_size;
|
||||
n_patches = x_patch * y_patch;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PIXTRAL:
|
||||
{
|
||||
// dynamic size
|
||||
int n_merge = params.spatial_merge_size;
|
||||
int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1);
|
||||
int n_patches_y = img->ny / params.patch_size / (n_merge > 0 ? n_merge : 1);
|
||||
n_patches_sq = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LLAMA4:
|
||||
{
|
||||
int scale_factor = ctx->model.hparams.proj_scale_factor;
|
||||
n_patches_sq /= (scale_factor * scale_factor);
|
||||
int n_patches_x = img->nx / patch_size / (n_merge > 0 ? n_merge : 1);
|
||||
int n_patches_y = img->ny / patch_size / (n_merge > 0 ? n_merge : 1);
|
||||
n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
|
||||
} break;
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
{
|
||||
n_patches_sq = img->nx;
|
||||
n_patches = img->nx;
|
||||
|
||||
const int proj_stack_factor = ctx->model.hparams.proj_stack_factor;
|
||||
if (ctx->model.audio_has_stack_frames()) {
|
||||
GGML_ASSERT(proj_stack_factor > 0);
|
||||
const int n_len = CLIP_ALIGN(n_patches_sq, proj_stack_factor);
|
||||
n_patches_sq = n_len / proj_stack_factor;
|
||||
const int n_len = CLIP_ALIGN(n_patches, proj_stack_factor);
|
||||
n_patches = n_len / proj_stack_factor;
|
||||
}
|
||||
|
||||
// whisper downscales input token by half after conv1d
|
||||
n_patches_sq /= 2;
|
||||
n_patches /= 2;
|
||||
|
||||
if (ctx->model.audio_has_avgpool()) {
|
||||
// divide by 2 because of nn.AvgPool1d(2, stride=2)
|
||||
n_patches_sq /= 2;
|
||||
n_patches /= 2;
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
GGML_ABORT("unsupported projector type");
|
||||
}
|
||||
|
||||
return n_patches_sq;
|
||||
return n_patches;
|
||||
}
|
||||
|
||||
static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
|
||||
@@ -4019,6 +4229,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
set_input_i32("positions", positions);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PIXTRAL:
|
||||
case PROJECTOR_TYPE_KIMIVL:
|
||||
{
|
||||
// set the 2D positions
|
||||
int n_patches_per_col = image_size_width / patch_size;
|
||||
@@ -4070,6 +4281,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
case PROJECTOR_TYPE_INTERNVL:
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
case PROJECTOR_TYPE_LFM2:
|
||||
case PROJECTOR_TYPE_VOXTRAL:
|
||||
{
|
||||
// do nothing
|
||||
@@ -4141,7 +4353,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
||||
}
|
||||
|
||||
int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||
const auto & hparams = ctx->model.hparams;
|
||||
switch (ctx->model.proj_type) {
|
||||
case PROJECTOR_TYPE_LDP:
|
||||
return ctx->model.mm_model_block_1_block_2_1_b->ne[0];
|
||||
@@ -4153,20 +4364,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||
case PROJECTOR_TYPE_MLP_NORM:
|
||||
return ctx->model.mm_3_b->ne[0];
|
||||
case PROJECTOR_TYPE_MINICPMV:
|
||||
if (hparams.minicpmv_version == 2) {
|
||||
// MiniCPM-V 2.5
|
||||
return 4096;
|
||||
} else if (hparams.minicpmv_version == 3) {
|
||||
// MiniCPM-V 2.6
|
||||
return 3584;
|
||||
} else if (hparams.minicpmv_version == 4) {
|
||||
// MiniCPM-o 2.6
|
||||
return 3584;
|
||||
} else if (hparams.minicpmv_version == 5) {
|
||||
// MiniCPM-V 4.0
|
||||
return 2560;
|
||||
}
|
||||
GGML_ABORT("Unknown minicpmv version");
|
||||
return ctx->model.mm_model_proj->ne[0];
|
||||
case PROJECTOR_TYPE_GLM_EDGE:
|
||||
return ctx->model.mm_model_mlp_3_w->ne[1];
|
||||
case PROJECTOR_TYPE_QWEN2VL:
|
||||
@@ -4185,6 +4383,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||
return ctx->model.mm_model_proj->ne[1];
|
||||
case PROJECTOR_TYPE_QWEN2A:
|
||||
return ctx->model.mm_fc_w->ne[1];
|
||||
case PROJECTOR_TYPE_LFM2:
|
||||
case PROJECTOR_TYPE_KIMIVL:
|
||||
return ctx->model.mm_2_w->ne[1];
|
||||
default:
|
||||
GGML_ABORT("Unknown projector type");
|
||||
}
|
||||
|
||||
5
llama/llama.cpp/tools/mtmd/clip.h
vendored
5
llama/llama.cpp/tools/mtmd/clip.h
vendored
@@ -82,11 +82,6 @@ struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch
|
||||
*/
|
||||
void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
|
||||
|
||||
bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
|
||||
|
||||
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
|
||||
bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
|
||||
|
||||
/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
|
||||
bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
|
||||
|
||||
|
||||
2
llama/llama.cpp/tools/mtmd/mtmd.cpp
vendored
2
llama/llama.cpp/tools/mtmd/mtmd.cpp
vendored
@@ -217,7 +217,7 @@ struct mtmd_context {
|
||||
tok_row_end_trail = false; // no trailing end-of-row token
|
||||
ov_img_first = true;
|
||||
|
||||
} else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5) {
|
||||
} else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6) {
|
||||
// minicpmv 2.6 format:
|
||||
// <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
|
||||
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;
|
||||
|
||||
Reference in New Issue
Block a user