Mirror of https://github.com/likelovewant/ollama-for-amd.git (synced 2025-12-23 23:18:26 +00:00)
update llama.cpp to f64d44a
llm/ggml.c: 546 changed lines
--- a/llm/ggml.c
+++ b/llm/ggml.c
@@ -1,5 +1,5 @@
 /**
- * llama.cpp - git 8183159cf3def112f6d1fe94815fce70e1bffa12
+ * llama.cpp - git f64d44a9b9581cd58f7ec40f4fa1c3ca5ca18e1e
  *
  * MIT License
  *
@@ -221,8 +221,8 @@ typedef void * thread_ret_t;
 #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
 #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
 #else
-inline static void* ggml_aligned_malloc(size_t size) {
-    void* aligned_memory = NULL;
+inline static void * ggml_aligned_malloc(size_t size) {
+    void * aligned_memory = NULL;
 #ifdef GGML_USE_METAL
     int result = posix_memalign(&aligned_memory, getpagesize(), size);
 #else
@@ -3837,7 +3837,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
 
-static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
+static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -3909,7 +3909,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 59, "GGML_OP_COUNT != 59");
+static_assert(GGML_OP_COUNT == 62, "GGML_OP_COUNT != 62");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -4136,7 +4136,7 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
     //
     // is enough, but just in case, adding the second part
 
-    return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
+    return GGML_PAD(MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]), GGML_MEM_ALIGN);
 }
 
 size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
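Note on the ggml_nbytes change above: the returned size is now rounded up to a multiple of GGML_MEM_ALIGN, so buffers sized via ggml_nbytes() always end on an aligned boundary. A minimal standalone sketch of that round-up arithmetic (the helper name round_up is illustrative, not ggml's; it assumes the alignment is a power of two, as GGML_MEM_ALIGN is):

    #include <assert.h>
    #include <stddef.h>

    // Round x up to the next multiple of align (align must be a power of two).
    static size_t round_up(size_t x, size_t align) {
        return (x + align - 1) & ~(align - 1);
    }

    int main(void) {
        assert(round_up(30, 16) == 32); // 30 bytes pad to the next 16-byte boundary
        assert(round_up(32, 16) == 32); // already-aligned sizes are unchanged
        return 0;
    }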
@@ -4279,7 +4279,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
-static inline bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
+bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
@@ -4628,7 +4628,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.ne           =*/ { 1, 1, 1, 1 },
         /*.nb           =*/ { 0, 0, 0, 0 },
         /*.op           =*/ GGML_OP_NONE,
-        /*.op_params    =*/ {0},
+        /*.op_params    =*/ { 0 },
         /*.is_param     =*/ false,
         /*.grad         =*/ NULL,
         /*.src          =*/ { NULL },
@@ -4660,6 +4660,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
 }
 
 static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) {
+    GGML_ASSERT(tensor != NULL); // silence -Warray-bounds warnings
     assert(params_size <= GGML_MAX_OP_PARAMS);
     memcpy(tensor->op_params, params, params_size);
 }
@@ -6465,7 +6466,7 @@ struct ggml_tensor * ggml_permute(
     result->src[0] = a;
 
     int32_t params[] = { axis0, axis1, axis2, axis3 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     return result;
 }
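This hunk, and the matching ones below for ggml_diag_mask_*, ggml_rope*, ggml_alibi, ggml_clamp, the conv/pool constructors, and ggml_win_part/ggml_win_unpart, all drop the & in front of the array argument. In C the two spellings denote the same address: params decays to int32_t * while &params has type int32_t (*)[4], so the memcpy inside ggml_set_op_params copied the same bytes either way; the new form simply converts to const void * without the type mismatch. A minimal sketch:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        int32_t params[4] = { 0, 1, 2, 3 };
        const void * p1 = params;  // array decays to int32_t *
        const void * p2 = &params; // pointer to the whole array, type int32_t (*)[4]
        printf("%d\n", p1 == p2);  // prints 1: same address, different static type
        return 0;
    }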
@@ -6591,7 +6592,7 @@ static struct ggml_tensor * ggml_diag_mask_inf_impl(
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     int32_t params[] = { n_past, inplace ? 1 : 0 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_DIAG_MASK_INF;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6631,7 +6632,7 @@ static struct ggml_tensor * ggml_diag_mask_zero_impl(
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     int32_t params[] = { n_past, inplace ? 1 : 0 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_DIAG_MASK_ZERO;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6747,9 +6748,9 @@ static struct ggml_tensor * ggml_rope_impl(
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     int32_t params[6] = { n_past, n_dims, mode, n_ctx };
     memcpy(params + 4, &freq_base,  sizeof(float));
     memcpy(params + 5, &freq_scale, sizeof(float));
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_ROPE;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6823,7 +6824,7 @@ struct ggml_tensor * ggml_rope_back(
     struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
 
     int32_t params[] = { n_past, n_dims, mode, n_ctx };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_ROPE_BACK;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6854,7 +6855,7 @@ struct ggml_tensor * ggml_alibi(
 
     int32_t op_params[3] = { n_past, n_head };
     memcpy(op_params + 2, &bias_max, sizeof(float));
-    ggml_set_op_params(result, &op_params, sizeof(op_params));
+    ggml_set_op_params(result, op_params, sizeof(op_params));
 
     result->op   = GGML_OP_ALIBI;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6881,7 +6882,7 @@ struct ggml_tensor * ggml_clamp(
     struct ggml_tensor * result = ggml_view_tensor(ctx, a);
 
     float params[] = { min, max };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_CLAMP;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6916,10 +6917,10 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
         ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
         a->ne[2], 1, 1,
     };
-    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
 
     int32_t params[] = { s0, p0, d0 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_CONV_1D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6931,10 +6932,10 @@ GGML_API struct ggml_tensor * ggml_conv_1d(
 
 // ggml_conv_2d
 
-struct ggml_tensor* ggml_conv_2d(
-    struct ggml_context* ctx,
-    struct ggml_tensor * a,
-    struct ggml_tensor * b,
+struct ggml_tensor * ggml_conv_2d(
+    struct ggml_context * ctx,
+    struct ggml_tensor  * a,
+    struct ggml_tensor  * b,
     int                  s0,
     int                  s1,
     int                  p0,
@@ -6955,10 +6956,10 @@ struct ggml_tensor* ggml_conv_2d(
         ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1),
         a->ne[3], b->ne[3],
     };
-    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
     int32_t params[] = { s0, s1, p0, p1, d0, d1 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_CONV_2D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6971,7 +6972,7 @@ struct ggml_tensor* ggml_conv_2d(
 
 // ggml_conv_1d_ph
 
-struct ggml_tensor* ggml_conv_1d_ph(
+struct ggml_tensor * ggml_conv_1d_ph(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
     struct ggml_tensor * b,
@@ -6989,7 +6990,7 @@ static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) {
 
 // ggml_pool_1d
 
-struct ggml_tensor* ggml_pool_1d(
+struct ggml_tensor * ggml_pool_1d(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
     enum ggml_op_pool op,
@@ -7008,10 +7009,10 @@ struct ggml_tensor* ggml_pool_1d(
         ggml_calc_pool_output_size(a->ne[0], k0, s0, p0),
         a->ne[1],
     };
-    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
 
     int32_t params[] = { op, k0, s0, p0 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_POOL_1D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7022,7 +7023,7 @@ struct ggml_tensor* ggml_pool_1d(
 
 // ggml_pool_2d
 
-struct ggml_tensor* ggml_pool_2d(
+struct ggml_tensor * ggml_pool_2d(
     struct ggml_context * ctx,
     struct ggml_tensor * a,
     enum ggml_op_pool op,
@@ -7045,10 +7046,10 @@ struct ggml_tensor* ggml_pool_2d(
         ggml_calc_pool_output_size(a->ne[1], k1, s1, p1),
         a->ne[2],
     };
-    struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
 
     int32_t params[] = { op, k0, k1, s0, s1, p0, p1 };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_POOL_2D;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7216,7 +7217,7 @@ struct ggml_tensor * ggml_win_part(
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
 
     int32_t params[] = { npx, npy, w };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_WIN_PART;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7246,7 +7247,7 @@ struct ggml_tensor * ggml_win_unpart(
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
 
     int32_t params[] = { w };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, params, sizeof(params));
 
     result->op   = GGML_OP_WIN_UNPART;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -7375,7 +7376,7 @@ struct ggml_tensor * ggml_map_binary_inplace_f32(
     return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
 }
 
-// ggml_map_custom1
+// ggml_map_custom1_f32
 
 static struct ggml_tensor * ggml_map_custom1_impl_f32(
         struct ggml_context * ctx,
@@ -7392,7 +7393,7 @@ static struct ggml_tensor * ggml_map_custom1_impl_f32(
 
     ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
 
-    result->op = GGML_OP_MAP_CUSTOM1;
+    result->op = GGML_OP_MAP_CUSTOM1_F32;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
 
@@ -7413,7 +7414,7 @@ struct ggml_tensor * ggml_map_custom1_inplace_f32(
     return ggml_map_custom1_impl_f32(ctx, a, fun, true);
 }
 
-// ggml_map_custom2
+// ggml_map_custom2_f32
 
 static struct ggml_tensor * ggml_map_custom2_impl_f32(
         struct ggml_context * ctx,
@@ -7431,7 +7432,7 @@ static struct ggml_tensor * ggml_map_custom2_impl_f32(
 
     ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
 
-    result->op = GGML_OP_MAP_CUSTOM2;
+    result->op = GGML_OP_MAP_CUSTOM2_F32;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = b;
@@ -7455,7 +7456,7 @@ struct ggml_tensor * ggml_map_custom2_inplace_f32(
     return ggml_map_custom2_impl_f32(ctx, a, b, fun, true);
 }
 
-// ggml_map_custom3
+// ggml_map_custom3_f32
 
 static struct ggml_tensor * ggml_map_custom3_impl_f32(
         struct ggml_context * ctx,
@@ -7474,7 +7475,7 @@ static struct ggml_tensor * ggml_map_custom3_impl_f32(
 
     ggml_set_op_params(result, (const void *) &fun, sizeof(fun));
 
-    result->op = GGML_OP_MAP_CUSTOM3;
+    result->op = GGML_OP_MAP_CUSTOM3_F32;
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = b;
@@ -7501,6 +7502,190 @@ struct ggml_tensor * ggml_map_custom3_inplace_f32(
     return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true);
 }
 
+// ggml_map_custom1
+
+struct ggml_map_custom1_op_params {
+    ggml_custom1_op_t fun;
+    int n_tasks;
+    void * userdata;
+};
+
+static struct ggml_tensor * ggml_map_custom1_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_custom1_op_t fun,
+        int n_tasks,
+        void * userdata,
+        bool inplace) {
+    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
+
+    bool is_node = false;
+
+    if (!inplace && a->grad) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    struct ggml_map_custom1_op_params params = {
+        /*.fun      =*/ fun,
+        /*.n_tasks  =*/ n_tasks,
+        /*.userdata =*/ userdata
+    };
+    ggml_set_op_params(result, (const void *) &params, sizeof(params));
+
+    result->op = GGML_OP_MAP_CUSTOM1;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom1(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_custom1_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, false);
+}
+
+struct ggml_tensor * ggml_map_custom1_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_custom1_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom1_impl(ctx, a, fun, n_tasks, userdata, true);
+}
+
+// ggml_map_custom2
+
+struct ggml_map_custom2_op_params {
+    ggml_custom2_op_t fun;
+    int n_tasks;
+    void * userdata;
+};
+
+static struct ggml_tensor * ggml_map_custom2_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_custom2_op_t fun,
+        int n_tasks,
+        void * userdata,
+        bool inplace) {
+    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
+
+    bool is_node = false;
+
+    if (!inplace && (a->grad || b->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    struct ggml_map_custom2_op_params params = {
+        /*.fun      =*/ fun,
+        /*.n_tasks  =*/ n_tasks,
+        /*.userdata =*/ userdata
+    };
+    ggml_set_op_params(result, (const void *) &params, sizeof(params));
+
+    result->op = GGML_OP_MAP_CUSTOM2;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom2(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_custom2_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, false);
+}
+
+struct ggml_tensor * ggml_map_custom2_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_custom2_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom2_impl(ctx, a, b, fun, n_tasks, userdata, true);
+}
+
+// ggml_map_custom3
+
+struct ggml_map_custom3_op_params {
+    ggml_custom3_op_t fun;
+    int n_tasks;
+    void * userdata;
+};
+
+static struct ggml_tensor * ggml_map_custom3_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        const ggml_custom3_op_t fun,
+        int n_tasks,
+        void * userdata,
+        bool inplace) {
+    GGML_ASSERT(n_tasks == GGML_N_TASKS_MAX || n_tasks > 0);
+
+    bool is_node = false;
+
+    if (!inplace && (a->grad || b->grad || c->grad)) {
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    struct ggml_map_custom3_op_params params = {
+        /*.fun      =*/ fun,
+        /*.n_tasks  =*/ n_tasks,
+        /*.userdata =*/ userdata
+    };
+    ggml_set_op_params(result, (const void *) &params, sizeof(params));
+
+    result->op = GGML_OP_MAP_CUSTOM3;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom3(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        const ggml_custom3_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, false);
+}
+
+struct ggml_tensor * ggml_map_custom3_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        struct ggml_tensor * c,
+        const ggml_custom3_op_t fun,
+        int n_tasks,
+        void * userdata) {
+    return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true);
+}
+
+
 // ggml_cross_entropy_loss
 
 struct ggml_tensor * ggml_cross_entropy_loss(
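The block above introduces the generic ggml_map_custom1/2/3 API: the callback, a requested task count, and an opaque userdata pointer are packed into a small struct and stored in the tensor's op_params; at compute time (see the forward hunks further down) the callback is invoked on every thread with its thread index ith out of nth. A hedged usage sketch, assuming the ggml_custom1_op_t signature visible in this diff; scale_op and the contiguous-2D-f32 layout are our illustrative assumptions, not part of the patch:

    #include "ggml.h"

    // Hypothetical custom op: multiply every element of a contiguous 2D f32
    // tensor by the scale passed through userdata, splitting rows over threads.
    static void scale_op(struct ggml_tensor * dst, const struct ggml_tensor * a,
                         int ith, int nth, void * userdata) {
        const float scale = *(const float *) userdata;
        const int64_t nr  = ggml_nrows(a);        // total rows
        for (int64_t r = ith; r < nr; r += nth) { // round-robin rows over threads
            const float * src = (const float *) ((const char *) a->data + r*a->nb[1]);
            float * out = (float *) ((char *) dst->data + r*dst->nb[1]);
            for (int64_t c = 0; c < a->ne[0]; ++c) {
                out[c] = scale * src[c];
            }
        }
    }

    // During graph construction:
    //   static float scale = 2.0f;
    //   struct ggml_tensor * y = ggml_map_custom1(ctx, x, scale_op, GGML_N_TASKS_MAX, &scale);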
@@ -9309,8 +9494,8 @@ static void ggml_compute_forward_sum_rows_f32(
     for (int64_t i3 = 0; i3 < ne03; i3++) {
         for (int64_t i2 = 0; i2 < ne02; i2++) {
             for (int64_t i1 = 0; i1 < ne01; i1++) {
-                float* src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03);
-                float* dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3);
+                float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03);
+                float * dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3);
                 float row_sum = 0;
                 ggml_vec_sum_f32(ne00, &row_sum, src_row);
                 dst_row[0] = row_sum;
@@ -10572,72 +10757,96 @@ static void ggml_compute_forward_mul_mat(
         return;
     }
 
-    // parallelize by src0 rows
-    const int64_t dr = (ne01 + nth - 1)/nth;
-
-    const int64_t ir10 = dr*ith;
-    const int64_t ir11 = MIN(ir10 + dr, ne01);
-
-    // src1 rows
-    const int64_t nr1 = ne11*ne12*ne13;
-
     const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
     const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
 
-    for (int64_t ir1 = 0; ir1 < nr1; ++ir1) {
-        const int64_t i13 = (ir1/(ne12*ne11));
-        const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
-        const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
-
-        const int64_t ir0 = (ir1/ne11)%(ne02*ne03);
-        const int64_t i03 = (ir0/(ne02));
-        // Hack for "Falcon multi-query-attention key stutter" / alternative to ggml_repeat2.
-        // See https://github.com/ggerganov/llama.cpp/issues/1602#issuecomment-1606087470:
-        // GG: this is likely the correct way to broadcast, though need some more thought
-        //     therefore leaving the comments to remind us for now
-        const int64_t i02 = (i12 / (ne12 / ne02));
-        // Original from PR/224 (and also essential/correct for non-broadcast matmuls in Falcon)
-        // const int64_t i02 = (ir0 - i03*ne02);
-
-        const int64_t i1 = i11;
-        const int64_t i2 = i12;
-        const int64_t i3 = i13;
-
-        const char * src0_row = (const char *) src0->data + ( 0 + i02*nb02 + i03*nb03 );
-
-        // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
-        //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
-        //       the original src1 data pointer, so we should index using the indices directly
-        // TODO: this is a bit of a hack, we should probably have a better way to handle this
-        const char * src1_col = (const char *) wdata +
-            (src1_cont || src1->type != vec_dot_type
-             ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
-             : (i11*nb11 + i12*nb12 + i13*nb13));
-
-        float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
-
-        for (int64_t ir = ir10; ir < ir11; ++ir) {
-            vec_dot(ne00, &dst_col[ir], src0_row + ir*nb01, src1_col);
-        }
-    }
-
-    //int64_t t1 = ggml_time_us();
-    //static int64_t acc = 0;
-    //acc += t1 - t0;
-    //if (t1 - t0 > 10) {
-    //    printf("\n");
-    //    printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
-    //    printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
-    //    printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
-
-    //    printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
-    //}
+    const int64_t nr0 = ne01;           // src0 rows
+    const int64_t nr1 = ne11*ne12*ne13; // src1 rows
+
+    //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
+
+    // distribute the thread work across the inner or outer loop based on which one is larger
+
+    const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
+    const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
+
+    const int64_t ith0 = ith % nth0;
+    const int64_t ith1 = ith / nth0;
+
+    const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
+    const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
+
+    const int64_t ir010 = dr0*ith0;
+    const int64_t ir011 = MIN(ir010 + dr0, nr0);
+
+    const int64_t ir110 = dr1*ith1;
+    const int64_t ir111 = MIN(ir110 + dr1, nr1);
+
+    //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
+
+    // threads with no work simply yield (not sure if it helps)
+    if (ir010 >= ir011 || ir110 >= ir111) {
+        sched_yield();
+        return;
+    }
+
+    assert(ne12 % ne02 == 0);
+    assert(ne13 % ne03 == 0);
+
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
+    // block-tiling attempt
+    const int64_t blck_0 = 16;
+    const int64_t blck_1 = 16;
+
+    // attempt to reduce false-sharing (does not seem to make a difference)
+    float tmp[16];
+
+    for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
+        for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
+            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
+                const int64_t i13 = (ir1/(ne12*ne11));
+                const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
+                const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
+
+                // broadcast src0 into src1
+                const int64_t i03 = i13/r3;
+                const int64_t i02 = i12/r2;
+
+                const int64_t i1 = i11;
+                const int64_t i2 = i12;
+                const int64_t i3 = i13;
+
+                const char * src0_row = (const char *) src0->data + (0 + i02*nb02 + i03*nb03);
+
+                // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+                //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+                //       the original src1 data pointer, so we should index using the indices directly
+                // TODO: this is a bit of a hack, we should probably have a better way to handle this
+                const char * src1_col = (const char *) wdata +
+                    (src1_cont || src1->type != vec_dot_type
+                     ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size
+                     : (i11*nb11 + i12*nb12 + i13*nb13));
+
+                float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3));
+
+                //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
+                //}
+
+                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
+                    vec_dot(ne00, &tmp[ir0 - iir0], src0_row + ir0*nb01, src1_col);
+                }
+                memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
+            }
+        }
+    }
 }
 
 
 // ggml_compute_forward_out_prod
 
 static void ggml_compute_forward_out_prod_f32(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * src0,
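The rewritten ggml_compute_forward_mul_mat above no longer always splits src0 rows across threads; it parallelizes over whichever of src0 rows (nr0) or src1 rows (nr1) is larger, and each thread gets a [ir010, ir011) x [ir110, ir111) rectangle of row pairs. The index arithmetic, lifted out of the kernel into a small standalone sketch:

    #include <stdint.h>
    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    // Mirrors the 2D work split in the hunk above: thread `ith` of `nth`
    // is assigned a rectangle of (src0 row, src1 row) pairs.
    static void split_work(int64_t nr0, int64_t nr1, int64_t ith, int64_t nth) {
        const int64_t nth0 = nr0 > nr1 ? nth : 1; // threads along src0 rows
        const int64_t nth1 = nr0 > nr1 ? 1 : nth; // threads along src1 rows

        const int64_t ith0 = ith % nth0;
        const int64_t ith1 = ith / nth0;

        const int64_t dr0 = (nr0 + nth0 - 1)/nth0; // rows per thread, rounded up
        const int64_t dr1 = (nr1 + nth1 - 1)/nth1;

        const int64_t ir010 = dr0*ith0;
        const int64_t ir011 = MIN(ir010 + dr0, nr0);
        const int64_t ir110 = dr1*ith1;
        const int64_t ir111 = MIN(ir110 + dr1, nr1);

        printf("thread %lld: src0 rows [%lld, %lld), src1 rows [%lld, %lld)\n",
               (long long) ith, (long long) ir010, (long long) ir011,
               (long long) ir110, (long long) ir111);
    }

    int main(void) {
        // nr0 > nr1, so all four threads split the 4096 src0 rows.
        for (int64_t ith = 0; ith < 4; ++ith) {
            split_work(4096, 32, ith, 4);
        }
        return 0;
    }

The inner loops then walk each thread's rectangle in 16x16 blocks (blck_0/blck_1) and accumulate into a local tmp[16] before a single memcpy per block, which is the false-sharing mitigation the comment mentions.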
@@ -12920,7 +13129,7 @@ static void ggml_compute_forward_pool_1d(
         const struct ggml_tensor * src0,
         struct ggml_tensor * dst) {
 
-    const int32_t* opts = (const int32_t*)dst->op_params;
+    const int32_t * opts = (const int32_t *)dst->op_params;
     enum ggml_op_pool op = opts[0];
     const int k0 = opts[1];
     const int s0 = opts[2];
@@ -14253,24 +14462,6 @@ static void ggml_compute_forward_map_custom1_f32(
     fun(dst, a);
 }
 
-
-static void ggml_compute_forward_map_custom1(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * a,
-        struct ggml_tensor * dst,
-        const ggml_custom1_op_f32_t fun) {
-    switch (a->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_map_custom1_f32(params, a, dst, fun);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
 // ggml_compute_forward_map_custom2
 
 static void ggml_compute_forward_map_custom2_f32(
@@ -14289,24 +14480,6 @@ static void ggml_compute_forward_map_custom2_f32(
 }
 
-
-static void ggml_compute_forward_map_custom2(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * a,
-        const struct ggml_tensor * b,
-        struct ggml_tensor * dst,
-        const ggml_custom2_op_f32_t fun) {
-    switch (a->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_map_custom2_f32(params, a, b, dst, fun);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
 // ggml_compute_forward_map_custom3
 
 static void ggml_compute_forward_map_custom3_f32(
@@ -14325,24 +14498,52 @@ static void ggml_compute_forward_map_custom3_f32(
     fun(dst, a, b, c);
 }
 
+// ggml_compute_forward_map_custom1
+
+static void ggml_compute_forward_map_custom1(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        struct ggml_tensor * dst) {
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) dst->op_params;
+
+    p->fun(dst, a, params->ith, params->nth, p->userdata);
+}
+
+// ggml_compute_forward_map_custom2
+
+static void ggml_compute_forward_map_custom2(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * a,
+        const struct ggml_tensor * b,
+        struct ggml_tensor * dst) {
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) dst->op_params;
+
+    p->fun(dst, a, b, params->ith, params->nth, p->userdata);
+}
+
+// ggml_compute_forward_map_custom3
+
 static void ggml_compute_forward_map_custom3(
         const struct ggml_compute_params * params,
         const struct ggml_tensor * a,
         const struct ggml_tensor * b,
         const struct ggml_tensor * c,
-        struct ggml_tensor * dst,
-        const ggml_custom3_op_f32_t fun) {
-    switch (a->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_map_custom3_f32(params, a, b, c, dst, fun);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
+        struct ggml_tensor * dst) {
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) dst->op_params;
+
+    p->fun(dst, a, b, c, params->ith, params->nth, p->userdata);
 }
 
 // ggml_compute_forward_cross_entropy_loss
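Worth noting in the three forward functions above: the op parameters are recovered by casting the tensor's op_params byte area back to the struct stored at construction time. A minimal round-trip sketch (names here are illustrative, not ggml's; ggml's real op_params field is an int32_t array, which also provides the storage alignment):

    #include <stdio.h>
    #include <string.h>

    struct op_params_t { int n_tasks; void * userdata; };

    int main(void) {
        // Stand-in for tensor->op_params (a fixed GGML_MAX_OP_PARAMS-byte area).
        char op_params[64];

        struct op_params_t in = { 4, NULL };
        memcpy(op_params, &in, sizeof(in)); // what ggml_set_op_params does

        // What the compute function does: cast the byte area back.
        struct op_params_t * out = (struct op_params_t *) op_params;
        printf("n_tasks = %d\n", out->n_tasks); // prints: n_tasks = 4

        return 0;
    }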
@@ -14864,25 +15065,40 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
                 ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun);
             }
             break;
-        case GGML_OP_MAP_CUSTOM1:
+        case GGML_OP_MAP_CUSTOM1_F32:
             {
                 ggml_custom1_op_f32_t fun;
                 memcpy(&fun, tensor->op_params, sizeof(fun));
-                ggml_compute_forward_map_custom1(params, tensor->src[0], tensor, fun);
+                ggml_compute_forward_map_custom1_f32(params, tensor->src[0], tensor, fun);
             }
             break;
+        case GGML_OP_MAP_CUSTOM2_F32:
+            {
+                ggml_custom2_op_f32_t fun;
+                memcpy(&fun, tensor->op_params, sizeof(fun));
+                ggml_compute_forward_map_custom2_f32(params, tensor->src[0], tensor->src[1], tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM3_F32:
+            {
+                ggml_custom3_op_f32_t fun;
+                memcpy(&fun, tensor->op_params, sizeof(fun));
+                ggml_compute_forward_map_custom3_f32(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM1:
+            {
+                ggml_compute_forward_map_custom1(params, tensor->src[0], tensor);
+            }
+            break;
         case GGML_OP_MAP_CUSTOM2:
             {
-                ggml_custom2_op_f32_t fun;
-                memcpy(&fun, tensor->op_params, sizeof(fun));
-                ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor, fun);
+                ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor);
             }
             break;
         case GGML_OP_MAP_CUSTOM3:
             {
-                ggml_custom3_op_f32_t fun;
-                memcpy(&fun, tensor->op_params, sizeof(fun));
-                ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun);
+                ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor);
             }
             break;
         case GGML_OP_CROSS_ENTROPY_LOSS:
@@ -15690,6 +15906,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) {
             } break;
         case GGML_OP_MAP_UNARY:
         case GGML_OP_MAP_BINARY:
+        case GGML_OP_MAP_CUSTOM1_F32:
+        case GGML_OP_MAP_CUSTOM2_F32:
+        case GGML_OP_MAP_CUSTOM3_F32:
         case GGML_OP_MAP_CUSTOM1:
        case GGML_OP_MAP_CUSTOM2:
         case GGML_OP_MAP_CUSTOM3:
@@ -16475,12 +16694,39 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
             case GGML_OP_WIN_UNPART:
             case GGML_OP_MAP_UNARY:
             case GGML_OP_MAP_BINARY:
-            case GGML_OP_MAP_CUSTOM1:
-            case GGML_OP_MAP_CUSTOM2:
-            case GGML_OP_MAP_CUSTOM3:
+            case GGML_OP_MAP_CUSTOM1_F32:
+            case GGML_OP_MAP_CUSTOM2_F32:
+            case GGML_OP_MAP_CUSTOM3_F32:
                 {
                     n_tasks = 1;
                 } break;
+            case GGML_OP_MAP_CUSTOM1:
+                {
+                    struct ggml_map_custom1_op_params * p = (struct ggml_map_custom1_op_params *) node->op_params;
+                    if (p->n_tasks == GGML_N_TASKS_MAX) {
+                        n_tasks = n_threads;
+                    } else {
+                        n_tasks = MIN(p->n_tasks, n_threads);
+                    }
+                } break;
+            case GGML_OP_MAP_CUSTOM2:
+                {
+                    struct ggml_map_custom2_op_params * p = (struct ggml_map_custom2_op_params *) node->op_params;
+                    if (p->n_tasks == GGML_N_TASKS_MAX) {
+                        n_tasks = n_threads;
+                    } else {
+                        n_tasks = MIN(p->n_tasks, n_threads);
+                    }
+                } break;
+            case GGML_OP_MAP_CUSTOM3:
+                {
+                    struct ggml_map_custom3_op_params * p = (struct ggml_map_custom3_op_params *) node->op_params;
+                    if (p->n_tasks == GGML_N_TASKS_MAX) {
+                        n_tasks = n_threads;
+                    } else {
+                        n_tasks = MIN(p->n_tasks, n_threads);
+                    }
+                } break;
             case GGML_OP_CROSS_ENTROPY_LOSS:
                 {
                     n_tasks = n_threads;
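The new GGML_OP_MAP_CUSTOM1/2/3 cases above are the only ops whose task count comes from the user: GGML_N_TASKS_MAX requests every thread in the pool, and any positive request is clamped to n_threads. The selection logic as a standalone sketch (we assume GGML_N_TASKS_MAX is a negative sentinel, which the n_tasks > 0 assertion in the constructors suggests):

    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))
    #define N_TASKS_MAX (-1) // stand-in for GGML_N_TASKS_MAX

    static int n_tasks_for(int requested, int n_threads) {
        return requested == N_TASKS_MAX ? n_threads : MIN(requested, n_threads);
    }

    int main(void) {
        printf("%d\n", n_tasks_for(N_TASKS_MAX, 8)); // 8: use the whole pool
        printf("%d\n", n_tasks_for(2, 8));           // 2: capped at the request
        printf("%d\n", n_tasks_for(16, 8));          // 8: clamped to pool size
        return 0;
    }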