Merge branch 'ollama:main' into main

This commit is contained in:
likelovewant
2024-07-27 12:18:55 +08:00
committed by GitHub
31 changed files with 295 additions and 31 deletions

View File

@@ -41,6 +41,7 @@
#if defined(_WIN32)
#include <windows.h>
#include <errhandlingapi.h>
#endif
#include <cstddef>
@@ -2737,6 +2738,9 @@ int wmain(int argc, wchar_t **wargv) {
for (int i = 0; i < argc; ++i) {
argv[i] = wchar_to_char(wargv[i]);
}
// Adjust error mode to avoid error dialog after we start.
SetErrorMode(SEM_FAILCRITICALERRORS);
#else
int main(int argc, char **argv) {
#endif

View File

@@ -2,7 +2,10 @@ package llm
import (
"embed"
"syscall"
)
//go:embed build/darwin/x86_64/*/bin/*
var libEmbed embed.FS
var LlamaServerSysProcAttr = &syscall.SysProcAttr{}

View File

@@ -2,7 +2,10 @@ package llm
import (
"embed"
"syscall"
)
//go:embed build/darwin/arm64/*/bin/*
var libEmbed embed.FS
var LlamaServerSysProcAttr = &syscall.SysProcAttr{}

View File

@@ -1,6 +1,11 @@
package llm
import "embed"
import (
"embed"
"syscall"
)
//go:embed build/linux/*/*/bin/*
var libEmbed embed.FS
var LlamaServerSysProcAttr = &syscall.SysProcAttr{}

View File

@@ -1,6 +1,20 @@
package llm
import "embed"
import (
"embed"
"syscall"
)
// unused on windows
var libEmbed embed.FS
const CREATE_DEFAULT_ERROR_MODE = 0x04000000
var LlamaServerSysProcAttr = &syscall.SysProcAttr{
// Wire up the default error handling logic If for some reason a DLL is
// missing in the path this will pop up a GUI Dialog explaining the fault so
// the user can either fix their PATH, or report a bug. Without this
// setting, the process exits immediately with a generic exit status but no
// way to (easily) figure out what the actual missing DLL was.
CreationFlags: CREATE_DEFAULT_ERROR_MODE,
}

View File

@@ -0,0 +1,70 @@
From 2f872f294fb6f5c6e8f983b68c40ea656053dd92 Mon Sep 17 00:00:00 2001
From: Michael Yang <mxyng@pm.me>
Date: Tue, 23 Jul 2024 14:33:29 -0700
Subject: [PATCH] llama 3.1 rope scaling
---
src/llama.cpp | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/src/llama.cpp b/src/llama.cpp
index 8fe51971..a9969df8 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2472,6 +2472,7 @@ struct llama_layer {
// long rope factors
struct ggml_tensor * rope_long = nullptr;
struct ggml_tensor * rope_short = nullptr;
+ struct ggml_tensor * rope_freqs = nullptr;
// bitnet scale
struct ggml_tensor * wq_scale;
@@ -6143,6 +6144,8 @@ static bool llm_load_tensors(
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), { n_embd/n_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+
if (n_expert == 0) {
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
@@ -8620,6 +8623,10 @@ struct llm_build_context {
// choose long/short freq factors based on the context size
const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
+ if (model.layers[il].rope_freqs != nullptr) {
+ return model.layers[il].rope_freqs;
+ }
+
if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
return model.layers[il].rope_long;
}
@@ -8814,6 +8821,9 @@ struct llm_build_context {
// self-attention
{
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ struct ggml_tensor * rope_factors = build_rope_factors(il);
+
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
cb(Qcur, "Qcur", il);
@@ -8837,14 +8847,14 @@ struct llm_build_context {
}
Qcur = ggml_rope_ext(
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+ ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
Kcur = ggml_rope_ext(
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+ ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
--
2.45.2

View File

@@ -346,6 +346,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
s.cmd.Env = os.Environ()
s.cmd.Stdout = os.Stdout
s.cmd.Stderr = s.status
s.cmd.SysProcAttr = LlamaServerSysProcAttr
envWorkarounds := [][2]string{}
for _, gpu := range gpus {