mirror of
https://github.com/likelovewant/ollama-for-amd.git
synced 2025-12-23 23:18:26 +00:00
Merge branch 'ollama:main' into main
This commit is contained in:
10
.github/workflows/release.yaml
vendored
10
.github/workflows/release.yaml
vendored
@@ -31,7 +31,7 @@ jobs:
|
||||
security set-keychain-settings -lut 3600 build.keychain
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version-file: go.mod
|
||||
go-version: "stable"
|
||||
cache: true
|
||||
- name: Build Darwin
|
||||
env:
|
||||
@@ -87,7 +87,7 @@ jobs:
|
||||
write-host "plugin installed"
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version-file: go.mod
|
||||
go-version: "stable"
|
||||
cache: true
|
||||
- run: go get ./...
|
||||
- run: |
|
||||
@@ -141,7 +141,7 @@ jobs:
|
||||
write-host "plugin installed"
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version-file: go.mod
|
||||
go-version: "stable"
|
||||
cache: true
|
||||
- name: 'Install ROCm'
|
||||
run: |
|
||||
@@ -218,7 +218,7 @@ jobs:
|
||||
write-host "plugin installed"
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version-file: go.mod
|
||||
go-version: "stable"
|
||||
cache: true
|
||||
- name: 'Install CUDA'
|
||||
run: |
|
||||
@@ -306,7 +306,7 @@ jobs:
|
||||
write-host "plugin installed"
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version-file: go.mod
|
||||
go-version: "stable"
|
||||
cache: true
|
||||
- run: go get
|
||||
- uses: actions/download-artifact@v4
|
||||
|
||||
10
.github/workflows/test.yaml
vendored
10
.github/workflows/test.yaml
vendored
@@ -63,7 +63,7 @@ jobs:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version-file: go.mod
|
||||
go-version: "stable"
|
||||
cache: true
|
||||
- run: go get ./...
|
||||
- run: |
|
||||
@@ -163,7 +163,7 @@ jobs:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version-file: go.mod
|
||||
go-version: "stable"
|
||||
cache: true
|
||||
- name: 'Install ROCm'
|
||||
run: |
|
||||
@@ -200,7 +200,7 @@ jobs:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version-file: go.mod
|
||||
go-version: "stable"
|
||||
cache: true
|
||||
- name: 'Install CUDA'
|
||||
run: |
|
||||
@@ -255,7 +255,7 @@ jobs:
|
||||
submodules: recursive
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version-file: go.mod
|
||||
go-version: "stable"
|
||||
cache: false
|
||||
- run: |
|
||||
case ${{ matrix.arch }} in
|
||||
@@ -297,7 +297,7 @@ jobs:
|
||||
submodules: recursive
|
||||
- uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version-file: go.mod
|
||||
go-version: "stable"
|
||||
cache: true
|
||||
- run: |
|
||||
case ${{ matrix.arch }} in
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
ARG GOLANG_VERSION=1.22.1
|
||||
ARG GOLANG_VERSION=1.22.5
|
||||
ARG CMAKE_VERSION=3.22.1
|
||||
# this CUDA_VERSION corresponds with the one specified in docs/gpu.md
|
||||
ARG CUDA_VERSION=11.3.1
|
||||
|
||||
@@ -1344,7 +1344,6 @@ func NewCLI() *cobra.Command {
|
||||
envVars["OLLAMA_TMPDIR"],
|
||||
envVars["OLLAMA_FLASH_ATTENTION"],
|
||||
envVars["OLLAMA_LLM_LIBRARY"],
|
||||
envVars["OLLAMA_MAX_VRAM"],
|
||||
})
|
||||
default:
|
||||
appendEnvDocs(cmd, envs)
|
||||
|
||||
@@ -71,6 +71,11 @@ func (m *MistralModel) WriteGGUF(ws io.WriteSeeker) error {
|
||||
"tokenizer.ggml.unknown_token_id": uint32(0),
|
||||
}
|
||||
|
||||
if m.Params.HeadDimension > 0 {
|
||||
kv["llama.attention.key_length"] = uint32(m.Params.HeadDimension)
|
||||
kv["llama.attention.value_length"] = uint32(m.Params.HeadDimension)
|
||||
}
|
||||
|
||||
return llm.NewGGUFV3(m.Params.ByteOrder).Encode(ws, kv, m.Tensors)
|
||||
}
|
||||
|
||||
|
||||
84
docs/api.md
84
docs/api.md
@@ -1026,7 +1026,7 @@ If `stream` is set to `false`, then the response is a single JSON object:
|
||||
## Generate Embeddings
|
||||
|
||||
```shell
|
||||
POST /api/embeddings
|
||||
POST /api/embed
|
||||
```
|
||||
|
||||
Generate embeddings from a model
|
||||
@@ -1034,10 +1034,11 @@ Generate embeddings from a model
|
||||
### Parameters
|
||||
|
||||
- `model`: name of model to generate embeddings from
|
||||
- `prompt`: text to generate embeddings for
|
||||
- `input`: text or list of text to generate embeddings for
|
||||
|
||||
Advanced parameters:
|
||||
|
||||
- `truncate`: truncates the end of each input to fit within context length. Returns error if `false` and context length is exceeded. Defaults to `true`
|
||||
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
|
||||
- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
|
||||
|
||||
@@ -1046,9 +1047,9 @@ Advanced parameters:
|
||||
#### Request
|
||||
|
||||
```shell
|
||||
curl http://localhost:11434/api/embeddings -d '{
|
||||
curl http://localhost:11434/api/embed -d '{
|
||||
"model": "all-minilm",
|
||||
"prompt": "Here is an article about llamas..."
|
||||
"input": "Why is the sky blue?"
|
||||
}'
|
||||
```
|
||||
|
||||
@@ -1056,10 +1057,35 @@ curl http://localhost:11434/api/embeddings -d '{
|
||||
|
||||
```json
|
||||
{
|
||||
"embedding": [
|
||||
0.5670403838157654, 0.009260174818336964, 0.23178744316101074, -0.2916173040866852, -0.8924556970596313,
|
||||
0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281
|
||||
]
|
||||
"model": "all-minilm",
|
||||
"embeddings": [[
|
||||
0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814,
|
||||
0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348
|
||||
]]
|
||||
}
|
||||
```
|
||||
|
||||
#### Request (Multiple input)
|
||||
|
||||
```shell
|
||||
curl http://localhost:11434/api/embed -d '{
|
||||
"model": "all-minilm",
|
||||
"input": ["Why is the sky blue?", "Why is the grass green?"]
|
||||
}'
|
||||
```
|
||||
|
||||
#### Response
|
||||
|
||||
```json
|
||||
{
|
||||
"model": "all-minilm",
|
||||
"embeddings": [[
|
||||
0.010071029, -0.0017594862, 0.05007221, 0.04692972, 0.054916814,
|
||||
0.008599704, 0.105441414, -0.025878139, 0.12958129, 0.031952348
|
||||
],[
|
||||
-0.0098027075, 0.06042469, 0.025257962, -0.006364387, 0.07272725,
|
||||
0.017194884, 0.09032035, -0.051705178, 0.09951512, 0.09072481
|
||||
]]
|
||||
}
|
||||
```
|
||||
|
||||
@@ -1106,3 +1132,45 @@ A single JSON object will be returned.
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
## Generate Embedding
|
||||
|
||||
> Note: this endpoint has been superseded by `/api/embed`
|
||||
|
||||
```shell
|
||||
POST /api/embeddings
|
||||
```
|
||||
|
||||
Generate embeddings from a model
|
||||
|
||||
### Parameters
|
||||
|
||||
- `model`: name of model to generate embeddings from
|
||||
- `prompt`: text to generate embeddings for
|
||||
|
||||
Advanced parameters:
|
||||
|
||||
- `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
|
||||
- `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
|
||||
|
||||
### Examples
|
||||
|
||||
#### Request
|
||||
|
||||
```shell
|
||||
curl http://localhost:11434/api/embeddings -d '{
|
||||
"model": "all-minilm",
|
||||
"prompt": "Here is an article about llamas..."
|
||||
}'
|
||||
```
|
||||
|
||||
#### Response
|
||||
|
||||
```json
|
||||
{
|
||||
"embedding": [
|
||||
0.5670403838157654, 0.009260174818336964, 0.23178744316101074, -0.2916173040866852, -0.8924556970596313,
|
||||
0.8785552978515625, -0.34576427936553955, 0.5742510557174683, -0.04222835972905159, -0.137906014919281
|
||||
]
|
||||
}
|
||||
```
|
||||
@@ -43,8 +43,6 @@ var (
|
||||
MaxRunners int
|
||||
// Set via OLLAMA_MAX_QUEUE in the environment
|
||||
MaxQueuedRequests int
|
||||
// Set via OLLAMA_MAX_VRAM in the environment
|
||||
MaxVRAM uint64
|
||||
// Set via OLLAMA_MODELS in the environment
|
||||
ModelsDir string
|
||||
// Set via OLLAMA_NOHISTORY in the environment
|
||||
@@ -89,7 +87,6 @@ func AsMap() map[string]EnvVar {
|
||||
"OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary, "Set LLM library to bypass autodetection"},
|
||||
"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"},
|
||||
"OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"},
|
||||
"OLLAMA_MAX_VRAM": {"OLLAMA_MAX_VRAM", MaxVRAM, "Maximum VRAM"},
|
||||
"OLLAMA_MODELS": {"OLLAMA_MODELS", ModelsDir, "The path to the models directory"},
|
||||
"OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"},
|
||||
"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"},
|
||||
@@ -194,16 +191,6 @@ func LoadConfig() {
|
||||
|
||||
TmpDir = clean("OLLAMA_TMPDIR")
|
||||
|
||||
userLimit := clean("OLLAMA_MAX_VRAM")
|
||||
if userLimit != "" {
|
||||
avail, err := strconv.ParseUint(userLimit, 10, 64)
|
||||
if err != nil {
|
||||
slog.Error("invalid setting, ignoring", "OLLAMA_MAX_VRAM", userLimit, "error", err)
|
||||
} else {
|
||||
MaxVRAM = avail
|
||||
}
|
||||
}
|
||||
|
||||
LLMLibrary = clean("OLLAMA_LLM_LIBRARY")
|
||||
|
||||
if onp := clean("OLLAMA_NUM_PARALLEL"); onp != "" {
|
||||
|
||||
@@ -69,7 +69,7 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
|
||||
reqLimit := len(req)
|
||||
iterLimit := 5
|
||||
|
||||
vram := os.Getenv("OLLAMA_MAX_VRAM")
|
||||
vram := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
|
||||
if vram != "" {
|
||||
max, err := strconv.ParseUint(vram, 10, 64)
|
||||
require.NoError(t, err)
|
||||
@@ -106,7 +106,7 @@ func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
|
||||
|
||||
// Stress the system if we know how much VRAM it has, and attempt to load more models than will fit
|
||||
func TestMultiModelStress(t *testing.T) {
|
||||
vram := os.Getenv("OLLAMA_MAX_VRAM")
|
||||
vram := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
|
||||
if vram == "" {
|
||||
t.Skip("OLLAMA_MAX_VRAM not specified, can't pick the right models for the stress test")
|
||||
}
|
||||
|
||||
Submodule llm/llama.cpp updated: a8db2a9ce6...d94c6e0ccb
@@ -1,8 +1,8 @@
|
||||
diff --git a/src/llama.cpp b/src/llama.cpp
|
||||
index 2b9ace28..172640e2 100644
|
||||
index 8fe51971..7113ba64 100644
|
||||
--- a/src/llama.cpp
|
||||
+++ b/src/llama.cpp
|
||||
@@ -5357,16 +5357,7 @@ static void llm_load_vocab(
|
||||
@@ -5433,16 +5433,7 @@ static void llm_load_vocab(
|
||||
if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
|
||||
vocab.tokenizer_add_space_prefix = false;
|
||||
vocab.tokenizer_clean_spaces = true;
|
||||
@@ -20,9 +20,9 @@ index 2b9ace28..172640e2 100644
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
} else if (
|
||||
tokenizer_pre == "llama3" ||
|
||||
@@ -5439,7 +5430,8 @@ static void llm_load_vocab(
|
||||
tokenizer_pre == "jais") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
|
||||
@@ -5526,7 +5517,8 @@ static void llm_load_vocab(
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
|
||||
vocab.tokenizer_clean_spaces = false;
|
||||
} else {
|
||||
- throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
||||
+ LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
|
||||
|
||||
@@ -1,13 +0,0 @@
|
||||
diff --git a/src/llama.cpp b/src/llama.cpp
|
||||
index 40d2ec2c..f34eb79a 100644
|
||||
--- a/src/llama.cpp
|
||||
+++ b/src/llama.cpp
|
||||
@@ -6943,7 +6943,7 @@ static struct ggml_tensor * llm_build_kqv(
|
||||
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
|
||||
cb(kq, "kq", il);
|
||||
|
||||
- if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
|
||||
+ if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX || model.arch == LLM_ARCH_QWEN2) {
|
||||
// for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
|
||||
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
||||
360
llm/patches/09-lora.diff
Normal file
360
llm/patches/09-lora.diff
Normal file
@@ -0,0 +1,360 @@
|
||||
diff --git a/common/common.cpp b/common/common.cpp
|
||||
index dbb724fb..c26fe6ee 100644
|
||||
--- a/common/common.cpp
|
||||
+++ b/common/common.cpp
|
||||
@@ -2087,14 +2087,29 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
||||
for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
|
||||
const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
|
||||
float lora_scale = std::get<1>(params.lora_adapter[i]);
|
||||
+
|
||||
+ // try to load as gguf
|
||||
auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
|
||||
if (adapter == nullptr) {
|
||||
- fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
|
||||
- llama_free(lctx);
|
||||
- llama_free_model(model);
|
||||
- return std::make_tuple(nullptr, nullptr);
|
||||
+ fprintf(stderr, "%s: error: failed to apply lora adapter, trying ggla\n", __func__);
|
||||
+
|
||||
+ // if that fails, try loading as ggla for compatibility
|
||||
+ int err = llama_model_apply_lora_from_file(model,
|
||||
+ lora_adapter.c_str(),
|
||||
+ lora_scale,
|
||||
+ ((i > 0) || params.lora_base.empty())
|
||||
+ ? NULL
|
||||
+ : params.lora_base.c_str(),
|
||||
+ params.n_threads);
|
||||
+ if (err != 0) {
|
||||
+ fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
|
||||
+ llama_free(lctx);
|
||||
+ llama_free_model(model);
|
||||
+ return std::make_tuple(nullptr, nullptr);
|
||||
+ }
|
||||
+ } else {
|
||||
+ llama_lora_adapter_set(lctx, adapter, lora_scale);
|
||||
}
|
||||
- llama_lora_adapter_set(lctx, adapter, lora_scale);
|
||||
}
|
||||
|
||||
if (params.ignore_eos) {
|
||||
diff --git a/include/llama.h b/include/llama.h
|
||||
index 93fd77ca..b0fb37a6 100644
|
||||
--- a/include/llama.h
|
||||
+++ b/include/llama.h
|
||||
@@ -1160,6 +1160,20 @@ extern "C" {
|
||||
|
||||
LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
|
||||
|
||||
+ // Apply a LoRA adapter to a loaded model
|
||||
+ // path_base_model is the path to a higher quality model to use as a base for
|
||||
+ // the layers modified by the adapter. Can be NULL to use the current loaded model.
|
||||
+ // The model needs to be reloaded before applying a new adapter, otherwise the adapter
|
||||
+ // will be applied on top of the previous one
|
||||
+ // Returns 0 on success
|
||||
+ LLAMA_API int32_t llama_model_apply_lora_from_file(
|
||||
+ const struct llama_model * model,
|
||||
+ const char * path_lora,
|
||||
+ float scale,
|
||||
+ const char * path_base_model,
|
||||
+ int32_t n_threads);
|
||||
+
|
||||
+
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
diff --git a/src/llama.cpp b/src/llama.cpp
|
||||
index 80a0dd0f..9d7b0e17 100644
|
||||
--- a/src/llama.cpp
|
||||
+++ b/src/llama.cpp
|
||||
@@ -21880,3 +21880,290 @@ static void llama_log_callback_default(ggml_log_level level, const char * text,
|
||||
fputs(text, stderr);
|
||||
fflush(stderr);
|
||||
}
|
||||
+
|
||||
+static int llama_apply_lora_from_file_internal(
|
||||
+ const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
|
||||
+) {
|
||||
+ LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
|
||||
+
|
||||
+ const int64_t t_start_lora_us = ggml_time_us();
|
||||
+
|
||||
+ llama_file fin(path_lora, "rb");
|
||||
+
|
||||
+ // verify magic and version
|
||||
+ {
|
||||
+ uint32_t magic = fin.read_u32();
|
||||
+ if (magic != LLAMA_FILE_MAGIC_GGLA) {
|
||||
+ LLAMA_LOG_ERROR("%s: bad file magic\n", __func__);
|
||||
+ return 1;
|
||||
+ }
|
||||
+
|
||||
+ uint32_t format_version = fin.read_u32();
|
||||
+ if (format_version != 1) {
|
||||
+ LLAMA_LOG_ERROR("%s: unsupported file version\n", __func__ );
|
||||
+ return 1;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ int32_t lora_r = fin.read_u32();
|
||||
+ int32_t lora_alpha = fin.read_u32();
|
||||
+ float scaling = scale * (float)lora_alpha / (float)lora_r;
|
||||
+
|
||||
+ LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
|
||||
+
|
||||
+ // load base model
|
||||
+ std::unique_ptr<llama_model_loader> ml;
|
||||
+ if (path_base_model) {
|
||||
+ LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
|
||||
+ ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
|
||||
+ ml->init_mappings(/*prefetch*/ false); // no prefetching
|
||||
+ }
|
||||
+
|
||||
+ struct tensor_meta {
|
||||
+ std::string name;
|
||||
+ ggml_type type;
|
||||
+ int32_t ne[2];
|
||||
+ size_t offset;
|
||||
+ };
|
||||
+ std::map<std::string, tensor_meta> tensor_meta_map;
|
||||
+
|
||||
+ // load all tensor meta
|
||||
+ while (true) {
|
||||
+ if (fin.tell() == fin.size) {
|
||||
+ // eof
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ int32_t n_dims;
|
||||
+ int32_t name_len;
|
||||
+ int32_t ftype;
|
||||
+
|
||||
+ fin.read_raw(&n_dims, sizeof(n_dims));
|
||||
+ fin.read_raw(&name_len, sizeof(name_len));
|
||||
+ fin.read_raw(&ftype, sizeof(ftype));
|
||||
+
|
||||
+ if (n_dims != 1 && n_dims != 2) {
|
||||
+ LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
|
||||
+ return 1;
|
||||
+ }
|
||||
+
|
||||
+ int32_t ne[2] = { 1, 1 };
|
||||
+ for (int i = 0; i < n_dims; ++i) {
|
||||
+ fin.read_raw(&ne[i], sizeof(ne[i]));
|
||||
+ }
|
||||
+
|
||||
+ std::string name;
|
||||
+ {
|
||||
+ GGML_ASSERT(name_len < GGML_MAX_NAME);
|
||||
+ char buf[GGML_MAX_NAME];
|
||||
+ fin.read_raw(buf, name_len);
|
||||
+ name = std::string(buf, name_len);
|
||||
+ }
|
||||
+
|
||||
+ // check for lora suffix
|
||||
+ std::string lora_suffix;
|
||||
+ if (name.length() > 6) {
|
||||
+ lora_suffix = name.substr(name.length() - 6);
|
||||
+ }
|
||||
+ if (lora_suffix != ".loraA" && lora_suffix != ".loraB") {
|
||||
+ LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
|
||||
+ return 1;
|
||||
+ }
|
||||
+
|
||||
+ // tensor type
|
||||
+ ggml_type wtype;
|
||||
+ switch (ftype) {
|
||||
+ case 0: wtype = GGML_TYPE_F32; break;
|
||||
+ case 1: wtype = GGML_TYPE_F16; break;
|
||||
+ default:
|
||||
+ {
|
||||
+ LLAMA_LOG_ERROR("%s: invalid tensor data type '%d'\n",
|
||||
+ __func__, ftype);
|
||||
+ return 1;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ // data offset
|
||||
+ size_t offset = fin.tell();
|
||||
+ offset = (offset + 31) & -32;
|
||||
+
|
||||
+ // skip tensor data
|
||||
+ fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET);
|
||||
+
|
||||
+ tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset });
|
||||
+ }
|
||||
+
|
||||
+ bool warned = false;
|
||||
+ int n_tensors = 0;
|
||||
+
|
||||
+ // apply
|
||||
+ ggml_backend_t backend_cpu = ggml_backend_cpu_init();
|
||||
+ if (backend_cpu == nullptr) {
|
||||
+ LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__);
|
||||
+ return 1;
|
||||
+ }
|
||||
+ ggml_backend_cpu_set_n_threads(backend_cpu, n_threads);
|
||||
+
|
||||
+ std::vector<no_init<uint8_t>> read_buf;
|
||||
+ for (const auto & it : model.tensors_by_name) {
|
||||
+ const std::string & base_name = it.first;
|
||||
+ ggml_tensor * model_t = it.second;
|
||||
+
|
||||
+ if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() ||
|
||||
+ tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) {
|
||||
+ continue;
|
||||
+ }
|
||||
+
|
||||
+ tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA");
|
||||
+ tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB");
|
||||
+
|
||||
+ ggml_init_params lora_init_params = {
|
||||
+ /* .mem_size */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
|
||||
+ /* .mem_buffer */ nullptr,
|
||||
+ /* .no_alloc */ true,
|
||||
+ };
|
||||
+ ggml_context * lora_ctx = ggml_init(lora_init_params);
|
||||
+ if (lora_ctx == nullptr) {
|
||||
+ LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__);
|
||||
+ ggml_backend_free(backend_cpu);
|
||||
+ return 1;
|
||||
+ }
|
||||
+
|
||||
+ // create tensors
|
||||
+ ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]);
|
||||
+ ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]);
|
||||
+ ggml_set_name(loraA, metaA.name.c_str());
|
||||
+ ggml_set_name(loraB, metaB.name.c_str());
|
||||
+
|
||||
+ ggml_tensor * base_t;
|
||||
+ if (ml) {
|
||||
+ if (!ml->get_tensor_meta(base_name.c_str())) {
|
||||
+ LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
|
||||
+ return 1;
|
||||
+ }
|
||||
+ base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str()));
|
||||
+ } else {
|
||||
+ base_t = ggml_dup_tensor(lora_ctx, model_t);
|
||||
+ }
|
||||
+ ggml_set_name(base_t, base_name.c_str());
|
||||
+
|
||||
+ // allocate in backend buffer
|
||||
+ ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
|
||||
+ if (lora_buf == nullptr) {
|
||||
+ LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__);
|
||||
+ return 1;
|
||||
+ }
|
||||
+
|
||||
+ // load tensor data
|
||||
+ auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) {
|
||||
+ read_buf.resize(ggml_nbytes(tensor));
|
||||
+ fin.seek(tensor_meta.offset, SEEK_SET);
|
||||
+ fin.read_raw(read_buf.data(), ggml_nbytes(tensor));
|
||||
+ ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size());
|
||||
+ };
|
||||
+ load_tensor(metaA, loraA);
|
||||
+ load_tensor(metaB, loraB);
|
||||
+
|
||||
+ // load base model tensor data
|
||||
+ if (ml) {
|
||||
+ ml->load_data_for(base_t);
|
||||
+ } else {
|
||||
+ ggml_backend_tensor_copy(model_t, base_t);
|
||||
+ }
|
||||
+
|
||||
+ if (ggml_is_quantized(base_t->type) && !warned) {
|
||||
+ LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
|
||||
+ "use a f16 or f32 base model with --lora-base\n", __func__);
|
||||
+ warned = true;
|
||||
+ }
|
||||
+
|
||||
+ if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
|
||||
+ LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
|
||||
+ " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
|
||||
+ ggml_free(lora_ctx);
|
||||
+ ggml_backend_buffer_free(lora_buf);
|
||||
+ ggml_backend_free(backend_cpu);
|
||||
+ return 1;
|
||||
+ }
|
||||
+
|
||||
+ auto build_lora_graph = [&]() {
|
||||
+ // w = w + BA*s
|
||||
+ ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
|
||||
+ ggml_set_name(BA, "BA");
|
||||
+
|
||||
+ if (scaling != 1.0f) {
|
||||
+ BA = ggml_scale(lora_ctx, BA, scaling);
|
||||
+ ggml_set_name(BA, "BA_scaled");
|
||||
+ }
|
||||
+
|
||||
+ ggml_tensor * r;
|
||||
+ r = ggml_add_inplace(lora_ctx, base_t, BA);
|
||||
+ ggml_set_name(r, "r_add");
|
||||
+
|
||||
+ if (base_t->type != model_t->type) {
|
||||
+ // convert the result to the model type
|
||||
+ r = ggml_cast(lora_ctx, r, model_t->type);
|
||||
+ ggml_set_name(r, "r_cast");
|
||||
+ }
|
||||
+
|
||||
+ return r;
|
||||
+ };
|
||||
+
|
||||
+ ggml_cgraph * gf = ggml_new_graph(lora_ctx);
|
||||
+ ggml_tensor * r = build_lora_graph();
|
||||
+ ggml_build_forward_expand(gf, r);
|
||||
+
|
||||
+ ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
|
||||
+ if (graph_buf == nullptr) {
|
||||
+ LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__);
|
||||
+ ggml_free(lora_ctx);
|
||||
+ ggml_backend_buffer_free(lora_buf);
|
||||
+ ggml_backend_free(backend_cpu);
|
||||
+ return 1;
|
||||
+ }
|
||||
+
|
||||
+ ggml_backend_graph_compute(backend_cpu, gf);
|
||||
+
|
||||
+ ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r));
|
||||
+
|
||||
+#if 0
|
||||
+ // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU
|
||||
+ //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE);
|
||||
+
|
||||
+ // sched compute
|
||||
+ ggml_build_forward_expand(gf, build_graph());
|
||||
+ ggml_backend_sched_init_measure(sched, gf);
|
||||
+
|
||||
+ // create the graph again, since the previous one was destroyed by the measure
|
||||
+ ggml_graph_clear(gf);
|
||||
+ ggml_build_forward_expand(gf, build_graph());
|
||||
+ ggml_backend_sched_graph_compute(sched, gf);
|
||||
+ ggml_backend_sched_free(sched);
|
||||
+#endif
|
||||
+
|
||||
+ ggml_backend_buffer_free(lora_buf);
|
||||
+ ggml_backend_buffer_free(graph_buf);
|
||||
+ ggml_free(lora_ctx);
|
||||
+
|
||||
+ n_tensors++;
|
||||
+ if (n_tensors % 4 == 0) {
|
||||
+ LLAMA_LOG_INFO(".");
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ ggml_backend_free(backend_cpu);
|
||||
+
|
||||
+ const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
|
||||
+ LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
|
||||
+ try {
|
||||
+ return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
|
||||
+ } catch (const std::exception & err) {
|
||||
+ LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
|
||||
+ return 1;
|
||||
+ }
|
||||
+}
|
||||
\ No newline at end of file
|
||||
@@ -1,43 +0,0 @@
|
||||
diff --git a/include/llama.h b/include/llama.h
|
||||
index bb4b05ba..a92174e0 100644
|
||||
--- a/include/llama.h
|
||||
+++ b/include/llama.h
|
||||
@@ -92,6 +92,7 @@ extern "C" {
|
||||
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
|
||||
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
|
||||
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
|
||||
+ LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
|
||||
};
|
||||
|
||||
// note: these values should be synchronized with ggml_rope
|
||||
diff --git a/src/llama.cpp b/src/llama.cpp
|
||||
index 18364976..435b6fe5 100644
|
||||
--- a/src/llama.cpp
|
||||
+++ b/src/llama.cpp
|
||||
@@ -5429,6 +5429,12 @@ static void llm_load_vocab(
|
||||
} else if (
|
||||
tokenizer_pre == "jais") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
|
||||
+ } else if (
|
||||
+ tokenizer_pre == "tekken") {
|
||||
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
|
||||
+ vocab.tokenizer_clean_spaces = false;
|
||||
+ vocab.tokenizer_ignore_merges = true;
|
||||
+ vocab.tokenizer_add_bos = true;
|
||||
} else {
|
||||
LLAMA_LOG_WARN("%s: missing or unrecognized pre-tokenizer type, using: 'default'\n", __func__);
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
||||
@@ -15448,6 +15454,13 @@ struct llm_tokenizer_bpe {
|
||||
" ?[^(\\s|.,!?…。,、।۔،)]+",
|
||||
};
|
||||
break;
|
||||
+ case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
|
||||
+ // original regex from tokenizer.json
|
||||
+ // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
|
||||
+ regex_exprs = {
|
||||
+ "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
+ };
|
||||
+ break;
|
||||
default:
|
||||
// default regex for BPE tokenization pre-processing
|
||||
regex_exprs = {
|
||||
@@ -1,19 +0,0 @@
|
||||
diff --git a/src/llama.cpp b/src/llama.cpp
|
||||
index 2b9ace28..e60d3d8d 100644
|
||||
--- a/src/llama.cpp
|
||||
+++ b/src/llama.cpp
|
||||
@@ -6052,10 +6052,10 @@ static bool llm_load_tensors(
|
||||
|
||||
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||
|
||||
- layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
||||
- layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
||||
- layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
||||
- layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
||||
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head});
|
||||
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
|
||||
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
|
||||
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
|
||||
|
||||
// optional bias tensors
|
||||
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
||||
@@ -417,7 +417,17 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
||||
|
||||
// reap subprocess when it exits
|
||||
go func() {
|
||||
s.done <- s.cmd.Wait()
|
||||
err := s.cmd.Wait()
|
||||
// Favor a more detailed message over the process exit status
|
||||
if err != nil && s.status != nil && s.status.LastErrMsg != "" {
|
||||
slog.Debug("llama runner terminated", "error", err)
|
||||
if strings.Contains(s.status.LastErrMsg, "unknown model") {
|
||||
s.status.LastErrMsg = "this model is not supported by your version of Ollama. You may need to upgrade"
|
||||
}
|
||||
s.done <- fmt.Errorf(s.status.LastErrMsg)
|
||||
} else {
|
||||
s.done <- err
|
||||
}
|
||||
}()
|
||||
|
||||
return s, nil
|
||||
@@ -580,14 +590,7 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
|
||||
slog.Warn("client connection closed before server finished loading, aborting load")
|
||||
return fmt.Errorf("timed out waiting for llama runner to start: %w", ctx.Err())
|
||||
case err := <-s.done:
|
||||
msg := ""
|
||||
if s.status != nil && s.status.LastErrMsg != "" {
|
||||
msg = s.status.LastErrMsg
|
||||
}
|
||||
if strings.Contains(msg, "unknown model") {
|
||||
return fmt.Errorf("this model is not supported by your version of Ollama. You may need to upgrade")
|
||||
}
|
||||
return fmt.Errorf("llama runner process has terminated: %v %s", err, msg)
|
||||
return fmt.Errorf("llama runner process has terminated: %w", err)
|
||||
default:
|
||||
}
|
||||
if time.Now().After(stallTimer) {
|
||||
|
||||
@@ -344,6 +344,10 @@ func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) {
|
||||
}
|
||||
}
|
||||
|
||||
if name == "" || arguments == "" {
|
||||
return nil, false
|
||||
}
|
||||
|
||||
var objs []map[string]any
|
||||
for offset := 0; offset < len(s); {
|
||||
var obj map[string]any
|
||||
@@ -361,23 +365,40 @@ func (m *Model) parseToolCalls(s string) ([]api.ToolCall, bool) {
|
||||
return nil, false
|
||||
} else {
|
||||
offset += int(decoder.InputOffset())
|
||||
objs = append(objs, obj)
|
||||
|
||||
// collect all nested objects
|
||||
var collect func(any) []map[string]any
|
||||
collect = func(obj any) (all []map[string]any) {
|
||||
switch o := obj.(type) {
|
||||
case map[string]any:
|
||||
all = append(all, o)
|
||||
for _, v := range o {
|
||||
all = append(all, collect(v)...)
|
||||
}
|
||||
case []any:
|
||||
for _, v := range o {
|
||||
all = append(all, collect(v)...)
|
||||
}
|
||||
}
|
||||
|
||||
return all
|
||||
}
|
||||
objs = append(objs, collect(obj)...)
|
||||
}
|
||||
}
|
||||
|
||||
var toolCalls []api.ToolCall
|
||||
for _, kv := range objs {
|
||||
var call api.ToolCall
|
||||
for k, v := range kv {
|
||||
switch k {
|
||||
case name:
|
||||
call.Function.Name = v.(string)
|
||||
case arguments:
|
||||
call.Function.Arguments = v.(map[string]any)
|
||||
}
|
||||
n, nok := kv[name].(string)
|
||||
a, aok := kv[arguments].(map[string]any)
|
||||
if nok && aok {
|
||||
toolCalls = append(toolCalls, api.ToolCall{
|
||||
Function: api.ToolCallFunction{
|
||||
Name: n,
|
||||
Arguments: a,
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
toolCalls = append(toolCalls, call)
|
||||
}
|
||||
|
||||
return toolCalls, len(toolCalls) > 0
|
||||
|
||||
@@ -166,6 +166,7 @@ The temperature in San Francisco, CA is 70°F and in Toronto, Canada is 20°C.`,
|
||||
{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}}
|
||||
{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}
|
||||
</tool_call>`, true},
|
||||
{"xlam", `{"tool_calls": [{"name": "get_current_weather", "arguments": {"format":"fahrenheit","location":"San Francisco, CA"}},{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Toronto, Canada"}}]}`, true},
|
||||
}
|
||||
|
||||
var tools []api.Tool
|
||||
|
||||
@@ -609,12 +609,11 @@ func (s *Server) CreateModelHandler(c *gin.Context) {
|
||||
defer cancel()
|
||||
|
||||
quantization := cmp.Or(r.Quantize, r.Quantization)
|
||||
if err := CreateModel(ctx, name, filepath.Dir(r.Path), strings.ToUpper(quantization), f, fn); err != nil {
|
||||
if errors.Is(err, errBadTemplate) {
|
||||
ch <- gin.H{"error": err.Error(), "status": http.StatusBadRequest}
|
||||
}
|
||||
if err := CreateModel(ctx, name, filepath.Dir(r.Path), strings.ToUpper(quantization), f, fn); errors.Is(err, errBadTemplate) {
|
||||
ch <- gin.H{"error": err.Error(), "status": http.StatusBadRequest}
|
||||
} else if err != nil {
|
||||
ch <- gin.H{"error": err.Error()}
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
if r.Stream != nil && !*r.Stream {
|
||||
|
||||
45
server/testdata/tools/xlam.gotmpl
vendored
Normal file
45
server/testdata/tools/xlam.gotmpl
vendored
Normal file
@@ -0,0 +1,45 @@
|
||||
{{- if .System }}{{ .System }}
|
||||
{{ end }}
|
||||
{{- range $i, $_ := .Messages }}
|
||||
{{- if eq .Role "user" }}### Instruction:
|
||||
{{- if and $.Tools (le (len (slice $.Messages $i)) 2) }}
|
||||
[BEGIN OF TASK INSTRUCTION]
|
||||
You are an expert in composing functions. You are given a question and a set of possible functions.
|
||||
Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
|
||||
If none of the functions can be used, point it out and refuse to answer.
|
||||
If the given question lacks the parameters required by the function, also point it out.
|
||||
[END OF TASK INSTRUCTION]
|
||||
|
||||
[BEGIN OF AVAILABLE TOOLS]
|
||||
{{ $.Tools }}
|
||||
[END OF AVAILABLE TOOLS]
|
||||
|
||||
[BEGIN OF FORMAT INSTRUCTION]
|
||||
The output MUST strictly adhere to the following JSON format, and NO other text MUST be included.
|
||||
The example format is as follows. Please make sure the parameter type is correct. If no function call is needed, please make tool_calls an empty list '[]'.
|
||||
```
|
||||
{
|
||||
"tool_calls": [
|
||||
{"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}},
|
||||
... (more tool calls as required)
|
||||
]
|
||||
}
|
||||
```
|
||||
[END OF FORMAT INSTRUCTION]
|
||||
|
||||
[BEGIN OF QUERY]
|
||||
{{ .Content }}
|
||||
[END OF QUERY]
|
||||
|
||||
|
||||
{{ else }}
|
||||
{{ .Content }}
|
||||
{{ end }}
|
||||
{{- else if .ToolCalls }}### Response:
|
||||
{"tool_calls": [{{ range .ToolCalls }}{"name": "{{ .Function.Name }}", "arguments": {{ .Function.Arguments }}}{{ end }}]}
|
||||
<|EOT|>
|
||||
{{ else if eq .Role "assistant" }}### Response:
|
||||
{{ .Content }}
|
||||
<|EOT|>
|
||||
{{ end }}
|
||||
{{- end }}### Response:
|
||||
40
server/testdata/tools/xlam.out
vendored
Normal file
40
server/testdata/tools/xlam.out
vendored
Normal file
@@ -0,0 +1,40 @@
|
||||
You are a knowledgable assistant. You can answer questions and perform tasks.
|
||||
### Instruction:
|
||||
What's the weather like today in Paris?
|
||||
### Response:
|
||||
{"tool_calls": [{"name": "get_current_weather", "arguments": {"format":"celsius","location":"Paris, France"}}]}
|
||||
<|EOT|>
|
||||
### Response:
|
||||
The current temperature in Paris, France is 22 degrees Celsius.
|
||||
<|EOT|>
|
||||
### Instruction:
|
||||
[BEGIN OF TASK INSTRUCTION]
|
||||
You are an expert in composing functions. You are given a question and a set of possible functions.
|
||||
Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
|
||||
If none of the functions can be used, point it out and refuse to answer.
|
||||
If the given question lacks the parameters required by the function, also point it out.
|
||||
[END OF TASK INSTRUCTION]
|
||||
|
||||
[BEGIN OF AVAILABLE TOOLS]
|
||||
[{"type":"function","function":{"name":"get_current_weather","description":"Get the current weather","parameters":{"type":"object","required":["location","format"],"properties":{"format":{"type":"string","description":"The temperature unit to use. Infer this from the users location.","enum":["celsius","fahrenheit"]},"location":{"type":"string","description":"The city and state, e.g. San Francisco, CA"}}}}}]
|
||||
[END OF AVAILABLE TOOLS]
|
||||
|
||||
[BEGIN OF FORMAT INSTRUCTION]
|
||||
The output MUST strictly adhere to the following JSON format, and NO other text MUST be included.
|
||||
The example format is as follows. Please make sure the parameter type is correct. If no function call is needed, please make tool_calls an empty list '[]'.
|
||||
```
|
||||
{
|
||||
"tool_calls": [
|
||||
{"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}},
|
||||
... (more tool calls as required)
|
||||
]
|
||||
}
|
||||
```
|
||||
[END OF FORMAT INSTRUCTION]
|
||||
|
||||
[BEGIN OF QUERY]
|
||||
What's the weather like today in San Francisco and Toronto?
|
||||
[END OF QUERY]
|
||||
|
||||
|
||||
### Response:
|
||||
Reference in New Issue
Block a user