mirror of https://github.com/likelovewant/ollama-for-amd.git
synced 2025-12-22 14:53:56 +00:00

Compare commits: v0.1.45-al ... v0.1.48-al (17 commits)

Commits (SHA1):

- 1c648e512e
- 159dcaa93b
- 717f7229eb
- 5f034f5b63
- b910fa9010
- 6d4219083c
- 1ed4f521c4
- de2163dafd
- 2cc7d05012
- b5286d46dc
- d5fd3ae7ea
- 123a722a6f
- 4d311eb731
- 0fc2f9c5f2
- 7ef869f2dc
- cb42e607c5
- 2aa91a937b
@@ -71,8 +71,8 @@ Here are some example models that can be downloaded:
 | Llama 3      | 70B  | 40GB  | `ollama run llama3:70b`  |
 | Phi 3 Mini   | 3.8B | 2.3GB | `ollama run phi3`        |
 | Phi 3 Medium | 14B  | 7.9GB | `ollama run phi3:medium` |
-| Gemma        | 2B   | 1.4GB | `ollama run gemma:2b`    |
-| Gemma        | 7B   | 4.8GB | `ollama run gemma:7b`    |
+| Gemma 2      | 9B   | 5.5GB | `ollama run gemma2`      |
+| Gemma 2      | 27B  | 16GB  | `ollama run gemma2:27b`  |
 | Mistral      | 7B   | 4.1GB | `ollama run mistral`     |
 | Moondream 2  | 1.4B | 829MB | `ollama run moondream`   |
 | Neural Chat  | 7B   | 4.1GB | `ollama run neural-chat` |
cmd/cmd.go (87 changed lines)
@@ -162,9 +162,6 @@ func tempZipFiles(path string) (string, error) {
 	}
 	defer tempfile.Close()
 
-	zipfile := zip.NewWriter(tempfile)
-	defer zipfile.Close()
-
 	detectContentType := func(path string) (string, error) {
 		f, err := os.Open(path)
 		if err != nil {
@@ -233,6 +230,9 @@ func tempZipFiles(path string) (string, error) {
 		files = append(files, tks...)
 	}
 
+	zipfile := zip.NewWriter(tempfile)
+	defer zipfile.Close()
+
 	for _, file := range files {
 		f, err := os.Open(file)
 		if err != nil {
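These two hunks relocate the creation of the zip writer from right after the temp file is opened to just before the collected files are written. A plausible reading, sketched below with illustrative names: deferred closes run last-in-first-out, so the writer's Close (which flushes the zip central directory) still runs before the file's Close, and an early failure while gathering files no longer leaves zip structures in the temp file.

```go
package main

import (
	"archive/zip"
	"fmt"
	"os"
)

// writeZip gathers inputs first and only creates the zip writer once there
// is something to write. Deferred closes run last-in-first-out, so
// zipfile.Close() flushes the central directory before tempfile.Close().
func writeZip(names []string) (string, error) {
	tempfile, err := os.CreateTemp("", "example-*.zip")
	if err != nil {
		return "", err
	}
	defer tempfile.Close()

	// ...collect and validate inputs here; an early return leaves only an
	// empty temp file, not a half-written archive...

	zipfile := zip.NewWriter(tempfile)
	defer zipfile.Close()

	for _, name := range names {
		if _, err := zipfile.Create(name); err != nil {
			return "", err
		}
	}
	return tempfile.Name(), nil
}

func main() {
	name, err := writeZip([]string{"a.txt", "b.txt"})
	fmt.Println(name, err)
}
```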
@@ -287,38 +287,12 @@ func createBlob(cmd *cobra.Command, client *api.Client, path string) (string, er
 }
 
 func RunHandler(cmd *cobra.Command, args []string) error {
-	client, err := api.ClientFromEnvironment()
-	if err != nil {
-		return err
-	}
-
-	name := args[0]
-
-	// check if the model exists on the server
-	show, err := client.Show(cmd.Context(), &api.ShowRequest{Name: name})
-	var statusError api.StatusError
-	switch {
-	case errors.As(err, &statusError) && statusError.StatusCode == http.StatusNotFound:
-		if err := PullHandler(cmd, []string{name}); err != nil {
-			return err
-		}
-
-		show, err = client.Show(cmd.Context(), &api.ShowRequest{Name: name})
-		if err != nil {
-			return err
-		}
-	case err != nil:
-		return err
-	}
-
 	interactive := true
 
 	opts := runOptions{
 		Model:    args[0],
 		WordWrap: os.Getenv("TERM") == "xterm-256color",
 		Options:  map[string]interface{}{},
-		MultiModal:  slices.Contains(show.Details.Families, "clip"),
-		ParentModel: show.Details.ParentModel,
 	}
 
 	format, err := cmd.Flags().GetString("format")
@@ -362,12 +336,39 @@ func RunHandler(cmd *cobra.Command, args []string) error {
 	}
 	opts.WordWrap = !nowrap
 
-	if !interactive {
-		return generate(cmd, opts)
+	// Fill out the rest of the options based on information about the
+	// model.
+	client, err := api.ClientFromEnvironment()
+	if err != nil {
+		return err
 	}
 
+	name := args[0]
+	info, err := func() (*api.ShowResponse, error) {
+		showReq := &api.ShowRequest{Name: name}
+		info, err := client.Show(cmd.Context(), showReq)
+		var se api.StatusError
+		if errors.As(err, &se) && se.StatusCode == http.StatusNotFound {
+			if err := PullHandler(cmd, []string{name}); err != nil {
+				return nil, err
+			}
+			return client.Show(cmd.Context(), &api.ShowRequest{Name: name})
+		}
+		return info, err
+	}()
+	if err != nil {
+		return err
+	}
+
+	opts.MultiModal = slices.Contains(info.Details.Families, "clip")
+	opts.ParentModel = info.Details.ParentModel
+	opts.Messages = append(opts.Messages, info.Messages...)
+
+	if interactive {
 		return generateInteractive(cmd, opts)
 	}
+	return generate(cmd, opts)
+}
 
 func errFromUnknownKey(unknownKeyErr error) error {
 	// find SSH public key in the error message
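The rewritten RunHandler folds the old pull-on-404 branch into a closure around client.Show. The idiom it relies on (try, detect a typed 404 error, pull, retry once) is sketched standalone below; the StatusError, fetch, and ensure names are illustrative stand-ins, not the real API.

```go
package main

import (
	"errors"
	"fmt"
	"net/http"
)

// StatusError mimics an API error type carrying an HTTP status code.
type StatusError struct{ StatusCode int }

func (e StatusError) Error() string { return fmt.Sprintf("status %d", e.StatusCode) }

// fetch fails with a typed 404 when the model is not present locally.
func fetch(name string, present map[string]bool) error {
	if !present[name] {
		return StatusError{StatusCode: http.StatusNotFound}
	}
	return nil
}

// ensure tries once, pulls on a typed 404, then retries exactly once.
func ensure(name string, present map[string]bool) error {
	err := fetch(name, present)
	var se StatusError
	if errors.As(err, &se) && se.StatusCode == http.StatusNotFound {
		present[name] = true // "pull" the model
		return fetch(name, present)
	}
	return err
}

func main() {
	present := map[string]bool{}
	fmt.Println(ensure("llama3", present)) // <nil>: pulled on first miss
}
```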
@@ -623,13 +624,13 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 		return errors.New("only one of '--license', '--modelfile', '--parameters', '--system', or '--template' can be specified")
 	}
 
-	if flagsSet == 1 {
 	req := api.ShowRequest{Name: args[0]}
 	resp, err := client.Show(cmd.Context(), &req)
 	if err != nil {
 		return err
 	}
 
+	if flagsSet == 1 {
 		switch showType {
 		case "license":
 			fmt.Println(resp.License)
@@ -646,12 +647,12 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 		return nil
 	}
 
-	req := api.ShowRequest{Name: args[0]}
-	resp, err := client.Show(cmd.Context(), &req)
-	if err != nil {
-		return err
+	showInfo(resp)
+
+	return nil
 }
 
+func showInfo(resp *api.ShowResponse) {
 	arch := resp.ModelInfo["general.architecture"].(string)
 
 	modelData := [][]string{
@@ -671,11 +672,17 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 		projectorData := [][]string{
 			{"arch", "clip"},
 			{"parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))},
-			{"projector type", resp.ProjectorInfo["clip.projector_type"].(string)},
-			{"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))},
-			{"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))},
 		}
 
+		if projectorType, ok := resp.ProjectorInfo["clip.projector_type"]; ok {
+			projectorData = append(projectorData, []string{"projector type", projectorType.(string)})
+		}
+
+		projectorData = append(projectorData,
+			[]string{"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))},
+			[]string{"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))},
+		)
+
 		mainTableData = append(mainTableData,
 			[]string{"Projector"},
 			[]string{renderSubTable(projectorData, false)},
@@ -704,8 +711,6 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
 	}
 
 	table.Render()
-
-	return nil
 }
 
 func renderSubTable(data [][]string, file bool) string {
@@ -31,41 +31,24 @@ const (
 )
 
 func loadModel(cmd *cobra.Command, opts *runOptions) error {
-	client, err := api.ClientFromEnvironment()
-	if err != nil {
-		return err
-	}
-
 	p := progress.NewProgress(os.Stderr)
 	defer p.StopAndClear()
 
 	spinner := progress.NewSpinner("")
 	p.Add("", spinner)
 
-	showReq := api.ShowRequest{Name: opts.Model}
-	showResp, err := client.Show(cmd.Context(), &showReq)
+	client, err := api.ClientFromEnvironment()
 	if err != nil {
 		return err
 	}
-	opts.MultiModal = slices.Contains(showResp.Details.Families, "clip")
-	opts.ParentModel = showResp.Details.ParentModel
 
-	if len(showResp.Messages) > 0 {
-		opts.Messages = append(opts.Messages, showResp.Messages...)
-	}
-
 	chatReq := &api.ChatRequest{
 		Model:     opts.Model,
-		Messages:  []api.Message{},
+		KeepAlive: opts.KeepAlive,
 	}
 
-	if opts.KeepAlive != nil {
-		chatReq.KeepAlive = opts.KeepAlive
-	}
-
-	err = client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
+	return client.Chat(cmd.Context(), chatReq, func(resp api.ChatResponse) error {
 		p.StopAndClear()
-		if len(opts.Messages) > 0 {
 		for _, msg := range opts.Messages {
 			switch msg.Role {
 			case "user":
@@ -77,19 +60,11 @@ func loadModel(cmd *cobra.Command, opts *runOptions) error {
 				fmt.Println()
 			}
 		}
-		}
 		return nil
 	})
-	if err != nil {
-		return err
-	}
-
-	return nil
 }
 
 func generateInteractive(cmd *cobra.Command, opts runOptions) error {
-	opts.Messages = make([]api.Message, 0)
-
 	err := loadModel(cmd, &opts)
 	if err != nil {
 		return err
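loadModel now sends a single ChatRequest carrying no messages and returns client.Chat directly; in the Ollama API, a chat with no messages loads the model into memory, and KeepAlive controls how long it stays resident. A hedged usage sketch against the public api package (the model name is illustrative):

```go
// Warming a model the way the new loadModel does: an empty chat request
// loads the model, and KeepAlive controls residency. Assumes the
// github.com/ollama/ollama/api package.
package main

import (
	"context"
	"log"
	"time"

	"github.com/ollama/ollama/api"
)

func main() {
	client, err := api.ClientFromEnvironment()
	if err != nil {
		log.Fatal(err)
	}

	req := &api.ChatRequest{
		Model:     "llama3", // illustrative
		KeepAlive: &api.Duration{Duration: 10 * time.Minute},
		// no Messages: the server just loads the model
	}

	err = client.Chat(context.Background(), req, func(resp api.ChatResponse) error {
		return nil // load-only responses carry no content
	})
	if err != nil {
		log.Fatal(err)
	}
}
```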
@@ -429,15 +404,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
 
 			switch args[1] {
 			case "info":
-				fmt.Println("Model details:")
-				if len(resp.Details.Families) > 0 {
-					fmt.Printf("Family %s\n", strings.Join(resp.Details.Families, ", "))
-				} else if resp.Details.Family != "" {
-					fmt.Printf("Family %s\n", resp.Details.Family)
-				}
-				fmt.Printf("Parameter Size %s\n", resp.Details.ParameterSize)
-				fmt.Printf("Quantization Level %s\n", resp.Details.QuantizationLevel)
-				fmt.Println("")
+				showInfo(resp)
 			case "license":
 				if resp.License == "" {
 					fmt.Println("No license was specified for this model.")
@@ -104,7 +104,6 @@ curl http://localhost:11434/v1/chat/completions \
 
 #### Notes
 
-- `finish_reason` will always be `stop`
 - `usage.prompt_tokens` will be 0 for completions where prompt evaluation is cached
 
 ## Models
@@ -93,7 +93,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 		//}
 		if gfxOverride == "" {
 			if !slices.Contains[[]string, string](supported, gfx) {
-				slog.Warn("amdgpu is not supported", "gpu", i, "gpu_type", gfx, "library", libDir, "supported_types", supported)
+				//slog.Warn("amdgpu is not supported", "gpu", i, "gpu_type", gfx, "library", libDir, "supported_types", supported)
 				// TODO - consider discrete markdown just for ROCM troubleshooting?
 				slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
 				continue
@@ -109,10 +109,10 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 		}
 
 		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
-		if totalMemory < IGPUMemLimit {
-			slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", i, "total", format.HumanBytes2(totalMemory))
-			continue
-		}
+		//if totalMemory < IGPUMemLimit {
+		//	slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", i, "total", format.HumanBytes2(totalMemory))
+		//	continue
+		//}
 
 		// TODO revisit this once ROCm v6 is available on windows.
 		// v5.7 only reports VRAM used by this process, so it's completely wrong and unusable
llm/ext_server/server.cpp (vendored, 40 changed lines)
@@ -1650,26 +1650,41 @@ struct llama_server_context
             }
             slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
 
+            char buf[256];
+            llama_model_meta_val_str(model, "general.architecture", buf, 256);
+            bool gemma2 = strcmp(buf, "gemma2") == 0;
+
+            int32_t truncate_at = slot.n_ctx;
+
+            // truncate at 2/3 of the context length for gemma2 models
+            // as they do not support context shifts (from the sliding window implementation).
+            // this way, prompts that almost fit the context length can still generate a full
+            // response without a sudden stop from hitting the context limit
+            if (gemma2) {
+                truncate_at = 2 * slot.n_ctx / 3;
+            }
+
             // if input prompt is too big, truncate it, if group attention self-extend is disabled
-            if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx)
+            if (slot.ga_n == 1 && slot.n_prompt_tokens >= truncate_at)
             {
                 const int n_left = slot.n_ctx - slot.params.n_keep;
-                const int n_block_size = n_left / 2;
-                const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
+                const int n_shift = n_left / 2;
+                const int n_erase = slot.n_prompt_tokens - slot.params.n_keep - n_shift;
 
                 std::vector<llama_token> new_tokens(
                     prompt_tokens.begin(),
                     prompt_tokens.begin() + slot.params.n_keep);
                 new_tokens.insert(
                     new_tokens.end(),
-                    prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
+                    prompt_tokens.begin() + slot.params.n_keep + n_erase,
                     prompt_tokens.end());
 
-                LOG_VERBOSE("input truncated", {
+                LOG_INFO("input truncated", {
                     {"n_ctx", slot.n_ctx},
                     {"n_keep", slot.params.n_keep},
                     {"n_left", n_left},
-                    {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
+                    {"n_shift", n_shift},
+                    {"n_erase", n_erase},
                 });
                 slot.truncated = true;
                 prompt_tokens = new_tokens;
@@ -1678,6 +1693,19 @@ struct llama_server_context
                 GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
             }
 
+            // Models with sliding window attention do not work with context shifts, so
+            // limit their prediction to the context length
+            if (gemma2) {
+                int32_t limit = slot.n_ctx - slot.n_prompt_tokens;
+                slot.n_predict = limit;
+                slot.params.n_predict = limit;
+                LOG_INFO("model does not support sliding window, limiting generation", {
+                    {"n_ctx", slot.n_ctx},
+                    {"n_prompt_tokens", slot.n_prompt_tokens},
+                    {"n_predict", slot.n_predict}
+                });
+            }
+
             if (!slot.params.cache_prompt)
             {
                 llama_sampling_reset(slot.ctx_sampling);
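The truncation hunk replaces block-based erasure (erased_blocks * n_block_size) with a single n_erase run, and for gemma2 starts truncating at 2/3 of the context because sliding-window models cannot context-shift. A worked example of the old and new computations, with made-up sizes:

```go
// Worked example (illustrative numbers) of the truncation change above.
package main

import "fmt"

func main() {
	nCtx, nKeep, nPrompt := 8192, 4, 9000

	// gemma2 starts truncating well before the prompt fills the context
	truncateAt := 2 * nCtx / 3 // 5461

	nLeft := nCtx - nKeep
	nShift := nLeft / 2 // 4094

	// old: erase whole blocks of size n_left/2
	erasedBlocks := (nPrompt - nKeep - nShift) / nShift
	oldKept := nKeep + (nPrompt - nKeep - erasedBlocks*nShift) // 4906

	// new: erase one run of n_erase tokens right after the keep prefix,
	// always leaving exactly nKeep + nShift tokens
	nErase := nPrompt - nKeep - nShift
	newKept := nKeep + (nPrompt - nKeep - nErase) // 4098

	fmt.Println(truncateAt, oldKept, newKept)
}
```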
@@ -12,7 +12,7 @@ function amdGPUs {
         "gfx900"
         "gfx902"
         "gfx904"
-        "gfx90c"
+        "gfx90c:xnack-"
         "gfx906:xnack-"
         "gfx908:xnack-"
         "gfx90a:xnack+"
@@ -22,7 +22,7 @@ function amdGPUs {
         "gfx942"
         "gfx1010"
         "gfx1011"
-        "gfx1012"
+        "gfx1012:xnack-"
         "gfx1030"
         "gfx1031"
         "gfx1032"
llm/ggla.go (13 changed lines)
@@ -53,7 +53,7 @@ func (llm *ggla) Tensors() Tensors {
 	return llm.tensors
 }
 
-func (llm *ggla) decode(rs io.ReadSeeker) error {
+func (llm *ggla) decode(rs io.ReadSeeker) (retErr error) {
 	var r uint32
 	if err := binary.Read(rs, binary.LittleEndian, &r); err != nil {
 		return err
@@ -69,9 +69,18 @@ func (llm *ggla) decode(rs io.ReadSeeker) error {
 	for {
 		var dims uint32
 		if err := binary.Read(rs, binary.LittleEndian, &dims); err != nil {
+			if errors.Is(err, io.EOF) {
+				return nil
+			}
 			return err
 		}
 
+		defer func() {
+			if errors.Is(retErr, io.EOF) {
+				retErr = io.ErrUnexpectedEOF
+			}
+		}()
+
 		var namesize uint32
 		if err := binary.Read(rs, binary.LittleEndian, &namesize); err != nil {
 			return err
@@ -108,7 +117,7 @@ func (llm *ggla) decode(rs io.ReadSeeker) error {
 			return err
 		}
 
-		if _, err := rs.Seek((offset+31)&-32, io.SeekStart); err != nil {
+		if _, err := rs.Seek((offset+31)&-32-offset, io.SeekCurrent); err != nil {
 			return err
 		}
 
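Here (offset+31) & -32 rounds an offset up to the next 32-byte boundary, and subtracting offset turns the old absolute Seek into an equivalent relative one, presumably so the decoder keeps working behind the buffered seeker introduced in llm/ggml.go below. The arithmetic in isolation:

```go
// (offset+31) & -32 rounds offset up to a multiple of 32, so seeking by
// ((offset+31)&-32)-offset relative to the current position lands on the
// same boundary as the old absolute seek.
package main

import "fmt"

func alignUp32(offset int64) int64 { return (offset + 31) & -32 }

func main() {
	for _, off := range []int64{0, 1, 31, 32, 33, 100} {
		fmt.Printf("offset=%3d  aligned=%3d  relative skip=%2d\n",
			off, alignUp32(off), alignUp32(off)-off)
	}
}
```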
llm/ggml.go (40 changed lines)
@@ -6,6 +6,8 @@ import (
 	"fmt"
 	"io"
 	"strings"
+
+	"github.com/ollama/ollama/util/bufioutil"
 )
 
 type GGML struct {
@@ -278,7 +280,18 @@ func DetectGGMLType(b []byte) string {
 	}
 }
 
-func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
+// DecodeGGML decodes a GGML model from the given reader.
+//
+// It collects array values for arrays with a size less than or equal to
+// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
+// the maxArraySize is negative, all arrays are collected.
+func DecodeGGML(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
+	if maxArraySize == 0 {
+		maxArraySize = 1024
+	}
+
+	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)
+
 	var magic uint32
 	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
 		return nil, 0, err
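A sketch of how call sites use the new signature, following the doc comment above (the function is assumed to sit alongside the llm package code; path and error handling are illustrative):

```go
package llm

import "os"

// loadMetadata is an illustrative caller of the new DecodeGGML signature.
func loadMetadata(path string) (*GGML, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	// 0 selects the default cap of 1024 array entries; pass a negative
	// value (e.g. -1) to collect every array, such as the full tokenizer
	// vocabulary.
	ggml, _, err := DecodeGGML(f, 0)
	return ggml, err
}
```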
@@ -291,17 +304,15 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
 	case FILE_MAGIC_GGLA:
 		c = &containerGGLA{}
 	case FILE_MAGIC_GGUF_LE:
-		c = &containerGGUF{ByteOrder: binary.LittleEndian}
+		c = &containerGGUF{ByteOrder: binary.LittleEndian, maxArraySize: maxArraySize}
 	case FILE_MAGIC_GGUF_BE:
-		c = &containerGGUF{ByteOrder: binary.BigEndian}
+		c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
 	default:
 		return nil, 0, errors.New("invalid file magic")
 	}
 
 	model, err := c.Decode(rs)
-	if errors.Is(err, io.EOF) {
-		// noop
-	} else if err != nil {
+	if err != nil {
 		return nil, 0, err
 	}
 
@@ -321,7 +332,7 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 	embedding := llm.KV().EmbeddingLength()
 	heads := llm.KV().HeadCount()
 	headsKV := llm.KV().HeadCountKV()
-	vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any)))
+	vocab := uint64(llm.KV()["tokenizer.ggml.tokens"].(*array).size)
 
 	embeddingHeads := llm.KV().EmbeddingHeadCount()
 	embeddingHeadsK := llm.KV().EmbeddingHeadCountK()
@@ -355,9 +366,18 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
 			4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
 		)
 	}
-	case "gemma":
-		fullOffload = 4 * batch * (embedding + vocab)
-		partialOffload = 4*batch*(2*embedding+vocab+1) + embedding*vocab*105/128
+	case "gemma", "gemma2":
+		fullOffload = max(
+			4*batch*(embedding+vocab),
+			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
+		)
+
+		partialOffload = max(
+			4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
+			4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
+				4*embeddingHeadsK*context*8+
+				embedding*embeddingHeadsK*heads*9/16,
+		)
 	case "command-r":
 		fullOffload = max(
 			4*batch*(embedding+vocab),
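Both estimates are now a max over two candidate terms. Plugging illustrative gemma2-like dimensions into the fullOffload candidates shows how the second, context-dependent term can dominate at long context; the numbers below are made up for the sketch, not taken from a real model:

```go
package main

import "fmt"

func main() {
	var (
		batch           uint64 = 512
		context         uint64 = 8192
		embedding       uint64 = 3584
		heads           uint64 = 16
		embeddingHeadsK uint64 = 256
		vocab           uint64 = 256000
	)

	a := 4 * batch * (embedding + vocab)
	b := 4 * batch * (2 + context + context*heads + 2*embedding + 2*embeddingHeadsK*heads)
	fmt.Println(a, b, max(a, b)) // fullOffload picks the larger term
}
```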
llm/ggml_test.go (new file, 1 line)
@@ -0,0 +1 @@
+package llm
llm/gguf.go (118 changed lines)
@@ -3,11 +3,10 @@ package llm
 import (
 	"bytes"
 	"encoding/binary"
+	"encoding/json"
 	"fmt"
 	"io"
 	"strings"
-
-	"log/slog"
 )
 
 type containerGGUF struct {
@@ -29,6 +28,12 @@ type containerGGUF struct {
 		NumTensor uint64
 		NumKV     uint64
 	}
+
+	maxArraySize int
+}
+
+func (c *containerGGUF) canCollectArray(size int) bool {
+	return c.maxArraySize < 0 || size <= c.maxArraySize
 }
 
 func (c *containerGGUF) Name() string {
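The collection policy in one line: a negative cap collects everything; otherwise only arrays no larger than the cap are materialized. Stated as a tiny standalone check:

```go
package main

import "fmt"

// canCollect mirrors the policy above.
func canCollect(maxArraySize, size int) bool {
	return maxArraySize < 0 || size <= maxArraySize
}

func main() {
	fmt.Println(canCollect(-1, 1_000_000)) // true: negative cap keeps everything
	fmt.Println(canCollect(1024, 512))     // true: fits under the cap
	fmt.Println(canCollect(1024, 4096))    // false: size is recorded, values skipped
}
```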
@@ -54,7 +59,6 @@ func (c *containerGGUF) Decode(rs io.ReadSeeker) (model, error) {
 	}
 
 	model := newGGUF(c)
-	slog.Debug(fmt.Sprintf("model = %#v", model))
 	if err := model.Decode(rs); err != nil {
 		return nil, err
 	}
@@ -85,6 +89,8 @@ type gguf struct {
 	tensors []*Tensor
 
 	parameters uint64
+
+	scratch [16 << 10]byte
 }
 
 func newGGUF(container *containerGGUF) *gguf {
@@ -181,34 +187,34 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
 	}
 
 	// decode tensors
-	for i := 0; uint64(i) < llm.numTensor(); i++ {
+	for range llm.numTensor() {
 		name, err := readGGUFString(llm, rs)
 		if err != nil {
-			return err
+			return fmt.Errorf("failed to read tensor name: %w", err)
 		}
 
 		// dims is the number of dimensions in the tensor
 		dims, err := readGGUF[uint32](llm, rs)
 		if err != nil {
-			return err
+			return fmt.Errorf("failed to read tensor dimensions: %w", err)
 		}
 
 		shape := [4]uint64{1, 1, 1, 1}
 		for i := 0; uint32(i) < dims; i++ {
 			shape[i], err = readGGUF[uint64](llm, rs)
 			if err != nil {
-				return err
+				return fmt.Errorf("failed to read tensor shape: %w", err)
 			}
 		}
 
 		kind, err := readGGUF[uint32](llm, rs)
 		if err != nil {
-			return err
+			return fmt.Errorf("failed to read tensor kind: %w", err)
 		}
 
 		offset, err := readGGUF[uint64](llm, rs)
 		if err != nil {
-			return err
+			return fmt.Errorf("failed to read tensor offset: %w", err)
 		}
 
 		tensor := Tensor{
@@ -230,24 +236,19 @@ func (llm *gguf) Decode(rs io.ReadSeeker) error {
 		alignment = 32
 	}
 
+	for _, tensor := range llm.tensors {
 		offset, err := rs.Seek(0, io.SeekCurrent)
 		if err != nil {
-			return err
+			return fmt.Errorf("failed to get current offset: %w", err)
 		}
 
 		padding := llm.padding(offset, int64(alignment))
 		if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
-			return err
+			return fmt.Errorf("failed to seek to init padding: %w", err)
 		}
 
-	for _, tensor := range llm.tensors {
 		if _, err := rs.Seek(int64(tensor.Size()), io.SeekCurrent); err != nil {
-			return err
-		}
-
-		padding := llm.padding(int64(tensor.Size()), int64(alignment))
-		if _, err := rs.Seek(padding, io.SeekCurrent); err != nil {
-			return err
+			return fmt.Errorf("failed to seek to tensor: %w", err)
 		}
 	}
 
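The loop now pads from the current absolute offset before skipping past each tensor, rather than padding by the tensor's size afterwards. The padding helper is assumed to compute the distance from an offset to the next alignment boundary, along these lines:

```go
// Assumed shape of the padding computation the loop above relies on:
// distance from the current offset to the next multiple of align.
package main

import "fmt"

func padding(offset, align int64) int64 {
	return (align - offset%align) % align
}

func main() {
	fmt.Println(padding(0, 32))   // 0: already aligned
	fmt.Println(padding(100, 32)) // 28: next boundary is 128
	fmt.Println(padding(128, 32)) // 0
}
```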
@@ -285,22 +286,48 @@ func readGGUFV1String(llm *gguf, r io.Reader) (string, error) {
 	return b.String(), nil
 }
 
+func discardGGUFString(llm *gguf, r io.Reader) error {
+	buf := llm.scratch[:8]
+	_, err := io.ReadFull(r, buf)
+	if err != nil {
+		return err
+	}
+
+	size := int(llm.ByteOrder.Uint64(buf))
+	for size > 0 {
+		n, err := r.Read(llm.scratch[:min(size, cap(llm.scratch))])
+		if err != nil {
+			return err
+		}
+		size -= n
+	}
+	return nil
+}
+
 func readGGUFString(llm *gguf, r io.Reader) (string, error) {
 	if llm.Version == 1 {
 		return readGGUFV1String(llm, r)
 	}
 
-	var length uint64
-	if err := binary.Read(r, llm.ByteOrder, &length); err != nil {
+	buf := llm.scratch[:8]
+	_, err := io.ReadFull(r, buf)
+	if err != nil {
 		return "", err
 	}
 
-	var b bytes.Buffer
-	if _, err := io.CopyN(&b, r, int64(length)); err != nil {
+	length := int(llm.ByteOrder.Uint64(buf))
+	if length > len(llm.scratch) {
+		buf = make([]byte, length)
+	} else {
+		buf = llm.scratch[:length]
+	}
+	clear(buf)
+
+	_, err = io.ReadFull(r, buf)
+	if err != nil {
 		return "", err
 	}
-
-	return b.String(), nil
+	return string(buf), nil
 }
 
 func writeGGUFString(llm *gguf, w io.Writer, s string) error {
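readGGUFString now reads the 8-byte length prefix and, when it fits, the payload into one reusable scratch buffer instead of going through binary.Read and a bytes.Buffer; discardGGUFString skips strings whose arrays will not be collected at all. The scratch pattern in isolation:

```go
// One fixed buffer serves both the 8-byte length prefix and (when it
// fits) the payload, avoiding a bytes.Buffer allocation per string.
package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"io"
)

var scratch [16 << 10]byte

func readString(r io.Reader, order binary.ByteOrder) (string, error) {
	buf := scratch[:8]
	if _, err := io.ReadFull(r, buf); err != nil {
		return "", err
	}
	length := int(order.Uint64(buf))

	if length > len(scratch) {
		buf = make([]byte, length) // rare: oversized string
	} else {
		buf = scratch[:length]
	}
	if _, err := io.ReadFull(r, buf); err != nil {
		return "", err
	}
	return string(buf), nil
}

func main() {
	var b bytes.Buffer
	binary.Write(&b, binary.LittleEndian, uint64(5))
	b.WriteString("hello")
	fmt.Println(readString(&b, binary.LittleEndian))
}
```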
@@ -316,7 +343,16 @@ func writeGGUFString(llm *gguf, w io.Writer, s string) error {
 	return err
 }
 
-func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
+type array struct {
+	size   int
+	values []any
+}
+
+func (a *array) MarshalJSON() ([]byte, error) {
+	return json.Marshal(a.values)
+}
+
+func readGGUFV1Array(llm *gguf, r io.Reader) (*array, error) {
 	t, err := readGGUF[uint32](llm, r)
 	if err != nil {
 		return nil, err
@@ -327,7 +363,12 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
 		return nil, err
 	}
 
-	for i := 0; uint32(i) < n; i++ {
+	a := &array{size: int(n)}
+	if llm.canCollectArray(int(n)) {
+		a.values = make([]any, 0, int(n))
+	}
+
+	for i := range n {
 		var e any
 		switch t {
 		case ggufTypeUint8:
@@ -361,13 +402,15 @@ func readGGUFV1Array(llm *gguf, r io.Reader) (a []any, err error) {
 			return nil, err
 		}
 
-		a = append(a, e)
+		if a.values != nil {
+			a.values[i] = e
+		}
 	}
 
-	return
+	return a, nil
 }
 
-func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
+func readGGUFArray(llm *gguf, r io.Reader) (*array, error) {
 	if llm.Version == 1 {
 		return readGGUFV1Array(llm, r)
 	}
@@ -382,7 +425,12 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
 		return nil, err
 	}
 
-	for i := 0; uint64(i) < n; i++ {
+	a := &array{size: int(n)}
+	if llm.canCollectArray(int(n)) {
+		a.values = make([]any, int(n))
+	}
+
+	for i := range n {
 		var e any
 		switch t {
 		case ggufTypeUint8:
@@ -408,7 +456,11 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
 		case ggufTypeBool:
 			e, err = readGGUF[bool](llm, r)
 		case ggufTypeString:
+			if a.values != nil {
 				e, err = readGGUFString(llm, r)
+			} else {
+				err = discardGGUFString(llm, r)
+			}
 		default:
 			return nil, fmt.Errorf("invalid array type: %d", t)
 		}
@@ -416,10 +468,12 @@ func readGGUFArray(llm *gguf, r io.Reader) (a []any, err error) {
 			return nil, err
 		}
 
-		a = append(a, e)
+		if a.values != nil {
+			a.values[i] = e
+		}
 	}
 
-	return
+	return a, nil
 }
 
 func writeGGUFArray[S ~[]E, E any](llm *gguf, w io.Writer, t uint32, s S) error {
@@ -22,13 +22,14 @@ func TestEstimateGPULayers(t *testing.T) {
 	defer f.Close()
 	gguf := NewGGUFV3(binary.LittleEndian)
 	inputLayerCount := 5
+
 	tensors := []Tensor{
-		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
-		{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
-		{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
-		{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
-		{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
-		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
+		{Name: "blk.1.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
+		{Name: "blk.2.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
+		{Name: "blk.3.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
+		{Name: "blk.4.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
+		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 	}
 	assert.Len(t, tensors, inputLayerCount+1)
 	err = gguf.Encode(f, KV{
@@ -45,8 +46,10 @@ func TestEstimateGPULayers(t *testing.T) {
 	}, tensors)
 	require.NoError(t, err)
 
-	ggml, err := LoadModel(f.Name())
-	require.NoError(t, err)
+	ggml, err := LoadModel(f.Name(), 0)
+	if err != nil {
+		t.Fatal(err)
+	}
 
 	// Simple CPU scenario
 	gpus := []gpu.GpuInfo{
llm/patches/07-gemma.diff (new file, 305 lines)
@@ -0,0 +1,305 @@
From 5cadb45f39d001ffbad95b690d6cf0abcb4a6d96 Mon Sep 17 00:00:00 2001
From: Ollama maintainers <hello@ollama.com>
Date: Wed, 26 Jun 2024 16:18:09 -0700
Subject: [PATCH] Architecture support

---
 llama.cpp | 194 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 193 insertions(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 61948751..3b4196f5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -217,6 +217,7 @@ enum llm_arch {
     LLM_ARCH_INTERNLM2,
     LLM_ARCH_MINICPM,
     LLM_ARCH_GEMMA,
+    LLM_ARCH_GEMMA2,
     LLM_ARCH_STARCODER2,
     LLM_ARCH_MAMBA,
     LLM_ARCH_XVERSE,
@@ -255,6 +256,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_INTERNLM2, "internlm2" },
     { LLM_ARCH_MINICPM, "minicpm" },
     { LLM_ARCH_GEMMA, "gemma" },
+    { LLM_ARCH_GEMMA2, "gemma2" },
     { LLM_ARCH_STARCODER2, "starcoder2" },
     { LLM_ARCH_MAMBA, "mamba" },
     { LLM_ARCH_XVERSE, "xverse" },
@@ -464,10 +466,12 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_NORM,
     LLM_TENSOR_ATTN_NORM_2,
     LLM_TENSOR_ATTN_OUT_NORM,
+    LLM_TENSOR_ATTN_POST_NORM,
     LLM_TENSOR_ATTN_ROT_EMBD,
     LLM_TENSOR_FFN_GATE_INP,
     LLM_TENSOR_FFN_GATE_INP_SHEXP,
     LLM_TENSOR_FFN_NORM,
+    LLM_TENSOR_FFN_POST_NORM,
     LLM_TENSOR_FFN_GATE,
     LLM_TENSOR_FFN_DOWN,
     LLM_TENSOR_FFN_UP,
@@ -960,6 +964,24 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
         { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_GEMMA2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+        },
+    },
     {
         LLM_ARCH_STARCODER2,
         {
@@ -1941,6 +1963,8 @@ enum e_model {
     MODEL_8x22B,
     MODEL_16x12B,
     MODEL_10B_128x3_66B,
+    MODEL_9B,
+    MODEL_27B,
 };
 
 static const size_t kiB = 1024;
@@ -2114,6 +2138,7 @@ struct llama_layer {
     struct ggml_tensor * attn_out_norm_b;
     struct ggml_tensor * attn_q_a_norm;
     struct ggml_tensor * attn_kv_a_norm;
+    struct ggml_tensor * attn_post_norm;
 
     // attention
     struct ggml_tensor * wq;
@@ -2136,6 +2161,7 @@ struct llama_layer {
     // normalization
     struct ggml_tensor * ffn_norm;
     struct ggml_tensor * ffn_norm_b;
+    struct ggml_tensor * ffn_post_norm;
     struct ggml_tensor * layer_out_norm;
     struct ggml_tensor * layer_out_norm_b;
     struct ggml_tensor * ffn_norm_exps;
@@ -4529,6 +4555,16 @@ static void llm_load_hparams(
             }
         } break;
         case LLM_ARCH_GEMMA:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 18: model.type = e_model::MODEL_9B; break;
+                    case 28: model.type = e_model::MODEL_27B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_GEMMA2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
@@ -6305,6 +6341,40 @@ static bool llm_load_tensors(
                     layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
                 }
             } break;
+        case LLM_ARCH_GEMMA2:
+            {
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                // output
+                model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+
+                const int64_t n_ff = hparams.n_ff;
+                const int64_t n_embd_head_k = hparams.n_embd_head_k;
+                const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+                const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    ggml_context * ctx_layer = ctx_for_layer(i);
+                    ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * hparams.n_head});
+                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa});
+                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa});
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * hparams.n_head, n_embd});
+                    layer.attn_post_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd});
+
+                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                    layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                    layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                    layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+                    layer.ffn_post_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd});
+                }
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -10614,6 +10684,123 @@ struct llm_build_context {
         return gf;
     }
 
+    struct ggml_cgraph * build_gemma2() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head_k = hparams.n_embd_head_k;
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        inpL = ggml_scale(ctx0, inpL, sqrtf(n_embd));
+        cb(inpL, "inp_scaled", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                model.layers[il].attn_norm, NULL,
+                LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
+                    n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Qcur, "Qcur", il);
+
+                Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
+                cb(Qcur, "Qcur_scaled", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                    model.layers[il].wo, NULL,
+                    Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                model.layers[il].attn_post_norm, NULL,
+                LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_post_norm", il);
+
+            struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL);
+            cb(sa_out, "sa_out", il);
+
+            cur = llm_build_norm(ctx0, sa_out, hparams,
+                model.layers[il].ffn_norm, NULL,
+                LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            // feed-forward network
+            {
+                cur = llm_build_ffn(ctx0, cur,
+                    model.layers[il].ffn_up, NULL,
+                    model.layers[il].ffn_gate, NULL,
+                    model.layers[il].ffn_down, NULL,
+                    NULL,
+                    LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                model.layers[il].ffn_post_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+            cb(cur, "ffn_post_norm", -1);
+
+            cur = ggml_add(ctx0, cur, sa_out);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     struct ggml_cgraph * build_starcoder2() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
@@ -11847,6 +12034,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_gemma();
             } break;
+        case LLM_ARCH_GEMMA2:
+            {
+                result = llm.build_gemma2();
+            } break;
         case LLM_ARCH_STARCODER2:
             {
                 result = llm.build_starcoder2();
@@ -16671,6 +16862,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_PHI2:
         case LLM_ARCH_PHI3:
         case LLM_ARCH_GEMMA:
+        case LLM_ARCH_GEMMA2:
         case LLM_ARCH_STARCODER2:
        case LLM_ARCH_GPTNEOX:
             return LLAMA_ROPE_TYPE_NEOX;
@@ -18551,7 +18743,7 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<s>assistant\n";
         }
-    } else if (tmpl == "gemma" || tmpl.find("<start_of_turn>") != std::string::npos) {
+    } else if (tmpl == "gemma" || tmpl == "gemma2" || tmpl.find("<start_of_turn>") != std::string::npos) {
         // google/gemma-7b-it
         std::string system_prompt = "";
         for (auto message : chat) {
-- 
2.45.2
@@ -60,7 +60,12 @@ type llmServer struct {
 	sem *semaphore.Weighted
 }
 
-func LoadModel(model string) (*GGML, error) {
+// LoadModel will load a model from disk. The model must be in the GGML format.
+//
+// It collects array values for arrays with a size less than or equal to
+// maxArraySize. If maxArraySize is 0, the default value of 1024 is used. If
+// the maxArraySize is negative, all arrays are collected.
+func LoadModel(model string, maxArraySize int) (*GGML, error) {
 	if _, err := os.Stat(model); err != nil {
 		return nil, err
 	}
@@ -71,7 +76,7 @@ func LoadModel(model string) (*GGML, error) {
 	}
 	defer f.Close()
 
-	ggml, _, err := DecodeGGML(f)
+	ggml, _, err := DecodeGGML(f, maxArraySize)
 	return ggml, err
 }
 
@@ -412,7 +417,7 @@ func projectorMemoryRequirements(filename string) uint64 {
 	}
 	defer file.Close()
 
-	ggml, _, err := DecodeGGML(file)
+	ggml, _, err := DecodeGGML(file, 0)
 	if err != nil {
 		return 0
 	}
@@ -423,7 +423,7 @@ func CreateModel(ctx context.Context, name model.Name, modelFileDir, quantizatio
 		return err
 	}
 
-	ggml, _, err := llm.DecodeGGML(temp)
+	ggml, _, err := llm.DecodeGGML(temp, 0)
 	if err != nil {
 		return err
 	}
@@ -11,6 +11,7 @@ import (
 	"net/http"
 	"os"
 	"path/filepath"
+	"strings"
 
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/convert"
@@ -63,7 +64,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
 	}
 	defer blob.Close()
 
-	ggml, _, err := llm.DecodeGGML(blob)
+	ggml, _, err := llm.DecodeGGML(blob, 0)
 	if err != nil {
 		return nil, err
 	}
@@ -77,62 +78,80 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
 	return layers, nil
 }

-func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
+func extractFromZipFile(p string, file *os.File, fn func(api.ProgressResponse)) error {
 	stat, err := file.Stat()
 	if err != nil {
-		return nil, err
+		return err
 	}

 	r, err := zip.NewReader(file, stat.Size())
 	if err != nil {
-		return nil, err
+		return err
 	}

-	tempdir, err := os.MkdirTemp(filepath.Dir(file.Name()), "")
-	if err != nil {
-		return nil, err
-	}
-	defer os.RemoveAll(tempdir)

 	fn(api.ProgressResponse{Status: "unpacking model metadata"})
 	for _, f := range r.File {
+		n := filepath.Join(p, f.Name)
+		if !strings.HasPrefix(n, p) {
+			slog.Warn("skipped extracting file outside of context", "name", f.Name)
+			continue
+		}
+
+		if err := os.MkdirAll(filepath.Dir(n), 0o750); err != nil {
+			return err
+		}
+
 		// TODO(mxyng): this should not write out all files to disk
-		outfile, err := os.Create(filepath.Join(tempdir, f.Name))
+		outfile, err := os.Create(n)
 		if err != nil {
-			return nil, err
+			return err
 		}
 		defer outfile.Close()

 		infile, err := f.Open()
 		if err != nil {
-			return nil, err
+			return err
 		}
 		defer infile.Close()

 		if _, err = io.Copy(outfile, infile); err != nil {
-			return nil, err
+			return err
 		}

 		if err := outfile.Close(); err != nil {
-			return nil, err
+			return err
 		}

 		if err := infile.Close(); err != nil {
-			return nil, err
+			return err
 		}
 	}

-	mf, err := convert.GetModelFormat(tempdir)
+	return nil
+}
+
+func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(api.ProgressResponse)) (layers []*layerGGML, err error) {
+	tempDir, err := os.MkdirTemp(filepath.Dir(file.Name()), "")
+	if err != nil {
+		return nil, err
+	}
+	defer os.RemoveAll(tempDir)
+
+	if err := extractFromZipFile(tempDir, file, fn); err != nil {
+		return nil, err
+	}
+
+	mf, err := convert.GetModelFormat(tempDir)
 	if err != nil {
 		return nil, err
 	}

-	params, err := mf.GetParams(tempdir)
+	params, err := mf.GetParams(tempDir)
 	if err != nil {
 		return nil, err
 	}

-	mArch, err := mf.GetModelArch("", tempdir, params)
+	mArch, err := mf.GetModelArch("", tempDir, params)
 	if err != nil {
 		return nil, err
 	}
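The strings.HasPrefix guard in extractFromZipFile is a standard zip-slip defense: an archive entry named ../../evil would otherwise be written outside the extraction root. A self-contained sketch of the same check, with illustrative names not taken from the patch:

	package main

	import (
		"fmt"
		"path/filepath"
		"strings"
	)

	// safeJoin (hypothetical helper) resolves an archive entry name under
	// root and rejects any cleaned path that escapes it.
	func safeJoin(root, name string) (string, error) {
		n := filepath.Join(root, name) // Join cleans ".." segments
		if !strings.HasPrefix(n, root) {
			return "", fmt.Errorf("entry %q escapes %q", name, root)
		}
		return n, nil
	}

	func main() {
		fmt.Println(safeJoin("/tmp/extract", "good.bin"))         // kept
		fmt.Println(safeJoin("/tmp/extract", "../../etc/passwd")) // rejected
	}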
@@ -150,7 +169,7 @@ func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(a

 	// TODO(mxyng): this should write directly into a layer
 	// e.g. NewLayer(arch.Reader(), "application/vnd.ollama.image.model")
-	temp, err := os.CreateTemp(tempdir, "fp16")
+	temp, err := os.CreateTemp(tempDir, "fp16")
 	if err != nil {
 		return nil, err
 	}
@@ -176,7 +195,7 @@ func parseFromZipFile(_ context.Context, file *os.File, digest string, fn func(a
 	}
 	defer bin.Close()

-	ggml, _, err := llm.DecodeGGML(bin)
+	ggml, _, err := llm.DecodeGGML(bin, 0)
 	if err != nil {
 		return nil, err
 	}
@@ -210,7 +229,7 @@ func parseFromFile(ctx context.Context, file *os.File, digest string, fn func(ap

 	var offset int64
 	for offset < stat.Size() {
-		ggml, n, err := llm.DecodeGGML(file)
+		ggml, n, err := llm.DecodeGGML(file, 0)
 		if errors.Is(err, io.EOF) {
 			break
 		} else if err != nil {
92  server/model_test.go  Normal file
@@ -0,0 +1,92 @@
+package server
+
+import (
+	"archive/zip"
+	"bytes"
+	"io"
+	"os"
+	"path/filepath"
+	"slices"
+	"testing"
+
+	"github.com/ollama/ollama/api"
+)
+
+func createZipFile(t *testing.T, name string) *os.File {
+	t.Helper()
+
+	f, err := os.CreateTemp(t.TempDir(), "")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	zf := zip.NewWriter(f)
+	defer zf.Close()
+
+	zh, err := zf.CreateHeader(&zip.FileHeader{Name: name})
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if _, err := io.Copy(zh, bytes.NewReader([]byte(""))); err != nil {
+		t.Fatal(err)
+	}
+
+	return f
+}
+
+func TestExtractFromZipFile(t *testing.T) {
+	cases := []struct {
+		name   string
+		expect []string
+	}{
+		{
+			name:   "good",
+			expect: []string{"good"},
+		},
+		{
+			name: filepath.Join("..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "..", "bad"),
+		},
+	}
+
+	for _, tt := range cases {
+		t.Run(tt.name, func(t *testing.T) {
+			f := createZipFile(t, tt.name)
+			defer f.Close()
+
+			tempDir := t.TempDir()
+			if err := extractFromZipFile(tempDir, f, func(api.ProgressResponse) {}); err != nil {
+				t.Fatal(err)
+			}
+
+			var matches []string
+			if err := filepath.Walk(tempDir, func(p string, fi os.FileInfo, err error) error {
+				if err != nil {
+					return err
+				}
+
+				if !fi.IsDir() {
+					matches = append(matches, p)
+				}
+
+				return nil
+			}); err != nil {
+				t.Fatal(err)
+			}
+
+			var actual []string
+			for _, match := range matches {
+				rel, err := filepath.Rel(tempDir, match)
+				if err != nil {
+					t.Error(err)
+				}
+
+				actual = append(actual, rel)
+			}
+
+			if !slices.Equal(actual, tt.expect) {
+				t.Fatalf("expected %d files, got %d", len(tt.expect), len(matches))
+			}
+		})
+	}
+}
@@ -754,7 +754,11 @@ func GetModelInfo(req api.ShowRequest) (*api.ShowResponse, error) {
 }

 func getKVData(digest string, verbose bool) (llm.KV, error) {
-	kvData, err := llm.LoadModel(digest)
+	maxArraySize := 0
+	if verbose {
+		maxArraySize = -1
+	}
+	kvData, err := llm.LoadModel(digest, maxArraySize)
 	if err != nil {
 		return nil, err
 	}
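A hypothetical pair of call sites for the patched helper, to make the intent of the flag concrete; the mapping of verbose to -1 (unbounded arrays) is taken from this hunk:

	kv, _ := getKVData(digest, false)    // arrays elided (maxArraySize = 0)
	kvFull, _ := getKVData(digest, true) // full arrays, e.g. tokenizer vocab (-1)
	_, _ = kv, kvFull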
@@ -1101,11 +1105,20 @@ func Serve(ln net.Listener) error {
|
|||||||
schedCtx, schedDone := context.WithCancel(ctx)
|
schedCtx, schedDone := context.WithCancel(ctx)
|
||||||
sched := InitScheduler(schedCtx)
|
sched := InitScheduler(schedCtx)
|
||||||
s := &Server{addr: ln.Addr(), sched: sched}
|
s := &Server{addr: ln.Addr(), sched: sched}
|
||||||
r := s.GenerateRoutes()
|
|
||||||
|
http.Handle("/", s.GenerateRoutes())
|
||||||
|
|
||||||
slog.Info(fmt.Sprintf("Listening on %s (version %s)", ln.Addr(), version.Version))
|
slog.Info(fmt.Sprintf("Listening on %s (version %s)", ln.Addr(), version.Version))
|
||||||
srvr := &http.Server{
|
srvr := &http.Server{
|
||||||
Handler: r,
|
// Use http.DefaultServeMux so we get net/http/pprof for
|
||||||
|
// free.
|
||||||
|
//
|
||||||
|
// TODO(bmizerany): Decide if we want to make this
|
||||||
|
// configurable so it is not exposed by default, or allow
|
||||||
|
// users to bind it to a different port. This was a quick
|
||||||
|
// and easy way to get pprof, but it may not be the best
|
||||||
|
// way.
|
||||||
|
Handler: nil,
|
||||||
}
|
}
|
||||||
|
|
||||||
// listen for a ctrl+c and stop any loaded llm
|
// listen for a ctrl+c and stop any loaded llm
|
||||||
|
|||||||
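With Handler nil, net/http falls back to http.DefaultServeMux, where the ollama routes are now registered via http.Handle; anything else on the default mux, notably net/http/pprof's handlers (registered in its init), rides along on the same listener. A sketch of probing that, assuming ollama's usual 127.0.0.1:11434 address, which this diff does not state:

	package main

	import (
		"fmt"
		"log"
		"net/http"
	)

	func main() {
		// Hypothetical probe of the pprof index exposed via DefaultServeMux.
		resp, err := http.Get("http://127.0.0.1:11434/debug/pprof/")
		if err != nil {
			log.Fatal(err)
		}
		defer resp.Body.Close()
		fmt.Println(resp.Status) // expect 200 OK once the server is up
	}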
@@ -144,7 +144,7 @@ func (s *Scheduler) processPending(ctx context.Context) {
 	}

 	// Load model for fitting
-	ggml, err := llm.LoadModel(pending.model.ModelPath)
+	ggml, err := llm.LoadModel(pending.model.ModelPath, 0)
 	if err != nil {
 		pending.errCh <- err
 		break
@@ -128,14 +128,14 @@ func newScenario(t *testing.T, ctx context.Context, modelName string, estimatedV
 		"tokenizer.ggml.scores":     []float32{0},
 		"tokenizer.ggml.token_type": []int32{0},
 	}, []llm.Tensor{
-		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
-		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: &bytes.Reader{}},
+		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
+		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
 	})
 	require.NoError(t, err)

 	fname := f.Name()
 	model := &Model{Name: modelName, ModelPath: fname}
-	scenario.ggml, err = llm.LoadModel(model.ModelPath)
+	scenario.ggml, err = llm.LoadModel(model.ModelPath, 0)
 	require.NoError(t, err)

 	scenario.req = &LlmRequest{
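The test tensors previously wrapped a zero-value bytes.Reader, which yields io.EOF on the first read, so no payload bytes were ever written for them; backing each stub with 32 real bytes presumably keeps the stricter decode path satisfied. The stdlib behavior itself is easy to confirm:

	package main

	import (
		"bytes"
		"fmt"
	)

	func main() {
		var empty bytes.Reader // zero value: an empty source
		n, err := empty.Read(make([]byte, 4))
		fmt.Println(n, err) // 0 EOF: the old stubs contributed no payload

		full := bytes.NewReader(make([]byte, 32))
		n, err = full.Read(make([]byte, 4))
		fmt.Println(n, err) // 4 <nil>: 32 zero bytes now back each tensor
	}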
34  util/bufioutil/buffer_seeker.go  Normal file
@@ -0,0 +1,34 @@
+package bufioutil
+
+import (
+	"bufio"
+	"io"
+)
+
+type BufferedSeeker struct {
+	rs io.ReadSeeker
+	br *bufio.Reader
+}
+
+func NewBufferedSeeker(rs io.ReadSeeker, size int) *BufferedSeeker {
+	return &BufferedSeeker{
+		rs: rs,
+		br: bufio.NewReaderSize(rs, size),
+	}
+}
+
+func (b *BufferedSeeker) Read(p []byte) (int, error) {
+	return b.br.Read(p)
+}
+
+func (b *BufferedSeeker) Seek(offset int64, whence int) (int64, error) {
+	if whence == io.SeekCurrent {
+		offset -= int64(b.br.Buffered())
+	}
+	n, err := b.rs.Seek(offset, whence)
+	if err != nil {
+		return 0, err
+	}
+	b.br.Reset(b.rs)
+	return n, nil
+}
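The subtle part of BufferedSeeker is Seek with io.SeekCurrent: the underlying ReadSeeker sits ahead of the caller by however many bytes bufio has buffered, so the relative offset is pulled back by Buffered() before delegating, and the buffer is reset afterwards. A small usage sketch, assuming the import path matches this tree:

	package main

	import (
		"fmt"
		"io"
		"strings"

		"github.com/ollama/ollama/util/bufioutil"
	)

	func main() {
		bs := bufioutil.NewBufferedSeeker(strings.NewReader("abcdefghij"), 16)

		buf := make([]byte, 3)
		io.ReadFull(bs, buf)
		fmt.Printf("%s\n", buf) // abc (bufio has buffered well past here)

		// Relative seek is measured from what the caller consumed, not
		// from the underlying reader's position.
		bs.Seek(1, io.SeekCurrent) // skip "d"
		io.ReadFull(bs, buf)
		fmt.Printf("%s\n", buf) // efg
	}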
64  util/bufioutil/buffer_seeker_test.go  Normal file
@@ -0,0 +1,64 @@
+package bufioutil
+
+import (
+	"bytes"
+	"io"
+	"strings"
+	"testing"
+)
+
+func TestBufferedSeeker(t *testing.T) {
+	const alphabet = "abcdefghijklmnopqrstuvwxyz"
+
+	bs := NewBufferedSeeker(strings.NewReader(alphabet), 0) // minReadBufferSize = 16
+
+	checkRead := func(buf []byte, expected string) {
+		t.Helper()
+		_, err := bs.Read(buf)
+		if err != nil {
+			t.Fatal(err)
+		}
+		if !bytes.Equal(buf, []byte(expected)) {
+			t.Fatalf("expected %s, got %s", expected, buf)
+		}
+	}
+
+	// Read the first 5 bytes
+	buf := make([]byte, 5)
+
+	checkRead(buf, "abcde")
+
+	// Seek back to the beginning
+	_, err := bs.Seek(0, io.SeekStart)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// read 'a'
+	checkRead(buf[:1], "a")
+
+	if bs.br.Buffered() == 0 {
+		t.Fatalf("totally unexpected sanity check failed")
+	}
+
+	// Seek past 'b'
+	_, err = bs.Seek(1, io.SeekCurrent)
+	if err != nil {
+		t.Fatal(err)
+	}
+	checkRead(buf, "cdefg")
+
+	// Seek back to the beginning
+	_, err = bs.Seek(0, io.SeekStart)
+	if err != nil {
+		t.Fatal(err)
+	}
+	checkRead(buf, "abcde")
+
+	// Seek to the end
+	_, err = bs.Seek(-5, io.SeekEnd)
+	if err != nil {
+		t.Fatal(err)
+	}
+	checkRead(buf, "vwxyz")
+}