mirror of
https://github.com/likelovewant/ollama-for-amd.git
synced 2025-12-22 06:43:57 +00:00
Compare commits
9 Commits
v0.1.47-Al
...
v0.1.48-al
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1c648e512e | ||
|
|
159dcaa93b | ||
|
|
717f7229eb | ||
|
|
5f034f5b63 | ||
|
|
b910fa9010 | ||
|
|
6d4219083c | ||
|
|
1ed4f521c4 | ||
|
|
de2163dafd | ||
|
|
2cc7d05012 |
@@ -71,8 +71,8 @@ Here are some example models that can be downloaded:
|
||||
| Llama 3 | 70B | 40GB | `ollama run llama3:70b` |
|
||||
| Phi 3 Mini | 3.8B | 2.3GB | `ollama run phi3` |
|
||||
| Phi 3 Medium | 14B | 7.9GB | `ollama run phi3:medium` |
|
||||
| Gemma | 2B | 1.4GB | `ollama run gemma:2b` |
|
||||
| Gemma | 7B | 4.8GB | `ollama run gemma:7b` |
|
||||
| Gemma 2 | 9B | 5.5GB | `ollama run gemma2` |
|
||||
| Gemma 2 | 27B | 16GB | `ollama run gemma2:27b` |
|
||||
| Mistral | 7B | 4.1GB | `ollama run mistral` |
|
||||
| Moondream 2 | 1.4B | 829MB | `ollama run moondream` |
|
||||
| Neural Chat | 7B | 4.1GB | `ollama run neural-chat` |
|
||||
|
||||
36
cmd/cmd.go
36
cmd/cmd.go
@@ -624,13 +624,13 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
|
||||
return errors.New("only one of '--license', '--modelfile', '--parameters', '--system', or '--template' can be specified")
|
||||
}
|
||||
|
||||
if flagsSet == 1 {
|
||||
req := api.ShowRequest{Name: args[0]}
|
||||
resp, err := client.Show(cmd.Context(), &req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
req := api.ShowRequest{Name: args[0]}
|
||||
resp, err := client.Show(cmd.Context(), &req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if flagsSet == 1 {
|
||||
switch showType {
|
||||
case "license":
|
||||
fmt.Println(resp.License)
|
||||
@@ -647,12 +647,12 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
req := api.ShowRequest{Name: args[0]}
|
||||
resp, err := client.Show(cmd.Context(), &req)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
showInfo(resp)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func showInfo(resp *api.ShowResponse) {
|
||||
arch := resp.ModelInfo["general.architecture"].(string)
|
||||
|
||||
modelData := [][]string{
|
||||
@@ -672,11 +672,17 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
|
||||
projectorData := [][]string{
|
||||
{"arch", "clip"},
|
||||
{"parameters", format.HumanNumber(uint64(resp.ProjectorInfo["general.parameter_count"].(float64)))},
|
||||
{"projector type", resp.ProjectorInfo["clip.projector_type"].(string)},
|
||||
{"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))},
|
||||
{"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))},
|
||||
}
|
||||
|
||||
if projectorType, ok := resp.ProjectorInfo["clip.projector_type"]; ok {
|
||||
projectorData = append(projectorData, []string{"projector type", projectorType.(string)})
|
||||
}
|
||||
|
||||
projectorData = append(projectorData,
|
||||
[]string{"embedding length", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.embedding_length"].(float64))},
|
||||
[]string{"projection dimensionality", fmt.Sprintf("%v", resp.ProjectorInfo["clip.vision.projection_dim"].(float64))},
|
||||
)
|
||||
|
||||
mainTableData = append(mainTableData,
|
||||
[]string{"Projector"},
|
||||
[]string{renderSubTable(projectorData, false)},
|
||||
@@ -705,8 +711,6 @@ func ShowHandler(cmd *cobra.Command, args []string) error {
|
||||
}
|
||||
|
||||
table.Render()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func renderSubTable(data [][]string, file bool) string {
|
||||
|
||||
@@ -404,15 +404,7 @@ func generateInteractive(cmd *cobra.Command, opts runOptions) error {
|
||||
|
||||
switch args[1] {
|
||||
case "info":
|
||||
fmt.Println("Model details:")
|
||||
if len(resp.Details.Families) > 0 {
|
||||
fmt.Printf("Family %s\n", strings.Join(resp.Details.Families, ", "))
|
||||
} else if resp.Details.Family != "" {
|
||||
fmt.Printf("Family %s\n", resp.Details.Family)
|
||||
}
|
||||
fmt.Printf("Parameter Size %s\n", resp.Details.ParameterSize)
|
||||
fmt.Printf("Quantization Level %s\n", resp.Details.QuantizationLevel)
|
||||
fmt.Println("")
|
||||
showInfo(resp)
|
||||
case "license":
|
||||
if resp.License == "" {
|
||||
fmt.Println("No license was specified for this model.")
|
||||
|
||||
@@ -104,7 +104,6 @@ curl http://localhost:11434/v1/chat/completions \
|
||||
|
||||
#### Notes
|
||||
|
||||
- `finish_reason` will always be `stop`
|
||||
- `usage.prompt_tokens` will be 0 for completions where prompt evaluation is cached
|
||||
|
||||
## Models
|
||||
|
||||
@@ -93,7 +93,7 @@ func AMDGetGPUInfo() []RocmGPUInfo {
|
||||
//}
|
||||
if gfxOverride == "" {
|
||||
if !slices.Contains[[]string, string](supported, gfx) {
|
||||
slog.Warn("amdgpu is not supported", "gpu", i, "gpu_type", gfx, "library", libDir, "supported_types", supported)
|
||||
//slog.Warn("amdgpu is not supported", "gpu", i, "gpu_type", gfx, "library", libDir, "supported_types", supported)
|
||||
// TODO - consider discrete markdown just for ROCM troubleshooting?
|
||||
slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
|
||||
continue
|
||||
@@ -109,10 +109,10 @@ func AMDGetGPUInfo() []RocmGPUInfo {
|
||||
}
|
||||
|
||||
// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
|
||||
if totalMemory < IGPUMemLimit {
|
||||
slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", i, "total", format.HumanBytes2(totalMemory))
|
||||
continue
|
||||
}
|
||||
//if totalMemory < IGPUMemLimit {
|
||||
// slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", i, "total", format.HumanBytes2(totalMemory))
|
||||
// continue
|
||||
//}
|
||||
|
||||
// TODO revisit this once ROCm v6 is available on windows.
|
||||
// v5.7 only reports VRAM used by this process, so it's completely wrong and unusable
|
||||
|
||||
46
llm/ext_server/server.cpp
vendored
46
llm/ext_server/server.cpp
vendored
@@ -1650,26 +1650,41 @@ struct llama_server_context
|
||||
}
|
||||
slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
|
||||
|
||||
char buf[256];
|
||||
llama_model_meta_val_str(model, "general.architecture", buf, 256);
|
||||
bool gemma2 = strcmp(buf, "gemma2") == 0;
|
||||
|
||||
int32_t truncate_at = slot.n_ctx;
|
||||
|
||||
// truncate at 2/3 of the context length for gemma2 models
|
||||
// as they do not support context shifts (from the sliding window implementation).
|
||||
// this way, prompts that almost fit the context length can still generate a full
|
||||
// response without a sudden stop from hitting the context limit
|
||||
if (gemma2) {
|
||||
truncate_at = 2 * slot.n_ctx / 3;
|
||||
}
|
||||
|
||||
// if input prompt is too big, truncate it, if group attention self-extend is disabled
|
||||
if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx)
|
||||
if (slot.ga_n == 1 && slot.n_prompt_tokens >= truncate_at)
|
||||
{
|
||||
const int n_left = slot.n_ctx - slot.params.n_keep;
|
||||
const int n_block_size = n_left / 2;
|
||||
const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
|
||||
const int n_shift = n_left / 2;
|
||||
const int n_erase = slot.n_prompt_tokens - slot.params.n_keep - n_shift;
|
||||
|
||||
std::vector<llama_token> new_tokens(
|
||||
prompt_tokens.begin(),
|
||||
prompt_tokens.begin() + slot.params.n_keep);
|
||||
new_tokens.insert(
|
||||
new_tokens.end(),
|
||||
prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
|
||||
prompt_tokens.begin() + slot.params.n_keep + n_erase,
|
||||
prompt_tokens.end());
|
||||
|
||||
LOG_VERBOSE("input truncated", {
|
||||
{"n_ctx", slot.n_ctx},
|
||||
{"n_keep", slot.params.n_keep},
|
||||
{"n_left", n_left},
|
||||
{"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
|
||||
LOG_INFO("input truncated", {
|
||||
{"n_ctx", slot.n_ctx},
|
||||
{"n_keep", slot.params.n_keep},
|
||||
{"n_left", n_left},
|
||||
{"n_shift", n_shift},
|
||||
{"n_erase", n_erase},
|
||||
});
|
||||
slot.truncated = true;
|
||||
prompt_tokens = new_tokens;
|
||||
@@ -1678,6 +1693,19 @@ struct llama_server_context
|
||||
GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
|
||||
}
|
||||
|
||||
// Models with sliding window attention do not work with context shifts, so
|
||||
// limit their prediction to the context length
|
||||
if (gemma2) {
|
||||
int32_t limit = slot.n_ctx - slot.n_prompt_tokens;
|
||||
slot.n_predict = limit;
|
||||
slot.params.n_predict = limit;
|
||||
LOG_INFO("model does not support sliding window, limiting generation", {
|
||||
{"n_ctx", slot.n_ctx},
|
||||
{"n_prompt_tokens", slot.n_prompt_tokens},
|
||||
{"n_predict", slot.n_predict}
|
||||
});
|
||||
}
|
||||
|
||||
if (!slot.params.cache_prompt)
|
||||
{
|
||||
llama_sampling_reset(slot.ctx_sampling);
|
||||
|
||||
15
llm/ggml.go
15
llm/ggml.go
@@ -366,9 +366,18 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui
|
||||
4*batch*(1+2*embedding+context*(1+heads))+embedding*(6*context*headsKV/heads+embedding*9/16),
|
||||
)
|
||||
}
|
||||
case "gemma":
|
||||
fullOffload = 4 * batch * (embedding + vocab)
|
||||
partialOffload = 4*batch*(2*embedding+vocab+1) + embedding*vocab*105/128
|
||||
case "gemma", "gemma2":
|
||||
fullOffload = max(
|
||||
4*batch*(embedding+vocab),
|
||||
4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
|
||||
)
|
||||
|
||||
partialOffload = max(
|
||||
4*embedding*batch+embedding*vocab*105/128+4*vocab*batch,
|
||||
4*batch*(2*embedding+1+2*embeddingHeadsK*heads+context+context*heads)+
|
||||
4*embeddingHeadsK*context*8+
|
||||
embedding*embeddingHeadsK*heads*9/16,
|
||||
)
|
||||
case "command-r":
|
||||
fullOffload = max(
|
||||
4*batch*(embedding+vocab),
|
||||
|
||||
Reference in New Issue
Block a user