From 298c996e54c4d562c25c2e88e42432a68b1661b7 Mon Sep 17 00:00:00 2001 From: Josh Yan Date: Thu, 30 May 2024 16:02:07 -0700 Subject: [PATCH 1/6] added IsValidNamespace function --- types/model/name.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/types/model/name.go b/types/model/name.go index f32b2596..2739b53c 100644 --- a/types/model/name.go +++ b/types/model/name.go @@ -251,6 +251,16 @@ func (n Name) DisplayShortest() string { return sb.String() } +func IsValidNamespace(namespace string) bool { + name := Name{ + Host: "h", + Model: "m", + Namespace: namespace, + Tag: "t", + } + return name.IsValid() +} + // IsValid reports whether all parts of the name are present and valid. The // digest is a special case, and is checked for validity only if present. func (n Name) IsValid() bool { From c365f195a88cdc5c121878e0dde40274ac71a78e Mon Sep 17 00:00:00 2001 From: Josh Yan Date: Thu, 30 May 2024 16:40:04 -0700 Subject: [PATCH 2/6] directly use isvalidpart --- types/model/name.go | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/types/model/name.go b/types/model/name.go index 2739b53c..d85fd0c6 100644 --- a/types/model/name.go +++ b/types/model/name.go @@ -252,13 +252,7 @@ func (n Name) DisplayShortest() string { } func IsValidNamespace(namespace string) bool { - name := Name{ - Host: "h", - Model: "m", - Namespace: namespace, - Tag: "t", - } - return name.IsValid() + return isValidPart(kindNamespace, namespace) } // IsValid reports whether all parts of the name are present and valid. The From 2e4da8eec2d19d671df5f58c81dfdfa35de8679c Mon Sep 17 00:00:00 2001 From: Josh Yan Date: Fri, 31 May 2024 11:48:07 -0700 Subject: [PATCH 3/6] added tests for IsValidNamespace --- types/model/name_test.go | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/types/model/name_test.go b/types/model/name_test.go index 27a8ccf8..26d70ef3 100644 --- a/types/model/name_test.go +++ b/types/model/name_test.go @@ -385,3 +385,30 @@ func FuzzName(f *testing.F) { }) } + +func TestIsValidNamespace(t *testing.T) { + cases := []struct { + username string + expected bool + }{ + {"", false}, + {"a", true}, + {"a:b", false}, + {"a/b", false}, + {"a:b/c", false}, + {"a/b:c", false}, + {"a/b:c", false}, + {"a/b:c/d", false}, + {"a/b:c/d@e", false}, + {"a/b:c/d@sha256-100", false}, + {"himynameisjoe", true}, + {"himynameisreallyreallyreallyreallylongbutitshouldstillbevalid", true}, + } + for _, tt := range cases { + t.Run(tt.username, func(t *testing.T) { + if got := IsValidNamespace(tt.username); got != tt.expected { + t.Errorf("IsValidName(%q) = %v; want %v", tt.username, got, tt.expected) + } + }) + } +} From 829ff87bd1a98eff727003d3b24748f0f7d8c3ac Mon Sep 17 00:00:00 2001 From: Michael Yang Date: Fri, 31 May 2024 18:54:21 -0700 Subject: [PATCH 4/6] revert tokenize ffi (#4761) * Revert "use `int32_t` for call to tokenize (#4738)" This reverts commit 763bb65dbb88004cd046c8acc0c8e889816e1828. * Revert "vocab only" This reverts commit bf54c845e9ea63ec58762a991dcea78d2c934b47. * Revert "use ffi for tokenizing/detokenizing" This reverts commit 26a00a04108f6cae625802e69faa4b48480bc208. 
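For reference, a minimal sketch of the HTTP round trip this revert restores. It is illustrative only: the port and prompt below are placeholders (the llama server's actual port is chosen at runtime, see s.port in llm/server.go), and the request/response field names simply mirror the /tokenize handler added to server.cpp further down in this patch; /detokenize is the inverse, accepting {"tokens": [...]} and returning {"content": "..."}.

    package main

    import (
    	"bytes"
    	"encoding/json"
    	"fmt"
    	"net/http"
    )

    // Shapes mirror format_tokenizer_response in server.cpp and
    // TokenizeRequest/TokenizeResponse in llm/server.go.
    type tokenizeRequest struct {
    	Content string `json:"content"`
    }

    type tokenizeResponse struct {
    	Tokens []int `json:"tokens"`
    }

    func main() {
    	base := "http://127.0.0.1:8080" // placeholder; not a fixed port

    	body, err := json.Marshal(tokenizeRequest{Content: "why is the sky blue?"})
    	if err != nil {
    		panic(err)
    	}

    	resp, err := http.Post(base+"/tokenize", "application/json", bytes.NewReader(body))
    	if err != nil {
    		panic(err)
    	}
    	defer resp.Body.Close()

    	var out tokenizeResponse
    	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
    		panic(err)
    	}
    	fmt.Println(out.Tokens)
    }
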
--- llm/ext_server/server.cpp | 43 +++++++++++++++ llm/llm.go | 60 -------------------- llm/server.go | 113 ++++++++++++++++++++++++++++++++++---- 3 files changed, 144 insertions(+), 72 deletions(-) diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index c1a803f1..8a0dffea 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -2625,6 +2625,21 @@ static json format_partial_response( return res; } +static json format_tokenizer_response(const std::vector &tokens) +{ + return json { + {"tokens", tokens} + }; +} + +static json format_detokenized_response(std::string content) +{ + return json { + {"content", content} + }; +} + + static void log_server_request(const httplib::Request &req, const httplib::Response &res) { // skip GH copilot requests when using default port @@ -3114,6 +3129,34 @@ int main(int argc, char **argv) { } }); + svr.Post("/tokenize", [&llama](const httplib::Request &req, httplib::Response &res) + { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); + const json body = json::parse(req.body); + std::vector tokens; + if (body.count("content") != 0) + { + tokens = llama.tokenize(body["content"], false); + } + const json data = format_tokenizer_response(tokens); + return res.set_content(data.dump(), "application/json; charset=utf-8"); + }); + + svr.Post("/detokenize", [&llama](const httplib::Request &req, httplib::Response &res) + { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); + const json body = json::parse(req.body); + std::string content; + if (body.count("tokens") != 0) + { + const std::vector tokens = body["tokens"]; + content = tokens_to_str(llama.ctx, tokens.cbegin(), tokens.cend()); + } + + const json data = format_detokenized_response(content); + return res.set_content(data.dump(), "application/json; charset=utf-8"); + }); + svr.Post("/embedding", [&llama](const httplib::Request &req, httplib::Response &res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); diff --git a/llm/llm.go b/llm/llm.go index 4492d39f..2a0c4b91 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -12,7 +12,6 @@ package llm import "C" import ( "fmt" - "strings" "unsafe" ) @@ -38,62 +37,3 @@ func Quantize(infile, outfile string, ftype fileType) error { return nil } - -type llamaModel struct { - m *C.struct_llama_model -} - -func newLlamaModel(p string) *llamaModel { - cs := C.CString(p) - defer C.free(unsafe.Pointer(cs)) - - params := C.llama_model_default_params() - params.vocab_only = true - - return &llamaModel{ - C.llama_load_model_from_file(cs, params), - } -} - -func (llm *llamaModel) Close() { - C.llama_free_model(llm.m) -} - -func (llm *llamaModel) Tokenize(s string) []int { - cs := C.CString(s) - defer C.free(unsafe.Pointer(cs)) - - ltokens := make([]C.llama_token, len(s)+2) - n := C.llama_tokenize( - llm.m, - cs, - C.int32_t(len(s)), - <okens[0], - C.int32_t(len(ltokens)), - false, - true, - ) - - if n < 0 { - return nil - } - - tokens := make([]int, n) - for i := 0; i < int(n); i++ { - tokens[i] = int(ltokens[i]) - } - - return tokens -} - -func (llm *llamaModel) Detokenize(i32s []int) string { - var sb strings.Builder - for _, i32 := range i32s { - c := make([]byte, 512) - if n := C.llama_token_to_piece(llm.m, C.llama_token(i32), (*C.char)(unsafe.Pointer(&c[0])), C.int(len(c)), false); n > 0 { - sb.WriteString(unsafe.String(&c[0], n)) - } - } - - return sb.String() -} diff --git a/llm/server.go b/llm/server.go index 97aa2a15..3af8a329 100644 --- a/llm/server.go 
+++ b/llm/server.go @@ -57,8 +57,6 @@ type llmServer struct { loadDuration time.Duration // Record how long it took the model to load loadProgress float32 - *llamaModel - sem *semaphore.Weighted } @@ -311,7 +309,6 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr totalLayers: ggml.KV().BlockCount() + 1, gpuCount: gpuCount, done: make(chan error, 1), - llamaModel: newLlamaModel(model), } s.cmd.Env = os.Environ() @@ -849,12 +846,12 @@ func (s *llmServer) Embedding(ctx context.Context, prompt string) ([]float64, er return nil, fmt.Errorf("unexpected server status: %s", status.ToString()) } - var b bytes.Buffer - if err := json.NewEncoder(&b).Encode(EmbeddingRequest{Content: prompt}); err != nil { + data, err := json.Marshal(TokenizeRequest{Content: prompt}) + if err != nil { return nil, fmt.Errorf("error marshaling embed data: %w", err) } - req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/embedding", s.port), &b) + req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/embedding", s.port), bytes.NewBuffer(data)) if err != nil { return nil, fmt.Errorf("error creating embed request: %w", err) } @@ -884,12 +881,108 @@ func (s *llmServer) Embedding(ctx context.Context, prompt string) ([]float64, er return embedding.Embedding, nil } +type TokenizeRequest struct { + Content string `json:"content"` +} + +type TokenizeResponse struct { + Tokens []int `json:"tokens"` +} + func (s *llmServer) Tokenize(ctx context.Context, content string) ([]int, error) { - return s.llamaModel.Tokenize(content), nil + // Make sure the server is ready + status, err := s.getServerStatus(ctx) + if err != nil { + return nil, err + } else if status != ServerStatusReady && status != ServerStatusNoSlotsAvailable { + return nil, fmt.Errorf("unexpected server status: %s", status.ToString()) + } + + data, err := json.Marshal(TokenizeRequest{Content: content}) + if err != nil { + return nil, fmt.Errorf("marshaling encode data: %w", err) + } + + req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/tokenize", s.port), bytes.NewBuffer(data)) + if err != nil { + return nil, fmt.Errorf("encode request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + + resp, err := http.DefaultClient.Do(req) + if err != nil { + return nil, fmt.Errorf("do encode request: %w", err) + } + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("read encode request: %w", err) + } + + if resp.StatusCode >= 400 { + log.Printf("llm encode error: %s", body) + return nil, fmt.Errorf("%s", body) + } + + var encoded TokenizeResponse + if err := json.Unmarshal(body, &encoded); err != nil { + return nil, fmt.Errorf("unmarshal encode response: %w", err) + } + + return encoded.Tokens, nil +} + +type DetokenizeRequest struct { + Tokens []int `json:"tokens"` +} + +type DetokenizeResponse struct { + Content string `json:"content"` } func (s *llmServer) Detokenize(ctx context.Context, tokens []int) (string, error) { - return s.llamaModel.Detokenize(tokens), nil + // Make sure the server is ready + status, err := s.getServerStatus(ctx) + if err != nil { + return "", err + } else if status != ServerStatusReady && status != ServerStatusNoSlotsAvailable { + return "", fmt.Errorf("unexpected server status: %s", status.ToString()) + } + + data, err := json.Marshal(DetokenizeRequest{Tokens: tokens}) + if err != nil { + return "", fmt.Errorf("marshaling 
decode data: %w", err) + } + + req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/detokenize", s.port), bytes.NewBuffer(data)) + if err != nil { + return "", fmt.Errorf("decode request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + + resp, err := http.DefaultClient.Do(req) + if err != nil { + return "", fmt.Errorf("do decode request: %w", err) + } + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + if err != nil { + return "", fmt.Errorf("read decode request: %w", err) + } + + if resp.StatusCode >= 400 { + log.Printf("llm decode error: %s", body) + return "", fmt.Errorf("%s", body) + } + + var decoded DetokenizeResponse + if err := json.Unmarshal(body, &decoded); err != nil { + return "", fmt.Errorf("unmarshal encode response: %w", err) + } + + return decoded.Content, nil } func (s *llmServer) Close() error { @@ -907,10 +1000,6 @@ func (s *llmServer) Close() error { slog.Debug("llama server stopped") } - if s.llamaModel != nil { - s.llamaModel.Close() - } - return nil } From 476fb8e89242720a7cdd57400ba928de4dde9cc1 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sat, 1 Jun 2024 19:24:33 -0700 Subject: [PATCH 5/6] Limit GPU lib search for now (#4777) * fix oneapi errors on windows 10 --- gpu/gpu.go | 32 +------------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/gpu/gpu.go b/gpu/gpu.go index defdf04d..03e16702 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -16,13 +16,12 @@ import ( "os" "path/filepath" "runtime" - "strconv" "strings" "sync" "unsafe" - "github.com/ollama/ollama/format" "github.com/ollama/ollama/envconfig" + "github.com/ollama/ollama/format" ) type handles struct { @@ -105,8 +104,6 @@ func initGPUHandles() *handles { var cudartMgmtPatterns []string var nvcudaMgmtName string var nvcudaMgmtPatterns []string - var oneapiMgmtName string - var oneapiMgmtPatterns []string tmpDir, _ := PayloadsDir() switch runtime.GOOS { @@ -118,8 +115,6 @@ func initGPUHandles() *handles { // Aligned with driver, we can't carry as payloads nvcudaMgmtName = "nvcuda.dll" nvcudaMgmtPatterns = NvcudaWindowsGlobs - oneapiMgmtName = "ze_intel_gpu64.dll" - oneapiMgmtPatterns = OneapiWindowsGlobs case "linux": cudartMgmtName = "libcudart.so*" if tmpDir != "" { @@ -130,8 +125,6 @@ func initGPUHandles() *handles { // Aligned with driver, we can't carry as payloads nvcudaMgmtName = "libcuda.so*" nvcudaMgmtPatterns = NvcudaLinuxGlobs - oneapiMgmtName = "libze_intel_gpu.so" - oneapiMgmtPatterns = OneapiLinuxGlobs default: return gpuHandles } @@ -159,17 +152,6 @@ func initGPUHandles() *handles { } } - oneapiLibPaths := FindGPULibs(oneapiMgmtName, oneapiMgmtPatterns) - if len(oneapiLibPaths) > 0 { - deviceCount, oneapi, libPath := LoadOneapiMgmt(oneapiLibPaths) - if oneapi != nil { - slog.Debug("detected Intel GPUs", "library", libPath, "count", deviceCount) - gpuHandles.oneapi = oneapi - gpuHandles.deviceCount = deviceCount - return gpuHandles - } - } - return gpuHandles } @@ -245,18 +227,6 @@ func GetGPUInfo() GpuInfoList { // TODO potentially sort on our own algorithm instead of what the underlying GPU library does... resp = append(resp, gpuInfo) } - if gpuHandles.oneapi != nil { - gpuInfo := GpuInfo{ - Library: "oneapi", - } - C.oneapi_check_vram(*gpuHandles.oneapi, &memInfo) - var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend. 
- memInfo.free = C.uint64_t(totalFreeMem) - gpuInfo.TotalMemory = uint64(memInfo.total) - gpuInfo.FreeMemory = uint64(memInfo.free) - gpuInfo.ID = strconv.Itoa(i) - resp = append(resp, gpuInfo) - } } // Then AMD From d4a86102fd5f84cca50757af00296606ac191890 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sat, 1 Jun 2024 21:05:51 -0700 Subject: [PATCH 6/6] update welcome prompt in windows to `llama3` (#4779) --- app/ollama_welcome.ps1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/ollama_welcome.ps1 b/app/ollama_welcome.ps1 index e7056952..9af37a46 100644 --- a/app/ollama_welcome.ps1 +++ b/app/ollama_welcome.ps1 @@ -4,5 +4,5 @@ write-host "Welcome to Ollama!" write-host "" write-host "Run your first model:" write-host "" -write-host "`tollama run llama2" +write-host "`tollama run llama3" write-host "" \ No newline at end of file
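
As a closing note on patches 1-3: a minimal, hypothetical usage sketch of the new IsValidNamespace helper. The inputs and the surrounding program are assumptions for illustration; only model.IsValidNamespace itself comes from this series.

    package main

    import (
    	"fmt"

    	"github.com/ollama/ollama/types/model"
    )

    func main() {
    	// Arbitrary example inputs; a namespace must be a single valid
    	// name part (no '/', ':' or '@' separators), per the tests above.
    	for _, ns := range []string{"jmorganca", "a:b", ""} {
    		fmt.Printf("IsValidNamespace(%q) = %v\n", ns, model.IsValidNamespace(ns))
    	}
    }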