Merge branch 'ollama:main' into main

likelovewant committed 2024-05-18 13:52:47 +08:00 (committed by GitHub)
6 changed files with 21 additions and 32 deletions

View File

@@ -409,6 +409,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
- [AI Telegram Bot](https://github.com/tusharhero/aitelegrambot) (Telegram bot using Ollama in backend)
- [AI ST Completion](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (Sublime Text 4 AI assistant plugin with Ollama support)
- [Discord-Ollama Chat Bot](https://github.com/kevinthedang/discord-ollama) (Generalized TypeScript Discord Bot w/ Tuning Documentation)
+- [Discord AI chat/moderation bot](https://github.com/rapmd73/Companion) (Chat/moderation bot written in Python. Uses Ollama to create personalities.)
### Supported backends
- [llama.cpp](https://github.com/ggerganov/llama.cpp) project founded by Georgi Gerganov.

View File

@@ -25,7 +25,7 @@ import (
"time"
"github.com/containerd/console"
"github.com/mattn/go-runewidth"
"github.com/olekukonko/tablewriter"
"github.com/spf13/cobra"
"golang.org/x/crypto/ssh"
@@ -744,7 +744,8 @@ func displayResponse(content string, wordWrap bool, state *displayResponseState)
if wordWrap && termWidth >= 10 {
for _, ch := range content {
if state.lineLength+1 > termWidth-5 {
-if len(state.wordBuffer) > termWidth-10 {
+if runewidth.StringWidth(state.wordBuffer) > termWidth-10 {
fmt.Printf("%s%c", state.wordBuffer, ch)
state.wordBuffer = ""
state.lineLength = 0
@@ -752,12 +753,18 @@ func displayResponse(content string, wordWrap bool, state *displayResponseState)
}
// backtrack the length of the last word and clear to the end of the line
fmt.Printf("\x1b[%dD\x1b[K\n", len(state.wordBuffer))
fmt.Printf("\x1b[%dD\x1b[K\n", runewidth.StringWidth(state.wordBuffer))
fmt.Printf("%s%c", state.wordBuffer, ch)
-state.lineLength = len(state.wordBuffer) + 1
+chWidth := runewidth.RuneWidth(ch)
+state.lineLength = runewidth.StringWidth(state.wordBuffer) + chWidth
} else {
fmt.Print(string(ch))
-state.lineLength += 1
+state.lineLength += runewidth.RuneWidth(ch)
+if runewidth.RuneWidth(ch) >= 2 {
+state.wordBuffer = ""
+continue
+}
switch ch {
case ' ':
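
For context on the change above: Go's `len` counts bytes, while a terminal advances by display columns, so wrapping math based on `len` drifts whenever the buffer holds wide (e.g. CJK) characters. A minimal, standalone sketch of the difference, using the same go-runewidth package:

```go
package main

import (
	"fmt"

	"github.com/mattn/go-runewidth"
)

func main() {
	word := "世界" // two CJK characters
	fmt.Println(len(word))                   // 6: bytes in UTF-8
	fmt.Println(len([]rune(word)))           // 2: runes
	fmt.Println(runewidth.StringWidth(word)) // 4: terminal columns
	fmt.Println(runewidth.RuneWidth('世'))    // 2: columns for one wide rune
}
```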

View File

@@ -19,6 +19,11 @@ import (
)
func TestMaxQueue(t *testing.T) {
+if os.Getenv("OLLAMA_TEST_EXISTING") != "" {
+t.Skip("Max Queue test requires spawning a local server so we can adjust the queue size")
+return
+}
// Note: This test can be quite slow when running in CPU mode, so keep the threadCount low unless you're on GPU
// Also note that by default Darwin can't sustain > ~128 connections without adjusting limits
threadCount := 32
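
An aside on the guard added above: `t.Skip` ends the test goroutine via `runtime.Goexit`, so execution never continues past it and the explicit `return` is redundant, though harmless. A minimal sketch of the pattern, reusing the same `OLLAMA_TEST_EXISTING` variable (the test name here is hypothetical):

```go
package integration_test

import (
	"os"
	"testing"
)

// Tests that must control server configuration skip themselves when the
// suite is pointed at an already-running server.
func TestNeedsOwnServer(t *testing.T) {
	if os.Getenv("OLLAMA_TEST_EXISTING") != "" {
		t.Skip("test spawns its own server to adjust the queue size")
	}
	// reached only when OLLAMA_TEST_EXISTING is unset
}
```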
@@ -109,9 +114,9 @@ func TestMaxQueue(t *testing.T) {
slog.Info("generate done, waiting for embeds")
embedwg.Wait()
slog.Info("embeds completed", "success", succesCount, "busy", busyCount, "reset", resetByPeerCount, "canceled", canceledCount)
require.Equal(t, resetByPeerCount, 0, "Connections reset by peer, have you updated your fd and socket limits?")
require.True(t, busyCount > 0, "no requests hit busy error but some should have")
require.True(t, canceledCount == 0, "no requests should have been canceled due to timeout")
slog.Info("embeds completed", "success", succesCount, "busy", busyCount, "reset", resetByPeerCount, "canceled", canceledCount)
}
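
The reorder above moves the summary log ahead of the assertions: `require.*` aborts the test function on failure, so a log line placed after a failing `require` would never be printed. A small sketch of the ordering, with hypothetical counter values:

```go
package integration_test

import (
	"log/slog"
	"testing"

	"github.com/stretchr/testify/require"
)

func TestLogBeforeAsserts(t *testing.T) {
	successCount, busyCount, resetByPeerCount := 30, 2, 0 // hypothetical counters
	// Log first: if a require below fails, the test stops immediately
	// and nothing after the failing assertion runs.
	slog.Info("done", "success", successCount, "busy", busyCount, "reset", resetByPeerCount)
	require.Equal(t, 0, resetByPeerCount, "connections reset by peer, check fd/socket limits")
	require.Greater(t, busyCount, 0, "some requests should have hit the busy error")
}
```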

View File

@@ -1,24 +0,0 @@
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index e3c9bcd4..b43f892d 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -573,14 +573,16 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
struct ggml_tensor * embeddings = inp;
if (ctx->has_class_embedding) {
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
+ }
+ ggml_set_name(embeddings, "embeddings");
+ ggml_set_input(embeddings);
+
+ if (ctx->has_class_embedding) {
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
embeddings = ggml_acc(ctx0, embeddings, inp,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
}
- ggml_set_name(embeddings, "embeddings");
- ggml_set_input(embeddings);
-
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
ggml_set_name(positions, "positions");

View File

@@ -1086,7 +1086,7 @@ func Serve(ln net.Listener) error {
return err
}
<-ctx.Done()
-return err
+return nil
}
func waitForStream(c *gin.Context, ch chan interface{}) {
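
The `return nil` above treats a canceled context as a clean shutdown instead of surfacing a stale `err`. A minimal sketch of that pattern (a hypothetical `serve` helper, not Ollama's actual `Serve`):

```go
package main

import (
	"context"
	"errors"
	"net"
	"net/http"
)

func serve(ctx context.Context, ln net.Listener) error {
	srv := &http.Server{}
	go func() {
		<-ctx.Done() // context canceled: begin shutdown
		srv.Close()
	}()
	if err := srv.Serve(ln); err != nil && !errors.Is(err, http.ErrServerClosed) {
		return err // a real listener failure
	}
	return nil // shutdown via cancellation is not an error
}
```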