embedding db

2025-10-08 21:04:33 +02:00 · 2025-10-08 21:04:33 +02:00 · c63890b104
parent 46a4374e69
commit c63890b104
12 changed files with 448 additions and 40 deletions
--- a/39
+++ b/39
@ -1,6 +1,6 @@
 # Makefile for running the Vet Clinic Chat Assistant locally with Ollama
-.PHONY: run ollama-start ollama-stop ollama-pull ollama-status
+.PHONY: run ollama-start ollama-stop ollama-pull ollama-status curl-embed curl-translate curl-chat
 # Start Ollama server (if not already running)
 ollama-start:
@ -20,6 +20,15 @@ ollama-pull:
 ollama-status:
 	ollama list
 # Ollama host & models (override as needed)
 OLLAMA_HOST ?= http://localhost:11434
 # Primary chat / reasoning model (already using OPENAI_MODEL var for compatibility)
 OPENAI_MODEL ?= qwen3:latest
 # Optional separate embedding model
 OLLAMA_EMBED_MODEL ?= all-minilm
 # Translation prompt (mirrors config.yaml translate_prompt). Can override: make curl-translate PROMPT="..." TRANSLATE_PROMPT="..."
 TRANSLATE_PROMPT ?= Translate the following veterinary-related sentence to English. Input: '$(PROMPT)'. Return ONLY the English translation, no extra text, no markdown, no quotes. If already English, return as is.
 # Database configuration (override via: make run DB_PASSWORD=secret DB_NAME=other)
 DB_HOST ?= localhost
 DB_PORT ?= 5432
@ -70,3 +79,31 @@ test-coverage:
 # Run tests with HTML coverage report
 test-coverage-html: test-coverage
 	go tool cover -html=coverage.out
 # --- Utility curl targets ---
 # Example: make curl-embed PROMPT="warm up"
 curl-embed:
 	@test -n "$(PROMPT)" || { echo "Usage: make curl-embed PROMPT='text' [OLLAMA_EMBED_MODEL=model]"; exit 1; }
 	@echo "[curl-embed] model=$(OLLAMA_EMBED_MODEL) prompt='$(PROMPT)'"
 	@curl -sS -X POST "$(OLLAMA_HOST)/api/embeddings" \
 	  -H 'Content-Type: application/json' \
 	  -d '{"model":"$(OLLAMA_EMBED_MODEL)","prompt":"$(PROMPT)"}' | jq . || true
 # Example: make curl-translate PROMPT="A kutyám nem eszik"
 curl-translate:
 	@test -n "$(PROMPT)" || { echo "Usage: make curl-translate PROMPT='sentence to translate'"; exit 1; }
 	@echo "[curl-translate] model=$(OPENAI_MODEL)"; \
 	PROMPT_JSON=$$(printf '%s' "$(TRANSLATE_PROMPT)" | jq -Rs .); \
 	curl -sS -X POST "$(OLLAMA_HOST)/api/chat" \
 	  -H 'Content-Type: application/json' \
 	  -d '{"model":"$(OPENAI_MODEL)","messages":[{"role":"user","content":'$$PROMPT_JSON'}],"stream":false}' | jq -r '.message.content' || true
 # Generic chat invocation (raw user PROMPT)
 # Example: make curl-chat PROMPT="List 3 dog breeds"
 curl-chat:
 	@test -n "$(PROMPT)" || { echo "Usage: make curl-chat PROMPT='your message'"; exit 1; }
 	@echo "[curl-chat] model=$(OPENAI_MODEL)"; \
 	PROMPT_JSON=$$(printf '%s' "$(PROMPT)" | jq -Rs .); \
 	curl -sS -X POST "$(OLLAMA_HOST)/api/chat" \
 	  -H 'Content-Type: application/json' \
 	  -d '{"model":"$(OPENAI_MODEL)","messages":[{"role":"user","content":'$$PROMPT_JSON'}],"stream":false}' | jq -r '.message.content' || true
--- a/config/config.yaml
+++ b/config/config.yaml
@ -1,4 +1,4 @@
 llm:
  extract_keywords_prompt: "You will extract structured data from the user input. Input text: {{.Message}}. Return ONLY valid minified JSON object with keys: translate (English translation of input), keyword (array of 3-5 concise English veterinary-related keywords derived strictly from the input), animal (animal mentioned or 'unknown'). Example: {\"translate\":\"dog has diarrhea\",\"keyword\":[\"diarrhea\",\"digestive\"],\"animal\":\"dog\"}. Do not add extra text, markdown, or quotes outside JSON."
  disambiguate_prompt: "Given candidate visit entries (JSON array): {{.Entries}} and user message: {{.Message}} choose the best matching visit's ID. Return ONLY JSON: {\"visitReason\":\"<one of the candidate IDs or empty string if none>\"}. No other text."
-  translate_prompt: "Translate the following veterinary-related sentence to English. Input: '{{.Message}}'. Return ONLY the English translation, no extra text, no markdown, no quotes. If already English, return as is."
+  translate_prompt: "Translate the following veterinary-related sentence to English. Input: '{{.Message}}'. Return ONLY the English translation as one concise sentence. IMPORTANT: Do NOT output any <think> tags, reasoning, analysis, or explanations. No markdown, no quotes. If already English, return it unchanged."
--- a/llm.go
+++ b/llm.go
@ -55,7 +55,14 @@ func NewLLMClientFromEnv(repo ChatRepositoryAPI) LLMClientAPI {
 	case "openai", "openrouter":
 		return NewOpenAIClient(apiKey, baseURL, model, repo)
 	case "ollama", "":
-		return NewOllamaClient(apiKey, baseURL, model, repo)
+		oc := NewOllamaClient(apiKey, baseURL, model, repo)
 		em := os.Getenv("OLLAMA_EMBED_MODEL")
 		if strings.TrimSpace(em) == "" {
 			em = "all-minilm"
 			logrus.Infof("No OLLAMA_EMBED_MODEL specified; defaulting embedding model to %s", em)
 		}
 		oc.EmbeddingModel = em
 		return oc
 	default:
 		logrus.Warnf("Unknown LLM_PROVIDER %q, defaulting to Ollama", provider)
 		return NewOllamaClient(apiKey, baseURL, model, repo)
--- a/main.go
+++ b/main.go
@ -72,6 +72,9 @@ func main() {
 	// Initialize LLM client
 	llm := NewLLMClientFromEnv(repo)
 	// Launch background backfill of sentence embeddings (non-blocking)
 	startSentenceEmbeddingBackfill(repo, llm, &visitDB)
 	// Wrap templates for controller
 	uiTmpl := &TemplateWrapper{Tmpl: uiTemplate}
 	uiDBEditTmpl := &TemplateWrapper{Tmpl: uiDBEditTemplate}
--- a/migrations/0004_alter_sentence_embeddings_visit_id_type.up.sql
+++ b/migrations/0004_alter_sentence_embeddings_visit_id_type.up.sql
@ -0,0 +1,8 @@
 -- +goose Up
 ALTER TABLE sentence_embeddings
    ALTER COLUMN visit_id TYPE TEXT USING visit_id::text;
 -- +goose Down
 ALTER TABLE sentence_embeddings
    ALTER COLUMN visit_id TYPE INTEGER USING visit_id::integer;
--- a/migrations/0005_add_unique_idx_sentence_embeddings.up.sql
+++ b/migrations/0005_add_unique_idx_sentence_embeddings.up.sql
@ -0,0 +1,6 @@
 -- +goose Up
 CREATE UNIQUE INDEX IF NOT EXISTS ux_sentence_embeddings_visit_sentence
    ON sentence_embeddings(visit_id, sentence);
 -- +goose Down
 DROP INDEX IF EXISTS ux_sentence_embeddings_visit_sentence;
--- a/migrations/0006_alter_sentence_embeddings_dim_384.up.sql
+++ b/migrations/0006_alter_sentence_embeddings_dim_384.up.sql
@ -0,0 +1,14 @@
 -- +goose Up
 -- WARNING: This alters the embeddings vector dimension from 1536 to 384.
 -- Ensure you are switching to a 384-dim embedding model (e.g., all-minilm).
 -- If existing rows have 1536-d vectors this command will fail; you may need to
 -- TRUNCATE TABLE sentence_embeddings first (after backing up) before applying.
 ALTER TABLE sentence_embeddings
    ALTER COLUMN embeddings TYPE vector(384);
 -- +goose Down
 -- Revert to 1536 dimensions (for models like OpenAI text-embedding-3-large).
 -- Will fail if existing rows are 384-d.
 ALTER TABLE sentence_embeddings
    ALTER COLUMN embeddings TYPE vector(1536);
--- a/migrations/0007_add_dual_embedding_columns.up.sql
+++ b/migrations/0007_add_dual_embedding_columns.up.sql
@ -0,0 +1,13 @@
 -- +goose Up
 -- Add separate columns for different embedding dimensions.
 -- Existing 'embeddings' column (if present) is left untouched for backward compatibility.
 -- Application code will now populate embedding_384 or embedding_1536 instead.
 ALTER TABLE sentence_embeddings
    ADD COLUMN IF NOT EXISTS embedding_384   vector(384),
    ADD COLUMN IF NOT EXISTS embedding_1536  vector(1536);
 -- +goose Down
 ALTER TABLE sentence_embeddings
    DROP COLUMN IF EXISTS embedding_384,
    DROP COLUMN IF EXISTS embedding_1536;
--- a/migrations/0008_drop_legacy_embeddings_column.up.sql
+++ b/migrations/0008_drop_legacy_embeddings_column.up.sql
@ -0,0 +1,11 @@
 -- +goose Up
 -- Drop the legacy single-dimension embeddings column (was NOT NULL) to allow inserts
 -- that now use embedding_384 / embedding_1536. All new data goes into those columns.
 ALTER TABLE sentence_embeddings
    DROP COLUMN IF EXISTS embeddings;
 -- +goose Down
 -- Re-create the legacy embeddings column (empty) as vector(1536) NULLABLE for rollback.
 ALTER TABLE sentence_embeddings
    ADD COLUMN IF NOT EXISTS embeddings vector(1536);
--- a/ollama_client.go
+++ b/ollama_client.go
@ -7,7 +7,10 @@ import (
 	"fmt"
 	"io"
 	"net/http"
 	"os"
 	"strconv"
 	"strings"
 	"time"
 	"github.com/sirupsen/logrus"
 )
@ -15,10 +18,11 @@ import (
 // --- OllamaClient implementation ---
 type OllamaClient struct {
-	APIKey  string
+	APIKey         string
-	BaseURL string
+	BaseURL        string
-	Model   string
+	Model          string
-	Repo    ChatRepositoryAPI
+	EmbeddingModel string
 	Repo           ChatRepositoryAPI
 }
 func NewOllamaClient(apiKey, baseURL, model string, repo ChatRepositoryAPI) *OllamaClient {
@ -90,12 +94,28 @@ func (llm *OllamaClient) ollamaCompletion(ctx context.Context, prompt string, fo
 	if apiURL == "" {
 		apiURL = "http://localhost:11434/api/chat"
 	}
 	messages := []map[string]string{{"role": "user", "content": prompt}}
 	//if os.Getenv("DISABLE_THINK") == "1" {
 	// System message to suppress chain-of-thought style outputs.
 	messages = append([]map[string]string{{
 		"role":    "system",
 		"content": "You are a concise assistant. Output ONLY the final answer requested by the user. Do not include reasoning, analysis, or <think> tags.",
 	}}, messages...)
 	//}
 	body := map[string]interface{}{
 		"model":    llm.Model,
-		"messages": []map[string]string{{"role": "user", "content": prompt}},
+		"messages": messages,
 		"stream":   false,
 		"format":   format,
 	}
 	// Optional: Add a stop sequence to prevent <think> tags if they appear
 	if os.Getenv("DISABLE_THINK") == "1" {
 		body["options"] = map[string]interface{}{"stop": []string{"<think>"}}
 	}
 	jsonBody, _ := json.Marshal(body)
 	req, _ := http.NewRequestWithContext(ctx, http.MethodPost, apiURL, bytes.NewBuffer(jsonBody))
 	if llm.APIKey != "" {
@ -128,43 +148,112 @@ func (llm *OllamaClient) ollamaCompletion(ctx context.Context, prompt string, fo
 	return "", fmt.Errorf("unrecognized LLM response format: %.200s", string(raw))
 }
 func normalizeOllamaHost(raw string) string {
 	if raw == "" {
 		return "http://localhost:11434"
 	}
 	// strip trailing /api/* paths if user provided full endpoint
 	lower := strings.ToLower(raw)
 	for _, seg := range []string{"/api/chat", "/api/embeddings", "/api/generate"} {
 		if strings.HasSuffix(lower, seg) {
 			return raw[:len(raw)-len(seg)]
 		}
 	}
 	return raw
 }
 func (llm *OllamaClient) GetEmbeddings(ctx context.Context, input string) ([]float64, error) {
-	apiURL := llm.BaseURL
+	host := normalizeOllamaHost(llm.BaseURL)
-	if apiURL == "" {
+	apiURL := host + "/api/embeddings"
-		apiURL = "http://localhost:11434/api/embeddings"
+	modelName := llm.Model
 	if llm.EmbeddingModel != "" {
 		modelName = llm.EmbeddingModel
 	}
-	body := map[string]interface{}{
+	// retry parameters (env override OLLAMA_EMBED_ATTEMPTS)
-		"model":  llm.Model,
+	maxAttempts := 5
-		"prompt": input,
+	if v := os.Getenv("OLLAMA_EMBED_ATTEMPTS"); v != "" {
 		if n, err := strconv.Atoi(v); err == nil && n > 0 && n < 20 {
 			maxAttempts = n
 		}
 	}
-	jsonBody, _ := json.Marshal(body)
+	baseBackoff := 300 * time.Millisecond
-	req, _ := http.NewRequestWithContext(ctx, http.MethodPost, apiURL, bytes.NewBuffer(jsonBody))
+	var lastErr error
-	if llm.APIKey != "" {
+	for attempt := 0; attempt < maxAttempts; attempt++ {
-		req.Header.Set("Authorization", "Bearer "+llm.APIKey)
+		select {
 		case <-ctx.Done():
 			return nil, ctx.Err()
 		default:
 		}
 		body := map[string]interface{}{
 			"model":  modelName,
 			"prompt": input,
 		}
 		jsonBody, _ := json.Marshal(body)
 		req, _ := http.NewRequestWithContext(ctx, http.MethodPost, apiURL, bytes.NewBuffer(jsonBody))
 		if llm.APIKey != "" {
 			req.Header.Set("Authorization", "Bearer "+llm.APIKey)
 		}
 		req.Header.Set("Content-Type", "application/json")
 		req.Header.Set("Accept", "application/json")
 		resp, err := (&http.Client{}).Do(req)
 		if err != nil {
 			lastErr = err
 			logrus.WithError(err).Warnf("[Ollama] embeddings request attempt=%d failed", attempt+1)
 		} else {
 			raw, rerr := io.ReadAll(resp.Body)
 			resp.Body.Close()
 			if rerr != nil {
 				lastErr = rerr
 			} else {
 				var generic map[string]json.RawMessage
 				if jerr := json.Unmarshal(raw, &generic); jerr != nil {
 					lastErr = fmt.Errorf("unrecognized response (parse): %w", jerr)
 				} else if embRaw, ok := generic["embedding"]; ok && len(embRaw) > 0 {
 					var emb []float64
 					if jerr := json.Unmarshal(embRaw, &emb); jerr != nil {
 						lastErr = fmt.Errorf("failed to decode embedding: %w", jerr)
 					} else if len(emb) == 0 {
 						lastErr = fmt.Errorf("empty embedding returned")
 					} else {
 						return emb, nil
 					}
 				} else if drRaw, ok := generic["done_reason"]; ok {
 					var reason string
 					_ = json.Unmarshal(drRaw, &reason)
 					if reason == "load" { // transient model loading state
 						lastErr = fmt.Errorf("model loading")
 					} else {
 						lastErr = fmt.Errorf("unexpected done_reason=%s", reason)
 					}
 				} else if errRaw, ok := generic["error"]; ok {
 					var errMsg string
 					_ = json.Unmarshal(errRaw, &errMsg)
 					if errMsg != "" {
 						lastErr = fmt.Errorf("embedding error: %s", errMsg)
 					} else {
 						lastErr = fmt.Errorf("embedding error (empty message)")
 					}
 				} else {
 					lastErr = fmt.Errorf("unrecognized embedding response: %.200s", string(raw))
 				}
 			}
 		}
 		if lastErr == nil {
 			break
 		}
 		// backoff if not last attempt
 		if attempt < maxAttempts-1 {
 			delay := baseBackoff << attempt
 			if strings.Contains(strings.ToLower(lastErr.Error()), "model loading") {
 				delay += 1 * time.Second
 			}
 			time.Sleep(delay)
 		}
 	}
-	req.Header.Set("Content-Type", "application/json")
+	if lastErr == nil {
-	req.Header.Set("Accept", "application/json")
+		lastErr = fmt.Errorf("embedding retrieval failed with no error info")
 	client := &http.Client{}
 	resp, err := client.Do(req)
 	if err != nil {
 		return nil, err
 	}
-	defer resp.Body.Close()
+	return nil, lastErr
 	raw, err := io.ReadAll(resp.Body)
 	if err != nil {
 		return nil, err
 	}
 	var ollama struct {
 		Embedding []float64 `json:"embedding"`
 		Error     string    `json:"error"`
 	}
 	if err := json.Unmarshal(raw, &ollama); err == nil && len(ollama.Embedding) > 0 {
 		return ollama.Embedding, nil
 	}
 	if ollama.Error != "" {
 		return nil, fmt.Errorf("embedding error: %s", ollama.Error)
 	}
 	return nil, fmt.Errorf("unrecognized embedding response: %.200s", string(raw))
 }
 func (llm *OllamaClient) TranslateToEnglish(ctx context.Context, message string) (string, error) {
--- a/repository.go
+++ b/repository.go
@ -2,7 +2,10 @@ package main
 import (
 	"context"
 	"fmt"
 	"os"
 	"strconv"
 	"strings"
 	"time"
 	"github.com/jackc/pgx/v5/pgconn"
@ -263,6 +266,69 @@ func (r *PGChatRepository) CreateUser(ctx context.Context, username, passwordHas
 	return err
 }
 // InsertSentenceEmbedding inserts a sentence embedding if not already present (unique index on visit_id,sentence)
 func (r *PGChatRepository) InsertSentenceEmbedding(ctx context.Context, visitID, sentence, translated string, embedding []float64) error {
 	if r == nil || r.pool == nil {
 		return nil
 	}
 	l := len(embedding)
 	if l != 384 && l != 1536 {
 		err := fmt.Errorf("unsupported embedding length %d (expected 384 or 1536)", l)
 		logrus.WithError(err).Warn("skipping sentence embedding insert")
 		return err
 	}
 	// Build vector literal
 	var b strings.Builder
 	b.Grow(len(embedding)*8 + 2)
 	b.WriteByte('[')
 	for i, v := range embedding {
 		if i > 0 {
 			b.WriteByte(',')
 		}
 		b.WriteString(strconv.FormatFloat(v, 'f', -1, 64))
 	}
 	b.WriteByte(']')
 	vecLiteral := b.String()
 	ctx, cancel := context.WithTimeout(ctx, 6*time.Second)
 	defer cancel()
 	var sqlStmt string
 	if l == 384 {
 		sqlStmt = `INSERT INTO sentence_embeddings (visit_id, sentence, translated, embedding_384)
 			VALUES ($1,$2,$3,$4::vector)
 			ON CONFLICT (visit_id, sentence) DO UPDATE
 			SET embedding_384 = EXCLUDED.embedding_384,
 			    translated = COALESCE(sentence_embeddings.translated, EXCLUDED.translated)`
 	} else { // 1536
 		sqlStmt = `INSERT INTO sentence_embeddings (visit_id, sentence, translated, embedding_1536)
 			VALUES ($1,$2,$3,$4::vector)
 			ON CONFLICT (visit_id, sentence) DO UPDATE
 			SET embedding_1536 = EXCLUDED.embedding_1536,
 			    translated = COALESCE(sentence_embeddings.translated, EXCLUDED.translated)`
 	}
 	_, err := r.pool.Exec(ctx, sqlStmt, visitID, sentence, translated, vecLiteral)
 	if err != nil {
 		logrus.WithError(err).Warn("failed to upsert sentence embedding (dual columns)")
 	}
 	return err
 }
 // ExistsSentenceEmbedding checks if a sentence embedding exists
 func (r *PGChatRepository) ExistsSentenceEmbedding(ctx context.Context, visitID, sentence string) (bool, error) {
 	if r == nil || r.pool == nil {
 		return false, nil
 	}
 	ctx, cancel := context.WithTimeout(ctx, 2*time.Second)
 	defer cancel()
 	var exists bool
 	err := r.pool.QueryRow(ctx, `SELECT EXISTS (SELECT 1 FROM sentence_embeddings WHERE visit_id=$1 AND sentence=$2)`, visitID, sentence).Scan(&exists)
 	if err != nil {
 		return false, err
 	}
 	return exists, nil
 }
 // Close releases pool resources
 func (r *PGChatRepository) Close() {
 	if r != nil && r.pool != nil {
--- a/sentence_embeddings.go
+++ b/sentence_embeddings.go
@ -0,0 +1,154 @@
 package main
 import (
 	"context"
 	"os"
 	"regexp"
 	"strings"
 	"time"
 	"github.com/sirupsen/logrus"
 )
 var sentenceSplitRegex = regexp.MustCompile(`(?m)(?:[^.!?\n]+[.!?]|[^.!?\n]+$)`)
 // configurable via env (seconds); defaults chosen for model cold start friendliness
 func envDuration(key string, def time.Duration) time.Duration {
 	if v := os.Getenv(key); v != "" {
 		if d, err := time.ParseDuration(v); err == nil {
 			return d
 		}
 	}
 	return def
 }
 // startSentenceEmbeddingBackfill launches a background goroutine that iterates all visits
 // and stores (visit_id, sentence, translated, embedding) records in sentence_embeddings table
 // if they do not already exist (relying on unique index ON CONFLICT DO NOTHING).
 func startSentenceEmbeddingBackfill(repo *PGChatRepository, llm LLMClientAPI, vdb *VisitDB) {
 	if repo == nil || llm == nil || vdb == nil {
 		logrus.Info("Sentence embedding backfill skipped (missing repo, llm or vdb)")
 		return
 	}
 	if disable := strings.ToLower(os.Getenv("SENTENCE_BACKFILL_DISABLE")); disable == "1" || disable == "true" {
 		logrus.Info("Sentence embedding backfill disabled via SENTENCE_BACKFILL_DISABLE env var")
 		return
 	}
 	translateTimeout := envDuration("TRANSLATE_TIMEOUT", 45*time.Second)
 	embeddingTimeout := envDuration("EMBEDDING_TIMEOUT", 45*time.Second)
 	maxTranslateAttempts := 3
 	maxEmbeddingAttempts := 3
 	go func() {
 		start := time.Now()
 		logrus.WithFields(logrus.Fields{"translateTimeout": translateTimeout, "embeddingTimeout": embeddingTimeout}).Info("Sentence embedding backfill started")
 		processed := 0
 		inserted := 0
 		skippedExisting := 0
 		skippedDueToFailures := 0
 		for _, visit := range vdb.visitsDB { // visitsDB accessible within package
 			if strings.TrimSpace(visit.Visit) == "" {
 				continue
 			}
 			sentences := extractSentences(visit.Visit)
 			for _, s := range sentences {
 				processed++
 				trimmed := strings.TrimSpace(s)
 				if len(trimmed) < 3 {
 					continue
 				}
 				// Existence check before any LLM calls
 				existsCtx, existsCancel := context.WithTimeout(context.Background(), 2*time.Second)
 				exists, err := repo.ExistsSentenceEmbedding(existsCtx, visit.ID, trimmed)
 				existsCancel()
 				if err != nil {
 					logrus.WithError(err).Warnf("Exists check failed visit=%s sentence=%q", visit.ID, trimmed)
 				} else if exists {
 					skippedExisting++
 					continue
 				}
 				// Translation with retry/backoff
 				var translated string
 				translateErr := retry(maxTranslateAttempts, 0, func(at int) error {
 					ctx, cancel := context.WithTimeout(context.Background(), translateTimeout)
 					defer cancel()
 					resp, err := llm.TranslateToEnglish(ctx, trimmed)
 					if err != nil {
 						logrus.WithError(err).Warnf("Translate attempt=%d failed visit=%s sentence=%q", at+1, visit.ID, trimmed)
 						return err
 					}
 					translated = strings.TrimSpace(resp)
 					return nil
 				})
 				if translateErr != nil || translated == "" {
 					translated = trimmed // fallback keep original language
 				}
 				// Embedding with retry/backoff (skip if translation totally failed with deadline each time)
 				var emb []float64
 				embErr := retry(maxEmbeddingAttempts, 0, func(at int) error {
 					ctx, cancel := context.WithTimeout(context.Background(), embeddingTimeout)
 					defer cancel()
 					vec, err := llm.GetEmbeddings(ctx, translated)
 					if err != nil {
 						logrus.WithError(err).Warnf("Embeddings attempt=%d failed visit=%s sentence=%q", at+1, visit.ID, trimmed)
 						return err
 					}
 					emb = vec
 					return nil
 				})
 				if embErr != nil {
 					skippedDueToFailures++
 					continue
 				}
 				persistCtx, pcancel := context.WithTimeout(context.Background(), 5*time.Second)
 				if err := repo.InsertSentenceEmbedding(persistCtx, visit.ID, trimmed, translated, emb); err == nil {
 					inserted++
 				}
 				pcancel()
 				// Throttle (configurable?)
 				time.Sleep(50 * time.Millisecond)
 			}
 		}
 		logrus.Infof("Sentence embedding backfill complete processed=%d inserted=%d skipped_existing=%d skipped_failures=%d elapsed=%s", processed, inserted, skippedExisting, skippedDueToFailures, time.Since(start))
 	}()
 }
 // retry executes fn up to attempts times with exponential backoff starting at base (or 200ms if base==0)
 func retry(attempts int, base time.Duration, fn func(attempt int) error) error {
 	if attempts <= 0 {
 		return nil
 	}
 	if base <= 0 {
 		base = 200 * time.Millisecond
 	}
 	var err error
 	for a := 0; a < attempts; a++ {
 		err = fn(a)
 		if err == nil {
 			return nil
 		}
 		// backoff except after last attempt
 		if a < attempts-1 {
 			backoff := base << a // exponential
 			time.Sleep(backoff)
 		}
 	}
 	return err
 }
 // extractSentences splits a block of text into sentence-like units.
 func extractSentences(text string) []string {
 	// First replace newlines with space to keep regex simpler, keep periods.
 	normalized := strings.ReplaceAll(text, "\n", " ")
 	matches := sentenceSplitRegex.FindAllString(normalized, -1)
 	var out []string
 	for _, m := range matches {
 		m = strings.TrimSpace(m)
 		if m != "" {
 			out = append(out, m)
 		}
 	}
 	return out
 }