diff --git a/migrations/0003_create_sentence_embeddings_table.up.sql b/migrations/0003_create_sentence_embeddings_table.up.sql index 36c4990..e791604 100644 --- a/migrations/0003_create_sentence_embeddings_table.up.sql +++ b/migrations/0003_create_sentence_embeddings_table.up.sql @@ -1,16 +1,16 @@ -- +goose Up -CREATE EXTENSION IF NOT EXISTS vector; - +-- Create the sentence_embeddings table using only built-in Postgres types (the vector extension is no longer created) CREATE TABLE sentence_embeddings ( id SERIAL PRIMARY KEY, visit_id INTEGER NOT NULL, sentence TEXT NOT NULL, translated TEXT, - embeddings VECTOR(1536) NOT NULL, + embeddings FLOAT[] NOT NULL, -- plain float array replacing VECTOR(1536); the declared type carries no dimension created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP ); +-- Unique index on (visit_id, sentence); matches the ON CONFLICT (visit_id, sentence) target used by the upsert in repository.go +CREATE UNIQUE INDEX idx_sentence_embeddings_visit_sentence ON sentence_embeddings (visit_id, sentence); + -- +goose Down DROP TABLE IF EXISTS sentence_embeddings; -DROP EXTENSION IF EXISTS vector; - diff --git a/migrations/0004_alter_sentence_embeddings_visit_id_type.up.sql b/migrations/0004_alter_sentence_embeddings_visit_id_type.up.sql index 2d58220..86b3b6e 100644 --- a/migrations/0004_alter_sentence_embeddings_visit_id_type.up.sql +++ b/migrations/0004_alter_sentence_embeddings_visit_id_type.up.sql @@ -1,8 +1,8 @@ -- +goose Up +-- Change visit_id from INTEGER to TEXT; with no USING clause, Postgres applies its default conversion ALTER TABLE sentence_embeddings - ALTER COLUMN visit_id TYPE TEXT USING visit_id::text; + ALTER COLUMN visit_id TYPE TEXT; -- +goose Down ALTER TABLE sentence_embeddings - ALTER COLUMN visit_id TYPE INTEGER USING visit_id::integer; - + ALTER COLUMN visit_id TYPE INTEGER USING (visit_id::integer); diff --git a/migrations/0005_add_unique_idx_sentence_embeddings.up.sql b/migrations/0005_add_unique_idx_sentence_embeddings.up.sql index 23db44e..8ec487e 100644 --- a/migrations/0005_add_unique_idx_sentence_embeddings.up.sql +++ 
b/migrations/0005_add_unique_idx_sentence_embeddings.up.sql @@ -1,6 +1,6 @@ -- +goose Up -CREATE UNIQUE INDEX IF NOT EXISTS ux_sentence_embeddings_visit_sentence - ON sentence_embeddings(visit_id, sentence); +-- The unique index on (visit_id, sentence) is now created in migration 0003 as idx_sentence_embeddings_visit_sentence +-- This migration is intentionally empty; it is kept only so the migration sequence stays consistent -- +goose Down -DROP INDEX IF EXISTS ux_sentence_embeddings_visit_sentence; +-- Nothing to roll back: the Up section of this migration performs no action diff --git a/migrations/0006_alter_sentence_embeddings_dim_384.up.sql b/migrations/0006_alter_sentence_embeddings_dim_384.up.sql index 415c4d6..fec1cdd 100644 --- a/migrations/0006_alter_sentence_embeddings_dim_384.up.sql +++ b/migrations/0006_alter_sentence_embeddings_dim_384.up.sql @@ -1,14 +1,6 @@ -- +goose Up --- WARNING: This alters the embeddings vector dimension from 1536 to 384. --- Ensure you are switching to a 384-dim embedding model (e.g., all-minilm). --- If existing rows have 1536-d vectors this command will fail; you may need to --- TRUNCATE TABLE sentence_embeddings first (after backing up) before applying. -ALTER TABLE sentence_embeddings - ALTER COLUMN embeddings TYPE vector(384); +-- Formerly changed embeddings from vector(1536) to vector(384); now intentionally a no-op +-- FLOAT[] does not encode a dimension, so the column type needs no change to hold 384-value embeddings -- +goose Down --- Revert to 1536 dimensions (for models like OpenAI text-embedding-3-large). --- Will fail if existing rows are 384-d. 
-ALTER TABLE sentence_embeddings - ALTER COLUMN embeddings TYPE vector(1536); - +-- Nothing to roll back: the Up section of this migration performs no action diff --git a/migrations/0007_add_dual_embedding_columns.up.sql b/migrations/0007_add_dual_embedding_columns.up.sql index c708bbd..2096133 100644 --- a/migrations/0007_add_dual_embedding_columns.up.sql +++ b/migrations/0007_add_dual_embedding_columns.up.sql @@ -1,13 +1,10 @@ -- +goose Up --- Add separate columns for different embedding dimensions. --- Existing 'embeddings' column (if present) is left untouched for backward compatibility. --- Application code will now populate embedding_384 or embedding_1536 instead. +-- Add one nullable FLOAT[] column per embedding size (384 and 1536); the array type itself does not enforce the length ALTER TABLE sentence_embeddings - ADD COLUMN IF NOT EXISTS embedding_384 vector(384), - ADD COLUMN IF NOT EXISTS embedding_1536 vector(1536); + ADD COLUMN IF NOT EXISTS embedding_384 FLOAT[], + ADD COLUMN IF NOT EXISTS embedding_1536 FLOAT[]; -- +goose Down ALTER TABLE sentence_embeddings DROP COLUMN IF EXISTS embedding_384, DROP COLUMN IF EXISTS embedding_1536; - diff --git a/migrations/0008_drop_legacy_embeddings_column.up.sql b/migrations/0008_drop_legacy_embeddings_column.up.sql index f02028e..0ccf452 100644 --- a/migrations/0008_drop_legacy_embeddings_column.up.sql +++ b/migrations/0008_drop_legacy_embeddings_column.up.sql @@ -1,11 +1,7 @@ -- +goose Up --- Drop the legacy single-dimension embeddings column (was NOT NULL) to allow inserts --- that now use embedding_384 / embedding_1536. All new data goes into those columns. +-- Drop the legacy NOT NULL embeddings column, superseded by embedding_384 and embedding_1536 ALTER TABLE sentence_embeddings DROP COLUMN IF EXISTS embeddings; -- +goose Down --- Re-create the legacy embeddings column (empty) as vector(1536) NULLABLE for rollback. 
-ALTER TABLE sentence_embeddings - ADD COLUMN IF NOT EXISTS embeddings vector(1536); - +-- NOTE(review): rollback is a no-op, so the legacy embeddings column is NOT re-created; confirm this asymmetry is intended diff --git a/repository.go b/repository.go index d51870b..0529924 100644 --- a/repository.go +++ b/repository.go @@ -277,18 +277,18 @@ func (r *PGChatRepository) InsertSentenceEmbedding(ctx context.Context, visitID, logrus.WithError(err).Warn("skipping sentence embedding insert") return err } - // Build vector literal + // Build a Postgres array literal of the form {v1,v2,...} to bind as $4::float[] var b strings.Builder b.Grow(len(embedding)*8 + 2) - b.WriteByte('[') + b.WriteByte('{') for i, v := range embedding { if i > 0 { b.WriteByte(',') } b.WriteString(strconv.FormatFloat(v, 'f', -1, 64)) } - b.WriteByte(']') - vecLiteral := b.String() + b.WriteByte('}') + arrayLiteral := b.String() ctx, cancel := context.WithTimeout(ctx, 6*time.Second) defer cancel() @@ -296,18 +296,18 @@ func (r *PGChatRepository) InsertSentenceEmbedding(ctx context.Context, visitID, var sqlStmt string if l == 384 { sqlStmt = `INSERT INTO sentence_embeddings (visit_id, sentence, translated, embedding_384) - VALUES ($1,$2,$3,$4::vector) + VALUES ($1,$2,$3,$4::float[]) ON CONFLICT (visit_id, sentence) DO UPDATE SET embedding_384 = EXCLUDED.embedding_384, translated = COALESCE(sentence_embeddings.translated, EXCLUDED.translated)` } else { // 1536 sqlStmt = `INSERT INTO sentence_embeddings (visit_id, sentence, translated, embedding_1536) - VALUES ($1,$2,$3,$4::vector) + VALUES ($1,$2,$3,$4::float[]) ON CONFLICT (visit_id, sentence) DO UPDATE SET embedding_1536 = EXCLUDED.embedding_1536, translated = COALESCE(sentence_embeddings.translated, EXCLUDED.translated)` } - _, err := r.pool.Exec(ctx, sqlStmt, visitID, sentence, translated, vecLiteral) + _, err := r.pool.Exec(ctx, sqlStmt, visitID, sentence, translated, arrayLiteral) if err != nil { logrus.WithError(err).Warn("failed to upsert sentence embedding (dual columns)") }