diff options
| author | bndw <ben@bdw.to> | 2026-03-06 07:47:10 -0800 |
|---|---|---|
| committer | bndw <ben@bdw.to> | 2026-03-06 07:47:10 -0800 |
| commit | 9b4e7b8ba88f65c9c4a77b461f8353cf706e2206 (patch) | |
| tree | 9c445bde32662f03cc569c1c25c091adb1aa7bab /internal/index | |
| parent | f1ff85c7acad6b2ae7ec10720619ef2023cb7dc9 (diff) | |
Diffstat (limited to 'internal/index')
| -rw-r--r-- | internal/index/index.go | 163 |
1 files changed, 66 insertions, 97 deletions
diff --git a/internal/index/index.go b/internal/index/index.go
index 008e487..5ce9f4f 100644
--- a/internal/index/index.go
+++ b/internal/index/index.go
| @@ -2,18 +2,16 @@ package index | |||
| 2 | 2 | ||
| 3 | import ( | 3 | import ( |
| 4 | "database/sql" | 4 | "database/sql" |
| 5 | "encoding/binary" | ||
| 6 | "math" | ||
| 7 | "os" | 5 | "os" |
| 8 | "path/filepath" | 6 | "path/filepath" |
| 9 | "sort" | ||
| 10 | 7 | ||
| 11 | _ "modernc.org/sqlite" | 8 | sqlite_vec "github.com/asg017/sqlite-vec-go-bindings/cgo" |
| 9 | _ "github.com/mattn/go-sqlite3" | ||
| 12 | 10 | ||
| 13 | "code.northwest.io/codevec/internal/chunker" | 11 | "code.northwest.io/codevec/internal/chunker" |
| 14 | ) | 12 | ) |
| 15 | 13 | ||
| 16 | // Index stores chunks and embeddings in SQLite | 14 | // Index stores chunks and embeddings in SQLite with sqlite-vec |
| 17 | type Index struct { | 15 | type Index struct { |
| 18 | db *sql.DB | 16 | db *sql.DB |
| 19 | dims int | 17 | dims int |
| @@ -21,13 +19,16 @@ type Index struct { | |||
| 21 | 19 | ||
| 22 | // Open opens or creates an index at the given path | 20 | // Open opens or creates an index at the given path |
| 23 | func Open(path string, dims int) (*Index, error) { | 21 | func Open(path string, dims int) (*Index, error) { |
| 22 | // Register sqlite-vec extension | ||
| 23 | sqlite_vec.Auto() | ||
| 24 | |||
| 24 | // Ensure directory exists | 25 | // Ensure directory exists |
| 25 | dir := filepath.Dir(path) | 26 | dir := filepath.Dir(path) |
| 26 | if err := os.MkdirAll(dir, 0755); err != nil { | 27 | if err := os.MkdirAll(dir, 0755); err != nil { |
| 27 | return nil, err | 28 | return nil, err |
| 28 | } | 29 | } |
| 29 | 30 | ||
| 30 | db, err := sql.Open("sqlite", path) | 31 | db, err := sql.Open("sqlite3", path) |
| 31 | if err != nil { | 32 | if err != nil { |
| 32 | return nil, err | 33 | return nil, err |
| 33 | } | 34 | } |
| @@ -42,7 +43,7 @@ func Open(path string, dims int) (*Index, error) { | |||
| 42 | } | 43 | } |
| 43 | 44 | ||
| 44 | func (idx *Index) init() error { | 45 | func (idx *Index) init() error { |
| 45 | // Create chunks table with embedding column | 46 | // Create chunks table |
| 46 | _, err := idx.db.Exec(` | 47 | _, err := idx.db.Exec(` |
| 47 | CREATE TABLE IF NOT EXISTS chunks ( | 48 | CREATE TABLE IF NOT EXISTS chunks ( |
| 48 | id INTEGER PRIMARY KEY, | 49 | id INTEGER PRIMARY KEY, |
| @@ -53,7 +54,6 @@ func (idx *Index) init() error { | |||
| 53 | name TEXT, | 54 | name TEXT, |
| 54 | content TEXT NOT NULL, | 55 | content TEXT NOT NULL, |
| 55 | hash TEXT NOT NULL, | 56 | hash TEXT NOT NULL, |
| 56 | embedding BLOB, | ||
| 57 | created_at INTEGER DEFAULT (unixepoch()) | 57 | created_at INTEGER DEFAULT (unixepoch()) |
| 58 | ) | 58 | ) |
| 59 | `) | 59 | `) |
| @@ -84,6 +84,17 @@ func (idx *Index) init() error { | |||
| 84 | return err | 84 | return err |
| 85 | } | 85 | } |
| 86 | 86 | ||
| 87 | // Create vec0 virtual table for vectors | ||
| 88 | _, err = idx.db.Exec(` | ||
| 89 | CREATE VIRTUAL TABLE IF NOT EXISTS vectors USING vec0( | ||
| 90 | chunk_id INTEGER PRIMARY KEY, | ||
| 91 | embedding FLOAT[768] distance_metric=cosine | ||
| 92 | ) | ||
| 93 | `) | ||
| 94 | if err != nil { | ||
| 95 | return err | ||
| 96 | } | ||
| 97 | |||
| 87 | // Index on file for faster deletion | 98 | // Index on file for faster deletion |
| 88 | _, err = idx.db.Exec(`CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file)`) | 99 | _, err = idx.db.Exec(`CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file)`) |
| 89 | return err | 100 | return err |
| @@ -96,11 +107,27 @@ func (idx *Index) Close() error { | |||
| 96 | 107 | ||
| 97 | // InsertChunk inserts a chunk with its embedding | 108 | // InsertChunk inserts a chunk with its embedding |
| 98 | func (idx *Index) InsertChunk(chunk chunker.Chunk, embedding []float32) error { | 109 | func (idx *Index) InsertChunk(chunk chunker.Chunk, embedding []float32) error { |
| 99 | embeddingBlob := serializeEmbedding(embedding) | 110 | // Insert chunk |
| 100 | _, err := idx.db.Exec(` | 111 | result, err := idx.db.Exec(` |
| 101 | INSERT INTO chunks (file, start_line, end_line, chunk_type, name, content, hash, embedding) | 112 | INSERT INTO chunks (file, start_line, end_line, chunk_type, name, content, hash) |
| 102 | VALUES (?, ?, ?, ?, ?, ?, ?, ?) | 113 | VALUES (?, ?, ?, ?, ?, ?, ?) |
| 103 | `, chunk.File, chunk.StartLine, chunk.EndLine, chunk.Type, chunk.Name, chunk.Content, chunk.Hash, embeddingBlob) | 114 | `, chunk.File, chunk.StartLine, chunk.EndLine, chunk.Type, chunk.Name, chunk.Content, chunk.Hash) |
| 115 | if err != nil { | ||
| 116 | return err | ||
| 117 | } | ||
| 118 | |||
| 119 | chunkID, err := result.LastInsertId() | ||
| 120 | if err != nil { | ||
| 121 | return err | ||
| 122 | } | ||
| 123 | |||
| 124 | // Insert vector | ||
| 125 | vecBlob, err := sqlite_vec.SerializeFloat32(embedding) | ||
| 126 | if err != nil { | ||
| 127 | return err | ||
| 128 | } | ||
| 129 | |||
| 130 | _, err = idx.db.Exec(`INSERT INTO vectors (chunk_id, embedding) VALUES (?, ?)`, chunkID, vecBlob) | ||
| 104 | return err | 131 | return err |
| 105 | } | 132 | } |
| 106 | 133 | ||
| @@ -110,62 +137,39 @@ type SearchResult struct { | |||
| 110 | Distance float64 | 137 | Distance float64 |
| 111 | } | 138 | } |
| 112 | 139 | ||
| 113 | // Search finds chunks similar to the query embedding using cosine similarity | 140 | // Search finds chunks similar to the query embedding using sqlite-vec |
| 114 | func (idx *Index) Search(queryEmb []float32, limit int) ([]SearchResult, error) { | 141 | func (idx *Index) Search(queryEmb []float32, limit int) ([]SearchResult, error) { |
| 115 | // Load all embeddings | 142 | vecBlob, err := sqlite_vec.SerializeFloat32(queryEmb) |
| 116 | rows, err := idx.db.Query(` | ||
| 117 | SELECT id, file, start_line, end_line, chunk_type, name, content, hash, embedding | ||
| 118 | FROM chunks | ||
| 119 | WHERE embedding IS NOT NULL | ||
| 120 | `) | ||
| 121 | if err != nil { | 143 | if err != nil { |
| 122 | return nil, err | 144 | return nil, err |
| 123 | } | 145 | } |
| 124 | defer rows.Close() | ||
| 125 | 146 | ||
| 126 | type candidate struct { | 147 | // Query similar vectors |
| 127 | chunk chunker.Chunk | 148 | rows, err := idx.db.Query(` |
| 128 | distance float64 | 149 | SELECT v.chunk_id, v.distance, c.file, c.start_line, c.end_line, c.chunk_type, c.name, c.content, c.hash |
| 150 | FROM vectors v | ||
| 151 | JOIN chunks c ON c.id = v.chunk_id | ||
| 152 | WHERE v.embedding MATCH ? AND k = ? | ||
| 153 | ORDER BY v.distance | ||
| 154 | `, vecBlob, limit) | ||
| 155 | if err != nil { | ||
| 156 | return nil, err | ||
| 129 | } | 157 | } |
| 130 | var candidates []candidate | 158 | defer rows.Close() |
| 131 | 159 | ||
| 160 | var results []SearchResult | ||
| 132 | for rows.Next() { | 161 | for rows.Next() { |
| 133 | var id int64 | 162 | var chunkID int64 |
| 163 | var distance float64 | ||
| 134 | var c chunker.Chunk | 164 | var c chunker.Chunk |
| 135 | var embBlob []byte | 165 | err := rows.Scan(&chunkID, &distance, &c.File, &c.StartLine, &c.EndLine, &c.Type, &c.Name, &c.Content, &c.Hash) |
| 136 | err := rows.Scan(&id, &c.File, &c.StartLine, &c.EndLine, &c.Type, &c.Name, &c.Content, &c.Hash, &embBlob) | ||
| 137 | if err != nil { | 166 | if err != nil { |
| 138 | return nil, err | 167 | return nil, err |
| 139 | } | 168 | } |
| 140 | 169 | results = append(results, SearchResult{Chunk: c, Distance: distance}) | |
| 141 | emb := deserializeEmbedding(embBlob) | ||
| 142 | dist := cosineDistance(queryEmb, emb) | ||
| 143 | candidates = append(candidates, candidate{chunk: c, distance: dist}) | ||
| 144 | } | ||
| 145 | |||
| 146 | if err := rows.Err(); err != nil { | ||
| 147 | return nil, err | ||
| 148 | } | ||
| 149 | |||
| 150 | // Sort by distance (lower is better) | ||
| 151 | sort.Slice(candidates, func(i, j int) bool { | ||
| 152 | return candidates[i].distance < candidates[j].distance | ||
| 153 | }) | ||
| 154 | |||
| 155 | // Return top-k | ||
| 156 | if limit > len(candidates) { | ||
| 157 | limit = len(candidates) | ||
| 158 | } | 170 | } |
| 159 | 171 | ||
| 160 | results := make([]SearchResult, limit) | 172 | return results, rows.Err() |
| 161 | for i := 0; i < limit; i++ { | ||
| 162 | results[i] = SearchResult{ | ||
| 163 | Chunk: candidates[i].chunk, | ||
| 164 | Distance: candidates[i].distance, | ||
| 165 | } | ||
| 166 | } | ||
| 167 | |||
| 168 | return results, nil | ||
| 169 | } | 173 | } |
| 170 | 174 | ||
| 171 | // GetFileHash returns the stored hash for a file, or empty string if not indexed | 175 | // GetFileHash returns the stored hash for a file, or empty string if not indexed |
| @@ -189,7 +193,13 @@ func (idx *Index) SetFileHash(path, hash string) error { | |||
| 189 | 193 | ||
| 190 | // DeleteChunksForFile removes all chunks for a file | 194 | // DeleteChunksForFile removes all chunks for a file |
| 191 | func (idx *Index) DeleteChunksForFile(path string) error { | 195 | func (idx *Index) DeleteChunksForFile(path string) error { |
| 192 | _, err := idx.db.Exec(`DELETE FROM chunks WHERE file = ?`, path) | 196 | // Delete vectors for chunks in this file |
| 197 | _, err := idx.db.Exec(`DELETE FROM vectors WHERE chunk_id IN (SELECT id FROM chunks WHERE file = ?)`, path) | ||
| 198 | if err != nil { | ||
| 199 | return err | ||
| 200 | } | ||
| 201 | |||
| 202 | _, err = idx.db.Exec(`DELETE FROM chunks WHERE file = ?`, path) | ||
| 193 | if err != nil { | 203 | if err != nil { |
| 194 | return err | 204 | return err |
| 195 | } | 205 | } |
| @@ -228,44 +238,3 @@ func (idx *Index) GetMetadata(key string) (string, error) { | |||
| 228 | } | 238 | } |
| 229 | return value, err | 239 | return value, err |
| 230 | } | 240 | } |
| 231 | |||
| 232 | // serializeEmbedding converts float32 slice to bytes | ||
| 233 | func serializeEmbedding(embedding []float32) []byte { | ||
| 234 | buf := make([]byte, len(embedding)*4) | ||
| 235 | for i, v := range embedding { | ||
| 236 | binary.LittleEndian.PutUint32(buf[i*4:], math.Float32bits(v)) | ||
| 237 | } | ||
| 238 | return buf | ||
| 239 | } | ||
| 240 | |||
| 241 | // deserializeEmbedding converts bytes back to float32 slice | ||
| 242 | func deserializeEmbedding(data []byte) []float32 { | ||
| 243 | n := len(data) / 4 | ||
| 244 | result := make([]float32, n) | ||
| 245 | for i := 0; i < n; i++ { | ||
| 246 | bits := binary.LittleEndian.Uint32(data[i*4:]) | ||
| 247 | result[i] = math.Float32frombits(bits) | ||
| 248 | } | ||
| 249 | return result | ||
| 250 | } | ||
| 251 | |||
| 252 | // cosineDistance computes 1 - cosine_similarity (so lower is more similar) | ||
| 253 | func cosineDistance(a, b []float32) float64 { | ||
| 254 | if len(a) != len(b) { | ||
| 255 | return 1.0 | ||
| 256 | } | ||
| 257 | |||
| 258 | var dotProduct, normA, normB float64 | ||
| 259 | for i := range a { | ||
| 260 | dotProduct += float64(a[i]) * float64(b[i]) | ||
| 261 | normA += float64(a[i]) * float64(a[i]) | ||
| 262 | normB += float64(b[i]) * float64(b[i]) | ||
| 263 | } | ||
| 264 | |||
| 265 | if normA == 0 || normB == 0 { | ||
| 266 | return 1.0 | ||
| 267 | } | ||
| 268 | |||
| 269 | similarity := dotProduct / (math.Sqrt(normA) * math.Sqrt(normB)) | ||
| 270 | return 1.0 - similarity | ||
| 271 | } | ||
