aboutsummaryrefslogtreecommitdiffstats
path: root/internal/index/index.go
diff options
context:
space:
mode:
authorbndw <ben@bdw.to>2026-03-06 07:47:10 -0800
committerbndw <ben@bdw.to>2026-03-06 07:47:10 -0800
commit9b4e7b8ba88f65c9c4a77b461f8353cf706e2206 (patch)
tree9c445bde32662f03cc569c1c25c091adb1aa7bab /internal/index/index.go
parentf1ff85c7acad6b2ae7ec10720619ef2023cb7dc9 (diff)
Diffstat (limited to 'internal/index/index.go')
-rw-r--r--internal/index/index.go163
1 files changed, 66 insertions, 97 deletions
diff --git a/internal/index/index.go b/internal/index/index.go
index 008e487..5ce9f4f 100644
--- a/internal/index/index.go
+++ b/internal/index/index.go
@@ -2,18 +2,16 @@ package index
2 2
3import ( 3import (
4 "database/sql" 4 "database/sql"
5 "encoding/binary"
6 "math"
7 "os" 5 "os"
8 "path/filepath" 6 "path/filepath"
9 "sort"
10 7
11 _ "modernc.org/sqlite" 8 sqlite_vec "github.com/asg017/sqlite-vec-go-bindings/cgo"
9 _ "github.com/mattn/go-sqlite3"
12 10
13 "code.northwest.io/codevec/internal/chunker" 11 "code.northwest.io/codevec/internal/chunker"
14) 12)
15 13
16// Index stores chunks and embeddings in SQLite 14// Index stores chunks and embeddings in SQLite with sqlite-vec
17type Index struct { 15type Index struct {
18 db *sql.DB 16 db *sql.DB
19 dims int 17 dims int
@@ -21,13 +19,16 @@ type Index struct {
21 19
22// Open opens or creates an index at the given path 20// Open opens or creates an index at the given path
23func Open(path string, dims int) (*Index, error) { 21func Open(path string, dims int) (*Index, error) {
22 // Register sqlite-vec extension
23 sqlite_vec.Auto()
24
24 // Ensure directory exists 25 // Ensure directory exists
25 dir := filepath.Dir(path) 26 dir := filepath.Dir(path)
26 if err := os.MkdirAll(dir, 0755); err != nil { 27 if err := os.MkdirAll(dir, 0755); err != nil {
27 return nil, err 28 return nil, err
28 } 29 }
29 30
30 db, err := sql.Open("sqlite", path) 31 db, err := sql.Open("sqlite3", path)
31 if err != nil { 32 if err != nil {
32 return nil, err 33 return nil, err
33 } 34 }
@@ -42,7 +43,7 @@ func Open(path string, dims int) (*Index, error) {
42} 43}
43 44
44func (idx *Index) init() error { 45func (idx *Index) init() error {
45 // Create chunks table with embedding column 46 // Create chunks table
46 _, err := idx.db.Exec(` 47 _, err := idx.db.Exec(`
47 CREATE TABLE IF NOT EXISTS chunks ( 48 CREATE TABLE IF NOT EXISTS chunks (
48 id INTEGER PRIMARY KEY, 49 id INTEGER PRIMARY KEY,
@@ -53,7 +54,6 @@ func (idx *Index) init() error {
53 name TEXT, 54 name TEXT,
54 content TEXT NOT NULL, 55 content TEXT NOT NULL,
55 hash TEXT NOT NULL, 56 hash TEXT NOT NULL,
56 embedding BLOB,
57 created_at INTEGER DEFAULT (unixepoch()) 57 created_at INTEGER DEFAULT (unixepoch())
58 ) 58 )
59 `) 59 `)
@@ -84,6 +84,17 @@ func (idx *Index) init() error {
84 return err 84 return err
85 } 85 }
86 86
87 // Create vec0 virtual table for vectors
88 _, err = idx.db.Exec(`
89 CREATE VIRTUAL TABLE IF NOT EXISTS vectors USING vec0(
90 chunk_id INTEGER PRIMARY KEY,
91 embedding FLOAT[768] distance_metric=cosine
92 )
93 `)
94 if err != nil {
95 return err
96 }
97
87 // Index on file for faster deletion 98 // Index on file for faster deletion
88 _, err = idx.db.Exec(`CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file)`) 99 _, err = idx.db.Exec(`CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file)`)
89 return err 100 return err
@@ -96,11 +107,27 @@ func (idx *Index) Close() error {
96 107
97// InsertChunk inserts a chunk with its embedding 108// InsertChunk inserts a chunk with its embedding
98func (idx *Index) InsertChunk(chunk chunker.Chunk, embedding []float32) error { 109func (idx *Index) InsertChunk(chunk chunker.Chunk, embedding []float32) error {
99 embeddingBlob := serializeEmbedding(embedding) 110 // Insert chunk
100 _, err := idx.db.Exec(` 111 result, err := idx.db.Exec(`
101 INSERT INTO chunks (file, start_line, end_line, chunk_type, name, content, hash, embedding) 112 INSERT INTO chunks (file, start_line, end_line, chunk_type, name, content, hash)
102 VALUES (?, ?, ?, ?, ?, ?, ?, ?) 113 VALUES (?, ?, ?, ?, ?, ?, ?)
103 `, chunk.File, chunk.StartLine, chunk.EndLine, chunk.Type, chunk.Name, chunk.Content, chunk.Hash, embeddingBlob) 114 `, chunk.File, chunk.StartLine, chunk.EndLine, chunk.Type, chunk.Name, chunk.Content, chunk.Hash)
115 if err != nil {
116 return err
117 }
118
119 chunkID, err := result.LastInsertId()
120 if err != nil {
121 return err
122 }
123
124 // Insert vector
125 vecBlob, err := sqlite_vec.SerializeFloat32(embedding)
126 if err != nil {
127 return err
128 }
129
130 _, err = idx.db.Exec(`INSERT INTO vectors (chunk_id, embedding) VALUES (?, ?)`, chunkID, vecBlob)
104 return err 131 return err
105} 132}
106 133
@@ -110,62 +137,39 @@ type SearchResult struct {
110 Distance float64 137 Distance float64
111} 138}
112 139
113// Search finds chunks similar to the query embedding using cosine similarity 140// Search finds chunks similar to the query embedding using sqlite-vec
114func (idx *Index) Search(queryEmb []float32, limit int) ([]SearchResult, error) { 141func (idx *Index) Search(queryEmb []float32, limit int) ([]SearchResult, error) {
115 // Load all embeddings 142 vecBlob, err := sqlite_vec.SerializeFloat32(queryEmb)
116 rows, err := idx.db.Query(`
117 SELECT id, file, start_line, end_line, chunk_type, name, content, hash, embedding
118 FROM chunks
119 WHERE embedding IS NOT NULL
120 `)
121 if err != nil { 143 if err != nil {
122 return nil, err 144 return nil, err
123 } 145 }
124 defer rows.Close()
125 146
126 type candidate struct { 147 // Query similar vectors
127 chunk chunker.Chunk 148 rows, err := idx.db.Query(`
128 distance float64 149 SELECT v.chunk_id, v.distance, c.file, c.start_line, c.end_line, c.chunk_type, c.name, c.content, c.hash
150 FROM vectors v
151 JOIN chunks c ON c.id = v.chunk_id
152 WHERE v.embedding MATCH ? AND k = ?
153 ORDER BY v.distance
154 `, vecBlob, limit)
155 if err != nil {
156 return nil, err
129 } 157 }
130 var candidates []candidate 158 defer rows.Close()
131 159
160 var results []SearchResult
132 for rows.Next() { 161 for rows.Next() {
133 var id int64 162 var chunkID int64
163 var distance float64
134 var c chunker.Chunk 164 var c chunker.Chunk
135 var embBlob []byte 165 err := rows.Scan(&chunkID, &distance, &c.File, &c.StartLine, &c.EndLine, &c.Type, &c.Name, &c.Content, &c.Hash)
136 err := rows.Scan(&id, &c.File, &c.StartLine, &c.EndLine, &c.Type, &c.Name, &c.Content, &c.Hash, &embBlob)
137 if err != nil { 166 if err != nil {
138 return nil, err 167 return nil, err
139 } 168 }
140 169 results = append(results, SearchResult{Chunk: c, Distance: distance})
141 emb := deserializeEmbedding(embBlob)
142 dist := cosineDistance(queryEmb, emb)
143 candidates = append(candidates, candidate{chunk: c, distance: dist})
144 }
145
146 if err := rows.Err(); err != nil {
147 return nil, err
148 }
149
150 // Sort by distance (lower is better)
151 sort.Slice(candidates, func(i, j int) bool {
152 return candidates[i].distance < candidates[j].distance
153 })
154
155 // Return top-k
156 if limit > len(candidates) {
157 limit = len(candidates)
158 } 170 }
159 171
160 results := make([]SearchResult, limit) 172 return results, rows.Err()
161 for i := 0; i < limit; i++ {
162 results[i] = SearchResult{
163 Chunk: candidates[i].chunk,
164 Distance: candidates[i].distance,
165 }
166 }
167
168 return results, nil
169} 173}
170 174
171// GetFileHash returns the stored hash for a file, or empty string if not indexed 175// GetFileHash returns the stored hash for a file, or empty string if not indexed
@@ -189,7 +193,13 @@ func (idx *Index) SetFileHash(path, hash string) error {
189 193
190// DeleteChunksForFile removes all chunks for a file 194// DeleteChunksForFile removes all chunks for a file
191func (idx *Index) DeleteChunksForFile(path string) error { 195func (idx *Index) DeleteChunksForFile(path string) error {
192 _, err := idx.db.Exec(`DELETE FROM chunks WHERE file = ?`, path) 196 // Delete vectors for chunks in this file
197 _, err := idx.db.Exec(`DELETE FROM vectors WHERE chunk_id IN (SELECT id FROM chunks WHERE file = ?)`, path)
198 if err != nil {
199 return err
200 }
201
202 _, err = idx.db.Exec(`DELETE FROM chunks WHERE file = ?`, path)
193 if err != nil { 203 if err != nil {
194 return err 204 return err
195 } 205 }
@@ -228,44 +238,3 @@ func (idx *Index) GetMetadata(key string) (string, error) {
228 } 238 }
229 return value, err 239 return value, err
230} 240}
231
232// serializeEmbedding converts float32 slice to bytes
233func serializeEmbedding(embedding []float32) []byte {
234 buf := make([]byte, len(embedding)*4)
235 for i, v := range embedding {
236 binary.LittleEndian.PutUint32(buf[i*4:], math.Float32bits(v))
237 }
238 return buf
239}
240
241// deserializeEmbedding converts bytes back to float32 slice
242func deserializeEmbedding(data []byte) []float32 {
243 n := len(data) / 4
244 result := make([]float32, n)
245 for i := 0; i < n; i++ {
246 bits := binary.LittleEndian.Uint32(data[i*4:])
247 result[i] = math.Float32frombits(bits)
248 }
249 return result
250}
251
252// cosineDistance computes 1 - cosine_similarity (so lower is more similar)
253func cosineDistance(a, b []float32) float64 {
254 if len(a) != len(b) {
255 return 1.0
256 }
257
258 var dotProduct, normA, normB float64
259 for i := range a {
260 dotProduct += float64(a[i]) * float64(b[i])
261 normA += float64(a[i]) * float64(a[i])
262 normB += float64(b[i]) * float64(b[i])
263 }
264
265 if normA == 0 || normB == 0 {
266 return 1.0
267 }
268
269 similarity := dotProduct / (math.Sqrt(normA) * math.Sqrt(normB))
270 return 1.0 - similarity
271}