From 9b4e7b8ba88f65c9c4a77b461f8353cf706e2206 Mon Sep 17 00:00:00 2001 From: bndw Date: Fri, 6 Mar 2026 07:47:10 -0800 Subject: cgo --- internal/index/index.go | 163 ++++++++++++++++++++---------------------------- 1 file changed, 66 insertions(+), 97 deletions(-) (limited to 'internal') diff --git a/internal/index/index.go b/internal/index/index.go index 008e487..5ce9f4f 100644 --- a/internal/index/index.go +++ b/internal/index/index.go @@ -2,18 +2,16 @@ package index import ( "database/sql" - "encoding/binary" - "math" "os" "path/filepath" - "sort" - _ "modernc.org/sqlite" + sqlite_vec "github.com/asg017/sqlite-vec-go-bindings/cgo" + _ "github.com/mattn/go-sqlite3" "code.northwest.io/codevec/internal/chunker" ) -// Index stores chunks and embeddings in SQLite +// Index stores chunks and embeddings in SQLite with sqlite-vec type Index struct { db *sql.DB dims int @@ -21,13 +19,16 @@ type Index struct { // Open opens or creates an index at the given path func Open(path string, dims int) (*Index, error) { + // Register sqlite-vec extension + sqlite_vec.Auto() + // Ensure directory exists dir := filepath.Dir(path) if err := os.MkdirAll(dir, 0755); err != nil { return nil, err } - db, err := sql.Open("sqlite", path) + db, err := sql.Open("sqlite3", path) if err != nil { return nil, err } @@ -42,7 +43,7 @@ func Open(path string, dims int) (*Index, error) { } func (idx *Index) init() error { - // Create chunks table with embedding column + // Create chunks table _, err := idx.db.Exec(` CREATE TABLE IF NOT EXISTS chunks ( id INTEGER PRIMARY KEY, @@ -53,7 +54,6 @@ func (idx *Index) init() error { name TEXT, content TEXT NOT NULL, hash TEXT NOT NULL, - embedding BLOB, created_at INTEGER DEFAULT (unixepoch()) ) `) @@ -84,6 +84,17 @@ func (idx *Index) init() error { return err } + // Create vec0 virtual table for vectors + _, err = idx.db.Exec(` + CREATE VIRTUAL TABLE IF NOT EXISTS vectors USING vec0( + chunk_id INTEGER PRIMARY KEY, + embedding FLOAT[768] distance_metric=cosine + ) + `) + if err != nil { + return err + } + // Index on file for faster deletion _, err = idx.db.Exec(`CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file)`) return err @@ -96,11 +107,27 @@ func (idx *Index) Close() error { // InsertChunk inserts a chunk with its embedding func (idx *Index) InsertChunk(chunk chunker.Chunk, embedding []float32) error { - embeddingBlob := serializeEmbedding(embedding) - _, err := idx.db.Exec(` - INSERT INTO chunks (file, start_line, end_line, chunk_type, name, content, hash, embedding) - VALUES (?, ?, ?, ?, ?, ?, ?, ?) - `, chunk.File, chunk.StartLine, chunk.EndLine, chunk.Type, chunk.Name, chunk.Content, chunk.Hash, embeddingBlob) + // Insert chunk + result, err := idx.db.Exec(` + INSERT INTO chunks (file, start_line, end_line, chunk_type, name, content, hash) + VALUES (?, ?, ?, ?, ?, ?, ?) + `, chunk.File, chunk.StartLine, chunk.EndLine, chunk.Type, chunk.Name, chunk.Content, chunk.Hash) + if err != nil { + return err + } + + chunkID, err := result.LastInsertId() + if err != nil { + return err + } + + // Insert vector + vecBlob, err := sqlite_vec.SerializeFloat32(embedding) + if err != nil { + return err + } + + _, err = idx.db.Exec(`INSERT INTO vectors (chunk_id, embedding) VALUES (?, ?)`, chunkID, vecBlob) return err } @@ -110,62 +137,39 @@ type SearchResult struct { Distance float64 } -// Search finds chunks similar to the query embedding using cosine similarity +// Search finds chunks similar to the query embedding using sqlite-vec func (idx *Index) Search(queryEmb []float32, limit int) ([]SearchResult, error) { - // Load all embeddings - rows, err := idx.db.Query(` - SELECT id, file, start_line, end_line, chunk_type, name, content, hash, embedding - FROM chunks - WHERE embedding IS NOT NULL - `) + vecBlob, err := sqlite_vec.SerializeFloat32(queryEmb) if err != nil { return nil, err } - defer rows.Close() - type candidate struct { - chunk chunker.Chunk - distance float64 + // Query similar vectors + rows, err := idx.db.Query(` + SELECT v.chunk_id, v.distance, c.file, c.start_line, c.end_line, c.chunk_type, c.name, c.content, c.hash + FROM vectors v + JOIN chunks c ON c.id = v.chunk_id + WHERE v.embedding MATCH ? AND k = ? + ORDER BY v.distance + `, vecBlob, limit) + if err != nil { + return nil, err } - var candidates []candidate + defer rows.Close() + var results []SearchResult for rows.Next() { - var id int64 + var chunkID int64 + var distance float64 var c chunker.Chunk - var embBlob []byte - err := rows.Scan(&id, &c.File, &c.StartLine, &c.EndLine, &c.Type, &c.Name, &c.Content, &c.Hash, &embBlob) + err := rows.Scan(&chunkID, &distance, &c.File, &c.StartLine, &c.EndLine, &c.Type, &c.Name, &c.Content, &c.Hash) if err != nil { return nil, err } - - emb := deserializeEmbedding(embBlob) - dist := cosineDistance(queryEmb, emb) - candidates = append(candidates, candidate{chunk: c, distance: dist}) - } - - if err := rows.Err(); err != nil { - return nil, err - } - - // Sort by distance (lower is better) - sort.Slice(candidates, func(i, j int) bool { - return candidates[i].distance < candidates[j].distance - }) - - // Return top-k - if limit > len(candidates) { - limit = len(candidates) + results = append(results, SearchResult{Chunk: c, Distance: distance}) } - results := make([]SearchResult, limit) - for i := 0; i < limit; i++ { - results[i] = SearchResult{ - Chunk: candidates[i].chunk, - Distance: candidates[i].distance, - } - } - - return results, nil + return results, rows.Err() } // GetFileHash returns the stored hash for a file, or empty string if not indexed @@ -189,7 +193,13 @@ func (idx *Index) SetFileHash(path, hash string) error { // DeleteChunksForFile removes all chunks for a file func (idx *Index) DeleteChunksForFile(path string) error { - _, err := idx.db.Exec(`DELETE FROM chunks WHERE file = ?`, path) + // Delete vectors for chunks in this file + _, err := idx.db.Exec(`DELETE FROM vectors WHERE chunk_id IN (SELECT id FROM chunks WHERE file = ?)`, path) + if err != nil { + return err + } + + _, err = idx.db.Exec(`DELETE FROM chunks WHERE file = ?`, path) if err != nil { return err } @@ -228,44 +238,3 @@ func (idx *Index) GetMetadata(key string) (string, error) { } return value, err } - -// serializeEmbedding converts float32 slice to bytes -func serializeEmbedding(embedding []float32) []byte { - buf := make([]byte, len(embedding)*4) - for i, v := range embedding { - binary.LittleEndian.PutUint32(buf[i*4:], math.Float32bits(v)) - } - return buf -} - -// deserializeEmbedding converts bytes back to float32 slice -func deserializeEmbedding(data []byte) []float32 { - n := len(data) / 4 - result := make([]float32, n) - for i := 0; i < n; i++ { - bits := binary.LittleEndian.Uint32(data[i*4:]) - result[i] = math.Float32frombits(bits) - } - return result -} - -// cosineDistance computes 1 - cosine_similarity (so lower is more similar) -func cosineDistance(a, b []float32) float64 { - if len(a) != len(b) { - return 1.0 - } - - var dotProduct, normA, normB float64 - for i := range a { - dotProduct += float64(a[i]) * float64(b[i]) - normA += float64(a[i]) * float64(a[i]) - normB += float64(b[i]) * float64(b[i]) - } - - if normA == 0 || normB == 0 { - return 1.0 - } - - similarity := dotProduct / (math.Sqrt(normA) * math.Sqrt(normB)) - return 1.0 - similarity -} -- cgit v1.2.3