// Package index persists code chunks and their embedding vectors in a SQLite
// database and answers similarity queries with a linear scan over all rows.
package index

import (
	"database/sql"
	"encoding/binary"
	"math"
	"os"
	"path/filepath"
	"sort"

	_ "modernc.org/sqlite" // imported for side effects; Open uses the "sqlite" driver name

	"code.northwest.io/codevec/internal/chunker"
)

// Index stores chunks and embeddings in SQLite.
type Index struct {
	db *sql.DB
	// dims is the value passed to Open (presumably the embedding
	// dimensionality). It is recorded but not read by any code in this file.
	dims int
}

// Open opens or creates an index at the given path.
//
// The parent directory of path is created if it does not exist, and the
// schema is created on first use. The returned Index must be closed with
// Close.
func Open(path string, dims int) (*Index, error) {
	// Ensure the directory that will hold the database file exists.
	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
		return nil, err
	}

	db, err := sql.Open("sqlite", path)
	if err != nil {
		return nil, err
	}

	idx := &Index{db: db, dims: dims}
	if err := idx.init(); err != nil {
		// Best-effort cleanup; the schema error is the one worth reporting.
		_ = db.Close()
		return nil, err
	}
	return idx, nil
}

// init creates the tables and indexes used by the Index if they do not
// already exist. Statements are executed in order and the first error is
// returned.
func (idx *Index) init() error {
	stmts := []string{
		// Chunks table with the embedding stored as a BLOB.
		`CREATE TABLE IF NOT EXISTS chunks (
			id INTEGER PRIMARY KEY,
			file TEXT NOT NULL,
			start_line INTEGER NOT NULL,
			end_line INTEGER NOT NULL,
			chunk_type TEXT,
			name TEXT,
			content TEXT NOT NULL,
			hash TEXT NOT NULL,
			embedding BLOB,
			created_at INTEGER DEFAULT (unixepoch())
		)`,
		// Files table for tracking which files have been indexed.
		`CREATE TABLE IF NOT EXISTS files (
			path TEXT PRIMARY KEY,
			hash TEXT NOT NULL,
			indexed_at INTEGER DEFAULT (unixepoch())
		)`,
		// Free-form key/value metadata.
		`CREATE TABLE IF NOT EXISTS metadata (
			key TEXT PRIMARY KEY,
			value TEXT
		)`,
		// Index on file so per-file deletion does not scan the whole table.
		`CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file)`,
	}
	for _, stmt := range stmts {
		if _, err := idx.db.Exec(stmt); err != nil {
			return err
		}
	}
	return nil
}

// Close closes the underlying database handle.
func (idx *Index) Close() error {
	return idx.db.Close()
}

// InsertChunk inserts a chunk together with its embedding.
//
// The embedding is stored as a little-endian float32 BLOB (see
// serializeEmbedding).
func (idx *Index) InsertChunk(chunk chunker.Chunk, embedding []float32) error {
	_, err := idx.db.Exec(`
		INSERT INTO chunks (file, start_line, end_line, chunk_type, name, content, hash, embedding)
		VALUES (?, ?, ?, ?, ?, ?, ?, ?)
	`,
		chunk.File, chunk.StartLine, chunk.EndLine, chunk.Type, chunk.Name,
		chunk.Content, chunk.Hash, serializeEmbedding(embedding),
	)
	return err
}

// SearchResult is a single hit returned by Search.
type SearchResult struct {
	Chunk    chunker.Chunk
	Distance float64 // cosine distance to the query; lower is more similar
}

// Search returns up to limit chunks ordered by ascending cosine distance to
// queryEmb.
//
// Every row with a non-NULL embedding is loaded and scored, so the cost grows
// linearly with the number of stored chunks. A limit that is zero or negative
// yields an empty, non-nil slice. Chunks that tie on distance are returned in
// the order the database produced them.
func (idx *Index) Search(queryEmb []float32, limit int) ([]SearchResult, error) {
	rows, err := idx.db.Query(`
		SELECT file, start_line, end_line, chunk_type, name, content, hash, embedding
		FROM chunks
		WHERE embedding IS NOT NULL
	`)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var candidates []SearchResult
	for rows.Next() {
		var (
			c       chunker.Chunk
			embBlob []byte
		)
		if err := rows.Scan(&c.File, &c.StartLine, &c.EndLine, &c.Type, &c.Name, &c.Content, &c.Hash, &embBlob); err != nil {
			return nil, err
		}
		candidates = append(candidates, SearchResult{
			Chunk:    c,
			Distance: cosineDistance(queryEmb, deserializeEmbedding(embBlob)),
		})
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}

	// Lower distance is better.
	sort.SliceStable(candidates, func(i, j int) bool {
		return candidates[i].Distance < candidates[j].Distance
	})

	// Clamp limit into [0, len(candidates)]; a negative length would make
	// the allocation below panic.
	if limit < 0 {
		limit = 0
	}
	if limit > len(candidates) {
		limit = len(candidates)
	}

	// Copy the top-k into a right-sized slice so the caller does not keep
	// every scored chunk alive through a shared backing array.
	results := make([]SearchResult, limit)
	copy(results, candidates[:limit])
	return results, nil
}

// GetFileHash returns the stored hash for a file, or the empty string with a
// nil error if the file has not been indexed.
func (idx *Index) GetFileHash(path string) (string, error) {
	var hash string
	err := idx.db.QueryRow(`SELECT hash FROM files WHERE path = ?`, path).Scan(&hash)
	if err == sql.ErrNoRows {
		return "", nil
	}
	return hash, err
}

// SetFileHash records hash as the current hash for path, replacing any
// previous entry and refreshing its indexed_at timestamp.
func (idx *Index) SetFileHash(path, hash string) error {
	_, err := idx.db.Exec(`
		INSERT OR REPLACE INTO files (path, hash, indexed_at)
		VALUES (?, ?, unixepoch())
	`, path, hash)
	return err
}

// DeleteChunksForFile removes all chunks for a file along with the file's
// entry in the files table.
//
// Both deletions run in a single transaction so a failure cannot leave the
// file recorded as indexed while its chunks are gone.
func (idx *Index) DeleteChunksForFile(path string) error {
	tx, err := idx.db.Begin()
	if err != nil {
		return err
	}
	// After a successful Commit this is a no-op that returns sql.ErrTxDone,
	// which is safe to ignore.
	defer func() { _ = tx.Rollback() }()

	if _, err := tx.Exec(`DELETE FROM chunks WHERE file = ?`, path); err != nil {
		return err
	}
	if _, err := tx.Exec(`DELETE FROM files WHERE path = ?`, path); err != nil {
		return err
	}
	return tx.Commit()
}

// Stats holds row counts for the index.
type Stats struct {
	Files  int // number of rows in the files table
	Chunks int // number of rows in the chunks table
}

// Stats returns index statistics. On error the returned Stats may be only
// partially populated.
func (idx *Index) Stats() (Stats, error) {
	var s Stats
	if err := idx.db.QueryRow(`SELECT COUNT(*) FROM files`).Scan(&s.Files); err != nil {
		return s, err
	}
	err := idx.db.QueryRow(`SELECT COUNT(*) FROM chunks`).Scan(&s.Chunks)
	return s, err
}

// SetMetadata stores value under key, replacing any existing value.
func (idx *Index) SetMetadata(key, value string) error {
	_, err := idx.db.Exec(`INSERT OR REPLACE INTO metadata (key, value) VALUES (?, ?)`, key, value)
	return err
}

// GetMetadata returns the value stored under key, or the empty string with a
// nil error if the key is absent.
func (idx *Index) GetMetadata(key string) (string, error) {
	var value string
	err := idx.db.QueryRow(`SELECT value FROM metadata WHERE key = ?`, key).Scan(&value)
	if err == sql.ErrNoRows {
		return "", nil
	}
	return value, err
}

// serializeEmbedding encodes a float32 slice as consecutive little-endian
// IEEE-754 bit patterns, four bytes per element.
func serializeEmbedding(embedding []float32) []byte {
	buf := make([]byte, len(embedding)*4)
	for i, v := range embedding {
		binary.LittleEndian.PutUint32(buf[i*4:], math.Float32bits(v))
	}
	return buf
}

// deserializeEmbedding is the inverse of serializeEmbedding. Trailing bytes
// that do not form a complete 4-byte element are ignored.
func deserializeEmbedding(data []byte) []float32 {
	n := len(data) / 4
	result := make([]float32, n)
	for i := range result {
		result[i] = math.Float32frombits(binary.LittleEndian.Uint32(data[i*4:]))
	}
	return result
}

// cosineDistance computes 1 - cosine_similarity, so lower is more similar.
//
// It returns 1.0 when the vectors differ in length or when either vector has
// zero magnitude, since the similarity is undefined in those cases.
func cosineDistance(a, b []float32) float64 {
	if len(a) != len(b) {
		return 1.0
	}

	var dotProduct, normA, normB float64
	for i := range a {
		x, y := float64(a[i]), float64(b[i])
		dotProduct += x * y
		normA += x * x
		normB += y * y
	}

	if normA == 0 || normB == 0 {
		return 1.0
	}

	similarity := dotProduct / (math.Sqrt(normA) * math.Sqrt(normB))
	return 1.0 - similarity
}