From f1ff85c7acad6b2ae7ec10720619ef2023cb7dc9 Mon Sep 17 00:00:00 2001 From: Clawd Date: Thu, 5 Mar 2026 07:29:00 -0800 Subject: Implement core: walker, chunker, embedder, index, CLI --- internal/index/index.go | 271 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 271 insertions(+) create mode 100644 internal/index/index.go (limited to 'internal/index') diff --git a/internal/index/index.go b/internal/index/index.go new file mode 100644 index 0000000..008e487 --- /dev/null +++ b/internal/index/index.go @@ -0,0 +1,271 @@ +package index + +import ( + "database/sql" + "encoding/binary" + "math" + "os" + "path/filepath" + "sort" + + _ "modernc.org/sqlite" + + "code.northwest.io/codevec/internal/chunker" +) + +// Index stores chunks and embeddings in SQLite +type Index struct { + db *sql.DB + dims int +} + +// Open opens or creates an index at the given path +func Open(path string, dims int) (*Index, error) { + // Ensure directory exists + dir := filepath.Dir(path) + if err := os.MkdirAll(dir, 0755); err != nil { + return nil, err + } + + db, err := sql.Open("sqlite", path) + if err != nil { + return nil, err + } + + idx := &Index{db: db, dims: dims} + if err := idx.init(); err != nil { + db.Close() + return nil, err + } + + return idx, nil +} + +func (idx *Index) init() error { + // Create chunks table with embedding column + _, err := idx.db.Exec(` + CREATE TABLE IF NOT EXISTS chunks ( + id INTEGER PRIMARY KEY, + file TEXT NOT NULL, + start_line INTEGER NOT NULL, + end_line INTEGER NOT NULL, + chunk_type TEXT, + name TEXT, + content TEXT NOT NULL, + hash TEXT NOT NULL, + embedding BLOB, + created_at INTEGER DEFAULT (unixepoch()) + ) + `) + if err != nil { + return err + } + + // Create files table for tracking indexed files + _, err = idx.db.Exec(` + CREATE TABLE IF NOT EXISTS files ( + path TEXT PRIMARY KEY, + hash TEXT NOT NULL, + indexed_at INTEGER DEFAULT (unixepoch()) + ) + `) + if err != nil { + return err + } + + // Create metadata table + _, err = idx.db.Exec(` + CREATE TABLE IF NOT EXISTS metadata ( + key TEXT PRIMARY KEY, + value TEXT + ) + `) + if err != nil { + return err + } + + // Index on file for faster deletion + _, err = idx.db.Exec(`CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file)`) + return err +} + +// Close closes the index +func (idx *Index) Close() error { + return idx.db.Close() +} + +// InsertChunk inserts a chunk with its embedding +func (idx *Index) InsertChunk(chunk chunker.Chunk, embedding []float32) error { + embeddingBlob := serializeEmbedding(embedding) + _, err := idx.db.Exec(` + INSERT INTO chunks (file, start_line, end_line, chunk_type, name, content, hash, embedding) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) + `, chunk.File, chunk.StartLine, chunk.EndLine, chunk.Type, chunk.Name, chunk.Content, chunk.Hash, embeddingBlob) + return err +} + +// SearchResult represents a search result +type SearchResult struct { + Chunk chunker.Chunk + Distance float64 +} + +// Search finds chunks similar to the query embedding using cosine similarity +func (idx *Index) Search(queryEmb []float32, limit int) ([]SearchResult, error) { + // Load all embeddings + rows, err := idx.db.Query(` + SELECT id, file, start_line, end_line, chunk_type, name, content, hash, embedding + FROM chunks + WHERE embedding IS NOT NULL + `) + if err != nil { + return nil, err + } + defer rows.Close() + + type candidate struct { + chunk chunker.Chunk + distance float64 + } + var candidates []candidate + + for rows.Next() { + var id int64 + var c chunker.Chunk + var embBlob []byte + err := rows.Scan(&id, &c.File, &c.StartLine, &c.EndLine, &c.Type, &c.Name, &c.Content, &c.Hash, &embBlob) + if err != nil { + return nil, err + } + + emb := deserializeEmbedding(embBlob) + dist := cosineDistance(queryEmb, emb) + candidates = append(candidates, candidate{chunk: c, distance: dist}) + } + + if err := rows.Err(); err != nil { + return nil, err + } + + // Sort by distance (lower is better) + sort.Slice(candidates, func(i, j int) bool { + return candidates[i].distance < candidates[j].distance + }) + + // Return top-k + if limit > len(candidates) { + limit = len(candidates) + } + + results := make([]SearchResult, limit) + for i := 0; i < limit; i++ { + results[i] = SearchResult{ + Chunk: candidates[i].chunk, + Distance: candidates[i].distance, + } + } + + return results, nil +} + +// GetFileHash returns the stored hash for a file, or empty string if not indexed +func (idx *Index) GetFileHash(path string) (string, error) { + var hash string + err := idx.db.QueryRow(`SELECT hash FROM files WHERE path = ?`, path).Scan(&hash) + if err == sql.ErrNoRows { + return "", nil + } + return hash, err +} + +// SetFileHash updates the hash for a file +func (idx *Index) SetFileHash(path, hash string) error { + _, err := idx.db.Exec(` + INSERT OR REPLACE INTO files (path, hash, indexed_at) + VALUES (?, ?, unixepoch()) + `, path, hash) + return err +} + +// DeleteChunksForFile removes all chunks for a file +func (idx *Index) DeleteChunksForFile(path string) error { + _, err := idx.db.Exec(`DELETE FROM chunks WHERE file = ?`, path) + if err != nil { + return err + } + _, err = idx.db.Exec(`DELETE FROM files WHERE path = ?`, path) + return err +} + +// Stats returns index statistics +type Stats struct { + Files int + Chunks int +} + +func (idx *Index) Stats() (Stats, error) { + var s Stats + err := idx.db.QueryRow(`SELECT COUNT(*) FROM files`).Scan(&s.Files) + if err != nil { + return s, err + } + err = idx.db.QueryRow(`SELECT COUNT(*) FROM chunks`).Scan(&s.Chunks) + return s, err +} + +// SetMetadata stores metadata +func (idx *Index) SetMetadata(key, value string) error { + _, err := idx.db.Exec(`INSERT OR REPLACE INTO metadata (key, value) VALUES (?, ?)`, key, value) + return err +} + +// GetMetadata retrieves metadata +func (idx *Index) GetMetadata(key string) (string, error) { + var value string + err := idx.db.QueryRow(`SELECT value FROM metadata WHERE key = ?`, key).Scan(&value) + if err == sql.ErrNoRows { + return "", nil + } + return value, err +} + +// serializeEmbedding converts float32 slice to bytes +func serializeEmbedding(embedding []float32) []byte { + buf := make([]byte, len(embedding)*4) + for i, v := range embedding { + binary.LittleEndian.PutUint32(buf[i*4:], math.Float32bits(v)) + } + return buf +} + +// deserializeEmbedding converts bytes back to float32 slice +func deserializeEmbedding(data []byte) []float32 { + n := len(data) / 4 + result := make([]float32, n) + for i := 0; i < n; i++ { + bits := binary.LittleEndian.Uint32(data[i*4:]) + result[i] = math.Float32frombits(bits) + } + return result +} + +// cosineDistance computes 1 - cosine_similarity (so lower is more similar) +func cosineDistance(a, b []float32) float64 { + if len(a) != len(b) { + return 1.0 + } + + var dotProduct, normA, normB float64 + for i := range a { + dotProduct += float64(a[i]) * float64(b[i]) + normA += float64(a[i]) * float64(a[i]) + normB += float64(b[i]) * float64(b[i]) + } + + if normA == 0 || normB == 0 { + return 1.0 + } + + similarity := dotProduct / (math.Sqrt(normA) * math.Sqrt(normB)) + return 1.0 - similarity +} -- cgit v1.2.3