diff options
| author | Clawd <ai@clawd.bot> | 2026-03-05 07:29:00 -0800 |
|---|---|---|
| committer | Clawd <ai@clawd.bot> | 2026-03-05 07:29:00 -0800 |
| commit | f1ff85c7acad6b2ae7ec10720619ef2023cb7dc9 (patch) | |
| tree | 5e694f4a2e864c9fcdfcbb1ab869c3bae05b50e3 /internal/index | |
| parent | 03d8f49479b3446cf7f8ab9b6fdb2401584e3f12 (diff) | |
Implement core: walker, chunker, embedder, index, CLI
Diffstat (limited to 'internal/index')
| -rw-r--r-- | internal/index/index.go | 271 |
1 files changed, 271 insertions, 0 deletions
diff --git a/internal/index/index.go b/internal/index/index.go new file mode 100644 index 0000000..008e487 --- /dev/null +++ b/internal/index/index.go | |||
| @@ -0,0 +1,271 @@ | |||
| 1 | package index | ||
| 2 | |||
| 3 | import ( | ||
| 4 | "database/sql" | ||
| 5 | "encoding/binary" | ||
| 6 | "math" | ||
| 7 | "os" | ||
| 8 | "path/filepath" | ||
| 9 | "sort" | ||
| 10 | |||
| 11 | _ "modernc.org/sqlite" | ||
| 12 | |||
| 13 | "code.northwest.io/codevec/internal/chunker" | ||
| 14 | ) | ||
| 15 | |||
// Index is a handle to a SQLite-backed store of code chunks and their
// embedding vectors.
type Index struct {
	// db is the open SQLite connection pool backing the index.
	db *sql.DB
	// dims is the embedding dimensionality that was passed to Open.
	dims int
}
| 21 | |||
| 22 | // Open opens or creates an index at the given path | ||
| 23 | func Open(path string, dims int) (*Index, error) { | ||
| 24 | // Ensure directory exists | ||
| 25 | dir := filepath.Dir(path) | ||
| 26 | if err := os.MkdirAll(dir, 0755); err != nil { | ||
| 27 | return nil, err | ||
| 28 | } | ||
| 29 | |||
| 30 | db, err := sql.Open("sqlite", path) | ||
| 31 | if err != nil { | ||
| 32 | return nil, err | ||
| 33 | } | ||
| 34 | |||
| 35 | idx := &Index{db: db, dims: dims} | ||
| 36 | if err := idx.init(); err != nil { | ||
| 37 | db.Close() | ||
| 38 | return nil, err | ||
| 39 | } | ||
| 40 | |||
| 41 | return idx, nil | ||
| 42 | } | ||
| 43 | |||
| 44 | func (idx *Index) init() error { | ||
| 45 | // Create chunks table with embedding column | ||
| 46 | _, err := idx.db.Exec(` | ||
| 47 | CREATE TABLE IF NOT EXISTS chunks ( | ||
| 48 | id INTEGER PRIMARY KEY, | ||
| 49 | file TEXT NOT NULL, | ||
| 50 | start_line INTEGER NOT NULL, | ||
| 51 | end_line INTEGER NOT NULL, | ||
| 52 | chunk_type TEXT, | ||
| 53 | name TEXT, | ||
| 54 | content TEXT NOT NULL, | ||
| 55 | hash TEXT NOT NULL, | ||
| 56 | embedding BLOB, | ||
| 57 | created_at INTEGER DEFAULT (unixepoch()) | ||
| 58 | ) | ||
| 59 | `) | ||
| 60 | if err != nil { | ||
| 61 | return err | ||
| 62 | } | ||
| 63 | |||
| 64 | // Create files table for tracking indexed files | ||
| 65 | _, err = idx.db.Exec(` | ||
| 66 | CREATE TABLE IF NOT EXISTS files ( | ||
| 67 | path TEXT PRIMARY KEY, | ||
| 68 | hash TEXT NOT NULL, | ||
| 69 | indexed_at INTEGER DEFAULT (unixepoch()) | ||
| 70 | ) | ||
| 71 | `) | ||
| 72 | if err != nil { | ||
| 73 | return err | ||
| 74 | } | ||
| 75 | |||
| 76 | // Create metadata table | ||
| 77 | _, err = idx.db.Exec(` | ||
| 78 | CREATE TABLE IF NOT EXISTS metadata ( | ||
| 79 | key TEXT PRIMARY KEY, | ||
| 80 | value TEXT | ||
| 81 | ) | ||
| 82 | `) | ||
| 83 | if err != nil { | ||
| 84 | return err | ||
| 85 | } | ||
| 86 | |||
| 87 | // Index on file for faster deletion | ||
| 88 | _, err = idx.db.Exec(`CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file)`) | ||
| 89 | return err | ||
| 90 | } | ||
| 91 | |||
| 92 | // Close closes the index | ||
| 93 | func (idx *Index) Close() error { | ||
| 94 | return idx.db.Close() | ||
| 95 | } | ||
| 96 | |||
| 97 | // InsertChunk inserts a chunk with its embedding | ||
| 98 | func (idx *Index) InsertChunk(chunk chunker.Chunk, embedding []float32) error { | ||
| 99 | embeddingBlob := serializeEmbedding(embedding) | ||
| 100 | _, err := idx.db.Exec(` | ||
| 101 | INSERT INTO chunks (file, start_line, end_line, chunk_type, name, content, hash, embedding) | ||
| 102 | VALUES (?, ?, ?, ?, ?, ?, ?, ?) | ||
| 103 | `, chunk.File, chunk.StartLine, chunk.EndLine, chunk.Type, chunk.Name, chunk.Content, chunk.Hash, embeddingBlob) | ||
| 104 | return err | ||
| 105 | } | ||
| 106 | |||
| 107 | // SearchResult represents a search result | ||
| 108 | type SearchResult struct { | ||
| 109 | Chunk chunker.Chunk | ||
| 110 | Distance float64 | ||
| 111 | } | ||
| 112 | |||
| 113 | // Search finds chunks similar to the query embedding using cosine similarity | ||
| 114 | func (idx *Index) Search(queryEmb []float32, limit int) ([]SearchResult, error) { | ||
| 115 | // Load all embeddings | ||
| 116 | rows, err := idx.db.Query(` | ||
| 117 | SELECT id, file, start_line, end_line, chunk_type, name, content, hash, embedding | ||
| 118 | FROM chunks | ||
| 119 | WHERE embedding IS NOT NULL | ||
| 120 | `) | ||
| 121 | if err != nil { | ||
| 122 | return nil, err | ||
| 123 | } | ||
| 124 | defer rows.Close() | ||
| 125 | |||
| 126 | type candidate struct { | ||
| 127 | chunk chunker.Chunk | ||
| 128 | distance float64 | ||
| 129 | } | ||
| 130 | var candidates []candidate | ||
| 131 | |||
| 132 | for rows.Next() { | ||
| 133 | var id int64 | ||
| 134 | var c chunker.Chunk | ||
| 135 | var embBlob []byte | ||
| 136 | err := rows.Scan(&id, &c.File, &c.StartLine, &c.EndLine, &c.Type, &c.Name, &c.Content, &c.Hash, &embBlob) | ||
| 137 | if err != nil { | ||
| 138 | return nil, err | ||
| 139 | } | ||
| 140 | |||
| 141 | emb := deserializeEmbedding(embBlob) | ||
| 142 | dist := cosineDistance(queryEmb, emb) | ||
| 143 | candidates = append(candidates, candidate{chunk: c, distance: dist}) | ||
| 144 | } | ||
| 145 | |||
| 146 | if err := rows.Err(); err != nil { | ||
| 147 | return nil, err | ||
| 148 | } | ||
| 149 | |||
| 150 | // Sort by distance (lower is better) | ||
| 151 | sort.Slice(candidates, func(i, j int) bool { | ||
| 152 | return candidates[i].distance < candidates[j].distance | ||
| 153 | }) | ||
| 154 | |||
| 155 | // Return top-k | ||
| 156 | if limit > len(candidates) { | ||
| 157 | limit = len(candidates) | ||
| 158 | } | ||
| 159 | |||
| 160 | results := make([]SearchResult, limit) | ||
| 161 | for i := 0; i < limit; i++ { | ||
| 162 | results[i] = SearchResult{ | ||
| 163 | Chunk: candidates[i].chunk, | ||
| 164 | Distance: candidates[i].distance, | ||
| 165 | } | ||
| 166 | } | ||
| 167 | |||
| 168 | return results, nil | ||
| 169 | } | ||
| 170 | |||
| 171 | // GetFileHash returns the stored hash for a file, or empty string if not indexed | ||
| 172 | func (idx *Index) GetFileHash(path string) (string, error) { | ||
| 173 | var hash string | ||
| 174 | err := idx.db.QueryRow(`SELECT hash FROM files WHERE path = ?`, path).Scan(&hash) | ||
| 175 | if err == sql.ErrNoRows { | ||
| 176 | return "", nil | ||
| 177 | } | ||
| 178 | return hash, err | ||
| 179 | } | ||
| 180 | |||
| 181 | // SetFileHash updates the hash for a file | ||
| 182 | func (idx *Index) SetFileHash(path, hash string) error { | ||
| 183 | _, err := idx.db.Exec(` | ||
| 184 | INSERT OR REPLACE INTO files (path, hash, indexed_at) | ||
| 185 | VALUES (?, ?, unixepoch()) | ||
| 186 | `, path, hash) | ||
| 187 | return err | ||
| 188 | } | ||
| 189 | |||
| 190 | // DeleteChunksForFile removes all chunks for a file | ||
| 191 | func (idx *Index) DeleteChunksForFile(path string) error { | ||
| 192 | _, err := idx.db.Exec(`DELETE FROM chunks WHERE file = ?`, path) | ||
| 193 | if err != nil { | ||
| 194 | return err | ||
| 195 | } | ||
| 196 | _, err = idx.db.Exec(`DELETE FROM files WHERE path = ?`, path) | ||
| 197 | return err | ||
| 198 | } | ||
| 199 | |||
// Stats holds summary counts for an index: Files is the number of rows in the
// files table and Chunks the number of rows in the chunks table.
type Stats struct {
	Files, Chunks int
}
| 205 | |||
| 206 | func (idx *Index) Stats() (Stats, error) { | ||
| 207 | var s Stats | ||
| 208 | err := idx.db.QueryRow(`SELECT COUNT(*) FROM files`).Scan(&s.Files) | ||
| 209 | if err != nil { | ||
| 210 | return s, err | ||
| 211 | } | ||
| 212 | err = idx.db.QueryRow(`SELECT COUNT(*) FROM chunks`).Scan(&s.Chunks) | ||
| 213 | return s, err | ||
| 214 | } | ||
| 215 | |||
| 216 | // SetMetadata stores metadata | ||
| 217 | func (idx *Index) SetMetadata(key, value string) error { | ||
| 218 | _, err := idx.db.Exec(`INSERT OR REPLACE INTO metadata (key, value) VALUES (?, ?)`, key, value) | ||
| 219 | return err | ||
| 220 | } | ||
| 221 | |||
| 222 | // GetMetadata retrieves metadata | ||
| 223 | func (idx *Index) GetMetadata(key string) (string, error) { | ||
| 224 | var value string | ||
| 225 | err := idx.db.QueryRow(`SELECT value FROM metadata WHERE key = ?`, key).Scan(&value) | ||
| 226 | if err == sql.ErrNoRows { | ||
| 227 | return "", nil | ||
| 228 | } | ||
| 229 | return value, err | ||
| 230 | } | ||
| 231 | |||
// serializeEmbedding packs embedding into a byte slice, writing each float32
// as its 4-byte IEEE-754 bit pattern in little-endian order. The result is
// always non-nil and has length 4*len(embedding).
func serializeEmbedding(embedding []float32) []byte {
	const width = 4 // bytes per float32
	out := make([]byte, width*len(embedding))
	for pos, f := range embedding {
		off := pos * width
		binary.LittleEndian.PutUint32(out[off:off+width], math.Float32bits(f))
	}
	return out
}
| 240 | |||
// deserializeEmbedding decodes data as consecutive little-endian float32
// values, the inverse of serializeEmbedding. Trailing bytes that do not make
// up a full 4-byte value are ignored. The result is always non-nil.
func deserializeEmbedding(data []byte) []float32 {
	const width = 4 // bytes per float32
	vec := make([]float32, len(data)/width)
	for pos := range vec {
		off := pos * width
		vec[pos] = math.Float32frombits(binary.LittleEndian.Uint32(data[off : off+width]))
	}
	return vec
}
| 251 | |||
// cosineDistance returns 1 minus the cosine similarity of a and b, so a
// smaller value means the vectors point in more similar directions. Vectors
// of different length, or with a zero norm, get the distance 1.0.
func cosineDistance(a, b []float32) float64 {
	if len(a) != len(b) {
		return 1.0
	}

	var dot, sqA, sqB float64
	for i := range a {
		x, y := float64(a[i]), float64(b[i])
		dot += x * y
		sqA += x * x
		sqB += y * y
	}

	// Similarity is undefined for a zero vector; report it as dissimilar.
	if sqA == 0 || sqB == 0 {
		return 1.0
	}

	return 1.0 - dot/(math.Sqrt(sqA)*math.Sqrt(sqB))
}
