aboutsummaryrefslogtreecommitdiffstats
path: root/internal/index/index.go
diff options
context:
space:
mode:
authorClawd <ai@clawd.bot>2026-03-05 07:29:00 -0800
committerClawd <ai@clawd.bot>2026-03-05 07:29:00 -0800
commitf1ff85c7acad6b2ae7ec10720619ef2023cb7dc9 (patch)
tree5e694f4a2e864c9fcdfcbb1ab869c3bae05b50e3 /internal/index/index.go
parent03d8f49479b3446cf7f8ab9b6fdb2401584e3f12 (diff)
Implement core: walker, chunker, embedder, index, CLI
Diffstat (limited to 'internal/index/index.go')
-rw-r--r--internal/index/index.go271
1 files changed, 271 insertions, 0 deletions
diff --git a/internal/index/index.go b/internal/index/index.go
new file mode 100644
index 0000000..008e487
--- /dev/null
+++ b/internal/index/index.go
@@ -0,0 +1,271 @@
1package index
2
3import (
4 "database/sql"
5 "encoding/binary"
6 "math"
7 "os"
8 "path/filepath"
9 "sort"
10
11 _ "modernc.org/sqlite"
12
13 "code.northwest.io/codevec/internal/chunker"
14)
15
// Index is a SQLite-backed store for code chunks and their embeddings.
type Index struct {
	db   *sql.DB // handle to the underlying SQLite database
	dims int     // dimensionality value supplied to Open
}
21
22// Open opens or creates an index at the given path
23func Open(path string, dims int) (*Index, error) {
24 // Ensure directory exists
25 dir := filepath.Dir(path)
26 if err := os.MkdirAll(dir, 0755); err != nil {
27 return nil, err
28 }
29
30 db, err := sql.Open("sqlite", path)
31 if err != nil {
32 return nil, err
33 }
34
35 idx := &Index{db: db, dims: dims}
36 if err := idx.init(); err != nil {
37 db.Close()
38 return nil, err
39 }
40
41 return idx, nil
42}
43
44func (idx *Index) init() error {
45 // Create chunks table with embedding column
46 _, err := idx.db.Exec(`
47 CREATE TABLE IF NOT EXISTS chunks (
48 id INTEGER PRIMARY KEY,
49 file TEXT NOT NULL,
50 start_line INTEGER NOT NULL,
51 end_line INTEGER NOT NULL,
52 chunk_type TEXT,
53 name TEXT,
54 content TEXT NOT NULL,
55 hash TEXT NOT NULL,
56 embedding BLOB,
57 created_at INTEGER DEFAULT (unixepoch())
58 )
59 `)
60 if err != nil {
61 return err
62 }
63
64 // Create files table for tracking indexed files
65 _, err = idx.db.Exec(`
66 CREATE TABLE IF NOT EXISTS files (
67 path TEXT PRIMARY KEY,
68 hash TEXT NOT NULL,
69 indexed_at INTEGER DEFAULT (unixepoch())
70 )
71 `)
72 if err != nil {
73 return err
74 }
75
76 // Create metadata table
77 _, err = idx.db.Exec(`
78 CREATE TABLE IF NOT EXISTS metadata (
79 key TEXT PRIMARY KEY,
80 value TEXT
81 )
82 `)
83 if err != nil {
84 return err
85 }
86
87 // Index on file for faster deletion
88 _, err = idx.db.Exec(`CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file)`)
89 return err
90}
91
92// Close closes the index
93func (idx *Index) Close() error {
94 return idx.db.Close()
95}
96
97// InsertChunk inserts a chunk with its embedding
98func (idx *Index) InsertChunk(chunk chunker.Chunk, embedding []float32) error {
99 embeddingBlob := serializeEmbedding(embedding)
100 _, err := idx.db.Exec(`
101 INSERT INTO chunks (file, start_line, end_line, chunk_type, name, content, hash, embedding)
102 VALUES (?, ?, ?, ?, ?, ?, ?, ?)
103 `, chunk.File, chunk.StartLine, chunk.EndLine, chunk.Type, chunk.Name, chunk.Content, chunk.Hash, embeddingBlob)
104 return err
105}
106
107// SearchResult represents a search result
108type SearchResult struct {
109 Chunk chunker.Chunk
110 Distance float64
111}
112
113// Search finds chunks similar to the query embedding using cosine similarity
114func (idx *Index) Search(queryEmb []float32, limit int) ([]SearchResult, error) {
115 // Load all embeddings
116 rows, err := idx.db.Query(`
117 SELECT id, file, start_line, end_line, chunk_type, name, content, hash, embedding
118 FROM chunks
119 WHERE embedding IS NOT NULL
120 `)
121 if err != nil {
122 return nil, err
123 }
124 defer rows.Close()
125
126 type candidate struct {
127 chunk chunker.Chunk
128 distance float64
129 }
130 var candidates []candidate
131
132 for rows.Next() {
133 var id int64
134 var c chunker.Chunk
135 var embBlob []byte
136 err := rows.Scan(&id, &c.File, &c.StartLine, &c.EndLine, &c.Type, &c.Name, &c.Content, &c.Hash, &embBlob)
137 if err != nil {
138 return nil, err
139 }
140
141 emb := deserializeEmbedding(embBlob)
142 dist := cosineDistance(queryEmb, emb)
143 candidates = append(candidates, candidate{chunk: c, distance: dist})
144 }
145
146 if err := rows.Err(); err != nil {
147 return nil, err
148 }
149
150 // Sort by distance (lower is better)
151 sort.Slice(candidates, func(i, j int) bool {
152 return candidates[i].distance < candidates[j].distance
153 })
154
155 // Return top-k
156 if limit > len(candidates) {
157 limit = len(candidates)
158 }
159
160 results := make([]SearchResult, limit)
161 for i := 0; i < limit; i++ {
162 results[i] = SearchResult{
163 Chunk: candidates[i].chunk,
164 Distance: candidates[i].distance,
165 }
166 }
167
168 return results, nil
169}
170
171// GetFileHash returns the stored hash for a file, or empty string if not indexed
172func (idx *Index) GetFileHash(path string) (string, error) {
173 var hash string
174 err := idx.db.QueryRow(`SELECT hash FROM files WHERE path = ?`, path).Scan(&hash)
175 if err == sql.ErrNoRows {
176 return "", nil
177 }
178 return hash, err
179}
180
181// SetFileHash updates the hash for a file
182func (idx *Index) SetFileHash(path, hash string) error {
183 _, err := idx.db.Exec(`
184 INSERT OR REPLACE INTO files (path, hash, indexed_at)
185 VALUES (?, ?, unixepoch())
186 `, path, hash)
187 return err
188}
189
190// DeleteChunksForFile removes all chunks for a file
191func (idx *Index) DeleteChunksForFile(path string) error {
192 _, err := idx.db.Exec(`DELETE FROM chunks WHERE file = ?`, path)
193 if err != nil {
194 return err
195 }
196 _, err = idx.db.Exec(`DELETE FROM files WHERE path = ?`, path)
197 return err
198}
199
// Stats holds summary counts describing the contents of an index.
type Stats struct {
	Files  int // number of rows in the files table
	Chunks int // number of rows in the chunks table
}
205
206func (idx *Index) Stats() (Stats, error) {
207 var s Stats
208 err := idx.db.QueryRow(`SELECT COUNT(*) FROM files`).Scan(&s.Files)
209 if err != nil {
210 return s, err
211 }
212 err = idx.db.QueryRow(`SELECT COUNT(*) FROM chunks`).Scan(&s.Chunks)
213 return s, err
214}
215
216// SetMetadata stores metadata
217func (idx *Index) SetMetadata(key, value string) error {
218 _, err := idx.db.Exec(`INSERT OR REPLACE INTO metadata (key, value) VALUES (?, ?)`, key, value)
219 return err
220}
221
222// GetMetadata retrieves metadata
223func (idx *Index) GetMetadata(key string) (string, error) {
224 var value string
225 err := idx.db.QueryRow(`SELECT value FROM metadata WHERE key = ?`, key).Scan(&value)
226 if err == sql.ErrNoRows {
227 return "", nil
228 }
229 return value, err
230}
231
// serializeEmbedding packs embedding into a byte slice, writing each float32
// as its IEEE 754 bit pattern in little-endian order (4 bytes per value).
func serializeEmbedding(embedding []float32) []byte {
	const width = 4 // bytes per float32
	out := make([]byte, width*len(embedding))
	offset := 0
	for _, f := range embedding {
		binary.LittleEndian.PutUint32(out[offset:offset+width], math.Float32bits(f))
		offset += width
	}
	return out
}
240
// deserializeEmbedding decodes data as consecutive little-endian float32
// values (4 bytes each). Trailing bytes that do not form a full value are
// ignored.
func deserializeEmbedding(data []byte) []float32 {
	const width = 4 // bytes per float32
	out := make([]float32, len(data)/width)
	for i := range out {
		word := data[i*width : (i+1)*width]
		out[i] = math.Float32frombits(binary.LittleEndian.Uint32(word))
	}
	return out
}
251
// cosineDistance returns 1 - cosine similarity of a and b, so lower values
// mean more similar vectors. For a defined similarity the result lies within
// [0, 2].
//
// Pairs that have no meaningful angle yield 1.0, the same distance as
// orthogonal vectors: slices of different length, a zero-magnitude vector
// (including empty input), and components such as NaN or Inf that make the
// similarity NaN.
func cosineDistance(a, b []float32) float64 {
	if len(a) != len(b) {
		return 1.0
	}

	var dot, normA, normB float64
	for i := range a {
		x, y := float64(a[i]), float64(b[i])
		dot += x * y
		normA += x * x
		normB += y * y
	}
	if normA == 0 || normB == 0 {
		return 1.0
	}

	similarity := dot / (math.Sqrt(normA) * math.Sqrt(normB))
	switch {
	case math.IsNaN(similarity):
		// A NaN distance would compare false against everything and make
		// distance-based sorting inconsistent.
		return 1.0
	case similarity > 1:
		// Rounding can push the ratio marginally past 1 for (near-)parallel
		// vectors, which would otherwise produce a negative distance.
		similarity = 1
	case similarity < -1:
		similarity = -1
	}
	return 1.0 - similarity
}