aboutsummaryrefslogtreecommitdiffstats
path: root/internal
diff options
context:
space:
mode:
authorClawd <ai@clawd.bot>2026-03-05 07:29:00 -0800
committerClawd <ai@clawd.bot>2026-03-05 07:29:00 -0800
commitf1ff85c7acad6b2ae7ec10720619ef2023cb7dc9 (patch)
tree5e694f4a2e864c9fcdfcbb1ab869c3bae05b50e3 /internal
parent03d8f49479b3446cf7f8ab9b6fdb2401584e3f12 (diff)
Implement core: walker, chunker, embedder, index, CLI
Diffstat (limited to 'internal')
-rw-r--r--internal/chunker/chunker.go185
-rw-r--r--internal/embedder/embedder.go222
-rw-r--r--internal/index/index.go271
-rw-r--r--internal/walker/walker.go109
4 files changed, 787 insertions, 0 deletions
diff --git a/internal/chunker/chunker.go b/internal/chunker/chunker.go
new file mode 100644
index 0000000..f8de08d
--- /dev/null
+++ b/internal/chunker/chunker.go
@@ -0,0 +1,185 @@
1package chunker
2
3import (
4 "crypto/sha256"
5 "fmt"
6 "os"
7 "strings"
8
9 sitter "github.com/smacker/go-tree-sitter"
10 "github.com/smacker/go-tree-sitter/golang"
11)
12
// Chunk represents a semantically meaningful piece of code: a single
// top-level function, method, or type declaration extracted from a file.
type Chunk struct {
	File      string // path of the source file the chunk came from
	StartLine int    // first line of the declaration (1-based)
	EndLine   int    // last line of the declaration (1-based)
	Type      string // "function", "method", "type"
	Name      string // identifier; methods are qualified as "(Recv).Name"
	Content   string // exact source text of the declaration
	Hash      string // truncated SHA-256 of Content, used for change detection
}
23
// Chunker extracts semantic chunks from source code.
type Chunker interface {
	// Chunk parses content (the bytes of the file at path) and returns
	// the extracted chunks.
	Chunk(path string, content []byte) ([]Chunk, error)
}
28
// GoChunker extracts chunks from Go source files using tree-sitter.
type GoChunker struct {
	parser *sitter.Parser // configured once with the Go grammar in NewGoChunker, reused across files
}
33
34// NewGoChunker creates a new Go chunker
35func NewGoChunker() *GoChunker {
36 parser := sitter.NewParser()
37 parser.SetLanguage(golang.GetLanguage())
38 return &GoChunker{parser: parser}
39}
40
41// ChunkFile reads and chunks a file
42func (c *GoChunker) ChunkFile(path string) ([]Chunk, error) {
43 content, err := os.ReadFile(path)
44 if err != nil {
45 return nil, err
46 }
47 return c.Chunk(path, content)
48}
49
50// Chunk extracts semantic chunks from Go source
51func (c *GoChunker) Chunk(path string, content []byte) ([]Chunk, error) {
52 tree := c.parser.Parse(nil, content)
53 if tree == nil {
54 return nil, fmt.Errorf("failed to parse %s", path)
55 }
56 defer tree.Close()
57
58 var chunks []Chunk
59 root := tree.RootNode()
60
61 // Walk top-level declarations
62 for i := 0; i < int(root.ChildCount()); i++ {
63 node := root.Child(i)
64 chunk := c.extractChunk(node, content, path)
65 if chunk != nil {
66 chunks = append(chunks, *chunk)
67 }
68 }
69
70 return chunks, nil
71}
72
73func (c *GoChunker) extractChunk(node *sitter.Node, content []byte, path string) *Chunk {
74 nodeType := node.Type()
75
76 switch nodeType {
77 case "function_declaration":
78 return c.extractFunction(node, content, path)
79 case "method_declaration":
80 return c.extractMethod(node, content, path)
81 case "type_declaration":
82 return c.extractType(node, content, path)
83 }
84
85 return nil
86}
87
88func (c *GoChunker) extractFunction(node *sitter.Node, content []byte, path string) *Chunk {
89 nameNode := node.ChildByFieldName("name")
90 if nameNode == nil {
91 return nil
92 }
93
94 name := string(content[nameNode.StartByte():nameNode.EndByte()])
95 text := string(content[node.StartByte():node.EndByte()])
96
97 return &Chunk{
98 File: path,
99 StartLine: int(node.StartPoint().Row) + 1,
100 EndLine: int(node.EndPoint().Row) + 1,
101 Type: "function",
102 Name: name,
103 Content: text,
104 Hash: hash(text),
105 }
106}
107
108func (c *GoChunker) extractMethod(node *sitter.Node, content []byte, path string) *Chunk {
109 nameNode := node.ChildByFieldName("name")
110 receiverNode := node.ChildByFieldName("receiver")
111 if nameNode == nil {
112 return nil
113 }
114
115 name := string(content[nameNode.StartByte():nameNode.EndByte()])
116
117 // Build receiver prefix like (*Server) or (s Server)
118 if receiverNode != nil {
119 recvText := string(content[receiverNode.StartByte():receiverNode.EndByte()])
120 // Extract type from receiver, e.g., "(s *Server)" -> "*Server"
121 recvType := extractReceiverType(recvText)
122 if recvType != "" {
123 name = fmt.Sprintf("(%s).%s", recvType, name)
124 }
125 }
126
127 text := string(content[node.StartByte():node.EndByte()])
128
129 return &Chunk{
130 File: path,
131 StartLine: int(node.StartPoint().Row) + 1,
132 EndLine: int(node.EndPoint().Row) + 1,
133 Type: "method",
134 Name: name,
135 Content: text,
136 Hash: hash(text),
137 }
138}
139
140func (c *GoChunker) extractType(node *sitter.Node, content []byte, path string) *Chunk {
141 // type_declaration contains type_spec children
142 for i := 0; i < int(node.ChildCount()); i++ {
143 child := node.Child(i)
144 if child.Type() == "type_spec" {
145 nameNode := child.ChildByFieldName("name")
146 if nameNode == nil {
147 continue
148 }
149
150 name := string(content[nameNode.StartByte():nameNode.EndByte()])
151 text := string(content[node.StartByte():node.EndByte()])
152
153 return &Chunk{
154 File: path,
155 StartLine: int(node.StartPoint().Row) + 1,
156 EndLine: int(node.EndPoint().Row) + 1,
157 Type: "type",
158 Name: name,
159 Content: text,
160 Hash: hash(text),
161 }
162 }
163 }
164 return nil
165}
166
// extractReceiverType pulls the type out of a receiver clause, e.g.
// "(s *Server)" -> "*Server". An unnamed receiver like "(*Server)" also
// resolves to "*Server"; an empty clause yields "".
func extractReceiverType(recv string) string {
	inner := strings.TrimSpace(strings.TrimSuffix(strings.TrimPrefix(recv, "("), ")"))
	fields := strings.Fields(inner)
	if n := len(fields); n > 0 {
		// The type is always the last whitespace-separated token.
		return fields[n-1]
	}
	return ""
}
181
// hash returns a short, stable fingerprint of s: the first 8 bytes of its
// SHA-256 digest rendered as 16 lowercase hex characters.
func hash(s string) string {
	digest := sha256.Sum256([]byte(s))
	return fmt.Sprintf("%x", digest[:8])
}
diff --git a/internal/embedder/embedder.go b/internal/embedder/embedder.go
new file mode 100644
index 0000000..42f8518
--- /dev/null
+++ b/internal/embedder/embedder.go
@@ -0,0 +1,222 @@
1package embedder
2
3import (
4 "bytes"
5 "context"
6 "encoding/json"
7 "fmt"
8 "net/http"
9 "os"
10)
11
// Embedder generates embeddings for text.
type Embedder interface {
	// Embed returns one embedding vector per input text, in order.
	Embed(ctx context.Context, texts []string) ([][]float32, error)
	// Dimensions reports the length of each returned vector.
	Dimensions() int
}
17
// OllamaEmbedder generates embeddings through an Ollama server's
// /api/embeddings endpoint.
type OllamaEmbedder struct {
	baseURL string // server address, e.g. http://localhost:11434
	model   string // embedding model name
	dims    int    // output vector length for the configured model
}

// NewOllamaEmbedder creates an Ollama embedder. An empty model selects
// "nomic-embed-text". The server address comes from CODEVEC_BASE_URL,
// defaulting to http://localhost:11434.
func NewOllamaEmbedder(model string) *OllamaEmbedder {
	base := os.Getenv("CODEVEC_BASE_URL")
	if base == "" {
		base = "http://localhost:11434"
	}
	if model == "" {
		model = "nomic-embed-text"
	}

	// Known output sizes per model; nomic-embed-text's 768 is the default.
	dims := 768
	switch model {
	case "mxbai-embed-large":
		dims = 1024
	case "all-minilm":
		dims = 384
	}

	return &OllamaEmbedder{
		baseURL: base,
		model:   model,
		dims:    dims,
	}
}

// Dimensions reports the embedding vector length produced by the model.
func (e *OllamaEmbedder) Dimensions() int {
	return e.dims
}
54
// ollamaRequest is the JSON body for Ollama's /api/embeddings endpoint,
// which accepts a single prompt per request.
type ollamaRequest struct {
	Model  string `json:"model"`
	Prompt string `json:"prompt"`
}

// ollamaResponse is the JSON reply from /api/embeddings.
type ollamaResponse struct {
	Embedding []float32 `json:"embedding"`
}
63
64func (e *OllamaEmbedder) Embed(ctx context.Context, texts []string) ([][]float32, error) {
65 embeddings := make([][]float32, len(texts))
66
67 // Ollama's /api/embeddings takes one prompt at a time
68 for i, text := range texts {
69 req := ollamaRequest{
70 Model: e.model,
71 Prompt: text,
72 }
73
74 body, err := json.Marshal(req)
75 if err != nil {
76 return nil, err
77 }
78
79 httpReq, err := http.NewRequestWithContext(ctx, "POST", e.baseURL+"/api/embeddings", bytes.NewReader(body))
80 if err != nil {
81 return nil, err
82 }
83 httpReq.Header.Set("Content-Type", "application/json")
84
85 resp, err := http.DefaultClient.Do(httpReq)
86 if err != nil {
87 return nil, fmt.Errorf("ollama request failed: %w", err)
88 }
89 defer resp.Body.Close()
90
91 if resp.StatusCode != http.StatusOK {
92 return nil, fmt.Errorf("ollama returned status %d", resp.StatusCode)
93 }
94
95 var result ollamaResponse
96 if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
97 return nil, err
98 }
99
100 embeddings[i] = result.Embedding
101 }
102
103 return embeddings, nil
104}
105
// OpenAIEmbedder generates embeddings through an OpenAI-compatible
// /v1/embeddings endpoint.
type OpenAIEmbedder struct {
	baseURL string // API base, default https://api.openai.com
	apiKey  string // bearer token from CODEVEC_API_KEY; checked at Embed time
	model   string // embedding model name
	dims    int    // output vector length for the configured model
}

// NewOpenAIEmbedder creates an OpenAI-compatible embedder. An empty model
// selects "text-embedding-3-small". The base URL comes from
// CODEVEC_BASE_URL and the API key from CODEVEC_API_KEY.
func NewOpenAIEmbedder(model string) *OpenAIEmbedder {
	base := os.Getenv("CODEVEC_BASE_URL")
	if base == "" {
		base = "https://api.openai.com"
	}
	key := os.Getenv("CODEVEC_API_KEY")
	if model == "" {
		model = "text-embedding-3-small"
	}

	// Known output sizes; text-embedding-3-small's 1536 is the default.
	dims := 1536
	switch model {
	case "text-embedding-3-large":
		dims = 3072
	case "text-embedding-ada-002":
		dims = 1536
	}

	return &OpenAIEmbedder{
		baseURL: base,
		apiKey:  key,
		model:   model,
		dims:    dims,
	}
}

// Dimensions reports the embedding vector length produced by the model.
func (e *OpenAIEmbedder) Dimensions() int {
	return e.dims
}
144
// openaiRequest is the JSON body for the OpenAI-compatible /v1/embeddings
// endpoint; Input carries a whole batch of texts.
type openaiRequest struct {
	Model string   `json:"model"`
	Input []string `json:"input"`
}

// openaiResponse is the JSON reply; Data holds one entry per input, in order.
type openaiResponse struct {
	Data []struct {
		Embedding []float32 `json:"embedding"`
	} `json:"data"`
}
155
156func (e *OpenAIEmbedder) Embed(ctx context.Context, texts []string) ([][]float32, error) {
157 if e.apiKey == "" {
158 return nil, fmt.Errorf("CODEVEC_API_KEY not set")
159 }
160
161 // Batch in groups of 100
162 const batchSize = 100
163 embeddings := make([][]float32, len(texts))
164
165 for start := 0; start < len(texts); start += batchSize {
166 end := start + batchSize
167 if end > len(texts) {
168 end = len(texts)
169 }
170 batch := texts[start:end]
171
172 req := openaiRequest{
173 Model: e.model,
174 Input: batch,
175 }
176
177 body, err := json.Marshal(req)
178 if err != nil {
179 return nil, err
180 }
181
182 httpReq, err := http.NewRequestWithContext(ctx, "POST", e.baseURL+"/v1/embeddings", bytes.NewReader(body))
183 if err != nil {
184 return nil, err
185 }
186 httpReq.Header.Set("Content-Type", "application/json")
187 httpReq.Header.Set("Authorization", "Bearer "+e.apiKey)
188
189 resp, err := http.DefaultClient.Do(httpReq)
190 if err != nil {
191 return nil, fmt.Errorf("openai request failed: %w", err)
192 }
193 defer resp.Body.Close()
194
195 if resp.StatusCode != http.StatusOK {
196 return nil, fmt.Errorf("openai returned status %d", resp.StatusCode)
197 }
198
199 var result openaiResponse
200 if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
201 return nil, err
202 }
203
204 for i, d := range result.Data {
205 embeddings[start+i] = d.Embedding
206 }
207 }
208
209 return embeddings, nil
210}
211
212// New creates an embedder based on provider name
213func New(provider, model string) (Embedder, error) {
214 switch provider {
215 case "ollama":
216 return NewOllamaEmbedder(model), nil
217 case "openai":
218 return NewOpenAIEmbedder(model), nil
219 default:
220 return nil, fmt.Errorf("unknown provider: %s", provider)
221 }
222}
diff --git a/internal/index/index.go b/internal/index/index.go
new file mode 100644
index 0000000..008e487
--- /dev/null
+++ b/internal/index/index.go
@@ -0,0 +1,271 @@
1package index
2
3import (
4 "database/sql"
5 "encoding/binary"
6 "math"
7 "os"
8 "path/filepath"
9 "sort"
10
11 _ "modernc.org/sqlite"
12
13 "code.northwest.io/codevec/internal/chunker"
14)
15
// Index stores chunks and embeddings in SQLite
type Index struct {
	db   *sql.DB
	dims int // expected embedding dimensionality (recorded; not enforced on insert)
}
21
22// Open opens or creates an index at the given path
23func Open(path string, dims int) (*Index, error) {
24 // Ensure directory exists
25 dir := filepath.Dir(path)
26 if err := os.MkdirAll(dir, 0755); err != nil {
27 return nil, err
28 }
29
30 db, err := sql.Open("sqlite", path)
31 if err != nil {
32 return nil, err
33 }
34
35 idx := &Index{db: db, dims: dims}
36 if err := idx.init(); err != nil {
37 db.Close()
38 return nil, err
39 }
40
41 return idx, nil
42}
43
44func (idx *Index) init() error {
45 // Create chunks table with embedding column
46 _, err := idx.db.Exec(`
47 CREATE TABLE IF NOT EXISTS chunks (
48 id INTEGER PRIMARY KEY,
49 file TEXT NOT NULL,
50 start_line INTEGER NOT NULL,
51 end_line INTEGER NOT NULL,
52 chunk_type TEXT,
53 name TEXT,
54 content TEXT NOT NULL,
55 hash TEXT NOT NULL,
56 embedding BLOB,
57 created_at INTEGER DEFAULT (unixepoch())
58 )
59 `)
60 if err != nil {
61 return err
62 }
63
64 // Create files table for tracking indexed files
65 _, err = idx.db.Exec(`
66 CREATE TABLE IF NOT EXISTS files (
67 path TEXT PRIMARY KEY,
68 hash TEXT NOT NULL,
69 indexed_at INTEGER DEFAULT (unixepoch())
70 )
71 `)
72 if err != nil {
73 return err
74 }
75
76 // Create metadata table
77 _, err = idx.db.Exec(`
78 CREATE TABLE IF NOT EXISTS metadata (
79 key TEXT PRIMARY KEY,
80 value TEXT
81 )
82 `)
83 if err != nil {
84 return err
85 }
86
87 // Index on file for faster deletion
88 _, err = idx.db.Exec(`CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file)`)
89 return err
90}
91
// Close closes the index, releasing the underlying database handle.
// The Index must not be used afterwards.
func (idx *Index) Close() error {
	return idx.db.Close()
}
96
97// InsertChunk inserts a chunk with its embedding
98func (idx *Index) InsertChunk(chunk chunker.Chunk, embedding []float32) error {
99 embeddingBlob := serializeEmbedding(embedding)
100 _, err := idx.db.Exec(`
101 INSERT INTO chunks (file, start_line, end_line, chunk_type, name, content, hash, embedding)
102 VALUES (?, ?, ?, ?, ?, ?, ?, ?)
103 `, chunk.File, chunk.StartLine, chunk.EndLine, chunk.Type, chunk.Name, chunk.Content, chunk.Hash, embeddingBlob)
104 return err
105}
106
// SearchResult pairs a stored chunk with its cosine distance from the
// query embedding (lower means more similar).
type SearchResult struct {
	Chunk    chunker.Chunk
	Distance float64
}
112
113// Search finds chunks similar to the query embedding using cosine similarity
114func (idx *Index) Search(queryEmb []float32, limit int) ([]SearchResult, error) {
115 // Load all embeddings
116 rows, err := idx.db.Query(`
117 SELECT id, file, start_line, end_line, chunk_type, name, content, hash, embedding
118 FROM chunks
119 WHERE embedding IS NOT NULL
120 `)
121 if err != nil {
122 return nil, err
123 }
124 defer rows.Close()
125
126 type candidate struct {
127 chunk chunker.Chunk
128 distance float64
129 }
130 var candidates []candidate
131
132 for rows.Next() {
133 var id int64
134 var c chunker.Chunk
135 var embBlob []byte
136 err := rows.Scan(&id, &c.File, &c.StartLine, &c.EndLine, &c.Type, &c.Name, &c.Content, &c.Hash, &embBlob)
137 if err != nil {
138 return nil, err
139 }
140
141 emb := deserializeEmbedding(embBlob)
142 dist := cosineDistance(queryEmb, emb)
143 candidates = append(candidates, candidate{chunk: c, distance: dist})
144 }
145
146 if err := rows.Err(); err != nil {
147 return nil, err
148 }
149
150 // Sort by distance (lower is better)
151 sort.Slice(candidates, func(i, j int) bool {
152 return candidates[i].distance < candidates[j].distance
153 })
154
155 // Return top-k
156 if limit > len(candidates) {
157 limit = len(candidates)
158 }
159
160 results := make([]SearchResult, limit)
161 for i := 0; i < limit; i++ {
162 results[i] = SearchResult{
163 Chunk: candidates[i].chunk,
164 Distance: candidates[i].distance,
165 }
166 }
167
168 return results, nil
169}
170
171// GetFileHash returns the stored hash for a file, or empty string if not indexed
172func (idx *Index) GetFileHash(path string) (string, error) {
173 var hash string
174 err := idx.db.QueryRow(`SELECT hash FROM files WHERE path = ?`, path).Scan(&hash)
175 if err == sql.ErrNoRows {
176 return "", nil
177 }
178 return hash, err
179}
180
181// SetFileHash updates the hash for a file
182func (idx *Index) SetFileHash(path, hash string) error {
183 _, err := idx.db.Exec(`
184 INSERT OR REPLACE INTO files (path, hash, indexed_at)
185 VALUES (?, ?, unixepoch())
186 `, path, hash)
187 return err
188}
189
190// DeleteChunksForFile removes all chunks for a file
191func (idx *Index) DeleteChunksForFile(path string) error {
192 _, err := idx.db.Exec(`DELETE FROM chunks WHERE file = ?`, path)
193 if err != nil {
194 return err
195 }
196 _, err = idx.db.Exec(`DELETE FROM files WHERE path = ?`, path)
197 return err
198}
199
// Stats returns index statistics
type Stats struct {
	Files  int // rows in the files tracking table
	Chunks int // rows in the chunks table
}
205
206func (idx *Index) Stats() (Stats, error) {
207 var s Stats
208 err := idx.db.QueryRow(`SELECT COUNT(*) FROM files`).Scan(&s.Files)
209 if err != nil {
210 return s, err
211 }
212 err = idx.db.QueryRow(`SELECT COUNT(*) FROM chunks`).Scan(&s.Chunks)
213 return s, err
214}
215
// SetMetadata stores (or overwrites) a key/value pair in the metadata table.
func (idx *Index) SetMetadata(key, value string) error {
	_, err := idx.db.Exec(`INSERT OR REPLACE INTO metadata (key, value) VALUES (?, ?)`, key, value)
	return err
}
221
222// GetMetadata retrieves metadata
223func (idx *Index) GetMetadata(key string) (string, error) {
224 var value string
225 err := idx.db.QueryRow(`SELECT value FROM metadata WHERE key = ?`, key).Scan(&value)
226 if err == sql.ErrNoRows {
227 return "", nil
228 }
229 return value, err
230}
231
// serializeEmbedding packs a float32 vector into little-endian IEEE-754
// bytes, 4 bytes per component, for storage in a BLOB column.
func serializeEmbedding(embedding []float32) []byte {
	out := make([]byte, 4*len(embedding))
	for i := range embedding {
		binary.LittleEndian.PutUint32(out[4*i:], math.Float32bits(embedding[i]))
	}
	return out
}
240
// deserializeEmbedding unpacks little-endian IEEE-754 bytes (as written by
// serializeEmbedding) back into a float32 vector. Any trailing bytes that
// do not form a complete 4-byte group are ignored.
func deserializeEmbedding(data []byte) []float32 {
	out := make([]float32, len(data)/4)
	for i := range out {
		out[i] = math.Float32frombits(binary.LittleEndian.Uint32(data[4*i:]))
	}
	return out
}
251
// cosineDistance returns 1 - cosine_similarity, so identical directions
// score 0 and orthogonal vectors score 1. Mismatched lengths and
// zero-magnitude inputs conservatively return 1.0 ("not similar").
func cosineDistance(a, b []float32) float64 {
	if len(a) != len(b) {
		return 1.0
	}

	var dot, na, nb float64
	for i, av := range a {
		x, y := float64(av), float64(b[i])
		dot += x * y
		na += x * x
		nb += y * y
	}

	if na == 0 || nb == 0 {
		return 1.0
	}
	return 1.0 - dot/(math.Sqrt(na)*math.Sqrt(nb))
}
diff --git a/internal/walker/walker.go b/internal/walker/walker.go
new file mode 100644
index 0000000..0ac470d
--- /dev/null
+++ b/internal/walker/walker.go
@@ -0,0 +1,109 @@
1package walker
2
3import (
4 "os"
5 "path/filepath"
6 "strings"
7
8 ignore "github.com/sabhiram/go-gitignore"
9)
10
// DefaultIgnore patterns applied to all walks. Entries are directory
// prefixes relative to the walk root, with a trailing slash.
var DefaultIgnore = []string{
	"vendor/",
	"node_modules/",
	".git/",
	".codevec/",
}
18
// Walker walks a directory tree finding files to index
type Walker struct {
	root       string            // absolute path of the walk root
	extensions []string          // file extensions to accept, e.g., [".go"]; empty accepts all
	gitignore  *ignore.GitIgnore // parsed root .gitignore, or nil when absent/unreadable
}
25
26// New creates a walker for the given root directory
27func New(root string, extensions []string) (*Walker, error) {
28 root, err := filepath.Abs(root)
29 if err != nil {
30 return nil, err
31 }
32
33 w := &Walker{
34 root: root,
35 extensions: extensions,
36 }
37
38 // Load .gitignore if present
39 gitignorePath := filepath.Join(root, ".gitignore")
40 if _, err := os.Stat(gitignorePath); err == nil {
41 gi, err := ignore.CompileIgnoreFile(gitignorePath)
42 if err == nil {
43 w.gitignore = gi
44 }
45 }
46
47 return w, nil
48}
49
50// Walk returns all matching files in the directory tree
51func (w *Walker) Walk() ([]string, error) {
52 var files []string
53
54 err := filepath.WalkDir(w.root, func(path string, d os.DirEntry, err error) error {
55 if err != nil {
56 return err
57 }
58
59 // Get path relative to root for ignore matching
60 relPath, err := filepath.Rel(w.root, path)
61 if err != nil {
62 return err
63 }
64
65 // Skip default ignored directories
66 if d.IsDir() {
67 for _, pattern := range DefaultIgnore {
68 if strings.HasPrefix(relPath+"/", pattern) || relPath+"/" == pattern {
69 return filepath.SkipDir
70 }
71 }
72 }
73
74 // Skip if matched by .gitignore
75 if w.gitignore != nil && w.gitignore.MatchesPath(relPath) {
76 if d.IsDir() {
77 return filepath.SkipDir
78 }
79 return nil
80 }
81
82 // Skip directories and non-matching extensions
83 if d.IsDir() {
84 return nil
85 }
86
87 if !w.matchesExtension(path) {
88 return nil
89 }
90
91 files = append(files, path)
92 return nil
93 })
94
95 return files, err
96}
97
98func (w *Walker) matchesExtension(path string) bool {
99 if len(w.extensions) == 0 {
100 return true
101 }
102 ext := filepath.Ext(path)
103 for _, e := range w.extensions {
104 if ext == e {
105 return true
106 }
107 }
108 return false
109}