diff options
Diffstat (limited to 'internal')
| -rw-r--r-- | internal/chunker/chunker.go | 185 | ||||
| -rw-r--r-- | internal/embedder/embedder.go | 222 | ||||
| -rw-r--r-- | internal/index/index.go | 271 | ||||
| -rw-r--r-- | internal/walker/walker.go | 109 |
4 files changed, 787 insertions, 0 deletions
diff --git a/internal/chunker/chunker.go b/internal/chunker/chunker.go new file mode 100644 index 0000000..f8de08d --- /dev/null +++ b/internal/chunker/chunker.go | |||
| @@ -0,0 +1,185 @@ | |||
| 1 | package chunker | ||
| 2 | |||
| 3 | import ( | ||
| 4 | "crypto/sha256" | ||
| 5 | "fmt" | ||
| 6 | "os" | ||
| 7 | "strings" | ||
| 8 | |||
| 9 | sitter "github.com/smacker/go-tree-sitter" | ||
| 10 | "github.com/smacker/go-tree-sitter/golang" | ||
| 11 | ) | ||
| 12 | |||
// Chunk represents a semantically meaningful piece of code extracted
// from a single source file.
type Chunk struct {
	File      string // path of the source file the chunk came from
	StartLine int    // 1-based first line of the declaration in the file
	EndLine   int    // 1-based last line (inclusive)
	Type      string // "function", "method", "type"
	Name      string // identifier; methods are qualified like "(*Recv).Name"
	Content   string // exact source text of the declaration
	Hash      string // first 8 bytes of SHA-256(Content), hex-encoded (16 chars)
}
| 23 | |||
// Chunker extracts semantic chunks from source code.
type Chunker interface {
	// Chunk parses content (the bytes of the file at path) and returns
	// one chunk per supported top-level declaration.
	Chunk(path string, content []byte) ([]Chunk, error)
}
| 28 | |||
// GoChunker extracts chunks from Go source files using tree-sitter.
//
// NOTE(review): the embedded parser carries per-parse state, so a
// single GoChunker is presumably not safe for concurrent Chunk calls —
// confirm callers chunk files serially.
type GoChunker struct {
	parser *sitter.Parser // tree-sitter parser preconfigured for Go
}
| 33 | |||
| 34 | // NewGoChunker creates a new Go chunker | ||
| 35 | func NewGoChunker() *GoChunker { | ||
| 36 | parser := sitter.NewParser() | ||
| 37 | parser.SetLanguage(golang.GetLanguage()) | ||
| 38 | return &GoChunker{parser: parser} | ||
| 39 | } | ||
| 40 | |||
| 41 | // ChunkFile reads and chunks a file | ||
| 42 | func (c *GoChunker) ChunkFile(path string) ([]Chunk, error) { | ||
| 43 | content, err := os.ReadFile(path) | ||
| 44 | if err != nil { | ||
| 45 | return nil, err | ||
| 46 | } | ||
| 47 | return c.Chunk(path, content) | ||
| 48 | } | ||
| 49 | |||
| 50 | // Chunk extracts semantic chunks from Go source | ||
| 51 | func (c *GoChunker) Chunk(path string, content []byte) ([]Chunk, error) { | ||
| 52 | tree := c.parser.Parse(nil, content) | ||
| 53 | if tree == nil { | ||
| 54 | return nil, fmt.Errorf("failed to parse %s", path) | ||
| 55 | } | ||
| 56 | defer tree.Close() | ||
| 57 | |||
| 58 | var chunks []Chunk | ||
| 59 | root := tree.RootNode() | ||
| 60 | |||
| 61 | // Walk top-level declarations | ||
| 62 | for i := 0; i < int(root.ChildCount()); i++ { | ||
| 63 | node := root.Child(i) | ||
| 64 | chunk := c.extractChunk(node, content, path) | ||
| 65 | if chunk != nil { | ||
| 66 | chunks = append(chunks, *chunk) | ||
| 67 | } | ||
| 68 | } | ||
| 69 | |||
| 70 | return chunks, nil | ||
| 71 | } | ||
| 72 | |||
| 73 | func (c *GoChunker) extractChunk(node *sitter.Node, content []byte, path string) *Chunk { | ||
| 74 | nodeType := node.Type() | ||
| 75 | |||
| 76 | switch nodeType { | ||
| 77 | case "function_declaration": | ||
| 78 | return c.extractFunction(node, content, path) | ||
| 79 | case "method_declaration": | ||
| 80 | return c.extractMethod(node, content, path) | ||
| 81 | case "type_declaration": | ||
| 82 | return c.extractType(node, content, path) | ||
| 83 | } | ||
| 84 | |||
| 85 | return nil | ||
| 86 | } | ||
| 87 | |||
| 88 | func (c *GoChunker) extractFunction(node *sitter.Node, content []byte, path string) *Chunk { | ||
| 89 | nameNode := node.ChildByFieldName("name") | ||
| 90 | if nameNode == nil { | ||
| 91 | return nil | ||
| 92 | } | ||
| 93 | |||
| 94 | name := string(content[nameNode.StartByte():nameNode.EndByte()]) | ||
| 95 | text := string(content[node.StartByte():node.EndByte()]) | ||
| 96 | |||
| 97 | return &Chunk{ | ||
| 98 | File: path, | ||
| 99 | StartLine: int(node.StartPoint().Row) + 1, | ||
| 100 | EndLine: int(node.EndPoint().Row) + 1, | ||
| 101 | Type: "function", | ||
| 102 | Name: name, | ||
| 103 | Content: text, | ||
| 104 | Hash: hash(text), | ||
| 105 | } | ||
| 106 | } | ||
| 107 | |||
| 108 | func (c *GoChunker) extractMethod(node *sitter.Node, content []byte, path string) *Chunk { | ||
| 109 | nameNode := node.ChildByFieldName("name") | ||
| 110 | receiverNode := node.ChildByFieldName("receiver") | ||
| 111 | if nameNode == nil { | ||
| 112 | return nil | ||
| 113 | } | ||
| 114 | |||
| 115 | name := string(content[nameNode.StartByte():nameNode.EndByte()]) | ||
| 116 | |||
| 117 | // Build receiver prefix like (*Server) or (s Server) | ||
| 118 | if receiverNode != nil { | ||
| 119 | recvText := string(content[receiverNode.StartByte():receiverNode.EndByte()]) | ||
| 120 | // Extract type from receiver, e.g., "(s *Server)" -> "*Server" | ||
| 121 | recvType := extractReceiverType(recvText) | ||
| 122 | if recvType != "" { | ||
| 123 | name = fmt.Sprintf("(%s).%s", recvType, name) | ||
| 124 | } | ||
| 125 | } | ||
| 126 | |||
| 127 | text := string(content[node.StartByte():node.EndByte()]) | ||
| 128 | |||
| 129 | return &Chunk{ | ||
| 130 | File: path, | ||
| 131 | StartLine: int(node.StartPoint().Row) + 1, | ||
| 132 | EndLine: int(node.EndPoint().Row) + 1, | ||
| 133 | Type: "method", | ||
| 134 | Name: name, | ||
| 135 | Content: text, | ||
| 136 | Hash: hash(text), | ||
| 137 | } | ||
| 138 | } | ||
| 139 | |||
// extractType builds a chunk for a type declaration, named after the
// first type_spec found inside it.
//
// NOTE(review): a grouped declaration `type ( A ...; B ... )` contains
// several type_spec children, but this returns after the first one, so
// B and later specs are never indexed as separate chunks — confirm
// whether grouped type blocks need full coverage.
func (c *GoChunker) extractType(node *sitter.Node, content []byte, path string) *Chunk {
	// type_declaration contains type_spec children
	for i := 0; i < int(node.ChildCount()); i++ {
		child := node.Child(i)
		if child.Type() == "type_spec" {
			nameNode := child.ChildByFieldName("name")
			if nameNode == nil {
				continue
			}

			name := string(content[nameNode.StartByte():nameNode.EndByte()])
			// Content spans the whole declaration node, not just the spec.
			text := string(content[node.StartByte():node.EndByte()])

			return &Chunk{
				File:      path,
				StartLine: int(node.StartPoint().Row) + 1, // tree-sitter rows are 0-based
				EndLine:   int(node.EndPoint().Row) + 1,
				Type:      "type",
				Name:      name,
				Content:   text,
				Hash:      hash(text),
			}
		}
	}
	return nil
}
| 166 | |||
// extractReceiverType extracts the type from a receiver clause, e.g.
// "(s *Server)" -> "*Server". For an anonymous receiver like "(Server)"
// the single field is the type itself; an empty receiver yields "".
func extractReceiverType(recv string) string {
	inner := strings.TrimSpace(strings.TrimSuffix(strings.TrimPrefix(recv, "("), ")"))

	// The type is the last whitespace-separated field: either
	// "name Type" or just "Type".
	fields := strings.Fields(inner)
	if n := len(fields); n > 0 {
		return fields[n-1]
	}
	return ""
}
| 181 | |||
// hash returns a short content fingerprint: the first 8 bytes of the
// SHA-256 digest of s, hex-encoded (16 characters).
func hash(s string) string {
	sum := sha256.Sum256([]byte(s))
	return fmt.Sprintf("%x", sum[:8])
}
diff --git a/internal/embedder/embedder.go b/internal/embedder/embedder.go new file mode 100644 index 0000000..42f8518 --- /dev/null +++ b/internal/embedder/embedder.go | |||
| @@ -0,0 +1,222 @@ | |||
| 1 | package embedder | ||
| 2 | |||
| 3 | import ( | ||
| 4 | "bytes" | ||
| 5 | "context" | ||
| 6 | "encoding/json" | ||
| 7 | "fmt" | ||
| 8 | "net/http" | ||
| 9 | "os" | ||
| 10 | ) | ||
| 11 | |||
// Embedder generates embeddings for text.
type Embedder interface {
	// Embed returns one embedding vector per input text, in input order.
	Embed(ctx context.Context, texts []string) ([][]float32, error)
	// Dimensions returns the length of the vectors Embed produces.
	Dimensions() int
}
| 17 | |||
// OllamaEmbedder uses Ollama's embedding API (/api/embeddings).
type OllamaEmbedder struct {
	baseURL string // Ollama server URL, e.g. http://localhost:11434
	model   string // embedding model name, e.g. nomic-embed-text
	dims    int    // output vector length for the chosen model
}
| 24 | |||
| 25 | // NewOllamaEmbedder creates an Ollama embedder | ||
| 26 | func NewOllamaEmbedder(model string) *OllamaEmbedder { | ||
| 27 | baseURL := os.Getenv("CODEVEC_BASE_URL") | ||
| 28 | if baseURL == "" { | ||
| 29 | baseURL = "http://localhost:11434" | ||
| 30 | } | ||
| 31 | if model == "" { | ||
| 32 | model = "nomic-embed-text" | ||
| 33 | } | ||
| 34 | |||
| 35 | // Model dimensions | ||
| 36 | dims := 768 // nomic-embed-text default | ||
| 37 | switch model { | ||
| 38 | case "mxbai-embed-large": | ||
| 39 | dims = 1024 | ||
| 40 | case "all-minilm": | ||
| 41 | dims = 384 | ||
| 42 | } | ||
| 43 | |||
| 44 | return &OllamaEmbedder{ | ||
| 45 | baseURL: baseURL, | ||
| 46 | model: model, | ||
| 47 | dims: dims, | ||
| 48 | } | ||
| 49 | } | ||
| 50 | |||
// Dimensions returns the embedding vector length for the configured model.
func (e *OllamaEmbedder) Dimensions() int {
	return e.dims
}
| 54 | |||
// ollamaRequest is the JSON body for Ollama's /api/embeddings endpoint,
// which embeds a single prompt per call.
type ollamaRequest struct {
	Model  string `json:"model"`  // embedding model name
	Prompt string `json:"prompt"` // text to embed
}

// ollamaResponse is the JSON reply from /api/embeddings.
type ollamaResponse struct {
	Embedding []float32 `json:"embedding"`
}
| 63 | |||
| 64 | func (e *OllamaEmbedder) Embed(ctx context.Context, texts []string) ([][]float32, error) { | ||
| 65 | embeddings := make([][]float32, len(texts)) | ||
| 66 | |||
| 67 | // Ollama's /api/embeddings takes one prompt at a time | ||
| 68 | for i, text := range texts { | ||
| 69 | req := ollamaRequest{ | ||
| 70 | Model: e.model, | ||
| 71 | Prompt: text, | ||
| 72 | } | ||
| 73 | |||
| 74 | body, err := json.Marshal(req) | ||
| 75 | if err != nil { | ||
| 76 | return nil, err | ||
| 77 | } | ||
| 78 | |||
| 79 | httpReq, err := http.NewRequestWithContext(ctx, "POST", e.baseURL+"/api/embeddings", bytes.NewReader(body)) | ||
| 80 | if err != nil { | ||
| 81 | return nil, err | ||
| 82 | } | ||
| 83 | httpReq.Header.Set("Content-Type", "application/json") | ||
| 84 | |||
| 85 | resp, err := http.DefaultClient.Do(httpReq) | ||
| 86 | if err != nil { | ||
| 87 | return nil, fmt.Errorf("ollama request failed: %w", err) | ||
| 88 | } | ||
| 89 | defer resp.Body.Close() | ||
| 90 | |||
| 91 | if resp.StatusCode != http.StatusOK { | ||
| 92 | return nil, fmt.Errorf("ollama returned status %d", resp.StatusCode) | ||
| 93 | } | ||
| 94 | |||
| 95 | var result ollamaResponse | ||
| 96 | if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { | ||
| 97 | return nil, err | ||
| 98 | } | ||
| 99 | |||
| 100 | embeddings[i] = result.Embedding | ||
| 101 | } | ||
| 102 | |||
| 103 | return embeddings, nil | ||
| 104 | } | ||
| 105 | |||
// OpenAIEmbedder uses an OpenAI-compatible embedding API (/v1/embeddings).
type OpenAIEmbedder struct {
	baseURL string // API base URL, e.g. https://api.openai.com
	apiKey  string // bearer token from CODEVEC_API_KEY; required by Embed
	model   string // embedding model name
	dims    int    // output vector length for the chosen model
}
| 113 | |||
| 114 | // NewOpenAIEmbedder creates an OpenAI-compatible embedder | ||
| 115 | func NewOpenAIEmbedder(model string) *OpenAIEmbedder { | ||
| 116 | baseURL := os.Getenv("CODEVEC_BASE_URL") | ||
| 117 | if baseURL == "" { | ||
| 118 | baseURL = "https://api.openai.com" | ||
| 119 | } | ||
| 120 | apiKey := os.Getenv("CODEVEC_API_KEY") | ||
| 121 | if model == "" { | ||
| 122 | model = "text-embedding-3-small" | ||
| 123 | } | ||
| 124 | |||
| 125 | dims := 1536 // text-embedding-3-small default | ||
| 126 | switch model { | ||
| 127 | case "text-embedding-3-large": | ||
| 128 | dims = 3072 | ||
| 129 | case "text-embedding-ada-002": | ||
| 130 | dims = 1536 | ||
| 131 | } | ||
| 132 | |||
| 133 | return &OpenAIEmbedder{ | ||
| 134 | baseURL: baseURL, | ||
| 135 | apiKey: apiKey, | ||
| 136 | model: model, | ||
| 137 | dims: dims, | ||
| 138 | } | ||
| 139 | } | ||
| 140 | |||
// Dimensions returns the embedding vector length for the configured model.
func (e *OpenAIEmbedder) Dimensions() int {
	return e.dims
}
| 144 | |||
// openaiRequest is the JSON body for the /v1/embeddings endpoint; the
// API accepts a batch of inputs per call.
type openaiRequest struct {
	Model string   `json:"model"` // embedding model name
	Input []string `json:"input"` // batch of texts to embed
}

// openaiResponse is the JSON reply from /v1/embeddings; Data holds one
// entry per input.
type openaiResponse struct {
	Data []struct {
		Embedding []float32 `json:"embedding"`
	} `json:"data"`
}
| 155 | |||
| 156 | func (e *OpenAIEmbedder) Embed(ctx context.Context, texts []string) ([][]float32, error) { | ||
| 157 | if e.apiKey == "" { | ||
| 158 | return nil, fmt.Errorf("CODEVEC_API_KEY not set") | ||
| 159 | } | ||
| 160 | |||
| 161 | // Batch in groups of 100 | ||
| 162 | const batchSize = 100 | ||
| 163 | embeddings := make([][]float32, len(texts)) | ||
| 164 | |||
| 165 | for start := 0; start < len(texts); start += batchSize { | ||
| 166 | end := start + batchSize | ||
| 167 | if end > len(texts) { | ||
| 168 | end = len(texts) | ||
| 169 | } | ||
| 170 | batch := texts[start:end] | ||
| 171 | |||
| 172 | req := openaiRequest{ | ||
| 173 | Model: e.model, | ||
| 174 | Input: batch, | ||
| 175 | } | ||
| 176 | |||
| 177 | body, err := json.Marshal(req) | ||
| 178 | if err != nil { | ||
| 179 | return nil, err | ||
| 180 | } | ||
| 181 | |||
| 182 | httpReq, err := http.NewRequestWithContext(ctx, "POST", e.baseURL+"/v1/embeddings", bytes.NewReader(body)) | ||
| 183 | if err != nil { | ||
| 184 | return nil, err | ||
| 185 | } | ||
| 186 | httpReq.Header.Set("Content-Type", "application/json") | ||
| 187 | httpReq.Header.Set("Authorization", "Bearer "+e.apiKey) | ||
| 188 | |||
| 189 | resp, err := http.DefaultClient.Do(httpReq) | ||
| 190 | if err != nil { | ||
| 191 | return nil, fmt.Errorf("openai request failed: %w", err) | ||
| 192 | } | ||
| 193 | defer resp.Body.Close() | ||
| 194 | |||
| 195 | if resp.StatusCode != http.StatusOK { | ||
| 196 | return nil, fmt.Errorf("openai returned status %d", resp.StatusCode) | ||
| 197 | } | ||
| 198 | |||
| 199 | var result openaiResponse | ||
| 200 | if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { | ||
| 201 | return nil, err | ||
| 202 | } | ||
| 203 | |||
| 204 | for i, d := range result.Data { | ||
| 205 | embeddings[start+i] = d.Embedding | ||
| 206 | } | ||
| 207 | } | ||
| 208 | |||
| 209 | return embeddings, nil | ||
| 210 | } | ||
| 211 | |||
| 212 | // New creates an embedder based on provider name | ||
| 213 | func New(provider, model string) (Embedder, error) { | ||
| 214 | switch provider { | ||
| 215 | case "ollama": | ||
| 216 | return NewOllamaEmbedder(model), nil | ||
| 217 | case "openai": | ||
| 218 | return NewOpenAIEmbedder(model), nil | ||
| 219 | default: | ||
| 220 | return nil, fmt.Errorf("unknown provider: %s", provider) | ||
| 221 | } | ||
| 222 | } | ||
diff --git a/internal/index/index.go b/internal/index/index.go new file mode 100644 index 0000000..008e487 --- /dev/null +++ b/internal/index/index.go | |||
| @@ -0,0 +1,271 @@ | |||
| 1 | package index | ||
| 2 | |||
import (
	"database/sql"
	"encoding/binary"
	"errors"
	"math"
	"os"
	"path/filepath"
	"sort"

	_ "modernc.org/sqlite"

	"code.northwest.io/codevec/internal/chunker"
)
| 15 | |||
// Index stores chunks and their embeddings in a SQLite database.
type Index struct {
	db   *sql.DB
	dims int // expected embedding dimensionality (informational; not checked on insert)
}
| 21 | |||
| 22 | // Open opens or creates an index at the given path | ||
| 23 | func Open(path string, dims int) (*Index, error) { | ||
| 24 | // Ensure directory exists | ||
| 25 | dir := filepath.Dir(path) | ||
| 26 | if err := os.MkdirAll(dir, 0755); err != nil { | ||
| 27 | return nil, err | ||
| 28 | } | ||
| 29 | |||
| 30 | db, err := sql.Open("sqlite", path) | ||
| 31 | if err != nil { | ||
| 32 | return nil, err | ||
| 33 | } | ||
| 34 | |||
| 35 | idx := &Index{db: db, dims: dims} | ||
| 36 | if err := idx.init(); err != nil { | ||
| 37 | db.Close() | ||
| 38 | return nil, err | ||
| 39 | } | ||
| 40 | |||
| 41 | return idx, nil | ||
| 42 | } | ||
| 43 | |||
// init creates the schema if it does not already exist: the chunks
// table (one row per extracted declaration, embedding stored as a
// little-endian float32 blob — see serializeEmbedding), the files table
// (content hash per indexed file, used for change detection), the
// metadata key/value table, and an index on chunks.file for fast
// per-file deletion.
func (idx *Index) init() error {
	// Create chunks table with embedding column
	_, err := idx.db.Exec(`
	CREATE TABLE IF NOT EXISTS chunks (
		id INTEGER PRIMARY KEY,
		file TEXT NOT NULL,
		start_line INTEGER NOT NULL,
		end_line INTEGER NOT NULL,
		chunk_type TEXT,
		name TEXT,
		content TEXT NOT NULL,
		hash TEXT NOT NULL,
		embedding BLOB,
		created_at INTEGER DEFAULT (unixepoch())
	)
	`)
	if err != nil {
		return err
	}

	// Create files table for tracking indexed files
	_, err = idx.db.Exec(`
	CREATE TABLE IF NOT EXISTS files (
		path TEXT PRIMARY KEY,
		hash TEXT NOT NULL,
		indexed_at INTEGER DEFAULT (unixepoch())
	)
	`)
	if err != nil {
		return err
	}

	// Create metadata table
	_, err = idx.db.Exec(`
	CREATE TABLE IF NOT EXISTS metadata (
		key TEXT PRIMARY KEY,
		value TEXT
	)
	`)
	if err != nil {
		return err
	}

	// Index on file for faster deletion
	_, err = idx.db.Exec(`CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file)`)
	return err
}
| 91 | |||
// Close closes the underlying database handle.
func (idx *Index) Close() error {
	return idx.db.Close()
}
| 96 | |||
// InsertChunk stores a chunk together with its serialized embedding.
// NOTE(review): a nil embedding serializes to an empty (zero-length)
// blob rather than SQL NULL, so such rows presumably still pass
// Search's `embedding IS NOT NULL` filter — confirm callers always pass
// a real vector.
func (idx *Index) InsertChunk(chunk chunker.Chunk, embedding []float32) error {
	embeddingBlob := serializeEmbedding(embedding)
	_, err := idx.db.Exec(`
	INSERT INTO chunks (file, start_line, end_line, chunk_type, name, content, hash, embedding)
	VALUES (?, ?, ?, ?, ?, ?, ?, ?)
	`, chunk.File, chunk.StartLine, chunk.EndLine, chunk.Type, chunk.Name, chunk.Content, chunk.Hash, embeddingBlob)
	return err
}
| 106 | |||
// SearchResult pairs a chunk with its cosine distance from the query
// embedding (lower means more similar).
type SearchResult struct {
	Chunk    chunker.Chunk
	Distance float64
}
| 112 | |||
| 113 | // Search finds chunks similar to the query embedding using cosine similarity | ||
| 114 | func (idx *Index) Search(queryEmb []float32, limit int) ([]SearchResult, error) { | ||
| 115 | // Load all embeddings | ||
| 116 | rows, err := idx.db.Query(` | ||
| 117 | SELECT id, file, start_line, end_line, chunk_type, name, content, hash, embedding | ||
| 118 | FROM chunks | ||
| 119 | WHERE embedding IS NOT NULL | ||
| 120 | `) | ||
| 121 | if err != nil { | ||
| 122 | return nil, err | ||
| 123 | } | ||
| 124 | defer rows.Close() | ||
| 125 | |||
| 126 | type candidate struct { | ||
| 127 | chunk chunker.Chunk | ||
| 128 | distance float64 | ||
| 129 | } | ||
| 130 | var candidates []candidate | ||
| 131 | |||
| 132 | for rows.Next() { | ||
| 133 | var id int64 | ||
| 134 | var c chunker.Chunk | ||
| 135 | var embBlob []byte | ||
| 136 | err := rows.Scan(&id, &c.File, &c.StartLine, &c.EndLine, &c.Type, &c.Name, &c.Content, &c.Hash, &embBlob) | ||
| 137 | if err != nil { | ||
| 138 | return nil, err | ||
| 139 | } | ||
| 140 | |||
| 141 | emb := deserializeEmbedding(embBlob) | ||
| 142 | dist := cosineDistance(queryEmb, emb) | ||
| 143 | candidates = append(candidates, candidate{chunk: c, distance: dist}) | ||
| 144 | } | ||
| 145 | |||
| 146 | if err := rows.Err(); err != nil { | ||
| 147 | return nil, err | ||
| 148 | } | ||
| 149 | |||
| 150 | // Sort by distance (lower is better) | ||
| 151 | sort.Slice(candidates, func(i, j int) bool { | ||
| 152 | return candidates[i].distance < candidates[j].distance | ||
| 153 | }) | ||
| 154 | |||
| 155 | // Return top-k | ||
| 156 | if limit > len(candidates) { | ||
| 157 | limit = len(candidates) | ||
| 158 | } | ||
| 159 | |||
| 160 | results := make([]SearchResult, limit) | ||
| 161 | for i := 0; i < limit; i++ { | ||
| 162 | results[i] = SearchResult{ | ||
| 163 | Chunk: candidates[i].chunk, | ||
| 164 | Distance: candidates[i].distance, | ||
| 165 | } | ||
| 166 | } | ||
| 167 | |||
| 168 | return results, nil | ||
| 169 | } | ||
| 170 | |||
| 171 | // GetFileHash returns the stored hash for a file, or empty string if not indexed | ||
| 172 | func (idx *Index) GetFileHash(path string) (string, error) { | ||
| 173 | var hash string | ||
| 174 | err := idx.db.QueryRow(`SELECT hash FROM files WHERE path = ?`, path).Scan(&hash) | ||
| 175 | if err == sql.ErrNoRows { | ||
| 176 | return "", nil | ||
| 177 | } | ||
| 178 | return hash, err | ||
| 179 | } | ||
| 180 | |||
// SetFileHash records (or replaces) the content hash for path, stamping
// indexed_at with the current Unix time.
func (idx *Index) SetFileHash(path, hash string) error {
	_, err := idx.db.Exec(`
	INSERT OR REPLACE INTO files (path, hash, indexed_at)
	VALUES (?, ?, unixepoch())
	`, path, hash)
	return err
}
| 189 | |||
| 190 | // DeleteChunksForFile removes all chunks for a file | ||
| 191 | func (idx *Index) DeleteChunksForFile(path string) error { | ||
| 192 | _, err := idx.db.Exec(`DELETE FROM chunks WHERE file = ?`, path) | ||
| 193 | if err != nil { | ||
| 194 | return err | ||
| 195 | } | ||
| 196 | _, err = idx.db.Exec(`DELETE FROM files WHERE path = ?`, path) | ||
| 197 | return err | ||
| 198 | } | ||
| 199 | |||
// Stats summarizes the contents of the index.
type Stats struct {
	Files  int // number of rows in the files table
	Chunks int // number of rows in the chunks table
}
| 205 | |||
| 206 | func (idx *Index) Stats() (Stats, error) { | ||
| 207 | var s Stats | ||
| 208 | err := idx.db.QueryRow(`SELECT COUNT(*) FROM files`).Scan(&s.Files) | ||
| 209 | if err != nil { | ||
| 210 | return s, err | ||
| 211 | } | ||
| 212 | err = idx.db.QueryRow(`SELECT COUNT(*) FROM chunks`).Scan(&s.Chunks) | ||
| 213 | return s, err | ||
| 214 | } | ||
| 215 | |||
// SetMetadata stores a key/value pair, replacing any existing value for key.
func (idx *Index) SetMetadata(key, value string) error {
	_, err := idx.db.Exec(`INSERT OR REPLACE INTO metadata (key, value) VALUES (?, ?)`, key, value)
	return err
}
| 221 | |||
| 222 | // GetMetadata retrieves metadata | ||
| 223 | func (idx *Index) GetMetadata(key string) (string, error) { | ||
| 224 | var value string | ||
| 225 | err := idx.db.QueryRow(`SELECT value FROM metadata WHERE key = ?`, key).Scan(&value) | ||
| 226 | if err == sql.ErrNoRows { | ||
| 227 | return "", nil | ||
| 228 | } | ||
| 229 | return value, err | ||
| 230 | } | ||
| 231 | |||
// serializeEmbedding packs a float32 vector into a byte blob, 4 bytes
// per component, little-endian IEEE-754 bits.
func serializeEmbedding(embedding []float32) []byte {
	buf := make([]byte, 4*len(embedding))
	for i, v := range embedding {
		offset := 4 * i
		binary.LittleEndian.PutUint32(buf[offset:offset+4], math.Float32bits(v))
	}
	return buf
}
| 240 | |||
// deserializeEmbedding unpacks a little-endian float32 blob produced by
// serializeEmbedding back into a vector. Trailing bytes that do not
// fill a full 4-byte component are ignored.
func deserializeEmbedding(data []byte) []float32 {
	result := make([]float32, len(data)/4)
	for i := range result {
		result[i] = math.Float32frombits(binary.LittleEndian.Uint32(data[4*i:]))
	}
	return result
}
| 251 | |||
// cosineDistance computes 1 - cosine_similarity, so identical
// directions score 0 and orthogonal vectors score 1. Mismatched lengths
// and zero-norm vectors are treated as maximally distant (1.0).
func cosineDistance(a, b []float32) float64 {
	if len(a) != len(b) {
		return 1.0
	}

	var dot, sumA, sumB float64
	for i := range a {
		x, y := float64(a[i]), float64(b[i])
		dot += x * y
		sumA += x * x
		sumB += y * y
	}

	// Zero-norm vectors have no direction; avoid dividing by zero.
	if sumA == 0 || sumB == 0 {
		return 1.0
	}
	return 1.0 - dot/(math.Sqrt(sumA)*math.Sqrt(sumB))
}
diff --git a/internal/walker/walker.go b/internal/walker/walker.go new file mode 100644 index 0000000..0ac470d --- /dev/null +++ b/internal/walker/walker.go | |||
| @@ -0,0 +1,109 @@ | |||
| 1 | package walker | ||
| 2 | |||
| 3 | import ( | ||
| 4 | "os" | ||
| 5 | "path/filepath" | ||
| 6 | "strings" | ||
| 7 | |||
| 8 | ignore "github.com/sabhiram/go-gitignore" | ||
| 9 | ) | ||
| 10 | |||
// DefaultIgnore lists directory patterns skipped on every walk,
// regardless of .gitignore contents.
var DefaultIgnore = []string{
	"vendor/",
	"node_modules/",
	".git/",
	".codevec/",
}
| 18 | |||
// Walker walks a directory tree finding files to index.
type Walker struct {
	root       string            // absolute root of the walk
	extensions []string          // extensions to match, e.g. [".go"]; empty matches all files
	gitignore  *ignore.GitIgnore // compiled root .gitignore, or nil if absent/unreadable
}
| 25 | |||
| 26 | // New creates a walker for the given root directory | ||
| 27 | func New(root string, extensions []string) (*Walker, error) { | ||
| 28 | root, err := filepath.Abs(root) | ||
| 29 | if err != nil { | ||
| 30 | return nil, err | ||
| 31 | } | ||
| 32 | |||
| 33 | w := &Walker{ | ||
| 34 | root: root, | ||
| 35 | extensions: extensions, | ||
| 36 | } | ||
| 37 | |||
| 38 | // Load .gitignore if present | ||
| 39 | gitignorePath := filepath.Join(root, ".gitignore") | ||
| 40 | if _, err := os.Stat(gitignorePath); err == nil { | ||
| 41 | gi, err := ignore.CompileIgnoreFile(gitignorePath) | ||
| 42 | if err == nil { | ||
| 43 | w.gitignore = gi | ||
| 44 | } | ||
| 45 | } | ||
| 46 | |||
| 47 | return w, nil | ||
| 48 | } | ||
| 49 | |||
| 50 | // Walk returns all matching files in the directory tree | ||
| 51 | func (w *Walker) Walk() ([]string, error) { | ||
| 52 | var files []string | ||
| 53 | |||
| 54 | err := filepath.WalkDir(w.root, func(path string, d os.DirEntry, err error) error { | ||
| 55 | if err != nil { | ||
| 56 | return err | ||
| 57 | } | ||
| 58 | |||
| 59 | // Get path relative to root for ignore matching | ||
| 60 | relPath, err := filepath.Rel(w.root, path) | ||
| 61 | if err != nil { | ||
| 62 | return err | ||
| 63 | } | ||
| 64 | |||
| 65 | // Skip default ignored directories | ||
| 66 | if d.IsDir() { | ||
| 67 | for _, pattern := range DefaultIgnore { | ||
| 68 | if strings.HasPrefix(relPath+"/", pattern) || relPath+"/" == pattern { | ||
| 69 | return filepath.SkipDir | ||
| 70 | } | ||
| 71 | } | ||
| 72 | } | ||
| 73 | |||
| 74 | // Skip if matched by .gitignore | ||
| 75 | if w.gitignore != nil && w.gitignore.MatchesPath(relPath) { | ||
| 76 | if d.IsDir() { | ||
| 77 | return filepath.SkipDir | ||
| 78 | } | ||
| 79 | return nil | ||
| 80 | } | ||
| 81 | |||
| 82 | // Skip directories and non-matching extensions | ||
| 83 | if d.IsDir() { | ||
| 84 | return nil | ||
| 85 | } | ||
| 86 | |||
| 87 | if !w.matchesExtension(path) { | ||
| 88 | return nil | ||
| 89 | } | ||
| 90 | |||
| 91 | files = append(files, path) | ||
| 92 | return nil | ||
| 93 | }) | ||
| 94 | |||
| 95 | return files, err | ||
| 96 | } | ||
| 97 | |||
| 98 | func (w *Walker) matchesExtension(path string) bool { | ||
| 99 | if len(w.extensions) == 0 { | ||
| 100 | return true | ||
| 101 | } | ||
| 102 | ext := filepath.Ext(path) | ||
| 103 | for _, e := range w.extensions { | ||
| 104 | if ext == e { | ||
| 105 | return true | ||
| 106 | } | ||
| 107 | } | ||
| 108 | return false | ||
| 109 | } | ||
