diff options
| -rw-r--r-- | .gitignore | 2 | ||||
| -rw-r--r-- | Makefile | 13 | ||||
| -rw-r--r-- | cmd/codevec/main.go | 369 | ||||
| -rw-r--r-- | go.mod | 26 | ||||
| -rw-r--r-- | go.sum | 79 | ||||
| -rw-r--r-- | internal/chunker/chunker.go | 185 | ||||
| -rw-r--r-- | internal/embedder/embedder.go | 222 | ||||
| -rw-r--r-- | internal/index/index.go | 271 | ||||
| -rw-r--r-- | internal/walker/walker.go | 109 |
9 files changed, 1276 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..96f1663 --- /dev/null +++ b/.gitignore | |||
| @@ -0,0 +1,2 @@ | |||
| 1 | bin/ | ||
| 2 | .codevec/ | ||
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..5f37491 --- /dev/null +++ b/Makefile | |||
| @@ -0,0 +1,13 @@ | |||
.PHONY: build install clean

BINARY := codevec
BUILD_DIR := bin

# Compile the CLI into $(BUILD_DIR)/$(BINARY).
build:
	go build -o $(BUILD_DIR)/$(BINARY) ./cmd/codevec

# Build, then copy the binary into the user's local bin directory.
install: build
	cp $(BUILD_DIR)/$(BINARY) ~/.local/bin/

# Remove all build artifacts.
clean:
	rm -rf $(BUILD_DIR)
diff --git a/cmd/codevec/main.go b/cmd/codevec/main.go new file mode 100644 index 0000000..8337367 --- /dev/null +++ b/cmd/codevec/main.go | |||
| @@ -0,0 +1,369 @@ | |||
| 1 | package main | ||
| 2 | |||
| 3 | import ( | ||
| 4 | "context" | ||
| 5 | "crypto/sha256" | ||
| 6 | "encoding/json" | ||
| 7 | "fmt" | ||
| 8 | "os" | ||
| 9 | "path/filepath" | ||
| 10 | "strings" | ||
| 11 | |||
| 12 | "github.com/spf13/cobra" | ||
| 13 | |||
| 14 | "code.northwest.io/codevec/internal/chunker" | ||
| 15 | "code.northwest.io/codevec/internal/embedder" | ||
| 16 | "code.northwest.io/codevec/internal/index" | ||
| 17 | "code.northwest.io/codevec/internal/walker" | ||
| 18 | ) | ||
| 19 | |||
// codevecDir is the hidden directory, created under the indexed root,
// that holds all codevec state.
const codevecDir = ".codevec"

// indexFile is the database file stored inside codevecDir.
const indexFile = "index.db"

// rootCmd is the top-level CLI command; subcommands are attached in init.
var rootCmd = &cobra.Command{
	Use:   "codevec",
	Short: "Semantic code search via embeddings",
	Long:  `Index your codebase and query by concept. Get relevant code chunks with file paths and line numbers.`,
}

// indexCmd implements `codevec index [path]` (default path: ".").
var indexCmd = &cobra.Command{
	Use:   "index [path]",
	Short: "Index a directory for semantic search",
	Args:  cobra.MaximumNArgs(1),
	RunE:  runIndex,
}

// queryCmd implements `codevec query <text>`.
var queryCmd = &cobra.Command{
	Use:   "query <text>",
	Short: "Search for relevant code",
	Args:  cobra.ExactArgs(1),
	RunE:  runQuery,
}

// statusCmd implements `codevec status`.
var statusCmd = &cobra.Command{
	Use:   "status",
	Short: "Show index statistics",
	RunE:  runStatus,
}
| 48 | |||
| 49 | func init() { | ||
| 50 | // index flags | ||
| 51 | indexCmd.Flags().BoolP("force", "f", false, "Re-index everything") | ||
| 52 | indexCmd.Flags().BoolP("verbose", "v", false, "Show progress") | ||
| 53 | indexCmd.Flags().StringP("provider", "p", "ollama", "Embedding provider (ollama, openai)") | ||
| 54 | indexCmd.Flags().StringP("model", "m", "", "Embedding model (default: provider-specific)") | ||
| 55 | |||
| 56 | // query flags | ||
| 57 | queryCmd.Flags().IntP("limit", "l", 10, "Max results") | ||
| 58 | queryCmd.Flags().Float64P("threshold", "t", 0.0, "Min similarity score (0-1)") | ||
| 59 | queryCmd.Flags().BoolP("show", "s", false, "Print chunk content") | ||
| 60 | queryCmd.Flags().Bool("json", false, "Output as JSON") | ||
| 61 | |||
| 62 | rootCmd.AddCommand(indexCmd) | ||
| 63 | rootCmd.AddCommand(queryCmd) | ||
| 64 | rootCmd.AddCommand(statusCmd) | ||
| 65 | } | ||
| 66 | |||
| 67 | func main() { | ||
| 68 | if err := rootCmd.Execute(); err != nil { | ||
| 69 | os.Exit(1) | ||
| 70 | } | ||
| 71 | } | ||
| 72 | |||
| 73 | func runIndex(cmd *cobra.Command, args []string) error { | ||
| 74 | path := "." | ||
| 75 | if len(args) > 0 { | ||
| 76 | path = args[0] | ||
| 77 | } | ||
| 78 | |||
| 79 | force, _ := cmd.Flags().GetBool("force") | ||
| 80 | verbose, _ := cmd.Flags().GetBool("verbose") | ||
| 81 | provider, _ := cmd.Flags().GetString("provider") | ||
| 82 | model, _ := cmd.Flags().GetString("model") | ||
| 83 | |||
| 84 | // Resolve absolute path | ||
| 85 | absPath, err := filepath.Abs(path) | ||
| 86 | if err != nil { | ||
| 87 | return err | ||
| 88 | } | ||
| 89 | |||
| 90 | // Create embedder | ||
| 91 | emb, err := embedder.New(provider, model) | ||
| 92 | if err != nil { | ||
| 93 | return err | ||
| 94 | } | ||
| 95 | |||
| 96 | // Open index | ||
| 97 | indexPath := filepath.Join(absPath, codevecDir, indexFile) | ||
| 98 | idx, err := index.Open(indexPath, emb.Dimensions()) | ||
| 99 | if err != nil { | ||
| 100 | return fmt.Errorf("failed to open index: %w", err) | ||
| 101 | } | ||
| 102 | defer idx.Close() | ||
| 103 | |||
| 104 | // Store metadata | ||
| 105 | idx.SetMetadata("provider", provider) | ||
| 106 | if model != "" { | ||
| 107 | idx.SetMetadata("model", model) | ||
| 108 | } | ||
| 109 | idx.SetMetadata("dimensions", fmt.Sprintf("%d", emb.Dimensions())) | ||
| 110 | |||
| 111 | // Walk directory | ||
| 112 | w, err := walker.New(absPath, []string{".go"}) | ||
| 113 | if err != nil { | ||
| 114 | return err | ||
| 115 | } | ||
| 116 | |||
| 117 | files, err := w.Walk() | ||
| 118 | if err != nil { | ||
| 119 | return err | ||
| 120 | } | ||
| 121 | |||
| 122 | if verbose { | ||
| 123 | fmt.Printf("Found %d Go files\n", len(files)) | ||
| 124 | } | ||
| 125 | |||
| 126 | // Create chunker | ||
| 127 | goChunker := chunker.NewGoChunker() | ||
| 128 | |||
| 129 | // Process files | ||
| 130 | var totalChunks int | ||
| 131 | var skipped int | ||
| 132 | ctx := context.Background() | ||
| 133 | |||
| 134 | for _, file := range files { | ||
| 135 | // Read file content | ||
| 136 | content, err := os.ReadFile(file) | ||
| 137 | if err != nil { | ||
| 138 | fmt.Fprintf(os.Stderr, "Warning: failed to read %s: %v\n", file, err) | ||
| 139 | continue | ||
| 140 | } | ||
| 141 | |||
| 142 | // Compute file hash | ||
| 143 | fileHash := fmt.Sprintf("%x", sha256.Sum256(content)) | ||
| 144 | |||
| 145 | // Check if already indexed | ||
| 146 | if !force { | ||
| 147 | existingHash, _ := idx.GetFileHash(file) | ||
| 148 | if existingHash == fileHash { | ||
| 149 | skipped++ | ||
| 150 | continue | ||
| 151 | } | ||
| 152 | } | ||
| 153 | |||
| 154 | // Delete old chunks for this file | ||
| 155 | idx.DeleteChunksForFile(file) | ||
| 156 | |||
| 157 | // Chunk file | ||
| 158 | chunks, err := goChunker.Chunk(file, content) | ||
| 159 | if err != nil { | ||
| 160 | fmt.Fprintf(os.Stderr, "Warning: failed to parse %s: %v\n", file, err) | ||
| 161 | continue | ||
| 162 | } | ||
| 163 | |||
| 164 | if len(chunks) == 0 { | ||
| 165 | continue | ||
| 166 | } | ||
| 167 | |||
| 168 | // Generate embeddings | ||
| 169 | texts := make([]string, len(chunks)) | ||
| 170 | for i, c := range chunks { | ||
| 171 | // Include file path and name for context | ||
| 172 | relPath, _ := filepath.Rel(absPath, c.File) | ||
| 173 | texts[i] = fmt.Sprintf("File: %s\n%s %s\n\n%s", relPath, c.Type, c.Name, c.Content) | ||
| 174 | } | ||
| 175 | |||
| 176 | embeddings, err := emb.Embed(ctx, texts) | ||
| 177 | if err != nil { | ||
| 178 | return fmt.Errorf("embedding failed for %s: %w", file, err) | ||
| 179 | } | ||
| 180 | |||
| 181 | // Store chunks and embeddings | ||
| 182 | for i, chunk := range chunks { | ||
| 183 | if err := idx.InsertChunk(chunk, embeddings[i]); err != nil { | ||
| 184 | return fmt.Errorf("failed to insert chunk: %w", err) | ||
| 185 | } | ||
| 186 | } | ||
| 187 | |||
| 188 | // Update file hash | ||
| 189 | idx.SetFileHash(file, fileHash) | ||
| 190 | |||
| 191 | totalChunks += len(chunks) | ||
| 192 | if verbose { | ||
| 193 | relPath, _ := filepath.Rel(absPath, file) | ||
| 194 | fmt.Printf(" %s: %d chunks\n", relPath, len(chunks)) | ||
| 195 | } | ||
| 196 | } | ||
| 197 | |||
| 198 | fmt.Printf("Indexed %d chunks from %d files", totalChunks, len(files)-skipped) | ||
| 199 | if skipped > 0 { | ||
| 200 | fmt.Printf(" (%d unchanged)", skipped) | ||
| 201 | } | ||
| 202 | fmt.Println() | ||
| 203 | |||
| 204 | return nil | ||
| 205 | } | ||
| 206 | |||
| 207 | func runQuery(cmd *cobra.Command, args []string) error { | ||
| 208 | query := args[0] | ||
| 209 | limit, _ := cmd.Flags().GetInt("limit") | ||
| 210 | threshold, _ := cmd.Flags().GetFloat64("threshold") | ||
| 211 | show, _ := cmd.Flags().GetBool("show") | ||
| 212 | jsonOutput, _ := cmd.Flags().GetBool("json") | ||
| 213 | |||
| 214 | // Find index | ||
| 215 | cwd, err := os.Getwd() | ||
| 216 | if err != nil { | ||
| 217 | return err | ||
| 218 | } | ||
| 219 | indexPath := filepath.Join(cwd, codevecDir, indexFile) | ||
| 220 | |||
| 221 | if _, err := os.Stat(indexPath); os.IsNotExist(err) { | ||
| 222 | return fmt.Errorf("no index found. Run 'codevec index' first") | ||
| 223 | } | ||
| 224 | |||
| 225 | // Get provider/model from metadata | ||
| 226 | idx, err := index.Open(indexPath, 768) // temp dims, we'll read from metadata | ||
| 227 | if err != nil { | ||
| 228 | return err | ||
| 229 | } | ||
| 230 | |||
| 231 | provider, _ := idx.GetMetadata("provider") | ||
| 232 | model, _ := idx.GetMetadata("model") | ||
| 233 | idx.Close() | ||
| 234 | |||
| 235 | if provider == "" { | ||
| 236 | provider = "ollama" | ||
| 237 | } | ||
| 238 | |||
| 239 | // Create embedder | ||
| 240 | emb, err := embedder.New(provider, model) | ||
| 241 | if err != nil { | ||
| 242 | return err | ||
| 243 | } | ||
| 244 | |||
| 245 | // Reopen with correct dimensions | ||
| 246 | idx, err = index.Open(indexPath, emb.Dimensions()) | ||
| 247 | if err != nil { | ||
| 248 | return err | ||
| 249 | } | ||
| 250 | defer idx.Close() | ||
| 251 | |||
| 252 | // Generate query embedding | ||
| 253 | ctx := context.Background() | ||
| 254 | embeddings, err := emb.Embed(ctx, []string{query}) | ||
| 255 | if err != nil { | ||
| 256 | return fmt.Errorf("failed to embed query: %w", err) | ||
| 257 | } | ||
| 258 | |||
| 259 | // Search | ||
| 260 | results, err := idx.Search(embeddings[0], limit) | ||
| 261 | if err != nil { | ||
| 262 | return fmt.Errorf("search failed: %w", err) | ||
| 263 | } | ||
| 264 | |||
| 265 | // Filter by threshold (distance is lower = more similar) | ||
| 266 | // Convert distance to similarity for threshold comparison | ||
| 267 | var filtered []index.SearchResult | ||
| 268 | for _, r := range results { | ||
| 269 | similarity := 1 - r.Distance | ||
| 270 | if similarity >= threshold { | ||
| 271 | filtered = append(filtered, r) | ||
| 272 | } | ||
| 273 | } | ||
| 274 | results = filtered | ||
| 275 | |||
| 276 | // Output | ||
| 277 | if jsonOutput { | ||
| 278 | type jsonResult struct { | ||
| 279 | File string `json:"file"` | ||
| 280 | StartLine int `json:"start_line"` | ||
| 281 | EndLine int `json:"end_line"` | ||
| 282 | Type string `json:"type"` | ||
| 283 | Name string `json:"name"` | ||
| 284 | Score float64 `json:"score"` | ||
| 285 | Content string `json:"content,omitempty"` | ||
| 286 | } | ||
| 287 | |||
| 288 | var output []jsonResult | ||
| 289 | for _, r := range results { | ||
| 290 | relPath, _ := filepath.Rel(cwd, r.Chunk.File) | ||
| 291 | jr := jsonResult{ | ||
| 292 | File: relPath, | ||
| 293 | StartLine: r.Chunk.StartLine, | ||
| 294 | EndLine: r.Chunk.EndLine, | ||
| 295 | Type: r.Chunk.Type, | ||
| 296 | Name: r.Chunk.Name, | ||
| 297 | Score: 1 - r.Distance, | ||
| 298 | } | ||
| 299 | if show { | ||
| 300 | jr.Content = r.Chunk.Content | ||
| 301 | } | ||
| 302 | output = append(output, jr) | ||
| 303 | } | ||
| 304 | |||
| 305 | enc := json.NewEncoder(os.Stdout) | ||
| 306 | enc.SetIndent("", " ") | ||
| 307 | return enc.Encode(output) | ||
| 308 | } | ||
| 309 | |||
| 310 | // Text output | ||
| 311 | if len(results) == 0 { | ||
| 312 | fmt.Println("No results found") | ||
| 313 | return nil | ||
| 314 | } | ||
| 315 | |||
| 316 | for _, r := range results { | ||
| 317 | relPath, _ := filepath.Rel(cwd, r.Chunk.File) | ||
| 318 | similarity := 1 - r.Distance | ||
| 319 | fmt.Printf("%s:%d-%d %s (%.2f)\n", relPath, r.Chunk.StartLine, r.Chunk.EndLine, r.Chunk.Name, similarity) | ||
| 320 | if show { | ||
| 321 | fmt.Println(strings.Repeat("-", 40)) | ||
| 322 | fmt.Println(r.Chunk.Content) | ||
| 323 | fmt.Println() | ||
| 324 | } | ||
| 325 | } | ||
| 326 | |||
| 327 | return nil | ||
| 328 | } | ||
| 329 | |||
| 330 | func runStatus(cmd *cobra.Command, args []string) error { | ||
| 331 | cwd, err := os.Getwd() | ||
| 332 | if err != nil { | ||
| 333 | return err | ||
| 334 | } | ||
| 335 | indexPath := filepath.Join(cwd, codevecDir, indexFile) | ||
| 336 | |||
| 337 | if _, err := os.Stat(indexPath); os.IsNotExist(err) { | ||
| 338 | fmt.Println("No index found. Run 'codevec index' first.") | ||
| 339 | return nil | ||
| 340 | } | ||
| 341 | |||
| 342 | idx, err := index.Open(indexPath, 768) | ||
| 343 | if err != nil { | ||
| 344 | return err | ||
| 345 | } | ||
| 346 | defer idx.Close() | ||
| 347 | |||
| 348 | stats, err := idx.Stats() | ||
| 349 | if err != nil { | ||
| 350 | return err | ||
| 351 | } | ||
| 352 | |||
| 353 | provider, _ := idx.GetMetadata("provider") | ||
| 354 | model, _ := idx.GetMetadata("model") | ||
| 355 | dims, _ := idx.GetMetadata("dimensions") | ||
| 356 | |||
| 357 | fmt.Printf("Index: %s\n", indexPath) | ||
| 358 | fmt.Printf("Files: %d\n", stats.Files) | ||
| 359 | fmt.Printf("Chunks: %d\n", stats.Chunks) | ||
| 360 | fmt.Printf("Provider: %s\n", provider) | ||
| 361 | if model != "" { | ||
| 362 | fmt.Printf("Model: %s\n", model) | ||
| 363 | } | ||
| 364 | if dims != "" { | ||
| 365 | fmt.Printf("Dimensions: %s\n", dims) | ||
| 366 | } | ||
| 367 | |||
| 368 | return nil | ||
| 369 | } | ||
| @@ -0,0 +1,26 @@ | |||
| 1 | module code.northwest.io/codevec | ||
| 2 | |||
| 3 | go 1.24.0 | ||
| 4 | |||
| 5 | require ( | ||
| 6 | github.com/sabhiram/go-gitignore v0.0.0-20210923224102-525f6e181f06 | ||
| 7 | github.com/smacker/go-tree-sitter v0.0.0-20240827094217-dd81d9e9be82 | ||
| 8 | github.com/spf13/cobra v1.10.2 | ||
| 9 | modernc.org/sqlite v1.46.1 | ||
| 10 | ) | ||
| 11 | |||
| 12 | require ( | ||
| 13 | github.com/dustin/go-humanize v1.0.1 // indirect | ||
| 14 | github.com/google/uuid v1.6.0 // indirect | ||
| 15 | github.com/inconshreveable/mousetrap v1.1.0 // indirect | ||
| 16 | github.com/mattn/go-isatty v0.0.20 // indirect | ||
| 17 | github.com/ncruces/go-strftime v1.0.0 // indirect | ||
| 18 | github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect | ||
| 19 | github.com/spf13/pflag v1.0.9 // indirect | ||
| 20 | golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546 // indirect | ||
| 21 | golang.org/x/sync v0.19.0 // indirect | ||
| 22 | golang.org/x/sys v0.40.0 // indirect | ||
| 23 | modernc.org/libc v1.67.6 // indirect | ||
| 24 | modernc.org/mathutil v1.7.1 // indirect | ||
| 25 | modernc.org/memory v1.11.0 // indirect | ||
| 26 | ) | ||
| @@ -0,0 +1,79 @@ | |||
| 1 | github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= | ||
| 2 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= | ||
| 3 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= | ||
| 4 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= | ||
| 5 | github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= | ||
| 6 | github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= | ||
| 7 | github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs= | ||
| 8 | github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= | ||
| 9 | github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= | ||
| 10 | github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= | ||
| 11 | github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= | ||
| 12 | github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= | ||
| 13 | github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= | ||
| 14 | github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= | ||
| 15 | github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= | ||
| 16 | github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= | ||
| 17 | github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w= | ||
| 18 | github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls= | ||
| 19 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= | ||
| 20 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= | ||
| 21 | github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE= | ||
| 22 | github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= | ||
| 23 | github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= | ||
| 24 | github.com/sabhiram/go-gitignore v0.0.0-20210923224102-525f6e181f06 h1:OkMGxebDjyw0ULyrTYWeN0UNCCkmCWfjPnIA2W6oviI= | ||
| 25 | github.com/sabhiram/go-gitignore v0.0.0-20210923224102-525f6e181f06/go.mod h1:+ePHsJ1keEjQtpvf9HHw0f4ZeJ0TLRsxhunSI2hYJSs= | ||
| 26 | github.com/smacker/go-tree-sitter v0.0.0-20240827094217-dd81d9e9be82 h1:6C8qej6f1bStuePVkLSFxoU22XBS165D3klxlzRg8F4= | ||
| 27 | github.com/smacker/go-tree-sitter v0.0.0-20240827094217-dd81d9e9be82/go.mod h1:xe4pgH49k4SsmkQq5OT8abwhWmnzkhpgnXeekbx2efw= | ||
| 28 | github.com/spf13/cobra v1.10.2 h1:DMTTonx5m65Ic0GOoRY2c16WCbHxOOw6xxezuLaBpcU= | ||
| 29 | github.com/spf13/cobra v1.10.2/go.mod h1:7C1pvHqHw5A4vrJfjNwvOdzYu0Gml16OCs2GRiTUUS4= | ||
| 30 | github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY= | ||
| 31 | github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= | ||
| 32 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= | ||
| 33 | github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= | ||
| 34 | github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= | ||
| 35 | github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= | ||
| 36 | go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= | ||
| 37 | golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546 h1:mgKeJMpvi0yx/sU5GsxQ7p6s2wtOnGAHZWCHUM4KGzY= | ||
| 38 | golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546/go.mod h1:j/pmGrbnkbPtQfxEe5D0VQhZC6qKbfKifgD0oM7sR70= | ||
| 39 | golang.org/x/mod v0.29.0 h1:HV8lRxZC4l2cr3Zq1LvtOsi/ThTgWnUk/y64QSs8GwA= | ||
| 40 | golang.org/x/mod v0.29.0/go.mod h1:NyhrlYXJ2H4eJiRy/WDBO6HMqZQ6q9nk4JzS3NuCK+w= | ||
| 41 | golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= | ||
| 42 | golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= | ||
| 43 | golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= | ||
| 44 | golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ= | ||
| 45 | golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= | ||
| 46 | golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ= | ||
| 47 | golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs= | ||
| 48 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= | ||
| 49 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= | ||
| 50 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= | ||
| 51 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= | ||
| 52 | modernc.org/cc/v4 v4.27.1 h1:9W30zRlYrefrDV2JE2O8VDtJ1yPGownxciz5rrbQZis= | ||
| 53 | modernc.org/cc/v4 v4.27.1/go.mod h1:uVtb5OGqUKpoLWhqwNQo/8LwvoiEBLvZXIQ/SmO6mL0= | ||
| 54 | modernc.org/ccgo/v4 v4.30.1 h1:4r4U1J6Fhj98NKfSjnPUN7Ze2c6MnAdL0hWw6+LrJpc= | ||
| 55 | modernc.org/ccgo/v4 v4.30.1/go.mod h1:bIOeI1JL54Utlxn+LwrFyjCx2n2RDiYEaJVSrgdrRfM= | ||
| 56 | modernc.org/fileutil v1.3.40 h1:ZGMswMNc9JOCrcrakF1HrvmergNLAmxOPjizirpfqBA= | ||
| 57 | modernc.org/fileutil v1.3.40/go.mod h1:HxmghZSZVAz/LXcMNwZPA/DRrQZEVP9VX0V4LQGQFOc= | ||
| 58 | modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI= | ||
| 59 | modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito= | ||
| 60 | modernc.org/gc/v3 v3.1.1 h1:k8T3gkXWY9sEiytKhcgyiZ2L0DTyCQ/nvX+LoCljoRE= | ||
| 61 | modernc.org/gc/v3 v3.1.1/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY= | ||
| 62 | modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks= | ||
| 63 | modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI= | ||
| 64 | modernc.org/libc v1.67.6 h1:eVOQvpModVLKOdT+LvBPjdQqfrZq+pC39BygcT+E7OI= | ||
| 65 | modernc.org/libc v1.67.6/go.mod h1:JAhxUVlolfYDErnwiqaLvUqc8nfb2r6S6slAgZOnaiE= | ||
| 66 | modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU= | ||
| 67 | modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg= | ||
| 68 | modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI= | ||
| 69 | modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw= | ||
| 70 | modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8= | ||
| 71 | modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns= | ||
| 72 | modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w= | ||
| 73 | modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE= | ||
| 74 | modernc.org/sqlite v1.46.1 h1:eFJ2ShBLIEnUWlLy12raN0Z1plqmFX9Qe3rjQTKt6sU= | ||
| 75 | modernc.org/sqlite v1.46.1/go.mod h1:CzbrU2lSB1DKUusvwGz7rqEKIq+NUd8GWuBBZDs9/nA= | ||
| 76 | modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0= | ||
| 77 | modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A= | ||
| 78 | modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y= | ||
| 79 | modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM= | ||
diff --git a/internal/chunker/chunker.go b/internal/chunker/chunker.go new file mode 100644 index 0000000..f8de08d --- /dev/null +++ b/internal/chunker/chunker.go | |||
| @@ -0,0 +1,185 @@ | |||
| 1 | package chunker | ||
| 2 | |||
| 3 | import ( | ||
| 4 | "crypto/sha256" | ||
| 5 | "fmt" | ||
| 6 | "os" | ||
| 7 | "strings" | ||
| 8 | |||
| 9 | sitter "github.com/smacker/go-tree-sitter" | ||
| 10 | "github.com/smacker/go-tree-sitter/golang" | ||
| 11 | ) | ||
| 12 | |||
// Chunk represents a semantically meaningful piece of code
type Chunk struct {
	File      string // source file path as passed to Chunk
	StartLine int    // 1-based first line of the declaration
	EndLine   int    // 1-based last line of the declaration
	Type      string // "function", "method", "type"
	Name      string // identifier; methods are rendered as "(Recv).Name"
	Content   string // full source text of the declaration
	Hash      string // 16-hex-char truncated SHA-256 of Content
}

// Chunker extracts semantic chunks from source code
type Chunker interface {
	// Chunk parses content (the bytes of the file at path) and returns
	// one Chunk per extracted declaration.
	Chunk(path string, content []byte) ([]Chunk, error)
}
| 28 | |||
| 29 | // GoChunker extracts chunks from Go source files using tree-sitter | ||
| 30 | type GoChunker struct { | ||
| 31 | parser *sitter.Parser | ||
| 32 | } | ||
| 33 | |||
| 34 | // NewGoChunker creates a new Go chunker | ||
| 35 | func NewGoChunker() *GoChunker { | ||
| 36 | parser := sitter.NewParser() | ||
| 37 | parser.SetLanguage(golang.GetLanguage()) | ||
| 38 | return &GoChunker{parser: parser} | ||
| 39 | } | ||
| 40 | |||
| 41 | // ChunkFile reads and chunks a file | ||
| 42 | func (c *GoChunker) ChunkFile(path string) ([]Chunk, error) { | ||
| 43 | content, err := os.ReadFile(path) | ||
| 44 | if err != nil { | ||
| 45 | return nil, err | ||
| 46 | } | ||
| 47 | return c.Chunk(path, content) | ||
| 48 | } | ||
| 49 | |||
| 50 | // Chunk extracts semantic chunks from Go source | ||
| 51 | func (c *GoChunker) Chunk(path string, content []byte) ([]Chunk, error) { | ||
| 52 | tree := c.parser.Parse(nil, content) | ||
| 53 | if tree == nil { | ||
| 54 | return nil, fmt.Errorf("failed to parse %s", path) | ||
| 55 | } | ||
| 56 | defer tree.Close() | ||
| 57 | |||
| 58 | var chunks []Chunk | ||
| 59 | root := tree.RootNode() | ||
| 60 | |||
| 61 | // Walk top-level declarations | ||
| 62 | for i := 0; i < int(root.ChildCount()); i++ { | ||
| 63 | node := root.Child(i) | ||
| 64 | chunk := c.extractChunk(node, content, path) | ||
| 65 | if chunk != nil { | ||
| 66 | chunks = append(chunks, *chunk) | ||
| 67 | } | ||
| 68 | } | ||
| 69 | |||
| 70 | return chunks, nil | ||
| 71 | } | ||
| 72 | |||
| 73 | func (c *GoChunker) extractChunk(node *sitter.Node, content []byte, path string) *Chunk { | ||
| 74 | nodeType := node.Type() | ||
| 75 | |||
| 76 | switch nodeType { | ||
| 77 | case "function_declaration": | ||
| 78 | return c.extractFunction(node, content, path) | ||
| 79 | case "method_declaration": | ||
| 80 | return c.extractMethod(node, content, path) | ||
| 81 | case "type_declaration": | ||
| 82 | return c.extractType(node, content, path) | ||
| 83 | } | ||
| 84 | |||
| 85 | return nil | ||
| 86 | } | ||
| 87 | |||
| 88 | func (c *GoChunker) extractFunction(node *sitter.Node, content []byte, path string) *Chunk { | ||
| 89 | nameNode := node.ChildByFieldName("name") | ||
| 90 | if nameNode == nil { | ||
| 91 | return nil | ||
| 92 | } | ||
| 93 | |||
| 94 | name := string(content[nameNode.StartByte():nameNode.EndByte()]) | ||
| 95 | text := string(content[node.StartByte():node.EndByte()]) | ||
| 96 | |||
| 97 | return &Chunk{ | ||
| 98 | File: path, | ||
| 99 | StartLine: int(node.StartPoint().Row) + 1, | ||
| 100 | EndLine: int(node.EndPoint().Row) + 1, | ||
| 101 | Type: "function", | ||
| 102 | Name: name, | ||
| 103 | Content: text, | ||
| 104 | Hash: hash(text), | ||
| 105 | } | ||
| 106 | } | ||
| 107 | |||
| 108 | func (c *GoChunker) extractMethod(node *sitter.Node, content []byte, path string) *Chunk { | ||
| 109 | nameNode := node.ChildByFieldName("name") | ||
| 110 | receiverNode := node.ChildByFieldName("receiver") | ||
| 111 | if nameNode == nil { | ||
| 112 | return nil | ||
| 113 | } | ||
| 114 | |||
| 115 | name := string(content[nameNode.StartByte():nameNode.EndByte()]) | ||
| 116 | |||
| 117 | // Build receiver prefix like (*Server) or (s Server) | ||
| 118 | if receiverNode != nil { | ||
| 119 | recvText := string(content[receiverNode.StartByte():receiverNode.EndByte()]) | ||
| 120 | // Extract type from receiver, e.g., "(s *Server)" -> "*Server" | ||
| 121 | recvType := extractReceiverType(recvText) | ||
| 122 | if recvType != "" { | ||
| 123 | name = fmt.Sprintf("(%s).%s", recvType, name) | ||
| 124 | } | ||
| 125 | } | ||
| 126 | |||
| 127 | text := string(content[node.StartByte():node.EndByte()]) | ||
| 128 | |||
| 129 | return &Chunk{ | ||
| 130 | File: path, | ||
| 131 | StartLine: int(node.StartPoint().Row) + 1, | ||
| 132 | EndLine: int(node.EndPoint().Row) + 1, | ||
| 133 | Type: "method", | ||
| 134 | Name: name, | ||
| 135 | Content: text, | ||
| 136 | Hash: hash(text), | ||
| 137 | } | ||
| 138 | } | ||
| 139 | |||
// extractType builds a Chunk for a type declaration. The chunk's
// Content and line range cover the whole declaration node.
//
// NOTE(review): a grouped declaration `type ( A ...; B ... )` produces
// only ONE chunk — named after the first type_spec — because this
// returns on the first match. Later specs in the group are never
// indexed on their own; confirm whether that is intended.
func (c *GoChunker) extractType(node *sitter.Node, content []byte, path string) *Chunk {
	// type_declaration contains type_spec children
	for i := 0; i < int(node.ChildCount()); i++ {
		child := node.Child(i)
		if child.Type() == "type_spec" {
			nameNode := child.ChildByFieldName("name")
			if nameNode == nil {
				continue
			}

			name := string(content[nameNode.StartByte():nameNode.EndByte()])
			text := string(content[node.StartByte():node.EndByte()])

			return &Chunk{
				File:      path,
				StartLine: int(node.StartPoint().Row) + 1,
				EndLine:   int(node.EndPoint().Row) + 1,
				Type:      "type",
				Name:      name,
				Content:   text,
				Hash:      hash(text),
			}
		}
	}
	return nil
}
| 166 | |||
// extractReceiverType extracts the type from a receiver like "(s *Server)" -> "*Server".
// Receivers without a name, e.g. "(*Server)", also resolve to the type.
func extractReceiverType(recv string) string {
	// Strip surrounding parens and whitespace, then take the final
	// whitespace-separated token — the type comes after the optional name.
	inner := strings.TrimSpace(strings.TrimSuffix(strings.TrimPrefix(recv, "("), ")"))
	fields := strings.Fields(inner)
	if n := len(fields); n > 0 {
		return fields[n-1]
	}
	return ""
}
| 181 | |||
// hash returns a short content fingerprint: the first 8 bytes of the
// SHA-256 of s, rendered as 16 hex characters.
func hash(s string) string {
	sum := sha256.Sum256([]byte(s))
	return fmt.Sprintf("%x", sum[:8])
}
diff --git a/internal/embedder/embedder.go b/internal/embedder/embedder.go new file mode 100644 index 0000000..42f8518 --- /dev/null +++ b/internal/embedder/embedder.go | |||
| @@ -0,0 +1,222 @@ | |||
| 1 | package embedder | ||
| 2 | |||
| 3 | import ( | ||
| 4 | "bytes" | ||
| 5 | "context" | ||
| 6 | "encoding/json" | ||
| 7 | "fmt" | ||
| 8 | "net/http" | ||
| 9 | "os" | ||
| 10 | ) | ||
| 11 | |||
// Embedder generates embeddings for text
type Embedder interface {
	// Embed returns one embedding vector per input text, in the same order.
	Embed(ctx context.Context, texts []string) ([][]float32, error)
	// Dimensions reports the expected length of the vectors this
	// embedder's model produces.
	Dimensions() int
}
| 17 | |||
// OllamaEmbedder uses Ollama's embedding API
type OllamaEmbedder struct {
	baseURL string // server root, e.g. http://localhost:11434
	model   string // embedding model name
	dims    int    // expected vector length for model
}

// NewOllamaEmbedder creates an Ollama embedder. An empty model selects
// nomic-embed-text; the server URL comes from CODEVEC_BASE_URL and
// falls back to the default local Ollama address.
func NewOllamaEmbedder(model string) *OllamaEmbedder {
	url := os.Getenv("CODEVEC_BASE_URL")
	if url == "" {
		url = "http://localhost:11434"
	}
	if model == "" {
		model = "nomic-embed-text"
	}

	// Known model dimensionalities; anything unrecognized is assumed to
	// be 768-dimensional like nomic-embed-text.
	dims := 768
	switch model {
	case "mxbai-embed-large":
		dims = 1024
	case "all-minilm":
		dims = 384
	}

	return &OllamaEmbedder{baseURL: url, model: model, dims: dims}
}

// Dimensions reports the expected embedding vector length.
func (e *OllamaEmbedder) Dimensions() int {
	return e.dims
}
| 54 | |||
// ollamaRequest is the JSON body sent to Ollama's /api/embeddings
// endpoint (one prompt per request).
type ollamaRequest struct {
	Model  string `json:"model"`
	Prompt string `json:"prompt"`
}

// ollamaResponse is the JSON reply from /api/embeddings.
type ollamaResponse struct {
	Embedding []float32 `json:"embedding"`
}
| 63 | |||
| 64 | func (e *OllamaEmbedder) Embed(ctx context.Context, texts []string) ([][]float32, error) { | ||
| 65 | embeddings := make([][]float32, len(texts)) | ||
| 66 | |||
| 67 | // Ollama's /api/embeddings takes one prompt at a time | ||
| 68 | for i, text := range texts { | ||
| 69 | req := ollamaRequest{ | ||
| 70 | Model: e.model, | ||
| 71 | Prompt: text, | ||
| 72 | } | ||
| 73 | |||
| 74 | body, err := json.Marshal(req) | ||
| 75 | if err != nil { | ||
| 76 | return nil, err | ||
| 77 | } | ||
| 78 | |||
| 79 | httpReq, err := http.NewRequestWithContext(ctx, "POST", e.baseURL+"/api/embeddings", bytes.NewReader(body)) | ||
| 80 | if err != nil { | ||
| 81 | return nil, err | ||
| 82 | } | ||
| 83 | httpReq.Header.Set("Content-Type", "application/json") | ||
| 84 | |||
| 85 | resp, err := http.DefaultClient.Do(httpReq) | ||
| 86 | if err != nil { | ||
| 87 | return nil, fmt.Errorf("ollama request failed: %w", err) | ||
| 88 | } | ||
| 89 | defer resp.Body.Close() | ||
| 90 | |||
| 91 | if resp.StatusCode != http.StatusOK { | ||
| 92 | return nil, fmt.Errorf("ollama returned status %d", resp.StatusCode) | ||
| 93 | } | ||
| 94 | |||
| 95 | var result ollamaResponse | ||
| 96 | if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { | ||
| 97 | return nil, err | ||
| 98 | } | ||
| 99 | |||
| 100 | embeddings[i] = result.Embedding | ||
| 101 | } | ||
| 102 | |||
| 103 | return embeddings, nil | ||
| 104 | } | ||
| 105 | |||
// OpenAIEmbedder calls an OpenAI-compatible /v1/embeddings endpoint.
type OpenAIEmbedder struct {
	baseURL string
	apiKey  string
	model   string
	dims    int
}

// NewOpenAIEmbedder builds an OpenAIEmbedder. The endpoint defaults to
// api.openai.com unless CODEVEC_BASE_URL is set; the API key is read from
// CODEVEC_API_KEY and validated lazily, at Embed time.
func NewOpenAIEmbedder(model string) *OpenAIEmbedder {
	if model == "" {
		model = "text-embedding-3-small"
	}

	url := os.Getenv("CODEVEC_BASE_URL")
	if url == "" {
		url = "https://api.openai.com"
	}

	// Embedding widths for the known OpenAI models; the 3-small width
	// doubles as the fallback for unrecognized model names.
	dimensions := 1536
	switch model {
	case "text-embedding-ada-002":
		dimensions = 1536
	case "text-embedding-3-large":
		dimensions = 3072
	}

	return &OpenAIEmbedder{
		baseURL: url,
		apiKey:  os.Getenv("CODEVEC_API_KEY"),
		model:   model,
		dims:    dimensions,
	}
}
| 140 | |||
// Dimensions returns the embedding vector length for the configured model.
func (e *OpenAIEmbedder) Dimensions() int {
	return e.dims
}
| 144 | |||
// openaiRequest is the JSON body for the /v1/embeddings endpoint; it
// accepts a batch of inputs per call.
type openaiRequest struct {
	Model string   `json:"model"`
	Input []string `json:"input"`
}

// openaiResponse is the JSON reply; Data carries one embedding per input.
type openaiResponse struct {
	Data []struct {
		Embedding []float32 `json:"embedding"`
	} `json:"data"`
}
| 155 | |||
| 156 | func (e *OpenAIEmbedder) Embed(ctx context.Context, texts []string) ([][]float32, error) { | ||
| 157 | if e.apiKey == "" { | ||
| 158 | return nil, fmt.Errorf("CODEVEC_API_KEY not set") | ||
| 159 | } | ||
| 160 | |||
| 161 | // Batch in groups of 100 | ||
| 162 | const batchSize = 100 | ||
| 163 | embeddings := make([][]float32, len(texts)) | ||
| 164 | |||
| 165 | for start := 0; start < len(texts); start += batchSize { | ||
| 166 | end := start + batchSize | ||
| 167 | if end > len(texts) { | ||
| 168 | end = len(texts) | ||
| 169 | } | ||
| 170 | batch := texts[start:end] | ||
| 171 | |||
| 172 | req := openaiRequest{ | ||
| 173 | Model: e.model, | ||
| 174 | Input: batch, | ||
| 175 | } | ||
| 176 | |||
| 177 | body, err := json.Marshal(req) | ||
| 178 | if err != nil { | ||
| 179 | return nil, err | ||
| 180 | } | ||
| 181 | |||
| 182 | httpReq, err := http.NewRequestWithContext(ctx, "POST", e.baseURL+"/v1/embeddings", bytes.NewReader(body)) | ||
| 183 | if err != nil { | ||
| 184 | return nil, err | ||
| 185 | } | ||
| 186 | httpReq.Header.Set("Content-Type", "application/json") | ||
| 187 | httpReq.Header.Set("Authorization", "Bearer "+e.apiKey) | ||
| 188 | |||
| 189 | resp, err := http.DefaultClient.Do(httpReq) | ||
| 190 | if err != nil { | ||
| 191 | return nil, fmt.Errorf("openai request failed: %w", err) | ||
| 192 | } | ||
| 193 | defer resp.Body.Close() | ||
| 194 | |||
| 195 | if resp.StatusCode != http.StatusOK { | ||
| 196 | return nil, fmt.Errorf("openai returned status %d", resp.StatusCode) | ||
| 197 | } | ||
| 198 | |||
| 199 | var result openaiResponse | ||
| 200 | if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { | ||
| 201 | return nil, err | ||
| 202 | } | ||
| 203 | |||
| 204 | for i, d := range result.Data { | ||
| 205 | embeddings[start+i] = d.Embedding | ||
| 206 | } | ||
| 207 | } | ||
| 208 | |||
| 209 | return embeddings, nil | ||
| 210 | } | ||
| 211 | |||
| 212 | // New creates an embedder based on provider name | ||
| 213 | func New(provider, model string) (Embedder, error) { | ||
| 214 | switch provider { | ||
| 215 | case "ollama": | ||
| 216 | return NewOllamaEmbedder(model), nil | ||
| 217 | case "openai": | ||
| 218 | return NewOpenAIEmbedder(model), nil | ||
| 219 | default: | ||
| 220 | return nil, fmt.Errorf("unknown provider: %s", provider) | ||
| 221 | } | ||
| 222 | } | ||
diff --git a/internal/index/index.go b/internal/index/index.go new file mode 100644 index 0000000..008e487 --- /dev/null +++ b/internal/index/index.go | |||
| @@ -0,0 +1,271 @@ | |||
| 1 | package index | ||
| 2 | |||
import (
	"database/sql"
	"encoding/binary"
	"errors"
	"math"
	"os"
	"path/filepath"
	"sort"

	_ "modernc.org/sqlite"

	"code.northwest.io/codevec/internal/chunker"
)
| 15 | |||
// Index stores chunks and their embeddings in a single SQLite database.
type Index struct {
	db   *sql.DB
	dims int // embedding dimensionality this index was opened with
}
| 21 | |||
| 22 | // Open opens or creates an index at the given path | ||
| 23 | func Open(path string, dims int) (*Index, error) { | ||
| 24 | // Ensure directory exists | ||
| 25 | dir := filepath.Dir(path) | ||
| 26 | if err := os.MkdirAll(dir, 0755); err != nil { | ||
| 27 | return nil, err | ||
| 28 | } | ||
| 29 | |||
| 30 | db, err := sql.Open("sqlite", path) | ||
| 31 | if err != nil { | ||
| 32 | return nil, err | ||
| 33 | } | ||
| 34 | |||
| 35 | idx := &Index{db: db, dims: dims} | ||
| 36 | if err := idx.init(); err != nil { | ||
| 37 | db.Close() | ||
| 38 | return nil, err | ||
| 39 | } | ||
| 40 | |||
| 41 | return idx, nil | ||
| 42 | } | ||
| 43 | |||
| 44 | func (idx *Index) init() error { | ||
| 45 | // Create chunks table with embedding column | ||
| 46 | _, err := idx.db.Exec(` | ||
| 47 | CREATE TABLE IF NOT EXISTS chunks ( | ||
| 48 | id INTEGER PRIMARY KEY, | ||
| 49 | file TEXT NOT NULL, | ||
| 50 | start_line INTEGER NOT NULL, | ||
| 51 | end_line INTEGER NOT NULL, | ||
| 52 | chunk_type TEXT, | ||
| 53 | name TEXT, | ||
| 54 | content TEXT NOT NULL, | ||
| 55 | hash TEXT NOT NULL, | ||
| 56 | embedding BLOB, | ||
| 57 | created_at INTEGER DEFAULT (unixepoch()) | ||
| 58 | ) | ||
| 59 | `) | ||
| 60 | if err != nil { | ||
| 61 | return err | ||
| 62 | } | ||
| 63 | |||
| 64 | // Create files table for tracking indexed files | ||
| 65 | _, err = idx.db.Exec(` | ||
| 66 | CREATE TABLE IF NOT EXISTS files ( | ||
| 67 | path TEXT PRIMARY KEY, | ||
| 68 | hash TEXT NOT NULL, | ||
| 69 | indexed_at INTEGER DEFAULT (unixepoch()) | ||
| 70 | ) | ||
| 71 | `) | ||
| 72 | if err != nil { | ||
| 73 | return err | ||
| 74 | } | ||
| 75 | |||
| 76 | // Create metadata table | ||
| 77 | _, err = idx.db.Exec(` | ||
| 78 | CREATE TABLE IF NOT EXISTS metadata ( | ||
| 79 | key TEXT PRIMARY KEY, | ||
| 80 | value TEXT | ||
| 81 | ) | ||
| 82 | `) | ||
| 83 | if err != nil { | ||
| 84 | return err | ||
| 85 | } | ||
| 86 | |||
| 87 | // Index on file for faster deletion | ||
| 88 | _, err = idx.db.Exec(`CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file)`) | ||
| 89 | return err | ||
| 90 | } | ||
| 91 | |||
// Close closes the underlying database handle.
func (idx *Index) Close() error {
	return idx.db.Close()
}
| 96 | |||
// InsertChunk stores a chunk together with its embedding, serialized as
// little-endian float32 bytes (see serializeEmbedding).
func (idx *Index) InsertChunk(chunk chunker.Chunk, embedding []float32) error {
	embeddingBlob := serializeEmbedding(embedding)
	_, err := idx.db.Exec(`
		INSERT INTO chunks (file, start_line, end_line, chunk_type, name, content, hash, embedding)
		VALUES (?, ?, ?, ?, ?, ?, ?, ?)
	`, chunk.File, chunk.StartLine, chunk.EndLine, chunk.Type, chunk.Name, chunk.Content, chunk.Hash, embeddingBlob)
	return err
}
| 106 | |||
// SearchResult pairs a chunk with its cosine distance from the query
// (lower means more similar).
type SearchResult struct {
	Chunk    chunker.Chunk
	Distance float64
}
| 112 | |||
| 113 | // Search finds chunks similar to the query embedding using cosine similarity | ||
| 114 | func (idx *Index) Search(queryEmb []float32, limit int) ([]SearchResult, error) { | ||
| 115 | // Load all embeddings | ||
| 116 | rows, err := idx.db.Query(` | ||
| 117 | SELECT id, file, start_line, end_line, chunk_type, name, content, hash, embedding | ||
| 118 | FROM chunks | ||
| 119 | WHERE embedding IS NOT NULL | ||
| 120 | `) | ||
| 121 | if err != nil { | ||
| 122 | return nil, err | ||
| 123 | } | ||
| 124 | defer rows.Close() | ||
| 125 | |||
| 126 | type candidate struct { | ||
| 127 | chunk chunker.Chunk | ||
| 128 | distance float64 | ||
| 129 | } | ||
| 130 | var candidates []candidate | ||
| 131 | |||
| 132 | for rows.Next() { | ||
| 133 | var id int64 | ||
| 134 | var c chunker.Chunk | ||
| 135 | var embBlob []byte | ||
| 136 | err := rows.Scan(&id, &c.File, &c.StartLine, &c.EndLine, &c.Type, &c.Name, &c.Content, &c.Hash, &embBlob) | ||
| 137 | if err != nil { | ||
| 138 | return nil, err | ||
| 139 | } | ||
| 140 | |||
| 141 | emb := deserializeEmbedding(embBlob) | ||
| 142 | dist := cosineDistance(queryEmb, emb) | ||
| 143 | candidates = append(candidates, candidate{chunk: c, distance: dist}) | ||
| 144 | } | ||
| 145 | |||
| 146 | if err := rows.Err(); err != nil { | ||
| 147 | return nil, err | ||
| 148 | } | ||
| 149 | |||
| 150 | // Sort by distance (lower is better) | ||
| 151 | sort.Slice(candidates, func(i, j int) bool { | ||
| 152 | return candidates[i].distance < candidates[j].distance | ||
| 153 | }) | ||
| 154 | |||
| 155 | // Return top-k | ||
| 156 | if limit > len(candidates) { | ||
| 157 | limit = len(candidates) | ||
| 158 | } | ||
| 159 | |||
| 160 | results := make([]SearchResult, limit) | ||
| 161 | for i := 0; i < limit; i++ { | ||
| 162 | results[i] = SearchResult{ | ||
| 163 | Chunk: candidates[i].chunk, | ||
| 164 | Distance: candidates[i].distance, | ||
| 165 | } | ||
| 166 | } | ||
| 167 | |||
| 168 | return results, nil | ||
| 169 | } | ||
| 170 | |||
| 171 | // GetFileHash returns the stored hash for a file, or empty string if not indexed | ||
| 172 | func (idx *Index) GetFileHash(path string) (string, error) { | ||
| 173 | var hash string | ||
| 174 | err := idx.db.QueryRow(`SELECT hash FROM files WHERE path = ?`, path).Scan(&hash) | ||
| 175 | if err == sql.ErrNoRows { | ||
| 176 | return "", nil | ||
| 177 | } | ||
| 178 | return hash, err | ||
| 179 | } | ||
| 180 | |||
// SetFileHash records (or replaces) the content hash for a file and stamps
// the time it was indexed.
func (idx *Index) SetFileHash(path, hash string) error {
	_, err := idx.db.Exec(`
		INSERT OR REPLACE INTO files (path, hash, indexed_at)
		VALUES (?, ?, unixepoch())
	`, path, hash)
	return err
}
| 189 | |||
| 190 | // DeleteChunksForFile removes all chunks for a file | ||
| 191 | func (idx *Index) DeleteChunksForFile(path string) error { | ||
| 192 | _, err := idx.db.Exec(`DELETE FROM chunks WHERE file = ?`, path) | ||
| 193 | if err != nil { | ||
| 194 | return err | ||
| 195 | } | ||
| 196 | _, err = idx.db.Exec(`DELETE FROM files WHERE path = ?`, path) | ||
| 197 | return err | ||
| 198 | } | ||
| 199 | |||
// Stats summarizes index contents.
type Stats struct {
	Files  int // number of indexed files
	Chunks int // number of stored chunks
}
| 205 | |||
| 206 | func (idx *Index) Stats() (Stats, error) { | ||
| 207 | var s Stats | ||
| 208 | err := idx.db.QueryRow(`SELECT COUNT(*) FROM files`).Scan(&s.Files) | ||
| 209 | if err != nil { | ||
| 210 | return s, err | ||
| 211 | } | ||
| 212 | err = idx.db.QueryRow(`SELECT COUNT(*) FROM chunks`).Scan(&s.Chunks) | ||
| 213 | return s, err | ||
| 214 | } | ||
| 215 | |||
// SetMetadata stores (or replaces) a metadata key/value pair.
func (idx *Index) SetMetadata(key, value string) error {
	_, err := idx.db.Exec(`INSERT OR REPLACE INTO metadata (key, value) VALUES (?, ?)`, key, value)
	return err
}
| 221 | |||
| 222 | // GetMetadata retrieves metadata | ||
| 223 | func (idx *Index) GetMetadata(key string) (string, error) { | ||
| 224 | var value string | ||
| 225 | err := idx.db.QueryRow(`SELECT value FROM metadata WHERE key = ?`, key).Scan(&value) | ||
| 226 | if err == sql.ErrNoRows { | ||
| 227 | return "", nil | ||
| 228 | } | ||
| 229 | return value, err | ||
| 230 | } | ||
| 231 | |||
// serializeEmbedding packs a float32 vector into a little-endian byte blob,
// 4 bytes per component, for storage in the chunks.embedding BLOB column.
func serializeEmbedding(embedding []float32) []byte {
	out := make([]byte, 0, len(embedding)*4)
	for _, v := range embedding {
		out = binary.LittleEndian.AppendUint32(out, math.Float32bits(v))
	}
	return out
}
| 240 | |||
// deserializeEmbedding unpacks a little-endian blob produced by
// serializeEmbedding back into a float32 vector.
func deserializeEmbedding(data []byte) []float32 {
	out := make([]float32, len(data)/4)
	for i := range out {
		out[i] = math.Float32frombits(binary.LittleEndian.Uint32(data[4*i:]))
	}
	return out
}
| 251 | |||
// cosineDistance returns 1 - cos(a, b): 0 means identical direction, larger
// means less similar. Length mismatches and zero vectors are treated as
// maximally distant (1.0) rather than erroring.
func cosineDistance(a, b []float32) float64 {
	if len(a) != len(b) {
		return 1.0
	}

	var dot, na, nb float64
	for i, av := range a {
		x, y := float64(av), float64(b[i])
		dot += x * y
		na += x * x
		nb += y * y
	}

	if na == 0 || nb == 0 {
		return 1.0
	}
	return 1.0 - dot/(math.Sqrt(na)*math.Sqrt(nb))
}
diff --git a/internal/walker/walker.go b/internal/walker/walker.go new file mode 100644 index 0000000..0ac470d --- /dev/null +++ b/internal/walker/walker.go | |||
| @@ -0,0 +1,109 @@ | |||
| 1 | package walker | ||
| 2 | |||
| 3 | import ( | ||
| 4 | "os" | ||
| 5 | "path/filepath" | ||
| 6 | "strings" | ||
| 7 | |||
| 8 | ignore "github.com/sabhiram/go-gitignore" | ||
| 9 | ) | ||
| 10 | |||
// DefaultIgnore lists directory patterns (slash-terminated, relative to the
// walk root) that are always skipped, regardless of .gitignore contents.
var DefaultIgnore = []string{
	"vendor/",
	"node_modules/",
	".git/",
	".codevec/",
}
| 18 | |||
// Walker walks a directory tree collecting files to index, honoring an
// extension filter, DefaultIgnore, and the root .gitignore.
type Walker struct {
	root       string            // absolute walk root
	extensions []string          // accepted extensions, e.g. [".go"]; empty = all
	gitignore  *ignore.GitIgnore // nil when no .gitignore was loaded
}
| 25 | |||
| 26 | // New creates a walker for the given root directory | ||
| 27 | func New(root string, extensions []string) (*Walker, error) { | ||
| 28 | root, err := filepath.Abs(root) | ||
| 29 | if err != nil { | ||
| 30 | return nil, err | ||
| 31 | } | ||
| 32 | |||
| 33 | w := &Walker{ | ||
| 34 | root: root, | ||
| 35 | extensions: extensions, | ||
| 36 | } | ||
| 37 | |||
| 38 | // Load .gitignore if present | ||
| 39 | gitignorePath := filepath.Join(root, ".gitignore") | ||
| 40 | if _, err := os.Stat(gitignorePath); err == nil { | ||
| 41 | gi, err := ignore.CompileIgnoreFile(gitignorePath) | ||
| 42 | if err == nil { | ||
| 43 | w.gitignore = gi | ||
| 44 | } | ||
| 45 | } | ||
| 46 | |||
| 47 | return w, nil | ||
| 48 | } | ||
| 49 | |||
| 50 | // Walk returns all matching files in the directory tree | ||
| 51 | func (w *Walker) Walk() ([]string, error) { | ||
| 52 | var files []string | ||
| 53 | |||
| 54 | err := filepath.WalkDir(w.root, func(path string, d os.DirEntry, err error) error { | ||
| 55 | if err != nil { | ||
| 56 | return err | ||
| 57 | } | ||
| 58 | |||
| 59 | // Get path relative to root for ignore matching | ||
| 60 | relPath, err := filepath.Rel(w.root, path) | ||
| 61 | if err != nil { | ||
| 62 | return err | ||
| 63 | } | ||
| 64 | |||
| 65 | // Skip default ignored directories | ||
| 66 | if d.IsDir() { | ||
| 67 | for _, pattern := range DefaultIgnore { | ||
| 68 | if strings.HasPrefix(relPath+"/", pattern) || relPath+"/" == pattern { | ||
| 69 | return filepath.SkipDir | ||
| 70 | } | ||
| 71 | } | ||
| 72 | } | ||
| 73 | |||
| 74 | // Skip if matched by .gitignore | ||
| 75 | if w.gitignore != nil && w.gitignore.MatchesPath(relPath) { | ||
| 76 | if d.IsDir() { | ||
| 77 | return filepath.SkipDir | ||
| 78 | } | ||
| 79 | return nil | ||
| 80 | } | ||
| 81 | |||
| 82 | // Skip directories and non-matching extensions | ||
| 83 | if d.IsDir() { | ||
| 84 | return nil | ||
| 85 | } | ||
| 86 | |||
| 87 | if !w.matchesExtension(path) { | ||
| 88 | return nil | ||
| 89 | } | ||
| 90 | |||
| 91 | files = append(files, path) | ||
| 92 | return nil | ||
| 93 | }) | ||
| 94 | |||
| 95 | return files, err | ||
| 96 | } | ||
| 97 | |||
| 98 | func (w *Walker) matchesExtension(path string) bool { | ||
| 99 | if len(w.extensions) == 0 { | ||
| 100 | return true | ||
| 101 | } | ||
| 102 | ext := filepath.Ext(path) | ||
| 103 | for _, e := range w.extensions { | ||
| 104 | if ext == e { | ||
| 105 | return true | ||
| 106 | } | ||
| 107 | } | ||
| 108 | return false | ||
| 109 | } | ||
