diff options
| author | Clawd <ai@clawd.bot> | 2026-03-05 07:29:00 -0800 |
|---|---|---|
| committer | Clawd <ai@clawd.bot> | 2026-03-05 07:29:00 -0800 |
| commit | f1ff85c7acad6b2ae7ec10720619ef2023cb7dc9 (patch) | |
| tree | 5e694f4a2e864c9fcdfcbb1ab869c3bae05b50e3 /cmd | |
| parent | 03d8f49479b3446cf7f8ab9b6fdb2401584e3f12 (diff) | |
Implement core: walker, chunker, embedder, index, CLI
Diffstat (limited to 'cmd')
| -rw-r--r-- | cmd/codevec/main.go | 369 |
1 files changed, 369 insertions, 0 deletions
diff --git a/cmd/codevec/main.go b/cmd/codevec/main.go new file mode 100644 index 0000000..8337367 --- /dev/null +++ b/cmd/codevec/main.go | |||
| @@ -0,0 +1,369 @@ | |||
| 1 | package main | ||
| 2 | |||
| 3 | import ( | ||
| 4 | "context" | ||
| 5 | "crypto/sha256" | ||
| 6 | "encoding/json" | ||
| 7 | "fmt" | ||
| 8 | "os" | ||
| 9 | "path/filepath" | ||
| 10 | "strings" | ||
| 11 | |||
| 12 | "github.com/spf13/cobra" | ||
| 13 | |||
| 14 | "code.northwest.io/codevec/internal/chunker" | ||
| 15 | "code.northwest.io/codevec/internal/embedder" | ||
| 16 | "code.northwest.io/codevec/internal/index" | ||
| 17 | "code.northwest.io/codevec/internal/walker" | ||
| 18 | ) | ||
| 19 | |||
// On-disk layout of the index, relative to the root of the indexed tree.
const (
	// codevecDir is the directory that holds codevec's data.
	codevecDir = ".codevec"
	// indexFile is the name of the index database inside codevecDir.
	indexFile = "index.db"
)
| 22 | |||
| 23 | var rootCmd = &cobra.Command{ | ||
| 24 | Use: "codevec", | ||
| 25 | Short: "Semantic code search via embeddings", | ||
| 26 | Long: `Index your codebase and query by concept. Get relevant code chunks with file paths and line numbers.`, | ||
| 27 | } | ||
| 28 | |||
| 29 | var indexCmd = &cobra.Command{ | ||
| 30 | Use: "index [path]", | ||
| 31 | Short: "Index a directory for semantic search", | ||
| 32 | Args: cobra.MaximumNArgs(1), | ||
| 33 | RunE: runIndex, | ||
| 34 | } | ||
| 35 | |||
| 36 | var queryCmd = &cobra.Command{ | ||
| 37 | Use: "query <text>", | ||
| 38 | Short: "Search for relevant code", | ||
| 39 | Args: cobra.ExactArgs(1), | ||
| 40 | RunE: runQuery, | ||
| 41 | } | ||
| 42 | |||
| 43 | var statusCmd = &cobra.Command{ | ||
| 44 | Use: "status", | ||
| 45 | Short: "Show index statistics", | ||
| 46 | RunE: runStatus, | ||
| 47 | } | ||
| 48 | |||
| 49 | func init() { | ||
| 50 | // index flags | ||
| 51 | indexCmd.Flags().BoolP("force", "f", false, "Re-index everything") | ||
| 52 | indexCmd.Flags().BoolP("verbose", "v", false, "Show progress") | ||
| 53 | indexCmd.Flags().StringP("provider", "p", "ollama", "Embedding provider (ollama, openai)") | ||
| 54 | indexCmd.Flags().StringP("model", "m", "", "Embedding model (default: provider-specific)") | ||
| 55 | |||
| 56 | // query flags | ||
| 57 | queryCmd.Flags().IntP("limit", "l", 10, "Max results") | ||
| 58 | queryCmd.Flags().Float64P("threshold", "t", 0.0, "Min similarity score (0-1)") | ||
| 59 | queryCmd.Flags().BoolP("show", "s", false, "Print chunk content") | ||
| 60 | queryCmd.Flags().Bool("json", false, "Output as JSON") | ||
| 61 | |||
| 62 | rootCmd.AddCommand(indexCmd) | ||
| 63 | rootCmd.AddCommand(queryCmd) | ||
| 64 | rootCmd.AddCommand(statusCmd) | ||
| 65 | } | ||
| 66 | |||
| 67 | func main() { | ||
| 68 | if err := rootCmd.Execute(); err != nil { | ||
| 69 | os.Exit(1) | ||
| 70 | } | ||
| 71 | } | ||
| 72 | |||
| 73 | func runIndex(cmd *cobra.Command, args []string) error { | ||
| 74 | path := "." | ||
| 75 | if len(args) > 0 { | ||
| 76 | path = args[0] | ||
| 77 | } | ||
| 78 | |||
| 79 | force, _ := cmd.Flags().GetBool("force") | ||
| 80 | verbose, _ := cmd.Flags().GetBool("verbose") | ||
| 81 | provider, _ := cmd.Flags().GetString("provider") | ||
| 82 | model, _ := cmd.Flags().GetString("model") | ||
| 83 | |||
| 84 | // Resolve absolute path | ||
| 85 | absPath, err := filepath.Abs(path) | ||
| 86 | if err != nil { | ||
| 87 | return err | ||
| 88 | } | ||
| 89 | |||
| 90 | // Create embedder | ||
| 91 | emb, err := embedder.New(provider, model) | ||
| 92 | if err != nil { | ||
| 93 | return err | ||
| 94 | } | ||
| 95 | |||
| 96 | // Open index | ||
| 97 | indexPath := filepath.Join(absPath, codevecDir, indexFile) | ||
| 98 | idx, err := index.Open(indexPath, emb.Dimensions()) | ||
| 99 | if err != nil { | ||
| 100 | return fmt.Errorf("failed to open index: %w", err) | ||
| 101 | } | ||
| 102 | defer idx.Close() | ||
| 103 | |||
| 104 | // Store metadata | ||
| 105 | idx.SetMetadata("provider", provider) | ||
| 106 | if model != "" { | ||
| 107 | idx.SetMetadata("model", model) | ||
| 108 | } | ||
| 109 | idx.SetMetadata("dimensions", fmt.Sprintf("%d", emb.Dimensions())) | ||
| 110 | |||
| 111 | // Walk directory | ||
| 112 | w, err := walker.New(absPath, []string{".go"}) | ||
| 113 | if err != nil { | ||
| 114 | return err | ||
| 115 | } | ||
| 116 | |||
| 117 | files, err := w.Walk() | ||
| 118 | if err != nil { | ||
| 119 | return err | ||
| 120 | } | ||
| 121 | |||
| 122 | if verbose { | ||
| 123 | fmt.Printf("Found %d Go files\n", len(files)) | ||
| 124 | } | ||
| 125 | |||
| 126 | // Create chunker | ||
| 127 | goChunker := chunker.NewGoChunker() | ||
| 128 | |||
| 129 | // Process files | ||
| 130 | var totalChunks int | ||
| 131 | var skipped int | ||
| 132 | ctx := context.Background() | ||
| 133 | |||
| 134 | for _, file := range files { | ||
| 135 | // Read file content | ||
| 136 | content, err := os.ReadFile(file) | ||
| 137 | if err != nil { | ||
| 138 | fmt.Fprintf(os.Stderr, "Warning: failed to read %s: %v\n", file, err) | ||
| 139 | continue | ||
| 140 | } | ||
| 141 | |||
| 142 | // Compute file hash | ||
| 143 | fileHash := fmt.Sprintf("%x", sha256.Sum256(content)) | ||
| 144 | |||
| 145 | // Check if already indexed | ||
| 146 | if !force { | ||
| 147 | existingHash, _ := idx.GetFileHash(file) | ||
| 148 | if existingHash == fileHash { | ||
| 149 | skipped++ | ||
| 150 | continue | ||
| 151 | } | ||
| 152 | } | ||
| 153 | |||
| 154 | // Delete old chunks for this file | ||
| 155 | idx.DeleteChunksForFile(file) | ||
| 156 | |||
| 157 | // Chunk file | ||
| 158 | chunks, err := goChunker.Chunk(file, content) | ||
| 159 | if err != nil { | ||
| 160 | fmt.Fprintf(os.Stderr, "Warning: failed to parse %s: %v\n", file, err) | ||
| 161 | continue | ||
| 162 | } | ||
| 163 | |||
| 164 | if len(chunks) == 0 { | ||
| 165 | continue | ||
| 166 | } | ||
| 167 | |||
| 168 | // Generate embeddings | ||
| 169 | texts := make([]string, len(chunks)) | ||
| 170 | for i, c := range chunks { | ||
| 171 | // Include file path and name for context | ||
| 172 | relPath, _ := filepath.Rel(absPath, c.File) | ||
| 173 | texts[i] = fmt.Sprintf("File: %s\n%s %s\n\n%s", relPath, c.Type, c.Name, c.Content) | ||
| 174 | } | ||
| 175 | |||
| 176 | embeddings, err := emb.Embed(ctx, texts) | ||
| 177 | if err != nil { | ||
| 178 | return fmt.Errorf("embedding failed for %s: %w", file, err) | ||
| 179 | } | ||
| 180 | |||
| 181 | // Store chunks and embeddings | ||
| 182 | for i, chunk := range chunks { | ||
| 183 | if err := idx.InsertChunk(chunk, embeddings[i]); err != nil { | ||
| 184 | return fmt.Errorf("failed to insert chunk: %w", err) | ||
| 185 | } | ||
| 186 | } | ||
| 187 | |||
| 188 | // Update file hash | ||
| 189 | idx.SetFileHash(file, fileHash) | ||
| 190 | |||
| 191 | totalChunks += len(chunks) | ||
| 192 | if verbose { | ||
| 193 | relPath, _ := filepath.Rel(absPath, file) | ||
| 194 | fmt.Printf(" %s: %d chunks\n", relPath, len(chunks)) | ||
| 195 | } | ||
| 196 | } | ||
| 197 | |||
| 198 | fmt.Printf("Indexed %d chunks from %d files", totalChunks, len(files)-skipped) | ||
| 199 | if skipped > 0 { | ||
| 200 | fmt.Printf(" (%d unchanged)", skipped) | ||
| 201 | } | ||
| 202 | fmt.Println() | ||
| 203 | |||
| 204 | return nil | ||
| 205 | } | ||
| 206 | |||
| 207 | func runQuery(cmd *cobra.Command, args []string) error { | ||
| 208 | query := args[0] | ||
| 209 | limit, _ := cmd.Flags().GetInt("limit") | ||
| 210 | threshold, _ := cmd.Flags().GetFloat64("threshold") | ||
| 211 | show, _ := cmd.Flags().GetBool("show") | ||
| 212 | jsonOutput, _ := cmd.Flags().GetBool("json") | ||
| 213 | |||
| 214 | // Find index | ||
| 215 | cwd, err := os.Getwd() | ||
| 216 | if err != nil { | ||
| 217 | return err | ||
| 218 | } | ||
| 219 | indexPath := filepath.Join(cwd, codevecDir, indexFile) | ||
| 220 | |||
| 221 | if _, err := os.Stat(indexPath); os.IsNotExist(err) { | ||
| 222 | return fmt.Errorf("no index found. Run 'codevec index' first") | ||
| 223 | } | ||
| 224 | |||
| 225 | // Get provider/model from metadata | ||
| 226 | idx, err := index.Open(indexPath, 768) // temp dims, we'll read from metadata | ||
| 227 | if err != nil { | ||
| 228 | return err | ||
| 229 | } | ||
| 230 | |||
| 231 | provider, _ := idx.GetMetadata("provider") | ||
| 232 | model, _ := idx.GetMetadata("model") | ||
| 233 | idx.Close() | ||
| 234 | |||
| 235 | if provider == "" { | ||
| 236 | provider = "ollama" | ||
| 237 | } | ||
| 238 | |||
| 239 | // Create embedder | ||
| 240 | emb, err := embedder.New(provider, model) | ||
| 241 | if err != nil { | ||
| 242 | return err | ||
| 243 | } | ||
| 244 | |||
| 245 | // Reopen with correct dimensions | ||
| 246 | idx, err = index.Open(indexPath, emb.Dimensions()) | ||
| 247 | if err != nil { | ||
| 248 | return err | ||
| 249 | } | ||
| 250 | defer idx.Close() | ||
| 251 | |||
| 252 | // Generate query embedding | ||
| 253 | ctx := context.Background() | ||
| 254 | embeddings, err := emb.Embed(ctx, []string{query}) | ||
| 255 | if err != nil { | ||
| 256 | return fmt.Errorf("failed to embed query: %w", err) | ||
| 257 | } | ||
| 258 | |||
| 259 | // Search | ||
| 260 | results, err := idx.Search(embeddings[0], limit) | ||
| 261 | if err != nil { | ||
| 262 | return fmt.Errorf("search failed: %w", err) | ||
| 263 | } | ||
| 264 | |||
| 265 | // Filter by threshold (distance is lower = more similar) | ||
| 266 | // Convert distance to similarity for threshold comparison | ||
| 267 | var filtered []index.SearchResult | ||
| 268 | for _, r := range results { | ||
| 269 | similarity := 1 - r.Distance | ||
| 270 | if similarity >= threshold { | ||
| 271 | filtered = append(filtered, r) | ||
| 272 | } | ||
| 273 | } | ||
| 274 | results = filtered | ||
| 275 | |||
| 276 | // Output | ||
| 277 | if jsonOutput { | ||
| 278 | type jsonResult struct { | ||
| 279 | File string `json:"file"` | ||
| 280 | StartLine int `json:"start_line"` | ||
| 281 | EndLine int `json:"end_line"` | ||
| 282 | Type string `json:"type"` | ||
| 283 | Name string `json:"name"` | ||
| 284 | Score float64 `json:"score"` | ||
| 285 | Content string `json:"content,omitempty"` | ||
| 286 | } | ||
| 287 | |||
| 288 | var output []jsonResult | ||
| 289 | for _, r := range results { | ||
| 290 | relPath, _ := filepath.Rel(cwd, r.Chunk.File) | ||
| 291 | jr := jsonResult{ | ||
| 292 | File: relPath, | ||
| 293 | StartLine: r.Chunk.StartLine, | ||
| 294 | EndLine: r.Chunk.EndLine, | ||
| 295 | Type: r.Chunk.Type, | ||
| 296 | Name: r.Chunk.Name, | ||
| 297 | Score: 1 - r.Distance, | ||
| 298 | } | ||
| 299 | if show { | ||
| 300 | jr.Content = r.Chunk.Content | ||
| 301 | } | ||
| 302 | output = append(output, jr) | ||
| 303 | } | ||
| 304 | |||
| 305 | enc := json.NewEncoder(os.Stdout) | ||
| 306 | enc.SetIndent("", " ") | ||
| 307 | return enc.Encode(output) | ||
| 308 | } | ||
| 309 | |||
| 310 | // Text output | ||
| 311 | if len(results) == 0 { | ||
| 312 | fmt.Println("No results found") | ||
| 313 | return nil | ||
| 314 | } | ||
| 315 | |||
| 316 | for _, r := range results { | ||
| 317 | relPath, _ := filepath.Rel(cwd, r.Chunk.File) | ||
| 318 | similarity := 1 - r.Distance | ||
| 319 | fmt.Printf("%s:%d-%d %s (%.2f)\n", relPath, r.Chunk.StartLine, r.Chunk.EndLine, r.Chunk.Name, similarity) | ||
| 320 | if show { | ||
| 321 | fmt.Println(strings.Repeat("-", 40)) | ||
| 322 | fmt.Println(r.Chunk.Content) | ||
| 323 | fmt.Println() | ||
| 324 | } | ||
| 325 | } | ||
| 326 | |||
| 327 | return nil | ||
| 328 | } | ||
| 329 | |||
| 330 | func runStatus(cmd *cobra.Command, args []string) error { | ||
| 331 | cwd, err := os.Getwd() | ||
| 332 | if err != nil { | ||
| 333 | return err | ||
| 334 | } | ||
| 335 | indexPath := filepath.Join(cwd, codevecDir, indexFile) | ||
| 336 | |||
| 337 | if _, err := os.Stat(indexPath); os.IsNotExist(err) { | ||
| 338 | fmt.Println("No index found. Run 'codevec index' first.") | ||
| 339 | return nil | ||
| 340 | } | ||
| 341 | |||
| 342 | idx, err := index.Open(indexPath, 768) | ||
| 343 | if err != nil { | ||
| 344 | return err | ||
| 345 | } | ||
| 346 | defer idx.Close() | ||
| 347 | |||
| 348 | stats, err := idx.Stats() | ||
| 349 | if err != nil { | ||
| 350 | return err | ||
| 351 | } | ||
| 352 | |||
| 353 | provider, _ := idx.GetMetadata("provider") | ||
| 354 | model, _ := idx.GetMetadata("model") | ||
| 355 | dims, _ := idx.GetMetadata("dimensions") | ||
| 356 | |||
| 357 | fmt.Printf("Index: %s\n", indexPath) | ||
| 358 | fmt.Printf("Files: %d\n", stats.Files) | ||
| 359 | fmt.Printf("Chunks: %d\n", stats.Chunks) | ||
| 360 | fmt.Printf("Provider: %s\n", provider) | ||
| 361 | if model != "" { | ||
| 362 | fmt.Printf("Model: %s\n", model) | ||
| 363 | } | ||
| 364 | if dims != "" { | ||
| 365 | fmt.Printf("Dimensions: %s\n", dims) | ||
| 366 | } | ||
| 367 | |||
| 368 | return nil | ||
| 369 | } | ||
