From f1ff85c7acad6b2ae7ec10720619ef2023cb7dc9 Mon Sep 17 00:00:00 2001 From: Clawd Date: Thu, 5 Mar 2026 07:29:00 -0800 Subject: Implement core: walker, chunker, embedder, index, CLI --- cmd/codevec/main.go | 369 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 369 insertions(+) create mode 100644 cmd/codevec/main.go (limited to 'cmd/codevec') diff --git a/cmd/codevec/main.go b/cmd/codevec/main.go new file mode 100644 index 0000000..8337367 --- /dev/null +++ b/cmd/codevec/main.go @@ -0,0 +1,369 @@ +package main + +import ( + "context" + "crypto/sha256" + "encoding/json" + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/spf13/cobra" + + "code.northwest.io/codevec/internal/chunker" + "code.northwest.io/codevec/internal/embedder" + "code.northwest.io/codevec/internal/index" + "code.northwest.io/codevec/internal/walker" +) + +const codevecDir = ".codevec" +const indexFile = "index.db" + +var rootCmd = &cobra.Command{ + Use: "codevec", + Short: "Semantic code search via embeddings", + Long: `Index your codebase and query by concept. Get relevant code chunks with file paths and line numbers.`, +} + +var indexCmd = &cobra.Command{ + Use: "index [path]", + Short: "Index a directory for semantic search", + Args: cobra.MaximumNArgs(1), + RunE: runIndex, +} + +var queryCmd = &cobra.Command{ + Use: "query ", + Short: "Search for relevant code", + Args: cobra.ExactArgs(1), + RunE: runQuery, +} + +var statusCmd = &cobra.Command{ + Use: "status", + Short: "Show index statistics", + RunE: runStatus, +} + +func init() { + // index flags + indexCmd.Flags().BoolP("force", "f", false, "Re-index everything") + indexCmd.Flags().BoolP("verbose", "v", false, "Show progress") + indexCmd.Flags().StringP("provider", "p", "ollama", "Embedding provider (ollama, openai)") + indexCmd.Flags().StringP("model", "m", "", "Embedding model (default: provider-specific)") + + // query flags + queryCmd.Flags().IntP("limit", "l", 10, "Max results") + queryCmd.Flags().Float64P("threshold", "t", 0.0, "Min similarity score (0-1)") + queryCmd.Flags().BoolP("show", "s", false, "Print chunk content") + queryCmd.Flags().Bool("json", false, "Output as JSON") + + rootCmd.AddCommand(indexCmd) + rootCmd.AddCommand(queryCmd) + rootCmd.AddCommand(statusCmd) +} + +func main() { + if err := rootCmd.Execute(); err != nil { + os.Exit(1) + } +} + +func runIndex(cmd *cobra.Command, args []string) error { + path := "." + if len(args) > 0 { + path = args[0] + } + + force, _ := cmd.Flags().GetBool("force") + verbose, _ := cmd.Flags().GetBool("verbose") + provider, _ := cmd.Flags().GetString("provider") + model, _ := cmd.Flags().GetString("model") + + // Resolve absolute path + absPath, err := filepath.Abs(path) + if err != nil { + return err + } + + // Create embedder + emb, err := embedder.New(provider, model) + if err != nil { + return err + } + + // Open index + indexPath := filepath.Join(absPath, codevecDir, indexFile) + idx, err := index.Open(indexPath, emb.Dimensions()) + if err != nil { + return fmt.Errorf("failed to open index: %w", err) + } + defer idx.Close() + + // Store metadata + idx.SetMetadata("provider", provider) + if model != "" { + idx.SetMetadata("model", model) + } + idx.SetMetadata("dimensions", fmt.Sprintf("%d", emb.Dimensions())) + + // Walk directory + w, err := walker.New(absPath, []string{".go"}) + if err != nil { + return err + } + + files, err := w.Walk() + if err != nil { + return err + } + + if verbose { + fmt.Printf("Found %d Go files\n", len(files)) + } + + // Create chunker + goChunker := chunker.NewGoChunker() + + // Process files + var totalChunks int + var skipped int + ctx := context.Background() + + for _, file := range files { + // Read file content + content, err := os.ReadFile(file) + if err != nil { + fmt.Fprintf(os.Stderr, "Warning: failed to read %s: %v\n", file, err) + continue + } + + // Compute file hash + fileHash := fmt.Sprintf("%x", sha256.Sum256(content)) + + // Check if already indexed + if !force { + existingHash, _ := idx.GetFileHash(file) + if existingHash == fileHash { + skipped++ + continue + } + } + + // Delete old chunks for this file + idx.DeleteChunksForFile(file) + + // Chunk file + chunks, err := goChunker.Chunk(file, content) + if err != nil { + fmt.Fprintf(os.Stderr, "Warning: failed to parse %s: %v\n", file, err) + continue + } + + if len(chunks) == 0 { + continue + } + + // Generate embeddings + texts := make([]string, len(chunks)) + for i, c := range chunks { + // Include file path and name for context + relPath, _ := filepath.Rel(absPath, c.File) + texts[i] = fmt.Sprintf("File: %s\n%s %s\n\n%s", relPath, c.Type, c.Name, c.Content) + } + + embeddings, err := emb.Embed(ctx, texts) + if err != nil { + return fmt.Errorf("embedding failed for %s: %w", file, err) + } + + // Store chunks and embeddings + for i, chunk := range chunks { + if err := idx.InsertChunk(chunk, embeddings[i]); err != nil { + return fmt.Errorf("failed to insert chunk: %w", err) + } + } + + // Update file hash + idx.SetFileHash(file, fileHash) + + totalChunks += len(chunks) + if verbose { + relPath, _ := filepath.Rel(absPath, file) + fmt.Printf(" %s: %d chunks\n", relPath, len(chunks)) + } + } + + fmt.Printf("Indexed %d chunks from %d files", totalChunks, len(files)-skipped) + if skipped > 0 { + fmt.Printf(" (%d unchanged)", skipped) + } + fmt.Println() + + return nil +} + +func runQuery(cmd *cobra.Command, args []string) error { + query := args[0] + limit, _ := cmd.Flags().GetInt("limit") + threshold, _ := cmd.Flags().GetFloat64("threshold") + show, _ := cmd.Flags().GetBool("show") + jsonOutput, _ := cmd.Flags().GetBool("json") + + // Find index + cwd, err := os.Getwd() + if err != nil { + return err + } + indexPath := filepath.Join(cwd, codevecDir, indexFile) + + if _, err := os.Stat(indexPath); os.IsNotExist(err) { + return fmt.Errorf("no index found. Run 'codevec index' first") + } + + // Get provider/model from metadata + idx, err := index.Open(indexPath, 768) // temp dims, we'll read from metadata + if err != nil { + return err + } + + provider, _ := idx.GetMetadata("provider") + model, _ := idx.GetMetadata("model") + idx.Close() + + if provider == "" { + provider = "ollama" + } + + // Create embedder + emb, err := embedder.New(provider, model) + if err != nil { + return err + } + + // Reopen with correct dimensions + idx, err = index.Open(indexPath, emb.Dimensions()) + if err != nil { + return err + } + defer idx.Close() + + // Generate query embedding + ctx := context.Background() + embeddings, err := emb.Embed(ctx, []string{query}) + if err != nil { + return fmt.Errorf("failed to embed query: %w", err) + } + + // Search + results, err := idx.Search(embeddings[0], limit) + if err != nil { + return fmt.Errorf("search failed: %w", err) + } + + // Filter by threshold (distance is lower = more similar) + // Convert distance to similarity for threshold comparison + var filtered []index.SearchResult + for _, r := range results { + similarity := 1 - r.Distance + if similarity >= threshold { + filtered = append(filtered, r) + } + } + results = filtered + + // Output + if jsonOutput { + type jsonResult struct { + File string `json:"file"` + StartLine int `json:"start_line"` + EndLine int `json:"end_line"` + Type string `json:"type"` + Name string `json:"name"` + Score float64 `json:"score"` + Content string `json:"content,omitempty"` + } + + var output []jsonResult + for _, r := range results { + relPath, _ := filepath.Rel(cwd, r.Chunk.File) + jr := jsonResult{ + File: relPath, + StartLine: r.Chunk.StartLine, + EndLine: r.Chunk.EndLine, + Type: r.Chunk.Type, + Name: r.Chunk.Name, + Score: 1 - r.Distance, + } + if show { + jr.Content = r.Chunk.Content + } + output = append(output, jr) + } + + enc := json.NewEncoder(os.Stdout) + enc.SetIndent("", " ") + return enc.Encode(output) + } + + // Text output + if len(results) == 0 { + fmt.Println("No results found") + return nil + } + + for _, r := range results { + relPath, _ := filepath.Rel(cwd, r.Chunk.File) + similarity := 1 - r.Distance + fmt.Printf("%s:%d-%d %s (%.2f)\n", relPath, r.Chunk.StartLine, r.Chunk.EndLine, r.Chunk.Name, similarity) + if show { + fmt.Println(strings.Repeat("-", 40)) + fmt.Println(r.Chunk.Content) + fmt.Println() + } + } + + return nil +} + +func runStatus(cmd *cobra.Command, args []string) error { + cwd, err := os.Getwd() + if err != nil { + return err + } + indexPath := filepath.Join(cwd, codevecDir, indexFile) + + if _, err := os.Stat(indexPath); os.IsNotExist(err) { + fmt.Println("No index found. Run 'codevec index' first.") + return nil + } + + idx, err := index.Open(indexPath, 768) + if err != nil { + return err + } + defer idx.Close() + + stats, err := idx.Stats() + if err != nil { + return err + } + + provider, _ := idx.GetMetadata("provider") + model, _ := idx.GetMetadata("model") + dims, _ := idx.GetMetadata("dimensions") + + fmt.Printf("Index: %s\n", indexPath) + fmt.Printf("Files: %d\n", stats.Files) + fmt.Printf("Chunks: %d\n", stats.Chunks) + fmt.Printf("Provider: %s\n", provider) + if model != "" { + fmt.Printf("Model: %s\n", model) + } + if dims != "" { + fmt.Printf("Dimensions: %s\n", dims) + } + + return nil +} -- cgit v1.2.3