// Command codevec provides semantic code search: it indexes source files as
// embedding vectors and answers natural-language queries with matching code
// chunks (file path, line range, similarity score).
package main

import (
	"context"
	"crypto/sha256"
	"encoding/json"
	"fmt"
	"os"
	"path/filepath"
	"strings"

	"github.com/spf13/cobra"

	"code.northwest.io/codevec/internal/chunker"
	"code.northwest.io/codevec/internal/embedder"
	"code.northwest.io/codevec/internal/index"
	"code.northwest.io/codevec/internal/walker"
)

// On-disk index location, relative to the root of the indexed directory:
// <root>/.codevec/index.db.
const (
	codevecDir = ".codevec"
	indexFile  = "index.db"
)

var rootCmd = &cobra.Command{
	Use:   "codevec",
	Short: "Semantic code search via embeddings",
	Long: `Index your codebase and query by concept. Get relevant code chunks with file paths and line numbers.`,
}

var indexCmd = &cobra.Command{
	Use:   "index [path]",
	Short: "Index a directory for semantic search",
	Args:  cobra.MaximumNArgs(1),
	RunE:  runIndex,
}

var queryCmd = &cobra.Command{
	// Fixed: the usage string previously read "query " with a trailing space
	// and no argument placeholder, even though the command requires exactly
	// one positional argument.
	Use:   "query <text>",
	Short: "Search for relevant code",
	Args:  cobra.ExactArgs(1),
	RunE:  runQuery,
}

var statusCmd = &cobra.Command{
	Use:   "status",
	Short: "Show index statistics",
	RunE:  runStatus,
}

// init wires up per-command flags and attaches the subcommands to the root.
func init() {
	// index flags
	indexCmd.Flags().BoolP("force", "f", false, "Re-index everything")
	indexCmd.Flags().BoolP("verbose", "v", false, "Show progress")
	indexCmd.Flags().StringP("provider", "p", "ollama", "Embedding provider (ollama, openai)")
	indexCmd.Flags().StringP("model", "m", "", "Embedding model (default: provider-specific)")

	// query flags
	queryCmd.Flags().IntP("limit", "l", 10, "Max results")
	queryCmd.Flags().Float64P("threshold", "t", 0.0, "Min similarity score (0-1)")
	queryCmd.Flags().BoolP("show", "s", false, "Print chunk content")
	queryCmd.Flags().Bool("json", false, "Output as JSON")

	rootCmd.AddCommand(indexCmd)
	rootCmd.AddCommand(queryCmd)
	rootCmd.AddCommand(statusCmd)
}

func main() {
	// cobra already prints the error; just signal failure via the exit code.
	if err := rootCmd.Execute(); err != nil {
		os.Exit(1)
	}
}

// runIndex walks the target directory, chunks each Go file, embeds the
// chunks, and stores chunk + vector in the on-disk index. Files whose
// content hash is unchanged are skipped unless --force is given.
func runIndex(cmd *cobra.Command, args []string) error {
	path := "."
	if len(args) > 0 {
		path = args[0]
	}
	// Lookup errors are impossible for flags registered in init; ignore them.
	force, _ := cmd.Flags().GetBool("force")
	verbose, _ := cmd.Flags().GetBool("verbose")
	provider, _ := cmd.Flags().GetString("provider")
	model, _ := cmd.Flags().GetString("model")

	// Resolve an absolute path so stored chunk paths are stable regardless
	// of the working directory at query time.
	absPath, err := filepath.Abs(path)
	if err != nil {
		return err
	}

	// Create the embedder first: its vector dimensionality determines how
	// the index must be opened.
	emb, err := embedder.New(provider, model)
	if err != nil {
		return err
	}

	// Open (or create) the index under <path>/.codevec/index.db.
	indexPath := filepath.Join(absPath, codevecDir, indexFile)
	idx, err := index.Open(indexPath, emb.Dimensions())
	if err != nil {
		return fmt.Errorf("failed to open index: %w", err)
	}
	defer idx.Close()

	// Record provider/model/dimensions so `query` and `status` can later
	// reconstruct a compatible embedder.
	// NOTE(review): SetMetadata's result (if any) is not checked here —
	// confirm its signature and whether failures should abort.
	idx.SetMetadata("provider", provider)
	if model != "" {
		idx.SetMetadata("model", model)
	}
	idx.SetMetadata("dimensions", fmt.Sprintf("%d", emb.Dimensions()))

	// Enumerate candidate files (Go sources only).
	w, err := walker.New(absPath, []string{".go"})
	if err != nil {
		return err
	}
	files, err := w.Walk()
	if err != nil {
		return err
	}
	if verbose {
		fmt.Printf("Found %d Go files\n", len(files))
	}

	goChunker := chunker.NewGoChunker()

	var totalChunks int
	var skipped int
	ctx := context.Background()
	for _, file := range files {
		content, err := os.ReadFile(file)
		if err != nil {
			// Best-effort: one unreadable file should not abort the run.
			fmt.Fprintf(os.Stderr, "Warning: failed to read %s: %v\n", file, err)
			continue
		}

		// The content hash drives incremental indexing.
		fileHash := fmt.Sprintf("%x", sha256.Sum256(content))
		if !force {
			existingHash, _ := idx.GetFileHash(file)
			if existingHash == fileHash {
				skipped++
				continue
			}
		}

		// Drop stale chunks before inserting this file's current ones.
		idx.DeleteChunksForFile(file)

		chunks, err := goChunker.Chunk(file, content)
		if err != nil {
			fmt.Fprintf(os.Stderr, "Warning: failed to parse %s: %v\n", file, err)
			continue
		}
		if len(chunks) == 0 {
			continue
		}

		// Prefix each chunk with its relative path and declaration header so
		// the embedding carries file/name context, not just the body text.
		texts := make([]string, len(chunks))
		for i, c := range chunks {
			relPath, _ := filepath.Rel(absPath, c.File)
			texts[i] = fmt.Sprintf("File: %s\n%s %s\n\n%s", relPath, c.Type, c.Name, c.Content)
		}
		embeddings, err := emb.Embed(ctx, texts)
		if err != nil {
			return fmt.Errorf("embedding failed for %s: %w", file, err)
		}

		for i, chunk := range chunks {
			if err := idx.InsertChunk(chunk, embeddings[i]); err != nil {
				return fmt.Errorf("failed to insert chunk: %w", err)
			}
		}

		// Mark the file as indexed at this content hash.
		idx.SetFileHash(file, fileHash)
		totalChunks += len(chunks)
		if verbose {
			relPath, _ := filepath.Rel(absPath, file)
			fmt.Printf(" %s: %d chunks\n", relPath, len(chunks))
		}
	}

	fmt.Printf("Indexed %d chunks from %d files", totalChunks, len(files)-skipped)
	if skipped > 0 {
		fmt.Printf(" (%d unchanged)", skipped)
	}
	fmt.Println()
	return nil
}

// runQuery embeds the query text with the same provider/model that built the
// index, runs a nearest-neighbor search, filters by --threshold, and prints
// the results as text or JSON.
func runQuery(cmd *cobra.Command, args []string) error {
	query := args[0]
	limit, _ := cmd.Flags().GetInt("limit")
	threshold, _ := cmd.Flags().GetFloat64("threshold")
	show, _ := cmd.Flags().GetBool("show")
	jsonOutput, _ := cmd.Flags().GetBool("json")

	// The index is looked up relative to the current working directory.
	cwd, err := os.Getwd()
	if err != nil {
		return err
	}
	indexPath := filepath.Join(cwd, codevecDir, indexFile)
	if _, err := os.Stat(indexPath); os.IsNotExist(err) {
		return fmt.Errorf("no index found. Run 'codevec index' first")
	}

	// First open with placeholder dimensions purely to read back which
	// provider/model built the index; we reopen with the real dimensions
	// once the embedder exists.
	idx, err := index.Open(indexPath, 768) // temp dims, we'll read from metadata
	if err != nil {
		return err
	}
	provider, _ := idx.GetMetadata("provider")
	model, _ := idx.GetMetadata("model")
	idx.Close()
	if provider == "" {
		provider = "ollama"
	}

	emb, err := embedder.New(provider, model)
	if err != nil {
		return err
	}

	// Reopen with the embedder's actual vector size.
	idx, err = index.Open(indexPath, emb.Dimensions())
	if err != nil {
		return err
	}
	defer idx.Close()

	ctx := context.Background()
	embeddings, err := emb.Embed(ctx, []string{query})
	if err != nil {
		return fmt.Errorf("failed to embed query: %w", err)
	}

	results, err := idx.Search(embeddings[0], limit)
	if err != nil {
		return fmt.Errorf("search failed: %w", err)
	}

	// Search returns distances (lower = more similar); convert to similarity
	// (1 - distance) so --threshold works on the intuitive 0..1 scale.
	var filtered []index.SearchResult
	for _, r := range results {
		similarity := 1 - r.Distance
		if similarity >= threshold {
			filtered = append(filtered, r)
		}
	}
	results = filtered

	if jsonOutput {
		type jsonResult struct {
			File      string  `json:"file"`
			StartLine int     `json:"start_line"`
			EndLine   int     `json:"end_line"`
			Type      string  `json:"type"`
			Name      string  `json:"name"`
			Score     float64 `json:"score"`
			Content   string  `json:"content,omitempty"`
		}
		// Fixed: pre-allocate a non-nil slice so zero results encode as []
		// rather than null (a nil slice marshals to JSON null).
		output := make([]jsonResult, 0, len(results))
		for _, r := range results {
			relPath, _ := filepath.Rel(cwd, r.Chunk.File)
			jr := jsonResult{
				File:      relPath,
				StartLine: r.Chunk.StartLine,
				EndLine:   r.Chunk.EndLine,
				Type:      r.Chunk.Type,
				Name:      r.Chunk.Name,
				Score:     1 - r.Distance,
			}
			if show {
				jr.Content = r.Chunk.Content
			}
			output = append(output, jr)
		}
		enc := json.NewEncoder(os.Stdout)
		enc.SetIndent("", " ")
		return enc.Encode(output)
	}

	// Text output.
	if len(results) == 0 {
		fmt.Println("No results found")
		return nil
	}
	for _, r := range results {
		relPath, _ := filepath.Rel(cwd, r.Chunk.File)
		similarity := 1 - r.Distance
		fmt.Printf("%s:%d-%d %s (%.2f)\n", relPath, r.Chunk.StartLine, r.Chunk.EndLine, r.Chunk.Name, similarity)
		if show {
			fmt.Println(strings.Repeat("-", 40))
			fmt.Println(r.Chunk.Content)
			fmt.Println()
		}
	}
	return nil
}

// runStatus prints index location, file/chunk counts, and the stored
// provider/model/dimensions metadata.
func runStatus(cmd *cobra.Command, args []string) error {
	cwd, err := os.Getwd()
	if err != nil {
		return err
	}
	indexPath := filepath.Join(cwd, codevecDir, indexFile)
	if _, err := os.Stat(indexPath); os.IsNotExist(err) {
		fmt.Println("No index found. Run 'codevec index' first.")
		return nil
	}

	// 768 is a placeholder dimension: this command only reads metadata and
	// counts, never searches, so the real vector size is not needed here.
	// NOTE(review): confirm index.Open tolerates a dimension mismatch for
	// read-only metadata access.
	idx, err := index.Open(indexPath, 768)
	if err != nil {
		return err
	}
	defer idx.Close()

	stats, err := idx.Stats()
	if err != nil {
		return err
	}
	provider, _ := idx.GetMetadata("provider")
	model, _ := idx.GetMetadata("model")
	dims, _ := idx.GetMetadata("dimensions")

	fmt.Printf("Index: %s\n", indexPath)
	fmt.Printf("Files: %d\n", stats.Files)
	fmt.Printf("Chunks: %d\n", stats.Chunks)
	fmt.Printf("Provider: %s\n", provider)
	if model != "" {
		fmt.Printf("Model: %s\n", model)
	}
	if dims != "" {
		fmt.Printf("Dimensions: %s\n", dims)
	}
	return nil
}