From f1ff85c7acad6b2ae7ec10720619ef2023cb7dc9 Mon Sep 17 00:00:00 2001 From: Clawd Date: Thu, 5 Mar 2026 07:29:00 -0800 Subject: Implement core: walker, chunker, embedder, index, CLI --- .gitignore | 2 + Makefile | 13 ++ cmd/codevec/main.go | 369 ++++++++++++++++++++++++++++++++++++++++++ go.mod | 26 +++ go.sum | 79 +++++++++ internal/chunker/chunker.go | 185 +++++++++++++++++++++ internal/embedder/embedder.go | 222 +++++++++++++++++++++++++ internal/index/index.go | 271 +++++++++++++++++++++++++++++++ internal/walker/walker.go | 109 +++++++++++++ 9 files changed, 1276 insertions(+) create mode 100644 .gitignore create mode 100644 Makefile create mode 100644 cmd/codevec/main.go create mode 100644 go.mod create mode 100644 go.sum create mode 100644 internal/chunker/chunker.go create mode 100644 internal/embedder/embedder.go create mode 100644 internal/index/index.go create mode 100644 internal/walker/walker.go diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..96f1663 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +bin/ +.codevec/ diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..5f37491 --- /dev/null +++ b/Makefile @@ -0,0 +1,13 @@ +.PHONY: build install clean + +BINARY := codevec +BUILD_DIR := bin + +build: + go build -o $(BUILD_DIR)/$(BINARY) ./cmd/codevec + +install: build + cp $(BUILD_DIR)/$(BINARY) ~/.local/bin/ + +clean: + rm -rf $(BUILD_DIR) diff --git a/cmd/codevec/main.go b/cmd/codevec/main.go new file mode 100644 index 0000000..8337367 --- /dev/null +++ b/cmd/codevec/main.go @@ -0,0 +1,369 @@ +package main + +import ( + "context" + "crypto/sha256" + "encoding/json" + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/spf13/cobra" + + "code.northwest.io/codevec/internal/chunker" + "code.northwest.io/codevec/internal/embedder" + "code.northwest.io/codevec/internal/index" + "code.northwest.io/codevec/internal/walker" +) + +const codevecDir = ".codevec" +const indexFile = "index.db" + +var rootCmd = 
&cobra.Command{ + Use: "codevec", + Short: "Semantic code search via embeddings", + Long: `Index your codebase and query by concept. Get relevant code chunks with file paths and line numbers.`, +} + +var indexCmd = &cobra.Command{ + Use: "index [path]", + Short: "Index a directory for semantic search", + Args: cobra.MaximumNArgs(1), + RunE: runIndex, +} + +var queryCmd = &cobra.Command{ + Use: "query ", + Short: "Search for relevant code", + Args: cobra.ExactArgs(1), + RunE: runQuery, +} + +var statusCmd = &cobra.Command{ + Use: "status", + Short: "Show index statistics", + RunE: runStatus, +} + +func init() { + // index flags + indexCmd.Flags().BoolP("force", "f", false, "Re-index everything") + indexCmd.Flags().BoolP("verbose", "v", false, "Show progress") + indexCmd.Flags().StringP("provider", "p", "ollama", "Embedding provider (ollama, openai)") + indexCmd.Flags().StringP("model", "m", "", "Embedding model (default: provider-specific)") + + // query flags + queryCmd.Flags().IntP("limit", "l", 10, "Max results") + queryCmd.Flags().Float64P("threshold", "t", 0.0, "Min similarity score (0-1)") + queryCmd.Flags().BoolP("show", "s", false, "Print chunk content") + queryCmd.Flags().Bool("json", false, "Output as JSON") + + rootCmd.AddCommand(indexCmd) + rootCmd.AddCommand(queryCmd) + rootCmd.AddCommand(statusCmd) +} + +func main() { + if err := rootCmd.Execute(); err != nil { + os.Exit(1) + } +} + +func runIndex(cmd *cobra.Command, args []string) error { + path := "." 
+ if len(args) > 0 { + path = args[0] + } + + force, _ := cmd.Flags().GetBool("force") + verbose, _ := cmd.Flags().GetBool("verbose") + provider, _ := cmd.Flags().GetString("provider") + model, _ := cmd.Flags().GetString("model") + + // Resolve absolute path + absPath, err := filepath.Abs(path) + if err != nil { + return err + } + + // Create embedder + emb, err := embedder.New(provider, model) + if err != nil { + return err + } + + // Open index + indexPath := filepath.Join(absPath, codevecDir, indexFile) + idx, err := index.Open(indexPath, emb.Dimensions()) + if err != nil { + return fmt.Errorf("failed to open index: %w", err) + } + defer idx.Close() + + // Store metadata + idx.SetMetadata("provider", provider) + if model != "" { + idx.SetMetadata("model", model) + } + idx.SetMetadata("dimensions", fmt.Sprintf("%d", emb.Dimensions())) + + // Walk directory + w, err := walker.New(absPath, []string{".go"}) + if err != nil { + return err + } + + files, err := w.Walk() + if err != nil { + return err + } + + if verbose { + fmt.Printf("Found %d Go files\n", len(files)) + } + + // Create chunker + goChunker := chunker.NewGoChunker() + + // Process files + var totalChunks int + var skipped int + ctx := context.Background() + + for _, file := range files { + // Read file content + content, err := os.ReadFile(file) + if err != nil { + fmt.Fprintf(os.Stderr, "Warning: failed to read %s: %v\n", file, err) + continue + } + + // Compute file hash + fileHash := fmt.Sprintf("%x", sha256.Sum256(content)) + + // Check if already indexed + if !force { + existingHash, _ := idx.GetFileHash(file) + if existingHash == fileHash { + skipped++ + continue + } + } + + // Delete old chunks for this file + idx.DeleteChunksForFile(file) + + // Chunk file + chunks, err := goChunker.Chunk(file, content) + if err != nil { + fmt.Fprintf(os.Stderr, "Warning: failed to parse %s: %v\n", file, err) + continue + } + + if len(chunks) == 0 { + continue + } + + // Generate embeddings + texts := 
make([]string, len(chunks)) + for i, c := range chunks { + // Include file path and name for context + relPath, _ := filepath.Rel(absPath, c.File) + texts[i] = fmt.Sprintf("File: %s\n%s %s\n\n%s", relPath, c.Type, c.Name, c.Content) + } + + embeddings, err := emb.Embed(ctx, texts) + if err != nil { + return fmt.Errorf("embedding failed for %s: %w", file, err) + } + + // Store chunks and embeddings + for i, chunk := range chunks { + if err := idx.InsertChunk(chunk, embeddings[i]); err != nil { + return fmt.Errorf("failed to insert chunk: %w", err) + } + } + + // Update file hash + idx.SetFileHash(file, fileHash) + + totalChunks += len(chunks) + if verbose { + relPath, _ := filepath.Rel(absPath, file) + fmt.Printf(" %s: %d chunks\n", relPath, len(chunks)) + } + } + + fmt.Printf("Indexed %d chunks from %d files", totalChunks, len(files)-skipped) + if skipped > 0 { + fmt.Printf(" (%d unchanged)", skipped) + } + fmt.Println() + + return nil +} + +func runQuery(cmd *cobra.Command, args []string) error { + query := args[0] + limit, _ := cmd.Flags().GetInt("limit") + threshold, _ := cmd.Flags().GetFloat64("threshold") + show, _ := cmd.Flags().GetBool("show") + jsonOutput, _ := cmd.Flags().GetBool("json") + + // Find index + cwd, err := os.Getwd() + if err != nil { + return err + } + indexPath := filepath.Join(cwd, codevecDir, indexFile) + + if _, err := os.Stat(indexPath); os.IsNotExist(err) { + return fmt.Errorf("no index found. 
Run 'codevec index' first") + } + + // Get provider/model from metadata + idx, err := index.Open(indexPath, 768) // temp dims, we'll read from metadata + if err != nil { + return err + } + + provider, _ := idx.GetMetadata("provider") + model, _ := idx.GetMetadata("model") + idx.Close() + + if provider == "" { + provider = "ollama" + } + + // Create embedder + emb, err := embedder.New(provider, model) + if err != nil { + return err + } + + // Reopen with correct dimensions + idx, err = index.Open(indexPath, emb.Dimensions()) + if err != nil { + return err + } + defer idx.Close() + + // Generate query embedding + ctx := context.Background() + embeddings, err := emb.Embed(ctx, []string{query}) + if err != nil { + return fmt.Errorf("failed to embed query: %w", err) + } + + // Search + results, err := idx.Search(embeddings[0], limit) + if err != nil { + return fmt.Errorf("search failed: %w", err) + } + + // Filter by threshold (distance is lower = more similar) + // Convert distance to similarity for threshold comparison + var filtered []index.SearchResult + for _, r := range results { + similarity := 1 - r.Distance + if similarity >= threshold { + filtered = append(filtered, r) + } + } + results = filtered + + // Output + if jsonOutput { + type jsonResult struct { + File string `json:"file"` + StartLine int `json:"start_line"` + EndLine int `json:"end_line"` + Type string `json:"type"` + Name string `json:"name"` + Score float64 `json:"score"` + Content string `json:"content,omitempty"` + } + + var output []jsonResult + for _, r := range results { + relPath, _ := filepath.Rel(cwd, r.Chunk.File) + jr := jsonResult{ + File: relPath, + StartLine: r.Chunk.StartLine, + EndLine: r.Chunk.EndLine, + Type: r.Chunk.Type, + Name: r.Chunk.Name, + Score: 1 - r.Distance, + } + if show { + jr.Content = r.Chunk.Content + } + output = append(output, jr) + } + + enc := json.NewEncoder(os.Stdout) + enc.SetIndent("", " ") + return enc.Encode(output) + } + + // Text output + if 
len(results) == 0 { + fmt.Println("No results found") + return nil + } + + for _, r := range results { + relPath, _ := filepath.Rel(cwd, r.Chunk.File) + similarity := 1 - r.Distance + fmt.Printf("%s:%d-%d %s (%.2f)\n", relPath, r.Chunk.StartLine, r.Chunk.EndLine, r.Chunk.Name, similarity) + if show { + fmt.Println(strings.Repeat("-", 40)) + fmt.Println(r.Chunk.Content) + fmt.Println() + } + } + + return nil +} + +func runStatus(cmd *cobra.Command, args []string) error { + cwd, err := os.Getwd() + if err != nil { + return err + } + indexPath := filepath.Join(cwd, codevecDir, indexFile) + + if _, err := os.Stat(indexPath); os.IsNotExist(err) { + fmt.Println("No index found. Run 'codevec index' first.") + return nil + } + + idx, err := index.Open(indexPath, 768) + if err != nil { + return err + } + defer idx.Close() + + stats, err := idx.Stats() + if err != nil { + return err + } + + provider, _ := idx.GetMetadata("provider") + model, _ := idx.GetMetadata("model") + dims, _ := idx.GetMetadata("dimensions") + + fmt.Printf("Index: %s\n", indexPath) + fmt.Printf("Files: %d\n", stats.Files) + fmt.Printf("Chunks: %d\n", stats.Chunks) + fmt.Printf("Provider: %s\n", provider) + if model != "" { + fmt.Printf("Model: %s\n", model) + } + if dims != "" { + fmt.Printf("Dimensions: %s\n", dims) + } + + return nil +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..914896e --- /dev/null +++ b/go.mod @@ -0,0 +1,26 @@ +module code.northwest.io/codevec + +go 1.24.0 + +require ( + github.com/sabhiram/go-gitignore v0.0.0-20210923224102-525f6e181f06 + github.com/smacker/go-tree-sitter v0.0.0-20240827094217-dd81d9e9be82 + github.com/spf13/cobra v1.10.2 + modernc.org/sqlite v1.46.1 +) + +require ( + github.com/dustin/go-humanize v1.0.1 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/inconshreveable/mousetrap v1.1.0 // indirect + github.com/mattn/go-isatty v0.0.20 // indirect + github.com/ncruces/go-strftime v1.0.0 // indirect + 
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect + github.com/spf13/pflag v1.0.9 // indirect + golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546 // indirect + golang.org/x/sync v0.19.0 // indirect + golang.org/x/sys v0.40.0 // indirect + modernc.org/libc v1.67.6 // indirect + modernc.org/mathutil v1.7.1 // indirect + modernc.org/memory v1.11.0 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..0d5edbc --- /dev/null +++ b/go.sum @@ -0,0 +1,79 @@ +github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= +github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= +github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs= +github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= +github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= +github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= +github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= +github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= +github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= 
+github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w= +github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE= +github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= +github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= +github.com/sabhiram/go-gitignore v0.0.0-20210923224102-525f6e181f06 h1:OkMGxebDjyw0ULyrTYWeN0UNCCkmCWfjPnIA2W6oviI= +github.com/sabhiram/go-gitignore v0.0.0-20210923224102-525f6e181f06/go.mod h1:+ePHsJ1keEjQtpvf9HHw0f4ZeJ0TLRsxhunSI2hYJSs= +github.com/smacker/go-tree-sitter v0.0.0-20240827094217-dd81d9e9be82 h1:6C8qej6f1bStuePVkLSFxoU22XBS165D3klxlzRg8F4= +github.com/smacker/go-tree-sitter v0.0.0-20240827094217-dd81d9e9be82/go.mod h1:xe4pgH49k4SsmkQq5OT8abwhWmnzkhpgnXeekbx2efw= +github.com/spf13/cobra v1.10.2 h1:DMTTonx5m65Ic0GOoRY2c16WCbHxOOw6xxezuLaBpcU= +github.com/spf13/cobra v1.10.2/go.mod h1:7C1pvHqHw5A4vrJfjNwvOdzYu0Gml16OCs2GRiTUUS4= +github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY= +github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= +golang.org/x/exp 
v0.0.0-20251023183803-a4bb9ffd2546 h1:mgKeJMpvi0yx/sU5GsxQ7p6s2wtOnGAHZWCHUM4KGzY= +golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546/go.mod h1:j/pmGrbnkbPtQfxEe5D0VQhZC6qKbfKifgD0oM7sR70= +golang.org/x/mod v0.29.0 h1:HV8lRxZC4l2cr3Zq1LvtOsi/ThTgWnUk/y64QSs8GwA= +golang.org/x/mod v0.29.0/go.mod h1:NyhrlYXJ2H4eJiRy/WDBO6HMqZQ6q9nk4JzS3NuCK+w= +golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= +golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ= +golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ= +golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +modernc.org/cc/v4 v4.27.1 h1:9W30zRlYrefrDV2JE2O8VDtJ1yPGownxciz5rrbQZis= +modernc.org/cc/v4 v4.27.1/go.mod h1:uVtb5OGqUKpoLWhqwNQo/8LwvoiEBLvZXIQ/SmO6mL0= +modernc.org/ccgo/v4 v4.30.1 h1:4r4U1J6Fhj98NKfSjnPUN7Ze2c6MnAdL0hWw6+LrJpc= +modernc.org/ccgo/v4 v4.30.1/go.mod h1:bIOeI1JL54Utlxn+LwrFyjCx2n2RDiYEaJVSrgdrRfM= +modernc.org/fileutil v1.3.40 h1:ZGMswMNc9JOCrcrakF1HrvmergNLAmxOPjizirpfqBA= +modernc.org/fileutil v1.3.40/go.mod h1:HxmghZSZVAz/LXcMNwZPA/DRrQZEVP9VX0V4LQGQFOc= +modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI= +modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito= +modernc.org/gc/v3 v3.1.1 h1:k8T3gkXWY9sEiytKhcgyiZ2L0DTyCQ/nvX+LoCljoRE= +modernc.org/gc/v3 
v3.1.1/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY= +modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks= +modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI= +modernc.org/libc v1.67.6 h1:eVOQvpModVLKOdT+LvBPjdQqfrZq+pC39BygcT+E7OI= +modernc.org/libc v1.67.6/go.mod h1:JAhxUVlolfYDErnwiqaLvUqc8nfb2r6S6slAgZOnaiE= +modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU= +modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg= +modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI= +modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw= +modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8= +modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns= +modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w= +modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE= +modernc.org/sqlite v1.46.1 h1:eFJ2ShBLIEnUWlLy12raN0Z1plqmFX9Qe3rjQTKt6sU= +modernc.org/sqlite v1.46.1/go.mod h1:CzbrU2lSB1DKUusvwGz7rqEKIq+NUd8GWuBBZDs9/nA= +modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0= +modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A= +modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y= +modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM= diff --git a/internal/chunker/chunker.go b/internal/chunker/chunker.go new file mode 100644 index 0000000..f8de08d --- /dev/null +++ b/internal/chunker/chunker.go @@ -0,0 +1,185 @@ +package chunker + +import ( + "crypto/sha256" + "fmt" + "os" + "strings" + + sitter "github.com/smacker/go-tree-sitter" + "github.com/smacker/go-tree-sitter/golang" +) + +// Chunk represents a semantically meaningful piece of code +type Chunk struct { + File string + StartLine int + EndLine int + Type string // "function", "method", 
"type" + Name string + Content string + Hash string +} + +// Chunker extracts semantic chunks from source code +type Chunker interface { + Chunk(path string, content []byte) ([]Chunk, error) +} + +// GoChunker extracts chunks from Go source files using tree-sitter +type GoChunker struct { + parser *sitter.Parser +} + +// NewGoChunker creates a new Go chunker +func NewGoChunker() *GoChunker { + parser := sitter.NewParser() + parser.SetLanguage(golang.GetLanguage()) + return &GoChunker{parser: parser} +} + +// ChunkFile reads and chunks a file +func (c *GoChunker) ChunkFile(path string) ([]Chunk, error) { + content, err := os.ReadFile(path) + if err != nil { + return nil, err + } + return c.Chunk(path, content) +} + +// Chunk extracts semantic chunks from Go source +func (c *GoChunker) Chunk(path string, content []byte) ([]Chunk, error) { + tree := c.parser.Parse(nil, content) + if tree == nil { + return nil, fmt.Errorf("failed to parse %s", path) + } + defer tree.Close() + + var chunks []Chunk + root := tree.RootNode() + + // Walk top-level declarations + for i := 0; i < int(root.ChildCount()); i++ { + node := root.Child(i) + chunk := c.extractChunk(node, content, path) + if chunk != nil { + chunks = append(chunks, *chunk) + } + } + + return chunks, nil +} + +func (c *GoChunker) extractChunk(node *sitter.Node, content []byte, path string) *Chunk { + nodeType := node.Type() + + switch nodeType { + case "function_declaration": + return c.extractFunction(node, content, path) + case "method_declaration": + return c.extractMethod(node, content, path) + case "type_declaration": + return c.extractType(node, content, path) + } + + return nil +} + +func (c *GoChunker) extractFunction(node *sitter.Node, content []byte, path string) *Chunk { + nameNode := node.ChildByFieldName("name") + if nameNode == nil { + return nil + } + + name := string(content[nameNode.StartByte():nameNode.EndByte()]) + text := string(content[node.StartByte():node.EndByte()]) + + return &Chunk{ + File: 
path, + StartLine: int(node.StartPoint().Row) + 1, + EndLine: int(node.EndPoint().Row) + 1, + Type: "function", + Name: name, + Content: text, + Hash: hash(text), + } +} + +func (c *GoChunker) extractMethod(node *sitter.Node, content []byte, path string) *Chunk { + nameNode := node.ChildByFieldName("name") + receiverNode := node.ChildByFieldName("receiver") + if nameNode == nil { + return nil + } + + name := string(content[nameNode.StartByte():nameNode.EndByte()]) + + // Build receiver prefix like (*Server) or (s Server) + if receiverNode != nil { + recvText := string(content[receiverNode.StartByte():receiverNode.EndByte()]) + // Extract type from receiver, e.g., "(s *Server)" -> "*Server" + recvType := extractReceiverType(recvText) + if recvType != "" { + name = fmt.Sprintf("(%s).%s", recvType, name) + } + } + + text := string(content[node.StartByte():node.EndByte()]) + + return &Chunk{ + File: path, + StartLine: int(node.StartPoint().Row) + 1, + EndLine: int(node.EndPoint().Row) + 1, + Type: "method", + Name: name, + Content: text, + Hash: hash(text), + } +} + +func (c *GoChunker) extractType(node *sitter.Node, content []byte, path string) *Chunk { + // type_declaration contains type_spec children + for i := 0; i < int(node.ChildCount()); i++ { + child := node.Child(i) + if child.Type() == "type_spec" { + nameNode := child.ChildByFieldName("name") + if nameNode == nil { + continue + } + + name := string(content[nameNode.StartByte():nameNode.EndByte()]) + text := string(content[node.StartByte():node.EndByte()]) + + return &Chunk{ + File: path, + StartLine: int(node.StartPoint().Row) + 1, + EndLine: int(node.EndPoint().Row) + 1, + Type: "type", + Name: name, + Content: text, + Hash: hash(text), + } + } + } + return nil +} + +// extractReceiverType extracts the type from a receiver like "(s *Server)" -> "*Server" +func extractReceiverType(recv string) string { + // Remove parens + recv = strings.TrimPrefix(recv, "(") + recv = strings.TrimSuffix(recv, ")") + recv = 
strings.TrimSpace(recv) + + // Split on space, take last part (the type) + parts := strings.Fields(recv) + if len(parts) == 0 { + return "" + } + return parts[len(parts)-1] +} + +func hash(s string) string { + h := sha256.Sum256([]byte(s)) + return fmt.Sprintf("%x", h[:8]) // First 8 bytes = 16 hex chars +} diff --git a/internal/embedder/embedder.go b/internal/embedder/embedder.go new file mode 100644 index 0000000..42f8518 --- /dev/null +++ b/internal/embedder/embedder.go @@ -0,0 +1,222 @@ +package embedder + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "net/http" + "os" +) + +// Embedder generates embeddings for text +type Embedder interface { + Embed(ctx context.Context, texts []string) ([][]float32, error) + Dimensions() int +} + +// OllamaEmbedder uses Ollama's embedding API +type OllamaEmbedder struct { + baseURL string + model string + dims int +} + +// NewOllamaEmbedder creates an Ollama embedder +func NewOllamaEmbedder(model string) *OllamaEmbedder { + baseURL := os.Getenv("CODEVEC_BASE_URL") + if baseURL == "" { + baseURL = "http://localhost:11434" + } + if model == "" { + model = "nomic-embed-text" + } + + // Model dimensions + dims := 768 // nomic-embed-text default + switch model { + case "mxbai-embed-large": + dims = 1024 + case "all-minilm": + dims = 384 + } + + return &OllamaEmbedder{ + baseURL: baseURL, + model: model, + dims: dims, + } +} + +func (e *OllamaEmbedder) Dimensions() int { + return e.dims +} + +type ollamaRequest struct { + Model string `json:"model"` + Prompt string `json:"prompt"` +} + +type ollamaResponse struct { + Embedding []float32 `json:"embedding"` +} + +func (e *OllamaEmbedder) Embed(ctx context.Context, texts []string) ([][]float32, error) { + embeddings := make([][]float32, len(texts)) + + // Ollama's /api/embeddings takes one prompt at a time + for i, text := range texts { + req := ollamaRequest{ + Model: e.model, + Prompt: text, + } + + body, err := json.Marshal(req) + if err != nil { + return nil, err + } 
+ + httpReq, err := http.NewRequestWithContext(ctx, "POST", e.baseURL+"/api/embeddings", bytes.NewReader(body)) + if err != nil { + return nil, err + } + httpReq.Header.Set("Content-Type", "application/json") + + resp, err := http.DefaultClient.Do(httpReq) + if err != nil { + return nil, fmt.Errorf("ollama request failed: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("ollama returned status %d", resp.StatusCode) + } + + var result ollamaResponse + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + return nil, err + } + + embeddings[i] = result.Embedding + } + + return embeddings, nil +} + +// OpenAIEmbedder uses OpenAI-compatible embedding API +type OpenAIEmbedder struct { + baseURL string + apiKey string + model string + dims int +} + +// NewOpenAIEmbedder creates an OpenAI-compatible embedder +func NewOpenAIEmbedder(model string) *OpenAIEmbedder { + baseURL := os.Getenv("CODEVEC_BASE_URL") + if baseURL == "" { + baseURL = "https://api.openai.com" + } + apiKey := os.Getenv("CODEVEC_API_KEY") + if model == "" { + model = "text-embedding-3-small" + } + + dims := 1536 // text-embedding-3-small default + switch model { + case "text-embedding-3-large": + dims = 3072 + case "text-embedding-ada-002": + dims = 1536 + } + + return &OpenAIEmbedder{ + baseURL: baseURL, + apiKey: apiKey, + model: model, + dims: dims, + } +} + +func (e *OpenAIEmbedder) Dimensions() int { + return e.dims +} + +type openaiRequest struct { + Model string `json:"model"` + Input []string `json:"input"` +} + +type openaiResponse struct { + Data []struct { + Embedding []float32 `json:"embedding"` + } `json:"data"` +} + +func (e *OpenAIEmbedder) Embed(ctx context.Context, texts []string) ([][]float32, error) { + if e.apiKey == "" { + return nil, fmt.Errorf("CODEVEC_API_KEY not set") + } + + // Batch in groups of 100 + const batchSize = 100 + embeddings := make([][]float32, len(texts)) + + for start := 0; start < len(texts); 
start += batchSize { + end := start + batchSize + if end > len(texts) { + end = len(texts) + } + batch := texts[start:end] + + req := openaiRequest{ + Model: e.model, + Input: batch, + } + + body, err := json.Marshal(req) + if err != nil { + return nil, err + } + + httpReq, err := http.NewRequestWithContext(ctx, "POST", e.baseURL+"/v1/embeddings", bytes.NewReader(body)) + if err != nil { + return nil, err + } + httpReq.Header.Set("Content-Type", "application/json") + httpReq.Header.Set("Authorization", "Bearer "+e.apiKey) + + resp, err := http.DefaultClient.Do(httpReq) + if err != nil { + return nil, fmt.Errorf("openai request failed: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("openai returned status %d", resp.StatusCode) + } + + var result openaiResponse + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + return nil, err + } + + for i, d := range result.Data { + embeddings[start+i] = d.Embedding + } + } + + return embeddings, nil +} + +// New creates an embedder based on provider name +func New(provider, model string) (Embedder, error) { + switch provider { + case "ollama": + return NewOllamaEmbedder(model), nil + case "openai": + return NewOpenAIEmbedder(model), nil + default: + return nil, fmt.Errorf("unknown provider: %s", provider) + } +} diff --git a/internal/index/index.go b/internal/index/index.go new file mode 100644 index 0000000..008e487 --- /dev/null +++ b/internal/index/index.go @@ -0,0 +1,271 @@ +package index + +import ( + "database/sql" + "encoding/binary" + "math" + "os" + "path/filepath" + "sort" + + _ "modernc.org/sqlite" + + "code.northwest.io/codevec/internal/chunker" +) + +// Index stores chunks and embeddings in SQLite +type Index struct { + db *sql.DB + dims int +} + +// Open opens or creates an index at the given path +func Open(path string, dims int) (*Index, error) { + // Ensure directory exists + dir := filepath.Dir(path) + if err := os.MkdirAll(dir, 
0755); err != nil { + return nil, err + } + + db, err := sql.Open("sqlite", path) + if err != nil { + return nil, err + } + + idx := &Index{db: db, dims: dims} + if err := idx.init(); err != nil { + db.Close() + return nil, err + } + + return idx, nil +} + +func (idx *Index) init() error { + // Create chunks table with embedding column + _, err := idx.db.Exec(` + CREATE TABLE IF NOT EXISTS chunks ( + id INTEGER PRIMARY KEY, + file TEXT NOT NULL, + start_line INTEGER NOT NULL, + end_line INTEGER NOT NULL, + chunk_type TEXT, + name TEXT, + content TEXT NOT NULL, + hash TEXT NOT NULL, + embedding BLOB, + created_at INTEGER DEFAULT (unixepoch()) + ) + `) + if err != nil { + return err + } + + // Create files table for tracking indexed files + _, err = idx.db.Exec(` + CREATE TABLE IF NOT EXISTS files ( + path TEXT PRIMARY KEY, + hash TEXT NOT NULL, + indexed_at INTEGER DEFAULT (unixepoch()) + ) + `) + if err != nil { + return err + } + + // Create metadata table + _, err = idx.db.Exec(` + CREATE TABLE IF NOT EXISTS metadata ( + key TEXT PRIMARY KEY, + value TEXT + ) + `) + if err != nil { + return err + } + + // Index on file for faster deletion + _, err = idx.db.Exec(`CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file)`) + return err +} + +// Close closes the index +func (idx *Index) Close() error { + return idx.db.Close() +} + +// InsertChunk inserts a chunk with its embedding +func (idx *Index) InsertChunk(chunk chunker.Chunk, embedding []float32) error { + embeddingBlob := serializeEmbedding(embedding) + _, err := idx.db.Exec(` + INSERT INTO chunks (file, start_line, end_line, chunk_type, name, content, hash, embedding) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) 
+ `, chunk.File, chunk.StartLine, chunk.EndLine, chunk.Type, chunk.Name, chunk.Content, chunk.Hash, embeddingBlob) + return err +} + +// SearchResult represents a search result +type SearchResult struct { + Chunk chunker.Chunk + Distance float64 +} + +// Search finds chunks similar to the query embedding using cosine similarity +func (idx *Index) Search(queryEmb []float32, limit int) ([]SearchResult, error) { + // Load all embeddings + rows, err := idx.db.Query(` + SELECT id, file, start_line, end_line, chunk_type, name, content, hash, embedding + FROM chunks + WHERE embedding IS NOT NULL + `) + if err != nil { + return nil, err + } + defer rows.Close() + + type candidate struct { + chunk chunker.Chunk + distance float64 + } + var candidates []candidate + + for rows.Next() { + var id int64 + var c chunker.Chunk + var embBlob []byte + err := rows.Scan(&id, &c.File, &c.StartLine, &c.EndLine, &c.Type, &c.Name, &c.Content, &c.Hash, &embBlob) + if err != nil { + return nil, err + } + + emb := deserializeEmbedding(embBlob) + dist := cosineDistance(queryEmb, emb) + candidates = append(candidates, candidate{chunk: c, distance: dist}) + } + + if err := rows.Err(); err != nil { + return nil, err + } + + // Sort by distance (lower is better) + sort.Slice(candidates, func(i, j int) bool { + return candidates[i].distance < candidates[j].distance + }) + + // Return top-k + if limit > len(candidates) { + limit = len(candidates) + } + + results := make([]SearchResult, limit) + for i := 0; i < limit; i++ { + results[i] = SearchResult{ + Chunk: candidates[i].chunk, + Distance: candidates[i].distance, + } + } + + return results, nil +} + +// GetFileHash returns the stored hash for a file, or empty string if not indexed +func (idx *Index) GetFileHash(path string) (string, error) { + var hash string + err := idx.db.QueryRow(`SELECT hash FROM files WHERE path = ?`, path).Scan(&hash) + if err == sql.ErrNoRows { + return "", nil + } + return hash, err +} + +// SetFileHash updates the hash 
for a file +func (idx *Index) SetFileHash(path, hash string) error { + _, err := idx.db.Exec(` + INSERT OR REPLACE INTO files (path, hash, indexed_at) + VALUES (?, ?, unixepoch()) + `, path, hash) + return err +} + +// DeleteChunksForFile removes all chunks for a file +func (idx *Index) DeleteChunksForFile(path string) error { + _, err := idx.db.Exec(`DELETE FROM chunks WHERE file = ?`, path) + if err != nil { + return err + } + _, err = idx.db.Exec(`DELETE FROM files WHERE path = ?`, path) + return err +} + +// Stats returns index statistics +type Stats struct { + Files int + Chunks int +} + +func (idx *Index) Stats() (Stats, error) { + var s Stats + err := idx.db.QueryRow(`SELECT COUNT(*) FROM files`).Scan(&s.Files) + if err != nil { + return s, err + } + err = idx.db.QueryRow(`SELECT COUNT(*) FROM chunks`).Scan(&s.Chunks) + return s, err +} + +// SetMetadata stores metadata +func (idx *Index) SetMetadata(key, value string) error { + _, err := idx.db.Exec(`INSERT OR REPLACE INTO metadata (key, value) VALUES (?, ?)`, key, value) + return err +} + +// GetMetadata retrieves metadata +func (idx *Index) GetMetadata(key string) (string, error) { + var value string + err := idx.db.QueryRow(`SELECT value FROM metadata WHERE key = ?`, key).Scan(&value) + if err == sql.ErrNoRows { + return "", nil + } + return value, err +} + +// serializeEmbedding converts float32 slice to bytes +func serializeEmbedding(embedding []float32) []byte { + buf := make([]byte, len(embedding)*4) + for i, v := range embedding { + binary.LittleEndian.PutUint32(buf[i*4:], math.Float32bits(v)) + } + return buf +} + +// deserializeEmbedding converts bytes back to float32 slice +func deserializeEmbedding(data []byte) []float32 { + n := len(data) / 4 + result := make([]float32, n) + for i := 0; i < n; i++ { + bits := binary.LittleEndian.Uint32(data[i*4:]) + result[i] = math.Float32frombits(bits) + } + return result +} + +// cosineDistance computes 1 - cosine_similarity (so lower is more similar) +func 
cosineDistance(a, b []float32) float64 {
+	// Vectors of different lengths cannot be compared; report maximum
+	// distance rather than indexing out of range.
+	if len(a) != len(b) {
+		return 1.0
+	}
+
+	var dotProduct, normA, normB float64
+	for i := range a {
+		dotProduct += float64(a[i]) * float64(b[i])
+		normA += float64(a[i]) * float64(a[i])
+		normB += float64(b[i]) * float64(b[i])
+	}
+
+	// A zero vector has no direction; treat it as maximally distant.
+	if normA == 0 || normB == 0 {
+		return 1.0
+	}
+
+	similarity := dotProduct / (math.Sqrt(normA) * math.Sqrt(normB))
+	return 1.0 - similarity
+}
diff --git a/internal/walker/walker.go b/internal/walker/walker.go
new file mode 100644
index 0000000..0ac470d
--- /dev/null
+++ b/internal/walker/walker.go
@@ -0,0 +1,109 @@
+package walker
+
+import (
+	"os"
+	"path/filepath"
+	"strings"
+
+	ignore "github.com/sabhiram/go-gitignore"
+)
+
+// DefaultIgnore patterns applied to all walks
+var DefaultIgnore = []string{
+	"vendor/",
+	"node_modules/",
+	".git/",
+	".codevec/",
+}
+
+// Walker walks a directory tree finding files to index
+type Walker struct {
+	root       string
+	extensions []string // e.g., [".go"]
+	gitignore  *ignore.GitIgnore
+}
+
+// New creates a walker for the given root directory
+func New(root string, extensions []string) (*Walker, error) {
+	root, err := filepath.Abs(root)
+	if err != nil {
+		return nil, err
+	}
+
+	w := &Walker{
+		root:       root,
+		extensions: extensions,
+	}
+
+	// Best-effort load of the root .gitignore: a missing or unparsable
+	// file simply leaves gitignore filtering disabled. Compiling
+	// directly (instead of Stat-then-Compile) avoids a redundant
+	// syscall and a check-then-use race.
+	if gi, err := ignore.CompileIgnoreFile(filepath.Join(root, ".gitignore")); err == nil {
+		w.gitignore = gi
+	}
+
+	return w, nil
+}
+
+// Walk returns all matching files in the directory tree
+func (w *Walker) Walk() ([]string, error) {
+	var files []string
+
+	err := filepath.WalkDir(w.root, func(path string, d os.DirEntry, err error) error {
+		if err != nil {
+			return err
+		}
+
+		// Get path relative to root for ignore matching
+		relPath, err := filepath.Rel(w.root, path)
+		if err != nil {
+			return err
+		}
+
+		// Skip default ignored directories
+		if d.IsDir() {
+			for _, pattern := 
range DefaultIgnore {
+				// Match the pattern at any depth (with separators
+				// normalized for Windows), so nested vendor/ or
+				// node_modules/ trees are skipped too — the old
+				// prefix check only caught them at the repo root.
+				if strings.Contains("/"+filepath.ToSlash(relPath)+"/", "/"+pattern) {
+					return filepath.SkipDir
+				}
+			}
+		}
+
+		// Skip if matched by .gitignore
+		if w.gitignore != nil && w.gitignore.MatchesPath(relPath) {
+			if d.IsDir() {
+				return filepath.SkipDir
+			}
+			return nil
+		}
+
+		// Skip directories and non-matching extensions
+		if d.IsDir() {
+			return nil
+		}
+
+		if !w.matchesExtension(path) {
+			return nil
+		}
+
+		files = append(files, path)
+		return nil
+	})
+
+	return files, err
+}
+
+// matchesExtension reports whether path has one of the walker's
+// configured extensions; an empty extension list matches every file.
+func (w *Walker) matchesExtension(path string) bool {
+	if len(w.extensions) == 0 {
+		return true
+	}
+	ext := filepath.Ext(path)
+	for _, e := range w.extensions {
+		if ext == e {
+			return true
+		}
+	}
+	return false
+}
-- cgit v1.2.3