From f1ff85c7acad6b2ae7ec10720619ef2023cb7dc9 Mon Sep 17 00:00:00 2001 From: Clawd Date: Thu, 5 Mar 2026 07:29:00 -0800 Subject: Implement core: walker, chunker, embedder, index, CLI --- internal/chunker/chunker.go | 185 ++++++++++++++++++++++++++++ internal/embedder/embedder.go | 222 ++++++++++++++++++++++++++++++++++ internal/index/index.go | 271 ++++++++++++++++++++++++++++++++++++++++++ internal/walker/walker.go | 109 +++++++++++++++++ 4 files changed, 787 insertions(+) create mode 100644 internal/chunker/chunker.go create mode 100644 internal/embedder/embedder.go create mode 100644 internal/index/index.go create mode 100644 internal/walker/walker.go (limited to 'internal') diff --git a/internal/chunker/chunker.go b/internal/chunker/chunker.go new file mode 100644 index 0000000..f8de08d --- /dev/null +++ b/internal/chunker/chunker.go @@ -0,0 +1,185 @@ +package chunker + +import ( + "crypto/sha256" + "fmt" + "os" + "strings" + + sitter "github.com/smacker/go-tree-sitter" + "github.com/smacker/go-tree-sitter/golang" +) + +// Chunk represents a semantically meaningful piece of code +type Chunk struct { + File string + StartLine int + EndLine int + Type string // "function", "method", "type" + Name string + Content string + Hash string +} + +// Chunker extracts semantic chunks from source code +type Chunker interface { + Chunk(path string, content []byte) ([]Chunk, error) +} + +// GoChunker extracts chunks from Go source files using tree-sitter +type GoChunker struct { + parser *sitter.Parser +} + +// NewGoChunker creates a new Go chunker +func NewGoChunker() *GoChunker { + parser := sitter.NewParser() + parser.SetLanguage(golang.GetLanguage()) + return &GoChunker{parser: parser} +} + +// ChunkFile reads and chunks a file +func (c *GoChunker) ChunkFile(path string) ([]Chunk, error) { + content, err := os.ReadFile(path) + if err != nil { + return nil, err + } + return c.Chunk(path, content) +} + +// Chunk extracts semantic chunks from Go source +func (c *GoChunker) Chunk(path string, content []byte) ([]Chunk, error) { + tree := c.parser.Parse(nil, content) + if tree == nil { + return nil, fmt.Errorf("failed to parse %s", path) + } + defer tree.Close() + + var chunks []Chunk + root := tree.RootNode() + + // Walk top-level declarations + for i := 0; i < int(root.ChildCount()); i++ { + node := root.Child(i) + chunk := c.extractChunk(node, content, path) + if chunk != nil { + chunks = append(chunks, *chunk) + } + } + + return chunks, nil +} + +func (c *GoChunker) extractChunk(node *sitter.Node, content []byte, path string) *Chunk { + nodeType := node.Type() + + switch nodeType { + case "function_declaration": + return c.extractFunction(node, content, path) + case "method_declaration": + return c.extractMethod(node, content, path) + case "type_declaration": + return c.extractType(node, content, path) + } + + return nil +} + +func (c *GoChunker) extractFunction(node *sitter.Node, content []byte, path string) *Chunk { + nameNode := node.ChildByFieldName("name") + if nameNode == nil { + return nil + } + + name := string(content[nameNode.StartByte():nameNode.EndByte()]) + text := string(content[node.StartByte():node.EndByte()]) + + return &Chunk{ + File: path, + StartLine: int(node.StartPoint().Row) + 1, + EndLine: int(node.EndPoint().Row) + 1, + Type: "function", + Name: name, + Content: text, + Hash: hash(text), + } +} + +func (c *GoChunker) extractMethod(node *sitter.Node, content []byte, path string) *Chunk { + nameNode := node.ChildByFieldName("name") + receiverNode := node.ChildByFieldName("receiver") + if nameNode == nil { + return nil + } + + name := string(content[nameNode.StartByte():nameNode.EndByte()]) + + // Build receiver prefix like (*Server) or (s Server) + if receiverNode != nil { + recvText := string(content[receiverNode.StartByte():receiverNode.EndByte()]) + // Extract type from receiver, e.g., "(s *Server)" -> "*Server" + recvType := extractReceiverType(recvText) + if recvType != "" { + name = fmt.Sprintf("(%s).%s", recvType, name) + } + } + + text := string(content[node.StartByte():node.EndByte()]) + + return &Chunk{ + File: path, + StartLine: int(node.StartPoint().Row) + 1, + EndLine: int(node.EndPoint().Row) + 1, + Type: "method", + Name: name, + Content: text, + Hash: hash(text), + } +} + +func (c *GoChunker) extractType(node *sitter.Node, content []byte, path string) *Chunk { + // type_declaration contains type_spec children + for i := 0; i < int(node.ChildCount()); i++ { + child := node.Child(i) + if child.Type() == "type_spec" { + nameNode := child.ChildByFieldName("name") + if nameNode == nil { + continue + } + + name := string(content[nameNode.StartByte():nameNode.EndByte()]) + text := string(content[node.StartByte():node.EndByte()]) + + return &Chunk{ + File: path, + StartLine: int(node.StartPoint().Row) + 1, + EndLine: int(node.EndPoint().Row) + 1, + Type: "type", + Name: name, + Content: text, + Hash: hash(text), + } + } + } + return nil +} + +// extractReceiverType extracts the type from a receiver like "(s *Server)" -> "*Server" +func extractReceiverType(recv string) string { + // Remove parens + recv = strings.TrimPrefix(recv, "(") + recv = strings.TrimSuffix(recv, ")") + recv = strings.TrimSpace(recv) + + // Split on space, take last part (the type) + parts := strings.Fields(recv) + if len(parts) == 0 { + return "" + } + return parts[len(parts)-1] +} + +func hash(s string) string { + h := sha256.Sum256([]byte(s)) + return fmt.Sprintf("%x", h[:8]) // First 8 bytes = 16 hex chars +} diff --git a/internal/embedder/embedder.go b/internal/embedder/embedder.go new file mode 100644 index 0000000..42f8518 --- /dev/null +++ b/internal/embedder/embedder.go @@ -0,0 +1,222 @@ +package embedder + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "net/http" + "os" +) + +// Embedder generates embeddings for text +type Embedder interface { + Embed(ctx context.Context, texts []string) ([][]float32, error) + Dimensions() int +} + +// OllamaEmbedder uses Ollama's embedding API +type OllamaEmbedder struct { + baseURL string + model string + dims int +} + +// NewOllamaEmbedder creates an Ollama embedder +func NewOllamaEmbedder(model string) *OllamaEmbedder { + baseURL := os.Getenv("CODEVEC_BASE_URL") + if baseURL == "" { + baseURL = "http://localhost:11434" + } + if model == "" { + model = "nomic-embed-text" + } + + // Model dimensions + dims := 768 // nomic-embed-text default + switch model { + case "mxbai-embed-large": + dims = 1024 + case "all-minilm": + dims = 384 + } + + return &OllamaEmbedder{ + baseURL: baseURL, + model: model, + dims: dims, + } +} + +func (e *OllamaEmbedder) Dimensions() int { + return e.dims +} + +type ollamaRequest struct { + Model string `json:"model"` + Prompt string `json:"prompt"` +} + +type ollamaResponse struct { + Embedding []float32 `json:"embedding"` +} + +func (e *OllamaEmbedder) Embed(ctx context.Context, texts []string) ([][]float32, error) { + embeddings := make([][]float32, len(texts)) + + // Ollama's /api/embeddings takes one prompt at a time + for i, text := range texts { + req := ollamaRequest{ + Model: e.model, + Prompt: text, + } + + body, err := json.Marshal(req) + if err != nil { + return nil, err + } + + httpReq, err := http.NewRequestWithContext(ctx, "POST", e.baseURL+"/api/embeddings", bytes.NewReader(body)) + if err != nil { + return nil, err + } + httpReq.Header.Set("Content-Type", "application/json") + + resp, err := http.DefaultClient.Do(httpReq) + if err != nil { + return nil, fmt.Errorf("ollama request failed: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("ollama returned status %d", resp.StatusCode) + } + + var result ollamaResponse + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + return nil, err + } + + embeddings[i] = result.Embedding + } + + return embeddings, nil +} + +// OpenAIEmbedder uses OpenAI-compatible embedding API +type OpenAIEmbedder struct { + baseURL string + apiKey string + model string + dims int +} + +// NewOpenAIEmbedder creates an OpenAI-compatible embedder +func NewOpenAIEmbedder(model string) *OpenAIEmbedder { + baseURL := os.Getenv("CODEVEC_BASE_URL") + if baseURL == "" { + baseURL = "https://api.openai.com" + } + apiKey := os.Getenv("CODEVEC_API_KEY") + if model == "" { + model = "text-embedding-3-small" + } + + dims := 1536 // text-embedding-3-small default + switch model { + case "text-embedding-3-large": + dims = 3072 + case "text-embedding-ada-002": + dims = 1536 + } + + return &OpenAIEmbedder{ + baseURL: baseURL, + apiKey: apiKey, + model: model, + dims: dims, + } +} + +func (e *OpenAIEmbedder) Dimensions() int { + return e.dims +} + +type openaiRequest struct { + Model string `json:"model"` + Input []string `json:"input"` +} + +type openaiResponse struct { + Data []struct { + Embedding []float32 `json:"embedding"` + } `json:"data"` +} + +func (e *OpenAIEmbedder) Embed(ctx context.Context, texts []string) ([][]float32, error) { + if e.apiKey == "" { + return nil, fmt.Errorf("CODEVEC_API_KEY not set") + } + + // Batch in groups of 100 + const batchSize = 100 + embeddings := make([][]float32, len(texts)) + + for start := 0; start < len(texts); start += batchSize { + end := start + batchSize + if end > len(texts) { + end = len(texts) + } + batch := texts[start:end] + + req := openaiRequest{ + Model: e.model, + Input: batch, + } + + body, err := json.Marshal(req) + if err != nil { + return nil, err + } + + httpReq, err := http.NewRequestWithContext(ctx, "POST", e.baseURL+"/v1/embeddings", bytes.NewReader(body)) + if err != nil { + return nil, err + } + httpReq.Header.Set("Content-Type", "application/json") + httpReq.Header.Set("Authorization", "Bearer "+e.apiKey) + + resp, err := http.DefaultClient.Do(httpReq) + if err != nil { + return nil, fmt.Errorf("openai request failed: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("openai returned status %d", resp.StatusCode) + } + + var result openaiResponse + if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { + return nil, err + } + + for i, d := range result.Data { + embeddings[start+i] = d.Embedding + } + } + + return embeddings, nil +} + +// New creates an embedder based on provider name +func New(provider, model string) (Embedder, error) { + switch provider { + case "ollama": + return NewOllamaEmbedder(model), nil + case "openai": + return NewOpenAIEmbedder(model), nil + default: + return nil, fmt.Errorf("unknown provider: %s", provider) + } +} diff --git a/internal/index/index.go b/internal/index/index.go new file mode 100644 index 0000000..008e487 --- /dev/null +++ b/internal/index/index.go @@ -0,0 +1,271 @@ +package index + +import ( + "database/sql" + "encoding/binary" + "math" + "os" + "path/filepath" + "sort" + + _ "modernc.org/sqlite" + + "code.northwest.io/codevec/internal/chunker" +) + +// Index stores chunks and embeddings in SQLite +type Index struct { + db *sql.DB + dims int +} + +// Open opens or creates an index at the given path +func Open(path string, dims int) (*Index, error) { + // Ensure directory exists + dir := filepath.Dir(path) + if err := os.MkdirAll(dir, 0755); err != nil { + return nil, err + } + + db, err := sql.Open("sqlite", path) + if err != nil { + return nil, err + } + + idx := &Index{db: db, dims: dims} + if err := idx.init(); err != nil { + db.Close() + return nil, err + } + + return idx, nil +} + +func (idx *Index) init() error { + // Create chunks table with embedding column + _, err := idx.db.Exec(` + CREATE TABLE IF NOT EXISTS chunks ( + id INTEGER PRIMARY KEY, + file TEXT NOT NULL, + start_line INTEGER NOT NULL, + end_line INTEGER NOT NULL, + chunk_type TEXT, + name TEXT, + content TEXT NOT NULL, + hash TEXT NOT NULL, + embedding BLOB, + created_at INTEGER DEFAULT (unixepoch()) + ) + `) + if err != nil { + return err + } + + // Create files table for tracking indexed files + _, err = idx.db.Exec(` + CREATE TABLE IF NOT EXISTS files ( + path TEXT PRIMARY KEY, + hash TEXT NOT NULL, + indexed_at INTEGER DEFAULT (unixepoch()) + ) + `) + if err != nil { + return err + } + + // Create metadata table + _, err = idx.db.Exec(` + CREATE TABLE IF NOT EXISTS metadata ( + key TEXT PRIMARY KEY, + value TEXT + ) + `) + if err != nil { + return err + } + + // Index on file for faster deletion + _, err = idx.db.Exec(`CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file)`) + return err +} + +// Close closes the index +func (idx *Index) Close() error { + return idx.db.Close() +} + +// InsertChunk inserts a chunk with its embedding +func (idx *Index) InsertChunk(chunk chunker.Chunk, embedding []float32) error { + embeddingBlob := serializeEmbedding(embedding) + _, err := idx.db.Exec(` + INSERT INTO chunks (file, start_line, end_line, chunk_type, name, content, hash, embedding) + VALUES (?, ?, ?, ?, ?, ?, ?, ?) + `, chunk.File, chunk.StartLine, chunk.EndLine, chunk.Type, chunk.Name, chunk.Content, chunk.Hash, embeddingBlob) + return err +} + +// SearchResult represents a search result +type SearchResult struct { + Chunk chunker.Chunk + Distance float64 +} + +// Search finds chunks similar to the query embedding using cosine similarity +func (idx *Index) Search(queryEmb []float32, limit int) ([]SearchResult, error) { + // Load all embeddings + rows, err := idx.db.Query(` + SELECT id, file, start_line, end_line, chunk_type, name, content, hash, embedding + FROM chunks + WHERE embedding IS NOT NULL + `) + if err != nil { + return nil, err + } + defer rows.Close() + + type candidate struct { + chunk chunker.Chunk + distance float64 + } + var candidates []candidate + + for rows.Next() { + var id int64 + var c chunker.Chunk + var embBlob []byte + err := rows.Scan(&id, &c.File, &c.StartLine, &c.EndLine, &c.Type, &c.Name, &c.Content, &c.Hash, &embBlob) + if err != nil { + return nil, err + } + + emb := deserializeEmbedding(embBlob) + dist := cosineDistance(queryEmb, emb) + candidates = append(candidates, candidate{chunk: c, distance: dist}) + } + + if err := rows.Err(); err != nil { + return nil, err + } + + // Sort by distance (lower is better) + sort.Slice(candidates, func(i, j int) bool { + return candidates[i].distance < candidates[j].distance + }) + + // Return top-k + if limit > len(candidates) { + limit = len(candidates) + } + + results := make([]SearchResult, limit) + for i := 0; i < limit; i++ { + results[i] = SearchResult{ + Chunk: candidates[i].chunk, + Distance: candidates[i].distance, + } + } + + return results, nil +} + +// GetFileHash returns the stored hash for a file, or empty string if not indexed +func (idx *Index) GetFileHash(path string) (string, error) { + var hash string + err := idx.db.QueryRow(`SELECT hash FROM files WHERE path = ?`, path).Scan(&hash) + if err == sql.ErrNoRows { + return "", nil + } + return hash, err +} + +// SetFileHash updates the hash for a file +func (idx *Index) SetFileHash(path, hash string) error { + _, err := idx.db.Exec(` + INSERT OR REPLACE INTO files (path, hash, indexed_at) + VALUES (?, ?, unixepoch()) + `, path, hash) + return err +} + +// DeleteChunksForFile removes all chunks for a file +func (idx *Index) DeleteChunksForFile(path string) error { + _, err := idx.db.Exec(`DELETE FROM chunks WHERE file = ?`, path) + if err != nil { + return err + } + _, err = idx.db.Exec(`DELETE FROM files WHERE path = ?`, path) + return err +} + +// Stats returns index statistics +type Stats struct { + Files int + Chunks int +} + +func (idx *Index) Stats() (Stats, error) { + var s Stats + err := idx.db.QueryRow(`SELECT COUNT(*) FROM files`).Scan(&s.Files) + if err != nil { + return s, err + } + err = idx.db.QueryRow(`SELECT COUNT(*) FROM chunks`).Scan(&s.Chunks) + return s, err +} + +// SetMetadata stores metadata +func (idx *Index) SetMetadata(key, value string) error { + _, err := idx.db.Exec(`INSERT OR REPLACE INTO metadata (key, value) VALUES (?, ?)`, key, value) + return err +} + +// GetMetadata retrieves metadata +func (idx *Index) GetMetadata(key string) (string, error) { + var value string + err := idx.db.QueryRow(`SELECT value FROM metadata WHERE key = ?`, key).Scan(&value) + if err == sql.ErrNoRows { + return "", nil + } + return value, err +} + +// serializeEmbedding converts float32 slice to bytes +func serializeEmbedding(embedding []float32) []byte { + buf := make([]byte, len(embedding)*4) + for i, v := range embedding { + binary.LittleEndian.PutUint32(buf[i*4:], math.Float32bits(v)) + } + return buf +} + +// deserializeEmbedding converts bytes back to float32 slice +func deserializeEmbedding(data []byte) []float32 { + n := len(data) / 4 + result := make([]float32, n) + for i := 0; i < n; i++ { + bits := binary.LittleEndian.Uint32(data[i*4:]) + result[i] = math.Float32frombits(bits) + } + return result +} + +// cosineDistance computes 1 - cosine_similarity (so lower is more similar) +func cosineDistance(a, b []float32) float64 { + if len(a) != len(b) { + return 1.0 + } + + var dotProduct, normA, normB float64 + for i := range a { + dotProduct += float64(a[i]) * float64(b[i]) + normA += float64(a[i]) * float64(a[i]) + normB += float64(b[i]) * float64(b[i]) + } + + if normA == 0 || normB == 0 { + return 1.0 + } + + similarity := dotProduct / (math.Sqrt(normA) * math.Sqrt(normB)) + return 1.0 - similarity +} diff --git a/internal/walker/walker.go b/internal/walker/walker.go new file mode 100644 index 0000000..0ac470d --- /dev/null +++ b/internal/walker/walker.go @@ -0,0 +1,109 @@ +package walker + +import ( + "os" + "path/filepath" + "strings" + + ignore "github.com/sabhiram/go-gitignore" +) + +// DefaultIgnore patterns applied to all walks +var DefaultIgnore = []string{ + "vendor/", + "node_modules/", + ".git/", + ".codevec/", +} + +// Walker walks a directory tree finding files to index +type Walker struct { + root string + extensions []string // e.g., [".go"] + gitignore *ignore.GitIgnore +} + +// New creates a walker for the given root directory +func New(root string, extensions []string) (*Walker, error) { + root, err := filepath.Abs(root) + if err != nil { + return nil, err + } + + w := &Walker{ + root: root, + extensions: extensions, + } + + // Load .gitignore if present + gitignorePath := filepath.Join(root, ".gitignore") + if _, err := os.Stat(gitignorePath); err == nil { + gi, err := ignore.CompileIgnoreFile(gitignorePath) + if err == nil { + w.gitignore = gi + } + } + + return w, nil +} + +// Walk returns all matching files in the directory tree +func (w *Walker) Walk() ([]string, error) { + var files []string + + err := filepath.WalkDir(w.root, func(path string, d os.DirEntry, err error) error { + if err != nil { + return err + } + + // Get path relative to root for ignore matching + relPath, err := filepath.Rel(w.root, path) + if err != nil { + return err + } + + // Skip default ignored directories + if d.IsDir() { + for _, pattern := range DefaultIgnore { + if strings.HasPrefix(relPath+"/", pattern) || relPath+"/" == pattern { + return filepath.SkipDir + } + } + } + + // Skip if matched by .gitignore + if w.gitignore != nil && w.gitignore.MatchesPath(relPath) { + if d.IsDir() { + return filepath.SkipDir + } + return nil + } + + // Skip directories and non-matching extensions + if d.IsDir() { + return nil + } + + if !w.matchesExtension(path) { + return nil + } + + files = append(files, path) + return nil + }) + + return files, err +} + +func (w *Walker) matchesExtension(path string) bool { + if len(w.extensions) == 0 { + return true + } + ext := filepath.Ext(path) + for _, e := range w.extensions { + if ext == e { + return true + } + } + return false +} -- cgit v1.2.3