From f1ff85c7acad6b2ae7ec10720619ef2023cb7dc9 Mon Sep 17 00:00:00 2001 From: Clawd Date: Thu, 5 Mar 2026 07:29:00 -0800 Subject: Implement core: walker, chunker, embedder, index, CLI --- internal/chunker/chunker.go | 185 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 internal/chunker/chunker.go (limited to 'internal/chunker/chunker.go') diff --git a/internal/chunker/chunker.go b/internal/chunker/chunker.go new file mode 100644 index 0000000..f8de08d --- /dev/null +++ b/internal/chunker/chunker.go @@ -0,0 +1,185 @@ +package chunker + +import ( + "crypto/sha256" + "fmt" + "os" + "strings" + + sitter "github.com/smacker/go-tree-sitter" + "github.com/smacker/go-tree-sitter/golang" +) + +// Chunk represents a semantically meaningful piece of code +type Chunk struct { + File string + StartLine int + EndLine int + Type string // "function", "method", "type" + Name string + Content string + Hash string +} + +// Chunker extracts semantic chunks from source code +type Chunker interface { + Chunk(path string, content []byte) ([]Chunk, error) +} + +// GoChunker extracts chunks from Go source files using tree-sitter +type GoChunker struct { + parser *sitter.Parser +} + +// NewGoChunker creates a new Go chunker +func NewGoChunker() *GoChunker { + parser := sitter.NewParser() + parser.SetLanguage(golang.GetLanguage()) + return &GoChunker{parser: parser} +} + +// ChunkFile reads and chunks a file +func (c *GoChunker) ChunkFile(path string) ([]Chunk, error) { + content, err := os.ReadFile(path) + if err != nil { + return nil, err + } + return c.Chunk(path, content) +} + +// Chunk extracts semantic chunks from Go source +func (c *GoChunker) Chunk(path string, content []byte) ([]Chunk, error) { + tree := c.parser.Parse(nil, content) + if tree == nil { + return nil, fmt.Errorf("failed to parse %s", path) + } + defer tree.Close() + + var chunks []Chunk + root := tree.RootNode() + + // Walk top-level declarations + for i := 0; i < int(root.ChildCount()); i++ { + node := root.Child(i) + chunk := c.extractChunk(node, content, path) + if chunk != nil { + chunks = append(chunks, *chunk) + } + } + + return chunks, nil +} + +func (c *GoChunker) extractChunk(node *sitter.Node, content []byte, path string) *Chunk { + nodeType := node.Type() + + switch nodeType { + case "function_declaration": + return c.extractFunction(node, content, path) + case "method_declaration": + return c.extractMethod(node, content, path) + case "type_declaration": + return c.extractType(node, content, path) + } + + return nil +} + +func (c *GoChunker) extractFunction(node *sitter.Node, content []byte, path string) *Chunk { + nameNode := node.ChildByFieldName("name") + if nameNode == nil { + return nil + } + + name := string(content[nameNode.StartByte():nameNode.EndByte()]) + text := string(content[node.StartByte():node.EndByte()]) + + return &Chunk{ + File: path, + StartLine: int(node.StartPoint().Row) + 1, + EndLine: int(node.EndPoint().Row) + 1, + Type: "function", + Name: name, + Content: text, + Hash: hash(text), + } +} + +func (c *GoChunker) extractMethod(node *sitter.Node, content []byte, path string) *Chunk { + nameNode := node.ChildByFieldName("name") + receiverNode := node.ChildByFieldName("receiver") + if nameNode == nil { + return nil + } + + name := string(content[nameNode.StartByte():nameNode.EndByte()]) + + // Build receiver prefix like (*Server) or (s Server) + if receiverNode != nil { + recvText := string(content[receiverNode.StartByte():receiverNode.EndByte()]) + // Extract type from receiver, e.g., "(s *Server)" -> "*Server" + recvType := extractReceiverType(recvText) + if recvType != "" { + name = fmt.Sprintf("(%s).%s", recvType, name) + } + } + + text := string(content[node.StartByte():node.EndByte()]) + + return &Chunk{ + File: path, + StartLine: int(node.StartPoint().Row) + 1, + EndLine: int(node.EndPoint().Row) + 1, + Type: "method", + Name: name, + Content: text, + Hash: hash(text), + } +} + +func (c *GoChunker) extractType(node *sitter.Node, content []byte, path string) *Chunk { + // type_declaration contains type_spec children + for i := 0; i < int(node.ChildCount()); i++ { + child := node.Child(i) + if child.Type() == "type_spec" { + nameNode := child.ChildByFieldName("name") + if nameNode == nil { + continue + } + + name := string(content[nameNode.StartByte():nameNode.EndByte()]) + text := string(content[node.StartByte():node.EndByte()]) + + return &Chunk{ + File: path, + StartLine: int(node.StartPoint().Row) + 1, + EndLine: int(node.EndPoint().Row) + 1, + Type: "type", + Name: name, + Content: text, + Hash: hash(text), + } + } + } + return nil +} + +// extractReceiverType extracts the type from a receiver like "(s *Server)" -> "*Server" +func extractReceiverType(recv string) string { + // Remove parens + recv = strings.TrimPrefix(recv, "(") + recv = strings.TrimSuffix(recv, ")") + recv = strings.TrimSpace(recv) + + // Split on space, take last part (the type) + parts := strings.Fields(recv) + if len(parts) == 0 { + return "" + } + return parts[len(parts)-1] +} + +func hash(s string) string { + h := sha256.Sum256([]byte(s)) + return fmt.Sprintf("%x", h[:8]) // First 8 bytes = 16 hex chars +} -- cgit v1.2.3