diff options
Diffstat (limited to 'PLAN.md')
| -rw-r--r-- | PLAN.md | 342 |
1 files changed, 342 insertions, 0 deletions
| @@ -0,0 +1,342 @@ | |||
| 1 | # codevec Implementation Plan | ||
| 2 | |||
| 3 | **Goal:** Build a CLI that indexes Go codebases for semantic search. | ||
| 4 | |||
| 5 | **Scope:** Go-only MVP, then expand to TypeScript/Python. | ||
| 6 | |||
| 7 | --- | ||
| 8 | |||
| 9 | ## Phase 1: Project Skeleton | ||
| 10 | |||
| 11 | Set up the basic Go project structure. | ||
| 12 | |||
| 13 | ``` | ||
| 14 | codevec/ | ||
| 15 | ├── cmd/ | ||
| 16 | │ └── codevec/ | ||
| 17 | │ └── main.go # CLI entry point | ||
| 18 | ├── internal/ | ||
| 19 | │ ├── chunker/ | ||
| 20 | │ │ └── chunker.go # Interface + Go implementation | ||
| 21 | │ ├── embedder/ | ||
| 22 | │ │ └── embedder.go # OpenAI embedding client | ||
| 23 | │ ├── index/ | ||
| 24 | │ │ └── index.go # sqlite-vec storage layer | ||
| 25 | │ └── walker/ | ||
| 26 | │ └── walker.go # File discovery + .gitignore | ||
| 27 | ├── go.mod | ||
| 28 | ├── go.sum | ||
| 29 | ├── Makefile | ||
| 30 | └── README.md | ||
| 31 | ``` | ||
| 32 | |||
| 33 | **Tasks:** | ||
| 34 | - [ ] `go mod init code.northwest.io/codevec` | ||
| 35 | - [ ] Basic CLI using `cobra` or just the stdlib `flag` package | ||
| 36 | - [ ] Subcommands: `index`, `query`, `status` | ||
| 37 | - [ ] Makefile with `build`, `test`, `install` | ||
| 38 | |||
| 39 | --- | ||
| 40 | |||
| 41 | ## Phase 2: File Walker | ||
| 42 | |||
| 43 | Walk a directory, respect .gitignore, filter by extension. | ||
| 44 | |||
| 45 | **Input:** Root path | ||
| 46 | **Output:** List of `.go` files to index | ||
| 47 | |||
| 48 | ```go | ||
| 49 | type Walker struct { | ||
| 50 | root string | ||
| 51 | ignores []string // from .gitignore | ||
| 52 | } | ||
| 53 | |||
| 54 | func (w *Walker) Walk() ([]string, error) | ||
| 55 | ``` | ||
| 56 | |||
| 57 | **Tasks:** | ||
| 58 | - [ ] Implement directory walking with `filepath.WalkDir` | ||
| 59 | - [ ] Parse `.gitignore` patterns (use `go-gitignore` or similar) | ||
| 60 | - [ ] Filter to `.go` files only (configurable later) | ||
| 61 | - [ ] Skip `vendor/`, `testdata/`, `*_test.go` by default (configurable) | ||
| 62 | |||
| 63 | **Test:** Walk the `nostr` SDK repo, verify correct file list. | ||
| 64 | |||
| 65 | --- | ||
| 66 | |||
| 67 | ## Phase 3: Go Chunker (tree-sitter) | ||
| 68 | |||
| 69 | Parse Go files and extract function/type chunks. | ||
| 70 | |||
| 71 | **Input:** File path + content | ||
| 72 | **Output:** List of chunks with metadata | ||
| 73 | |||
| 74 | ```go | ||
| 75 | type Chunk struct { | ||
| 76 | File string | ||
| 77 | StartLine int | ||
| 78 | EndLine int | ||
| 79 | Type string // "function", "method", "type", "const", "var" | ||
| 80 | Name string // function/type name | ||
| 81 | Content string // raw source code | ||
| 82 | Hash string // sha256 of content | ||
| 83 | } | ||
| 84 | |||
| 85 | type Chunker interface { | ||
| 86 | Chunk(path string, content []byte) ([]Chunk, error) | ||
| 87 | } | ||
| 88 | ``` | ||
| 89 | |||
| 90 | **Go-specific extraction:** | ||
| 91 | - `function_declaration` → standalone functions | ||
| 92 | - `method_declaration` → methods (include receiver in name: `(*Server).Handle`) | ||
| 93 | - `type_declaration` → structs, interfaces | ||
| 94 | - `const_declaration` / `var_declaration` → top-level const/var blocks | ||
| 95 | |||
| 96 | **Tasks:** | ||
| 97 | - [ ] Add tree-sitter dependency: `github.com/smacker/go-tree-sitter` | ||
| 98 | - [ ] Add Go grammar: `github.com/smacker/go-tree-sitter/golang` | ||
| 99 | - [ ] Implement `GoChunker` that parses and walks AST | ||
| 100 | - [ ] Extract nodes by type, capture line numbers | ||
| 101 | - [ ] Handle edge cases: empty files, syntax errors (skip gracefully) | ||
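| 102 | - [ ] Chunk size limit: if a function exceeds 1000 tokens, note it but keep it whole (a chunk larger than the embedding model's per-input token limit will still need to be split or truncated before embedding) | ||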
| 103 | |||
| 104 | **Test:** Chunk `nostr/relay.go`, verify functions extracted correctly. | ||
| 105 | |||
| 106 | --- | ||
| 107 | |||
| 108 | ## Phase 4: Embedding Generation | ||
| 109 | |||
| 110 | Generate embeddings via OpenAI API. | ||
| 111 | |||
| 112 | **Input:** List of chunks | ||
| 113 | **Output:** Chunks with embedding vectors | ||
| 114 | |||
| 115 | ```go | ||
| 116 | type Embedder interface { | ||
| 117 | Embed(ctx context.Context, texts []string) ([][]float32, error) | ||
| 118 | } | ||
| 119 | |||
| 120 | type OpenAIEmbedder struct { | ||
| 121 | apiKey string | ||
| 122 | model string // "text-embedding-3-small" | ||
| 123 | } | ||
| 124 | ``` | ||
| 125 | |||
| 126 | **Batching:** OpenAI supports up to 2048 inputs per request. Batch chunks to minimize API calls. | ||
| 127 | |||
| 128 | **Tasks:** | ||
| 129 | - [ ] Implement OpenAI embedding client (stdlib `net/http`, no SDK) | ||
| 130 | - [ ] Batch requests (100 chunks per request to stay safe) | ||
| 131 | - [ ] Handle rate limits with exponential backoff | ||
| 132 | - [ ] Config: model selection, API key from env `OPENAI_API_KEY` | ||
| 133 | |||
| 134 | **Test:** Embed a few chunks, verify 1536-dim vectors returned. | ||
| 135 | |||
| 136 | --- | ||
| 137 | |||
| 138 | ## Phase 5: sqlite-vec Storage | ||
| 139 | |||
| 140 | Store chunks and embeddings in SQLite with vector search. | ||
| 141 | |||
| 142 | **Schema:** | ||
| 143 | ```sql | ||
| 144 | CREATE TABLE chunks ( | ||
| 145 | id INTEGER PRIMARY KEY, | ||
| 146 | file TEXT NOT NULL, | ||
| 147 | start_line INTEGER NOT NULL, | ||
| 148 | end_line INTEGER NOT NULL, | ||
| 149 | chunk_type TEXT, | ||
| 150 | name TEXT, | ||
| 151 | content TEXT NOT NULL, | ||
| 152 | hash TEXT NOT NULL, | ||
| 153 | created_at INTEGER DEFAULT (unixepoch()) | ||
| 154 | ); | ||
| 155 | |||
| 156 | CREATE TABLE files ( | ||
| 157 | path TEXT PRIMARY KEY, | ||
| 158 | hash TEXT NOT NULL, | ||
| 159 | indexed_at INTEGER DEFAULT (unixepoch()) | ||
| 160 | ); | ||
| 161 | |||
| 162 | CREATE VIRTUAL TABLE vec_chunks USING vec0( | ||
| 163 | id INTEGER PRIMARY KEY, | ||
| 164 | embedding FLOAT[1536] | ||
| 165 | ); | ||
| 166 | ``` | ||
| 167 | |||
| 168 | **Queries:** | ||
| 169 | ```sql | ||
| 170 | -- Similarity search | ||
| 171 | SELECT c.*, vec_distance_cosine(v.embedding, ?) as distance | ||
| 172 | FROM vec_chunks v | ||
| 173 | JOIN chunks c ON c.id = v.id | ||
| 174 | ORDER BY distance | ||
| 175 | LIMIT 10; | ||
| 176 | ``` | ||
| 177 | |||
| 178 | **Tasks:** | ||
| 179 | - [ ] Add sqlite-vec: `github.com/asg017/sqlite-vec-go-bindings` | ||
| 180 | - [ ] Initialize DB with schema | ||
| 181 | - [ ] Insert chunks + embeddings | ||
| 182 | - [ ] Query by vector similarity | ||
| 183 | - [ ] Store in `.codevec/index.db` | ||
| 184 | |||
| 185 | **Test:** Insert chunks, query, verify results ranked by similarity. | ||
| 186 | |||
| 187 | --- | ||
| 188 | |||
| 189 | ## Phase 6: CLI Commands | ||
| 190 | |||
| 191 | Wire everything together. | ||
| 192 | |||
| 193 | ### `codevec index <path>` | ||
| 194 | |||
| 195 | ``` | ||
| 196 | 1. Walk directory → file list | ||
| 197 | 2. For each file: | ||
| 198 | a. Check if already indexed (compare file hash) | ||
| 199 | b. Parse with tree-sitter → chunks | ||
| 200 | c. Generate embeddings (batched) | ||
| 201 | d. Store in sqlite-vec | ||
| 202 | 3. Update file manifest | ||
| 203 | 4. Print summary | ||
| 204 | ``` | ||
| 205 | |||
| 206 | **Flags:** | ||
| 207 | - `--force` — re-index everything | ||
| 208 | - `--verbose` — show progress | ||
| 209 | |||
| 210 | ### `codevec query <text>` | ||
| 211 | |||
| 212 | ``` | ||
| 213 | 1. Generate embedding for query text | ||
| 214 | 2. Search sqlite-vec for similar chunks | ||
| 215 | 3. Print results with file:line and similarity score | ||
| 216 | ``` | ||
| 217 | |||
| 218 | **Flags:** | ||
| 219 | - `--limit N` — max results (default 10) | ||
| 220 | - `--threshold F` — min similarity (default 0.5), where similarity = 1 − cosine distance | ||
| 221 | - `--show` — print chunk content | ||
| 222 | - `--json` — output as JSON | ||
| 223 | |||
| 224 | ### `codevec status` | ||
| 225 | |||
| 226 | ``` | ||
| 227 | 1. Read index.db | ||
| 228 | 2. Print stats: files, chunks, last indexed, model used | ||
| 229 | ``` | ||
| 230 | |||
| 231 | **Tasks:** | ||
| 232 | - [ ] Implement `index` command with progress bar | ||
| 233 | - [ ] Implement `query` command with formatted output | ||
| 234 | - [ ] Implement `status` command | ||
| 235 | - [ ] Add `--json` output for tool integration | ||
| 236 | |||
| 237 | --- | ||
| 238 | |||
| 239 | ## Phase 7: Incremental Updates | ||
| 240 | |||
| 241 | Only re-index changed files. | ||
| 242 | |||
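| 243 | **Manifest:** `.codevec/manifest.json` (note: this overlaps with the `files` table in the Phase 5 schema, which also stores a per-file hash and `indexed_at` — pick one as the source of truth) | ||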
| 244 | ```json | ||
| 245 | { | ||
| 246 | "files": { | ||
| 247 | "src/relay.go": { | ||
| 248 | "hash": "sha256:abc...", | ||
| 249 | "indexed_at": 1709654400 | ||
| 250 | } | ||
| 251 | }, | ||
| 252 | "model": "text-embedding-3-small", | ||
| 253 | "version": 1 | ||
| 254 | } | ||
| 255 | ``` | ||
| 256 | |||
| 257 | **Logic:** | ||
| 258 | 1. Walk directory | ||
| 259 | 2. For each file, compute hash | ||
| 260 | 3. If hash matches manifest → skip | ||
| 261 | 4. If hash differs → delete old chunks, re-index | ||
| 262 | 5. If file removed → delete chunks | ||
| 263 | 6. Update manifest | ||
| 264 | |||
| 265 | **Tasks:** | ||
| 266 | - [ ] Implement file hashing (sha256 of content) | ||
| 267 | - [ ] Compare against manifest | ||
| 268 | - [ ] Delete stale chunks on re-index | ||
| 269 | - [ ] Handle deleted files | ||
| 270 | |||
| 271 | --- | ||
| 272 | |||
| 273 | ## Phase 8: Testing & Polish | ||
| 274 | |||
| 275 | - [ ] Unit tests for chunker | ||
| 276 | - [ ] Unit tests for walker | ||
| 277 | - [ ] Integration test: index small repo, query, verify results | ||
| 278 | - [ ] Error handling: missing API key, parse failures, network errors | ||
| 279 | - [ ] README with usage examples | ||
| 280 | - [ ] `make install` to put binary in PATH | ||
| 281 | |||
| 282 | --- | ||
| 283 | |||
| 284 | ## Future (Post-MVP) | ||
| 285 | |||
| 286 | - TypeScript chunker (tree-sitter + TS grammar) | ||
| 287 | - Python chunker | ||
| 288 | - Ollama embedder for local/offline use | ||
| 289 | - `codevec serve` HTTP API | ||
| 290 | - Watch mode (re-index on file change) | ||
| 291 | - Import/export index | ||
| 292 | |||
| 293 | --- | ||
| 294 | |||
| 295 | ## Dependencies | ||
| 296 | |||
| 297 | ```go | ||
| 298 | require ( | ||
| 299 | github.com/smacker/go-tree-sitter v0.0.0-... | ||
| 300 | // Go grammar is the package github.com/smacker/go-tree-sitter/golang, which ships inside the module above (no separate require line) | ||
| 301 | github.com/asg017/sqlite-vec-go-bindings v0.0.0-... | ||
| 302 | github.com/sabhiram/go-gitignore v0.0.0-... // or similar | ||
| 303 | ) | ||
| 304 | ``` | ||
| 305 | |||
| 306 | --- | ||
| 307 | |||
| 308 | ## Estimated Effort | ||
| 309 | |||
| 310 | | Phase | Effort | | ||
| 311 | |-------|--------| | ||
| 312 | | 1. Skeleton | 30 min | | ||
| 313 | | 2. Walker | 1 hr | | ||
| 314 | | 3. Chunker | 2 hr | | ||
| 315 | | 4. Embedder | 1 hr | | ||
| 316 | | 5. Storage | 2 hr | | ||
| 317 | | 6. CLI | 1 hr | | ||
| 318 | | 7. Incremental | 1 hr | | ||
| 319 | | 8. Polish | 1 hr | | ||
| 320 | | **Total** | ~10 hr | | ||
| 321 | |||
| 322 | --- | ||
| 323 | |||
| 324 | ## Open Decisions | ||
| 325 | |||
| 326 | 1. **CLI framework:** `cobra` vs stdlib `flag`? Leaning stdlib for simplicity. | ||
| 327 | 2. **Config file:** YAML in `.codevec/config.yaml` or just flags? | ||
| 328 | 3. **Chunk overlap:** Include N lines of context above/below functions? | ||
| 329 | 4. **Test files:** Index `*_test.go` by default or skip? (Phase 2 currently assumes they are skipped by default.) | ||
| 330 | |||
| 331 | --- | ||
| 332 | |||
| 333 | ## First Milestone | ||
| 334 | |||
| 335 | End of Phase 6: Can index a Go repo and query it. | ||
| 336 | |||
| 337 | ```bash | ||
| 338 | cd ~/vault/code/nostr | ||
| 339 | codevec index . | ||
| 340 | codevec query "publish event to relay" | ||
| 341 | # → relay.go:45-89 Publish (0.87) | ||
| 342 | ``` | ||
