From 9ecdd63319b897e77e89b5dd8d0ee9264f0be0ff Mon Sep 17 00:00:00 2001 From: Clawd Date: Thu, 5 Mar 2026 07:14:35 -0800 Subject: Update plan: Cobra, internal API, no tests --- PLAN.md | 55 +++++++++++++++++++++++++------------------------------ 1 file changed, 25 insertions(+), 30 deletions(-) (limited to 'PLAN.md') diff --git a/PLAN.md b/PLAN.md index a7253e1..4372a7a 100644 --- a/PLAN.md +++ b/PLAN.md @@ -32,9 +32,9 @@ codevec/ **Tasks:** - [ ] `go mod init code.northwest.io/codevec` -- [ ] Basic CLI with cobra or just flag package +- [ ] Basic CLI with Cobra - [ ] Subcommands: `index`, `query`, `status` -- [ ] Makefile with `build`, `test`, `install` +- [ ] Makefile with `build`, `install` --- @@ -58,9 +58,7 @@ func (w *Walker) Walk() ([]string, error) - [ ] Implement directory walking with `filepath.WalkDir` - [ ] Parse `.gitignore` patterns (use `go-gitignore` or similar) - [ ] Filter to `.go` files only (configurable later) -- [ ] Skip `vendor/`, `testdata/`, `*_test.go` by default (configurable) - -**Test:** Walk the `nostr` SDK repo, verify correct file list. +- [ ] Skip `vendor/`, `testdata/` by default (configurable) --- @@ -101,13 +99,11 @@ type Chunker interface { - [ ] Handle edge cases: empty files, syntax errors (skip gracefully) - [ ] Chunk size limit: if function > 1000 tokens, note it but keep whole -**Test:** Chunk `nostr/relay.go`, verify functions extracted correctly. - --- ## Phase 4: Embedding Generation -Generate embeddings via OpenAI API. +Generate embeddings via OpenAI-compatible API (internal endpoint). 
**Input:** List of chunks **Output:** Chunks with embedding vectors @@ -117,21 +113,25 @@ type Embedder interface { Embed(ctx context.Context, texts []string) ([][]float32, error) } -type OpenAIEmbedder struct { - apiKey string - model string // "text-embedding-3-small" +type HTTPEmbedder struct { + baseURL string // defaults to OpenAI, configurable for internal API + apiKey string + model string // "text-embedding-3-small" } ``` -**Batching:** OpenAI supports up to 2048 inputs per request. Batch chunks to minimize API calls. +**Batching:** Batch chunks to minimize API calls (~100 per request). + +**Config:** +- `OPENAI_API_KEY` — API key (standard env var) +- `OPENAI_BASE_URL` — Override endpoint for internal API (optional) +- `--model` flag for model selection **Tasks:** -- [ ] Implement OpenAI embedding client (stdlib `net/http`, no SDK) -- [ ] Batch requests (100 chunks per request to stay safe) +- [ ] Implement OpenAI-compatible embedding client (stdlib `net/http`) +- [ ] Support custom base URL for internal API +- [ ] Batch requests - [ ] Handle rate limits with exponential backoff -- [ ] Config: model selection, API key from env `OPENAI_API_KEY` - -**Test:** Embed a few chunks, verify 1536-dim vectors returned. --- @@ -182,8 +182,6 @@ LIMIT 10; - [ ] Query by vector similarity - [ ] Store in `.codevec/index.db` -**Test:** Insert chunks, query, verify results ranked by similarity. - --- ## Phase 6: CLI Commands @@ -270,11 +268,8 @@ Only re-index changed files. --- -## Phase 8: Testing & Polish +## Phase 8: Polish -- [ ] Unit tests for chunker -- [ ] Unit tests for walker -- [ ] Integration test: index small repo, query, verify results - [ ] Error handling: missing API key, parse failures, network errors - [ ] README with usage examples - [ ] `make install` to put binary in PATH @@ -316,17 +311,17 @@ require ( | 5. Storage | 2 hr | | 6. CLI | 1 hr | | 7. Incremental | 1 hr | -| 8. Polish | 1 hr | -| **Total** | ~10 hr | +| 8. 
Polish | 30 min | +| **Total** | ~9 hr | --- -## Open Decisions +## Decisions -1. **CLI framework:** `cobra` vs stdlib `flag`? Leaning stdlib for simplicity. -2. **Config file:** YAML in `.codevec/config.yaml` or just flags? -3. **Chunk overlap:** Include N lines of context above/below functions? -4. **Test files:** Index `*_test.go` by default or skip? +1. **CLI framework:** Cobra +2. **Config:** Flags preferred; config file only if complexity warrants it +3. **Test files:** Index `*_test.go` by default (useful context) +4. **Tests:** None — move fast --- -- cgit v1.2.3