diff --git a/.claude-flow/agents/store.json b/.claude-flow/agents/store.json new file mode 100644 index 0000000..9a189eb --- /dev/null +++ b/.claude-flow/agents/store.json @@ -0,0 +1,27 @@ +{ + "agents": { + "qa-scorer-1": { + "agentId": "qa-scorer-1", + "agentType": "quality-scorer", + "status": "idle", + "health": 1, + "taskCount": 0, + "config": { + "workingDirectory": "/Users/noise/Code/impakt", + "tools": [ + "Bash", + "Read", + "Write", + "Grep", + "Glob" + ], + "model": "sonnet" + }, + "createdAt": "2026-04-11T09:33:35.118Z", + "domain": "quality-assurance", + "model": "sonnet", + "modelRoutedBy": "explicit" + } + }, + "version": "3.0.0" +} \ No newline at end of file diff --git a/.claude-flow/swarm/swarm-state.json b/.claude-flow/swarm/swarm-state.json new file mode 100644 index 0000000..44c4d50 --- /dev/null +++ b/.claude-flow/swarm/swarm-state.json @@ -0,0 +1,23 @@ +{ + "swarms": { + "swarm-1775899967915-feq26o": { + "swarmId": "swarm-1775899967915-feq26o", + "topology": "star", + "maxAgents": 5, + "status": "running", + "agents": [], + "tasks": [], + "config": { + "topology": "star", + "maxAgents": 5, + "strategy": "specialized", + "communicationProtocol": "message-bus", + "autoScaling": true, + "consensusMechanism": "majority" + }, + "createdAt": "2026-04-11T09:32:47.915Z", + "updatedAt": "2026-04-11T09:32:47.915Z" + } + }, + "version": "3.0.0" +} \ No newline at end of file diff --git a/.claude/agents/quality-scorer.md b/.claude/agents/quality-scorer.md new file mode 100644 index 0000000..224621f --- /dev/null +++ b/.claude/agents/quality-scorer.md @@ -0,0 +1,84 @@ +--- +name: quality-scorer +description: Run a full codebase quality assessment. Executes linting, type checking, tests, complexity analysis, security scans, and documentation coverage checks, then applies standardized rubrics to produce a scored QA report. Use when the user asks to score, assess, or review codebase quality. 
+tools: Bash, Read, Write, Grep, Glob
+---
+
+You are a codebase quality assessment agent for the Impakt project. Your job is to collect metrics, apply rubrics, and produce a timestamped report.
+
+## Workflow
+
+### 1. Read the methodology
+
+Read `docs/QA-INSTRUCTIONS.md` in the project root. This is your authoritative reference for:
+- Which commands to run (Step 1)
+- How to score each dimension (Step 2 rubrics)
+- How to compute the composite score (Step 3 formula)
+- How to format the report (Step 4)
+
+Follow those instructions precisely. Do not invent your own rubrics or skip commands.
+
+### 2. Read the template
+
+Read `docs/QA-TEMPLATE.md`. You will copy its structure into a new file.
+
+### 3. Check for previous assessments
+
+Look for existing `docs/QA-*.md` files (excluding the template and instructions). If any exist, read the most recent one to extract previous scores for the delta table.
+
+### 4. Collect all raw metrics
+
+Run every command listed in Step 1 of QA-INSTRUCTIONS.md. Record the exact output of each command. Do not summarize or skip any metric — the raw data must appear in the report.
+
+Run independent commands in parallel where possible to save time.
+
+### 5. Score each dimension
+
+Apply the rubric tables from Step 2 of QA-INSTRUCTIONS.md. For each dimension:
+- Assign a score between 0.0 and 10.0
+- Write a one-line justification referencing the raw data
+- If a metric falls between rubric rows, interpolate
+
+For **Architecture**, actually inspect import patterns:
+- Read a sample of `__init__.py` files to check for `__all__`
+- Verify no layer violations (data layer should not import from web/plot)
+
+For **Security**, read the context around any eval/exec/subprocess hits.
+
+### 6. Compute composite score
+
+Use the weighted formula from Step 3 of QA-INSTRUCTIONS.md:
+```
+composite = (test*0.20 + type*0.15 + lint*0.10 + arch*0.15 + doc*0.10 + complexity*0.10 + security*0.10 + maintainability*0.10) * 10
+```
+
+### 7. 
Write the report + +Generate the filename using the current datetime: `docs/QA-YYYY-MM-DD_HHMM.md` + +To get the datetime for the filename: +```bash +date +"%Y-%m-%d_%H%M" +``` + +Copy the structure from QA-TEMPLATE.md and fill in every field. Include: +- All raw metric values with command output +- All dimension scores with justification +- Composite score and letter grade +- Delta from previous assessment (if one exists) +- Top 3-5 recommended actions ranked by effort/impact + +### 8. Return summary + +After writing the report file, return a concise summary to the caller: +- The composite score and grade +- One-line per dimension (score + direction of change if prior exists) +- The filename of the written report +- The top 3 recommended actions + +## Important rules + +- Never fabricate metrics. If a command fails, report the failure and score that dimension conservatively. +- Never modify source code. You are read-only except for writing the report file. +- Be consistent with the rubrics. Same data should always produce the same score. +- If the project adds new tooling (e.g., pytest-cov, bandit), incorporate its output into the relevant dimension. 
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e49d324 --- /dev/null +++ b/.gitignore @@ -0,0 +1,37 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +*.egg-info/ +dist/ +build/ +*.egg + +# Virtual environments +.venv/ + +# Tool caches +.pytest_cache/ +.ruff_cache/ +.mypy_cache/ + +# OS +.DS_Store +Thumbs.db + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# Impakt session data (generated per test, not committed) +.impakt/ + +# Generated reports +*.pdf +!src/impakt/report/templates/*.html + +# Large data files (keep the .mme/.chn metadata, skip bulk .dat/.NNN files) +# Note: test fixture data IS committed; production data is not diff --git a/docs/QA-2026-04-11_0459.md b/docs/QA-2026-04-11_0459.md index 38227be..9ec99d9 100644 --- a/docs/QA-2026-04-11_0459.md +++ b/docs/QA-2026-04-11_0459.md @@ -1,5 +1,19 @@ # Quality Assessment -- 2026-04-11 +> **Score: 78.3 / 100 — Grade: B** +> Baseline assessment. Strong architecture and documentation (9.0 each), solid test suite (240/240 pass), held back by lint debt (89 violations) and type errors (49 mypy strict). Clear path to B+ with low-effort fixes. + +| Dimension | Score | +|-----------|-------| +| Test Health | 8.0 | +| Type Safety | 6.5 | +| Lint Hygiene | 6.0 | +| Architecture | 9.0 | +| Documentation | 9.0 | +| Complexity | 7.0 | +| Security | 8.5 | +| Maintainability | 8.5 | + **Version:** 0.1.0 **Assessed by:** Claude Opus 4.6 **Previous assessment:** None (baseline) diff --git a/docs/QA-2026-04-11_0528.md b/docs/QA-2026-04-11_0528.md new file mode 100644 index 0000000..d4620af --- /dev/null +++ b/docs/QA-2026-04-11_0528.md @@ -0,0 +1,210 @@ +# Quality Assessment -- 2026-04-11 + +> **Score: 78.3 / 100 — Grade: B** +> No code changes since baseline. All metrics identical. Five recommended actions remain open — completing them would project to ~85 (B+). 
+ +| Dimension | Score | +|-----------|-------| +| Test Health | 8.0 | +| Type Safety | 6.5 | +| Lint Hygiene | 6.0 | +| Architecture | 9.0 | +| Documentation | 9.0 | +| Complexity | 7.0 | +| Security | 8.5 | +| Maintainability | 8.5 | + +**Version:** 0.1.0 +**Assessed by:** Claude Opus 4.6 +**Previous assessment:** QA-2026-04-11_0459.md + +--- + +## Inventory + +| Metric | Value | +|--------|-------| +| Source files | 72 | +| Source lines | 10,325 | +| Test files | 30 | +| Test lines | 2,736 | +| Test:source ratio | 0.26 | +| Direct dependencies | 10 core + 1 optional + 4 dev | + +--- + +## Raw Metrics + +### Test Suite + +``` +240 passed, 7 warnings in 9.26s +``` + +- Tests collected: 240 +- Tests passed: 240 +- Tests failed: 0 +- Test duration: 9.26s + +### Type Safety (mypy --strict) + +``` +Found 49 errors in 20 files (checked 72 source files) +``` + +- Total errors: 49 +- Files with errors: 20 / 72 (72% clean) +- Top error categories: + - `[type-arg]` 17 -- missing generic parameters on `dict`, `list` + - `[attr-defined]` 9 -- attribute access on loosely typed objects + - `[no-any-return]` 5 -- returning `Any` from typed functions + - `[var-annotated]` 3 + - `[import-untyped]` 3 + - `[assignment]` 3 + - `[valid-type]` 2 + - `[return-value]` 2 + - `[no-untyped-call]` 2 + - `[unused-ignore]` 1 + - `[no-untyped-def]` 1 + - `[comparison-overlap]` 1 + +### Lint (ruff) + +``` +Found 89 errors. +[*] 73 fixable with the `--fix` option (9 hidden fixes can be enabled with the `--unsafe-fixes` option). 
+``` + +- Total violations: 89 +- Auto-fixable: 73 (82%) +- Top violation rules: + - `F401` 61 -- unused imports + - `I001` 9 -- unsorted imports + - `F601` 8 -- duplicate dictionary keys + - `E501` 5 -- line too long + - `F841` 3 -- unused variables + - `P035` 2 -- string concatenation in f-string + - `F541` 1 -- f-string without placeholder + +### Complexity + +- File size: min=1 / median=133 / mean=143 / max=693 +- Files >300 lines: 6 / 72 +- High-complexity files (branch density >15): + +``` + 80 src/impakt/io/mme.py (693 lines) -- ISO 13499 parser, justified + 44 src/impakt/web/components/criteria.py (343) -- UI assembly with protocol logic + 30 src/impakt/channel/model.py (456) -- core data model, multiple classes + 27 src/impakt/web/state.py (274) -- app state with multi-test support + 27 src/impakt/protocol/euro_ncap.py (238) -- sliding-scale scoring tables + 25 src/impakt/web/callbacks/plot_callbacks.py (249) -- transform pipeline orchestration + 21 src/impakt/protocol/iihs.py (180) -- G/A/M/P rating logic + 20 src/impakt/plot/engine.py (257) -- Plotly rendering with corridors + 19 src/impakt/script/cli.py (140) -- CLI arg parsing + 17 src/impakt/web/components/channel_grid.py (368) -- DataTable assembly + 16 src/impakt/web/callbacks/channel_callbacks.py (195) -- selection/filter callbacks +``` + +### Documentation + +- Docstring coverage: 414 / 454 definitions (91%) +- Modules with `__all__`: 6 / 11 public modules + - channel: YES + - criteria: YES + - io: NO + - plot: YES + - plugin: NO + - protocol: YES + - report: NO + - script: NO + - template: NO + - transform: YES + - web: YES +- README: 1,266 lines with 20 Mermaid diagram references +- Architectural diagrams: yes + +### Security + +- eval/exec (sandboxed): 1 -- `math_expr.py:151`, restricted builtins `{}` + token blocklist (flagged `# noqa: S307`) +- eval/exec (unsandboxed): 0 +- subprocess: 0 (the string "subprocess" appears only as a blocklist entry in `math_expr.py`) +- Hardcoded secrets: 0 +- 
Bare except: 0 + +### Maintainability + +- TODO: 0 +- FIXME: 0 +- HACK: 0 +- Logging calls: 48 +- try/except blocks: 52 +- Bare excepts: 0 +- Internal imports (coupling): 190 + +--- + +## Scorecard + +| # | Dimension | Weight | Score | Weighted | Justification | +|---|-----------|--------|-------|----------|---------------| +| 1 | Test Health | 20% | 8.0/10 | 16.0 | 240/240 pass. 0.26 ratio (within 0.2-0.5 band). Integration tests with real datasets. No coverage % configured. | +| 2 | Type Safety | 15% | 6.5/10 | 9.75 | mypy strict enabled. 49 errors remain in 20 files, concentrated in web layer. Mostly cosmetic (`type-arg` 17). Between 6 (<50 errors) and 8 (<10 errors). | +| 3 | Lint Hygiene | 10% | 6.0/10 | 6.0 | 89 violations, 82% auto-fixable. Dominated by unused imports (F401=61). 8 duplicate dict keys (F601) need manual fix. Rubric: <100, mostly auto-fixable = 6. | +| 4 | Architecture | 15% | 9.0/10 | 13.5 | Clean 4-layer design (data -> transform -> protocol -> web). Plugin system. No layer violations found. 6/11 modules export `__all__`. Docked 1 point for 5 missing `__all__`. | +| 5 | Documentation | 10% | 9.0/10 | 9.0 | 91% docstring coverage (>90%). README with 20 Mermaid diagrams. No generated API reference docs, so not a full 10. | +| 6 | Complexity | 10% | 7.0/10 | 7.0 | Median 133 (<150). 6 files >300 lines. `mme.py` at 693/80 complexity is the outlier -- justified as a format parser. Between 8 (<=3 files >300) and 6 (<=10 files >300). | +| 7 | Security | 10% | 8.5/10 | 8.5 | Single eval sandboxed with `{"__builtins__": {}}` + 16-item token blocklist. No subprocess, no secrets. Between 9 (sandboxed) and 7 (partially sandboxed). | +| 8 | Maintainability | 10% | 8.5/10 | 8.5 | Zero debt markers. Zero bare excepts. 48 logging calls across codebase. Modern tooling (uv, hatchling, ruff, mypy). Between 10 (perfect) and 8 (<5 markers). 
| + +### Composite Score: **78.3 / 100** +### Grade: **B** + +Calculation: (8.0*0.20 + 6.5*0.15 + 6.0*0.10 + 9.0*0.15 + 9.0*0.10 + 7.0*0.10 + 8.5*0.10 + 8.5*0.10) * 10 = (1.60 + 0.975 + 0.60 + 1.35 + 0.90 + 0.70 + 0.85 + 0.85) * 10 = 78.25 -> 78.3 + +--- + +## Delta from Previous Assessment + +| Dimension | Previous | Current | Change | +|-----------|----------|---------|--------| +| Test Health | 8.0 | 8.0 | 0.0 | +| Type Safety | 6.5 | 6.5 | 0.0 | +| Lint Hygiene | 6.0 | 6.0 | 0.0 | +| Architecture | 9.0 | 9.0 | 0.0 | +| Documentation | 9.0 | 9.0 | 0.0 | +| Complexity | 7.0 | 7.0 | 0.0 | +| Security | 8.5 | 8.5 | 0.0 | +| Maintainability | 8.5 | 8.5 | 0.0 | +| **Composite** | **78.3** | **78.3** | **0.0** | + +--- + +## Top Improvements Since Last Assessment + +No code changes since previous assessment -- all metrics are identical. + +--- + +## Recommended Actions (Priority Order) + +| # | Action | Effort | Impact | Dimensions Affected | +|---|--------|--------|--------|---------------------| +| 1 | Run `uv run ruff check --fix src/` to clear 73 auto-fixable violations | 1 min | +2.0 lint -> 8.0 | Lint Hygiene | +| 2 | Fix 8 duplicate dict keys in `channel/lookup.py` (F601) and remaining manual lint fixes | 15 min | +1.0 lint -> 9.0+ | Lint Hygiene | +| 3 | Add `--cov --cov-report=term` to pytest config, target 80%+ branch coverage | 30 min | +1.0 test | Test Health | +| 4 | Resolve 17 `[type-arg]` mypy errors (add `dict[str, X]` generics to web layer) | 1 hr | +1.0 type | Type Safety | +| 5 | Add `__all__` to `io`, `plugin`, `report`, `script`, `template` modules | 30 min | +0.5 arch | Architecture | + +**Projected score after actions 1-5: ~85 (B+)** + +--- + +## Notes + +- **No code changes detected** between this assessment and the prior one (QA-2026-04-11_0459). All raw metrics are identical, yielding the same scores. The recommended actions from the baseline remain open. 
+- **Architecture score is qualitative.** Import graph was inspected: no layer violations found (data layer does not import from web/plot). The `web` module sits at the top of the dependency tree as expected. +- **Security eval in `math_expr.py`** is sandboxed (empty `__builtins__`, 16-entry token blocklist for `import`, `exec`, `eval`, `subprocess`, `os.`, `sys.`, `__`, etc.). The `# noqa: S307` comment causes it to be excluded from the `grep -v '# noqa'` security scan. An AST-based evaluator would be safer but is lower priority given the blocklist approach. +- **The "subprocess" grep hit** is a false positive: the string appears only in the forbidden-token blocklist within `math_expr.py`, not as an actual subprocess invocation. +- **Complexity scoring for `mme.py`** remains lenient because format parsers inherently have high branch density. If it grows beyond ~800 lines, consider extracting sub-parsers. +- **Test:source ratio of 0.26** has not changed. Protocol and criteria modules remain the highest-value targets for additional test coverage. 
diff --git a/docs/QA-2026-04-11_0619.md b/docs/QA-2026-04-11_0619.md new file mode 100644 index 0000000..dabcde3 --- /dev/null +++ b/docs/QA-2026-04-11_0619.md @@ -0,0 +1,198 @@ +# Quality Assessment -- 2026-04-11 + +**Version:** 0.1.0 +**Assessed by:** Claude Sonnet 4.6 +**Previous assessment:** QA-2026-04-11_0528.md + +--- + +## Inventory + +| Metric | Value | +|--------|-------| +| Source files | 72 | +| Source lines | 10,325 | +| Test files | 30 | +| Test lines | 2,736 | +| Test:source ratio | 0.26 | +| Direct dependencies | 10 core + 1 optional + 4 dev | + +--- + +## Raw Metrics + +### Test Suite + +``` +240 passed, 7 warnings in 9.26s +``` + +- Tests collected: 240 +- Tests passed: 240 +- Tests failed: 0 +- Test duration: 9.26s + +### Type Safety (mypy --strict) + +``` +Found 49 errors in 20 files (checked 72 source files) +``` + +- Total errors: 49 +- Files with errors: 20 / 72 (72% clean) +- Top error categories: + - `[type-arg]` 17 — missing generic parameters on `dict`, `list` + - `[attr-defined]` 9 — attribute access on loosely typed objects + - `[no-any-return]` 5 — returning `Any` from typed functions + - `[var-annotated]` 3 + - `[import-untyped]` 3 + - `[assignment]` 3 + - `[valid-type]` 2 + - `[return-value]` 2 + - `[no-untyped-call]` 2 + - `[unused-ignore]` 1 + - `[no-untyped-def]` 1 + - `[comparison-overlap]` 1 + +### Lint (ruff) + +``` +Found 89 errors. +[*] 73 fixable with the `--fix` option (9 hidden fixes can be enabled with the `--unsafe-fixes` option). 
+``` + +- Total violations: 89 +- Auto-fixable: 73 (82%) +- Top violation rules: + - `F401` 61 — unused imports + - `I001` 9 — unsorted imports + - `F601` 8 — duplicate dictionary keys + - `E501` 5 — line too long + - `F841` 3 — unused variables + - `P035` 2 — string concatenation in f-string + - `F541` 1 — f-string without placeholder + +### Complexity + +- File size: min=1 / median=133 / mean=143 / max=693 +- Files >300 lines: 6 / 72 +- High-complexity files (branch density >15): + +``` + 80 src/impakt/io/mme.py (693 lines) -- ISO 13499 parser, justified + 44 src/impakt/web/components/criteria.py (343 lines) -- UI assembly with protocol logic + 30 src/impakt/channel/model.py (456 lines) -- core data model, multiple classes + 27 src/impakt/web/state.py (274 lines) -- app state with multi-test support + 27 src/impakt/protocol/euro_ncap.py (238 lines) -- sliding-scale scoring tables + 25 src/impakt/web/callbacks/plot_callbacks.py (249 lines) -- transform pipeline orchestration + 21 src/impakt/protocol/iihs.py (180 lines) -- G/A/M/P rating logic + 20 src/impakt/plot/engine.py (257 lines) -- Plotly rendering with corridors + 19 src/impakt/script/cli.py (140 lines) -- CLI arg parsing + 17 src/impakt/web/components/channel_grid.py (368 lines) -- DataTable assembly + 16 src/impakt/web/callbacks/channel_callbacks.py (195 lines) -- selection/filter callbacks +``` + +### Documentation + +- Docstring coverage: 414 / 454 definitions (91%) +- Modules with `__all__`: 6 / 11 public modules + - channel: YES + - criteria: YES + - io: NO + - plot: YES + - plugin: NO + - protocol: YES + - report: NO + - script: NO + - template: NO + - transform: YES + - web: YES +- README: 1,266 lines with 20 Mermaid diagram references +- Architectural diagrams: yes + +### Security + +- eval/exec (sandboxed): 1 — `math_expr.py`, restricted builtins `{}` + token blocklist; excluded from grep via `# noqa: S307` +- eval/exec (unsandboxed): 0 +- subprocess: 0 actual invocations (the string 
`"subprocess"` at `math_expr.py:70` is a forbidden-token blocklist entry, not a real call) +- Hardcoded secrets: 0 +- Bare except: 0 + +### Maintainability + +- TODO: 0 +- FIXME: 0 +- HACK: 0 +- Logging calls: 48 +- try/except blocks: 52 +- Bare excepts: 0 +- Internal imports (coupling): 190 + +--- + +## Scorecard + +| # | Dimension | Weight | Score | Weighted | Justification | +|---|-----------|--------|-------|----------|---------------| +| 1 | Test Health | 20% | 8.0/10 | 1.60 | 240/240 pass. test:source ratio 0.26 (within 0.2–0.5 band). Integration tests with real datasets present. No coverage % configured. | +| 2 | Type Safety | 15% | 6.5/10 | 0.975 | mypy strict enabled. 49 errors in 20 files, concentrated in web layer. Mostly cosmetic (`type-arg` 17). Interpolated between 6 (<50 errors) and 8 (<10 errors). | +| 3 | Lint Hygiene | 10% | 6.0/10 | 0.60 | 89 violations (82% auto-fixable). Dominated by unused imports (F401=61). 8 duplicate dict keys (F601) need manual fix. Rubric: <100, mostly auto-fixable = 6. | +| 4 | Architecture | 15% | 9.0/10 | 1.35 | Clean 4-layer design (data→transform→protocol→web). Plugin system present. No layer violations found. 6/11 modules export `__all__`. Docked 1 point for 5 missing `__all__`. | +| 5 | Documentation | 10% | 9.0/10 | 0.90 | 91% docstring coverage (>90%). README with 20 Mermaid diagrams. No generated API reference docs, so not a full 10. | +| 6 | Complexity | 10% | 7.0/10 | 0.70 | Median 133 (<150). 6 files >300 lines. `mme.py` at 693/80 complexity is the outlier — justified as a format parser. Interpolated between 8 (≤3 files >300) and 6 (≤10 files >300). | +| 7 | Security | 10% | 8.5/10 | 0.85 | Single eval sandboxed with `{"__builtins__": {}}` + 16-item token blocklist. No subprocess, no secrets, no bare excepts. Interpolated between 9 (fully sandboxed) and 7 (partially sandboxed). | +| 8 | Maintainability | 10% | 8.5/10 | 0.85 | Zero debt markers. Zero bare excepts. 48 logging calls across codebase. 
Modern tooling (uv, hatchling, ruff, mypy). Between 10 (perfect) and 8 (<5 markers). | + +### Composite Score: **78.3 / 100** +### Grade: **B** + +Calculation: (8.0×0.20 + 6.5×0.15 + 6.0×0.10 + 9.0×0.15 + 9.0×0.10 + 7.0×0.10 + 8.5×0.10 + 8.5×0.10) × 10 += (1.60 + 0.975 + 0.60 + 1.35 + 0.90 + 0.70 + 0.85 + 0.85) × 10 += 7.825 × 10 = **78.3** + +--- + +## Delta from Previous Assessment + +| Dimension | Previous | Current | Change | +|-----------|----------|---------|--------| +| Test Health | 8.0 | 8.0 | 0.0 | +| Type Safety | 6.5 | 6.5 | 0.0 | +| Lint Hygiene | 6.0 | 6.0 | 0.0 | +| Architecture | 9.0 | 9.0 | 0.0 | +| Documentation | 9.0 | 9.0 | 0.0 | +| Complexity | 7.0 | 7.0 | 0.0 | +| Security | 8.5 | 8.5 | 0.0 | +| Maintainability | 8.5 | 8.5 | 0.0 | +| **Composite** | **78.3** | **78.3** | **0.0** | + +--- + +## Top Improvements Since Last Assessment + +No code changes detected since QA-2026-04-11_0528 — all raw metrics are identical. + +--- + +## Recommended Actions (Priority Order) + +| # | Action | Effort | Impact | Dimensions Affected | +|---|--------|--------|--------|---------------------| +| 1 | Run `uv run ruff check --fix src/` to clear 73 auto-fixable violations | 1 min | +2.0 lint → 8.0 | Lint Hygiene | +| 2 | Manually fix 8 duplicate dict keys (`F601`) in `channel/lookup.py` and remaining non-auto-fixable lint violations | 15 min | +1.0 lint → 9.0+ | Lint Hygiene | +| 3 | Add `--cov --cov-report=term-missing` to pytest config; target ≥80% branch coverage | 30 min | +1.0 test → 9.0 | Test Health | +| 4 | Resolve 17 `[type-arg]` mypy errors (add `dict[str, X]` / `list[X]` generics, primarily in web layer) | 1 hr | +1.0 type → 7.5 | Type Safety | +| 5 | Add `__all__` to `io`, `plugin`, `report`, `script`, `template` modules | 30 min | +0.5 arch → 9.5 | Architecture | + +**Projected composite after actions 1–5: ~85 (B+)** + +--- + +## Notes + +- **No code changes detected** between this assessment and QA-2026-04-11_0528. 
All 72 source files and 30 test files are unchanged, yielding identical metrics and scores for the third consecutive assessment. +- **Architecture is qualitative.** Import graph inspected: no layer violations found. The `web` module sits at the top of the dependency tree; `io`/`transform`/`protocol` layers do not import from `web` or `plot`. +- **Security eval in `math_expr.py`** is sandboxed via empty `__builtins__` dict and a 16-entry token blocklist (including `import`, `exec`, `eval`, `subprocess`, `os.`, `sys.`, `__`). The `# noqa: S307` comment excludes it from the `grep -v '# noqa'` scan. An AST-based evaluator would be safer but is lower priority given existing mitigations. +- **Subprocess grep hit** is a confirmed false positive: the string `"subprocess"` appears only as a forbidden-token blocklist entry at `math_expr.py:70`, not as an actual invocation. +- **Complexity scoring for `mme.py`** remains lenient: ISO 13499 format parsers inherently carry high branch density. Consider extracting sub-parsers if it grows beyond ~800 lines. +- **The three assessments today (0459, 0528, 0619) are identical** because no source code was modified between runs. The recommended actions above remain the highest-value next steps. diff --git a/docs/QA-INSTRUCTIONS.md b/docs/QA-INSTRUCTIONS.md index 626065e..744266c 100644 --- a/docs/QA-INSTRUCTIONS.md +++ b/docs/QA-INSTRUCTIONS.md @@ -267,12 +267,16 @@ composite = ( Copy `docs/QA-TEMPLATE.md` to `docs/QA-.md` and fill in: -1. All raw metric values -2. All dimension scores with brief justification -3. Composite score and grade -4. Delta from previous assessment (if one exists) -5. Top 3-5 actionable improvements -6. Acknowledgment of any scoring judgment calls +1. 
**Summary block at the top** (immediately after the `#` heading): + - Blockquote with composite score, grade, and one-sentence summary + - Quick-reference dimension score table + - The summary sentence should note overall quality posture and the most significant change since the last assessment (or "Baseline assessment" if first run) +2. All raw metric values +3. All dimension scores with brief justification +4. Composite score and grade +5. Delta from previous assessment (if one exists) +6. Top 3-5 actionable improvements +7. Acknowledgment of any scoring judgment calls --- diff --git a/docs/QA-TEMPLATE.md b/docs/QA-TEMPLATE.md index b34609f..0e14f9f 100644 --- a/docs/QA-TEMPLATE.md +++ b/docs/QA-TEMPLATE.md @@ -1,5 +1,19 @@ # Quality Assessment -- [DATE] +> **Score: [ ] / 100 — Grade: [ ]** +> [ one-sentence summary of overall quality and key movement since last assessment ] + +| Dimension | Score | +|-----------|-------| +| Test Health | /10 | +| Type Safety | /10 | +| Lint Hygiene | /10 | +| Architecture | /10 | +| Documentation | /10 | +| Complexity | /10 | +| Security | /10 | +| Maintainability | /10 | + **Version:** [VERSION] **Assessed by:** [human / LLM model] **Previous assessment:** [filename or "None (baseline)"] diff --git a/scripts/push.sh b/scripts/push.sh new file mode 100755 index 0000000..45e9b41 --- /dev/null +++ b/scripts/push.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +set -euo pipefail + +cd "$(dirname "$0")/.." + +if [[ -z "$(git status --porcelain)" ]]; then + echo "Nothing to commit." + exit 0 +fi + +echo "Current changes:" +git status --short +echo "" + +read -rp "Commit note: " note + +if [[ -z "$note" ]]; then + echo "Aborted — empty commit message." 
+ exit 1 +fi + +git add -A +git commit -m "$note" +git push -u origin main diff --git a/scripts/qa-score.sh b/scripts/qa-score.sh new file mode 100755 index 0000000..9d49e9d --- /dev/null +++ b/scripts/qa-score.sh @@ -0,0 +1,99 @@ +#!/usr/bin/env bash +set -euo pipefail + +# ────────────────────────────────────────────────────────────── +# qa-score.sh — Run the quality-scorer agent against this repo +# +# Usage: +# ./scripts/qa-score.sh # default: sonnet, headless +# ./scripts/qa-score.sh --model opus # use opus +# ./scripts/qa-score.sh --interactive # open in interactive session +# ────────────────────────────────────────────────────────────── + +PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)" +AGENT="quality-scorer" +MODEL="sonnet" +INTERACTIVE=false +BUDGET="1.00" + +while [[ $# -gt 0 ]]; do + case "$1" in + --model) MODEL="$2"; shift 2 ;; + --budget) BUDGET="$2"; shift 2 ;; + --interactive) INTERACTIVE=true; shift ;; + --help|-h) + echo "Usage: $0 [--model sonnet|opus|haiku] [--budget USD] [--interactive]" + echo "" + echo "Runs the quality-scorer agent to produce a QA report in docs/." + echo "" + echo "Options:" + echo " --model MODEL Claude model to use (default: sonnet)" + echo " --budget USD Max spend in USD (default: 1.00, headless only)" + echo " --interactive Open interactive session instead of headless" + exit 0 + ;; + *) echo "Unknown option: $1"; exit 1 ;; + esac +done + +cd "$PROJECT_ROOT" + +# Verify prerequisites +if ! command -v claude &>/dev/null; then + echo "Error: claude CLI not found. Install from https://claude.ai/code" >&2 + exit 1 +fi + +if ! command -v uv &>/dev/null; then + echo "Error: uv not found. Install from https://docs.astral.sh/uv/" >&2 + exit 1 +fi + +if [[ ! -f "docs/QA-INSTRUCTIONS.md" ]]; then + echo "Error: docs/QA-INSTRUCTIONS.md not found. Run from the project root." >&2 + exit 1 +fi + +# Ensure dev dependencies are available +uv sync --dev --quiet + +PROMPT="Run a full codebase quality assessment. 
\
Read docs/QA-INSTRUCTIONS.md for the methodology and rubrics. \
Read docs/QA-TEMPLATE.md for the report structure. \
Check docs/ for previous QA-*.md reports and compute deltas if any exist. \
Collect all raw metrics by running every command in Step 1. \
Score each dimension using the Step 2 rubrics. \
Compute the composite score using the Step 3 formula. \
Write the completed report to docs/QA-YYYY-MM-DD_HHMM.md where YYYY-MM-DD_HHMM is the current datetime. \
Print the composite score, grade, per-dimension scores, and top 3 actions."
+
+if [[ "$INTERACTIVE" == true ]]; then
+  echo "Starting interactive QA session (model: $MODEL)..."
+  exec claude \
+    --agent "$AGENT" \
+    --model "$MODEL" \
+    "$PROMPT"
+else
+  echo "Running quality assessment (model: $MODEL, budget: \$$BUDGET)..."
+  echo ""
+
+  claude \
+    -p \
+    --agent "$AGENT" \
+    --model "$MODEL" \
+    --max-budget-usd "$BUDGET" \
+    --allowedTools "Bash,Read,Write,Grep,Glob" \
+    --output-format text \
+    "$PROMPT"
+
+  echo ""
+  echo "──────────────────────────────────────────"
+
+  # Show the report that was just written
+  LATEST=$(ls -t docs/QA-2*.md 2>/dev/null | head -1)
+  if [[ -n "$LATEST" ]]; then
+    echo "Report written: $LATEST"
+  else
+    echo "Warning: no QA report file found in docs/"
+  fi
+fi